stash
This commit is contained in:
480
stash/config/scrapers/community/IAFD/IAFD.py
Normal file
480
stash/config/scrapers/community/IAFD/IAFD.py
Normal file
@@ -0,0 +1,480 @@
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import requests
|
||||
import sys
|
||||
import time
|
||||
from typing import Iterable, Callable, TypeVar
|
||||
from datetime import datetime
|
||||
|
||||
from py_common.util import guess_nationality
|
||||
import py_common.log as log
|
||||
|
||||
|
||||
# Third-party imports that may be missing from the user's environment:
# fail with installation instructions instead of a bare ImportError.
try:
    import cloudscraper
except ModuleNotFoundError:
    print(
        "You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)",
        file=sys.stderr,
    )
    print(
        "If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper",
        file=sys.stderr,
    )
    # Exit non-zero: a missing dependency is an error, not success
    sys.exit(1)

try:
    from lxml import html
except ModuleNotFoundError:
    print(
        "You need to install the lxml module. (https://lxml.de/installation.html#installation)",
        file=sys.stderr,
    )
    print(
        "If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml",
        file=sys.stderr,
    )
    sys.exit(1)

# Date formats: Stash wants ISO dates; IAFD uses full month names on bio
# pages and abbreviated month names on scene pages.
stash_date = "%Y-%m-%d"
iafd_date = "%B %d, %Y"
iafd_date_scene = "%b %d, %Y"

T = TypeVar("T")
|
||||
|
||||
|
||||
def maybe(
|
||||
values: Iterable[str], f: Callable[[str], (T | None)] = lambda x: x
|
||||
) -> T | None:
|
||||
"""
|
||||
Returns the first value in values that is not "No data" after applying f to it
|
||||
"""
|
||||
return next(
|
||||
(f(x) for x in values if not re.search(r"(?i)no data|no director", x)), None
|
||||
)
|
||||
|
||||
|
||||
def cleandict(d: dict):
    """Return a copy of d with all falsy values (None, "", [], 0, ...) removed."""
    cleaned = {}
    for key, value in d.items():
        if value:
            cleaned[key] = value
    return cleaned
|
||||
|
||||
|
||||
def map_gender(gender: str):
    """Map IAFD's one-letter gender codes to full names; unknown codes pass through."""
    if gender == "f":
        return "Female"
    if gender == "m":
        return "Male"
    return gender
|
||||
|
||||
|
||||
def clean_date(date: str) -> str | None:
    """Normalize an IAFD date string to Stash's YYYY-MM-DD format.

    Anything trailing the "Month DD, YYYY" part (e.g. an age annotation)
    is stripped first. Both IAFD date formats are tried; returns None
    (after logging a warning) when neither matches.
    """
    date = date.strip()
    cleaned = re.sub(r"(\S+\s+\d+,\s+\d+).*", r"\1", date)
    for fmt in (iafd_date, iafd_date_scene):
        try:
            parsed = datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
        return parsed.strftime(stash_date)
    log.warning(f"Unable to parse '{date}' as a date")
|
||||
|
||||
|
||||
def clean_alias(alias: str) -> str | None:
|
||||
# Aliases like "X or Y or Z" are indeterminate
|
||||
# and should not be included
|
||||
if " or " in alias:
|
||||
return None
|
||||
# We do not want studio disambiguation: "X (studio.com)" -> "X"
|
||||
return re.sub(r"\s*\(.*$", "", alias)
|
||||
|
||||
|
||||
def performer_haircolor(tree):
    """Extract the performer's hair color from the bio page."""
    values = tree.xpath(
        '//div/p[starts-with(.,"Hair Color")]/following-sibling::p[1]//text()'
    )
    return maybe(values)
|
||||
|
||||
|
||||
def performer_weight(tree):
    """Extract the performer's weight in kilograms (as a string)."""
    values = tree.xpath('//div/p[text()="Weight"]/following-sibling::p[1]//text()')

    def kilograms(raw):
        # Keep only the metric figure from e.g. "130 lbs (59 kg)"
        return re.sub(r".*\((\d+)\s+kg.*", r"\1", raw)

    return maybe(values, kilograms)
|
||||
|
||||
|
||||
def performer_height(tree):
    """Extract the performer's height in centimetres (as a string)."""
    values = tree.xpath('//div/p[text()="Height"]/following-sibling::p[1]//text()')

    def centimetres(raw):
        # Keep only the metric figure from e.g. "5 ft 4 in (163 cm)"
        return re.sub(r".*\((\d+)\s+cm.*", r"\1", raw)

    return maybe(values, centimetres)
|
||||
|
||||
|
||||
def performer_country(tree):
    """Guess the performer's country from the stated nationality."""
    nationalities = tree.xpath(
        '//div/p[text()="Nationality"]/following-sibling::p[1]//text()'
    )
    # "American, <state>" variants all collapse to plain "American" first
    return maybe(
        nationalities,
        lambda c: guess_nationality(re.sub(r"^American,.+", "American", c)),
    )
|
||||
|
||||
|
||||
def performer_ethnicity(tree):
    """Extract the performer's ethnicity from the bio page."""
    values = tree.xpath(
        '//div[p[text()="Ethnicity"]]/p[@class="biodata"][1]//text()'
    )
    return maybe(values)
|
||||
|
||||
|
||||
def performer_deathdate(tree):
    """Extract and normalize the performer's date of death, if listed."""
    dates = tree.xpath(
        '(//p[@class="bioheading"][text()="Date of Death"]/following-sibling::p)[1]//text()'
    )
    return maybe(dates, clean_date)
|
||||
|
||||
|
||||
def performer_birthdate(tree):
    """Extract and normalize the performer's birthday, if listed."""
    dates = tree.xpath(
        '(//p[@class="bioheading"][text()="Birthday"]/following-sibling::p)[1]//text()'
    )
    return maybe(dates, clean_date)
|
||||
|
||||
|
||||
def performer_instagram(tree):
    """Extract the performer's Instagram link, if present in the bio."""
    links = tree.xpath(
        '//p[@class="biodata"]/a[contains(text(),"http://instagram.com/")]/@href'
    )
    return maybe(links)
|
||||
|
||||
|
||||
def performer_twitter(tree):
    """Extract the performer's Twitter link, if present in the bio."""
    links = tree.xpath(
        '//p[@class="biodata"]/a[contains(text(),"http://twitter.com/")]/@href'
    )
    return maybe(links)
|
||||
|
||||
|
||||
def performer_url(tree):
    """Build the absolute IAFD profile URL for this performer."""
    paths = tree.xpath('//div[@id="perfwith"]//*[contains(@href,"person.rme")]/@href')
    return maybe(paths, lambda path: f"https://www.iafd.com{path}")
|
||||
|
||||
|
||||
def performer_gender(tree):
    """Extract the performer's gender, prefixing "Transgender " when the
    profile id carries one of IAFD's trans markers."""

    def with_trans_prefix(gender: str):
        ids = tree.xpath('//form[@id="correct"]/input[@name="PerfID"]/@value')
        perf_id = ids[0] if ids else ""
        # IAFD are not consistent with their URLs
        prefix = ""
        for marker in ("_ts", "_ftm", "_mtf"):
            if marker in perf_id:
                prefix = "Transgender "
                break
        return prefix + map_gender(gender)

    return maybe(
        tree.xpath('//form[@id="correct"]/input[@name="Gender"]/@value'),
        with_trans_prefix,
    )
|
||||
|
||||
|
||||
def performer_name(tree):
    """Extract the performer's name from the page heading."""
    return maybe(tree.xpath("//h1/text()"), str.strip)
|
||||
|
||||
|
||||
def performer_piercings(tree):
    """Extract the performer's piercings description."""
    values = tree.xpath(
        '//div/p[text()="Piercings"]/following-sibling::p[1]//text()'
    )
    return maybe(values)
|
||||
|
||||
|
||||
def performer_tattoos(tree):
    """Extract the performer's tattoos description."""
    values = tree.xpath(
        '//div/p[text()="Tattoos"]/following-sibling::p[1]//text()'
    )
    return maybe(values)
|
||||
|
||||
|
||||
def performer_aliases(tree):
    """Extract known aliases as one comma-separated string, dropping
    indeterminate ("X or Y") and studio-disambiguated forms."""

    def join_cleaned(raw: str):
        cleaned = (clean_alias(alias) for alias in raw.split(", "))
        return ", ".join(filter(None, cleaned))

    return maybe(
        tree.xpath(
            '//div[p[@class="bioheading" and contains(normalize-space(text()),"Performer AKA")]]//div[@class="biodata" and not(text()="No known aliases")]/text()'
        ),
        join_cleaned,
    )
|
||||
|
||||
|
||||
def performer_careerlength(tree):
    """Extract the career-length text, trimming the trailing detail suffix."""
    spans = tree.xpath(
        '//div/p[@class="biodata"][contains(text(),"Started around")]/text()'
    )
    return maybe(spans, lambda c: re.sub(r"(\D+\d\d\D+)$", "", c))
|
||||
|
||||
|
||||
def performer_measurements(tree):
    """Extract the performer's measurements."""
    values = tree.xpath(
        '//div/p[text()="Measurements"]/following-sibling::p[1]//text()'
    )
    return maybe(values)
|
||||
|
||||
|
||||
def scene_director(tree):
    """Extract the scene's director (heading may be singular or plural)."""
    directors = tree.xpath(
        '//p[@class="bioheading"][text()="Director" or text()="Directors"]/following-sibling::p[1]//text()'
    )
    return maybe(directors, str.strip)
|
||||
|
||||
|
||||
def scene_studio(tree):
    """Extract the scene's studio as a name-only fragment for Stash."""
    names = tree.xpath(
        '//div[@class="col-xs-12 col-sm-3"]//p[text() = "Studio"]/following-sibling::p[1]//text()'
    )
    return maybe(names, lambda name: {"name": name})
|
||||
|
||||
|
||||
def scene_details(tree):
    """Extract the scene's synopsis text."""
    synopsis = tree.xpath('//div[@id="synopsis"]/div[@class="padded-panel"]//text()')
    return maybe(synopsis)
|
||||
|
||||
|
||||
def scene_date(tree):
    """Extract and normalize the scene's release date."""
    dates = tree.xpath(
        '//div[@class="col-xs-12 col-sm-3"]//p[text() = "Release Date"]/following-sibling::p[1]//text()'
    )
    return maybe(dates, clean_date)
|
||||
|
||||
|
||||
def scene_title(tree):
    """Extract the scene title, stripping a trailing "(YYYY)" year suffix."""
    return maybe(
        tree.xpath("//h1/text()"),
        lambda raw: re.sub(r"\s*\(\d{4}\)$", "", raw.strip()),
    )
|
||||
|
||||
|
||||
def movie_studio(tree):
    """Extract the movie's studio (or distributor) as a name-only fragment.

    Bug fix: the original predicate was
    contains(text(),"Studio" or contains(text(),"Distributor")) — the `or`
    sat inside contains()' second argument, so XPath coerced the boolean to
    the string "true"/"false" and the predicate could never match either
    heading. The `or` now correctly joins two contains() predicates.
    """
    names = tree.xpath(
        '//p[@class="bioheading"][contains(text(),"Studio") or contains(text(),"Distributor")]/following-sibling::p[@class="biodata"][1]//text()'
    )
    return maybe(names, lambda name: {"name": name})
|
||||
|
||||
|
||||
def movie_date(tree):
    """Extract the movie's release date, normalized to YYYY-MM-DD."""
    release = maybe(
        tree.xpath(
            '//p[@class="bioheading"][contains(text(), "Release Date")]/following-sibling::p[@class="biodata"][1]/text()'
        ),
        lambda d: clean_date(d.strip()),
    )
    if release:
        return release
    # If there's no release date we will use the year from the title for an approximate date
    return maybe(
        tree.xpath("//h1/text()"),
        lambda t: re.sub(r".*\(([0-9]+)\).*$", r"\1-01-01", t),
    )
|
||||
|
||||
|
||||
def movie_duration(tree):
    """Extract the movie's runtime in seconds."""
    minutes = tree.xpath(
        '//p[@class="bioheading"][contains(text(), "Minutes")]/following-sibling::p[@class="biodata"][1]/text()'
    )
    # Convert duration from minutes to seconds, but keep it a string because
    # that's what stash expects
    return maybe(minutes, lambda m: str(int(m) * 60))
|
||||
|
||||
|
||||
def movie_synopsis(tree):
    """Extract the movie's synopsis text."""
    synopsis = tree.xpath('//div[@id="synopsis"]/div[@class="padded-panel"]//text()')
    return maybe(synopsis)
|
||||
|
||||
|
||||
def movie_director(tree):
    """Extract the first listed director of the movie."""
    directors = tree.xpath(
        '//p[@class="bioheading"][contains(text(), "Directors")]/following-sibling::p[@class="biodata"][1]/a/text()'
    )
    return maybe(directors, str.strip)
|
||||
|
||||
|
||||
def movie_title(tree):
    """Extract the movie title, stripping a trailing "(year)" suffix."""
    return maybe(
        tree.xpath("//h1/text()"),
        lambda raw: re.sub(r"\s*\(\d+\)$", "", raw.strip()),
    )
|
||||
|
||||
|
||||
# A single module-level scraper is shared by every request: solving the
# Cloudflare challenge is slow, so doing it once saves time when scraping
# multiple pages.
scraper = cloudscraper.create_scraper()
|
||||
|
||||
|
||||
def scrape(url: str, retries=0):
    """Fetch url through the Cloudflare-aware scraper and return an lxml tree.

    Timeouts and HTTP errors (status >= 400) are retried up to 10 times;
    HTTP errors additionally back off for a short random interval. Any
    other failure — or exhausting the retries — terminates the process
    with exit code 1.

    Bug fix: the timeout branch previously recursed without ever checking
    `retries`, so a persistently timing-out URL retried forever (until
    RecursionError); it now honours the same 10-attempt cap as HTTP errors.
    """
    try:
        scraped = scraper.get(url, timeout=(3, 7))
    except requests.exceptions.Timeout as exc_time:
        log.debug(f"Timeout: {exc_time}")
        if retries >= 10:
            log.error("Timed out too many times, giving up")
            sys.exit(1)
        return scrape(url, retries + 1)
    except Exception as e:
        log.error(f"scrape error {e}")
        sys.exit(1)
    if scraped.status_code >= 400:
        if retries < 10:
            wait_time = random.randint(1, 4)
            log.debug(f"HTTP Error: {scraped.status_code}, waiting {wait_time} seconds")
            time.sleep(wait_time)
            return scrape(url, retries + 1)
        log.error(f"HTTP Error: {scraped.status_code}, giving up")
        sys.exit(1)
    return html.fromstring(scraped.content)
|
||||
|
||||
|
||||
def performer_query(query):
    """Search IAFD for performers matching query.

    Returns a list of {"Name": ..., "URL": ...} dicts, empty when nothing
    matched (a warning is logged in that case).

    Fix: the query is user input and was previously interpolated into the
    URL unencoded, so characters like '&', '#' or '=' in a name would
    corrupt the query string; it is now percent-encoded.
    """
    from urllib.parse import quote_plus

    tree = scrape(
        f"https://www.iafd.com/results.asp?searchtype=comprehensive&searchstring={quote_plus(query)}"
    )
    names = tree.xpath(
        '//table[@id="tblFem" or @id="tblMal"]//td[a[img]]/following-sibling::td[1]/a/text()'
    )
    urls = tree.xpath(
        '//table[@id="tblFem" or @id="tblMal"]//td[a[img]]/following-sibling::td[1]/a/@href'
    )
    performers = [
        {
            "Name": name,
            "URL": f"https://www.iafd.com{url}",
        }
        for name, url in zip(names, urls)
    ]
    if not performers:
        log.warning(f"No performers found for '{query}'")
    return performers
|
||||
|
||||
|
||||
def performer_from_tree(tree):
    """Assemble a Stash performer fragment from a parsed IAFD bio page."""
    extractors = {
        "name": performer_name,
        "gender": performer_gender,
        "url": performer_url,
        "twitter": performer_twitter,
        "instagram": performer_instagram,
        "birthdate": performer_birthdate,
        "death_date": performer_deathdate,
        "ethnicity": performer_ethnicity,
        "country": performer_country,
        "height": performer_height,
        "weight": performer_weight,
        "hair_color": performer_haircolor,
        "measurements": performer_measurements,
        "career_length": performer_careerlength,
        "aliases": performer_aliases,
        "tattoos": performer_tattoos,
        "piercings": performer_piercings,
    }
    performer = {field: extract(tree) for field, extract in extractors.items()}
    performer["images"] = tree.xpath('//div[@id="headshot"]//img/@src')
    return performer
|
||||
|
||||
|
||||
def scene_from_tree(tree):
    """Assemble a Stash scene fragment from a parsed IAFD title page."""
    cast = []
    for link in tree.xpath('//div[@class="castbox"]/p/a'):
        cast.append(
            {
                "name": link.text_content(),
                "url": f"https://www.iafd.com{link.get('href')}",
                "images": link.xpath("img/@src"),
            }
        )
    return {
        "title": scene_title(tree),
        "date": scene_date(tree),
        "details": scene_details(tree),
        "director": scene_director(tree),
        "studio": scene_studio(tree),
        "performers": cast,
    }
|
||||
|
||||
|
||||
def movie_from_tree(tree):
    """Assemble a Stash movie fragment from a parsed IAFD title page."""
    alias_parts = tree.xpath('//div[@class="col-sm-12"]/dl/dd//text()')
    return {
        "name": movie_title(tree),
        "director": movie_director(tree),
        "synopsis": movie_synopsis(tree),
        "duration": movie_duration(tree),
        "date": movie_date(tree),
        "aliases": ", ".join(alias_parts),
        "studio": movie_studio(tree),
    }
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Parses arguments from the command line, merges in any JSON fragment
    piped on stdin (as Stash does), runs the requested operation and prints
    the result as JSON on stdout.
    """
    parser = argparse.ArgumentParser("IAFD Scraper", argument_default="")
    subparsers = parser.add_subparsers(
        dest="operation", help="Operation to perform", required=True
    )

    subparsers.add_parser("search", help="Search for performers").add_argument(
        "name", nargs="?", help="Name to search for"
    )
    # The three URL-based operations all take the same optional argument
    for op, description in (
        ("performer", "Scrape a performer"),
        ("movie", "Scrape a movie"),
        ("scene", "Scrape a scene"),
    ):
        subparsers.add_parser(op, help=description).add_argument(
            "url", nargs="?", help=f"{op.capitalize()} URL"
        )

    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    args = parser.parse_args()
    log.debug(f"Arguments from commandline: {args}")
    # Script is being piped into, probably by Stash
    if not sys.stdin.isatty():
        try:
            frag = json.load(sys.stdin)
        except json.decoder.JSONDecodeError:
            log.error("Received invalid JSON from stdin")
            sys.exit(1)
        args.__dict__.update(frag)
        log.debug(f"With arguments from stdin: {args}")

    if args.operation == "search":
        name = args.name
        if not name:
            log.error("No query provided")
            sys.exit(1)
        log.debug(f"Searching for '{name}'")
        print(json.dumps(performer_query(name)))
        sys.exit(0)

    url = args.url
    if not url:
        log.error("No URL provided")
        sys.exit(1)

    log.debug(f"{args.operation} scraping '{url}'")
    tree = scrape(url)
    handlers = {
        "performer": performer_from_tree,
        "movie": movie_from_tree,
        "scene": scene_from_tree,
    }
    handler = handlers.get(args.operation)
    result = handler(tree) if handler else {}

    print(json.dumps(cleandict(result)))
|
||||
|
||||
|
||||
# Script entry point guard: do nothing on import, run the CLI when executed.
if __name__ == "__main__":
    main()
|
||||
34
stash/config/scrapers/community/IAFD/IAFD.yml
Normal file
34
stash/config/scrapers/community/IAFD/IAFD.yml
Normal file
@@ -0,0 +1,34 @@
|
||||
name: IAFD
# requires: py_common

performerByName:
  action: script
  script:
    - python3
    - IAFD.py
    - search
performerByURL:
  - url:
      - iafd.com/person.rme/perfid=
    action: script
    script:
      - python3
      - IAFD.py
      - performer
sceneByURL:
  - url:
      - iafd.com/title.rme/id=
    action: script
    script:
      - python3
      - IAFD.py
      - scene
movieByURL:
  - url:
      - iafd.com/title.rme/id=
    action: script
    script:
      - python3
      - IAFD.py
      - movie
# Last Updated September 25, 2023
|
||||
10
stash/config/scrapers/community/IAFD/manifest
Executable file
10
stash/config/scrapers/community/IAFD/manifest
Executable file
@@ -0,0 +1,10 @@
|
||||
id: IAFD
name: IAFD
metadata: {}
version: becae45
date: "2024-03-09 10:33:32"
requires: []
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
files:
- IAFD.py
- IAFD.yml
||||
Reference in New Issue
Block a user