stash

2025-10-09 20:05:31 -03:00
parent ed22ef22bc
commit 0a5f88d75a
1442 changed files with 101562 additions and 0 deletions
--- a/stash/config/scrapers/community/IAFD/IAFD.py
+++ b/stash/config/scrapers/community/IAFD/IAFD.py
@@ -0,0 +1,480 @@
+import argparse
+import json
+import random
+import re
+import requests
+import sys
+import time
+from typing import Iterable, Callable, TypeVar
+from datetime import datetime
+
+from py_common.util import guess_nationality
+import py_common.log as log
+
+
+# cloudscraper is required (the module-level scraper is built from it), so a
+# missing install is fatal: print install instructions and stop.
+try:
+    import cloudscraper
+except ModuleNotFoundError:
+    print(
+        "You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)",
+        file=sys.stderr,
+    )
+    print(
+        "If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper",
+        file=sys.stderr,
+    )
+    # Exit with a non-zero status so the caller can tell the run failed;
+    # a bare sys.exit() would report success.
+    sys.exit(1)
+
+# lxml is required to parse the fetched pages, so a missing install is
+# fatal: print install instructions and stop.
+try:
+    from lxml import html
+except ModuleNotFoundError:
+    print(
+        "You need to install the lxml module. (https://lxml.de/installation.html#installation)",
+        file=sys.stderr,
+    )
+    print(
+        "If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml",
+        file=sys.stderr,
+    )
+    # Exit with a non-zero status so the caller can tell the run failed;
+    # a bare sys.exit() would report success.
+    sys.exit(1)
+
+# strptime/strftime patterns used by clean_date():
+# stash_date is the output format, the two iafd_* formats are the inputs tried.
+stash_date = "%Y-%m-%d"
+iafd_date = "%B %d, %Y"  # full month name
+iafd_date_scene = "%b %d, %Y"  # abbreviated month name
+
+# Result type of the transform function passed to maybe()
+T = TypeVar("T")
+
+
+def maybe(
+    values: Iterable[str], f: Callable[[str], (T | None)] = lambda x: x
+) -> T | None:
+    """
+    Apply f to the first usable entry of values and return the result.
+
+    An entry is skipped when it contains "no data" or "no director"
+    (case-insensitive). Returns None when every entry is skipped or
+    values is empty.
+    """
+    for candidate in values:
+        if re.search(r"(?i)no data|no director", candidate):
+            continue
+        return f(candidate)
+    return None
+
+
+def cleandict(d: dict):
+    """Return a new dict holding only the entries of d whose value is truthy."""
+    kept = {}
+    for key, value in d.items():
+        if value:
+            kept[key] = value
+    return kept
+
+
+def map_gender(gender: str):
+    """Expand the codes "f" and "m" to "Female" and "Male"; return anything else unchanged."""
+    if gender == "f":
+        return "Female"
+    if gender == "m":
+        return "Male"
+    return gender
+
+
+def clean_date(date: str) -> str | None:
+    """
+    Convert a date string from the page into the stash_date format.
+
+    Anything after the leading "<word> <digits>, <digits>" part is dropped,
+    then both IAFD formats are tried in turn. Logs a warning and returns
+    None when neither format matches.
+    """
+    date = date.strip()
+    trimmed = re.sub(r"(\S+\s+\d+,\s+\d+).*", r"\1", date)
+    for pattern in (iafd_date, iafd_date_scene):
+        try:
+            parsed = datetime.strptime(trimmed, pattern)
+            return parsed.strftime(stash_date)
+        except ValueError:
+            continue
+    log.warning(f"Unable to parse '{date}' as a date")
+    return None
+
+
+def clean_alias(alias: str) -> str | None:
+    """
+    Normalise a single alias, or return None if it should be discarded.
+
+    Aliases like "X or Y or Z" are indeterminate and are rejected.
+    Studio disambiguation is removed: "X (studio.com)" -> "X".
+    """
+    indeterminate = " or " in alias
+    # Cut from the first "(" (and any whitespace before it) to the end
+    return None if indeterminate else re.sub(r"\s*\(.*$", "", alias)
+
+
+def performer_haircolor(tree):
+    """Return the text that follows the "Hair Color" heading, or None if absent."""
+    hair_texts = tree.xpath(
+        '//div/p[starts-with(.,"Hair Color")]/following-sibling::p[1]//text()'
+    )
+    return maybe(hair_texts)
+
+
+def performer_weight(tree):
+    """
+    Return the value that follows the "Weight" heading, or None if absent.
+
+    When the text contains a parenthesised "(<digits> kg" part only those
+    digits are returned; otherwise the text is returned unchanged.
+    """
+
+    def kilograms(raw):
+        return re.sub(r".*\((\d+)\s+kg.*", r"\1", raw)
+
+    weight_texts = tree.xpath(
+        '//div/p[text()="Weight"]/following-sibling::p[1]//text()'
+    )
+    return maybe(weight_texts, kilograms)
+
+
+def performer_height(tree):
+    """
+    Return the value that follows the "Height" heading, or None if absent.
+
+    When the text contains a parenthesised "(<digits> cm" part only those
+    digits are returned; otherwise the text is returned unchanged.
+    """
+
+    def centimetres(raw):
+        return re.sub(r".*\((\d+)\s+cm.*", r"\1", raw)
+
+    height_texts = tree.xpath(
+        '//div/p[text()="Height"]/following-sibling::p[1]//text()'
+    )
+    return maybe(height_texts, centimetres)
+
+
+def performer_country(tree):
+    """
+    Return guess_nationality() applied to the "Nationality" value, or None if absent.
+
+    A value starting with "American," followed by more text is collapsed
+    to plain "American" before being handed to guess_nationality.
+    """
+
+    def to_country(nationality):
+        simplified = re.sub(r"^American,.+", "American", nationality)
+        return guess_nationality(simplified)
+
+    nationality_texts = tree.xpath(
+        '//div/p[text()="Nationality"]/following-sibling::p[1]//text()'
+    )
+    return maybe(nationality_texts, to_country)
+
+
+def performer_ethnicity(tree):
+    """Return the biodata text from the div that holds the "Ethnicity" heading, or None."""
+    ethnicity_texts = tree.xpath(
+        '//div[p[text()="Ethnicity"]]/p[@class="biodata"][1]//text()'
+    )
+    return maybe(ethnicity_texts)
+
+
+def performer_deathdate(tree):
+    """Return the "Date of Death" value converted by clean_date, or None if absent."""
+    death_texts = tree.xpath(
+        '(//p[@class="bioheading"][text()="Date of Death"]/following-sibling::p)[1]//text()'
+    )
+    return maybe(death_texts, clean_date)
+
+
+def performer_birthdate(tree):
+    """Return the "Birthday" value converted by clean_date, or None if absent."""
+    birthday_texts = tree.xpath(
+        '(//p[@class="bioheading"][text()="Birthday"]/following-sibling::p)[1]//text()'
+    )
+    return maybe(birthday_texts, clean_date)
+
+
+def performer_instagram(tree):
+    """Return the href of the first biodata link whose text contains "http://instagram.com/", or None."""
+    instagram_links = tree.xpath(
+        '//p[@class="biodata"]/a[contains(text(),"http://instagram.com/")]/@href'
+    )
+    return maybe(instagram_links)
+
+
+def performer_twitter(tree):
+    """Return the href of the first biodata link whose text contains "http://twitter.com/", or None."""
+    twitter_links = tree.xpath(
+        '//p[@class="biodata"]/a[contains(text(),"http://twitter.com/")]/@href'
+    )
+    return maybe(twitter_links)
+
+
+def performer_url(tree):
+    """
+    Return the absolute IAFD URL of the performer page, or None.
+
+    The path is taken from the first "person.rme" link inside the
+    "perfwith" div and prefixed with the site origin.
+    """
+
+    def absolute(path):
+        return f"https://www.iafd.com{path}"
+
+    person_links = tree.xpath(
+        '//div[@id="perfwith"]//*[contains(@href,"person.rme")]/@href'
+    )
+    return maybe(person_links, absolute)
+
+
+def performer_gender(tree):
+    """
+    Return the performer's gender from the "correct" form, or None if absent.
+
+    The form's Gender value is expanded by map_gender and prefixed with
+    "Transgender " when the form's PerfID contains one of the markers
+    "_ts", "_ftm" or "_mtf".
+    """
+    trans_markers = ("_ts", "_ftm", "_mtf")
+
+    def with_trans_prefix(gender: str):
+        perf_ids = tree.xpath('//form[@id="correct"]/input[@name="PerfID"]/@value')
+        perf_id = perf_ids[0] if perf_ids else ""
+        # IAFD are not consistent with their URLs, hence several markers
+        for marker in trans_markers:
+            if marker in perf_id:
+                return "Transgender " + map_gender(gender)
+        return map_gender(gender)
+
+    gender_values = tree.xpath('//form[@id="correct"]/input[@name="Gender"]/@value')
+    return maybe(gender_values, with_trans_prefix)
+
+
+def performer_name(tree):
+    """Return the page's first usable <h1> text with surrounding whitespace removed, or None."""
+    headings = tree.xpath("//h1/text()")
+    return maybe(headings, str.strip)
+
+
+def performer_piercings(tree):
+    """Return the text that follows the "Piercings" heading, or None if absent."""
+    piercing_texts = tree.xpath(
+        '//div/p[text()="Piercings"]/following-sibling::p[1]//text()'
+    )
+    return maybe(piercing_texts)
+
+
+def performer_tattoos(tree):
+    """Return the text that follows the "Tattoos" heading, or None if absent."""
+    tattoo_texts = tree.xpath(
+        '//div/p[text()="Tattoos"]/following-sibling::p[1]//text()'
+    )
+    return maybe(tattoo_texts)
+
+
+def performer_aliases(tree):
+    """
+    Return the performer's aliases as one ", "-separated string, or None.
+
+    The "Performer AKA" text is split on ", ", each part is passed through
+    clean_alias, and parts that come back empty or None are dropped.
+    """
+
+    def rejoin(aliases):
+        cleaned = [clean_alias(alias) for alias in aliases.split(", ")]
+        return ", ".join(alias for alias in cleaned if alias)
+
+    alias_texts = tree.xpath(
+        '//div[p[@class="bioheading" and contains(normalize-space(text()),"Performer AKA")]]//div[@class="biodata" and not(text()="No known aliases")]/text()'
+    )
+    return maybe(alias_texts, rejoin)
+
+
+def performer_careerlength(tree):
+    """
+    Return the biodata text containing "Started around", or None if absent.
+
+    A trailing run of non-digits, two digits, then non-digits is stripped
+    from the end of the text (presumably an appended duration note).
+    """
+
+    def drop_trailing_note(career):
+        return re.sub(r"(\D+\d\d\D+)$", "", career)
+
+    career_texts = tree.xpath(
+        '//div/p[@class="biodata"][contains(text(),"Started around")]/text()'
+    )
+    return maybe(career_texts, drop_trailing_note)
+
+
+def performer_measurements(tree):
+    """Return the text that follows the "Measurements" heading, or None if absent."""
+    measurement_texts = tree.xpath(
+        '//div/p[text()="Measurements"]/following-sibling::p[1]//text()'
+    )
+    return maybe(measurement_texts)
+
+
+def scene_director(tree):
+    """Return the stripped text that follows the "Director" or "Directors" heading, or None."""
+    director_texts = tree.xpath(
+        '//p[@class="bioheading"][text()="Director" or text()="Directors"]/following-sibling::p[1]//text()'
+    )
+    return maybe(director_texts, str.strip)
+
+
+def scene_studio(tree):
+    """Return {"name": <studio>} built from the text after the "Studio" heading, or None."""
+
+    def as_studio(studio_name):
+        return {"name": studio_name}
+
+    studio_texts = tree.xpath(
+        '//div[@class="col-xs-12 col-sm-3"]//p[text() = "Studio"]/following-sibling::p[1]//text()'
+    )
+    return maybe(studio_texts, as_studio)
+
+
+def scene_details(tree):
+    """Return the first usable text node of the synopsis panel, or None if absent."""
+    synopsis_texts = tree.xpath(
+        '//div[@id="synopsis"]/div[@class="padded-panel"]//text()'
+    )
+    return maybe(synopsis_texts)
+
+
+def scene_date(tree):
+    """Return the "Release Date" value converted by clean_date, or None if absent."""
+    release_texts = tree.xpath(
+        '//div[@class="col-xs-12 col-sm-3"]//p[text() = "Release Date"]/following-sibling::p[1]//text()'
+    )
+    return maybe(release_texts, clean_date)
+
+
+def scene_title(tree):
+    """Return the <h1> text without a trailing four-digit "(YYYY)" suffix, or None."""
+
+    def without_year(title):
+        return re.sub(r"\s*\(\d{4}\)$", "", title.strip())
+
+    headings = tree.xpath("//h1/text()")
+    return maybe(headings, without_year)
+
+
+def movie_studio(tree):
+    """
+    Return {"name": <studio>} for the movie, or None if absent.
+
+    The name is the biodata text following the first heading that contains
+    "Studio" or "Distributor".
+    """
+    # The two contains() tests must be separate operands of "or". Nesting
+    # the second one inside the first makes XPath evaluate
+    # '"Studio" or contains(...)' to a boolean, so the predicate would look
+    # for the substring "true" and never match a real heading.
+    studio_texts = tree.xpath(
+        '//p[@class="bioheading"][contains(text(),"Studio") or contains(text(),"Distributor")]/following-sibling::p[@class="biodata"][1]//text()'
+    )
+    return maybe(studio_texts, lambda s: {"name": s})
+
+
+def movie_date(tree):
+    """
+    Return the movie's release date in stash_date format, or None.
+
+    The "Release Date" value is used when it parses. If there's no release
+    date the year from the title is used for an approximate date
+    ("<year>-01-01"). A title without a parenthesised number yields None
+    instead of being returned as if it were a date.
+    """
+
+    def approximate_from_title(title):
+        # Greedy prefix so the last "(digits)" group in the title wins
+        found = re.search(r".*\(([0-9]+)\)", title)
+        return f"{found.group(1)}-01-01" if found else None
+
+    release_texts = tree.xpath(
+        '//p[@class="bioheading"][contains(text(), "Release Date")]/following-sibling::p[@class="biodata"][1]/text()'
+    )
+    exact = maybe(release_texts, lambda d: clean_date(d.strip()))
+    if exact:
+        return exact
+    return maybe(tree.xpath("//h1/text()"), approximate_from_title)
+
+
+def movie_duration(tree):
+    """
+    Return the movie's running time in seconds as a string, or None.
+
+    The page value is in minutes. The result is kept a string because
+    that's what stash expects. A value that is not a whole number is
+    logged and skipped instead of aborting the whole scrape.
+    """
+
+    def minutes_to_seconds(raw):
+        try:
+            return str(int(raw) * 60)
+        except ValueError:
+            log.warning(f"Unable to parse '{raw}' as a duration in minutes")
+            return None
+
+    minute_texts = tree.xpath(
+        '//p[@class="bioheading"][contains(text(), "Minutes")]/following-sibling::p[@class="biodata"][1]/text()'
+    )
+    return maybe(minute_texts, minutes_to_seconds)
+
+
+def movie_synopsis(tree):
+    """Return the first usable text node of the synopsis panel, or None if absent."""
+    synopsis_texts = tree.xpath(
+        '//div[@id="synopsis"]/div[@class="padded-panel"]//text()'
+    )
+    return maybe(synopsis_texts)
+
+
+def movie_director(tree):
+    """
+    Return the stripped name of the first linked director, or None.
+
+    Both the singular "Director" and plural "Directors" headings are
+    accepted, matching what scene_director accepts.
+    """
+    # contains(..., "Director") also matches "Directors"; the previous
+    # "Directors"-only test missed a singular heading.
+    director_texts = tree.xpath(
+        '//p[@class="bioheading"][contains(text(), "Director")]/following-sibling::p[@class="biodata"][1]/a/text()'
+    )
+    return maybe(director_texts, lambda d: d.strip())
+
+
+def movie_title(tree):
+    """Return the <h1> text without a trailing parenthesised number such as "(2020)", or None."""
+
+    def without_year(title):
+        return re.sub(r"\s*\(\d+\)$", "", title.strip())
+
+    headings = tree.xpath("//h1/text()")
+    return maybe(headings, without_year)
+
+
+# Only create a single scraper: this saves time when scraping multiple pages
+# because it doesn't need to get past Cloudflare each time.
+# scrape() sends every request through this shared module-level instance.
+scraper = cloudscraper.create_scraper()
+
+
+def scrape(url: str, retries=0):
+    """
+    Fetch url with the shared scraper and return it parsed as an lxml tree.
+
+    Timeouts and HTTP statuses >= 400 are retried until the retry counter
+    reaches 10; HTTP errors wait a random 1-4 seconds between attempts.
+
+    :param url: page to fetch
+    :param retries: number of retries already used (callers normally omit it)
+    :return: the document parsed by lxml.html.fromstring
+
+    Exits the process with status 1 when retries are exhausted or when any
+    other exception is raised by the request.
+    """
+    max_retries = 10
+    while True:
+        try:
+            scraped = scraper.get(url, timeout=(3, 7))
+        except requests.exceptions.Timeout as exc_time:
+            log.debug(f"Timeout: {exc_time}")
+            # Timeouts count against the same retry budget as HTTP errors;
+            # without this check a dead host would be retried forever.
+            if retries >= max_retries:
+                log.error(f"Timeout after {retries} retries, giving up")
+                sys.exit(1)
+            retries += 1
+            continue
+        except Exception as e:
+            log.error(f"scrape error {e}")
+            sys.exit(1)
+
+        if scraped.status_code < 400:
+            return html.fromstring(scraped.content)
+
+        if retries >= max_retries:
+            log.error(f"HTTP Error: {scraped.status_code}, giving up")
+            sys.exit(1)
+        wait_time = random.randint(1, 4)
+        log.debug(f"HTTP Error: {scraped.status_code}, waiting {wait_time} seconds")
+        time.sleep(wait_time)
+        retries += 1
+
+
+def performer_query(query):
+    """
+    Search IAFD for performers matching query.
+
+    :param query: free-text performer name
+    :return: list of {"Name": ..., "URL": ...} dicts taken from the female
+        and male result tables; empty (with a warning logged) when nothing
+        matched
+    """
+    # Imported here so this function stays self-contained
+    from urllib.parse import quote_plus
+
+    # Encode the search string: characters such as "&", "#" or "+" in a name
+    # would otherwise be read as URL syntax and cut the query short.
+    tree = scrape(
+        "https://www.iafd.com/results.asp?searchtype=comprehensive"
+        f"&searchstring={quote_plus(query)}"
+    )
+    performer_names = tree.xpath(
+        '//table[@id="tblFem" or @id="tblMal"]//td[a[img]]/following-sibling::td[1]/a/text()'
+    )
+    performer_urls = tree.xpath(
+        '//table[@id="tblFem" or @id="tblMal"]//td[a[img]]/following-sibling::td[1]/a/@href'
+    )
+    performers = [
+        {
+            "Name": name,
+            "URL": f"https://www.iafd.com{url}",
+        }
+        for name, url in zip(performer_names, performer_urls)
+    ]
+    if not performers:
+        log.warning(f"No performers found for '{query}'")
+    return performers
+
+
+def performer_from_tree(tree):
+    """
+    Build the performer result dict from a parsed performer page.
+
+    Each field comes from its performer_* extractor, called in the order
+    listed; "images" holds the src of every image in the "headshot" div.
+    """
+    extractors = (
+        ("name", performer_name),
+        ("gender", performer_gender),
+        ("url", performer_url),
+        ("twitter", performer_twitter),
+        ("instagram", performer_instagram),
+        ("birthdate", performer_birthdate),
+        ("death_date", performer_deathdate),
+        ("ethnicity", performer_ethnicity),
+        ("country", performer_country),
+        ("height", performer_height),
+        ("weight", performer_weight),
+        ("hair_color", performer_haircolor),
+        ("measurements", performer_measurements),
+        ("career_length", performer_careerlength),
+        ("aliases", performer_aliases),
+        ("tattoos", performer_tattoos),
+        ("piercings", performer_piercings),
+    )
+    performer = {field: extract(tree) for field, extract in extractors}
+    performer["images"] = tree.xpath('//div[@id="headshot"]//img/@src')
+    return performer
+
+
+def scene_from_tree(tree):
+    """
+    Build the scene result dict from a parsed title page.
+
+    Scalar fields come from the scene_* extractors. "performers" has one
+    entry per link in the "castbox" divs, with the link text, the absolute
+    URL and the src of any image inside the link.
+    """
+    extractors = (
+        ("title", scene_title),
+        ("date", scene_date),
+        ("details", scene_details),
+        ("director", scene_director),
+        ("studio", scene_studio),
+    )
+    scene = {field: extract(tree) for field, extract in extractors}
+
+    cast = []
+    for link in tree.xpath('//div[@class="castbox"]/p/a'):
+        cast.append(
+            {
+                "name": link.text_content(),
+                "url": f"https://www.iafd.com{link.get('href')}",
+                "images": link.xpath("img/@src"),
+            }
+        )
+    scene["performers"] = cast
+    return scene
+
+
+def movie_from_tree(tree):
+    """
+    Build the movie result dict from a parsed title page.
+
+    Fields come from the movie_* extractors; "aliases" is every text node
+    under the <dd> elements of the "col-sm-12" div joined with ", ".
+    """
+    movie = {}
+    movie["name"] = movie_title(tree)
+    movie["director"] = movie_director(tree)
+    movie["synopsis"] = movie_synopsis(tree)
+    movie["duration"] = movie_duration(tree)
+    movie["date"] = movie_date(tree)
+    alias_texts = tree.xpath('//div[@class="col-sm-12"]/dl/dd//text()')
+    movie["aliases"] = ", ".join(alias_texts)
+    movie["studio"] = movie_studio(tree)
+    return movie
+
+
+def main():
+    """
+    Command-line entry point.
+
+    Parses the operation (search, performer, movie or scene) and its
+    optional argument from the command line. When stdin is not a terminal,
+    a JSON object read from it is merged over the parsed arguments. The
+    result is printed to stdout as JSON; "search" prints the list of
+    matches, the other operations print the scraped dict with empty
+    values removed. Exits with status 1 on missing input or invalid JSON.
+    """
+    parser = argparse.ArgumentParser("IAFD Scraper", argument_default="")
+    subparsers = parser.add_subparsers(
+        dest="operation", help="Operation to perform", required=True
+    )
+
+    # (operation, operation help, positional name, positional help)
+    commands = (
+        ("search", "Search for performers", "name", "Name to search for"),
+        ("performer", "Scrape a performer", "url", "Performer URL"),
+        ("movie", "Scrape a movie", "url", "Movie URL"),
+        ("scene", "Scrape a scene", "url", "Scene URL"),
+    )
+    for command, summary, positional, positional_help in commands:
+        subparser = subparsers.add_parser(command, help=summary)
+        subparser.add_argument(positional, nargs="?", help=positional_help)
+
+    if len(sys.argv) == 1:
+        parser.print_help(sys.stderr)
+        sys.exit(1)
+
+    args = parser.parse_args()
+    log.debug(f"Arguments from commandline: {args}")
+
+    # Script is being piped into, probably by Stash
+    piped = not sys.stdin.isatty()
+    if piped:
+        try:
+            frag = json.load(sys.stdin)
+            vars(args).update(frag)
+            log.debug(f"With arguments from stdin: {args}")
+        except json.decoder.JSONDecodeError:
+            log.error("Received invalid JSON from stdin")
+            sys.exit(1)
+
+    if args.operation == "search":
+        name = args.name
+        if not name:
+            log.error("No query provided")
+            sys.exit(1)
+        log.debug(f"Searching for '{name}'")
+        print(json.dumps(performer_query(name)))
+        sys.exit(0)
+
+    url = args.url
+    if not url:
+        log.error("No URL provided")
+        sys.exit(1)
+
+    log.debug(f"{args.operation} scraping '{url}'")
+    scraped = scrape(url)
+
+    # An operation without a builder yields an empty result
+    builders = {
+        "performer": performer_from_tree,
+        "movie": movie_from_tree,
+        "scene": scene_from_tree,
+    }
+    build = builders.get(args.operation)
+    result = build(scraped) if build else {}
+
+    print(json.dumps(cleandict(result)))
+
+
+# Run the command-line interface only when the file is executed as a script
+if __name__ == "__main__":
+    main()