stash

2025-10-09 20:05:31 -03:00
parent ed22ef22bc
commit 0a5f88d75a
1442 changed files with 101562 additions and 0 deletions
--- a/stash/config/scrapers/community/GEVI/GEVI.py
+++ b/stash/config/scrapers/community/GEVI/GEVI.py
@@ -0,0 +1,300 @@
+import json
+import re
+import sys
+from urllib.parse import urlparse, urlencode
+
+from bs4 import BeautifulSoup, Tag
+import cloudscraper
+
+from py_common.config import get_config
+from py_common.types import ScrapedPerformer, ScrapedScene, ScrapedMovie, ScrapedStudio
+from py_common.util import scraper_args
+import py_common.log as log
+
+
+config = get_config(
+    default="""# Should we include the parenthesized disambiguation in performer names?
+# false: "John Doe" 
+# true:  "John Doe (II)"
+
+# For names
+disambiguate_names = False
+
+# For aliases
+disambiguate_aliases = False
+"""
+)
+
+
+base_url = urlparse("https://gayeroticvideoindex.com")
+
+
+def abs_url(url: str) -> str:
+    if url.startswith("http"):
+        return url
+    return base_url._replace(path=url).geturl()
+
+
+scraper = cloudscraper.create_scraper()
+scraper.headers.update({"Referer": base_url.netloc})
+
+
+def parse_name(name: str) -> tuple[str, str | None]:
+    "Parses a name and optional disambiguation from a string"
+    match = re.match(r"^(.+?)(?:\s*\((.*?)\))?$", name)
+    if match:
+        return match.group(1), match.group(2)
+    return name, None
+
+
+# For performer/studio links from episode/movie pages
+def name_with_url(link: Tag) -> dict:
+    name = link.get_text(strip=True)
+    if not config.disambiguate_names:
+        name, _ = parse_name(name)
+
+    performer = {"name": name}
+    if (url := link.get("href")) and isinstance(url, str):
+        performer["url"] = abs_url(url)
+    return performer
+
+
+# Not really a HTML table, but the layout is consistent
+def from_table(soup: Tag, key: str) -> str | None:
+    if (tag := soup.find("div", string=key)) and (value := tag.find_next("div")):
+        return value.get_text()
+
+
+def scene_from_url(url: str) -> ScrapedScene | None:
+    res = scraper.get(url)
+    soup = BeautifulSoup(res.text, "html.parser")
+    soup = soup.select("div#data section")
+    if not soup:
+        log.error(f"Cannot find episode section in {url}")
+        return None
+
+    soup = soup[0]
+
+    scene: ScrapedScene = {}
+
+    if title := soup.find("h1"):
+        scene["title"] = title.get_text(strip=True)
+
+    if image := soup.find("img", src=lambda x: "Episodes" in x):
+        scene["image"] = abs_url(image["src"])  # type: ignore
+
+    if details := soup.find("p"):
+        scene["details"] = details.get_text(strip=True)
+
+    if (date := soup.find("span", string="Date:")) and (date := date.next_sibling):
+        scene["date"] = date.get_text(strip=True)
+
+    if performers := soup.find_all("a", href=lambda x: "performer" in x):
+        scene["performers"] = [  # type: ignore
+            {**name_with_url(p), "gender": "MALE"} for p in performers
+        ]
+
+    if studio := soup.find("a", href=lambda x: "company" in x):
+        scene["studio"] = name_with_url(studio)  # type: ignore
+
+    scene["url"] = url
+
+    return scene
+
+
+def scene_from_fragment(args: dict) -> ScrapedScene | None:
+    if url := args.get("url"):
+        return scene_from_url(url)
+    log.error("Cannot scrape scene without a URL")
+
+
+hair_map = {
+    "Blond": "Blonde",
+    "Brown": "Brunette",
+}
+
+# GEVI tracks skin color so there's no way to really know ethnicity
+ethnicity_map = {
+    "White": "Caucasian",
+}
+
+
+def performer_from_url(url: str) -> ScrapedPerformer | None:
+    res = scraper.get(url)
+    soup = BeautifulSoup(res.text, "html.parser")
+    soup = soup.select("div#data section")
+    if not soup:
+        log.error(f"Cannot find performer section in {url}")
+        return None
+
+    soup = soup[0]
+
+    if not (name := soup.find("h1", attrs={"class": "text-yellow-200"})):
+        log.error(f"Cannot find performer name in {url}")
+        return None
+
+    if config.disambiguate_names:
+        name, disambiguation = parse_name(name.text)
+    else:
+        name = name.text
+        disambiguation = None
+
+    performer: ScrapedPerformer = {
+        "name": name,
+        "url": url,
+        "gender": "MALE",
+    }
+
+    if disambiguation:
+        performer["disambiguation"] = disambiguation
+
+    if image := soup.find("img", src=lambda x: "Stars" in x):
+        performer["image"] = base_url._replace(path=image["src"]).geturl()  # type: ignore
+
+    if (hair_color := from_table(soup, "Hair:")) and (hair := hair_map.get(hair_color)):
+        performer["hair_color"] = hair.split(",")[0]  # type: ignore because we've mapped all hair colors
+
+    if eye_color := from_table(soup, "Eyes:"):
+        performer["eye_color"] = eye_color  # type: ignore
+
+    if height := from_table(soup, "Height:"):
+        performer["height"] = height.split("/")[-1].strip().removesuffix("cm")
+
+    if foreskin := from_table(soup, "Foreskin:"):
+        performer["circumcised"] = foreskin
+
+    if dick_size := from_table(soup, "Dick Size:"):
+        performer["penis_length"] = dick_size.split("/")[-1].strip().removesuffix("cm")
+
+    if weight := from_table(soup, "Weight:"):
+        performer["weight"] = weight.split("/")[-1].strip().removesuffix("kg")
+
+    if tattoos := from_table(soup, "Tattoos:"):
+        performer["tattoos"] = tattoos.strip()
+
+    if skin_color := from_table(soup, "Skin:"):
+        performer["ethnicity"] = ethnicity_map.get(skin_color, skin_color)  # type: ignore
+
+    if country := from_table(soup, "From:"):
+        performer["country"] = country.split(",")[-1].strip()
+
+    if birth_year := from_table(soup, "Born:"):
+        # Unfortunately GEVI only tracks birth years, not full dates
+        performer["birthdate"] = f"{birth_year}-01-01"
+
+    if death_year := from_table(soup, "Died:"):
+        performer["death_date"] = f"{death_year}-01-01"
+
+    if (bio := soup.find("div", string="Notes:")) and (bio := bio.find_next("div")):
+        performer["details"] = bio.get_text(separator="\n")
+
+    if aliases := soup.find_all("h2"):
+        if config.disambiguate_aliases:
+            performer["aliases"] = ", ".join(alias.text for alias in aliases)
+        else:
+            deduplicated = {parse_name(alias.text)[0] for alias in aliases}
+            performer["aliases"] = ", ".join(sorted(deduplicated))
+
+    return performer
+
+
+def performer_from_fragment(args: dict) -> ScrapedPerformer | None:
+    if url := args.get("url"):
+        return performer_from_url(url)
+    elif (name := args.get("name")) and (
+        candidate := next(iter(performer_search(name)), None)
+    ):
+        return performer_from_url(candidate["url"])  # type: ignore because we know url will be set
+    log.error("Cannot scrape performer without a URL or name")
+
+
+def performer_search(name: str) -> list[ScrapedPerformer]:
+    search_params = {
+        "draw": 2,
+        "start": 0,
+        "length": 10,
+        "search[value]": name,
+        "search[regex]": "false",
+    }
+    search_url = base_url._replace(path="shpr", query=urlencode(search_params)).geturl()
+    res = scraper.get(search_url)
+    found = [BeautifulSoup(x[1], "html.parser").contents[0] for x in res.json()["data"]]
+    return [
+        {"name": found.text, "url": base_url._replace(path=found["href"]).geturl()}  # type: ignore
+        for found in found
+    ]
+
+
+def movie_from_url(url: str) -> ScrapedMovie | None:
+    res = scraper.get(url)
+    soup = BeautifulSoup(res.text, "html.parser")
+    movie_section = next(iter(soup.select("section#data section")), None)
+    if not movie_section:
+        log.error(f"Cannot find movie section in {url}")
+        return None
+
+    movie: ScrapedMovie = {}
+
+    if name := movie_section.find("h1"):
+        movie["name"] = name.get_text(strip=True)
+
+    if covers := movie_section.find_all("img", src=lambda x: "Covers" in x):
+        movie["front_image"] = abs_url(covers[0]["src"])
+        if len(covers) > 1:
+            movie["back_image"] = abs_url(covers[1]["src"])
+
+    if (details := soup.find("span", string="Description source:")) and (
+        (details := details.parent) and (details := details.find_next("div"))
+    ):
+        movie["synopsis"] = details.get_text(strip=True)
+
+    if (table := movie_section.find("table")) and isinstance(table, Tag):
+        headers = [th.get_text() for th in table.find_all("th")]
+        values = table.find_all("td")
+        table = dict(zip(headers, values))
+
+        if length := table.get("Length"):
+            movie["duration"] = f"{length.get_text(strip=True)}:00"
+
+        if released := table.get("Released"):
+            # Unfortunately GEVI only tracks release years, not full dates
+            movie["date"] = f"{released.get_text(strip=True)}-01-01"
+
+        if distributor_cell := table.get("Distributor"):
+            distributor: ScrapedStudio = name_with_url(distributor_cell.find("a"))  # type: ignore
+            if (studio := distributor_cell.find("br")) and (
+                studio := studio.next_sibling.get_text(strip=True)
+            ):
+                movie["studio"] = {"name": studio, "parent": distributor}
+            else:
+                movie["studio"] = distributor
+
+    if directors := movie_section.find_all("a", href=lambda x: "director" in x):
+        movie["director"] = ", ".join(d.get_text(strip=True) for d in directors)
+
+    movie["url"] = url
+
+    return movie
+
+
+if __name__ == "__main__":
+    op, args = scraper_args()
+    result = None
+    match op, args:
+        case "scene-by-url", {"url": url} if url:
+            result = scene_from_url(url)
+        case "scene-by-fragment", args:
+            result = scene_from_fragment(args)
+        case "performer-by-url", {"url": url}:
+            result = performer_from_url(url)
+        case "performer-by-fragment", args:
+            result = performer_from_fragment(args)
+        case "performer-by-name", {"name": name, "extra": _domains} if name:
+            result = performer_search(name)
+        case "movie-by-url", {"url": url} if url:
+            result = movie_from_url(url)
+        case _:
+            log.error(f"Operation: {op}, arguments: {json.dumps(args)}")
+            sys.exit(1)
+
+    print(json.dumps(result))