import json import re import sys from urllib.parse import urlparse, urlencode from bs4 import BeautifulSoup, Tag import cloudscraper from py_common.config import get_config from py_common.types import ScrapedPerformer, ScrapedScene, ScrapedMovie, ScrapedStudio from py_common.util import scraper_args import py_common.log as log config = get_config( default="""# Should we include the parenthesized disambiguation in performer names? # false: "John Doe" # true: "John Doe (II)" # For names disambiguate_names = False # For aliases disambiguate_aliases = False """ ) base_url = urlparse("https://gayeroticvideoindex.com") def abs_url(url: str) -> str: if url.startswith("http"): return url return base_url._replace(path=url).geturl() scraper = cloudscraper.create_scraper() scraper.headers.update({"Referer": base_url.netloc}) def parse_name(name: str) -> tuple[str, str | None]: "Parses a name and optional disambiguation from a string" match = re.match(r"^(.+?)(?:\s*\((.*?)\))?$", name) if match: return match.group(1), match.group(2) return name, None # For performer/studio links from episode/movie pages def name_with_url(link: Tag) -> dict: name = link.get_text(strip=True) if not config.disambiguate_names: name, _ = parse_name(name) performer = {"name": name} if (url := link.get("href")) and isinstance(url, str): performer["url"] = abs_url(url) return performer # Not really a HTML table, but the layout is consistent def from_table(soup: Tag, key: str) -> str | None: if (tag := soup.find("div", string=key)) and (value := tag.find_next("div")): return value.get_text() def scene_from_url(url: str) -> ScrapedScene | None: res = scraper.get(url) soup = BeautifulSoup(res.text, "html.parser") soup = soup.select("div#data section") if not soup: log.error(f"Cannot find episode section in {url}") return None soup = soup[0] scene: ScrapedScene = {} if title := soup.find("h1"): scene["title"] = title.get_text(strip=True) if image := soup.find("img", src=lambda x: "Episodes" in x): scene["image"] = abs_url(image["src"]) # type: ignore if details := soup.find("p"): scene["details"] = details.get_text(strip=True) if (date := soup.find("span", string="Date:")) and (date := date.next_sibling): scene["date"] = date.get_text(strip=True) if performers := soup.find_all("a", href=lambda x: "performer" in x): scene["performers"] = [ # type: ignore {**name_with_url(p), "gender": "MALE"} for p in performers ] if studio := soup.find("a", href=lambda x: "company" in x): scene["studio"] = name_with_url(studio) # type: ignore scene["url"] = url return scene def scene_from_fragment(args: dict) -> ScrapedScene | None: if url := args.get("url"): return scene_from_url(url) log.error("Cannot scrape scene without a URL") hair_map = { "Blond": "Blonde", "Brown": "Brunette", } # GEVI tracks skin color so there's no way to really know ethnicity ethnicity_map = { "White": "Caucasian", } def performer_from_url(url: str) -> ScrapedPerformer | None: res = scraper.get(url) soup = BeautifulSoup(res.text, "html.parser") soup = soup.select("div#data section") if not soup: log.error(f"Cannot find performer section in {url}") return None soup = soup[0] if not (name := soup.find("h1", attrs={"class": "text-yellow-200"})): log.error(f"Cannot find performer name in {url}") return None if config.disambiguate_names: name, disambiguation = parse_name(name.text) else: name = name.text disambiguation = None performer: ScrapedPerformer = { "name": name, "url": url, "gender": "MALE", } if disambiguation: performer["disambiguation"] = disambiguation if image := soup.find("img", src=lambda x: "Stars" in x): performer["image"] = base_url._replace(path=image["src"]).geturl() # type: ignore if (hair_color := from_table(soup, "Hair:")) and (hair := hair_map.get(hair_color)): performer["hair_color"] = hair.split(",")[0] # type: ignore because we've mapped all hair colors if eye_color := from_table(soup, "Eyes:"): performer["eye_color"] = eye_color # type: ignore if height := from_table(soup, "Height:"): performer["height"] = height.split("/")[-1].strip().removesuffix("cm") if foreskin := from_table(soup, "Foreskin:"): performer["circumcised"] = foreskin if dick_size := from_table(soup, "Dick Size:"): performer["penis_length"] = dick_size.split("/")[-1].strip().removesuffix("cm") if weight := from_table(soup, "Weight:"): performer["weight"] = weight.split("/")[-1].strip().removesuffix("kg") if tattoos := from_table(soup, "Tattoos:"): performer["tattoos"] = tattoos.strip() if skin_color := from_table(soup, "Skin:"): performer["ethnicity"] = ethnicity_map.get(skin_color, skin_color) # type: ignore if country := from_table(soup, "From:"): performer["country"] = country.split(",")[-1].strip() if birth_year := from_table(soup, "Born:"): # Unfortunately GEVI only tracks birth years, not full dates performer["birthdate"] = f"{birth_year}-01-01" if death_year := from_table(soup, "Died:"): performer["death_date"] = f"{death_year}-01-01" if (bio := soup.find("div", string="Notes:")) and (bio := bio.find_next("div")): performer["details"] = bio.get_text(separator="\n") if aliases := soup.find_all("h2"): if config.disambiguate_aliases: performer["aliases"] = ", ".join(alias.text for alias in aliases) else: deduplicated = {parse_name(alias.text)[0] for alias in aliases} performer["aliases"] = ", ".join(sorted(deduplicated)) return performer def performer_from_fragment(args: dict) -> ScrapedPerformer | None: if url := args.get("url"): return performer_from_url(url) elif (name := args.get("name")) and ( candidate := next(iter(performer_search(name)), None) ): return performer_from_url(candidate["url"]) # type: ignore because we know url will be set log.error("Cannot scrape performer without a URL or name") def performer_search(name: str) -> list[ScrapedPerformer]: search_params = { "draw": 2, "start": 0, "length": 10, "search[value]": name, "search[regex]": "false", } search_url = base_url._replace(path="shpr", query=urlencode(search_params)).geturl() res = scraper.get(search_url) found = [BeautifulSoup(x[1], "html.parser").contents[0] for x in res.json()["data"]] return [ {"name": found.text, "url": base_url._replace(path=found["href"]).geturl()} # type: ignore for found in found ] def movie_from_url(url: str) -> ScrapedMovie | None: res = scraper.get(url) soup = BeautifulSoup(res.text, "html.parser") movie_section = next(iter(soup.select("section#data section")), None) if not movie_section: log.error(f"Cannot find movie section in {url}") return None movie: ScrapedMovie = {} if name := movie_section.find("h1"): movie["name"] = name.get_text(strip=True) if covers := movie_section.find_all("img", src=lambda x: "Covers" in x): movie["front_image"] = abs_url(covers[0]["src"]) if len(covers) > 1: movie["back_image"] = abs_url(covers[1]["src"]) if (details := soup.find("span", string="Description source:")) and ( (details := details.parent) and (details := details.find_next("div")) ): movie["synopsis"] = details.get_text(strip=True) if (table := movie_section.find("table")) and isinstance(table, Tag): headers = [th.get_text() for th in table.find_all("th")] values = table.find_all("td") table = dict(zip(headers, values)) if length := table.get("Length"): movie["duration"] = f"{length.get_text(strip=True)}:00" if released := table.get("Released"): # Unfortunately GEVI only tracks release years, not full dates movie["date"] = f"{released.get_text(strip=True)}-01-01" if distributor_cell := table.get("Distributor"): distributor: ScrapedStudio = name_with_url(distributor_cell.find("a")) # type: ignore if (studio := distributor_cell.find("br")) and ( studio := studio.next_sibling.get_text(strip=True) ): movie["studio"] = {"name": studio, "parent": distributor} else: movie["studio"] = distributor if directors := movie_section.find_all("a", href=lambda x: "director" in x): movie["director"] = ", ".join(d.get_text(strip=True) for d in directors) movie["url"] = url return movie if __name__ == "__main__": op, args = scraper_args() result = None match op, args: case "scene-by-url", {"url": url} if url: result = scene_from_url(url) case "scene-by-fragment", args: result = scene_from_fragment(args) case "performer-by-url", {"url": url}: result = performer_from_url(url) case "performer-by-fragment", args: result = performer_from_fragment(args) case "performer-by-name", {"name": name, "extra": _domains} if name: result = performer_search(name) case "movie-by-url", {"url": url} if url: result = movie_from_url(url) case _: log.error(f"Operation: {op}, arguments: {json.dumps(args)}") sys.exit(1) print(json.dumps(result))