301 lines
9.6 KiB
Python
301 lines
9.6 KiB
Python
import json
|
|
import re
|
|
import sys
|
|
from urllib.parse import urlparse, urlencode
|
|
|
|
from bs4 import BeautifulSoup, Tag
|
|
import cloudscraper
|
|
|
|
from py_common.config import get_config
|
|
from py_common.types import ScrapedPerformer, ScrapedScene, ScrapedMovie, ScrapedStudio
|
|
from py_common.util import scraper_args
|
|
import py_common.log as log
|
|
|
|
|
|
# User-tweakable settings: get_config parses the TOML-style default string
# below and overlays any overrides from the user's local config file.
config = get_config(
    default="""# Should we include the parenthesized disambiguation in performer names?
# false: "John Doe"
# true: "John Doe (II)"

# For names
disambiguate_names = False

# For aliases
disambiguate_aliases = False
"""
)
|
|
|
|
|
|
# Parsed once so helpers can derive absolute URLs from site-relative paths.
base_url = urlparse("https://gayeroticvideoindex.com")


def abs_url(url: str) -> str:
    """Resolve a site-relative path against the GEVI base URL.

    Already-absolute URLs (anything starting with "http") pass through
    unchanged.
    """
    return url if url.startswith("http") else base_url._replace(path=url).geturl()
|
|
|
|
|
|
# Shared HTTP session; cloudscraper handles the site's Cloudflare challenge.
# NOTE(review): the Referer header is set to the bare hostname, not a full
# URL — confirm the site accepts this form.
scraper = cloudscraper.create_scraper()
scraper.headers.update({"Referer": base_url.netloc})
|
|
|
|
|
|
def parse_name(name: str) -> tuple[str, str | None]:
|
|
"Parses a name and optional disambiguation from a string"
|
|
match = re.match(r"^(.+?)(?:\s*\((.*?)\))?$", name)
|
|
if match:
|
|
return match.group(1), match.group(2)
|
|
return name, None
|
|
|
|
|
|
# Builds name/url dicts from performer and studio links on episode/movie pages.
def name_with_url(link: Tag) -> dict:
    """Turn an anchor tag into a {"name": ..., "url": ...} dict.

    The "url" key is only present when the tag has a string href.
    """
    raw_name = link.get_text(strip=True)
    # Strip the "(...)" disambiguation unless the user opted in to keeping it.
    display_name = raw_name if config.disambiguate_names else parse_name(raw_name)[0]

    entry = {"name": display_name}
    href = link.get("href")
    if href and isinstance(href, str):
        entry["url"] = abs_url(href)
    return entry
|
|
|
|
|
|
# Not really a HTML table, but the page uses a consistent label/value layout.
def from_table(soup: Tag, key: str) -> str | None:
    """Return the text of the <div> that follows the label <div> matching *key*.

    Returns None when the label or its value <div> is missing or empty.
    """
    label = soup.find("div", string=key)
    if not label:
        return None
    value = label.find_next("div")
    if not value:
        return None
    return value.get_text()
|
|
|
|
|
|
def scene_from_url(url: str) -> ScrapedScene | None:
    """Scrape a GEVI episode page into a ScrapedScene.

    Returns None (after logging) when the page has no data section.
    """
    res = scraper.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    soup = soup.select("div#data section")
    if not soup:
        log.error(f"Cannot find episode section in {url}")
        return None

    soup = soup[0]

    scene: ScrapedScene = {}

    if title := soup.find("h1"):
        scene["title"] = title.get_text(strip=True)

    # bs4 calls attribute filters with None when a tag lacks the attribute,
    # so guard with bool(x) to avoid `"..." in None` raising TypeError.
    if image := soup.find("img", src=lambda x: bool(x) and "Episodes" in x):
        scene["image"] = abs_url(image["src"])  # type: ignore

    if details := soup.find("p"):
        scene["details"] = details.get_text(strip=True)

    if (date := soup.find("span", string="Date:")) and (date := date.next_sibling):
        scene["date"] = date.get_text(strip=True)

    if performers := soup.find_all("a", href=lambda x: bool(x) and "performer" in x):
        scene["performers"] = [  # type: ignore
            {**name_with_url(p), "gender": "MALE"} for p in performers
        ]

    if studio := soup.find("a", href=lambda x: bool(x) and "company" in x):
        scene["studio"] = name_with_url(studio)  # type: ignore

    scene["url"] = url

    return scene
|
|
|
|
|
|
def scene_from_fragment(args: dict) -> ScrapedScene | None:
    """Scrape a scene from a Stash fragment; only the URL field is usable."""
    url = args.get("url")
    if not url:
        log.error("Cannot scrape scene without a URL")
        return None
    return scene_from_url(url)
|
|
|
|
|
|
# Map GEVI's hair-color labels onto the terms Stash expects; labels not
# listed here are dropped by the caller.
hair_map = {
    "Blond": "Blonde",
    "Brown": "Brunette",
}

# GEVI tracks skin color so there's no way to really know ethnicity
ethnicity_map = {
    "White": "Caucasian",
}
|
|
|
|
|
|
def performer_from_url(url: str) -> ScrapedPerformer | None:
    """Scrape a GEVI performer page into a ScrapedPerformer.

    Returns None (after logging) when the page has no data section or no
    performer name heading.
    """
    res = scraper.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    soup = soup.select("div#data section")
    if not soup:
        log.error(f"Cannot find performer section in {url}")
        return None

    soup = soup[0]

    if not (name := soup.find("h1", attrs={"class": "text-yellow-200"})):
        log.error(f"Cannot find performer name in {url}")
        return None

    # NOTE(review): this branch looks inverted relative to name_with_url and
    # the config doc ("true" should KEEP the parenthesized part in the name);
    # behavior kept as-is pending confirmation of the intended semantics.
    if config.disambiguate_names:
        name, disambiguation = parse_name(name.text)
    else:
        name = name.text
        disambiguation = None

    performer: ScrapedPerformer = {
        "name": name,
        "url": url,
        "gender": "MALE",
    }

    if disambiguation:
        performer["disambiguation"] = disambiguation

    # bs4 calls attribute filters with None when a tag lacks the attribute,
    # so guard with bool(x) to avoid `"Stars" in None` raising TypeError.
    if image := soup.find("img", src=lambda x: bool(x) and "Stars" in x):
        performer["image"] = base_url._replace(path=image["src"]).geturl()  # type: ignore

    if (hair_color := from_table(soup, "Hair:")) and (hair := hair_map.get(hair_color)):
        performer["hair_color"] = hair.split(",")[0]  # type: ignore because we've mapped all hair colors

    if eye_color := from_table(soup, "Eyes:"):
        performer["eye_color"] = eye_color  # type: ignore

    # Measurements look like "imperial / metric" pairs; keep the metric half
    # and drop the unit suffix.
    if height := from_table(soup, "Height:"):
        performer["height"] = height.split("/")[-1].strip().removesuffix("cm")

    # NOTE(review): stored verbatim — confirm GEVI's foreskin labels map onto
    # Stash's circumcised enum values.
    if foreskin := from_table(soup, "Foreskin:"):
        performer["circumcised"] = foreskin

    if dick_size := from_table(soup, "Dick Size:"):
        performer["penis_length"] = dick_size.split("/")[-1].strip().removesuffix("cm")

    if weight := from_table(soup, "Weight:"):
        performer["weight"] = weight.split("/")[-1].strip().removesuffix("kg")

    if tattoos := from_table(soup, "Tattoos:"):
        performer["tattoos"] = tattoos.strip()

    if skin_color := from_table(soup, "Skin:"):
        performer["ethnicity"] = ethnicity_map.get(skin_color, skin_color)  # type: ignore

    # Presumably "City, State, Country" — keep only the last component.
    if country := from_table(soup, "From:"):
        performer["country"] = country.split(",")[-1].strip()

    if birth_year := from_table(soup, "Born:"):
        # Unfortunately GEVI only tracks birth years, not full dates
        performer["birthdate"] = f"{birth_year}-01-01"

    if death_year := from_table(soup, "Died:"):
        performer["death_date"] = f"{death_year}-01-01"

    if (bio := soup.find("div", string="Notes:")) and (bio := bio.find_next("div")):
        performer["details"] = bio.get_text(separator="\n")

    if aliases := soup.find_all("h2"):
        if config.disambiguate_aliases:
            performer["aliases"] = ", ".join(alias.text for alias in aliases)
        else:
            # Stripping "(...)" can create duplicates, so deduplicate and sort.
            deduplicated = {parse_name(alias.text)[0] for alias in aliases}
            performer["aliases"] = ", ".join(sorted(deduplicated))

    return performer
|
|
|
|
|
|
def performer_from_fragment(args: dict) -> ScrapedPerformer | None:
    """Scrape a performer from a Stash fragment.

    Prefers a direct URL; otherwise searches by name and scrapes the first
    hit. Logs an error and returns None when neither yields a performer.
    """
    if url := args.get("url"):
        return performer_from_url(url)
    if name := args.get("name"):
        matches = performer_search(name)
        if matches:
            return performer_from_url(matches[0]["url"])  # type: ignore because we know url will be set
    log.error("Cannot scrape performer without a URL or name")
|
|
|
|
|
|
def performer_search(name: str) -> list[ScrapedPerformer]:
    """Search GEVI for performers by name via the site's DataTables endpoint.

    Returns up to 10 results as {"name", "url"} dicts.
    """
    # Parameter names follow the DataTables server-side protocol.
    params = {
        "draw": 2,
        "start": 0,
        "length": 10,
        "search[value]": name,
        "search[regex]": "false",
    }
    search_url = base_url._replace(path="shpr", query=urlencode(params)).geturl()
    res = scraper.get(search_url)
    # Each result row's second column is an HTML anchor; parse out the tag.
    links = [BeautifulSoup(row[1], "html.parser").contents[0] for row in res.json()["data"]]
    results: list[ScrapedPerformer] = []
    for link in links:
        results.append(
            {"name": link.text, "url": base_url._replace(path=link["href"]).geturl()}  # type: ignore
        )
    return results
|
|
|
|
|
|
def movie_from_url(url: str) -> ScrapedMovie | None:
    """Scrape a GEVI movie page into a ScrapedMovie.

    Returns None (after logging) when the page has no data section.
    """
    res = scraper.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    movie_section = next(iter(soup.select("section#data section")), None)
    if not movie_section:
        log.error(f"Cannot find movie section in {url}")
        return None

    movie: ScrapedMovie = {}

    if name := movie_section.find("h1"):
        movie["name"] = name.get_text(strip=True)

    # bs4 calls attribute filters with None when a tag lacks the attribute,
    # so guard with bool(x) to avoid `"..." in None` raising TypeError.
    if covers := movie_section.find_all("img", src=lambda x: bool(x) and "Covers" in x):
        movie["front_image"] = abs_url(covers[0]["src"])
        if len(covers) > 1:
            movie["back_image"] = abs_url(covers[1]["src"])

    if (details := soup.find("span", string="Description source:")) and (
        (details := details.parent) and (details := details.find_next("div"))
    ):
        movie["synopsis"] = details.get_text(strip=True)

    if (table := movie_section.find("table")) and isinstance(table, Tag):
        # Pair each <th> header with its <td> cell so values can be looked up
        # by column name.
        headers = [th.get_text() for th in table.find_all("th")]
        values = table.find_all("td")
        table = dict(zip(headers, values))

        if length := table.get("Length"):
            movie["duration"] = f"{length.get_text(strip=True)}:00"

        if released := table.get("Released"):
            # Unfortunately GEVI only tracks release years, not full dates
            movie["date"] = f"{released.get_text(strip=True)}-01-01"

        if distributor_cell := table.get("Distributor"):
            distributor: ScrapedStudio = name_with_url(distributor_cell.find("a"))  # type: ignore
            # The studio name, when present, follows a <br> inside the same
            # cell. Guard next_sibling: a trailing <br> has no sibling.
            if (
                (studio := distributor_cell.find("br"))
                and (studio := studio.next_sibling)
                and (studio := studio.get_text(strip=True))
            ):
                movie["studio"] = {"name": studio, "parent": distributor}
            else:
                movie["studio"] = distributor

    if directors := movie_section.find_all("a", href=lambda x: bool(x) and "director" in x):
        movie["director"] = ", ".join(d.get_text(strip=True) for d in directors)

    movie["url"] = url

    return movie
|
|
|
|
|
|
if __name__ == "__main__":
    # scraper_args parses the operation name and the JSON fragment Stash
    # passes on stdin.
    op, args = scraper_args()
    result = None
    # Dispatch on the Stash scrape operation; unknown operations are fatal.
    match op, args:
        case "scene-by-url", {"url": url} if url:
            result = scene_from_url(url)
        case "scene-by-fragment", args:
            result = scene_from_fragment(args)
        case "performer-by-url", {"url": url}:
            result = performer_from_url(url)
        case "performer-by-fragment", args:
            result = performer_from_fragment(args)
        case "performer-by-name", {"name": name, "extra": _domains} if name:
            result = performer_search(name)
        case "movie-by-url", {"url": url} if url:
            result = movie_from_url(url)
        case _:
            log.error(f"Operation: {op}, arguments: {json.dumps(args)}")
            sys.exit(1)

    # Stash reads the scraped result as JSON from stdout.
    print(json.dumps(result))
|