stash
This commit is contained in:
300
stash/config/scrapers/community/GEVI/GEVI.py
Normal file
300
stash/config/scrapers/community/GEVI/GEVI.py
Normal file
@@ -0,0 +1,300 @@
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from urllib.parse import urlparse, urlencode
|
||||
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
import cloudscraper
|
||||
|
||||
from py_common.config import get_config
|
||||
from py_common.types import ScrapedPerformer, ScrapedScene, ScrapedMovie, ScrapedStudio
|
||||
from py_common.util import scraper_args
|
||||
import py_common.log as log
|
||||
|
||||
|
||||
config = get_config(
|
||||
default="""# Should we include the parenthesized disambiguation in performer names?
|
||||
# false: "John Doe"
|
||||
# true: "John Doe (II)"
|
||||
|
||||
# For names
|
||||
disambiguate_names = False
|
||||
|
||||
# For aliases
|
||||
disambiguate_aliases = False
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
base_url = urlparse("https://gayeroticvideoindex.com")
|
||||
|
||||
|
||||
def abs_url(url: str) -> str:
|
||||
if url.startswith("http"):
|
||||
return url
|
||||
return base_url._replace(path=url).geturl()
|
||||
|
||||
|
||||
scraper = cloudscraper.create_scraper()
|
||||
scraper.headers.update({"Referer": base_url.netloc})
|
||||
|
||||
|
||||
def parse_name(name: str) -> tuple[str, str | None]:
|
||||
"Parses a name and optional disambiguation from a string"
|
||||
match = re.match(r"^(.+?)(?:\s*\((.*?)\))?$", name)
|
||||
if match:
|
||||
return match.group(1), match.group(2)
|
||||
return name, None
|
||||
|
||||
|
||||
# For performer/studio links from episode/movie pages
|
||||
def name_with_url(link: Tag) -> dict:
|
||||
name = link.get_text(strip=True)
|
||||
if not config.disambiguate_names:
|
||||
name, _ = parse_name(name)
|
||||
|
||||
performer = {"name": name}
|
||||
if (url := link.get("href")) and isinstance(url, str):
|
||||
performer["url"] = abs_url(url)
|
||||
return performer
|
||||
|
||||
|
||||
# Not really a HTML table, but the layout is consistent
|
||||
def from_table(soup: Tag, key: str) -> str | None:
|
||||
if (tag := soup.find("div", string=key)) and (value := tag.find_next("div")):
|
||||
return value.get_text()
|
||||
|
||||
|
||||
def scene_from_url(url: str) -> ScrapedScene | None:
|
||||
res = scraper.get(url)
|
||||
soup = BeautifulSoup(res.text, "html.parser")
|
||||
soup = soup.select("div#data section")
|
||||
if not soup:
|
||||
log.error(f"Cannot find episode section in {url}")
|
||||
return None
|
||||
|
||||
soup = soup[0]
|
||||
|
||||
scene: ScrapedScene = {}
|
||||
|
||||
if title := soup.find("h1"):
|
||||
scene["title"] = title.get_text(strip=True)
|
||||
|
||||
if image := soup.find("img", src=lambda x: "Episodes" in x):
|
||||
scene["image"] = abs_url(image["src"]) # type: ignore
|
||||
|
||||
if details := soup.find("p"):
|
||||
scene["details"] = details.get_text(strip=True)
|
||||
|
||||
if (date := soup.find("span", string="Date:")) and (date := date.next_sibling):
|
||||
scene["date"] = date.get_text(strip=True)
|
||||
|
||||
if performers := soup.find_all("a", href=lambda x: "performer" in x):
|
||||
scene["performers"] = [ # type: ignore
|
||||
{**name_with_url(p), "gender": "MALE"} for p in performers
|
||||
]
|
||||
|
||||
if studio := soup.find("a", href=lambda x: "company" in x):
|
||||
scene["studio"] = name_with_url(studio) # type: ignore
|
||||
|
||||
scene["url"] = url
|
||||
|
||||
return scene
|
||||
|
||||
|
||||
def scene_from_fragment(args: dict) -> ScrapedScene | None:
|
||||
if url := args.get("url"):
|
||||
return scene_from_url(url)
|
||||
log.error("Cannot scrape scene without a URL")
|
||||
|
||||
|
||||
hair_map = {
|
||||
"Blond": "Blonde",
|
||||
"Brown": "Brunette",
|
||||
}
|
||||
|
||||
# GEVI tracks skin color so there's no way to really know ethnicity
|
||||
ethnicity_map = {
|
||||
"White": "Caucasian",
|
||||
}
|
||||
|
||||
|
||||
def performer_from_url(url: str) -> ScrapedPerformer | None:
|
||||
res = scraper.get(url)
|
||||
soup = BeautifulSoup(res.text, "html.parser")
|
||||
soup = soup.select("div#data section")
|
||||
if not soup:
|
||||
log.error(f"Cannot find performer section in {url}")
|
||||
return None
|
||||
|
||||
soup = soup[0]
|
||||
|
||||
if not (name := soup.find("h1", attrs={"class": "text-yellow-200"})):
|
||||
log.error(f"Cannot find performer name in {url}")
|
||||
return None
|
||||
|
||||
if config.disambiguate_names:
|
||||
name, disambiguation = parse_name(name.text)
|
||||
else:
|
||||
name = name.text
|
||||
disambiguation = None
|
||||
|
||||
performer: ScrapedPerformer = {
|
||||
"name": name,
|
||||
"url": url,
|
||||
"gender": "MALE",
|
||||
}
|
||||
|
||||
if disambiguation:
|
||||
performer["disambiguation"] = disambiguation
|
||||
|
||||
if image := soup.find("img", src=lambda x: "Stars" in x):
|
||||
performer["image"] = base_url._replace(path=image["src"]).geturl() # type: ignore
|
||||
|
||||
if (hair_color := from_table(soup, "Hair:")) and (hair := hair_map.get(hair_color)):
|
||||
performer["hair_color"] = hair.split(",")[0] # type: ignore because we've mapped all hair colors
|
||||
|
||||
if eye_color := from_table(soup, "Eyes:"):
|
||||
performer["eye_color"] = eye_color # type: ignore
|
||||
|
||||
if height := from_table(soup, "Height:"):
|
||||
performer["height"] = height.split("/")[-1].strip().removesuffix("cm")
|
||||
|
||||
if foreskin := from_table(soup, "Foreskin:"):
|
||||
performer["circumcised"] = foreskin
|
||||
|
||||
if dick_size := from_table(soup, "Dick Size:"):
|
||||
performer["penis_length"] = dick_size.split("/")[-1].strip().removesuffix("cm")
|
||||
|
||||
if weight := from_table(soup, "Weight:"):
|
||||
performer["weight"] = weight.split("/")[-1].strip().removesuffix("kg")
|
||||
|
||||
if tattoos := from_table(soup, "Tattoos:"):
|
||||
performer["tattoos"] = tattoos.strip()
|
||||
|
||||
if skin_color := from_table(soup, "Skin:"):
|
||||
performer["ethnicity"] = ethnicity_map.get(skin_color, skin_color) # type: ignore
|
||||
|
||||
if country := from_table(soup, "From:"):
|
||||
performer["country"] = country.split(",")[-1].strip()
|
||||
|
||||
if birth_year := from_table(soup, "Born:"):
|
||||
# Unfortunately GEVI only tracks birth years, not full dates
|
||||
performer["birthdate"] = f"{birth_year}-01-01"
|
||||
|
||||
if death_year := from_table(soup, "Died:"):
|
||||
performer["death_date"] = f"{death_year}-01-01"
|
||||
|
||||
if (bio := soup.find("div", string="Notes:")) and (bio := bio.find_next("div")):
|
||||
performer["details"] = bio.get_text(separator="\n")
|
||||
|
||||
if aliases := soup.find_all("h2"):
|
||||
if config.disambiguate_aliases:
|
||||
performer["aliases"] = ", ".join(alias.text for alias in aliases)
|
||||
else:
|
||||
deduplicated = {parse_name(alias.text)[0] for alias in aliases}
|
||||
performer["aliases"] = ", ".join(sorted(deduplicated))
|
||||
|
||||
return performer
|
||||
|
||||
|
||||
def performer_from_fragment(args: dict) -> ScrapedPerformer | None:
|
||||
if url := args.get("url"):
|
||||
return performer_from_url(url)
|
||||
elif (name := args.get("name")) and (
|
||||
candidate := next(iter(performer_search(name)), None)
|
||||
):
|
||||
return performer_from_url(candidate["url"]) # type: ignore because we know url will be set
|
||||
log.error("Cannot scrape performer without a URL or name")
|
||||
|
||||
|
||||
def performer_search(name: str) -> list[ScrapedPerformer]:
|
||||
search_params = {
|
||||
"draw": 2,
|
||||
"start": 0,
|
||||
"length": 10,
|
||||
"search[value]": name,
|
||||
"search[regex]": "false",
|
||||
}
|
||||
search_url = base_url._replace(path="shpr", query=urlencode(search_params)).geturl()
|
||||
res = scraper.get(search_url)
|
||||
found = [BeautifulSoup(x[1], "html.parser").contents[0] for x in res.json()["data"]]
|
||||
return [
|
||||
{"name": found.text, "url": base_url._replace(path=found["href"]).geturl()} # type: ignore
|
||||
for found in found
|
||||
]
|
||||
|
||||
|
||||
def movie_from_url(url: str) -> ScrapedMovie | None:
|
||||
res = scraper.get(url)
|
||||
soup = BeautifulSoup(res.text, "html.parser")
|
||||
movie_section = next(iter(soup.select("section#data section")), None)
|
||||
if not movie_section:
|
||||
log.error(f"Cannot find movie section in {url}")
|
||||
return None
|
||||
|
||||
movie: ScrapedMovie = {}
|
||||
|
||||
if name := movie_section.find("h1"):
|
||||
movie["name"] = name.get_text(strip=True)
|
||||
|
||||
if covers := movie_section.find_all("img", src=lambda x: "Covers" in x):
|
||||
movie["front_image"] = abs_url(covers[0]["src"])
|
||||
if len(covers) > 1:
|
||||
movie["back_image"] = abs_url(covers[1]["src"])
|
||||
|
||||
if (details := soup.find("span", string="Description source:")) and (
|
||||
(details := details.parent) and (details := details.find_next("div"))
|
||||
):
|
||||
movie["synopsis"] = details.get_text(strip=True)
|
||||
|
||||
if (table := movie_section.find("table")) and isinstance(table, Tag):
|
||||
headers = [th.get_text() for th in table.find_all("th")]
|
||||
values = table.find_all("td")
|
||||
table = dict(zip(headers, values))
|
||||
|
||||
if length := table.get("Length"):
|
||||
movie["duration"] = f"{length.get_text(strip=True)}:00"
|
||||
|
||||
if released := table.get("Released"):
|
||||
# Unfortunately GEVI only tracks release years, not full dates
|
||||
movie["date"] = f"{released.get_text(strip=True)}-01-01"
|
||||
|
||||
if distributor_cell := table.get("Distributor"):
|
||||
distributor: ScrapedStudio = name_with_url(distributor_cell.find("a")) # type: ignore
|
||||
if (studio := distributor_cell.find("br")) and (
|
||||
studio := studio.next_sibling.get_text(strip=True)
|
||||
):
|
||||
movie["studio"] = {"name": studio, "parent": distributor}
|
||||
else:
|
||||
movie["studio"] = distributor
|
||||
|
||||
if directors := movie_section.find_all("a", href=lambda x: "director" in x):
|
||||
movie["director"] = ", ".join(d.get_text(strip=True) for d in directors)
|
||||
|
||||
movie["url"] = url
|
||||
|
||||
return movie
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
op, args = scraper_args()
|
||||
result = None
|
||||
match op, args:
|
||||
case "scene-by-url", {"url": url} if url:
|
||||
result = scene_from_url(url)
|
||||
case "scene-by-fragment", args:
|
||||
result = scene_from_fragment(args)
|
||||
case "performer-by-url", {"url": url}:
|
||||
result = performer_from_url(url)
|
||||
case "performer-by-fragment", args:
|
||||
result = performer_from_fragment(args)
|
||||
case "performer-by-name", {"name": name, "extra": _domains} if name:
|
||||
result = performer_search(name)
|
||||
case "movie-by-url", {"url": url} if url:
|
||||
result = movie_from_url(url)
|
||||
case _:
|
||||
log.error(f"Operation: {op}, arguments: {json.dumps(args)}")
|
||||
sys.exit(1)
|
||||
|
||||
print(json.dumps(result))
|
||||
Reference in New Issue
Block a user