import json import re import requests import sys from unicodedata import normalize from html.parser import HTMLParser import py_common.log as log from py_common.types import ScrapedMovie, ScrapedPerformer, ScrapedScene, ScrapedStudio from py_common.util import dig, guess_nationality, replace_all, scraper_args # Maps the `site_domain` key from the API # to studio names currently used on StashDB studio_map = { "2girls1camera.com": "2 Girls 1 Camera", "allanal.com": "All Anal", "alterotic.com": "Alt Erotic", "amazingfilms.com": "Amazing Films", "analonly.com": "Anal Only", "analjesse.com": "Anal Jesse", "benefitmonkey.com": "Benefit Monkey", "biggulpgirls.com": "Big Gulp Girls", "bjraw.com": "BJ Raw", "blackbullchallenge.com": "Black Bull Challenge", "cougarseason.com": "Cougar Season", "creampiethais.com": "Creampie Thais", "deepthroatsirens.com": "Deepthroat Sirens", "dirtyauditions.com": "Dirty Auditions", "divine-dd.com": "Divine-DD", "facialsforever.com": "Facials Forever", "freakmobmedia.com": "FreakMob Media", "gogobarauditions.com": "Gogo Bar Auditions", "gotfilled.com": "Got Filled", "hobybuchanon.com": "Hoby Buchanon", "inkedpov.com": "Inked POV", "inserted.com": "Inserted", "jav888.com": "JAV888", "lady-sonia.com": "Lady Sonia", "lezkey.com": "LezKey", "lucidflix.com": "LucidFlix", "meanfeetfetish.com": "Mean Feet Fetish", "members.hobybuchanon.com": "Hoby Buchanon", "mongerinasia.com": "Monger In Asia", "nylonperv.com": "Nylon Perv", "nympho.com": "Nympho", "poundedpetite.com": "Pounded Petite", "premium-nickmarxx.com": "Nick Marxx", "red-xxx.com": "Red-XXX", "rickysroom.com": "Ricky's Room", "s3xus.com": "S3XUS", "seska.com": "Seska", "sexymodernbull.com": "Sexy Modern Bull", "shesbrandnew.com": "She's Brand New", "sidechick.com": "SIDECHICK", "suckthisdick.com": "Suck This Dick", "swallowed.com": "Swallowed", "thaigirlswild.com": "Thai Girls Wild", "topwebmodels.com": "Top Web Models", "trueanal.com": "True Anal", "twmclassics.com": "TWM Classics", "xful.com": "Xful", "yesgirlz.com": "Yes Girlz", "yummycouple.com": "Yummy Couple", "z-filmz-originals.com": "Z-Filmz", } def clean_url(url: str) -> str: # remove any query parameters return re.sub(r"\?.*", "", url) # Some sites only work with the `tour.` subdomain def fix_url(url: str) -> str: url = url.replace("twmclassics.com", "topwebmodels.com") url = url.replace("suckthisdick.com", "hobybuchanon.com") url = url.replace("premium-nickmarxx.com", "nickmarxx.com") tour_domain = ( "nympho", "allanal", "analonly", "2girls1camera", "biggulpgirls", "deepthroatsirens", "facialsforever", "poundedpetite", "seska", "swallowed", "shesbrandnew", "topwebmodels", "trueanal", "twmclassics", ) return re.sub(rf"//(? str: class ToPlainText(HTMLParser): def __init__(self): super().__init__() self.reset() self.strict = False self.convert_charrefs = True self.text = [] def handle_data(self, d): self.text.append(d) def get_data(self): return "".join(self.text) s = ToPlainText() s.feed(html) return normalize("NFKD", s.get_data()) def fetch_page_props(url: str) -> dict | None: r = requests.get(url) if r.status_code != 200: log.error(f"Failed to fetch page HTML: {r.status_code}") return None matches = re.findall( r'(?: