Files
compose-projects-arr/stash/config/scrapers/community/KBProductions/KBProductions.py
Christoph Califice 0a5f88d75a stash
2025-10-10 09:50:30 -03:00

316 lines
9.5 KiB
Python

import json
import re
import requests
import sys
from unicodedata import normalize
from html.parser import HTMLParser
import py_common.log as log
from py_common.types import ScrapedMovie, ScrapedPerformer, ScrapedScene, ScrapedStudio
from py_common.util import dig, guess_nationality, replace_all, scraper_args
# Maps the `site_domain` key from the API
# to studio names currently used on StashDB.
# Domains not listed here fall back to the raw domain (see get_studio).
studio_map = {
    "2girls1camera.com": "2 Girls 1 Camera",
    "allanal.com": "All Anal",
    "alterotic.com": "Alt Erotic",
    "amazingfilms.com": "Amazing Films",
    "analonly.com": "Anal Only",
    "analjesse.com": "Anal Jesse",
    "benefitmonkey.com": "Benefit Monkey",
    "biggulpgirls.com": "Big Gulp Girls",
    "bjraw.com": "BJ Raw",
    "blackbullchallenge.com": "Black Bull Challenge",
    "cougarseason.com": "Cougar Season",
    "creampiethais.com": "Creampie Thais",
    "deepthroatsirens.com": "Deepthroat Sirens",
    "dirtyauditions.com": "Dirty Auditions",
    "divine-dd.com": "Divine-DD",
    "facialsforever.com": "Facials Forever",
    "freakmobmedia.com": "FreakMob Media",
    "gogobarauditions.com": "Gogo Bar Auditions",
    "gotfilled.com": "Got Filled",
    "hobybuchanon.com": "Hoby Buchanon",
    "inkedpov.com": "Inked POV",
    "inserted.com": "Inserted",
    "jav888.com": "JAV888",
    "lady-sonia.com": "Lady Sonia",
    "lezkey.com": "LezKey",
    "lucidflix.com": "LucidFlix",
    "meanfeetfetish.com": "Mean Feet Fetish",
    "members.hobybuchanon.com": "Hoby Buchanon",
    "mongerinasia.com": "Monger In Asia",
    "nylonperv.com": "Nylon Perv",
    "nympho.com": "Nympho",
    "poundedpetite.com": "Pounded Petite",
    "premium-nickmarxx.com": "Nick Marxx",
    "red-xxx.com": "Red-XXX",
    "rickysroom.com": "Ricky's Room",
    "s3xus.com": "S3XUS",
    "seska.com": "Seska",
    "sexymodernbull.com": "Sexy Modern Bull",
    "shesbrandnew.com": "She's Brand New",
    "sidechick.com": "SIDECHICK",
    "suckthisdick.com": "Suck This Dick",
    "swallowed.com": "Swallowed",
    "thaigirlswild.com": "Thai Girls Wild",
    "topwebmodels.com": "Top Web Models",
    "trueanal.com": "True Anal",
    "twmclassics.com": "TWM Classics",
    "xful.com": "Xful",
    "yesgirlz.com": "Yes Girlz",
    "yummycouple.com": "Yummy Couple",
    "z-filmz-originals.com": "Z-Filmz",
}
def clean_url(url: str) -> str:
    """Return *url* with its query string (everything from '?' on) removed."""
    stripped = re.sub(r"\?.*", "", url)
    return stripped
# Some sites only work with the `tour.` subdomain
def fix_url(url: str) -> str:
    """Rewrite *url* to the domain variant that actually serves pages.

    - Alias domains are mapped to their canonical site first.
    - Sites in `tour_domains` are prefixed with `tour.` unless the URL
      already carries that subdomain.
    """
    url = url.replace("twmclassics.com", "topwebmodels.com")
    url = url.replace("suckthisdick.com", "hobybuchanon.com")
    url = url.replace("premium-nickmarxx.com", "nickmarxx.com")
    tour_domains = (
        "nympho",
        "allanal",
        "analonly",
        "2girls1camera",
        "biggulpgirls",
        "deepthroatsirens",
        "facialsforever",
        "poundedpetite",
        "seska",
        "swallowed",
        "shesbrandnew",
        "topwebmodels",
        "trueanal",
        "twmclassics",
    )
    # Fix: the original used a negative lookbehind `(?<!tour\.)` placed right
    # after "//", a position always preceded by "//" itself — it could never
    # match and was dead code.  The intended guard is a negative LOOKAHEAD,
    # which skips URLs already on the tour. subdomain.
    return re.sub(rf"//(?!tour\.)({'|'.join(tour_domains)})", r"//tour.\1", url)
def strip_tags(html: str) -> str:
    """Reduce an HTML fragment to NFKD-normalized plain text."""

    class _TextExtractor(HTMLParser):
        """Collects every text node encountered while parsing."""

        def __init__(self):
            super().__init__()
            self.reset()
            self.strict = False
            self.convert_charrefs = True  # decode &amp; etc. into characters
            self.chunks = []

        def handle_data(self, data):
            self.chunks.append(data)

    extractor = _TextExtractor()
    extractor.feed(html)
    return normalize("NFKD", "".join(extractor.chunks))
def fetch_page_props(url: str) -> dict | None:
    """Fetch *url* and extract the Next.js page props.

    These sites embed their data as JSON in a `__NEXT_DATA__` script tag;
    returns the `props.pageProps` dict, or a falsy value (None / empty dict)
    when the page cannot be fetched or parsed.
    """
    # Fix: no timeout meant a dead site could hang the scraper forever.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        log.error(f"Failed to fetch page HTML: {r.status_code}")
        return None
    match = re.search(
        r'<script id="__NEXT_DATA__" type="application\/json">({.+})<\/script>',
        r.text,
        re.MULTILINE,
    )
    if not match:
        log.error("Could not find JSON data on page")
        return None
    parsed_json = json.loads(match.group(1))
    if not (content := dig(parsed_json, "props", "pageProps")):
        log.error("Could not find page props in JSON data")
    # A falsy `content` is returned as-is; callers treat it as a failure.
    return content
def make_performer_url(slug: str, site: str) -> str:
    """Build the canonical performer-profile URL on the given site."""
    return "https://{}/models/{}".format(site, slug)
def get_studio(site: str) -> ScrapedStudio:
    """Map an API `site_domain` to its StashDB studio entry.

    Unmapped domains fall back to the raw domain as the studio name.
    """
    studio: ScrapedStudio = {
        "name": studio_map.get(site, site),
        "url": f"https://{site}",
    }
    # Suck This Dick releases under the Hoby Buchanon umbrella
    if studio["name"] == "Suck This Dick":
        studio["parent"] = get_studio("hobybuchanon.com")
    return studio
def to_scraped_performer(raw_performer: dict) -> ScrapedPerformer:
    """Convert a raw API model dict into a ScrapedPerformer.

    `name`, `gender`, `slug` and `site_domain` are required; every other
    field is optional and only emitted when present and parseable.
    """
    performer: ScrapedPerformer = {
        "name": raw_performer["name"],
        "gender": raw_performer["gender"],
        "url": make_performer_url(raw_performer["slug"], raw_performer["site_domain"]),
    }
    if image := raw_performer.get("thumb"):
        performer["image"] = image
    if bio := raw_performer.get("Bio"):
        performer["details"] = strip_tags(bio)
    # "1969-12-31" is the API's epoch-adjacent placeholder for "unknown"
    if (birthdate := raw_performer.get("Birthdate")) and birthdate != "1969-12-31":
        performer["birthdate"] = birthdate
    if measurements := raw_performer.get("Measurements"):
        performer["measurements"] = measurements
    if eye_color := raw_performer.get("Eyes"):
        performer["eye_color"] = eye_color
    if ethnicity := raw_performer.get("Ethnicity"):
        performer["ethnicity"] = ethnicity
    # Height arrives as feet/inches (e.g. 5'7"); convert to whole centimeters
    if (height_ft := raw_performer.get("Height")) and (
        h := re.match(r"(\d+)\D+(\d+).+", height_ft)
    ):
        height_cm = round((float(h.group(1)) * 12 + float(h.group(2))) * 2.54)
        performer["height"] = str(height_cm)
    # Weight arrives as "NNN lbs"; convert to whole kilograms
    if (weight_lb := raw_performer.get("Weight")) and (
        w := re.match(r"(\d+)\slbs", weight_lb)
    ):
        weight_kg = round(float(w.group(1)) / 2.2046)
        performer["weight"] = str(weight_kg)
    if hair_color := raw_performer.get("Hair"):
        performer["hair_color"] = hair_color
    if country := raw_performer.get("Born"):
        performer["country"] = guess_nationality(country)
    # Fix: `or ""` guards against an explicit null value from the API —
    # .get(key, "") only covers a *missing* key, and None.removeprefix
    # would raise AttributeError.
    if twitter := (raw_performer.get("Twitter") or "").removeprefix("@"):
        performer["twitter"] = f"https://twitter.com/{twitter}"
    if instagram := (raw_performer.get("Instagram") or "").removeprefix("@"):
        performer["instagram"] = f"https://www.instagram.com/{instagram}"
    return performer
def to_scraped_movie(raw_movie: dict) -> ScrapedMovie:
    """Build a ScrapedMovie from a raw playlist entry; `title` is required."""
    movie: ScrapedMovie = {"name": raw_movie["title"]}
    if date := raw_movie.get("publish_date"):
        # Keep only the YYYY-MM-DD part, normalizing "/" separators
        movie["date"] = date[:10].replace("/", "-")
    if duration := raw_movie.get("videos_duration"):
        movie["duration"] = duration
    if cover := raw_movie.get("trailer_screencap"):
        movie["front_image"] = cover
    movie["studio"] = get_studio(raw_movie["site_domain"])
    # There is no reliable way to construct a movie URL from the data
    return movie
def to_scraped_scene(raw_scene: dict) -> ScrapedScene:
    """Build a ScrapedScene from the raw API scene payload.

    `site_domain` is required; all other fields are emitted only when present.
    """
    site = raw_scene["site_domain"]
    scene: ScrapedScene = {}
    if title := raw_scene.get("title"):
        scene["title"] = title
    if date := raw_scene.get("publish_date"):
        # Keep only the YYYY-MM-DD part, normalizing "/" separators
        scene["date"] = date[:10].replace("/", "-")
    if details := raw_scene.get("description"):
        scene["details"] = strip_tags(details)
    if scene_id := raw_scene.get("id"):
        scene["code"] = str(scene_id)
    if models := raw_scene.get("models_thumbs"):
        performers = []
        for model in models:
            performers.append(
                {
                    "name": model["name"],
                    "image": model["thumb"],
                    "url": make_performer_url(model["slug"], site),
                }
            )
        scene["performers"] = performers
    if tags := raw_scene.get("tags"):
        scene["tags"] = [{"name": tag} for tag in tags]
    scene["studio"] = get_studio(site)
    # Cover preference order:
    # trailer_screencap is what's shown on most sites;
    # extra_thumbnails has the best sizes and its first entry usually
    # matches thumb; thumb is a good fallback if extra_thumbnails is
    # missing; special_thumbnails is the final fallback.
    candidates = (
        dig(raw_scene, "poster_url"),
        dig(raw_scene, "trailer_screencap"),
        dig(raw_scene, "extra_thumbnails", 0),
        dig(raw_scene, "thumb"),
        dig(raw_scene, "special_thumbnails", 0),
    )
    # No animated scene covers: only accept static image extensions
    img_exts = (".jpg", ".jpeg", ".png")
    for candidate in candidates:
        if candidate and candidate.endswith(img_exts):
            scene["image"] = candidate
            break
    # There is no reliable way to construct a scene URL from the data
    return scene
def scrape_scene(url: str) -> ScrapedScene | None:
    """Scrape a scene page; returns None when the page yields no data."""
    props = fetch_page_props(url)
    if not props:
        return None
    scene = to_scraped_scene(props["content"])
    scene["url"] = url
    # The first playlist entry, when present, identifies the parent movie
    if playlist := dig(props, "playlist", "data", 0):
        scene["movies"] = [to_scraped_movie(playlist)]
    return scene
def scrape_performer(url: str) -> ScrapedPerformer | None:
    """Scrape a performer page; returns None when the page yields no data."""
    props = fetch_page_props(url)
    if not props:
        return None
    return to_scraped_performer(props["model"])
if __name__ == "__main__":
    # Entry point: stash invokes this script with an operation name and
    # JSON args on stdin (parsed by py_common.scraper_args).
    op, args = scraper_args()
    result = None
    match op, args:
        case "scene-by-url", {"url": url} if url:
            result = scrape_scene(clean_url(url))
        case "performer-by-url", {"url": url} if url:
            result = scrape_performer(clean_url(url))
        case _:
            log.error(f"Invalid operation: {op}")
            sys.exit(1)
    # Rewrite every "url" field in the result to the working (tour./canonical)
    # domain before handing it back to stash.
    result = replace_all(result, "url", fix_url)  # type: ignore
    print(json.dumps(result))