Christoph Califice
2025-10-09 20:05:31 -03:00
parent ed22ef22bc
commit 0a5f88d75a
1442 changed files with 101562 additions and 0 deletions

KBProductions.py

@@ -0,0 +1,315 @@
import json
import re
import requests
import sys
from unicodedata import normalize
from html.parser import HTMLParser
import py_common.log as log
from py_common.types import ScrapedMovie, ScrapedPerformer, ScrapedScene, ScrapedStudio
from py_common.util import dig, guess_nationality, replace_all, scraper_args
# Maps the `site_domain` key from the API
# to studio names currently used on StashDB
studio_map = {
    "2girls1camera.com": "2 Girls 1 Camera",
    "allanal.com": "All Anal",
    "alterotic.com": "Alt Erotic",
    "amazingfilms.com": "Amazing Films",
    "analjesse.com": "Anal Jesse",
    "analonly.com": "Anal Only",
    "benefitmonkey.com": "Benefit Monkey",
    "biggulpgirls.com": "Big Gulp Girls",
    "bjraw.com": "BJ Raw",
    "blackbullchallenge.com": "Black Bull Challenge",
    "cougarseason.com": "Cougar Season",
    "creampiethais.com": "Creampie Thais",
    "deepthroatsirens.com": "Deepthroat Sirens",
    "dirtyauditions.com": "Dirty Auditions",
    "divine-dd.com": "Divine-DD",
    "facialsforever.com": "Facials Forever",
    "freakmobmedia.com": "FreakMob Media",
    "gogobarauditions.com": "Gogo Bar Auditions",
    "gotfilled.com": "Got Filled",
    "hobybuchanon.com": "Hoby Buchanon",
    "inkedpov.com": "Inked POV",
    "inserted.com": "Inserted",
    "jav888.com": "JAV888",
    "lady-sonia.com": "Lady Sonia",
    "lezkey.com": "LezKey",
    "lucidflix.com": "LucidFlix",
    "meanfeetfetish.com": "Mean Feet Fetish",
    "members.hobybuchanon.com": "Hoby Buchanon",
    "mongerinasia.com": "Monger In Asia",
    "nylonperv.com": "Nylon Perv",
    "nympho.com": "Nympho",
    "poundedpetite.com": "Pounded Petite",
    "premium-nickmarxx.com": "Nick Marxx",
    "red-xxx.com": "Red-XXX",
    "rickysroom.com": "Ricky's Room",
    "s3xus.com": "S3XUS",
    "seska.com": "Seska",
    "sexymodernbull.com": "Sexy Modern Bull",
    "shesbrandnew.com": "She's Brand New",
    "sidechick.com": "SIDECHICK",
    "suckthisdick.com": "Suck This Dick",
    "swallowed.com": "Swallowed",
    "thaigirlswild.com": "Thai Girls Wild",
    "topwebmodels.com": "Top Web Models",
    "trueanal.com": "True Anal",
    "twmclassics.com": "TWM Classics",
    "xful.com": "Xful",
    "yesgirlz.com": "Yes Girlz",
    "yummycouple.com": "Yummy Couple",
    "z-filmz-originals.com": "Z-Filmz",
}
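
# Illustrative lookups (a sketch, not part of the original logic): get_studio()
# below falls back to the raw domain for sites missing from the map, so new
# sites still produce a usable studio name:
#   studio_map.get("swallowed.com", "swallowed.com")  # -> "Swallowed"
#   studio_map.get("example.com", "example.com")      # -> "example.com" (hypothetical)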

def clean_url(url: str) -> str:
    # remove any query parameters
    return re.sub(r"\?.*", "", url)

# Some sites only work with the `tour.` subdomain
def fix_url(url: str) -> str:
    url = url.replace("twmclassics.com", "topwebmodels.com")
    url = url.replace("suckthisdick.com", "hobybuchanon.com")
    url = url.replace("premium-nickmarxx.com", "nickmarxx.com")
    tour_domain = (
        "nympho",
        "allanal",
        "analonly",
        "2girls1camera",
        "biggulpgirls",
        "deepthroatsirens",
        "facialsforever",
        "poundedpetite",
        "seska",
        "swallowed",
        "shesbrandnew",
        "topwebmodels",
        "trueanal",
        "twmclassics",
    )
    # The bare domain must immediately follow "//", so URLs that already
    # carry the "tour." prefix are left untouched
    return re.sub(rf"//(?<!tour\.)({'|'.join(tour_domain)})", r"//tour.\1", url)

def strip_tags(html: str) -> str:
    class ToPlainText(HTMLParser):
        def __init__(self):
            super().__init__()
            self.reset()
            self.strict = False
            self.convert_charrefs = True
            self.text = []

        def handle_data(self, d):
            self.text.append(d)

        def get_data(self):
            return "".join(self.text)

    s = ToPlainText()
    s.feed(html)
    return normalize("NFKD", s.get_data())

def fetch_page_props(url: str) -> dict | None:
    r = requests.get(url)
    if r.status_code != 200:
        log.error(f"Failed to fetch page HTML: {r.status_code}")
        return None
    matches = re.findall(
        r'(?:<script id="__NEXT_DATA__" type="application\/json">({.+})<\/script>)',
        r.text,
        re.MULTILINE,
    )
    if not matches:
        log.error("Could not find JSON data on page")
        return None
    parsed_json = json.loads(matches[0])
    if not (content := dig(parsed_json, "props", "pageProps")):
        log.error("Could not find page props in JSON data")
    return content
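
# These sites are Next.js apps; a sketch of the embedded JSON this relies on
# (abridged — the inner keys match how the props are read further down):
#   {"props": {"pageProps": {"content": {...}, "model": {...}, "playlist": {...}}}}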

def make_performer_url(slug: str, site: str) -> str:
    return f"https://{site}/models/{slug}"

def get_studio(site: str) -> ScrapedStudio:
    name = studio_map.get(site, site)
    studio: ScrapedStudio = {
        "name": name,
        "url": f"https://{site}",
    }
    if name == "Suck This Dick":
        studio["parent"] = get_studio("hobybuchanon.com")
    return studio
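
# Illustrative result for the one nested studio:
#   get_studio("suckthisdick.com")
#   -> {"name": "Suck This Dick", "url": "https://suckthisdick.com",
#       "parent": {"name": "Hoby Buchanon", "url": "https://hobybuchanon.com"}}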

def to_scraped_performer(raw_performer: dict) -> ScrapedPerformer:
    performer: ScrapedPerformer = {
        "name": raw_performer["name"],
        "gender": raw_performer["gender"],
        "url": make_performer_url(raw_performer["slug"], raw_performer["site_domain"]),
    }
    if image := raw_performer.get("thumb"):
        performer["image"] = image
    if bio := raw_performer.get("Bio"):
        performer["details"] = strip_tags(bio)
    # "1969-12-31" is the Unix epoch rendered in a negative UTC offset,
    # i.e. a placeholder for "no birthdate set"
    if (birthdate := raw_performer.get("Birthdate")) and birthdate != "1969-12-31":
        performer["birthdate"] = birthdate
    if measurements := raw_performer.get("Measurements"):
        performer["measurements"] = measurements
    if eye_color := raw_performer.get("Eyes"):
        performer["eye_color"] = eye_color
    if ethnicity := raw_performer.get("Ethnicity"):
        performer["ethnicity"] = ethnicity
    if (height_ft := raw_performer.get("Height")) and (
        h := re.match(r"(\d+)\D+(\d+).+", height_ft)
    ):
        height_cm = round((float(h.group(1)) * 12 + float(h.group(2))) * 2.54)
        performer["height"] = str(height_cm)
    if (weight_lb := raw_performer.get("Weight")) and (
        w := re.match(r"(\d+)\slbs", weight_lb)
    ):
        weight_kg = round(float(w.group(1)) / 2.2046)
        performer["weight"] = str(weight_kg)
    if hair_color := raw_performer.get("Hair"):
        performer["hair_color"] = hair_color
    if country := raw_performer.get("Born"):
        performer["country"] = guess_nationality(country)
    if twitter := raw_performer.get("Twitter", "").removeprefix("@"):
        performer["twitter"] = f"https://twitter.com/{twitter}"
    if instagram := raw_performer.get("Instagram", "").removeprefix("@"):
        performer["instagram"] = f"https://www.instagram.com/{instagram}"
    return performer
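
# Worked example of the imperial-to-metric conversions above (illustrative values):
#   Height "5 ft. 6 in." -> round((5 * 12 + 6) * 2.54) = 168 (cm)
#   Weight "120 lbs"     -> round(120 / 2.2046)        = 54  (kg)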

def to_scraped_movie(raw_movie: dict) -> ScrapedMovie:
    movie: ScrapedMovie = {
        "name": raw_movie["title"],
    }
    if date := raw_movie.get("publish_date"):
        movie["date"] = date[:10].replace("/", "-")
    if duration := raw_movie.get("videos_duration"):
        movie["duration"] = duration
    if cover := raw_movie.get("trailer_screencap"):
        movie["front_image"] = cover
    site = raw_movie["site_domain"]
    movie["studio"] = get_studio(site)
    # There is no reliable way to construct a movie URL from the data
    return movie
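
# Illustrative date normalization (hypothetical API value):
#   publish_date "2024/03/29 12:00:00" -> "2024-03-29"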

def to_scraped_scene(raw_scene: dict) -> ScrapedScene:
    site = raw_scene["site_domain"]
    scene: ScrapedScene = {}
    if title := raw_scene.get("title"):
        scene["title"] = title
    if date := raw_scene.get("publish_date"):
        scene["date"] = date[:10].replace("/", "-")
    if details := raw_scene.get("description"):
        scene["details"] = strip_tags(details)
    if scene_id := raw_scene.get("id"):
        scene["code"] = str(scene_id)
    if models := raw_scene.get("models_thumbs"):
        scene["performers"] = [
            {
                "name": x["name"],
                "image": x["thumb"],
                "url": make_performer_url(x["slug"], site),
            }
            for x in models
        ]
    if tags := raw_scene.get("tags"):
        scene["tags"] = [{"name": x} for x in tags]
    scene["studio"] = get_studio(site)
    # trailer_screencap is what's shown on most sites
    # extra_thumbnails has the best sizes and in most cases the first one is the same as thumb
    # thumb is a good fallback if extra_thumbnails is not available
    # final fallback is special_thumbnails
    cover_candidates = filter(
        None,
        (
            dig(raw_scene, "poster_url"),
            dig(raw_scene, "trailer_screencap"),
            dig(raw_scene, "extra_thumbnails", 0),
            dig(raw_scene, "thumb"),
            dig(raw_scene, "special_thumbnails", 0),
        ),
    )
    # No animated scene covers
    img_exts = (".jpg", ".jpeg", ".png")
    if scene_cover := next((x for x in cover_candidates if x.endswith(img_exts)), None):
        scene["image"] = scene_cover
    # There is no reliable way to construct a scene URL from the data
    return scene
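
# Illustrative cover selection (hypothetical URLs): animated candidates are
# skipped in favour of the first static image, e.g.
#   ("https://cdn.example/trailer.gif", "https://cdn.example/thumb.jpg") -> "thumb.jpg"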

def scrape_scene(url: str) -> ScrapedScene | None:
    if not (props := fetch_page_props(url)):
        return None
    scene = to_scraped_scene(props["content"])
    scene["url"] = url
    if playlist := dig(props, "playlist", "data", 0):
        scene["movies"] = [to_scraped_movie(playlist)]
    return scene

def scrape_performer(url: str) -> ScrapedPerformer | None:
    if not (props := fetch_page_props(url)):
        return None
    return to_scraped_performer(props["model"])

if __name__ == "__main__":
    op, args = scraper_args()
    result = None

    match op, args:
        case "scene-by-url", {"url": url} if url:
            result = scrape_scene(clean_url(url))
        case "performer-by-url", {"url": url} if url:
            result = scrape_performer(clean_url(url))
        case _:
            log.error(f"Invalid operation: {op}")
            sys.exit(1)

    result = replace_all(result, "url", fix_url)  # type: ignore
    print(json.dumps(result))
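
# A sketch of how Stash drives this scraper (the JSON fragment arrives on
# stdin and is parsed by scraper_args(); the URL is hypothetical):
#   echo '{"url": "https://tour.swallowed.com/scenes/example"}' \
#     | python KBProductions.py scene-by-url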

KBProductions.yml

@@ -0,0 +1,154 @@
name: KB Productions
# requires: py_common
# scrapes: 2 Girls 1 Camera, All Anal, Alt Erotic, Amazing Films, Anal Only, Benefit Monkey, Big Gulp Girls, BJ Raw, Black Bull Challenge, Cougar Season, Creampie Thais, Deepthroat Sirens, Dirty Auditions, Divine-DD, Emo Network, Facials Forever, Freak Mob Media, Gay Life Network, Got Filled, Hoby Buchanon, Inked POV, Inserted, JAV888, Lady Sonia, LezKey, LucidFlix, Mean Feet Fetish, Monger In Asia, Nylon Perv, Nympho, Pounded Petite, Red-XXX, Ricky's Room, S3XUS, Seska, Sexy Modern Bull, She's Brand New, SIDECHICK, Suck This Dick, Swallowed, Top Web Models, True Anal, TWM Classics, Xful, Yes Girlz, Yummy Couple, Z-Filmz
sceneByURL:
  - action: scrapeXPath
    url:
      - emonetwork.com
      - gaylifenetwork.com
    scraper: sceneScraper
  - url:
      - 2girls1camera.com/scenes
      - alterotic.com/videos
      - amazingfilms.com/videos
      - analjesse.com/trailers
      - analonly.com/scenes
      - benefitmonkey.com/scenes
      - biggulpgirls.com/scenes
      - bjraw.com/videos
      - blackbullchallenge.com/videos
      - cougarseason.com/scenes
      - creampiethais.com/videos
      - deepthroatsirens.com/scenes
      - dirtyauditions.com/scenes
      - divine-dd.com/videos
      - facialsforever.com/scenes
      - freakmobmedia.com/videos
      - gogobarauditions.com/trailers
      - gotfilled.com/videos
      - hobybuchanon.com/behind-the-scenes
      - hobybuchanon.com/suck-this-dick
      - hobybuchanon.com/updates
      - inkedpov.com/scenes
      - inserted.com/tour/videos
      - inserted.com/videos
      - jav888.com/videos
      - lady-sonia.com/scenes
      - lezkey.com/scenes
      - lucidflix.com/episodes
      - meanfeetfetish.com/videos
      - mongerinasia.com/trailers
      - nylonperv.com/videos
      - red-xxx.com/scenes
      - rickysroom.com/videos
      - s3xus.com/scenes
      - sexymodernbull.com/videos
      - sidechick.com/videos
      - thaigirlswild.com/videos
      - tour.allanal.com/scenes
      - tour.nympho.com/scenes
      - tour.poundedpetite.com/scenes
      - tour.seska.com/scenes
      - tour.shesbrandnew.com/scenes
      - tour.swallowed.com/scenes
      - tour.topwebmodels.com/scenes
      - trueanal.com/scenes
      - xful.com/videos
      - yesgirlz.com/scenes
      - yummycouple.com/videos
      - z-filmz-originals.com/videos
    action: script
    script:
      - python
      - KBProductions.py
      - scene-by-url
performerByURL:
  - url:
      - 2girls1camera.com/models
      - tour.allanal.com/models
      - alterotic.com/models
      - amazingfilms.com/models
      - analonly.com/models
      - benefitmonkey.com/models
      - biggulpgirls.com/models
      - bjraw.com/models
      - blackbullchallenge.com/models
      - cougarseason.com/models
      # creampiethais.com has no model pages
      - deepthroatsirens.com/models
      - dirtyauditions.com/models
      - divine-dd.com/models
      - facialsforever.com/models
      - freakmobmedia.com/models
      - gotfilled.com/models
      # /models redirects to /hobyshotties
      - hobybuchanon.com/models
      - hobybuchanon.com/hobyshotties
      - inkedpov.com/models
      - inserted.com/models
      - inserted.com/tour/models
      - jav888.com/models
      # lady-sonia.com has no model pages
      - lezkey.com/models
      - lucidflix.com/models
      - meanfeetfetish.com/models
      - nylonperv.com/models
      - tour.nympho.com/models
      - tour.poundedpetite.com/models
      # red-xxx.com has no model pages
      - rickysroom.com/models
      - s3xus.com/models
      # seska.com has no model pages
      - tour.shesbrandnew.com/models
      - sidechick.com/models
      - sexymodernbull.com/models
      - tour.swallowed.com/models
      - tour.topwebmodels.com/models
      - trueanal.com/models
      - xful.com/models
      - yesgirlz.com/models
      # yummycouple.com has no model pages
      - z-filmz-originals.com/models
    action: script
    script:
      - python
      - KBProductions.py
      - performer-by-url
xPathScrapers:
  sceneScraper:
    scene:
      Title: //div[@class="trailer"]//img/@alt
      Details: //div[@class="videoContainer"]//p[not(contains(@class,'vjs-no-js'))]/text()
      Performers:
        Name: //h2[contains(text(),'Models')]/a/text()
      Image:
        selector: //div[@class="trailer"]//img/@src | //video/@poster
        postProcess:
          - replace:
              - regex: ^(https:)?
                with: "https:"
      Date:
        selector: //h1
        postProcess:
          - replace:
              - regex: ^([^-]+).+
                with: $1
          - parseDate: January 2, 2006
      Tags:
        Name:
          selector: //h4[contains(text(),'Search Tags')]/a/text()
      Studio:
        Name:
          selector: //h3[contains(text(),'Site')]/a/text()
          postProcess:
            - map:
                BeddableBoys: "Beddable Boys"
                BestBareBack: "Best Bareback"
                EmoTwinks: "Emo Twinks"
                ExposedEmos: "Exposed Emos"
                HomoEmo: "Homo Emo"
                HomoScene: "Homo Scene"
                LollipopTwinks: "Lollipop Twinks"
                Twinklight: "Twink Light"
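
# Illustrative (assumed page markup): the Date step expects an <h1> like
# "March 29, 2024 - Scene Title"; the replace keeps everything before the
# first "-" and parseDate reads it with Go's reference layout "January 2, 2006".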
# Last Updated March 29, 2024

manifest

@@ -0,0 +1,10 @@
id: KBProductions
name: KB Productions
metadata: {}
version: b4d3f2f
date: "2024-03-29 17:59:44"
requires: []
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
files:
- KBProductions.py
- KBProductions.yml