stash
This commit is contained in:
315
stash/config/scrapers/community/KBProductions/KBProductions.py
Normal file
315
stash/config/scrapers/community/KBProductions/KBProductions.py
Normal file
@@ -0,0 +1,315 @@
|
||||
import json
|
||||
import re
|
||||
import requests
|
||||
import sys
|
||||
from unicodedata import normalize
|
||||
from html.parser import HTMLParser
|
||||
|
||||
import py_common.log as log
|
||||
from py_common.types import ScrapedMovie, ScrapedPerformer, ScrapedScene, ScrapedStudio
|
||||
from py_common.util import dig, guess_nationality, replace_all, scraper_args
|
||||
|
||||
# Maps the `site_domain` key from the API
# to studio names currently used on StashDB.
# Unknown domains fall back to the raw domain (see get_studio).
studio_map = {
    "2girls1camera.com": "2 Girls 1 Camera",
    "allanal.com": "All Anal",
    "alterotic.com": "Alt Erotic",
    "amazingfilms.com": "Amazing Films",
    "analonly.com": "Anal Only",
    "analjesse.com": "Anal Jesse",
    "benefitmonkey.com": "Benefit Monkey",
    "biggulpgirls.com": "Big Gulp Girls",
    "bjraw.com": "BJ Raw",
    "blackbullchallenge.com": "Black Bull Challenge",
    "cougarseason.com": "Cougar Season",
    "creampiethais.com": "Creampie Thais",
    "deepthroatsirens.com": "Deepthroat Sirens",
    "dirtyauditions.com": "Dirty Auditions",
    "divine-dd.com": "Divine-DD",
    "facialsforever.com": "Facials Forever",
    "freakmobmedia.com": "FreakMob Media",
    "gogobarauditions.com": "Gogo Bar Auditions",
    "gotfilled.com": "Got Filled",
    "hobybuchanon.com": "Hoby Buchanon",
    "inkedpov.com": "Inked POV",
    "inserted.com": "Inserted",
    "jav888.com": "JAV888",
    "lady-sonia.com": "Lady Sonia",
    "lezkey.com": "LezKey",
    "lucidflix.com": "LucidFlix",
    "meanfeetfetish.com": "Mean Feet Fetish",
    # members subdomain serves the same studio as the bare domain
    "members.hobybuchanon.com": "Hoby Buchanon",
    "mongerinasia.com": "Monger In Asia",
    "nylonperv.com": "Nylon Perv",
    "nympho.com": "Nympho",
    "poundedpetite.com": "Pounded Petite",
    "premium-nickmarxx.com": "Nick Marxx",
    "red-xxx.com": "Red-XXX",
    "rickysroom.com": "Ricky's Room",
    "s3xus.com": "S3XUS",
    "seska.com": "Seska",
    "sexymodernbull.com": "Sexy Modern Bull",
    "shesbrandnew.com": "She's Brand New",
    "sidechick.com": "SIDECHICK",
    "suckthisdick.com": "Suck This Dick",
    "swallowed.com": "Swallowed",
    "thaigirlswild.com": "Thai Girls Wild",
    "topwebmodels.com": "Top Web Models",
    "trueanal.com": "True Anal",
    "twmclassics.com": "TWM Classics",
    "xful.com": "Xful",
    "yesgirlz.com": "Yes Girlz",
    "yummycouple.com": "Yummy Couple",
    "z-filmz-originals.com": "Z-Filmz",
}
|
||||
|
||||
|
||||
def clean_url(url: str) -> str:
    """Drop the query string from *url* (everything from the first '?' on)."""
    stripped = re.sub(r"\?.*", "", url)
    return stripped
|
||||
|
||||
|
||||
# Some sites only work with the `tour.` subdomain
def fix_url(url: str) -> str:
    """Normalize a scraped URL so it points at a site that actually serves pages.

    Two fixes are applied:
    - domains that have moved/merged are rewritten to their canonical domain;
    - domains that only respond on the `tour.` subdomain get it prefixed.
    """
    # Domain migrations: fold aliases into the canonical site first so the
    # tour-subdomain rewrite below sees the final domain.
    url = url.replace("twmclassics.com", "topwebmodels.com")
    url = url.replace("suckthisdick.com", "hobybuchanon.com")
    url = url.replace("premium-nickmarxx.com", "nickmarxx.com")
    tour_domain = (
        "nympho",
        "allanal",
        "analonly",
        "2girls1camera",
        "biggulpgirls",
        "deepthroatsirens",
        "facialsforever",
        "poundedpetite",
        "seska",
        "swallowed",
        "shesbrandnew",
        "topwebmodels",
        "trueanal",
        "twmclassics",
    )
    # A URL that already has the subdomain ("//tour.<domain>") cannot match,
    # because the alternation must start immediately after "//", so this never
    # double-prefixes. (A previous version carried a `(?<!tour\.)` lookbehind
    # here, which was a no-op: right after "//" the preceding five characters
    # can never be "tour.".)
    return re.sub(rf"//({'|'.join(tour_domain)})", r"//tour.\1", url)
|
||||
|
||||
|
||||
def strip_tags(html: str) -> str:
    """Convert an HTML fragment to NFKD-normalized plain text.

    Tags are dropped, character references are decoded, and the collected
    text nodes are concatenated in document order.
    """

    class _TextExtractor(HTMLParser):
        def __init__(self):
            super().__init__()
            self.reset()
            self.strict = False
            # Decode entities like &amp; into their characters as we parse
            self.convert_charrefs = True
            self.chunks = []

        def handle_data(self, data):
            self.chunks.append(data)

        def get_data(self):
            return "".join(self.chunks)

    extractor = _TextExtractor()
    extractor.feed(html)
    return normalize("NFKD", extractor.get_data())
|
||||
|
||||
|
||||
def fetch_page_props(url: str) -> dict | None:
    """Fetch *url* and extract the Next.js page props embedded in its HTML.

    These sites are Next.js apps that serialize their page state as JSON
    inside a `<script id="__NEXT_DATA__">` tag. Returns the `props.pageProps`
    object, or None when the request fails or the payload cannot be located.
    """
    r = requests.get(url)

    if r.status_code != 200:
        log.error(f"Failed to fetch page HTML: {r.status_code}")
        return None

    matches = re.findall(
        r'(?:<script id="__NEXT_DATA__" type="application\/json">({.+})<\/script>)',
        r.text,
        re.MULTILINE,
    )
    if not matches:
        log.error("Could not find JSON data on page")
        return None

    parsed_json = json.loads(matches[0])

    if not (content := dig(parsed_json, "props", "pageProps")):
        log.error("Could not find page props in JSON data")
        # Bug fix: previously fell through and returned the falsy `content`
        # value instead of an explicit None.
        return None

    return content
|
||||
|
||||
|
||||
def make_performer_url(slug: str, site: str) -> str:
    """Build the canonical model-page URL for *slug* on *site*."""
    return "https://" + site + "/models/" + slug
|
||||
|
||||
|
||||
def get_studio(site: str) -> ScrapedStudio:
    """Map a raw `site_domain` to a ScrapedStudio.

    Unknown domains use the domain itself as the studio name.
    """
    display_name = studio_map.get(site, site)
    studio: ScrapedStudio = {
        "name": display_name,
        "url": f"https://{site}",
    }
    # Suck This Dick is listed as a sub-studio of Hoby Buchanon
    if display_name == "Suck This Dick":
        studio["parent"] = get_studio("hobybuchanon.com")
    return studio
|
||||
|
||||
|
||||
def to_scraped_performer(raw_performer: dict) -> ScrapedPerformer:
    """Translate the API's raw model record into a ScrapedPerformer.

    Only fields the API actually populated are copied; imperial height and
    weight are converted to metric strings.
    """
    performer: ScrapedPerformer = {
        "name": raw_performer["name"],
        "gender": raw_performer["gender"],
        "url": make_performer_url(raw_performer["slug"], raw_performer["site_domain"]),
    }

    if thumb := raw_performer.get("thumb"):
        performer["image"] = thumb

    if bio_html := raw_performer.get("Bio"):
        performer["details"] = strip_tags(bio_html)

    # "1969-12-31" is the placeholder the API emits for an unknown birthdate
    if (birthdate := raw_performer.get("Birthdate")) and birthdate != "1969-12-31":
        performer["birthdate"] = birthdate

    if measurements := raw_performer.get("Measurements"):
        performer["measurements"] = measurements

    if eyes := raw_performer.get("Eyes"):
        performer["eye_color"] = eyes

    if ethnicity := raw_performer.get("Ethnicity"):
        performer["ethnicity"] = ethnicity

    # Height arrives as feet/inches text; convert to whole centimeters
    if (height_ft := raw_performer.get("Height")) and (
        h := re.match(r"(\d+)\D+(\d+).+", height_ft)
    ):
        total_inches = float(h.group(1)) * 12 + float(h.group(2))
        performer["height"] = str(round(total_inches * 2.54))

    # Weight arrives as "<n> lbs"; convert to whole kilograms
    if (weight_lb := raw_performer.get("Weight")) and (
        w := re.match(r"(\d+)\slbs", weight_lb)
    ):
        performer["weight"] = str(round(float(w.group(1)) / 2.2046))

    if hair := raw_performer.get("Hair"):
        performer["hair_color"] = hair

    if birthplace := raw_performer.get("Born"):
        performer["country"] = guess_nationality(birthplace)

    # Social handles may come with a leading "@"
    if twitter := raw_performer.get("Twitter", "").removeprefix("@"):
        performer["twitter"] = f"https://twitter.com/{twitter}"

    if instagram := raw_performer.get("Instagram", "").removeprefix("@"):
        performer["instagram"] = f"https://www.instagram.com/{instagram}"

    return performer
|
||||
|
||||
|
||||
def to_scraped_movie(raw_movie: dict) -> ScrapedMovie:
    """Translate the API's raw playlist entry into a ScrapedMovie."""
    movie: ScrapedMovie = {
        "name": raw_movie["title"],
    }

    # publish_date starts "YYYY/MM/DD"; keep the date part in ISO form
    if release := raw_movie.get("publish_date"):
        movie["date"] = release[:10].replace("/", "-")

    if runtime := raw_movie.get("videos_duration"):
        movie["duration"] = runtime

    if screencap := raw_movie.get("trailer_screencap"):
        movie["front_image"] = screencap

    movie["studio"] = get_studio(raw_movie["site_domain"])

    # There is no reliable way to construct a movie URL from the data
    return movie
|
||||
|
||||
|
||||
def to_scraped_scene(raw_scene: dict) -> ScrapedScene:
    """Translate the API's raw scene record into a ScrapedScene.

    Only fields the API actually populated are copied; the studio is always
    derived from the scene's `site_domain`.
    """
    site = raw_scene["site_domain"]
    scene: ScrapedScene = {}

    if title := raw_scene.get("title"):
        scene["title"] = title
    # publish_date starts "YYYY/MM/DD"; keep the first 10 chars as an ISO date
    if date := raw_scene.get("publish_date"):
        scene["date"] = date[:10].replace("/", "-")
    if details := raw_scene.get("description"):
        scene["details"] = strip_tags(details)
    # The numeric API id doubles as the scene's studio code
    if scene_id := raw_scene.get("id"):
        scene["code"] = str(scene_id)
    if models := raw_scene.get("models_thumbs"):
        scene["performers"] = [
            {
                "name": x["name"],
                "image": x["thumb"],
                "url": make_performer_url(x["slug"], site),
            }
            for x in models
        ]
    if tags := raw_scene.get("tags"):
        scene["tags"] = [{"name": x} for x in tags]

    scene["studio"] = get_studio(site)

    # Cover image candidates, best first:
    # trailer_screencap is what's shown on most sites
    # extra_thumbnails has the best sizes and in most cases the first one is the same as thumb
    # thumb is a good fallback if extra_thumbnails is not available
    # final fallback is special_thumbnails
    cover_candidates = filter(
        None,
        (
            dig(raw_scene, "poster_url"),
            dig(raw_scene, "trailer_screencap"),
            dig(raw_scene, "extra_thumbnails", 0),
            dig(raw_scene, "thumb"),
            dig(raw_scene, "special_thumbnails", 0),
        ),
    )
    # No animated scene covers: only accept static image extensions
    img_exts = (".jpg", ".jpeg", ".png")

    if scene_cover := next((x for x in cover_candidates if x.endswith(img_exts)), None):
        scene["image"] = scene_cover

    # There is no reliable way to construct a scene URL from the data

    return scene
|
||||
|
||||
|
||||
def scrape_scene(url: str) -> ScrapedScene | None:
    """Scrape a scene page, attaching the URL and any playlist it belongs to."""
    props = fetch_page_props(url)
    if not props:
        return None

    scene = to_scraped_scene(props["content"])
    scene["url"] = url

    # A scene that belongs to a playlist is reported as part of a movie
    if first_entry := dig(props, "playlist", "data", 0):
        scene["movies"] = [to_scraped_movie(first_entry)]

    return scene
|
||||
|
||||
|
||||
def scrape_performer(url: str) -> ScrapedPerformer | None:
    """Scrape a model page into a ScrapedPerformer, or None on failure."""
    props = fetch_page_props(url)
    if props:
        return to_scraped_performer(props["model"])
    return None
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Stash invokes this script with an operation name and a JSON fragment
    # on stdin; scraper_args() decodes both.
    op, args = scraper_args()

    result = None
    match op, args:
        case "scene-by-url", {"url": url} if url:
            result = scrape_scene(clean_url(url))
        case "performer-by-url", {"url": url} if url:
            result = scrape_performer(clean_url(url))
        case _:
            log.error(f"Invalid operation: {op}")
            sys.exit(1)

    # Rewrite every "url" field in the result so domains point at working sites
    result = replace_all(result, "url", fix_url)  # type: ignore
    print(json.dumps(result))
|
||||
154
stash/config/scrapers/community/KBProductions/KBProductions.yml
Normal file
154
stash/config/scrapers/community/KBProductions/KBProductions.yml
Normal file
@@ -0,0 +1,154 @@
|
||||
name: KB Productions
# requires: py_common
# scrapes: 2 Girls 1 Camera, All Anal, Alt Erotic, Amazing Films, Anal Only, Benefit Monkey, Big Gulp Girls, BJ Raw, Black Bull Challenge, Cougar Season, Creampie Thais, Deepthroat Sirens, Dirty Auditions, Divine-DD, Emo Network, Facials Forever, FreakMob Media, Gay Life Network, Got Filled, Hoby Buchanon, Inked POV, Inserted, JAV888, Lady Sonia, LezKey, LucidFlix, Mean Feet Fetish, Monger In Asia, Nylon Perv, Nympho, Pounded Petite, Red-XXX, Ricky's Room, S3XUS, Seska, Sexy Modern Bull, She's Brand New, SIDECHICK, Suck This Dick, Swallowed, Top Web Models, True Anal, TWM Classics, Xful, Yes Girlz, Yummy Couple, Z-Filmz
sceneByURL:
  # These two network sites are scraped with plain XPath instead of the script
  - action: scrapeXPath
    url:
      - emonetwork.com
      - gaylifenetwork.com
    scraper: sceneScraper
  # Everything else goes through KBProductions.py (Next.js JSON scraping)
  - url:
      - 2girls1camera.com/scenes
      - alterotic.com/videos
      - amazingfilms.com/videos
      - analjesse.com/trailers
      - analonly.com/scenes
      - benefitmonkey.com/scenes
      - biggulpgirls.com/scenes
      - bjraw.com/videos
      - blackbullchallenge.com/videos
      - cougarseason.com/scenes
      - creampiethais.com/videos
      - deepthroatsirens.com/scenes
      - dirtyauditions.com/scenes
      - divine-dd.com/videos
      - facialsforever.com/scenes
      - freakmobmedia.com/videos
      - gogobarauditions.com/trailers
      - gotfilled.com/videos
      - hobybuchanon.com/behind-the-scenes
      - hobybuchanon.com/suck-this-dick
      - hobybuchanon.com/updates
      - inkedpov.com/scenes
      - inserted.com/tour/videos
      - inserted.com/videos
      - jav888.com/videos
      - lady-sonia.com/scenes
      - lezkey.com/scenes
      - lucidflix.com/episodes
      - meanfeetfetish.com/videos
      - mongerinasia.com/trailers
      - nylonperv.com/videos
      - red-xxx.com/scenes
      - rickysroom.com/videos
      - s3xus.com/scenes
      - sexymodernbull.com/videos
      - sidechick.com/videos
      - thaigirlswild.com/videos
      - tour.allanal.com/scenes
      - tour.nympho.com/scenes
      - tour.poundedpetite.com/scenes
      - tour.seska.com/scenes
      - tour.shesbrandnew.com/scenes
      - tour.swallowed.com/scenes
      - tour.topwebmodels.com/scenes
      - trueanal.com/scenes
      - xful.com/videos
      - yesgirlz.com/scenes
      - yummycouple.com/videos
      - z-filmz-originals.com/videos
    action: script
    script:
      - python
      - KBProductions.py
      - scene-by-url
performerByURL:
  - url:
      - 2girls1camera.com/models
      - tour.allanal.com/models
      - alterotic.com/models
      - amazingfilms.com/models
      - analonly.com/models
      - benefitmonkey.com/models
      - biggulpgirls.com/models
      - bjraw.com/models
      - blackbullchallenge.com/models
      - cougarseason.com/models
      # creampiethais.com has no model pages
      - deepthroatsirens.com/models
      - dirtyauditions.com/models
      - divine-dd.com/models
      - facialsforever.com/models
      - freakmobmedia.com/models
      - gotfilled.com/models
      # /models redirects to /hobyshotties
      - hobybuchanon.com/models
      - hobybuchanon.com/hobyshotties
      - inkedpov.com/models
      - inserted.com/models
      - inserted.com/tour/models
      - jav888.com/models
      # lady-sonia.com has no model pages
      - lezkey.com/models
      - lucidflix.com/models
      - meanfeetfetish.com/models
      - nylonperv.com/models
      - tour.nympho.com/models
      - tour.poundedpetite.com/models
      # red-xxx.com has no model pages
      - rickysroom.com/models
      - s3xus.com/models
      # seska.com has no model pages
      - tour.shesbrandnew.com/models
      - sidechick.com/models
      - sexymodernbull.com/models
      - tour.swallowed.com/models
      - tour.topwebmodels.com/models
      - trueanal.com/models
      - xful.com/models
      - yesgirlz.com/models
      # yummycouple.com has no model pages
      - z-filmz-originals.com/models
    action: script
    script:
      - python
      - KBProductions.py
      - performer-by-url

xPathScrapers:
  sceneScraper:
    scene:
      Title: //div[@class="trailer"]//img/@alt
      Details: //div[@class="videoContainer"]//p[not(contains(@class,'vjs-no-js'))]/text()
      Performers:
        Name: //h2[contains(text(),'Models')]/a/text()
      Image:
        selector: //div[@class="trailer"]//img/@src | //video/@poster
        postProcess:
          # Force an https scheme on protocol-relative image URLs
          - replace:
              - regex: ^(https:)?
                with: "https:"
      Date:
        selector: //h1
        postProcess:
          # The <h1> reads "<date> - <title>"; keep the date part
          - replace:
              - regex: ^([^-]+).+
                with: $1
          - parseDate: January 2, 2006
      Tags:
        Name:
          selector: //h4[contains(text(),'Search Tags')]/a/text()
      Studio:
        Name:
          selector: //h3[contains(text(),'Site')]/a/text()
          postProcess:
            # Map the sites' CamelCase names to StashDB studio names
            - map:
                BeddableBoys: "Beddable Boys"
                BestBareBack: "Best Bareback"
                EmoTwinks: "Emo Twinks"
                ExposedEmos: "Exposed Emos"
                HomoEmo: "Homo Emo"
                HomoScene: "Homo Scene"
                LollipopTwinks: "Lollipop Twinks"
                Twinklight: "Twink Light"
# Last Updated March 29, 2024
|
||||
10
stash/config/scrapers/community/KBProductions/manifest
Executable file
10
stash/config/scrapers/community/KBProductions/manifest
Executable file
@@ -0,0 +1,10 @@
|
||||
# Stash community-scraper package manifest for the KB Productions scraper
id: KBProductions
name: KB Productions
metadata: {}
version: b4d3f2f
date: "2024-03-29 17:59:44"
requires: []
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
files:
  - KBProductions.py
  - KBProductions.yml
|
||||
Reference in New Issue
Block a user