stash

2025-10-09 20:05:31 -03:00
parent ed22ef22bc
commit 0a5f88d75a
1442 changed files with 101562 additions and 0 deletions
--- a/stash/config/scrapers/community/MissaX/MissaX.py
+++ b/stash/config/scrapers/community/MissaX/MissaX.py
@@ -0,0 +1,136 @@
+import json
+import re
+import sys
+import urllib.parse
+
+import cloudscraper
+from lxml import html
+
+import py_common.log as log
+from py_common.util import scraper_args
+from py_common.types import ScrapedScene
+
+"This scraper scrapes title and uses it to search the site and grab a cover from the search results, among other things"
+
+# Maps a site's domain (as extracted from the scene URL, without the "www."
+# prefix) to the studio name reported for scenes scraped from that domain.
+# Domains missing from this map fall back to the bare domain as studio name.
+STUDIO_MAP = {
+    "missax.com": "MissaX",
+    "allherluv.com": "All Her Luv",
+}
+
+# Single cloudscraper session reused for every HTTP request in this module
+# (presumably so the sites' anti-bot protection is handled once -- confirm).
+scraper = cloudscraper.create_scraper()
+
+
+def scraped_content(url):
+    """Fetch ``url`` with the shared scraper session and return the response body.
+
+    Args:
+        url: Absolute URL of the page to download.
+
+    Returns:
+        The ``content`` attribute of the response.
+
+    Raises:
+        SystemExit: with status 1 when the request fails or the server
+            answers with an HTTP error status; the failure is logged first,
+            so callers never receive a failed response.
+    """
+    try:
+        scraped = scraper.get(url)
+        scraped.raise_for_status()
+        return scraped.content
+    except Exception as e:
+        log.error(f"Unable to fetch '{url}': {e}")
+        # sys.exit instead of the site-provided exit() helper: exit() is
+        # intended for interactive sessions and is not guaranteed to exist,
+        # and the script's entry point already uses sys.exit.
+        sys.exit(1)
+
+
+def scrape_cover(domain, title, max_pages=5):
+    """Search the site for ``title`` and return the cover image from the results.
+
+    Walks the search result pages in order, looking for an ``<img>`` whose
+    ``alt`` attribute equals ``title`` exactly, and returns that image's
+    ``src0_4x`` attribute.
+
+    Args:
+        domain: Site domain to search on (e.g. ``"missax.com"``).
+        title: Scene title; used both as the search term and as the ``alt``
+            text to match.
+        max_pages: Maximum number of result pages to inspect.
+
+    Returns:
+        The image URL as found in the page, or ``None`` when no result page
+        contains a matching image.
+    """
+    # loop through search result pages until the image is found
+    for page in range(1, max_pages + 1):
+        log.debug(f"Searching page {page} for cover")
+        url = (
+            f"https://{domain}/tour/search.php?st=advanced&qall=&qany="
+            f"&qex={urllib.parse.quote(title)}&none=&tadded=0&cat%5B%5D=5&page={page}"
+        )
+        tree = html.fromstring(scraped_content(url))
+        # The title is passed as an XPath variable instead of being pasted
+        # into the expression, so titles containing quotes cannot produce an
+        # invalid XPath.
+        if image := tree.xpath("//img[@alt=$title]/@src0_4x", title=title):
+            return image[0]
+        # No pagination item after the active one means this was the last page.
+        if not tree.xpath('//li[@class="active"]/following-sibling::li'):
+            break
+
+    log.warning(f"Unable to find better cover for {title}")
+    return None
+
+
+def scene_from_url(url) -> ScrapedScene:
+    """Scrape a scene page and return its metadata.
+
+    Extracts title, date, performers, tags and details from the page at
+    ``url``, derives the studio from the URL's domain, and looks up a cover
+    image through the site's search (falling back to the first ``src0_4x``
+    image on the scene page itself).
+
+    Args:
+        url: URL of the scene page.
+
+    Returns:
+        A ``ScrapedScene`` dict. ``title`` and ``studio`` are always present;
+        ``date``, ``performers``, ``tags``, ``details`` and ``image`` are only
+        set when found on the page (a warning is logged otherwise).
+
+    Raises:
+        SystemExit: with status 1 when the page has no title (or when a page
+            cannot be fetched, via ``scraped_content``).
+    """
+    domain = urllib.parse.urlparse(url).netloc.removeprefix("www.")
+    studio = STUDIO_MAP.get(domain, domain)
+    tree = html.fromstring(scraped_content(url))
+
+    scene: ScrapedScene = {}
+
+    # "raiting" is the class name as queried on the site, not a typo here.
+    title_nodes = tree.xpath('//p[@class="raiting-section__title"]')
+    # .text is None when the element has no direct text; treat that as missing.
+    title = (title_nodes[0].text or "").strip() if title_nodes else ""
+    if not title:
+        log.warning("Title not found, bailing")
+        sys.exit(1)
+    log.debug(f"Title: {title}")
+    scene["title"] = title
+
+    # The page shows the date as MM/DD/YYYY after "Added:"; emit YYYY-MM-DD.
+    if (
+        subheader := tree.xpath(
+            '//p[@class="dvd-scenes__data" and contains(., " Added:")]'
+        )
+    ) and (
+        date := re.match(
+            r".*Added:\s(?P<month>\d\d)/(?P<day>\d\d)/(?P<year>\d{4}).*",
+            subheader[0].text_content(),
+            re.DOTALL | re.MULTILINE,
+        )
+    ):
+        date = f"{date.group('year')}-{date.group('month')}-{date.group('day')}"
+        log.debug(f"Date: {date}")
+        scene["date"] = date
+    else:
+        log.warning("Date not found")
+
+    # Links without direct text are skipped instead of crashing on None.
+    performers = [
+        {"name": name, "url": link.get("href")}
+        for link in tree.xpath(
+            '//p[@class="dvd-scenes__data"]//a[contains(@href, "models")]'
+        )
+        if (name := (link.text or "").strip())
+    ]
+    if performers:
+        scene["performers"] = performers
+        performer_names = ", ".join(p["name"] for p in performers)
+        log.debug(f"Performers: {performer_names}")
+    else:
+        log.warning("Performers not found")
+
+    tags = [
+        {"name": name}
+        for link in tree.xpath(
+            '//p[@class="dvd-scenes__data"]//a[contains(@href, "categories")]'
+        )
+        if (name := (link.text or "").strip())
+    ]
+    if tags:
+        scene["tags"] = tags
+        tag_names = ", ".join(t["name"] for t in tags)
+        log.debug(f"Tags: {tag_names}")
+    else:
+        log.warning("Tags not found")
+
+    if details := tree.xpath(
+        '//p[@class="dvd-scenes__title"]/following-sibling::p//text()'
+    ):
+        details = "".join(details)
+        # Collapse runs of whitespace inside each line (this also drops "\r").
+        details = "\n".join(" ".join(line.split()) for line in details.split("\n"))
+        # Collapse any run of blank lines into a single newline.
+        details = re.sub(r"\n{2,}", "\n", details).strip()
+        scene["details"] = details
+    else:
+        log.warning("Details not found")
+
+    scene["studio"] = {"name": studio, "url": f"https://{domain}"}
+
+    # Prefer the cover from the search results; fall back to the first
+    # src0_4x image on the scene page. The XPath result is a list, so take
+    # its first entry rather than storing the list itself.
+    page_covers = tree.xpath("//img[@src0_4x]/@src0_4x")
+    image = scrape_cover(domain, title) or (page_covers[0] if page_covers else None)
+    if image:
+        scene["image"] = image
+        log.debug(f"Image: {image}")
+    else:
+        log.warning("Image not found")
+    return scene
+
+
+# FRAGMENT = {"url": "https://allherluv.com/tour/trailers/Like-I-Do.html"}
+
+if __name__ == "__main__":
+    # Entry point: read the requested operation and its arguments, run the
+    # matching scraper and print the result as JSON on stdout.
+    operation, arguments = scraper_args()
+    match operation, arguments:
+        case "scene-by-url", {"url": scene_url} if scene_url:
+            print(json.dumps(scene_from_url(scene_url)))
+        case _:
+            # Unsupported operation or missing/empty URL: report and fail.
+            log.error(f"Operation: {operation}, arguments: {json.dumps(arguments)}")
+            sys.exit(1)