stash
This commit is contained in:
136
stash/config/scrapers/community/MissaX/MissaX.py
Normal file
136
stash/config/scrapers/community/MissaX/MissaX.py
Normal file
@@ -0,0 +1,136 @@
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import urllib.parse
|
||||
|
||||
import cloudscraper
|
||||
from lxml import html
|
||||
|
||||
import py_common.log as log
|
||||
from py_common.util import scraper_args
|
||||
from py_common.types import ScrapedScene
|
||||
|
||||
"This scraper scrapes title and uses it to search the site and grab a cover from the search results, among other things"

# Maps a site's bare domain (with "www." stripped) to its studio display name.
# Domains not listed here fall back to the domain string itself (see
# scene_from_url's STUDIO_MAP.get(domain, domain)).
STUDIO_MAP = {
    "missax.com": "MissaX",
    "allherluv.com": "All Her Luv",
}

# Shared Cloudflare-challenge-solving session reused for every request.
scraper = cloudscraper.create_scraper()
|
||||
|
||||
|
||||
def scraped_content(url):
    """Fetch *url* through the Cloudflare-aware session and return the raw body bytes.

    Any network failure or HTTP error status is fatal: the scraper cannot
    produce a result without the page, so the process exits with status 1.
    """
    try:
        scraped = scraper.get(url)
        # Promote 4xx/5xx statuses to exceptions so they are handled like
        # network errors below.
        scraped.raise_for_status()
        return scraped.content
    except Exception as e:
        log.error(f"Unable to fetch '{url}': {e}")
        # was bare exit(1): that is the interactive-shell helper from the
        # `site` module; sys.exit is the correct call and matches __main__.
        sys.exit(1)
|
||||
|
||||
|
||||
def scrape_cover(domain, title):
    """Search the site for *title* and return a high-resolution cover URL.

    Walks up to five pages of category-filtered search results looking for
    an <img> whose alt text matches the scene title exactly; returns None
    when no match is found (a warning is logged in that case).
    """
    for page in range(1, 6):
        log.debug(f"Searching page {page} for cover")
        search_url = (
            f"https://{domain}/tour/search.php"
            f"?st=advanced&qall=&qany=&qex={urllib.parse.quote(title)}"
            f"&none=&tadded=0&cat%5B%5D=5&page={page}"
        )
        tree = html.fromstring(scraped_content(search_url))
        matches = tree.xpath(f'//img[@alt="{title}"]/@src0_4x')
        if matches:
            return matches[0]
        # The pager marks the current page with class="active"; no sibling
        # after it means this was the last results page — stop early.
        next_page = tree.xpath('//li[@class="active"]/following-sibling::li')
        if not next_page:
            break

    log.warning(f"Unable to find better cover for {title}")
|
||||
|
||||
|
||||
def scene_from_url(url) -> ScrapedScene:
    """Scrape a single MissaX/All Her Luv scene page into a ScrapedScene dict.

    Extracts title, date, performers, tags, details, studio and cover image.
    Exits the process when the page cannot be fetched or contains no title,
    since nothing useful can be returned without one.
    """
    domain = urllib.parse.urlparse(url).netloc.removeprefix("www.")
    # Unknown domains fall back to the bare domain string as the studio name
    studio = STUDIO_MAP.get(domain, domain)
    body = scraped_content(url)
    tree = html.fromstring(body)

    scene: ScrapedScene = {}

    if title := tree.xpath('//p[@class="raiting-section__title"]'):
        title = title[0].text.strip()
        log.debug(f"Title: {title}")
        scene["title"] = title
    else:
        log.warning("Title not found, bailing")
        # was bare exit(1); sys.exit matches the rest of the script
        sys.exit(1)

    # Release date is embedded as "Added: MM/DD/YYYY" inside the subheader
    if (
        subheader := tree.xpath(
            '//p[@class="dvd-scenes__data" and contains(., " Added:")]'
        )
    ) and (
        date := re.match(
            r".*Added:\s(?P<month>\d\d)/(?P<day>\d\d)/(?P<year>\d{4}).*",
            subheader[0].text_content(),
            re.DOTALL | re.MULTILINE,
        )
    ):
        # Reorder into ISO 8601 (YYYY-MM-DD) as Stash expects
        date = f"{date.group('year')}-{date.group('month')}-{date.group('day')}"
        log.debug(f"Date: {date}")
        scene["date"] = date
    else:
        log.warning("Date not found")

    if performers := tree.xpath(
        '//p[@class="dvd-scenes__data"]//a[contains(@href, "models")]'
    ):
        scene["performers"] = [
            {"name": x.text.strip(), "url": x.get("href")} for x in performers
        ]
        performers = ", ".join(p["name"] for p in scene["performers"])
        log.debug(f"Performers: {performers}")
    else:
        log.warning("Performers not found")

    if tags := tree.xpath(
        '//p[@class="dvd-scenes__data"]//a[contains(@href, "categories")]'
    ):
        scene["tags"] = [{"name": x.text.strip()} for x in tags]
        tags = ", ".join(t["name"] for t in scene["tags"])
        log.debug(f"Tags: {tags}")
    else:
        log.warning("Tags not found")

    if details := tree.xpath(
        '//p[@class="dvd-scenes__title"]/following-sibling::p//text()'
    ):
        details = "".join(details)
        # Get rid of double spaces within each line
        details = "\n".join(" ".join(line.split()) for line in details.split("\n"))
        # Collapse CRLF and blank lines into single newlines
        details = re.sub(r"\r?\n\n?", r"\n", details).strip()
        scene["details"] = details
    else:
        log.warning("Details not found")

    scene["studio"] = {"name": studio, "url": f"https://{domain}"}

    # Prefer the high-res cover found via search; fall back to the (lower
    # quality) cover on the scene page itself. BUGFIX: the fallback was the
    # raw xpath() result, which assigned a *list* to scene["image"] instead
    # of a URL string whenever the search-based cover was not found.
    fallback = tree.xpath("//img[@src0_4x]/@src0_4x")
    scene["image"] = scrape_cover(domain, title) or (
        fallback[0] if fallback else None
    )
    log.debug(f"Image: {scene['image']}")
    return scene
|
||||
|
||||
|
||||
# FRAGNEMT = {"url": "https://allherluv.com/tour/trailers/Like-I-Do.html"}
|
||||
|
||||
if __name__ == "__main__":
|
||||
op, args = scraper_args()
|
||||
result = None
|
||||
match op, args:
|
||||
case "scene-by-url", {"url": url} if url:
|
||||
result = scene_from_url(url)
|
||||
case _:
|
||||
log.error(f"Operation: {op}, arguments: {json.dumps(args)}")
|
||||
sys.exit(1)
|
||||
|
||||
print(json.dumps(result))
|
||||
14
stash/config/scrapers/community/MissaX/MissaX.yml
Normal file
14
stash/config/scrapers/community/MissaX/MissaX.yml
Normal file
@@ -0,0 +1,14 @@
|
||||
name: "MissaX"
|
||||
# requires: py_common
|
||||
# scrapes: MissaX, All Her Luv
|
||||
|
||||
sceneByURL:
|
||||
- action: script
|
||||
script:
|
||||
- python
|
||||
- MissaX.py
|
||||
- scene-by-url
|
||||
url:
|
||||
- allherluv.com/tour
|
||||
- missax.com/tour
|
||||
# Last Updated September 11, 2022
|
||||
10
stash/config/scrapers/community/MissaX/manifest
Executable file
10
stash/config/scrapers/community/MissaX/manifest
Executable file
@@ -0,0 +1,10 @@
|
||||
id: MissaX
|
||||
name: MissaX
|
||||
metadata: {}
|
||||
version: caf5a3b
|
||||
date: "2024-02-19 20:27:17"
|
||||
requires: []
|
||||
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
|
||||
files:
|
||||
- MissaX.yml
|
||||
- MissaX.py
|
||||
Reference in New Issue
Block a user