137 lines
4.2 KiB
Python
137 lines
4.2 KiB
Python
import json
|
|
import re
|
|
import sys
|
|
import urllib.parse
|
|
|
|
import cloudscraper
|
|
from lxml import html
|
|
|
|
import py_common.log as log
|
|
from py_common.util import scraper_args
|
|
from py_common.types import ScrapedScene
|
|
|
|
"This scraper scrapes title and uses it to search the site and grab a cover from the search results, among other things"
|
|
|
|
# Display name of the studio for each supported site, keyed by the site's
# domain without a "www." prefix. Lookups fall back to the domain itself
# when a site is not listed here.
STUDIO_MAP = {
    "missax.com": "MissaX",
    "allherluv.com": "All Her Luv",
}

# Shared cloudscraper client, created once at import time and used for every
# page fetch in this module.
scraper = cloudscraper.create_scraper()
|
|
|
|
|
|
def scraped_content(url):
    """Download ``url`` with the module-level ``scraper`` and return its body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``content`` attribute of the response.

    Exits:
        Logs the error and terminates the process with status 1 when the
        request raises or the response carries an HTTP error status.
    """
    try:
        response = scraper.get(url)
        # Convert HTTP error statuses into exceptions so they are reported
        # through the same path as connection failures.
        response.raise_for_status()
        return response.content
    except Exception as e:
        log.error(f"Unable to fetch '{url}': {e}")
        # sys.exit rather than the bare exit(): the latter is injected by the
        # ``site`` module and is not available in every interpreter setup.
        sys.exit(1)
|
|
|
|
|
|
def scrape_cover(domain, title):
    """Look up a cover image for ``title`` in the search results of ``domain``.

    Requests up to five pages of the site's advanced search and, on each one,
    looks for an ``<img>`` whose ``alt`` attribute equals ``title``.

    Args:
        domain: Host name of the site to search (used to build the URL).
        title: Scene title; used both as the search term and as the ``alt``
            value to match.

    Returns:
        The ``src0_4x`` attribute of the first matching image, or ``None``
        (after logging a warning) when no page contains a match.
    """
    # The encoded search term does not change between pages.
    query = urllib.parse.quote(title)

    # Loop through search result pages until the image is found.
    for page in range(1, 6):
        log.debug(f"Searching page {page} for cover")
        url = (
            f"https://{domain}/tour/search.php?st=advanced&qall=&qany=&qex={query}"
            f"&none=&tadded=0&cat%5B%5D=5&page={page}"
        )
        tree = html.fromstring(scraped_content(url))

        # Pass the title as an XPath variable instead of splicing it into the
        # expression: a title containing a double quote would otherwise yield
        # an invalid XPath and raise.
        if image := tree.xpath("//img[@alt=$title]/@src0_4x", title=title):
            return image[0]

        # Stop when the active pagination entry has no following sibling,
        # i.e. there is no next page to look at.
        if not tree.xpath('//li[@class="active"]/following-sibling::li'):
            break

    log.warning(f"Unable to find better cover for {title}")
    return None
|
|
|
|
|
|
def scene_from_url(url) -> ScrapedScene:
    """Scrape the scene page at ``url`` and return the collected metadata.

    The result always contains ``title`` and ``studio``. ``date``,
    ``performers``, ``tags``, ``details`` and ``image`` are added only when
    the corresponding data is found on the page; a warning is logged for each
    one that is missing.

    Args:
        url: Address of the scene page.

    Returns:
        A ``ScrapedScene`` dict populated from the page.

    Exits:
        Terminates the process with status 1 when the page has no usable
        title (or, via ``scraped_content``, when a page cannot be fetched).
    """
    domain = urllib.parse.urlparse(url).netloc.removeprefix("www.")
    studio = STUDIO_MAP.get(domain, domain)
    tree = html.fromstring(scraped_content(url))

    scene: ScrapedScene = {}

    # --- title (mandatory: it is also the search key for the cover) ---
    title_nodes = tree.xpath('//p[@class="raiting-section__title"]')
    # ``.text`` is None when the element has no leading text; treat that the
    # same as a missing title instead of raising AttributeError.
    title = (title_nodes[0].text or "").strip() if title_nodes else ""
    if not title:
        log.warning("Title not found, bailing")
        sys.exit(1)
    log.debug(f"Title: {title}")
    scene["title"] = title

    # --- date: the page shows MM/DD/YYYY, the result uses YYYY-MM-DD ---
    if (
        subheader := tree.xpath(
            '//p[@class="dvd-scenes__data" and contains(., " Added:")]'
        )
    ) and (
        date := re.match(
            r".*Added:\s(?P<month>\d\d)/(?P<day>\d\d)/(?P<year>\d{4}).*",
            subheader[0].text_content(),
            re.DOTALL | re.MULTILINE,
        )
    ):
        date = f"{date.group('year')}-{date.group('month')}-{date.group('day')}"
        log.debug(f"Date: {date}")
        scene["date"] = date
    else:
        log.warning("Date not found")

    # --- performers: links whose href mentions "models" ---
    performer_links = tree.xpath(
        '//p[@class="dvd-scenes__data"]//a[contains(@href, "models")]'
    )
    # Links without leading text (``.text`` is None) or with blank text are
    # skipped rather than crashing the whole scrape.
    performers = [
        {"name": name, "url": link.get("href")}
        for link in performer_links
        if (name := (link.text or "").strip())
    ]
    if performers:
        scene["performers"] = performers
        performer_names = ", ".join(p["name"] for p in performers)
        log.debug(f"Performers: {performer_names}")
    else:
        log.warning("Performers not found")

    # --- tags: links whose href mentions "categories" ---
    tag_links = tree.xpath(
        '//p[@class="dvd-scenes__data"]//a[contains(@href, "categories")]'
    )
    tags = [
        {"name": name}
        for link in tag_links
        if (name := (link.text or "").strip())
    ]
    if tags:
        scene["tags"] = tags
        tag_names = ", ".join(t["name"] for t in tags)
        log.debug(f"Tags: {tag_names}")
    else:
        log.warning("Tags not found")

    # --- details: text of the paragraphs following the scenes title ---
    details = "".join(
        tree.xpath('//p[@class="dvd-scenes__title"]/following-sibling::p//text()')
    )
    # Collapse runs of whitespace inside each line to a single space.
    details = "\n".join(" ".join(line.split()) for line in details.split("\n"))
    # Collapse any run of newlines to one, so no blank lines survive
    # regardless of how many consecutive empty lines the page contained.
    details = re.sub(r"\n+", "\n", details).strip()
    if details:
        scene["details"] = details
    else:
        log.warning("Details not found")

    scene["studio"] = {"name": studio, "url": f"https://{domain}"}

    # --- image: prefer the search-result cover, fall back to the scene page ---
    # The XPath returns a list of attribute values; only the first one is a
    # usable fallback, and the key is left unset when there is none.
    page_covers = tree.xpath("//img[@src0_4x]/@src0_4x")
    fallback_cover = page_covers[0] if page_covers else None
    image = scrape_cover(domain, title) or fallback_cover
    if image:
        scene["image"] = image
        log.debug(f"Image: {image}")
    else:
        log.warning("Image not found")

    return scene
|
|
|
|
|
|
# FRAGMENT = {"url": "https://allherluv.com/tour/trailers/Like-I-Do.html"}
|
|
|
|
if __name__ == "__main__":
|
|
op, args = scraper_args()
|
|
result = None
|
|
match op, args:
|
|
case "scene-by-url", {"url": url} if url:
|
|
result = scene_from_url(url)
|
|
case _:
|
|
log.error(f"Operation: {op}, arguments: {json.dumps(args)}")
|
|
sys.exit(1)
|
|
|
|
print(json.dumps(result))
|