import json
import os
import sys
import urllib.error
import urllib.parse
import urllib.request

# to import from a parent directory we need to add that directory to the system path
csd = os.path.dirname(os.path.realpath(__file__))  # get current script directory
parent = os.path.dirname(csd)  # parent directory (should be the scrapers one)
sys.path.append(
    parent
)  # add parent dir to sys path so that we can import py_common from there

try:
    from lxml import html
except ModuleNotFoundError:
    print(
        "You need to install the lxml module. (https://lxml.de/installation.html#installation)",
        file=sys.stderr,
    )
    print(
        "If you have pip (normally installed with python),",
        "run this command in a terminal (cmd): python -m pip install lxml",
        file=sys.stderr,
    )
    sys.exit(1)

try:
    from py_common.log import debug
except ModuleNotFoundError:
    print(
        "You need to download the folder 'py_common' from the community repo",
        "https://github.com/stashapp/CommunityScrapers",
        file=sys.stderr,
    )
    sys.exit(1)

# Sent with every request so the site sees a regular desktop browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}


def _open(url):
    """Open *url* with the module-level ``headers``.

    The returned response is a context manager; callers use ``with`` so the
    connection is always closed.
    """
    return urllib.request.urlopen(urllib.request.Request(url, headers=headers))


def _parse_date(tree):
    """Return the scene date as ``y-m-d`` from the page's ``m/d/y`` text, or None."""
    raw = tree.xpath("//div[contains(@class,'update_date')]/text()[1]")
    if not raw:
        debug("Unable to find a date for scene")
        return None
    parts = raw[0].strip().split("/")
    if len(parts) != 3:
        debug(f"Unexpected date format: '{raw[0].strip()}'")
        return None
    m, d, y = parts
    return "-".join([y, m, d])


def _find_set_link(start_url, scene_url, url_parts):
    """Follow the paginated listing from *start_url* until a ``div`` that
    directly contains a link to *scene_url* is found.

    Returns that ``div`` element, or None when the listing is exhausted.
    Network and parsing errors propagate to the caller.
    """
    visited = set()
    next_url = start_url
    # The visited set guards against a "next" link that points back to a page
    # we have already fetched, which would otherwise loop forever.
    while next_url and next_url not in visited:
        visited.add(next_url)
        with _open(next_url) as res:
            tree = html.fromstring(res.read().decode())

        # Pass the URL as an XPath variable so quotes in it cannot break the
        # expression.
        links = tree.xpath("//div[a[@href=$scene_url]]", scene_url=scene_url)
        if links:
            return links[0]

        n = tree.xpath("//a[@class='pagenav' and span[text()='>']]/@href")
        if not n:
            return None
        next_url = urllib.parse.urlunparse(
            (
                url_parts.scheme,
                url_parts.netloc,
                "/tour/" + n[0],
                "",
                "",
                "",
            )
        )
    return None


def sceneByURL(url):
    """Scrape scene metadata from the page at *url*.

    Returns a dict with ``studio``, ``performers`` and ``tags``, plus
    ``title``, ``details`` and ``date`` when the page provides them. ``code``
    and ``image`` are added on a best-effort basis by looking the scene up on
    the first linked performer's listing pages. Returns ``{}`` when the page
    cannot be fetched.
    """
    try:
        with _open(url) as res:
            if res.status != 200:
                debug(f"Request to '{url}' failed with status code {res.status}")
                return {}
            tree = html.fromstring(res.read().decode())
    except urllib.error.URLError as e:
        # urlopen raises for HTTP error statuses and connection failures, so
        # the status check above never sees them.
        debug(f"Request to '{url}' failed: {e}")
        return {}

    url_parts = urllib.parse.urlparse(url)
    scene = {}

    titles = tree.xpath("//span[@class='title_bar_hilite']/text()")
    if titles:
        scene["title"] = titles[-1]
    else:
        debug("Unable to find a title for scene")

    descriptions = tree.xpath("//span[@class='update_description']/text()")
    if descriptions:
        scene["details"] = descriptions[-1].strip()
    else:
        debug("Unable to find details for scene")

    scene["studio"] = {
        "name": "Aunt Judy's" if "auntjudys.com" in url else "Aunt Judy's XXX"
    }
    scene["performers"] = [
        {"name": x}
        for x in tree.xpath("//p/span[@class='update_models']/a/text()[1]")
    ]
    scene["tags"] = [
        {"name": x} for x in tree.xpath("//span[@class='update_tags']/a/text()")
    ]

    date = _parse_date(tree)
    if date is not None:
        scene["date"] = date

    # The code and image are taken from the performer's listing rather than
    # the scene page. This is best-effort: any failure is logged and the
    # metadata gathered so far is still returned.
    try:
        performer_url = tree.xpath(
            "//p/span[@class='update_models']/a/@href[1]"
        ).pop(0)
        link = _find_set_link(performer_url, url, url_parts)
        if link is not None:
            scene["code"] = link.get("data-setid")
            scene["image"] = urllib.parse.urlunparse(
                (
                    url_parts.scheme,
                    url_parts.netloc,
                    link.xpath("./a/img/@src0_4x").pop(0),
                    "",
                    "",
                    "",
                )
            )
    except Exception as e:
        debug(f"Unable to find image for scene: {e}")

    return scene


# Reads a JSON object with a "url" key from stdin and prints the scraped scene
# as JSON. The length check avoids an IndexError when run with no arguments.
if len(sys.argv) > 1 and sys.argv[1] == "sceneByURL":
    j = json.loads(sys.stdin.read())
    print(json.dumps(sceneByURL(j["url"])))