import datetime import difflib import json import os import re import sqlite3 import sys from configparser import ConfigParser, NoSectionError from urllib.parse import urlparse # to import from a parent directory we need to add that directory to the system path csd = os.path.dirname(os.path.realpath(__file__)) # get current script directory parent = os.path.dirname(csd) # parent directory (should be the scrapers one) sys.path.append( parent ) # add parent dir to sys path so that we can import py_common from there try: from bs4 import BeautifulSoup as bs import requests import lxml except ModuleNotFoundError: print( "You need to install the following modules 'requests', 'bs4', 'lxml'.", file=sys.stderr) sys.exit() try: from py_common import graphql from py_common import log except ModuleNotFoundError: print( "You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)", file=sys.stderr) sys.exit() # # User variables # # File to store the Algolia API key. STOCKAGE_FILE_APIKEY = "Algolia.ini" # Extra tag that will be added to the scene FIXED_TAG = "" # Include non female performers NON_FEMALE = True # a list of main channels (`mainChannelName` from the API) to use as the studio # name for a scene MAIN_CHANNELS_AS_STUDIO_FOR_SCENE = [ "Buttman", "Cock Choking Sluts", "Devil's Film Parodies", "Euro Angels", ] # a dict with sites having movie sections # used when populating movie urls from the scene scraper MOVIE_SITES = { "devilsfilm": "https://www.devilsfilm.com/en/dvd", "devilstgirls": "https://www.devilstgirls.com/en/dvd", "diabolic": "https://www.diabolic.com/en/movie", "evilangel": "https://www.evilangel.com/en/movie", "genderx": "https://www.genderxfilms.com/en/movie", "girlfriendsfilms": "https://www.girlfriendsfilms.com/en/movie", "lewood": "https://www.lewood.com/en/movie", "outofthefamily": "https://www.outofthefamily.com/en/dvd", "peternorth": "https://www.peternorth.com/en/dvd", "tsfactor": "https://www.tsfactor.com/en/movie/", "wicked": "https://www.wicked.com/en/movie", "zerotolerancefilms": "https://www.zerotolerancefilms.com/en/movie", "3rddegreefilms": "https://www.3rddegreefilms.com/en/movie", "roccosiffredi": "https://www.roccosiffredi.com/en/dvd", } # a dict of serie (`serie_name` from the API) which should set the value # for the studio name for a scene SERIE_USING_OVERRIDE_AS_STUDIO_FOR_SCENE = { "Jonni Darkko's Stand Alone Scenes": "Jonni Darkko XXX", "Big Boob Angels": "BAM Visions", "Mick's ANAL PantyHOES": "BAM Visions", "Real Anal Lovers": "BAM Visions", "XXXmailed": "Blackmailed" } # a list of serie (`serie_name` from the API) which should use the sitename # for the studio name for a scene SERIE_USING_SITENAME_AS_STUDIO_FOR_SCENE = [ "Evil", # sitename_pretty: Evil Angel "Trans-Active", # sitename_pretty: Evil Angel ] # a dict of sites (`sitename_pretty` from the API) which should set the value # for the studio name for a scene # this is because the `serie_name` is the Movie (series) title on these sites, # not the studio SITES_USING_OVERRIDE_AS_STUDIO_FOR_SCENE = { "Adamandevepictures": "Adam & Eve Pictures", "AgentRedGirl": "Agent Red Girl", "Devils Gangbangs": "Devil's Gangbangs", "Devilstgirls": "Devil's Tgirls", "Dpfanatics": "DP Fanatics", "Janedoe": "Jane Doe Pictures", "ModernDaySins": "Modern-Day Sins", "Transgressivexxx": "TransgressiveXXX", "Hot House": "Hot House Entertainment", "HotHouse.com": "Hot House Entertainment", "1000facials": "1000 Facials", "Immorallive": "Immoral Live", "Mommyblowsbest": "Mommy Blows Best", "Onlyteenblowjobs": "Only Teen Blowjobs" } # a list of sites (`sitename_pretty` from the API) which should pick out the # `sitename_pretty` for the studio name for a scene # this is because the `serie_name` is the Movie (series) title on these sites, # not the studio SITES_USING_SITENAME_AS_STUDIO_FOR_SCENE = [ "ChaosMen", "Devil's Film", "GenderXFilms", "Give Me Teens", "Hairy Undies", "Lesbian Factor", "Oopsie", "Out of the Family", "Rocco Siffredi", "Squirtalicious", "3rd Degree Films", ] # a list of sites (`sitename_pretty` from the API) which should pick out the # `network_name` for the studio name for a scene # this is because the `serie_name` is the Movie (series) title on these sites, # not the studio SITES_USING_NETWORK_AS_STUDIO_FOR_SCENE = [ "Extremepickups", # network_name: Adult Time Originals "Isthisreal", # network_name: Is This Real "Muses", # network_name: Transfixed "Officemsconduct", # network_name: Transfixed "Sabiendemonia", # network_name: Sabien DeMonia "Upclosex" # network_name: UpCloseX ] # a list of networks (`network_name` from the API) which should pick out the # `sitename_pretty` for the studio name for a scene NETWORKS_USING_SITENAME_AS_STUDIO_FOR_SCENE = [ "Fame Digital", # this should support all sub-studios listed at https://stashdb.org/studios/cd5591a5-eb26-42fc-a406-b6969a8ef3dd "fistinginferno", "MyXXXPass", ] # a dict of directors to use as the studio for a scene DIRECTOR_AS_STUDIO_OVERRIDE_FOR_SCENE = { "Le Wood": "LeWood" } def clean_text(details: str) -> str: """ remove escaped backslashes and html parse the details text """ if details: details = re.sub(r"\\", "", details) details = re.sub(r"<\s*/?br\s*/?\s*>", "\n", details) # bs.get_text doesnt replace br's with \n details = bs(details, features='lxml').get_text() return details def check_db(database_path: str, scn_id: str) -> dict: """ get scene data (size, duration, height) directly from the database file """ try: sqlite_connection = sqlite3.connect("file:" + database_path + "?mode=ro", uri=True) log.debug("Connected to SQLite database") except: log.warning("Fail to connect to the database") return None, None, None cursor = sqlite_connection.cursor() cursor.execute("SELECT size,duration,height from scenes WHERE id=?;", [scn_id]) record = cursor.fetchall() database = {} database["size"] = int(record[0][0]) database["duration"] = int(record[0][1]) database["height"] = str(record[0][2]) cursor.close() sqlite_connection.close() return database def send_request(url: str, head: str, send_json="") -> requests.Response: """ get post response from url """ log.debug(f"Request URL: {url}") try: response = requests.post(url, headers=head, json=send_json, timeout=10) except requests.RequestException as req_error: log.warning(f"Requests failed: {req_error}") return None #log.debug(f"Returned URL: {response.url}") if response.content and response.status_code == 200: return response log.warning(f"[REQUEST] Error, Status Code: {response.status_code}") #print(response.text, file=open("algolia_request.html", "w", encoding='utf-8')) return None # API Authentification def apikey_get(site_url, time): req = send_request(site_url, HEADERS) if req is None: return None, None script_html = fetch_page_json(req.text) if script_html is not None: app_id = script_html['api']['algolia']['applicationID'] algolia_api_key = script_html['api']['algolia']['apiKey'] # Write key into a file write_config(time, app_id, algolia_api_key) log.info(f"New API keys: {algolia_api_key}") return app_id, algolia_api_key log.error(f"Can't retrieve Algolia API keys from page ({site_url})") return None, None def fetch_page_json(page_html): matches = re.findall(r'window.env\s+=\s(.+);', page_html, re.MULTILINE) return None if len(matches) == 0 else json.loads(matches[0]) def check_config(domain, time): if os.path.isfile(STOCKAGE_FILE_APIKEY): config = ConfigParser() config.read(STOCKAGE_FILE_APIKEY) try: time_past = datetime.datetime.strptime(config.get(domain, 'date'), '%Y-%m-%d %H:%M:%S.%f') if time_past.hour - 1 < time.hour < time_past.hour + 1 and ( time - time_past).days == 0: log.debug("Using old key") application_id = config.get(domain, 'app_id') api_key = config.get(domain, 'api_key') return application_id, api_key log.info( f"Need new api key: [{time.hour}|{time_past.hour}|{(time-time_past).days}]" ) except NoSectionError: pass return None, None def write_config(date, app_id, api_key): log.debug("Writing config!") config = ConfigParser() config.read(STOCKAGE_FILE_APIKEY) try: config.get(SITE, 'date') except NoSectionError: config.add_section(SITE) config.set(SITE, 'date', date.strftime("%Y-%m-%d %H:%M:%S.%f")) config.set(SITE, 'app_id', app_id) config.set(SITE, 'api_key', api_key) with open(STOCKAGE_FILE_APIKEY, 'w', encoding='utf-8') as configfile: config.write(configfile) # API Search Data def api_search_req(type_search, query, url): api_request = None if type_search == "query_all_scenes": api_request = api_search_query("all_scenes", query, url) if type_search == "query_all_photosets": api_request = api_search_query("all_photosets", query, url) if type_search == "id": api_request = api_search_id(query, url) if api_request: api_search = api_request.json()["results"][0].get("hits") if api_search: return api_search return None def api_search_id(scene_id, url): clip_id = [f"clip_id:{scene_id}"] request_api = { "requests": [{ "indexName": "all_scenes", "params": "query=&hitsPerPage=20&page=0", "facetFilters": clip_id }] } req = send_request(url, HEADERS, request_api) return req def api_search_movie_id(m_id, url): movie_id = [f"movie_id:{m_id}"] request_api = { "requests": [{ "indexName": "all_movies", "params": "query=&hitsPerPage=20&page=0", "facetFilters": movie_id }] } req = send_request(url, HEADERS, request_api) return req def api_search_gallery_id(p_id, url): gallery_id = [[f"set_id:{p_id}"]] request_api = { "requests": [{ "indexName": "all_photosets", "params": "query=&hitsPerPage=20&page=0", "facetFilters": gallery_id, "facets": [] }] } req = send_request(url, HEADERS, request_api) return req def api_search_query(index_name, query, url): request_api = { "requests": [{ "indexName": index_name, "params": "query=" + query + "&hitsPerPage=40&page=0" }] } res = send_request(url, HEADERS, request_api) return res # Searching Result def json_parser(search_json, range_duration=60, single=False, scene_id=None): result_dict = {} # Just for not printing the full JSON in log... debug_dict = {} with open("adultime_scene_search.json", 'w', encoding='utf-8') as search_file: json.dump(search_json, search_file, ensure_ascii=False, indent=4) for scene in search_json: r_match = match_result(scene, range_duration, single, clip_id=url_id) if r_match["info"]: if result_dict.get(r_match["info"]): # Url should be more accurate than the title if r_match["url"] > result_dict[r_match["info"]]["url"]: result_dict[r_match["info"]] = { "title": r_match["title"], "url": r_match["url"], "clip_id": r_match["clip_id"], "json": scene } debug_dict[r_match["info"]] = { "title": r_match["title"], "url": r_match["url"], "scene": scene["title"] } elif r_match["title"] > result_dict[r_match["info"]][ "title"] and r_match["title"] > result_dict[ r_match["info"]]["url"]: result_dict[r_match["info"]] = { "title": r_match["title"], "url": r_match["url"], "clip_id": r_match["clip_id"], "json": scene } debug_dict[r_match["info"]] = { "title": r_match["title"], "url": r_match["url"], "scene": scene["title"] } else: result_dict[r_match["info"]] = { "title": r_match["title"], "url": r_match["url"], "clip_id": r_match["clip_id"], "json": scene } debug_dict[r_match["info"]] = { "title": r_match["title"], "url": r_match["url"], "scene": scene["title"] } # Engine whoaaaaa # A = ByID/Most likely | S = Size | D = Duration | N = Network | R = Only Ratio log.info("--- BEST RESULT ---") for key, item in debug_dict.items(): log.info( f'[{key}] Title: {item["scene"]}; Ratio Title: {round(item["title"], 3)} - URL: {round(item["url"], 3)}' ) log.info("--------------") # if result_dict.get("ASDN"): return result_dict["ASDN"]["json"] if result_dict.get("ASD"): return result_dict["ASD"]["json"] if result_dict.get("ASN"): return result_dict["ASN"]["json"] if result_dict.get("ADN"): return result_dict["ADN"]["json"] if result_dict.get("AS"): return result_dict["AS"]["json"] if result_dict.get("AD"): return result_dict["AD"]["json"] if result_dict.get("AN"): if result_dict["AN"]["clip_id"] or result_dict["AN"]["title"] > 0.5 or result_dict["AN"]["url"] > 0.5: return result_dict["AN"]["json"] if result_dict.get("A"): if result_dict["A"]["title"] > 0.7 or result_dict["A"]["url"] > 0.7: return result_dict["A"]["json"] if result_dict.get("SDN"): return result_dict["SDN"]["json"] if result_dict.get("SD"): return result_dict["SD"]["json"] if result_dict.get("SN"): if result_dict["SN"]["title"] > 0.5 or result_dict["SN"]["url"] > 0.5: return result_dict["SN"]["json"] if result_dict.get("DN"): if result_dict["DN"]["title"] > 0.5 or result_dict["DN"]["url"] > 0.5: return result_dict["DN"]["json"] if result_dict.get("S"): if result_dict["S"]["title"] > 0.7 or result_dict["S"]["url"] > 0.7: return result_dict["S"]["json"] if result_dict.get("D"): if result_dict["D"]["title"] > 0.7 or result_dict["D"]["url"] > 0.7: return result_dict["D"]["json"] if result_dict.get("N"): if result_dict["N"]["title"] > 0.7 or result_dict["N"]["url"] > 0.7: return result_dict["N"]["json"] if result_dict.get("R"): if result_dict["R"]["title"] > 0.8 or result_dict["R"]["url"] > 0.8: return result_dict["R"]["json"] return None def match_result(api_scene, range_duration=60, single=False, clip_id: str=None): api_title = api_scene.get("title") api_duration = int(api_scene.get("length")) api_clip_id = str(api_scene["clip_id"]) api_filesize = None match_duration = False match_size = False match_clip_id = False # Using database if database_dict: db_duration = int(database_dict[0]["duration"]) db_height = str(database_dict[0]["height"]) db_size = int(database_dict[0]["size"]) if api_scene.get("download_file_sizes"): if db_height == "2160": api_filesize = api_scene["download_file_sizes"].get("4k") else: api_filesize = api_scene["download_file_sizes"].get(db_height + "p") if api_filesize: api_filesize = int(api_filesize) if api_filesize is None: api_filesize = api_scene.get("index_size") if api_filesize: api_filesize = int(api_filesize) if db_duration - range_duration <= api_duration <= db_duration + range_duration: match_duration = True db_size_max = db_size + (db_size / 100) db_size_min = db_size - (db_size / 100) if api_filesize: if db_size_min <= api_filesize <= db_size_max: match_size = True # Post process things match_domain = False if URL_DOMAIN: if api_scene.get("sitename"): #log.debug("API Sitename: {}".format(api_scene["sitename"])) if api_scene["sitename"].lower() == URL_DOMAIN: match_domain = True if api_scene.get("network_name"): #log.debug("API Network: {}".format(api_scene["network_name"])) if api_scene["network_name"].lower() == URL_DOMAIN: match_domain = True # Matching ratio if SCENE_TITLE: match_ratio_title = difflib.SequenceMatcher(None, SCENE_TITLE.lower(), api_title.lower()).ratio() else: match_ratio_title = 0 if url_title and api_scene.get("url_title"): match_ratio_title_url = difflib.SequenceMatcher( None, url_title.lower(), api_scene["url_title"].lower()).ratio() else: match_ratio_title_url = 0 # Rank search result log.debug( f"[MATCH] Title: {api_title} |-RATIO-| Ratio: {round(match_ratio_title, 5)} / URL: {round(match_ratio_title_url, 5)} |-MATCH-| Duration: {match_duration}, Size: {match_size}, Domain: {match_domain}" ) match_dict = {} match_dict["title"] = match_ratio_title match_dict["url"] = match_ratio_title_url information_used = "" if (single and (match_duration or (database_dict is None and match_ratio_title_url > 0.5)) ) or match_ratio_title_url == 1: information_used += "A" if match_size: information_used += "S" if match_duration: information_used += "D" if match_domain: information_used += "N" if clip_id: if clip_id == api_clip_id: match_clip_id = True if information_used == "": information_used = "R" match_dict["info"] = information_used match_dict["clip_id"] = match_clip_id #debug("[MATCH] {} - {}".format(api_title,match_dict)) return match_dict def get_id_from_url(url: str) -> str: ''' gets the id from a valid url expects urls of the form www.example.com/.../title/id ''' if url is None or url == "": return None id_check = re.sub('.+/', '', url) id_from_url = None try: if id_check.isdigit(): id_from_url = id_check else: id_from_url = re.search(r"/(\d+)/*", url).group(1) log.info(f"ID: {id_from_url}") except: log.warning("Can't get ID from URL") return id_from_url def parse_movie_json(movie_json: dict) -> dict: """ process an api movie dictionary and return a scraped one """ scrape = {} try: studio_name = determine_studio_name_from_json(movie_json[0]) except IndexError: log.debug("No movie found") return scrape scrape["synopsis"] = clean_text(movie_json[0].get("description")) scrape["name"] = movie_json[0].get("title") scrape["studio"] = {"name": studio_name} scrape["duration"] = movie_json[0].get("total_length") date_by_studio = "date_created" # options are "date_created", "upcoming" (not always avaialble), "last_modified" # dates don't seem to be accurate (modifed multiple times by studio) # using date_created as default and we later override for each site when needed log.debug( f"Dates available: upcoming {movie_json[0].get('upcoming')} - created {movie_json[0].get('date_created')} - last modified {movie_json[0].get('last_modified')}" ) studios_movie_dates = { "Diabolic": "last_modified", "Evil Angel": "date_created", "Wicked": "date_created", "Zerotolerance": "last_modified" } if studios_movie_dates.get(studio_name): date_by_studio = studios_movie_dates[studio_name] scrape["date"] = movie_json[0].get(date_by_studio) scrape[ "front_image"] = f"https://transform.gammacdn.com/movies{movie[0].get('cover_path')}_front_400x625.jpg?width=450&height=636" scrape[ "back_image"] = f"https://transform.gammacdn.com/movies{movie[0].get('cover_path')}_back_400x625.jpg?width=450&height=636" directors = [] if movie_json[0].get('directors') is not None: for director in movie_json[0].get('directors'): directors.append(director.get('name').strip()) scrape["director"] = ", ".join(directors) return scrape def determine_studio_name_from_json(some_json): ''' Reusable function to determine studio name based on what was scraped. This can be used for scraping: - scene - gallery - movie ''' studio_name = None if some_json.get('sitename_pretty'): if some_json.get('sitename_pretty') in SITES_USING_OVERRIDE_AS_STUDIO_FOR_SCENE: studio_name = \ SITES_USING_OVERRIDE_AS_STUDIO_FOR_SCENE.get(some_json.get('sitename_pretty')) elif some_json.get('sitename_pretty') in SITES_USING_SITENAME_AS_STUDIO_FOR_SCENE \ or some_json.get('serie_name') in SERIE_USING_SITENAME_AS_STUDIO_FOR_SCENE \ or some_json.get('network_name') \ and some_json.get('network_name') in NETWORKS_USING_SITENAME_AS_STUDIO_FOR_SCENE: studio_name = some_json.get('sitename_pretty') elif some_json.get('sitename_pretty') in SITES_USING_NETWORK_AS_STUDIO_FOR_SCENE \ and some_json.get('network_name'): studio_name = some_json.get('network_name') if not studio_name and some_json.get('network_name') and \ some_json.get('network_name') in NETWORKS_USING_SITENAME_AS_STUDIO_FOR_SCENE: studio_name = some_json.get('sitename_pretty') if not studio_name and some_json.get('mainChannelName') and \ some_json.get('mainChannelName') in MAIN_CHANNELS_AS_STUDIO_FOR_SCENE: studio_name = some_json.get('mainChannelName') if not studio_name and some_json.get('directors'): for director in [ d.get('name').strip() for d in some_json.get('directors') ]: if DIRECTOR_AS_STUDIO_OVERRIDE_FOR_SCENE.get(director): studio_name = \ DIRECTOR_AS_STUDIO_OVERRIDE_FOR_SCENE.get(director) if not studio_name and some_json.get('serie_name'): if some_json.get('serie_name') in SERIE_USING_OVERRIDE_AS_STUDIO_FOR_SCENE: studio_name = \ SERIE_USING_OVERRIDE_AS_STUDIO_FOR_SCENE.get(some_json.get('serie_name')) else: studio_name = some_json.get('serie_name') return studio_name def parse_scene_json(scene_json, url=None): """ process an api scene dictionary and return a scraped one """ scrape = {} # Title if scene_json.get('title'): scrape['title'] = scene_json['title'].strip() # Date scrape['date'] = scene_json.get('release_date') # Details scrape['details'] = clean_text(scene_json.get('description')) # Studio Code if scene_json.get('clip_id'): scrape['code'] = str(scene_json['clip_id']) # Director directors = [] if scene_json.get('directors') is not None: for director in scene_json.get('directors'): directors.append(director.get('name').strip()) scrape["director"] = ", ".join(directors) # Studio scrape['studio'] = {} studio_name = determine_studio_name_from_json(scene_json) if studio_name: scrape['studio']['name'] = studio_name log.debug( f"[STUDIO] {scene_json.get('serie_name')} - {scene_json.get('network_name')} - {scene_json.get('mainChannelName')} - {scene_json.get('sitename_pretty')}" ) # Performer perf = [] for actor in scene_json.get('actors'): if actor.get('gender') == "female" or NON_FEMALE: perf.append({ "name": actor.get('name').strip(), "gender": actor.get('gender') }) scrape['performers'] = perf # Tags list_tag = [] for tag in scene_json.get('categories'): if tag.get('name') is None: continue tag_name = tag.get('name') tag_name = " ".join(tag.capitalize() for tag in tag_name.split(" ")) if tag_name: list_tag.append({"name": tag.get('name')}) if FIXED_TAG: list_tag.append({"name": FIXED_TAG}) scrape['tags'] = list_tag # Image try: scrape['image'] = 'https://images03-fame.gammacdn.com/movies' + next( iter(scene_json['pictures']['nsfw']['top'].values())) except: try: scrape[ 'image'] = 'https://images03-fame.gammacdn.com/movies' + next( iter(scene_json['pictures']['sfw']['top'].values())) except: log.warning("Can't locate image.") # URL try: hostname = scene_json.get('sitename') if hostname is None: hostname = SITE # Movie if scene_json.get('movie_title'): scrape['movies'] = [{ "name": scene_json["movie_title"], "synopsis": clean_text(scene_json.get("movie_desc")), "date": scene_json.get("movie_date_created") }] log.debug(f"domain to use for movie url: {URL_DOMAIN}") if scene_json.get("url_movie_title") and scene_json.get( "movie_id"): if URL_DOMAIN and MOVIE_SITES.get(URL_DOMAIN): scrape['movies'][0][ 'url'] = f"{MOVIE_SITES[URL_DOMAIN]}/{scene_json['url_movie_title']}/{scene_json['movie_id']}" net_name = scene_json.get('network_name') if net_name: if net_name.lower() == "21 sextury": hostname = "21sextury" elif net_name.lower() == "21 naturals": hostname = "21naturals" elif net_name.lower() == 'transfixed': hostname = 'transfixed' scrape[ 'url'] = f"https://{hostname.lower()}.com/en/video/{hostname.lower()}/{scene_json['url_title']}/{scene_json['clip_id']}" except Exception as exc: log.debug(f"{exc}") if url: scrape['url'] = url #log.debug(f"{scrape}") return scrape def parse_gallery_json(gallery_json: dict, url: str = None) -> dict: """ process an api gallery dictionary and return a scraped one """ scrape = {} # Title if gallery_json.get('clip_title'): scrape['title'] = gallery_json['clip_title'].strip() elif gallery_json.get('title'): scrape['title'] = gallery_json['title'].strip() # Date scrape['date'] = gallery_json.get('date_online') or gallery_json.get('release_date') # Details scrape['details'] = clean_text(gallery_json.get('description')) # Studio Code # not yet supported in stash #if gallery_json.get('set_id'): # scrape['code'] = str(gallery_json['set_id']) # Director # not yet supported in stash #directors = [] #if gallery_json.get('directors') is not None: # for director in gallery_json.get('directors'): # directors.append(director.get('name').strip()) #scrape["director"] = ", ".join(directors) # Studio scrape['studio'] = {} studio_name = determine_studio_name_from_json(gallery_json) if studio_name: scrape['studio']['name'] = studio_name log.debug( f"[STUDIO] {gallery_json.get('serie_name')} - {gallery_json.get('network_name')} - {gallery_json.get('mainChannelName')} - {gallery_json.get('sitename_pretty')}" ) # Performer perf = [] for actor in gallery_json.get('actors'): if actor.get('gender') == "female" or NON_FEMALE: perf.append({ "name": actor.get('name').strip(), "gender": actor.get('gender') }) scrape['performers'] = perf # Tags list_tag = [] for tag in gallery_json.get('categories'): if tag.get('name') is None: continue tag_name = tag.get('name') tag_name = " ".join(tag.capitalize() for tag in tag_name.split(" ")) if tag_name: list_tag.append({"name": tag.get('name')}) if FIXED_TAG: list_tag.append({"name": FIXED_TAG}) scrape['tags'] = list_tag # URL try: hostname = gallery_json['sitename'] net_name = gallery_json['network_name'] if net_name.lower() == "21 sextury": hostname = "21sextury" elif net_name.lower() == "21 naturals": hostname = "21naturals" scrape['url'] = f"https://www.{hostname.lower()}.com/en/photo/" \ f"{gallery_json['url_title']}/{gallery_json['set_id']}" except: if url: scrape['url'] = url return scrape # # Start processing # try: USERFOLDER_PATH = re.match(r".+\.stash.", __file__).group(0) CONFIG_PATH = USERFOLDER_PATH + "config.yml" log.debug(f"Config Path: {CONFIG_PATH}") except: USERFOLDER_PATH = None CONFIG_PATH = None log.debug("No config") SITE = sys.argv[1] HEADERS = { "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0', "Origin": f"https://www.{SITE}.com", "Referer": f"https://www.{SITE}.com" } FRAGMENT = json.loads(sys.stdin.read()) SEARCH_TITLE = FRAGMENT.get("name") SCENE_ID = FRAGMENT.get("id") SCENE_TITLE = FRAGMENT.get("title") SCENE_URL = FRAGMENT.get("url") # log.trace(f"fragment: {FRAGMENT}") # ACCESS API # Check existing API keys CURRENT_TIME = datetime.datetime.now() application_id, api_key = check_config(SITE, CURRENT_TIME) # Getting new key if application_id is None: application_id, api_key = apikey_get(f"https://www.{SITE}.com/en", CURRENT_TIME) # Failed to get new key if application_id is None: sys.exit(1) api_url = f"https://tsmkfa364q-dsn.algolia.net/1/indexes/*/queries?x-algolia-application-id={application_id}&x-algolia-api-key={api_key}" #log.debug(HEADERS) #log.debug(FRAGMENT) URL_DOMAIN = None if SCENE_URL: URL_DOMAIN = re.sub(r"www\.|\.com", "", urlparse(SCENE_URL).netloc).lower() log.info(f"URL Domain: {URL_DOMAIN}") if "validName" in sys.argv and SCENE_URL is None: sys.exit(1) if SCENE_URL and SCENE_ID is None: log.debug(f"URL Scraping: {SCENE_URL}") else: log.debug(f"Stash ID: {SCENE_ID}") log.debug(f"Stash Title: {SCENE_TITLE}") if "movie" not in sys.argv and "gallery" not in sys.argv: # Get your sqlite database stash_config = graphql.configuration() DB_PATH = None if stash_config: DB_PATH = stash_config["general"]["databasePath"] if (CONFIG_PATH and DB_PATH is None): # getting your database from the config.yml if os.path.isfile(CONFIG_PATH): with open(CONFIG_PATH, encoding='utf-8') as f: for line in f: if "database: " in line: DB_PATH = line.replace("database: ", "").rstrip('\n') break log.debug(f"Database Path: {DB_PATH}") if DB_PATH: if SCENE_ID: # Get data by GraphQL database_dict = graphql.getScene(SCENE_ID) if database_dict is None: # Get data by SQlite log.warning( "GraphQL request failed, accessing database directly...") database_dict = check_db(DB_PATH, SCENE_ID) else: database_dict = database_dict["files"] log.debug(f"[DATABASE] Info: {database_dict}") else: database_dict = None log.debug("URL scraping... Ignoring database...") else: database_dict = None log.warning("Database path missing.") # Extract things url_title = None url_id = None url_domain = None if SCENE_URL: url_id = get_id_from_url(SCENE_URL) try: url_title = re.match(r".+/(.+)/\d+", SCENE_URL).group(1) log.info(f"URL_TITLE: {url_title}") except: log.warning("Can't get url_title from URL") # Filter title if SCENE_TITLE: SCENE_TITLE = re.sub(r'[-._\']', ' ', os.path.splitext(SCENE_TITLE)[0]) # Remove resolution SCENE_TITLE = re.sub( r'\sXXX|\s1080p|720p|2160p|KTR|RARBG|\scom\s|\[|]|\sHD|\sSD|', '', SCENE_TITLE) # Remove Date SCENE_TITLE = re.sub(r'\s\d{2}\s\d{2}\s\d{2}|\s\d{4}\s\d{2}\s\d{2}', '', SCENE_TITLE) log.debug(f"Title: {SCENE_TITLE}") # Time to search the API api_search = None api_json = None # sceneByName if SEARCH_TITLE: SEARCH_TITLE = SEARCH_TITLE.replace(".", " ") log.debug(f"[API] Searching for: {SEARCH_TITLE}") api_search = api_search_req("query_all_scenes", SEARCH_TITLE, api_url) final_json = None if api_search: result_search = [] for scene in api_search: scraped_json = parse_scene_json(scene) if scraped_json.get("tags"): scraped_json.pop("tags") result_search.append(scraped_json) if result_search: final_json = result_search if final_json is None: log.error("API Search finished. No results!") print(json.dumps(final_json)) sys.exit() if url_id: log.debug(f"[API] Searching using URL_ID {url_id}") api_search = api_search_req("id", url_id, api_url) if api_search: log.info(f"[API] Search gives {len(api_search)} result(s)") api_json = json_parser(api_search, 120, True) else: log.warning("[API] No result") if url_title and api_json is None: log.debug("[API] Searching using URL_TITLE") api_search = api_search_req("query_all_scenes", url_title, api_url) if api_search: log.info(f"[API] Search gives {len(api_search)} result(s)") api_json = json_parser(api_search) if SCENE_TITLE and api_json is None: log.debug("[API] Searching using STASH_TITLE") api_search = api_search_req("query_all_scenes", SCENE_TITLE, api_url) if api_search: log.info(f"[API] Search gives {len(api_search)} result(s)") api_json = json_parser(api_search) # Scraping the JSON if api_json: log.info(f"Scene found: {api_json['title']}") scraped_json = parse_scene_json(api_json, SCENE_URL) print(json.dumps(scraped_json)) else: log.error("Can't find the scene") print(json.dumps({})) sys.exit() elif "movie" in sys.argv: log.debug("Scraping movie") movie_id = get_id_from_url(SCENE_URL) if movie_id: movie_results = api_search_movie_id(movie_id, api_url) movie = movie_results.json()["results"][0].get("hits") scraped_movie = parse_movie_json(movie) #log.debug(scraped_movie) print(json.dumps(scraped_movie)) elif "gallery" in sys.argv: scraped_gallery = None if SCENE_URL: if "/video/" in SCENE_URL: log.debug("Scraping scene by URL") scene_id = get_id_from_url(SCENE_URL) api_search_response = api_search_req("id", scene_id, api_url) if api_search_response: # log.debug(f"[API] Search gives {len(api_search_response)} result(s)") # log.trace(f"api_search_response: {api_search_response}") scraped_gallery = parse_gallery_json(api_search_response[0]) else: log.debug("Scraping gallery by URL") gallery_id = get_id_from_url(SCENE_URL) if gallery_id: gallery_results = api_search_gallery_id(gallery_id, api_url) gallery = gallery_results.json()["results"][0].get("hits") if gallery: #log.debug(gallery[0]) scraped_gallery = parse_gallery_json(gallery[0]) #log.debug(scraped_gallery) elif SCENE_TITLE: log.debug("Scraping gallery by fragment") # log.debug(f"[API] Searching using SCENE_TITLE: {SCENE_TITLE}") api_search = api_search_req("query_all_photosets", SCENE_TITLE, api_url) if api_search: log.info(f"[API] Search gives {len(api_search)} result(s)") # log.trace(f"api_search: {api_search}") log.debug(f"Galleries found: {'; '.join([g['title'] for g in api_search])}") scraped_gallery = parse_gallery_json(api_search[0]) # Scraping the JSON if scraped_gallery: print(json.dumps(scraped_gallery)) else: log.error("Can't find the gallery") print(json.dumps({})) sys.exit()