# Algolia API scraper for Stash (CommunityScrapers) - scrapes scenes, movies
# and galleries from Gamma/Algolia-backed sites. Requires py_common plus the
# third-party modules requests, bs4 and lxml.
import datetime
|
|
import difflib
|
|
import json
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import sys
|
|
from configparser import ConfigParser, NoSectionError
|
|
from urllib.parse import urlparse
|
|
|
|
# to import from a parent directory we need to add that directory to the system path
csd = os.path.dirname(os.path.realpath(__file__)) # get current script directory
parent = os.path.dirname(csd) # parent directory (should be the scrapers one)
sys.path.append(
    parent
) # add parent dir to sys path so that we can import py_common from there

# third-party modules: exit with a user-facing hint instead of a traceback
try:
    from bs4 import BeautifulSoup as bs
    import requests
    import lxml
except ModuleNotFoundError:
    print(
        "You need to install the following modules 'requests', 'bs4', 'lxml'.", file=sys.stderr)
    sys.exit()

# shared helpers that live one directory up in the CommunityScrapers repo
try:
    from py_common import graphql
    from py_common import log
except ModuleNotFoundError:
    print(
        "You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)",
        file=sys.stderr)
    sys.exit()
|
|
|
|
#
# User variables
#

# File to store the Algolia API key.
STOCKAGE_FILE_APIKEY = "Algolia.ini"
# Extra tag that will be added to the scene
FIXED_TAG = ""
# Include non female performers
NON_FEMALE = True

# a list of main channels (`mainChannelName` from the API) to use as the studio
# name for a scene
MAIN_CHANNELS_AS_STUDIO_FOR_SCENE = [
    "Buttman",
    "Cock Choking Sluts",
    "Devil's Film Parodies",
    "Euro Angels",
]

# a dict with sites having movie sections
# used when populating movie urls from the scene scraper
MOVIE_SITES = {
    "devilsfilm": "https://www.devilsfilm.com/en/dvd",
    "devilstgirls": "https://www.devilstgirls.com/en/dvd",
    "diabolic": "https://www.diabolic.com/en/movie",
    "evilangel": "https://www.evilangel.com/en/movie",
    "genderx": "https://www.genderxfilms.com/en/movie",
    "girlfriendsfilms": "https://www.girlfriendsfilms.com/en/movie",
    "lewood": "https://www.lewood.com/en/movie",
    "outofthefamily": "https://www.outofthefamily.com/en/dvd",
    "peternorth": "https://www.peternorth.com/en/dvd",
    "tsfactor": "https://www.tsfactor.com/en/movie/",
    "wicked": "https://www.wicked.com/en/movie",
    "zerotolerancefilms": "https://www.zerotolerancefilms.com/en/movie",
    "3rddegreefilms": "https://www.3rddegreefilms.com/en/movie",
    "roccosiffredi": "https://www.roccosiffredi.com/en/dvd",
}

# a dict of serie (`serie_name` from the API) which should set the value
# for the studio name for a scene
SERIE_USING_OVERRIDE_AS_STUDIO_FOR_SCENE = {
    "Jonni Darkko's Stand Alone Scenes": "Jonni Darkko XXX",
    "Big Boob Angels": "BAM Visions",
    "Mick's ANAL PantyHOES": "BAM Visions",
    "Real Anal Lovers": "BAM Visions",
    "XXXmailed": "Blackmailed"
}

# a list of serie (`serie_name` from the API) which should use the sitename
# for the studio name for a scene
SERIE_USING_SITENAME_AS_STUDIO_FOR_SCENE = [
    "Evil",  # sitename_pretty: Evil Angel
    "Trans-Active",  # sitename_pretty: Evil Angel
]

# a dict of sites (`sitename_pretty` from the API) which should set the value
# for the studio name for a scene
# this is because the `serie_name` is the Movie (series) title on these sites,
# not the studio
SITES_USING_OVERRIDE_AS_STUDIO_FOR_SCENE = {
    "Adamandevepictures": "Adam & Eve Pictures",
    "AgentRedGirl": "Agent Red Girl",
    "Devils Gangbangs": "Devil's Gangbangs",
    "Devilstgirls": "Devil's Tgirls",
    "Dpfanatics": "DP Fanatics",
    "Janedoe": "Jane Doe Pictures",
    "ModernDaySins": "Modern-Day Sins",
    "Transgressivexxx": "TransgressiveXXX",
    "Hot House": "Hot House Entertainment",
    "HotHouse.com": "Hot House Entertainment",
    "1000facials": "1000 Facials",
    "Immorallive": "Immoral Live",
    "Mommyblowsbest": "Mommy Blows Best",
    "Onlyteenblowjobs": "Only Teen Blowjobs"
}

# a list of sites (`sitename_pretty` from the API) which should pick out the
# `sitename_pretty` for the studio name for a scene
# this is because the `serie_name` is the Movie (series) title on these sites,
# not the studio
SITES_USING_SITENAME_AS_STUDIO_FOR_SCENE = [
    "ChaosMen",
    "Devil's Film",
    "GenderXFilms",
    "Give Me Teens",
    "Hairy Undies",
    "Lesbian Factor",
    "Oopsie",
    "Out of the Family",
    "Rocco Siffredi",
    "Squirtalicious",
    "3rd Degree Films",
]

# a list of sites (`sitename_pretty` from the API) which should pick out the
# `network_name` for the studio name for a scene
# this is because the `serie_name` is the Movie (series) title on these sites,
# not the studio
SITES_USING_NETWORK_AS_STUDIO_FOR_SCENE = [
    "Extremepickups",  # network_name: Adult Time Originals
    "Isthisreal",  # network_name: Is This Real
    "Muses",  # network_name: Transfixed
    "Officemsconduct",  # network_name: Transfixed
    "Sabiendemonia",  # network_name: Sabien DeMonia
    "Upclosex"  # network_name: UpCloseX
]

# a list of networks (`network_name` from the API) which should pick out the
# `sitename_pretty` for the studio name for a scene
NETWORKS_USING_SITENAME_AS_STUDIO_FOR_SCENE = [
    "Fame Digital",  # this should support all sub-studios listed at https://stashdb.org/studios/cd5591a5-eb26-42fc-a406-b6969a8ef3dd
    "fistinginferno",
    "MyXXXPass",
]

# a dict of directors to use as the studio for a scene
DIRECTOR_AS_STUDIO_OVERRIDE_FOR_SCENE = {
    "Le Wood": "LeWood"
}
|
|
|
|
|
|
def clean_text(details: str) -> str:
    """
    Strip escaped backslashes from the details text, turn <br> tags into
    newlines and return the HTML-stripped plain text. Falsy input (None or
    empty string) is returned unchanged.
    """
    if not details:
        return details
    unescaped = re.sub(r"\\", "", details)
    # bs.get_text doesnt replace br's with \n, so convert them up front
    with_breaks = re.sub(r"<\s*/?br\s*/?\s*>", "\n", unescaped)
    return bs(with_breaks, features='lxml').get_text()
|
|
|
|
|
|
def check_db(database_path: str, scn_id: str) -> list:
    """
    Get scene data (size, duration, height) directly from the Stash SQLite
    database file.

    :param database_path: path to the stash database file (opened read-only)
    :param scn_id: the stash scene id to look up
    :return: a single-element list holding a dict with "size", "duration" and
        "height" keys -- matching the shape of the GraphQL "files" result that
        match_result() indexes with [0] -- or None on any failure.
        (Previously the failure path returned a truthy (None, None, None)
        tuple, which defeated the callers' `if database_dict:` checks.)
    """
    try:
        # read-only URI so the live database is never locked or mutated
        sqlite_connection = sqlite3.connect("file:" + database_path +
                                            "?mode=ro",
                                            uri=True)
        log.debug("Connected to SQLite database")
    except sqlite3.Error:
        log.warning("Fail to connect to the database")
        return None
    try:
        cursor = sqlite_connection.cursor()
        cursor.execute("SELECT size,duration,height from scenes WHERE id=?;",
                       [scn_id])
        record = cursor.fetchall()
        if not record:
            # previously this raised an unhandled IndexError
            log.warning(f"No scene found in the database for id {scn_id}")
            return None
        database = {
            "size": int(record[0][0]),
            "duration": int(record[0][1]),
            "height": str(record[0][2]),
        }
        cursor.close()
    finally:
        # always release the connection, even on an early return
        sqlite_connection.close()
    # wrapped in a list for consistency with graphql.getScene()["files"]
    return [database]
|
|
|
|
|
|
def send_request(url: str, head: dict, send_json="") -> requests.Response:
    """
    POST `send_json` to `url` and return the response, or None on failure.

    :param url: endpoint to POST to (the Algolia queries URL or a site page)
    :param head: HTTP headers dict (callers pass the module-level HEADERS)
    :param send_json: JSON-serialisable request body (Algolia query payload)
    :return: the Response when the status is 200 with a non-empty body,
        otherwise None (network errors are logged, not raised)
    """
    log.debug(f"Request URL: {url}")
    try:
        # 10s timeout so a stalled site cannot hang the scraper
        response = requests.post(url, headers=head, json=send_json, timeout=10)
    except requests.RequestException as req_error:
        log.warning(f"Requests failed: {req_error}")
        return None
    #log.debug(f"Returned URL: {response.url}")
    if response.content and response.status_code == 200:
        return response
    log.warning(f"[REQUEST] Error, Status Code: {response.status_code}")
    #print(response.text, file=open("algolia_request.html", "w", encoding='utf-8'))
    return None
|
|
|
|
|
|
# API Authentification
|
|
def apikey_get(site_url, time):
    """
    Scrape the Algolia application id and API key out of a site page and
    cache them in the ini file (see write_config / check_config).

    Returns (app_id, api_key), or (None, None) when the page could not be
    fetched or did not embed the expected `window.env` blob.
    """
    page = send_request(site_url, HEADERS)
    if page is None:
        return None, None
    env = fetch_page_json(page.text)
    if env is None:
        log.error(f"Can't retrieve Algolia API keys from page ({site_url})")
        return None, None
    app_id = env['api']['algolia']['applicationID']
    algolia_api_key = env['api']['algolia']['apiKey']
    # Write key into a file so later runs can reuse it
    write_config(time, app_id, algolia_api_key)
    log.info(f"New API keys: {algolia_api_key}")
    return app_id, algolia_api_key
|
|
|
|
|
|
def fetch_page_json(page_html):
    """
    Extract and parse the `window.env = {...};` JSON blob embedded in a
    site page.

    :param page_html: raw HTML text of the page
    :return: the parsed dict, or None when no such assignment is found

    The old pattern used an unescaped `.` (matching any character) and
    required exactly one whitespace character after the `=`; the sites do
    not guarantee that spacing, so match any amount of whitespace instead.
    """
    match = re.search(r'window\.env\s*=\s*(.+);', page_html, re.MULTILINE)
    return None if match is None else json.loads(match.group(1))
|
|
|
|
|
|
def check_config(domain, time):
    """
    Look up a cached Algolia key pair for `domain` in the ini file.

    The key is considered fresh only when it was written on the same day
    and within one hour of `time`. Returns (app_id, api_key) when fresh,
    otherwise (None, None).
    """
    if os.path.isfile(STOCKAGE_FILE_APIKEY):
        config = ConfigParser()
        config.read(STOCKAGE_FILE_APIKEY)
        try:
            time_past = datetime.datetime.strptime(
                config.get(domain, 'date'), '%Y-%m-%d %H:%M:%S.%f')
            same_day = (time - time_past).days == 0
            within_hour = time_past.hour - 1 < time.hour < time_past.hour + 1
            if within_hour and same_day:
                log.debug("Using old key")
                return (config.get(domain, 'app_id'),
                        config.get(domain, 'api_key'))
            log.info(
                f"Need new api key: [{time.hour}|{time_past.hour}|{(time-time_past).days}]"
            )
        except NoSectionError:
            # no cached entry for this site yet
            pass
    return None, None
|
|
|
|
|
|
def write_config(date, app_id, api_key):
    """Persist the freshly scraped Algolia credentials for SITE into the ini cache."""
    log.debug("Writing config!")
    cfg = ConfigParser()
    cfg.read(STOCKAGE_FILE_APIKEY)
    try:
        cfg.get(SITE, 'date')
    except NoSectionError:
        # first key for this site: create its section before setting options
        cfg.add_section(SITE)
    cfg.set(SITE, 'date', date.strftime("%Y-%m-%d %H:%M:%S.%f"))
    cfg.set(SITE, 'app_id', app_id)
    cfg.set(SITE, 'api_key', api_key)
    with open(STOCKAGE_FILE_APIKEY, 'w', encoding='utf-8') as ini_file:
        cfg.write(ini_file)
|
|
|
|
|
|
# API Search Data
|
|
def api_search_req(type_search, query, url):
    """
    Dispatch an Algolia search by kind and return the list of hits, or None.

    :param type_search: "query_all_scenes", "query_all_photosets" or "id"
    :param query: free-text query, or a clip id for "id"
    :param url: the Algolia queries endpoint
    """
    if type_search == "query_all_scenes":
        response = api_search_query("all_scenes", query, url)
    elif type_search == "query_all_photosets":
        response = api_search_query("all_photosets", query, url)
    elif type_search == "id":
        response = api_search_id(query, url)
    else:
        response = None
    if response:
        hits = response.json()["results"][0].get("hits")
        if hits:
            return hits
    return None
|
|
|
|
|
|
def api_search_id(scene_id, url):
    """Query the `all_scenes` index for an exact clip_id facet match."""
    payload = {
        "requests": [{
            "indexName": "all_scenes",
            "params": "query=&hitsPerPage=20&page=0",
            "facetFilters": [f"clip_id:{scene_id}"]
        }]
    }
    return send_request(url, HEADERS, payload)
|
|
|
|
|
|
def api_search_movie_id(m_id, url):
    """Query the `all_movies` index for an exact movie_id facet match."""
    payload = {
        "requests": [{
            "indexName": "all_movies",
            "params": "query=&hitsPerPage=20&page=0",
            "facetFilters": [f"movie_id:{m_id}"]
        }]
    }
    return send_request(url, HEADERS, payload)
|
|
|
|
def api_search_gallery_id(p_id, url):
    """Query the `all_photosets` index for an exact set_id facet match."""
    payload = {
        "requests": [{
            "indexName": "all_photosets",
            "params": "query=&hitsPerPage=20&page=0",
            # note: nested list, unlike the scene/movie facet filters
            "facetFilters": [[f"set_id:{p_id}"]],
            "facets": []
        }]
    }
    return send_request(url, HEADERS, payload)
|
|
|
|
|
|
def api_search_query(index_name, query, url):
    """Run a free-text search against the given Algolia index (max 40 hits)."""
    payload = {
        "requests": [{
            "indexName": index_name,
            "params": "query=" + query + "&hitsPerPage=40&page=0"
        }]
    }
    return send_request(url, HEADERS, payload)
|
|
|
|
|
|
# Searching Result
|
|
|
|
|
|
def json_parser(search_json, range_duration=60, single=False, scene_id=None):
    """
    Pick the best matching scene out of a list of API search hits.

    Every hit is scored by match_result(); each hit is then bucketed under
    its evidence string ("info": A = ByID/Most likely, S = size,
    D = duration, N = network/domain, R = ratio only), keeping the best
    scorer per bucket, and finally the buckets are consulted from the
    strongest evidence combination down to the weakest.

    :param search_json: list of raw scene dicts returned by the Algolia API
    :param range_duration: duration tolerance (seconds) given to match_result
    :param single: passed to match_result (relaxes the "A" criterion)
    :param scene_id: unused -- match_result receives the module-level url_id
        instead (NOTE(review): possibly meant to be passed through; confirm)
    :return: the raw API dict of the best match, or None
    """
    result_dict = {}
    # Just for not printing the full JSON in log...
    debug_dict = {}
    # dump the raw search results next to the script for offline debugging
    with open("adultime_scene_search.json", 'w',
              encoding='utf-8') as search_file:
        json.dump(search_json, search_file, ensure_ascii=False, indent=4)
    for scene in search_json:
        r_match = match_result(scene, range_duration, single, clip_id=url_id)
        if r_match["info"]:
            if result_dict.get(r_match["info"]):
                # Url should be more accurate than the title
                if r_match["url"] > result_dict[r_match["info"]]["url"]:
                    result_dict[r_match["info"]] = {
                        "title": r_match["title"],
                        "url": r_match["url"],
                        "clip_id": r_match["clip_id"],
                        "json": scene
                    }
                    debug_dict[r_match["info"]] = {
                        "title": r_match["title"],
                        "url": r_match["url"],
                        "scene": scene["title"]
                    }
                # equal url ratio: fall back to the title ratio as tiebreaker
                elif r_match["title"] > result_dict[r_match["info"]][
                        "title"] and r_match["title"] > result_dict[
                            r_match["info"]]["url"]:
                    result_dict[r_match["info"]] = {
                        "title": r_match["title"],
                        "url": r_match["url"],
                        "clip_id": r_match["clip_id"],
                        "json": scene
                    }
                    debug_dict[r_match["info"]] = {
                        "title": r_match["title"],
                        "url": r_match["url"],
                        "scene": scene["title"]
                    }
            else:
                # first hit for this evidence combination
                result_dict[r_match["info"]] = {
                    "title": r_match["title"],
                    "url": r_match["url"],
                    "clip_id": r_match["clip_id"],
                    "json": scene
                }
                debug_dict[r_match["info"]] = {
                    "title": r_match["title"],
                    "url": r_match["url"],
                    "scene": scene["title"]
                }
    # Engine whoaaaaa
    # A = ByID/Most likely | S = Size | D = Duration | N = Network | R = Only Ratio
    log.info("--- BEST RESULT ---")
    for key, item in debug_dict.items():
        log.info(
            f'[{key}] Title: {item["scene"]}; Ratio Title: {round(item["title"], 3)} - URL: {round(item["url"], 3)}'
        )
    log.info("--------------")
    # priority ladder: strongest evidence combination first; the weaker the
    # evidence, the higher the ratio threshold demanded before accepting
    if result_dict.get("ASDN"):
        return result_dict["ASDN"]["json"]
    if result_dict.get("ASD"):
        return result_dict["ASD"]["json"]
    if result_dict.get("ASN"):
        return result_dict["ASN"]["json"]
    if result_dict.get("ADN"):
        return result_dict["ADN"]["json"]
    if result_dict.get("AS"):
        return result_dict["AS"]["json"]
    if result_dict.get("AD"):
        return result_dict["AD"]["json"]
    if result_dict.get("AN"):
        if result_dict["AN"]["clip_id"] or result_dict["AN"]["title"] > 0.5 or result_dict["AN"]["url"] > 0.5:
            return result_dict["AN"]["json"]
    if result_dict.get("A"):
        if result_dict["A"]["title"] > 0.7 or result_dict["A"]["url"] > 0.7:
            return result_dict["A"]["json"]
    if result_dict.get("SDN"):
        return result_dict["SDN"]["json"]
    if result_dict.get("SD"):
        return result_dict["SD"]["json"]
    if result_dict.get("SN"):
        if result_dict["SN"]["title"] > 0.5 or result_dict["SN"]["url"] > 0.5:
            return result_dict["SN"]["json"]
    if result_dict.get("DN"):
        if result_dict["DN"]["title"] > 0.5 or result_dict["DN"]["url"] > 0.5:
            return result_dict["DN"]["json"]
    if result_dict.get("S"):
        if result_dict["S"]["title"] > 0.7 or result_dict["S"]["url"] > 0.7:
            return result_dict["S"]["json"]
    if result_dict.get("D"):
        if result_dict["D"]["title"] > 0.7 or result_dict["D"]["url"] > 0.7:
            return result_dict["D"]["json"]
    if result_dict.get("N"):
        if result_dict["N"]["title"] > 0.7 or result_dict["N"]["url"] > 0.7:
            return result_dict["N"]["json"]
    if result_dict.get("R"):
        if result_dict["R"]["title"] > 0.8 or result_dict["R"]["url"] > 0.8:
            return result_dict["R"]["json"]
    return None
|
|
|
|
|
|
def match_result(api_scene, range_duration=60, single=False, clip_id: str = None):
    """
    Score one API hit against the local scene and report which evidence
    matched.

    Reads module-level state: database_dict (local file info), URL_DOMAIN,
    SCENE_TITLE and url_title.

    :param api_scene: a single scene dict from the Algolia API
    :param range_duration: allowed duration difference in seconds
    :param single: when True, a duration match (or a >0.5 url-title ratio
        with no database info) alone earns the "A" flag
    :param clip_id: clip id extracted from the scene URL, if any
    :return: dict with "title"/"url" difflib ratios, "clip_id" bool and
        "info" -- the evidence string consumed by json_parser (A/S/D/N, or
        "R" when nothing but the ratios matched)
    """
    api_title = api_scene.get("title")
    api_duration = int(api_scene.get("length"))
    api_clip_id = str(api_scene["clip_id"])
    api_filesize = None
    match_duration = False
    match_size = False
    match_clip_id = False
    # Using database
    if database_dict:
        db_duration = int(database_dict[0]["duration"])
        db_height = str(database_dict[0]["height"])
        db_size = int(database_dict[0]["size"])
        # pick the API download size that corresponds to the local resolution
        if api_scene.get("download_file_sizes"):
            if db_height == "2160":
                api_filesize = api_scene["download_file_sizes"].get("4k")
            else:
                api_filesize = api_scene["download_file_sizes"].get(db_height +
                                                                    "p")
            if api_filesize:
                api_filesize = int(api_filesize)
        if api_filesize is None:
            api_filesize = api_scene.get("index_size")
            if api_filesize:
                api_filesize = int(api_filesize)
        if db_duration - range_duration <= api_duration <= db_duration + range_duration:
            match_duration = True
        # size must be within +/-1% of the local file size
        db_size_max = db_size + (db_size / 100)
        db_size_min = db_size - (db_size / 100)
        if api_filesize:
            if db_size_min <= api_filesize <= db_size_max:
                match_size = True
    # Post process things
    match_domain = False
    if URL_DOMAIN:
        if api_scene.get("sitename"):
            #log.debug("API Sitename: {}".format(api_scene["sitename"]))
            if api_scene["sitename"].lower() == URL_DOMAIN:
                match_domain = True
        if api_scene.get("network_name"):
            #log.debug("API Network: {}".format(api_scene["network_name"]))
            if api_scene["network_name"].lower() == URL_DOMAIN:
                match_domain = True

    # Matching ratio (difflib similarity, 0..1)
    if SCENE_TITLE:
        match_ratio_title = difflib.SequenceMatcher(None, SCENE_TITLE.lower(),
                                                    api_title.lower()).ratio()
    else:
        match_ratio_title = 0
    if url_title and api_scene.get("url_title"):
        match_ratio_title_url = difflib.SequenceMatcher(
            None, url_title.lower(), api_scene["url_title"].lower()).ratio()
    else:
        match_ratio_title_url = 0

    # Rank search result
    log.debug(
        f"[MATCH] Title: {api_title} |-RATIO-| Ratio: {round(match_ratio_title, 5)} / URL: {round(match_ratio_title_url, 5)} |-MATCH-| Duration: {match_duration}, Size: {match_size}, Domain: {match_domain}"
    )
    match_dict = {}
    match_dict["title"] = match_ratio_title
    match_dict["url"] = match_ratio_title_url

    information_used = ""
    # "A" = most likely: exact url-title match, or (in single mode) a
    # duration match / a decent url ratio when no local file info exists
    if (single and (match_duration or
                    (database_dict is None and match_ratio_title_url > 0.5))
            ) or match_ratio_title_url == 1:
        information_used += "A"
    if match_size:
        information_used += "S"
    if match_duration:
        information_used += "D"
    if match_domain:
        information_used += "N"
    if clip_id:
        if clip_id == api_clip_id:
            match_clip_id = True
    if information_used == "":
        # nothing concrete matched; only the ratios can be used
        information_used = "R"
    match_dict["info"] = information_used
    match_dict["clip_id"] = match_clip_id
    #debug("[MATCH] {} - {}".format(api_title,match_dict))
    return match_dict
|
|
|
|
|
|
def get_id_from_url(url: str) -> str:
    '''
    Get the numeric id from a valid url.

    Expects urls of the form www.example.com/.../title/id; when the last
    path segment is not purely numeric, falls back to the first /digits/
    segment anywhere in the url.

    :return: the id as a string, or None when the url is empty or holds
        no numeric segment
    '''
    if url is None or url == "":
        return None

    id_check = re.sub('.+/', '', url)
    id_from_url = None
    try:
        if id_check.isdigit():
            id_from_url = id_check
        else:
            id_from_url = re.search(r"/(\d+)/*", url).group(1)
        log.info(f"ID: {id_from_url}")
    except AttributeError:
        # re.search returned None (no digit segment) -> .group raised;
        # previously a bare except that would also have hidden real bugs
        log.warning("Can't get ID from URL")
    return id_from_url
|
|
|
|
|
|
def parse_movie_json(movie_json: dict) -> dict:
    """
    Process an api movie result (list of hits) and return a scraped movie.

    :param movie_json: the "hits" list from the all_movies index; only the
        first hit is used
    :return: dict with name, synopsis, studio, duration, date, cover images
        and director -- or an empty dict when the list is empty
    """
    scrape = {}
    try:
        studio_name = determine_studio_name_from_json(movie_json[0])
    except IndexError:
        log.debug("No movie found")
        return scrape
    scrape["synopsis"] = clean_text(movie_json[0].get("description"))
    scrape["name"] = movie_json[0].get("title")
    scrape["studio"] = {"name": studio_name}
    scrape["duration"] = movie_json[0].get("total_length")

    date_by_studio = "date_created"  # options are "date_created", "upcoming" (not always available), "last_modified"
    # dates don't seem to be accurate (modified multiple times by studio)
    # using date_created as default and we later override for each site when needed

    log.debug(
        f"Dates available: upcoming {movie_json[0].get('upcoming')} - created {movie_json[0].get('date_created')} - last modified {movie_json[0].get('last_modified')}"
    )
    studios_movie_dates = {
        "Diabolic": "last_modified",
        "Evil Angel": "date_created",
        "Wicked": "date_created",
        "Zerotolerance": "last_modified"
    }
    if studios_movie_dates.get(studio_name):
        date_by_studio = studios_movie_dates[studio_name]
    scrape["date"] = movie_json[0].get(date_by_studio)

    # BUGFIX: these two previously read the global `movie` (only defined in
    # the movie branch of the main script) instead of the movie_json param,
    # which made the function fail for any other caller
    scrape[
        "front_image"] = f"https://transform.gammacdn.com/movies{movie_json[0].get('cover_path')}_front_400x625.jpg?width=450&height=636"
    scrape[
        "back_image"] = f"https://transform.gammacdn.com/movies{movie_json[0].get('cover_path')}_back_400x625.jpg?width=450&height=636"

    directors = []
    if movie_json[0].get('directors') is not None:
        for director in movie_json[0].get('directors'):
            directors.append(director.get('name').strip())
    scrape["director"] = ", ".join(directors)
    return scrape
|
|
|
|
def determine_studio_name_from_json(some_json):
    '''
    Reusable function to determine studio name based on what was scraped.
    This can be used for scraping:
    - scene
    - gallery
    - movie

    Lookup precedence (first non-empty wins), driven by the module-level
    override tables:
    1. sitename_pretty override table / sitename / network tables
    2. network_name listed in NETWORKS_USING_SITENAME_* -> sitename_pretty
    3. mainChannelName listed in MAIN_CHANNELS_*
    4. director override table
    5. serie_name override table, else serie_name itself
    '''
    studio_name = None
    if some_json.get('sitename_pretty'):
        if some_json.get('sitename_pretty') in SITES_USING_OVERRIDE_AS_STUDIO_FOR_SCENE:
            studio_name = \
                SITES_USING_OVERRIDE_AS_STUDIO_FOR_SCENE.get(some_json.get('sitename_pretty'))
        # NOTE: `and` binds tighter than `or`, so this condition groups as
        # (site in SITES...) or (serie in SERIE...) or
        # (network_name truthy and network in NETWORKS...)
        elif some_json.get('sitename_pretty') in SITES_USING_SITENAME_AS_STUDIO_FOR_SCENE \
                or some_json.get('serie_name') in SERIE_USING_SITENAME_AS_STUDIO_FOR_SCENE \
                or some_json.get('network_name') \
                and some_json.get('network_name') in NETWORKS_USING_SITENAME_AS_STUDIO_FOR_SCENE:
            studio_name = some_json.get('sitename_pretty')
        elif some_json.get('sitename_pretty') in SITES_USING_NETWORK_AS_STUDIO_FOR_SCENE \
                and some_json.get('network_name'):
            studio_name = some_json.get('network_name')
    if not studio_name and some_json.get('network_name') and \
            some_json.get('network_name') in NETWORKS_USING_SITENAME_AS_STUDIO_FOR_SCENE:
        studio_name = some_json.get('sitename_pretty')
    if not studio_name and some_json.get('mainChannelName') and \
            some_json.get('mainChannelName') in MAIN_CHANNELS_AS_STUDIO_FOR_SCENE:
        studio_name = some_json.get('mainChannelName')
    if not studio_name and some_json.get('directors'):
        # last matching director in the override table wins
        for director in [ d.get('name').strip() for d in some_json.get('directors') ]:
            if DIRECTOR_AS_STUDIO_OVERRIDE_FOR_SCENE.get(director):
                studio_name = \
                    DIRECTOR_AS_STUDIO_OVERRIDE_FOR_SCENE.get(director)
    if not studio_name and some_json.get('serie_name'):
        if some_json.get('serie_name') in SERIE_USING_OVERRIDE_AS_STUDIO_FOR_SCENE:
            studio_name = \
                SERIE_USING_OVERRIDE_AS_STUDIO_FOR_SCENE.get(some_json.get('serie_name'))
        else:
            studio_name = some_json.get('serie_name')
    return studio_name
|
|
|
|
def parse_scene_json(scene_json, url=None):
    """
    Process an api scene dictionary and return a scraped one.

    :param scene_json: a single hit from the Algolia all_scenes index
    :param url: optional original scene URL, used as a fallback when the
        canonical URL cannot be built from the API data
    :return: dict in the Stash scene-scraper output shape
    """
    scrape = {}
    # Title
    if scene_json.get('title'):
        scrape['title'] = scene_json['title'].strip()
    # Date
    scrape['date'] = scene_json.get('release_date')
    # Details
    scrape['details'] = clean_text(scene_json.get('description'))

    # Studio Code
    if scene_json.get('clip_id'):
        scrape['code'] = str(scene_json['clip_id'])

    # Director
    directors = []
    if scene_json.get('directors') is not None:
        for director in scene_json.get('directors'):
            directors.append(director.get('name').strip())
    scrape["director"] = ", ".join(directors)

    # Studio
    scrape['studio'] = {}
    studio_name = determine_studio_name_from_json(scene_json)
    if studio_name:
        scrape['studio']['name'] = studio_name

    log.debug(
        f"[STUDIO] {scene_json.get('serie_name')} - {scene_json.get('network_name')} - {scene_json.get('mainChannelName')} - {scene_json.get('sitename_pretty')}"
    )
    # Performer
    perf = []
    for actor in scene_json.get('actors'):
        if actor.get('gender') == "female" or NON_FEMALE:
            perf.append({
                "name": actor.get('name').strip(),
                "gender": actor.get('gender')
            })
    scrape['performers'] = perf

    # Tags
    list_tag = []
    for tag in scene_json.get('categories'):
        if tag.get('name') is None:
            continue
        # title-case each word of the tag name; `word` (not `tag`) avoids
        # the previous shadowing of the loop variable
        tag_name = " ".join(
            word.capitalize() for word in tag.get('name').split(" "))
        if tag_name:
            # BUGFIX: previously appended the raw tag name, silently
            # discarding the capitalized tag_name computed above
            list_tag.append({"name": tag_name})
    if FIXED_TAG:
        list_tag.append({"name": FIXED_TAG})
    scrape['tags'] = list_tag

    # Image: prefer the nsfw top picture, fall back to sfw
    try:
        scrape['image'] = 'https://images03-fame.gammacdn.com/movies' + next(
            iter(scene_json['pictures']['nsfw']['top'].values()))
    except (KeyError, TypeError, AttributeError, StopIteration):
        try:
            scrape[
                'image'] = 'https://images03-fame.gammacdn.com/movies' + next(
                    iter(scene_json['pictures']['sfw']['top'].values()))
        except (KeyError, TypeError, AttributeError, StopIteration):
            log.warning("Can't locate image.")
    # URL
    try:
        hostname = scene_json.get('sitename')
        if hostname is None:
            hostname = SITE
        # Movie
        if scene_json.get('movie_title'):
            scrape['movies'] = [{
                "name": scene_json["movie_title"],
                "synopsis": clean_text(scene_json.get("movie_desc")),
                "date": scene_json.get("movie_date_created")
            }]
            log.debug(f"domain to use for movie url: {URL_DOMAIN}")
            if scene_json.get("url_movie_title") and scene_json.get(
                    "movie_id"):
                # only sites listed in MOVIE_SITES have a movie section
                if URL_DOMAIN and MOVIE_SITES.get(URL_DOMAIN):
                    scrape['movies'][0][
                        'url'] = f"{MOVIE_SITES[URL_DOMAIN]}/{scene_json['url_movie_title']}/{scene_json['movie_id']}"
        # a few networks use a hostname that differs from the API sitename
        net_name = scene_json.get('network_name')
        if net_name:
            if net_name.lower() == "21 sextury":
                hostname = "21sextury"
            elif net_name.lower() == "21 naturals":
                hostname = "21naturals"
            elif net_name.lower() == 'transfixed':
                hostname = 'transfixed'

        scrape[
            'url'] = f"https://{hostname.lower()}.com/en/video/{hostname.lower()}/{scene_json['url_title']}/{scene_json['clip_id']}"
    except Exception as exc:
        log.debug(f"{exc}")
        # fall back to the caller-provided URL when we cannot build one
        if url:
            scrape['url'] = url
    #log.debug(f"{scrape}")
    return scrape
|
|
|
|
def parse_gallery_json(gallery_json: dict, url: str = None) -> dict:
    """
    Process an api gallery dictionary and return a scraped one.

    :param gallery_json: a single hit from the all_photosets index (or a
        scene hit, when scraping a gallery from a /video/ URL)
    :param url: optional original URL, used as a fallback when the canonical
        URL cannot be built from the API data
    :return: dict in the Stash gallery-scraper output shape
    """
    scrape = {}
    # Title
    if gallery_json.get('clip_title'):
        scrape['title'] = gallery_json['clip_title'].strip()
    elif gallery_json.get('title'):
        scrape['title'] = gallery_json['title'].strip()
    # Date
    scrape['date'] = gallery_json.get('date_online') or gallery_json.get('release_date')
    # Details
    scrape['details'] = clean_text(gallery_json.get('description'))

    # Studio Code # not yet supported in stash
    #if gallery_json.get('set_id'):
    #    scrape['code'] = str(gallery_json['set_id'])

    # Director # not yet supported in stash
    #directors = []
    #if gallery_json.get('directors') is not None:
    #    for director in gallery_json.get('directors'):
    #        directors.append(director.get('name').strip())
    #scrape["director"] = ", ".join(directors)

    # Studio
    scrape['studio'] = {}
    studio_name = determine_studio_name_from_json(gallery_json)
    if studio_name:
        scrape['studio']['name'] = studio_name

    log.debug(
        f"[STUDIO] {gallery_json.get('serie_name')} - {gallery_json.get('network_name')} - {gallery_json.get('mainChannelName')} - {gallery_json.get('sitename_pretty')}"
    )
    # Performer
    perf = []
    for actor in gallery_json.get('actors'):
        if actor.get('gender') == "female" or NON_FEMALE:
            perf.append({
                "name": actor.get('name').strip(),
                "gender": actor.get('gender')
            })
    scrape['performers'] = perf

    # Tags
    list_tag = []
    for tag in gallery_json.get('categories'):
        if tag.get('name') is None:
            continue
        # title-case each word; `word` avoids shadowing the loop variable
        tag_name = " ".join(
            word.capitalize() for word in tag.get('name').split(" "))
        if tag_name:
            # BUGFIX: previously appended the raw tag name, silently
            # discarding the capitalized tag_name computed above
            list_tag.append({"name": tag_name})
    if FIXED_TAG:
        list_tag.append({"name": FIXED_TAG})
    scrape['tags'] = list_tag

    # URL
    try:
        hostname = gallery_json['sitename']
        net_name = gallery_json['network_name']
        # a few networks use a hostname that differs from the API sitename
        if net_name.lower() == "21 sextury":
            hostname = "21sextury"
        elif net_name.lower() == "21 naturals":
            hostname = "21naturals"
        scrape['url'] = f"https://www.{hostname.lower()}.com/en/photo/" \
                        f"{gallery_json['url_title']}/{gallery_json['set_id']}"
    except (KeyError, AttributeError):
        # missing sitename/network_name (or a None network) -> fall back
        if url:
            scrape['url'] = url
    return scrape
|
|
#
|
|
# Start processing
|
|
#
|
|
|
|
# locate the Stash user folder (".stash") from this script's own path so the
# config.yml can be found as a database-path fallback
try:
    USERFOLDER_PATH = re.match(r".+\.stash.", __file__).group(0)
    CONFIG_PATH = USERFOLDER_PATH + "config.yml"
    log.debug(f"Config Path: {CONFIG_PATH}")
except:
    USERFOLDER_PATH = None
    CONFIG_PATH = None
    log.debug("No config")

# first CLI argument is the site slug (used for the Origin/Referer headers,
# the ini section and fallback hostnames)
SITE = sys.argv[1]
HEADERS = {
    "User-Agent":
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0',
    "Origin": f"https://www.{SITE}.com",
    "Referer": f"https://www.{SITE}.com"
}

# Stash passes the scraper fragment as JSON on stdin
FRAGMENT = json.loads(sys.stdin.read())
SEARCH_TITLE = FRAGMENT.get("name")  # set for scene-by-name searches
SCENE_ID = FRAGMENT.get("id")
SCENE_TITLE = FRAGMENT.get("title")
SCENE_URL = FRAGMENT.get("url")
|
|
|
|
# log.trace(f"fragment: {FRAGMENT}")
|
|
|
|
# ACCESS API
# Check existing API keys
CURRENT_TIME = datetime.datetime.now()
application_id, api_key = check_config(SITE, CURRENT_TIME)
# Getting new key
if application_id is None:
    application_id, api_key = apikey_get(f"https://www.{SITE}.com/en",
                                         CURRENT_TIME)
    # Failed to get new key
    if application_id is None:
        sys.exit(1)
api_url = f"https://tsmkfa364q-dsn.algolia.net/1/indexes/*/queries?x-algolia-application-id={application_id}&x-algolia-api-key={api_key}"

#log.debug(HEADERS)
#log.debug(FRAGMENT)
# bare domain (no www./ .com) of the scene URL, used for domain matching
URL_DOMAIN = None
if SCENE_URL:
    URL_DOMAIN = re.sub(r"www\.|\.com", "", urlparse(SCENE_URL).netloc).lower()
    log.info(f"URL Domain: {URL_DOMAIN}")

# "validName" mode requires a URL to validate against
if "validName" in sys.argv and SCENE_URL is None:
    sys.exit(1)

if SCENE_URL and SCENE_ID is None:
    log.debug(f"URL Scraping: {SCENE_URL}")
else:
    log.debug(f"Stash ID: {SCENE_ID}")
    log.debug(f"Stash Title: {SCENE_TITLE}")
|
|
|
|
# main dispatch: default branch scrapes a scene; "movie" / "gallery" in argv
# switch to the movie or gallery scrapers
if "movie" not in sys.argv and "gallery" not in sys.argv:
    # Get your sqlite database
    stash_config = graphql.configuration()
    DB_PATH = None
    if stash_config:
        DB_PATH = stash_config["general"]["databasePath"]

    if (CONFIG_PATH and DB_PATH is None):
        # getting your database from the config.yml
        if os.path.isfile(CONFIG_PATH):
            with open(CONFIG_PATH, encoding='utf-8') as f:
                for line in f:
                    if "database: " in line:
                        DB_PATH = line.replace("database: ", "").rstrip('\n')
                        break
    log.debug(f"Database Path: {DB_PATH}")
    if DB_PATH:
        if SCENE_ID:
            # Get data by GraphQL
            database_dict = graphql.getScene(SCENE_ID)
            if database_dict is None:
                # Get data by SQlite
                log.warning(
                    "GraphQL request failed, accessing database directly...")
                database_dict = check_db(DB_PATH, SCENE_ID)
            else:
                database_dict = database_dict["files"]
            log.debug(f"[DATABASE] Info: {database_dict}")
        else:
            database_dict = None
            log.debug("URL scraping... Ignoring database...")
    else:
        database_dict = None
        log.warning("Database path missing.")

    # Extract things
    url_title = None
    url_id = None
    url_domain = None  # NOTE(review): unused -- URL_DOMAIN (uppercase) is what is read
    if SCENE_URL:
        url_id = get_id_from_url(SCENE_URL)
        try:
            # second-to-last path segment is the url-encoded title
            url_title = re.match(r".+/(.+)/\d+", SCENE_URL).group(1)
            log.info(f"URL_TITLE: {url_title}")
        except:
            log.warning("Can't get url_title from URL")

    # Filter title
    if SCENE_TITLE:
        SCENE_TITLE = re.sub(r'[-._\']', ' ', os.path.splitext(SCENE_TITLE)[0])
        # Remove resolution
        SCENE_TITLE = re.sub(
            r'\sXXX|\s1080p|720p|2160p|KTR|RARBG|\scom\s|\[|]|\sHD|\sSD|', '',
            SCENE_TITLE)
        # Remove Date
        SCENE_TITLE = re.sub(r'\s\d{2}\s\d{2}\s\d{2}|\s\d{4}\s\d{2}\s\d{2}',
                             '', SCENE_TITLE)
        log.debug(f"Title: {SCENE_TITLE}")

    # Time to search the API
    api_search = None
    api_json = None

    # sceneByName: print the whole candidate list (without tags) and exit
    if SEARCH_TITLE:
        SEARCH_TITLE = SEARCH_TITLE.replace(".", " ")
        log.debug(f"[API] Searching for: {SEARCH_TITLE}")
        api_search = api_search_req("query_all_scenes", SEARCH_TITLE, api_url)
        final_json = None
        if api_search:
            result_search = []
            for scene in api_search:
                scraped_json = parse_scene_json(scene)
                if scraped_json.get("tags"):
                    scraped_json.pop("tags")
                result_search.append(scraped_json)
            if result_search:
                final_json = result_search
        if final_json is None:
            log.error("API Search finished. No results!")
        print(json.dumps(final_json))
        sys.exit()

    # single-scene lookup, most reliable source first: id from the URL,
    # then the url title, then the (filtered) stash title
    if url_id:
        log.debug(f"[API] Searching using URL_ID {url_id}")
        api_search = api_search_req("id", url_id, api_url)
        if api_search:
            log.info(f"[API] Search gives {len(api_search)} result(s)")
            api_json = json_parser(api_search, 120, True)
        else:
            log.warning("[API] No result")
    if url_title and api_json is None:
        log.debug("[API] Searching using URL_TITLE")
        api_search = api_search_req("query_all_scenes", url_title, api_url)
        if api_search:
            log.info(f"[API] Search gives {len(api_search)} result(s)")
            api_json = json_parser(api_search)
    if SCENE_TITLE and api_json is None:
        log.debug("[API] Searching using STASH_TITLE")
        api_search = api_search_req("query_all_scenes", SCENE_TITLE, api_url)
        if api_search:
            log.info(f"[API] Search gives {len(api_search)} result(s)")
            api_json = json_parser(api_search)

    # Scraping the JSON
    if api_json:
        log.info(f"Scene found: {api_json['title']}")
        scraped_json = parse_scene_json(api_json, SCENE_URL)
        print(json.dumps(scraped_json))
    else:
        log.error("Can't find the scene")
        print(json.dumps({}))
    sys.exit()
elif "movie" in sys.argv:
    log.debug("Scraping movie")
    movie_id = get_id_from_url(SCENE_URL)
    if movie_id:
        movie_results = api_search_movie_id(movie_id, api_url)
        movie = movie_results.json()["results"][0].get("hits")
        scraped_movie = parse_movie_json(movie)
        #log.debug(scraped_movie)
        print(json.dumps(scraped_movie))
elif "gallery" in sys.argv:
    scraped_gallery = None
    if SCENE_URL:
        if "/video/" in SCENE_URL:
            # gallery attached to a scene: reuse the scene search by clip id
            log.debug("Scraping scene by URL")
            scene_id = get_id_from_url(SCENE_URL)
            api_search_response = api_search_req("id", scene_id, api_url)
            if api_search_response:
                # log.debug(f"[API] Search gives {len(api_search_response)} result(s)")
                # log.trace(f"api_search_response: {api_search_response}")
                scraped_gallery = parse_gallery_json(api_search_response[0])
        else:
            log.debug("Scraping gallery by URL")
            gallery_id = get_id_from_url(SCENE_URL)
            if gallery_id:
                gallery_results = api_search_gallery_id(gallery_id, api_url)
                gallery = gallery_results.json()["results"][0].get("hits")
                if gallery:
                    #log.debug(gallery[0])
                    scraped_gallery = parse_gallery_json(gallery[0])
                    #log.debug(scraped_gallery)
    elif SCENE_TITLE:
        log.debug("Scraping gallery by fragment")
        # log.debug(f"[API] Searching using SCENE_TITLE: {SCENE_TITLE}")
        api_search = api_search_req("query_all_photosets", SCENE_TITLE, api_url)
        if api_search:
            log.info(f"[API] Search gives {len(api_search)} result(s)")
            # log.trace(f"api_search: {api_search}")
            log.debug(f"Galleries found: {'; '.join([g['title'] for g in api_search])}")
            scraped_gallery = parse_gallery_json(api_search[0])
    # Scraping the JSON
    if scraped_gallery:
        print(json.dumps(scraped_gallery))
    else:
        log.error("Can't find the gallery")
        print(json.dumps({}))
    sys.exit()
|