stash
stash/config/scrapers/community/Tokyohot/Tokyohot.py (new file, 371 lines)
@@ -0,0 +1,371 @@
import base64
import json
import sys
import os
import re


# to import from a parent directory we need to add that directory to the system path
csd = os.path.dirname(os.path.realpath(__file__))  # get current script directory
parent = os.path.dirname(csd)  # parent directory (should be the scrapers one)
sys.path.append(
    parent
)  # add parent dir to sys.path so that we can import py_common from there

BASE_QUERY_MEDIA_SEARCH = "https://my.tokyo-hot.com/product/?q="
BASE_DETAIL_URL = "https://my.tokyo-hot.com"

# Japanese cup sizes run larger than US sizing; map each letter down accordingly
JAP_TO_US_BUST = {
    "A": "AA",
    "B": "A",
    "C": "B",
    "D": "C",
    "E": "D",
    "F": "DD",
    "G": "DDD",
    "H": "F",
    "I": "G",
    "J": "H",
    "K": "I",
}
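
# For instance, a hypothetical profile listing "Cup Size: E" would be stored
# as the US cup "D":
#
#   >>> JAP_TO_US_BUST.get("E")
#   'D'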

MEDIA_CONFIGURATIONS = [
    ## must contain either 1 or 2 capture groups
    ## group 1 = the code
    ## group 2 (optional) = the part number if it's a multi-part (split) scene
    r"(n\d{4})\D*_\D{2}(\d)\S*",  # multi-part N series
    r"(n\d{4})\S*",  # single part N series
    r"(k\d{4})\S*",  # single part K series
    r"(kb\d{4})\S*",  # single part KB series
]
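
# A quick sketch of how these patterns behave on filename stems (the stems
# below are illustrative, not real release names):
#
#   >>> re.search(MEDIA_CONFIGURATIONS[0], "n1234_whole_hd2").groups()
#   ('n1234', '2')
#   >>> re.search(MEDIA_CONFIGURATIONS[1], "n1234_full").groups()
#   ('n1234',)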

try:
    from py_common import log
except ModuleNotFoundError:
    print(
        "You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)",
        file=sys.stderr,
    )
    sys.exit()

try:
    import requests
except ModuleNotFoundError:
    print(
        "You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)",
        file=sys.stderr,
    )
    print(
        "If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests",
        file=sys.stderr,
    )
    sys.exit()

try:
    from bs4 import BeautifulSoup
except ModuleNotFoundError:
    print(
        "You need to install the Beautiful Soup module. (https://pypi.org/project/beautifulsoup4/)",
        file=sys.stderr,
    )
    print(
        "If you have pip (normally installed with python), run this command in a terminal (cmd): pip install beautifulsoup4",
        file=sys.stderr,
    )
    sys.exit()

class ScenePage:
    def __init__(self, scene_id, multipart, partnum, url):
        self.url = url
        self.soup = _soup_maker(self.url)
        self.scene_id = scene_id
        self.multipart = multipart
        self.partnum = partnum
        self.title = self.get_title()
        self.studio = self.get_studio()
        self.image = self.get_image()
        self.details = self.get_details()
        self.performers = self.get_performers()
        self.date = self.get_date()
        self.tags = self.get_tags()

    def get_title(self):
        title = self.scene_id
        if self.multipart:
            title = title + f" - Part {self.partnum}"
        scene_title = self.soup.find("div", {"class": "pagetitle"})
        if scene_title:
            title = title + " - " + scene_title.text.strip()
        return title

    def get_studio(self):
        info = self.soup.find("div", {"class": "infowrapper"})
        info_links = info.find_all("a")
        for link in info_links:
            if "vendor" in link.get("href"):
                return link.text
        return None

    def get_image(self):
        info = self.soup.find("video")
        if info:
            # get_image here is the module-level helper, not this method
            return get_image(info.get("poster"))
        return None

    def get_performers(self):
        performers = []
        info = self.soup.find("div", {"class": "infowrapper"})
        info_links = info.find_all("a")
        for link in info_links:
            if "cast" in link.get("href"):
                perf = TokyoHotModel(
                    model_url=BASE_DETAIL_URL + link.get("href")
                ).get_json()
                performers.append(perf)
        return performers

    def get_details(self):
        details = None
        scene_details = self.soup.find("div", {"class": "sentence"})
        if scene_details:
            details = scene_details.text.strip()
        return details

    def get_date(self):
        log.debug("Parsing scene release date")
        info_dd = self.soup.find("div", {"class": "infowrapper"}).find_all("dd")
        for dd in info_dd:
            search = re.search(r"(\d{4})/(\d{2})/(\d{2})", dd.text)
            if search:
                return f"{search[1]}-{search[2]}-{search[3]}"
        return None

    def get_tags(self):
        potential_tags = self.soup.find("div", {"class": "infowrapper"}).find_all("a")
        return [
            {"Name": a.text} for a in potential_tags if "type=play" in a.get("href")
        ]

    def get_json(self):
        return {
            "Title": self.title,
            "Details": self.details,
            "URL": self.url,
            "Date": self.date,
            "Performers": self.performers,
            "Studio": {"Name": self.studio},
            "Code": self.scene_id,
            "Image": self.image,
            "Tags": self.tags,
        }
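
# A successful scrape returns a fragment in the shape stash expects; roughly
# (all values below are placeholders, not real data):
#
#   {
#       "Title": "n1234 - Part 2 - Some Title",
#       "Date": "2020-01-31",
#       "Studio": {"Name": "Tokyo Hot"},
#       "Performers": [{"Name": "...", "Gender": "Female", ...}],
#       ...
#   }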

class TokyoHotModel:
    def __init__(self, model_url):
        self.url = model_url
        self.model_soup = _soup_maker(self.url)
        self.model_name = self.get_name()
        self.height = self.get_height()
        self.weight = self.get_weight()
        self.measurements = self.get_measurements()
        self.images = self.get_images()
        self.gender = "Female"
        self.ethnicity = "Asian"
        self.country = "JP"

    def get_info_dict(self):
        # the profile page lists attributes as <dt>/<dd> pairs inside <dl class="info">
        info_dt = self.model_soup.find("dl", {"class": "info"}).find_all("dt")
        info_dd = self.model_soup.find("dl", {"class": "info"}).find_all("dd")
        return dict(map(lambda k, v: (k.text, v.text), info_dt, info_dd))

    def get_name(self):
        name = None
        model_name = self.model_soup.find("div", {"class": "pagetitle mb0"})
        if model_name:
            name = model_name.text.strip()
        return name

    def get_height(self):
        info_dict = self.get_info_dict()
        if info_dict.get("Height"):
            # heights are listed as a range, e.g. "160cm ~ 165cm"; store the midpoint
            parse_data = re.search(r"(\d{3})cm\s~\s(\d{3})cm", info_dict.get("Height"))
            if parse_data:
                data = (int(parse_data[1]) + int(parse_data[2])) / 2
                return str(data)
        return None

    def get_weight(self):
        info_dict = self.get_info_dict()
        if info_dict.get("Weight"):
            # weight ranges assumed to be listed in kg, e.g. "45kg ~ 50kg"
            parse_data = re.search(
                r"(\d{2,3})kg\s~\s(\d{2,3})kg", info_dict.get("Weight")
            )
            if parse_data:
                data = (int(parse_data[1]) + int(parse_data[2])) / 2
                return str(data)
        return None

    def get_measurements(self):
        info_dict = self.get_info_dict()

        cup = None
        bust = None
        waist = None
        hip = None

        if info_dict.get("Cup Size"):
            parse_cup = re.search(r"^(\w)", info_dict.get("Cup Size"))
            if parse_cup:
                cup = JAP_TO_US_BUST.get(parse_cup[1].strip())

        if info_dict.get("Bust Size"):
            parse_bust = re.search(
                r"(\d{2,3})cm\s~\s(\d{2,3})cm", info_dict.get("Bust Size")
            )
            if parse_bust:
                # average the range, then convert cm to inches
                bust = round(((int(parse_bust[1]) + int(parse_bust[2])) / 2) * 0.393701)

        if info_dict.get("Waist Size"):
            parse_waist = re.search(
                r"(\d{2,3})cm\s~\s(\d{2,3})cm", info_dict.get("Waist Size")
            )
            if parse_waist:
                waist = round(
                    ((int(parse_waist[1]) + int(parse_waist[2])) / 2) * 0.393701
                )

        if info_dict.get("Hip"):
            parse_hip = re.search(r"(\d{2,3})cm\s~\s(\d{2,3})cm", info_dict.get("Hip"))
            if parse_hip:
                hip = round(((int(parse_hip[1]) + int(parse_hip[2])) / 2) * 0.393701)

        if cup and bust and waist and hip:
            return f"{bust}{cup}-{waist}-{hip}"

        return None
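
    # Worked example with made-up profile values: "Cup Size: E" maps to US "D";
    # a bust range of "86cm ~ 88cm" averages to 87 cm, and 87 * 0.393701 ≈ 34 in,
    # so the final string would be "34D-24-35" (waist/hip converted the same way).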

    def get_images(self):
        try:
            model_url = (
                self.model_soup.find("div", {"id": "profile"}).find("img").get("src")
            )
            return [get_image(model_url)]
        except AttributeError:
            # no profile image block on the page
            return None

    def get_json(self):
        return {
            "Name": self.model_name,
            "Gender": self.gender,
            "URL": self.url,
            "Ethnicity": self.ethnicity,
            "Country": self.country,
            "Height": self.height,
            "Weight": self.weight,
            "Measurements": self.measurements,
            "Images": self.images,
        }


def query(fragment, query_type):
    res = None
    media_info = None

    if query_type == "scene":
        # normalize the title into a filename-like stem, e.g. "n1234 part 2" -> "n1234_part_2"
        name = re.sub(r"\s", "_", fragment["title"]).lower()
        media_info = _extract_media_id(name)

    if media_info:
        res = scrape_scene(
            name=media_info["code"],
            multipart=media_info["multipart"],
            partnum=media_info["partnum"],
        )

    return res


def _soup_maker(url: str):
    # SSL verification is disabled below, so silence urllib3's warning
    requests.packages.urllib3.disable_warnings()
    try:
        response = requests.get(url, verify=False, timeout=(3, 6))
        soup = BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        log.error("Error retrieving specified URL")
        raise e
    return soup


def _parse_media_search(soup):
    detail_page_url = None
    detail_object = soup.find("a", {"class": "rm"})
    if detail_object:
        detail_page_url = BASE_DETAIL_URL + detail_object.get("href")
        log.info(f"Scene URL found: {detail_page_url}")
    return detail_page_url


def _extract_media_id(media_title: str, configuration: list = MEDIA_CONFIGURATIONS):
    log.info(f"Extracting Media ID for {media_title}")

    def _extract_multi_part(search_results):
        # two capture groups means the pattern also captured a part number
        if len(search_results.groups()) > 1:
            return (True, search_results[2])
        return (False, False)

    for config in configuration:
        search = re.search(pattern=config, string=media_title)
        if search:
            multipart, partnum = _extract_multi_part(search)
            scene_info = {
                "code": search[1],
                "multipart": multipart,
                "partnum": partnum,
            }
            log.info(f"Regex matched. Details: {scene_info}")
            return scene_info
    return None


def scrape_scene(name, multipart, partnum):
    search_soup = _soup_maker(BASE_QUERY_MEDIA_SEARCH + name)
    scene_url = _parse_media_search(soup=search_soup)
    if scene_url is None:
        log.info(f"Scene not found: {name}. Try another server region, e.g. Hong Kong")
        return None
    scene_page = ScenePage(
        scene_id=name, multipart=multipart, partnum=partnum, url=scene_url
    )
    return scene_page.get_json()


def get_image(image_url):
    try:
        response = requests.get(image_url, verify=False, timeout=(3, 6))
    except requests.exceptions.RequestException:
        log.error(f"Error fetching URL {image_url}")
        return None

    if response.status_code < 400:
        mime = "image/jpeg"
        encoded = base64.b64encode(response.content).decode("utf-8")
        return f"data:{mime};base64,{encoded}"

    log.info(f"Fetching {image_url} resulted in error: {response.status_code}")
    return None
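
# The returned string is a data URI that stash can ingest directly, e.g.
# (truncated, illustrative bytes only):
#
#   data:image/jpeg;base64,/9j/4AAQSkZJRg...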


def main():
    scraper_input = sys.stdin.read()
    i = json.loads(scraper_input)
    ret = {}
    if sys.argv[1] == "query":
        ret = query(i, sys.argv[2])
    output = json.dumps(ret)
    print(output)


if __name__ == "__main__":
    main()
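
# Minimal sketch of how stash drives this script (assuming the accompanying
# Tokyohot.yml passes "query" and the fragment type as arguments; the JSON
# fragment arrives on stdin):
#
#   echo '{"title": "n1234_whole_hd2.mp4"}' | python Tokyohot.py query scene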