stash
This commit is contained in:
379
stash/config/scrapers/community/Minnano-AV/Minnano-AV.py
Normal file
379
stash/config/scrapers/community/Minnano-AV/Minnano-AV.py
Normal file
@@ -0,0 +1,379 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
CURRENT_SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||
PARENT_DIR = os.path.dirname(CURRENT_SCRIPT_DIR)
|
||||
sys.path.append(PARENT_DIR)
|
||||
|
||||
try:
|
||||
import py_common.log as log
|
||||
except ModuleNotFoundError:
|
||||
print(
|
||||
"You need to download the folder 'py_common' from the community repo (CommunityScrapers/tree/master/scrapers/py_common)",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit()
|
||||
|
||||
try:
|
||||
import requests
|
||||
from lxml import etree
|
||||
except ModuleNotFoundError:
|
||||
print("You need to install dependencies from requirements.txt")
|
||||
sys.exit(1)
|
||||
|
||||
# XPath selectors for minnano-av.com pages. The Japanese literals are the
# field labels on the actress profile page (生年月日 = date of birth,
# AV出演期間 = active period, デビュー作品 = debut work, サイズ = measurements,
# 出身地 = birthplace, ブログ = blog/social links, 別名 = alias).
XPATHS = {
    "alias": "//section[@class=\"main-column details\"]/h1/text()|//span[text()='別名']/following-sibling::p/text()",
    "birthdate": "//span[text()='生年月日']/../p/a/@href",
    "career": "//span[text()='AV出演期間']/../p/text()",
    "debut": "//span[text()='デビュー作品']/../p/text()",
    "id": '//form[@class="add_favorite"]/@action',
    "image": "//div[@class='act-area']/div[@class=\"thumb\"]/img/@src",
    "instagram": ("//span[text()='ブログ']/../p/a[contains(@href,'instagram.com')]/@href"),
    "measurements": (
        "//span[text()='サイズ']/../p/a/@href|//span[text()='サイズ']/../p/text()"
    ),
    "name_kanji": '//section[@class="main-column details"]/h1/text()',
    "origin": "//span[text()='出身地']/../p/a/text()",
    "name": '//section[@class="main-column details"]/h1/span/text()',
    # Relative path: evaluated against a search-result node, not the document root.
    "search_url": '../h2[@class="ttl"]/a/@href',
    "search": '//p[@class="furi"]',
    "twitter": ("//span[text()='ブログ']/../p/a[contains(@href,'twitter.com')]/@href"),
}

# Regular expressions applied to text extracted via XPATHS (after full-width
# characters have been normalized to half-width where applicable).
REGEXES = {
    # https://regex101.com/r/9k2GXw/5
    "alias": r"(?P<kanji>[^\x29\uFF09]+?)(?P<studio>[\x28\uFF08\u3010][^\x29\uFF09\u3011]+(?:[\x29\uFF09\u3011]))?\s[\x28\uFF08](?P<katakana>\w+)?\s+/\s(?P<romanized>[a-z-A-Z ]+)?[\x29\uFF09]",
    "id": r"\d+",
    "birthdate": r"[0-9-]+",
    # https://regex101.com/r/FSqv0L/1
    "career": (r"(?P<start>\d{4})年?(?:\d+月)? ?(?:\d+)?日?[-~]? ?(?:(?P<end>\d+)?)?年?"),
    "measurements": (
        r"(?<=T)(?P<height>\d+)? / B(?P<bust>\d+)\([^=]+=(?P<cup>\w+)\) / W(?P<waist>\d+) / H(?P<hip>\d+)"
    ),
    "url": r"https://www.minnano-av.com/actress\d+.html",
}

# URL templates filled in with str.format using named placeholders.
FORMATS = {
    "image": "https://www.minnano-av.com{IMAGE_URL_FRAGMENT}",
    "url": "https://www.minnano-av.com/actress{PERFORMER_ID}.html",
}
|
||||
|
||||
|
||||
def reverse_first_last_name(performer_name):
    """Swap the order of space-separated name parts ('First Last' -> 'Last First')."""
    parts = performer_name.split(" ")
    parts.reverse()
    return " ".join(parts)
|
||||
|
||||
|
||||
def convert_to_halfwidth(input: str) -> str:
    """Convert full-width ASCII characters (U+FF01..U+FF5E) to half-width.

    Characters outside that range pass through unchanged. Uses a single
    C-level ``str.translate`` pass (half-width = full-width - 0xFEE0)
    instead of a per-character dict lookup in a Python-level join.
    """
    # NOTE(review): the parameter shadows the builtin `input`; the name is
    # kept so existing keyword callers keep working.
    translation_table = {fw: fw - 0xFEE0 for fw in range(0xFF01, 0xFF5E + 1)}
    return input.translate(translation_table)
|
||||
|
||||
|
||||
def cm_to_inches(centimeters: int) -> int:
    """Convert centimeters to whole inches.

    ``round()`` applies the same round-half-to-even rule as the previous
    ``f"{x:.0f}"`` string formatting, without the int -> str -> int round
    trip.
    """
    return round(centimeters / 2.54)
|
||||
|
||||
|
||||
def convert_bra_jp_to_us(jp_size: str) -> str:
    """
    Converts bra size from Japanese to US size.
    First it looks up the whole size in predefined chart,
    and if that fails:
    1. Band size is calculated manually.
    2. Cup size is looked up in another chart.
        1. If that fails as well, the Japanese cup size is used.
    References:
    * https://www.petitecherry.com/pages/size-guide
    * https://japanrabbit.com/blog/japanese-clothing-size-chart/
    """
    # Whole-size lookup: "<JP band cm><JP cup>" -> "<US band in><US cup>".
    predefined_conversion_chart = {
        "65A": "30AA",
        "65B": "30A",
        "65C": "30B",
        "65D": "30C",
        "65E": "30D",
        "65F": "30E",
        "70A": "32AA",
        "70B": "32A",
        "70C": "32B",
        "70D": "32C",
        "70E": "32D",
        "70F": "32E",
        "70G": "32F",
        "70H": "32F",
        "70I": "32G",
        "75A": "34AA",
        "75B": "34A",
        "75C": "34B",
        "75D": "34C",
        "75E": "34D",
        "75F": "34E",
        # NOTE(review): band 32 breaks the 75 -> 34 pattern of the rows
        # around it; possibly a typo for "34E"/"34F" — confirm against chart.
        "75G": "32E",
        "75H": "34F",
        "75I": "34G",
        "80B": "36A",
        "80C": "36B",
        "80D": "36C",
        "80E": "36D",
        "80F": "36E",
        "80G": "36E",
        "80H": "36F",
        "80I": "36G",
        "85C": "38B",
        "85D": "38C",
        "85E": "38D",
        "85F": "38E",
        "85G": "38E",
        "85H": "38F",
        "90D": "40C",
        "90E": "40D",
        "90F": "40E",
        "90G": "40E",
        "90H": "40F",
        "90I": "40G",
        # NOTE(review): neighboring rows map cup E one letter down (-> D);
        # "42C" may be a typo for "42D" — confirm against chart.
        "95E": "42C",
        "95F": "42E",
        "95G": "42E",
        "95H": "42F",
        "100E": "44D",
        "100F": "44E",
        "100G": "44E",
        "100H": "44F",
    }
    # Fallback cup-letter mapping (JP -> US).
    # NOTE(review): there is no "E" entry, so a JP E cup falls through
    # unchanged via the .get() default below — confirm that is intended.
    cup_conversion_chart = {
        "A": "AA",
        "B": "A",
        "C": "B",
        "D": "C",
        "F": "DD",
        "G": "D",
        "H": "F",
        "I": "G",
        "J": "H",
        "K": "I",
    }

    converted_size = None
    converted_size = predefined_conversion_chart.get(jp_size, None)

    if converted_size is None:
        # Fallback path: band (everything but the last character) converted
        # from cm to inches, cup letter mapped, keeping the JP letter when
        # no mapping exists.
        band_size = int(jp_size[:-1])
        cup_size = jp_size[-1]
        converted_size = (
            f"{cm_to_inches(band_size)}{cup_conversion_chart.get(cup_size, cup_size)}"
        )
    return converted_size
|
||||
|
||||
|
||||
def get_xpath_result(tree: Any, xpath_string: str) -> str | list[str] | None:
|
||||
_result = tree.xpath(xpath_string)
|
||||
if _result == []:
|
||||
return None
|
||||
elif len(_result) == 1:
|
||||
return _result[0]
|
||||
else:
|
||||
return _result
|
||||
|
||||
|
||||
def performer_by_url(url):
    """Scrape a minnano-av.com actress page and print a stash scrape JSON
    object on stdout.

    Extracts name, aliases, birthdate, measurements, career length, social
    links and portrait image. Fixes in this revision: request timeout added,
    dead `!= None` re-checks inside truthy walrus guards removed, pointless
    bare `except: pass` around an optional regex group replaced with an
    explicit None check, `str.replace`/`str.format` called as methods.
    """
    # Timeout so a stalled server cannot hang the scraper indefinitely.
    request = requests.get(url, timeout=30)
    log.debug(request.status_code)

    tree = etree.HTML(request.text)

    scrape = {}
    aliases = set()

    # Performers whose birthplace is 海外 ("overseas") are treated as
    # non-Japanese: no name reversal, no Japan-specific defaults below.
    JAPANESE = True

    if origin_result := get_xpath_result(tree, XPATHS["origin"]):
        if origin_result == "海外":
            JAPANESE = False

    if name_xpath_result := get_xpath_result(tree, XPATHS["name"]):
        # Heading format is "<katakana> / <romanized>".
        _, romanized_name = name_xpath_result.split(" / ")
        performer_name = romanized_name
        if JAPANESE:
            # The site lists Japanese names family-name first.
            performer_name = reverse_first_last_name(performer_name)
        scrape["name"] = performer_name
        aliases.add(romanized_name)

    if kanji_xpath_result := get_xpath_result(tree, XPATHS["name_kanji"]):
        # \u3010 is 【 — strip a trailing 【...】 suffix from the heading.
        if "\u3010" in kanji_xpath_result:
            kanji_name, _ = kanji_xpath_result.split("\u3010")
        else:
            kanji_name = kanji_xpath_result
        if kanji_name != "":
            aliases.add(kanji_name)
        else:
            log.debug("Kanji name XPath matched, but no value found.")

    if aliases_xpath_result := get_xpath_result(tree, XPATHS["alias"]):
        # NOTE(review): if the XPath yields exactly one string,
        # get_xpath_result unwraps it and this loop iterates characters —
        # confirm the alias selector always matches multiple nodes.
        for alias in aliases_xpath_result:
            if match := re.match(REGEXES["alias"], alias):
                aliases.add(match.group("kanji"))
                # "romanized" is an optional group: it is None (not an
                # exception) when absent, so the old try/except was dead code.
                if (romanized := match.group("romanized")) is not None:
                    aliases.add(romanized)

    if favorite_form_url := get_xpath_result(tree, XPATHS["id"]):
        if match := re.search(REGEXES["id"], favorite_form_url):
            # Numeric id from the favorite-form action builds the canonical URL.
            scrape["url"] = FORMATS["url"].format(PERFORMER_ID=match[0])
        else:
            log.debug("URL XPath matched, but no value found.")

    # The walrus guards are already truthy checks, so the former `!= None`
    # re-checks inside these branches could never be false.
    if twitter_url_result := get_xpath_result(tree, XPATHS["twitter"]):
        scrape["twitter"] = twitter_url_result

    if instagram_url_result := get_xpath_result(tree, XPATHS["instagram"]):
        scrape["instagram"] = instagram_url_result

    if birthdate_result := get_xpath_result(tree, XPATHS["birthdate"]):
        if match := re.search(
            REGEXES["birthdate"], convert_to_halfwidth(birthdate_result)
        ):
            scrape["birthdate"] = match[0]
        else:
            log.debug("Birthday XPath matched, but no value found.")

    if measurements_result := get_xpath_result(tree, XPATHS["measurements"]):
        # The size line is split across several text/link nodes; join first.
        combined = "".join(measurements_result)
        if match := re.search(REGEXES["measurements"], convert_to_halfwidth(combined)):
            waist_in_inches, hip_in_inches = [
                cm_to_inches(int(measurement))
                for measurement in [match["waist"], match["hip"]]
            ]

            bra_size = convert_bra_jp_to_us(f'{match["bust"]}{match["cup"]}')

            scrape["measurements"] = f"{bra_size}-{waist_in_inches}-{hip_in_inches}"
            if match["height"] is not None:
                # Height is kept as scraped (centimeters string).
                scrape["height"] = match["height"]
        else:
            log.debug("Measurements XPath matched, but no value found.")

    if career_result := get_xpath_result(tree, XPATHS["career"]):
        clean_career_result = convert_to_halfwidth(career_result).replace(" ", "")
        if match := re.match(REGEXES["career"], clean_career_result):
            start = match["start"] + "-" if match["start"] is not None else ""
            end = match["end"] if match["end"] is not None else ""
            scrape["career_length"] = start + end
        else:
            log.debug("Career debut XPath matched, but no value found.")

    elif debut_result := get_xpath_result(tree, XPATHS["debut"]):
        # No explicit active period — fall back to the debut-work year.
        if match := re.search(REGEXES["career"], convert_to_halfwidth(debut_result)):
            start = match["start"] if match["start"] is not None else ""
            end = match["end"] if match["end"] is not None else ""
            scrape["career_length"] = f"{start}-{end}"
        else:
            log.debug("Career debut XPath matched, but no value found.")

    if image_result := get_xpath_result(tree, XPATHS["image"]):
        # Drop the "?new" suffix the site appends to some image URLs.
        clean_url_fragment = image_result.replace("?new", "")
        if clean_url_fragment != "":
            scrape["image"] = FORMATS["image"].format(
                IMAGE_URL_FRAGMENT=clean_url_fragment
            )
        else:
            log.debug("Image XPath matched, but no value found.")

    aliases.discard(None)  # optional regex groups may have contributed None
    scrape["aliases"] = ", ".join(sorted(aliases))
    if JAPANESE:
        scrape["country"] = "Japan"
        scrape["ethnicity"] = "Asian"
        scrape["hair_color"] = "Black"
        scrape["eye_color"] = "Brown"
        scrape["gender"] = "Female"
    print(json.dumps(scrape))
|
||||
|
||||
|
||||
def performer_by_name(name: str, retry=True) -> None:
    """Search minnano-av.com for performers matching ``name`` and print the
    results as a JSON list on stdout.

    If nothing matches and ``retry`` is true, the search is retried once with
    the first/last name order reversed.

    Fixes in this revision: the retry branch now returns after the recursive
    call (previously the outer call fell through and printed a second, empty
    JSON list), the search term is URL-encoded via ``params``, and the
    request has a timeout.
    """
    result = requests.get(
        "https://www.minnano-av.com/search_result.php",
        params={"search_scope": "actress", "search_word": name},
        timeout=30,
    )
    tree = etree.HTML(result.text)

    performer_list = []

    # An exact single match redirects straight to an actress detail page.
    if re.search(REGEXES["url"], result.url):
        performer_list.append({"name": name, "url": result.url})
    elif search_result := get_xpath_result(tree, XPATHS["search"]):
        for node in search_result:
            performer = {}
            node_value = node.text
            # Result rows read "<katakana> / <romanized>"; skip anything else.
            if "/" not in node_value:
                continue
            _, romanized_name = node_value.split(" / ")
            performer["name"] = romanized_name
            if url_result := get_xpath_result(node, XPATHS["search_url"]):
                url = ""
                if match := re.search(REGEXES["id"], url_result):
                    url = FORMATS["url"].format(PERFORMER_ID=match[0])
                performer["url"] = url
            performer_list.append(performer)
    elif retry:
        # The recursive call prints its own result; return so this frame
        # does not also print an empty list afterwards.
        performer_by_name(reverse_first_last_name(name), retry=False)
        return
    else:
        performer_list.append({"name": "No performer found"})

    print(json.dumps(performer_list))
|
||||
|
||||
|
||||
def main():
    """Entry point: read a stash scraper fragment from stdin and dispatch to
    performer_by_url or performer_by_name based on the CLI mode argument."""
    if len(sys.argv) == 1:
        log.error("No arguments")
        sys.exit(1)

    raw_payload = sys.stdin.read()
    fragment = json.loads(raw_payload)
    url = fragment.get("url", None)
    name = fragment.get("name", None)

    if "performer_by_url" in sys.argv:
        log.debug("Processing performer by URL")
        log.debug(raw_payload)
        if url:
            performer_by_url(url)
        else:
            log.error("Missing URL")
    elif "performer_by_name" in sys.argv:
        log.debug("Processing performer by name")
        log.debug(raw_payload)
        if name:
            performer_by_name(name)
        else:
            log.error("Missing name")
    else:
        log.error("No argument processed")
        log.debug(raw_payload)
|
||||
|
||||
|
||||
# Script entry point: the broad catch at this outermost boundary reports any
# uncaught failure through py_common.log instead of a raw traceback.
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # NOTE(review): the process still exits with status 0 after logging —
        # confirm stash treats the logged error / empty output as a failure.
        log.error(e)
|
||||
Reference in New Issue
Block a user