# compose-projects-arr/stash/config/scrapers/community/Minnano-AV/Minnano-AV.py

import json
import os
import re
import sys
from typing import Any

# Make the scrapers directory (the parent of this script's folder) importable
# so that the shared ``py_common`` package can be found.
CURRENT_SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
PARENT_DIR = os.path.dirname(CURRENT_SCRIPT_DIR)
sys.path.append(PARENT_DIR)

try:
    import py_common.log as log
except ModuleNotFoundError:
    print(
        "You need to download the folder 'py_common' from the community repo (CommunityScrapers/tree/master/scrapers/py_common)",
        file=sys.stderr,
    )
    # A missing dependency is a failure: exit non-zero, like the handler below.
    sys.exit(1)

try:
    import requests
    from lxml import etree
except ModuleNotFoundError:
    # Report on stderr so stdout stays reserved for the JSON this script prints.
    print("You need to install dependencies from requirements.txt", file=sys.stderr)
    sys.exit(1)

# XPath expressions evaluated through get_xpath_result().
# The Japanese literals are the field labels matched on the page:
#   別名 = alias, 生年月日 = date of birth, AV出演期間 = AV career period,
#   デビュー作品 = debut work, ブログ = blog, サイズ = size, 出身地 = birthplace.
# "search_url" is a relative expression; it is evaluated against an individual
# node returned by the "search" expression rather than against the whole tree.
XPATHS = {
    "alias": "//section[@class=\"main-column details\"]/h1/text()|//span[text()='別名']/following-sibling::p/text()",
    "birthdate": "//span[text()='生年月日']/../p/a/@href",
    "career": "//span[text()='AV出演期間']/../p/text()",
    "debut": "//span[text()='デビュー作品']/../p/text()",
    "id": '//form[@class="add_favorite"]/@action',
    "image": "//div[@class='act-area']/div[@class=\"thumb\"]/img/@src",
    "instagram": ("//span[text()='ブログ']/../p/a[contains(@href,'instagram.com')]/@href"),
    "measurements": (
        "//span[text()='サイズ']/../p/a/@href|//span[text()='サイズ']/../p/text()"
    ),
    "name_kanji": '//section[@class="main-column details"]/h1/text()',
    "origin": "//span[text()='出身地']/../p/a/text()",
    "name": '//section[@class="main-column details"]/h1/span/text()',
    "search_url": '../h2[@class="ttl"]/a/@href',
    "search": '//p[@class="furi"]',
    "twitter": ("//span[text()='ブログ']/../p/a[contains(@href,'twitter.com')]/@href"),
}

# Regular expressions applied to the values returned by the XPath queries.
REGEXES = {
    # https://regex101.com/r/9k2GXw/5
    # \x28/\x29 are ASCII parentheses, \uFF08/\uFF09 their full-width forms,
    # \u3010/\u3011 are the brackets 【 and 】.
    "alias": r"(?P<kanji>[^\x29\uFF09]+?)(?P<studio>[\x28\uFF08\u3010][^\x29\uFF09\u3011]+(?:[\x29\uFF09\u3011]))?\s[\x28\uFF08](?P<katakana>\w+)?\s+/\s(?P<romanized>[a-z-A-Z ]+)?[\x29\uFF09]",
    # First run of digits; used to pull the performer ID out of a URL.
    "id": r"\d+",
    "birthdate": r"[0-9-]+",
    # https://regex101.com/r/FSqv0L/1
    "career": (r"(?P<start>\d{4})年?(?:\d+月)? ?(?:\d+)?日?[-~]? ?(?:(?P<end>\d+)?)?年?"),
    # Height (optional, preceded by "T"), bust, cup, waist and hip.
    "measurements": (
        r"(?<=T)(?P<height>\d+)? / B(?P<bust>\d+)\([^=]+=(?P<cup>\w+)\) / W(?P<waist>\d+) / H(?P<hip>\d+)"
    ),
    # NOTE(review): the dots in this pattern are unescaped, so each "." matches
    # any character — confirm whether a stricter match is wanted.
    "url": r"https://www.minnano-av.com/actress\d+.html",
}

# str.format() templates for building absolute URLs on the site.
FORMATS = {
    "image": "https://www.minnano-av.com{IMAGE_URL_FRAGMENT}",
    "url": "https://www.minnano-av.com/actress{PERFORMER_ID}.html",
}


def reverse_first_last_name(performer_name):
    """Return *performer_name* with its space-separated parts in reverse order."""
    name_parts = performer_name.split(" ")
    name_parts.reverse()
    return " ".join(name_parts)


def convert_to_halfwidth(input: str) -> str:
    """Replace full-width forms (U+FF01..U+FF5E) with their ASCII counterparts.

    Every item obtained by iterating *input* is looked up individually; items
    that are not a single full-width character are emitted unchanged.  Because
    of that, an iterable of strings is joined into one string as well.
    """
    # A full-width form sits exactly 0xFEE0 code points above its ASCII twin.
    offset = 0xFEE0
    halfwidth_for = {}
    for code_point in range(0xFF01, 0xFF5F):
        halfwidth_for[chr(code_point)] = chr(code_point - offset)

    pieces = [halfwidth_for.get(piece, piece) for piece in input]
    return "".join(pieces)


def cm_to_inches(centimeters: int) -> int:
    """Convert a length in centimetres to a whole number of inches.

    The quotient is rounded to the nearest integer (ties go to the even one).
    """
    inches = centimeters / 2.54
    return round(inches)


def convert_bra_jp_to_us(jp_size: str) -> str:
    """
    Converts bra size from Japanese to US size.
    First it looks up the whole size in predefined chart,
    and if that fails:
        1. Band size is calculated manually.
        2. Cup size is looked up in another chart.
            1. If that fails as well, the Japanese cup size is used.

    ``jp_size`` is the band size in centimetres immediately followed by a
    one-character cup, e.g. ``"75G"``.  A ``ValueError`` is raised when the
    size is not in the chart and everything before the last character is not
    an integer.

    References:
        * https://www.petitecherry.com/pages/size-guide
        * https://japanrabbit.com/blog/japanese-clothing-size-chart/
    """
    predefined_conversion_chart = {
        "65A": "30AA",
        "65B": "30A",
        "65C": "30B",
        "65D": "30C",
        "65E": "30D",
        "65F": "30E",
        "70A": "32AA",
        "70B": "32A",
        "70C": "32B",
        "70D": "32C",
        "70E": "32D",
        "70F": "32E",
        "70G": "32F",
        "70H": "32F",
        "70I": "32G",
        "75A": "34AA",
        "75B": "34A",
        "75C": "34B",
        "75D": "34C",
        "75E": "34D",
        "75F": "34E",
        # Was "32E": every other 75 cm band in this chart maps to a 34 band.
        "75G": "34E",
        "75H": "34F",
        "75I": "34G",
        "80B": "36A",
        "80C": "36B",
        "80D": "36C",
        "80E": "36D",
        "80F": "36E",
        "80G": "36E",
        "80H": "36F",
        "80I": "36G",
        "85C": "38B",
        "85D": "38C",
        "85E": "38D",
        "85F": "38E",
        "85G": "38E",
        "85H": "38F",
        "90D": "40C",
        "90E": "40D",
        "90F": "40E",
        "90G": "40E",
        "90H": "40F",
        "90I": "40G",
        # NOTE(review): "42C" breaks the E -> D pattern of the other bands;
        # confirm against the size guides before changing it.
        "95E": "42C",
        "95F": "42E",
        "95G": "42E",
        "95H": "42F",
        "95I": "42G",
        "100E": "44D",
        "100F": "44E",
        "100G": "44E",
        "100H": "44F",
    }
    cup_conversion_chart = {
        "A": "AA",
        "B": "A",
        "C": "B",
        "D": "C",
        "F": "DD",
        "G": "D",
        "H": "F",
        "I": "G",
        "J": "H",
        "K": "I",
    }

    converted_size = predefined_conversion_chart.get(jp_size)
    if converted_size is not None:
        return converted_size

    # Not in the chart: convert the band from cm to inches and translate the
    # cup letter, keeping the Japanese letter when it has no mapping.
    band_size = int(jp_size[:-1])
    cup_size = jp_size[-1]
    return f"{cm_to_inches(band_size)}{cup_conversion_chart.get(cup_size, cup_size)}"


def get_xpath_result(tree: Any, xpath_string: str) -> str | list[str] | None:
    _result = tree.xpath(xpath_string)
    if _result == []:
        return None
    elif len(_result) == 1:
        return _result[0]
    else:
        return _result


def performer_by_url(url):
    """Fetch a performer page and print the scraped fields as JSON.

    The page at *url* is downloaded with ``requests``, parsed with lxml and
    queried with the expressions in ``XPATHS``.  A field whose XPath matches
    but whose value cannot be parsed is skipped with a debug log line.  The
    resulting dict is written to stdout with ``json.dumps``; nothing is
    returned.
    """
    request = requests.get(url)
    log.debug(request.status_code)

    tree = etree.HTML(request.text)

    scrape = {}
    aliases = set()

    # An origin of "海外" (overseas) marks a non-Japanese performer: the name
    # order is kept as-is and the Japan-specific defaults below are skipped.
    is_japanese = get_xpath_result(tree, XPATHS["origin"]) != "海外"

    if name_xpath_result := get_xpath_result(tree, XPATHS["name"]):
        if isinstance(name_xpath_result, list):
            # Several text nodes matched; use the first instead of failing on
            # list.split().
            name_xpath_result = name_xpath_result[0]
        # The romanized name follows " / ".  rpartition tolerates a missing or
        # repeated separator, where tuple-unpacking a split() would raise; with
        # no separator the whole text is used.
        romanized_name = name_xpath_result.rpartition(" / ")[2]
        performer_name = romanized_name
        if is_japanese:
            performer_name = reverse_first_last_name(performer_name)
        scrape["name"] = performer_name
        aliases.add(romanized_name)

    if kanji_xpath_result := get_xpath_result(tree, XPATHS["name_kanji"]):
        if isinstance(kanji_xpath_result, list):
            # A list is unhashable and cannot go into the alias set; keep the
            # first text node only.
            kanji_xpath_result = kanji_xpath_result[0]
        # Keep only the text before the first 【 (\u3010), if there is one.
        kanji_name = kanji_xpath_result.partition("\u3010")[0]
        if kanji_name != "":
            aliases.add(kanji_name)
        else:
            log.debug("Kanji name XPath matched, but no value found.")

    if aliases_xpath_result := get_xpath_result(tree, XPATHS["alias"]):
        # A lone match comes back as a bare string; wrap it so the loop walks
        # alias strings rather than individual characters.
        if isinstance(aliases_xpath_result, str):
            aliases_xpath_result = [aliases_xpath_result]
        for alias in aliases_xpath_result:
            if match := re.match(REGEXES["alias"], alias):
                aliases.add(match.group("kanji"))
                # Optional group: yields None when absent, which is discarded
                # before the aliases are joined.
                aliases.add(match.group("romanized"))

    if favorite_form_url := get_xpath_result(tree, XPATHS["id"]):
        if match := re.search(REGEXES["id"], favorite_form_url):
            scrape["url"] = FORMATS["url"].format(PERFORMER_ID=match[0])
        else:
            log.debug("URL XPath matched, but no value found.")

    # The walrus test already guarantees a non-empty value for these two.
    if twitter_url_result := get_xpath_result(tree, XPATHS["twitter"]):
        scrape["twitter"] = twitter_url_result

    if instagram_url_result := get_xpath_result(tree, XPATHS["instagram"]):
        scrape["instagram"] = instagram_url_result

    if birthdate_result := get_xpath_result(tree, XPATHS["birthdate"]):
        if match := re.search(
            REGEXES["birthdate"], convert_to_halfwidth(birthdate_result)
        ):
            scrape["birthdate"] = match[0]
        else:
            log.debug("Birthday XPath matched, but no value found.")

    if measurements_result := get_xpath_result(tree, XPATHS["measurements"]):
        # The query mixes link hrefs and text nodes; joining them produces the
        # single string the measurements regex expects.
        combined = "".join(measurements_result)
        if match := re.search(REGEXES["measurements"], convert_to_halfwidth(combined)):
            waist_in_inches = cm_to_inches(int(match["waist"]))
            hip_in_inches = cm_to_inches(int(match["hip"]))
            bra_size = convert_bra_jp_to_us(f'{match["bust"]}{match["cup"]}')

            scrape["measurements"] = f"{bra_size}-{waist_in_inches}-{hip_in_inches}"
            if match["height"] is not None:
                scrape["height"] = match["height"]
        else:
            log.debug("Measurements XPath matched, but no value found.")

    # Career length is "<start>-<end>", with an empty end when none was found.
    # The debut field is only consulted when the career field is absent.
    if career_result := get_xpath_result(tree, XPATHS["career"]):
        clean_career_result = convert_to_halfwidth(career_result).replace(" ", "")
        if match := re.match(REGEXES["career"], clean_career_result):
            scrape["career_length"] = f'{match["start"]}-{match["end"] or ""}'
        else:
            log.debug("Career debut XPath matched, but no value found.")

    elif debut_result := get_xpath_result(tree, XPATHS["debut"]):
        if match := re.search(REGEXES["career"], convert_to_halfwidth(debut_result)):
            scrape["career_length"] = f'{match["start"]}-{match["end"] or ""}'
        else:
            log.debug("Career debut XPath matched, but no value found.")

    if image_result := get_xpath_result(tree, XPATHS["image"]):
        clean_url_fragment = image_result.replace("?new", "")
        if clean_url_fragment != "":
            scrape["image"] = FORMATS["image"].format(
                IMAGE_URL_FRAGMENT=clean_url_fragment
            )
        else:
            log.debug("Image XPath matched, but no value found.")

    # Optional regex groups may have contributed None.
    aliases.discard(None)
    scrape["aliases"] = ", ".join(sorted(aliases))
    if is_japanese:
        scrape["country"] = "Japan"
        scrape["ethnicity"] = "Asian"
        scrape["hair_color"] = "Black"
        scrape["eye_color"] = "Brown"
    scrape["gender"] = "Female"
    print(json.dumps(scrape))


def performer_by_name(name: str, retry=True) -> None:
    """Search the site for *name* and print the matching performers as JSON.

    Exactly one JSON list is written to stdout:

    * if the final response URL is itself a performer page, a single entry
      with the searched name and that URL;
    * otherwise one entry per search-result node whose text contains " / ";
    * if nothing matched and *retry* is true, the search is repeated once with
      the name parts reversed, and only that second attempt prints;
    * otherwise a single ``{"name": "No performer found"}`` entry.
    """
    # Let requests build the query string so that characters such as "&", "#"
    # or "+" in the name are encoded instead of corrupting the URL.
    result = requests.get(
        "https://www.minnano-av.com/search_result.php",
        params={"search_scope": "actress", "search_word": name},
    )
    tree = etree.HTML(result.text)

    performer_list = []

    if re.search(REGEXES["url"], result.url):
        performer_list.append({"name": name, "url": result.url})
    # Compare against None explicitly: a single matched lxml element without
    # children is falsy, so a plain truth test would skip a valid result.
    elif (search_result := get_xpath_result(tree, XPATHS["search"])) is not None:
        if not isinstance(search_result, list):
            # A lone match is returned bare; iterating an element would walk
            # its children instead of the result itself.
            search_result = [search_result]
        for node in search_result:
            node_value = node.text or ""
            # Check for the exact separator that is split on below.
            if " / " not in node_value:
                continue
            performer = {"name": node_value.rpartition(" / ")[2]}
            if url_result := get_xpath_result(node, XPATHS["search_url"]):
                url = ""
                if match := re.search(REGEXES["id"], url_result):
                    url = FORMATS["url"].format(PERFORMER_ID=match[0])
                performer["url"] = url
            performer_list.append(performer)
    elif retry:
        # The retry prints its own result; return so this call does not emit
        # a second JSON document ("[]") after it.
        performer_by_name(reverse_first_last_name(name), retry=False)
        return
    else:
        performer_list.append({"name": "No performer found"})

    print(json.dumps(performer_list))


def main():
    """Dispatch to the scraper action named on the command line.

    The JSON document on stdin supplies the ``url`` and ``name`` inputs.  The
    action is chosen by looking for ``performer_by_url`` or
    ``performer_by_name`` among the command-line arguments; with no arguments
    at all the script logs an error and exits with status 1.
    """
    if len(sys.argv) == 1:
        log.error("No arguments")
        sys.exit(1)

    raw_input = sys.stdin.read()
    payload = json.loads(raw_input)
    url, name = payload.get("url", None), payload.get("name", None)

    if "performer_by_url" in sys.argv:
        log.debug("Processing performer by URL")
        log.debug(raw_input)
        if not url:
            log.error("Missing URL")
            return
        performer_by_url(url)
        return

    if "performer_by_name" in sys.argv:
        log.debug("Processing performer by name")
        log.debug(raw_input)
        if not name:
            log.error("Missing name")
            return
        performer_by_name(name)
        return

    log.error("No argument processed")
    log.debug(raw_input)


# Script entry point: run main() and send any uncaught exception to the
# py_common logger instead of letting the traceback propagate.
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        log.error(e)