# Stash performer scraper for warashi-asian-pornstars.fr
# Reads a JSON fragment on stdin, prints scraped performer JSON on stdout.
import json
import io
import sys

from datetime import datetime

# Third-party modules are imported inside try/except so that a missing
# dependency produces actionable install instructions instead of a traceback.
try:
    import requests
except ModuleNotFoundError:
    print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
    sys.exit()

try:
    from bs4 import BeautifulSoup  # requires v4.10.0 and above
except ModuleNotFoundError:
    print("You need to install the BeautifulSoup module (v4.10.0+). (https://pypi.org/project/beautifulsoup4/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install beautifulsoup4", file=sys.stderr)
    sys.exit()
|
|
|
|
|
|
def check_compat():
    """Exit with an error message unless BeautifulSoup >= 4.10.0 is installed."""
    from bs4 import __version__ as ver
    # Version strings are not always three components ("4.13", "4.12.2.2"
    # both exist); unpacking into exactly three names would raise ValueError.
    # Only the first two components matter for the 4.10 check.
    parts = ver.split('.')
    major, minor = int(parts[0]), int(parts[1])
    if major > 4 or (major == 4 and minor >= 10):
        return
    print(f'This scraper requires BeautifulSoup 4.10.0 and above. Your version: {ver}', file=sys.stderr)
    sys.exit(1)
|
|
|
|
|
|
def process_name(name):
    """Romanize circumflexed long-vowel characters and title-case the name.

    The site writes Japanese long vowels with circumflexes (e.g. 'yûko');
    Stash convention is doubled vowels ('Yuuko').
    """
    romaji_table = str.maketrans({
        'Ô': 'ou', 'ô': 'ou',
        'Û': 'uu', 'û': 'uu',
        'Î': 'ii', 'î': 'ii',
    })
    return name.translate(romaji_table).title()
|
|
|
|
|
|
def get_gender(url):
    """Infer performer gender from the profile URL; None if no marker found.

    'female' must be tested first: 'male-pornstar' is a substring of
    'female-pornstar', so the reverse order would misclassify.
    """
    for gender in ('female', 'male'):
        if f'{gender}-pornstar' in url:
            return gender
    return None
|
|
|
|
|
|
def scrape_performer(url):
    """Fetch a performer page and dispatch to the matching profile parser.

    Returns a performer dict, or None when the HTTP request fails.
    """
    # timeout prevents the scraper from hanging forever on a dead server;
    # the original call had no timeout at all.
    resp = requests.get(url, timeout=30)
    if not resp.ok:
        return None
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Performers with sparse data get a reduced "mini" profile layout.
    if soup.find('div', {'id': 'casting-profil-mini-infos'}):
        return scrape_mini_profile(soup, url)
    return scrape_full_profile(soup, url)
|
|
|
|
|
|
def scrape_mini_profile(soup, url):
    """Parse the reduced "mini" profile page layout into a performer dict.

    soup: BeautifulSoup of the performer page; url: the page URL.
    Returns the performer dict, or None when the site reports the performer
    is not in its database.
    """
    performer = {}
    # Labels that prefix each value inside the details <p> elements.
    birthdate_prefix = 'birthdate: '
    birthplace_prefix = 'birthplace: '
    measurements_prefix = 'measurements: '
    height_prefix = 'height: '

    performer['url'] = url
    if gender := get_gender(url):
        performer['gender'] = gender

    # The site serves a placeholder page for unknown performers.
    if soup.find(text=lambda t: 'pornstar is not yet in our database' in t):
        print('Performer not in database', file=sys.stderr)
        return

    if profile := soup.find('div', {'id': 'casting-profil-mini-infos'}):
        if alphabet_name := profile.find('meta', {'itemprop': 'name'}):
            name = alphabet_name.attrs['content']
            performer['name'] = process_name(name)
        if additional_name := profile.find('meta', {'itemprop': 'additionalName'}):
            # additionalName holds the Japanese spelling; exposed as an alias.
            japanese_name = additional_name.attrs['content']
            performer['aliases'] = japanese_name

    if details_node := soup.find('div', {'id': 'casting-profil-mini-infos-details'}):
        if birthdate_node := details_node.find('p', text=lambda t: birthdate_prefix in str(t)):
            birthdate_full = birthdate_node.text.split(birthdate_prefix)[1]
            if birthdate_full != 'unknown':
                # Site format e.g. 'January 2, 1990' -> ISO 'YYYY-MM-DD'.
                performer['birthdate'] = datetime.strptime(birthdate_full, '%B %d, %Y').strftime('%Y-%m-%d')
        if birthplace_node := details_node.find('p', text=lambda t: birthplace_prefix in str(t)):
            birthplace_full = birthplace_node.text.split(birthplace_prefix)[1]
            # Birthplace may be 'Region, Country'; keep only the first part.
            if ', ' in birthplace_full:
                birthplace = birthplace_full.split(', ')[0]
            else:
                birthplace = birthplace_full
            if birthplace != 'unknown':
                performer['country'] = birthplace
                if birthplace == 'Japan':
                    performer['ethnicity'] = 'asian'
        if measurements_node := details_node.find('p', text=lambda t: measurements_prefix in str(t)):
            measurements = measurements_node.text.split(measurements_prefix)[1]
            if measurements != 'unknown':
                performer['measurements'] = measurements
        if height_node := details_node.find('p', text=lambda t: height_prefix in str(t)):
            # Value presumably looks like '160 cm'; keep the leading token.
            height = height_node.text.split(height_prefix)[1].split()[0]
            if height != 'unknown':
                performer['height'] = height

    if image_node := soup.find('div', {'id': 'casting-profil-preview'}):
        image_url = image_node.find('img', {'itemprop': 'image'}).attrs['src']
        # Skip the site's default placeholder image.
        if '/WAPdB-img/par-defaut/' not in image_url:
            performer['image'] = f'http://warashi-asian-pornstars.fr{image_url}'

    return performer
|
|
|
|
|
|
def scrape_full_profile(soup, url):
    """Parse the full performer profile page layout into a performer dict.

    soup: BeautifulSoup of the performer page; url: the page URL.
    Returns the performer dict (possibly sparse if nodes are missing).
    """
    performer = {}
    measurements_prefix = 'measurements: '
    activity_prefix = 'porn/AV activity: '

    alphabet_name = None
    if alphabet_name_node := soup.find('span', {'itemprop': 'name'}):
        alphabet_name = alphabet_name_node.text

    japanese_name = None
    if additional_name := soup.find('span', {'itemprop': 'additionalName'}):
        japanese_name = additional_name.text

    # Guard: the original called process_name(None) and crashed when the
    # name node was missing; now 'name' is simply omitted.
    if alphabet_name is not None:
        performer['name'] = process_name(alphabet_name)
    performer['url'] = url
    if gender := get_gender(url):
        performer['gender'] = gender
    # og:gender meta, when present, overrides the URL-derived gender.
    if gender_node := soup.find('meta', {'property': 'og:gender'}):
        performer['gender'] = gender_node.attrs['content']
    if twitter_node := soup.find(text='official Twitter'):
        performer['twitter'] = twitter_node.parent.attrs['href']
    if birthday_node := soup.find('time', {'itemprop': 'birthDate'}):
        performer['birthdate'] = birthday_node.attrs['content']
    if height_node := soup.find('p', {'itemprop': 'height'}):
        if height_value_node := height_node.find('span', {'itemprop': 'value'}):
            performer['height'] = height_value_node.text
    if weight_node := soup.find('p', {'itemprop': 'weight'}):
        if weight_value_node := weight_node.find('span', {'itemprop': 'value'}):
            # Fixed: key was 'Weight'; every other fragment key is lowercase.
            performer['weight'] = weight_value_node.text
    if measurements_node := soup.find(text=lambda t: measurements_prefix in str(t)):
        measurements = measurements_node.text.split(measurements_prefix)[1]
        if measurements != 'unknown':
            # Reuse the value computed above instead of re-splitting.
            performer['measurements'] = measurements
    if activity_node := soup.find(text=lambda t: activity_prefix in str(t)):
        performer['career_length'] = activity_node.text.split(activity_prefix)[1].strip()

    if image_container_node := soup.find('div', {'id': 'pornostar-profil-photos-0'}):
        if image_node := image_container_node.find('img', {'itemprop': 'image'}):
            image_url = image_node.attrs['src']
            # Skip the site's default placeholder image.
            if '/WAPdB-img/par-defaut/' not in image_url:
                performer['image'] = f'http://warashi-asian-pornstars.fr{image_url}'

    # The second addressCountry node carries the performer's country
    # (the first presumably belongs to another element on the page).
    if len(country_nodes := soup.find_all('span', {'itemprop': 'addressCountry'})) > 1:
        country = country_nodes[1].text
        performer['country'] = country
        if country == 'Japan':
            performer['ethnicity'] = 'asian'

    aliases = []
    if japanese_name:
        aliases.append(japanese_name)
    if alias_node := soup.find('div', {'id': 'pornostar-profil-noms-alternatifs'}):
        for couple in alias_node.find_all('li'):
            alias = process_name(couple.text)
            # Skip the primary and Japanese names; they are not aliases.
            if alias == alphabet_name or alias == str(japanese_name):
                continue
            if alias not in aliases:
                aliases.append(alias)
    # The list is already de-duplicated above; joining it directly keeps a
    # deterministic order (the old ', '.join(set(...)) shuffled it).
    performer['aliases'] = ', '.join(aliases)

    if tags_node := soup.find('p', {'class': 'implode-tags'}):
        for tag in tags_node.find_all('a'):
            if tag.text == 'breast augmentation':
                performer['fake_tits'] = 'Y'
            if tag.text == 'tatoos':  # sic: the site spells the tag 'tatoos'
                performer['tattoos'] = 'Y'
            if tag.text == 'piercings':
                performer['piercings'] = 'Y'

    if physical_characteristics := soup.find('p', text=lambda t: 'distinctive physical characteristics' in str(t)):
        dpc = physical_characteristics.text
        if 'breast augmentation' in dpc:
            performer['fake_tits'] = 'Y'
        if 'tattoo(s)' in dpc:
            performer['tattoos'] = 'Y'
        if 'piercing(s)' in dpc:
            performer['piercings'] = 'Y'

    return performer
|
|
|
|
|
|
def search_performer(frag):
    """POST the site's search form for frag['name'] and return candidates.

    Returns a list of {'name': ..., 'url': ...} dicts, exact matches first,
    de-duplicated by URL. Empty list when the request fails.
    """
    data = {
        'recherche_critere': 'f',
        'recherche_valeur': frag['name'],
        # x/y mimic the click coordinates of the form's image submit button.
        'x': '20',
        'y': '17'
    }
    base_site = 'http://warashi-asian-pornstars.fr'
    performers = []
    # timeout prevents hanging forever on a dead server (original had none).
    resp = requests.post(f'{base_site}/en/s-12/search', data=data, timeout=30)
    if resp.ok:
        soup = BeautifulSoup(resp.text, 'html.parser')
        entries = []
        already_seen = set()  # URL de-dupe; set gives O(1) membership tests
        if exact_match := soup.find('div', {'class': 'correspondance_exacte'}):  # process exact matches first
            entries.append(exact_match)
        entries.extend(soup.find_all('div', {'class': 'resultat-pornostar'}))
        for entry in entries:
            p = {}
            if n := entry.find('span', {'class': 'correspondance-lien'}):
                name = n.parent.text.strip()
                p['name'] = process_name(name)
                p['url'] = f'{base_site}{n.parent.attrs["href"]}'
            elif len(n := entry.find_all('a')) > 1:
                # Regular result rows: the second <a> is the performer link.
                p['name'] = process_name(n[1].text.strip())
                p['url'] = f'{base_site}{n[1].attrs["href"]}'
            if p and p['url'] not in already_seen:
                performers.append(p)
                already_seen.add(p['url'])
    return performers
|
|
|
|
|
|
def main():
    """Entry point: read a JSON fragment from stdin, print scrape results.

    The scrape mode is the last command-line argument, as passed by Stash.
    """
    check_compat()
    # Re-wrap stdin as UTF-8: Windows consoles default to cp1252, which
    # cannot decode the Japanese names in the fragment.
    sys.stdin = io.TextIOWrapper(sys.stdin.detach(), encoding='utf-8')
    fragment = json.loads(sys.stdin.read())
    mode = sys.argv[-1]
    if mode == 'performerByName':
        print(json.dumps(search_performer(fragment)))
    if mode in ('performerByFragment', 'performerByURL'):
        print(json.dumps(scrape_performer(fragment['url'])))
|
|
|
|
|
|
# Run as a script (Stash invokes this file directly with the mode in argv).
if __name__ == '__main__':
    main()
|