stash

2025-10-09 20:05:31 -03:00
parent ed22ef22bc
commit 0a5f88d75a
1442 changed files with 101562 additions and 0 deletions
--- a/stash/config/scrapers/community/WAPdB/WAPdB.py
+++ b/stash/config/scrapers/community/WAPdB/WAPdB.py
@@ -0,0 +1,245 @@
+import json
+import io
+import sys
+
+from datetime import datetime
+
+try:
+    import requests
+except ModuleNotFoundError:
+    print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
+    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
+    sys.exit()
+
+try:
+    from bs4 import BeautifulSoup # requires v4.10.0 and above
+except ModuleNotFoundError:
+    print("You need to install the BeautifulSoup module (v4.10.0+). (https://pypi.org/project/beautifulsoup4/)", file=sys.stderr)
+    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install beautifulsoup4", file=sys.stderr)
+    sys.exit()
+    
+    
+def check_compat():
+    """Exit with status 1 when the installed BeautifulSoup is older than 4.10.
+
+    Only the major and minor components of ``bs4.__version__`` are examined,
+    so the check does not depend on the version string having exactly three
+    dot-separated parts. If those components are not plain integers the
+    check is skipped with a warning on stderr instead of crashing.
+    """
+    from bs4 import __version__ as ver
+    parts = ver.split('.')
+    try:
+        major = int(parts[0])
+        # A bare "5" style version has no minor component; treat it as 0.
+        minor = int(parts[1]) if len(parts) > 1 else 0
+    except ValueError:
+        print(f'Could not parse BeautifulSoup version "{ver}"; skipping the version check', file=sys.stderr)
+        return
+    if (major, minor) >= (4, 10):
+        return
+    print(f'This scraper requires BeautifulSoup 4.10.0 and above. Your version: {ver}', file=sys.stderr)
+    sys.exit(1)
+
+
+def process_name(name):
+    """Return *name* with circumflexed vowels spelled out, in title case.
+
+    Each of ``ô``/``Ô``, ``û``/``Û`` and ``î``/``Î`` is replaced by the
+    two-letter sequence ``ou``, ``uu`` and ``ii`` respectively, after which
+    the whole string is title-cased with ``str.title()``.
+    """
+    long_vowels = str.maketrans({
+        'Ô': 'ou',
+        'ô': 'ou',
+        'û': 'uu',
+        'Û': 'uu',
+        'î': 'ii',
+        'Î': 'ii',
+    })
+    # None of the replacement strings contains a mapped character, so a
+    # single translate pass is equivalent to applying the replacements in turn.
+    spelled_out = name.translate(long_vowels)
+    return spelled_out.title()
+
+
+def get_gender(url):
+    """Return ``'female'`` or ``'male'`` based on markers in *url*, else ``None``."""
+    # 'female-pornstar' contains 'male-pornstar', so it has to be tested first.
+    markers = (
+        ('female-pornstar', 'female'),
+        ('male-pornstar', 'male'),
+    )
+    for marker, gender in markers:
+        if marker in url:
+            return gender
+    return None
+
+
+def scrape_performer(url, timeout=30):
+    """Download a performer page and parse it with the matching profile parser.
+
+    Pages containing a ``div#casting-profil-mini-infos`` element are handed to
+    ``scrape_mini_profile``; every other page goes to ``scrape_full_profile``.
+
+    Args:
+        url: Address of the performer page.
+        timeout: Seconds to wait for the server before giving up, so a stalled
+            connection cannot hang the scraper indefinitely.
+
+    Returns:
+        The performer dict produced by the selected parser, or ``None`` when
+        the request fails or the server answers with an error status (the
+        reason is written to stderr).
+    """
+    try:
+        resp = requests.get(url, timeout=timeout)
+    except requests.RequestException as err:
+        print(f'Request to {url} failed: {err}', file=sys.stderr)
+        return None
+    if not resp.ok:
+        print(f'Request to {url} failed with HTTP status {resp.status_code}', file=sys.stderr)
+        return None
+
+    soup = BeautifulSoup(resp.text, 'html.parser')
+    if soup.find('div', {'id': 'casting-profil-mini-infos'}):
+        return scrape_mini_profile(soup, url)
+    return scrape_full_profile(soup, url)
+
+
+def _mini_profile_detail(details_node, prefix):
+    """Return the stripped text after *prefix* in the matching ``<p>``, or ``None``.
+
+    ``None`` is returned when no paragraph carries the prefix, or when the
+    value is empty or the literal ``'unknown'``.
+    """
+    # str(t) guards against paragraphs whose .string is None.
+    node = details_node.find('p', text=lambda t: prefix in str(t))
+    if not node:
+        return None
+    value = node.text.split(prefix)[1].strip()
+    if not value or value == 'unknown':
+        return None
+    return value
+
+
+def scrape_mini_profile(soup, url):
+    """Build a performer dict from a "mini" profile page.
+
+    Args:
+        soup: Parsed page (BeautifulSoup document).
+        url: Address the page was loaded from; stored under ``'url'`` and used
+            to derive the gender.
+
+    Returns:
+        A dict that always has ``'url'`` and may have ``'gender'``, ``'name'``,
+        ``'aliases'``, ``'birthdate'`` (``YYYY-MM-DD``), ``'country'``,
+        ``'ethnicity'``, ``'measurements'``, ``'height'`` and ``'image'``;
+        ``None`` when the page states the performer is not in the database.
+    """
+    performer = {'url': url}
+    if gender := get_gender(url):
+        performer['gender'] = gender
+
+    if soup.find(text=lambda t: 'pornstar is not yet in our database' in t):
+        print('Performer not in database', file=sys.stderr)
+        return None
+
+    if profile := soup.find('div', {'id': 'casting-profil-mini-infos'}):
+        if alphabet_name := profile.find('meta', {'itemprop': 'name'}):
+            if name := alphabet_name.get('content'):
+                performer['name'] = process_name(name)
+        if additional_name := profile.find('meta', {'itemprop': 'additionalName'}):
+            if japanese_name := additional_name.get('content'):
+                performer['aliases'] = japanese_name
+
+    if details_node := soup.find('div', {'id': 'casting-profil-mini-infos-details'}):
+        if birthdate_full := _mini_profile_detail(details_node, 'birthdate: '):
+            try:
+                birthdate = datetime.strptime(birthdate_full, '%B %d, %Y')
+            except ValueError:
+                # An unexpected date format should not abort the whole scrape.
+                print(f'Unrecognized birthdate format: {birthdate_full}', file=sys.stderr)
+            else:
+                performer['birthdate'] = birthdate.strftime('%Y-%m-%d')
+        if birthplace_full := _mini_profile_detail(details_node, 'birthplace: '):
+            # Only the part before the first ', ' is kept as the country.
+            birthplace = birthplace_full.split(', ')[0]
+            if birthplace != 'unknown':
+                performer['country'] = birthplace
+            if birthplace == 'Japan':
+                performer['ethnicity'] = 'asian'
+        if measurements := _mini_profile_detail(details_node, 'measurements: '):
+            performer['measurements'] = measurements
+        if height_full := _mini_profile_detail(details_node, 'height: '):
+            # Keep only the first whitespace-separated token of the height text.
+            height = height_full.split()[0]
+            if height != 'unknown':
+                performer['height'] = height
+
+    if image_node := soup.find('div', {'id': 'casting-profil-preview'}):
+        # The preview container may lack an <img>; guard it like the full-profile parser does.
+        if img := image_node.find('img', {'itemprop': 'image'}):
+            image_url = img.get('src')
+            # Paths under /WAPdB-img/par-defaut/ are skipped (presumably placeholder images).
+            if image_url and '/WAPdB-img/par-defaut/' not in image_url:
+                performer['image'] = f'http://warashi-asian-pornstars.fr{image_url}'
+    return performer
+
+
+def scrape_full_profile(soup, url):
+    """Build a performer dict from a full profile page.
+
+    Args:
+        soup: Parsed page (BeautifulSoup document).
+        url: Address the page was loaded from; stored under ``'url'`` and used
+            as a first guess for the gender.
+
+    Returns:
+        A dict that always has ``'url'`` and ``'aliases'`` (a ``', '``-joined
+        string, possibly empty) and may have ``'name'``, ``'gender'``,
+        ``'twitter'``, ``'birthdate'``, ``'height'``, ``'weight'``,
+        ``'measurements'``, ``'career_length'``, ``'image'``, ``'country'``,
+        ``'ethnicity'``, ``'fake_tits'``, ``'tattoos'`` and ``'piercings'``.
+    """
+    performer = {}
+    measurements_prefix = 'measurements: '
+    activity_prefix = 'porn/AV activity: '
+
+    alphabet_name = None
+    if name_node := soup.find('span', {'itemprop': 'name'}):
+        alphabet_name = name_node.text.strip()
+    japanese_name = None
+    if additional_name := soup.find('span', {'itemprop': 'additionalName'}):
+        japanese_name = additional_name.text.strip()
+
+    if alphabet_name:
+        performer['name'] = process_name(alphabet_name)
+    else:
+        # Without this guard a page lacking the name span would crash in process_name(None).
+        print('Performer name not found on page', file=sys.stderr)
+    performer['url'] = url
+
+    if gender := get_gender(url):
+        performer['gender'] = gender
+    # The page's own og:gender meta tag, when present, overrides the URL-based guess.
+    if gender_node := soup.find('meta', {'property': 'og:gender'}):
+        if og_gender := gender_node.get('content'):
+            performer['gender'] = og_gender
+    if twitter_node := soup.find(text='official Twitter'):
+        if twitter_url := twitter_node.parent.get('href'):
+            performer['twitter'] = twitter_url
+    if birthday_node := soup.find('time', {'itemprop': 'birthDate'}):
+        if birthdate := birthday_node.get('content'):
+            performer['birthdate'] = birthdate
+    if height_node := soup.find('p', {'itemprop': 'height'}):
+        if height_value_node := height_node.find('span', {'itemprop': 'value'}):
+            performer['height'] = height_value_node.text
+    if weight_node := soup.find('p', {'itemprop': 'weight'}):
+        if weight_value_node := weight_node.find('span', {'itemprop': 'value'}):
+            # Lower-case key, consistent with every other field in this dict.
+            performer['weight'] = weight_value_node.text
+    if measurements_node := soup.find(text=lambda t: measurements_prefix in str(t)):
+        measurements = measurements_node.text.split(measurements_prefix)[1].strip()
+        if measurements and measurements != 'unknown':
+            performer['measurements'] = measurements
+    if activity_node := soup.find(text=lambda t: activity_prefix in str(t)):
+        performer['career_length'] = activity_node.text.split(activity_prefix)[1].strip()
+
+    if image_container_node := soup.find('div', {'id': 'pornostar-profil-photos-0'}):
+        if image_node := image_container_node.find('img', {'itemprop': 'image'}):
+            image_url = image_node.get('src')
+            # Paths under /WAPdB-img/par-defaut/ are skipped (presumably placeholder images).
+            if image_url and '/WAPdB-img/par-defaut/' not in image_url:
+                performer['image'] = f'http://warashi-asian-pornstars.fr{image_url}'
+
+    # The second addressCountry span is the one used as the performer's country.
+    if len(country_nodes := soup.find_all('span', {'itemprop': 'addressCountry'})) > 1:
+        country = country_nodes[1].text
+        performer['country'] = country
+        if country == 'Japan':
+            performer['ethnicity'] = 'asian'
+
+    aliases = []
+    if japanese_name:
+        aliases.append(japanese_name)
+    if alias_node := soup.find('div', {'id': 'pornostar-profil-noms-alternatifs'}):
+        # Compare against the processed name as well, since aliases are processed too.
+        own_names = {performer.get('name'), alphabet_name, japanese_name}
+        for couple in alias_node.find_all('li'):
+            alias = process_name(couple.text.strip())
+            if not alias or alias in own_names:
+                continue
+            if alias not in aliases:
+                aliases.append(alias)
+    # The list is already duplicate-free; joining it directly keeps a stable order.
+    performer['aliases'] = ', '.join(aliases)
+
+    if tags_node := soup.find('p', {'class': 'implode-tags'}):
+        for tag in tags_node.find_all('a'):
+            if tag.text == 'breast augmentation':
+                performer['fake_tits'] = 'Y'
+            # NOTE(review): the 'tatoos' spelling presumably mirrors the site's tag text - confirm before changing it.
+            if tag.text == 'tatoos':
+                performer['tattoos'] = 'Y'
+            if tag.text == 'piercings':
+                performer['piercings'] = 'Y'
+
+    if physical_characteristics := soup.find('p', text=lambda t: 'distinctive physical characteristics' in str(t)):
+        dpc = physical_characteristics.text
+        if 'breast augmentation' in dpc:
+            performer['fake_tits'] = 'Y'
+        if 'tattoo(s)' in dpc:
+            performer['tattoos'] = 'Y'
+        if 'piercing(s)' in dpc:
+            performer['piercings'] = 'Y'
+
+    return performer
+
+
+def search_performer(frag, timeout=30):
+    """Search the site for performers matching ``frag['name']``.
+
+    Args:
+        frag: Input fragment; its ``'name'`` value is sent as the search term.
+        timeout: Seconds to wait for the server before giving up.
+
+    Returns:
+        A list of ``{'name': ..., 'url': ...}`` dicts without duplicate URLs,
+        the exact match (if any) first. The list is empty when no name is
+        given or the request fails (the reason is written to stderr).
+    """
+    base_site = 'http://warashi-asian-pornstars.fr'
+    performers = []
+
+    name = frag.get('name')
+    if not name:
+        print('No performer name supplied for the search', file=sys.stderr)
+        return performers
+
+    data = {
+        'recherche_critere': 'f',
+        'recherche_valeur': name,
+        'x': '20',
+        'y': '17'
+    }
+    try:
+        resp = requests.post(f'{base_site}/en/s-12/search', data=data, timeout=timeout)
+    except requests.RequestException as err:
+        print(f'Search request failed: {err}', file=sys.stderr)
+        return performers
+    if not resp.ok:
+        print(f'Search request failed with HTTP status {resp.status_code}', file=sys.stderr)
+        return performers
+
+    soup = BeautifulSoup(resp.text, 'html.parser')
+    entries = []
+    if exact_match := soup.find('div', {'class': 'correspondance_exacte'}):  # process exact matches first
+        entries.append(exact_match)
+    entries.extend(soup.find_all('div', {'class': 'resultat-pornostar'}))
+
+    already_seen = set()
+    for entry in entries:
+        if match_span := entry.find('span', {'class': 'correspondance-lien'}):
+            # The highlighted span sits inside the link to the performer page.
+            link = match_span.parent
+        elif len(links := entry.find_all('a')) > 1:
+            # Otherwise the second anchor of the entry is used as the performer link.
+            link = links[1]
+        else:
+            continue
+        href = link.get('href')
+        if not href:
+            continue
+        url = f'{base_site}{href}'
+        if url in already_seen:
+            continue
+        already_seen.add(url)
+        performers.append({'name': process_name(link.text.strip()), 'url': url})
+    return performers
+
+
+def main():
+    """Run the scraper action named by the last command-line argument.
+
+    A JSON fragment is read from stdin. ``performerByName`` prints the JSON
+    list returned by ``search_performer``; ``performerByFragment`` and
+    ``performerByURL`` print the JSON result of ``scrape_performer`` for the
+    fragment's ``'url'``. A missing URL or an unknown action is reported on
+    stderr and ends the process with exit status 1.
+    """
+    check_compat()
+    # workaround for cp1252
+    sys.stdin = io.TextIOWrapper(sys.stdin.detach(), encoding='utf-8')
+    frag = json.loads(sys.stdin.read())
+    arg = sys.argv[-1]
+    if arg == 'performerByName':
+        result = search_performer(frag)
+    elif arg in ('performerByFragment', 'performerByURL'):
+        url = frag.get('url')
+        if not url:
+            # A fragment without a URL gives nothing to scrape; say so instead of raising KeyError.
+            print('No performer URL supplied in the input fragment', file=sys.stderr)
+            sys.exit(1)
+        result = scrape_performer(url)
+    else:
+        print(f'Unknown scraper action: {arg}', file=sys.stderr)
+        sys.exit(1)
+    print(json.dumps(result))
+
+
+# Run the scraper only when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()
--- a/stash/config/scrapers/community/WAPdB/WAPdB.yml
+++ b/stash/config/scrapers/community/WAPdB/WAPdB.yml
@@ -0,0 +1,25 @@
+name: "WAPdB"
+performerByName:
+    action: script
+    script:
+      - python
+      - WAPdB.py
+      - performerByName
+performerByFragment:
+    action: script
+    script:
+      - python
+      - WAPdB.py
+      - performerByFragment
+performerByURL:
+    - action: script
+      url:
+        - http://warashi-asian-pornstars.fr/en/
+        - http://warashi-asian-pornstars.fr/fr/
+        - http://warashi-asian-pornstars.fr/ja/
+      script:
+        - python
+        - WAPdB.py
+        - performerByURL
+
+# Last Updated January 11, 2022
--- a/stash/config/scrapers/community/WAPdB/manifest
+++ b/stash/config/scrapers/community/WAPdB/manifest
@@ -0,0 +1,10 @@
+id: WAPdB
+name: WAPdB
+metadata: {}
+version: fbd81c5
+date: "2023-11-22 00:31:17"
+requires: []
+source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
+files:
+- WAPdB.py
+- WAPdB.yml