This commit is contained in:
Christoph Califice
2025-10-09 20:05:31 -03:00
parent ed22ef22bc
commit 0a5f88d75a
1442 changed files with 101562 additions and 0 deletions

View File

@@ -0,0 +1,245 @@
import json
import io
import sys
from datetime import datetime
# Third-party dependencies are imported defensively so that a missing package
# produces an actionable message on stderr instead of a raw traceback.
try:
    import requests
except ModuleNotFoundError:
    print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
    # Exit non-zero: a bare sys.exit() would report success (status 0) to the
    # calling process even though the scraper cannot run.
    sys.exit(1)

try:
    from bs4 import BeautifulSoup  # requires v4.10.0 and above
except ModuleNotFoundError:
    print("You need to install the BeautifulSoup module (v4.10.0+). (https://pypi.org/project/beautifulsoup4/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install beautifulsoup4", file=sys.stderr)
    # Same reasoning as above: signal the failure through the exit status.
    sys.exit(1)
def check_compat():
    """Exit with status 1 unless the installed BeautifulSoup is 4.10.0 or newer.

    Only the major and minor components of ``bs4.__version__`` are inspected,
    so version strings with fewer or more than three dot-separated parts
    (for example ``"4.12"`` or ``"4.12.3.1"``) no longer raise ``ValueError``
    while being unpacked.

    Returns:
        None when the installed version is new enough.

    Raises:
        SystemExit: with code 1 when the installed version is too old.
    """
    from bs4 import __version__ as ver

    parts = ver.split('.')
    major = int(parts[0])
    # A bare major version ("5") is treated as minor 0.
    minor = int(parts[1]) if len(parts) > 1 else 0
    # Tuple comparison: (4, 10) and up passes, as does any later major version.
    if (major, minor) >= (4, 10):
        return
    print(f'This scraper requires BeautifulSoup 4.10.0 and above. Your version: {ver}', file=sys.stderr)
    sys.exit(1)
def process_name(name):
    """Spell out circumflexed vowels and return the name in title case.

    ``ô``/``Ô`` become ``ou``, ``û``/``Û`` become ``uu`` and ``î``/``Î``
    become ``ii``; the result is then passed through ``str.title``.
    """
    # One translation table handles every substitution in a single pass.
    long_vowels = str.maketrans({
        'Ô': 'ou',
        'ô': 'ou',
        'û': 'uu',
        'Û': 'uu',
        'î': 'ii',
        'Î': 'ii',
    })
    romanised = name.translate(long_vowels)
    return romanised.title()
def get_gender(url):
    """Return 'female' or 'male' based on a marker in *url*, else None."""
    # Order matters: 'male-pornstar' is a substring of 'female-pornstar',
    # so the female marker has to be tested first.
    markers = (
        ('female-pornstar', 'female'),
        ('male-pornstar', 'male'),
    )
    for marker, gender in markers:
        if marker in url:
            return gender
    return None
def scrape_performer(url):
    """Download a performer page and scrape it with the matching parser.

    Pages containing a ``casting-profil-mini-infos`` div are handled by
    ``scrape_mini_profile``; every other page goes to ``scrape_full_profile``.

    Args:
        url: Address of the performer page.

    Returns:
        The dict produced by the selected parser, or None when the server
        answers with a non-OK status (or the parser itself returns None).
    """
    # Without a timeout a stalled connection would block the scraper forever.
    resp = requests.get(url, timeout=30)
    if not resp.ok:
        print(f'Request for {url} failed with HTTP status {resp.status_code}', file=sys.stderr)
        return None

    soup = BeautifulSoup(resp.text, 'html.parser')
    if soup.find('div', {'id': 'casting-profil-mini-infos'}):
        return scrape_mini_profile(soup, url)
    return scrape_full_profile(soup, url)
def scrape_mini_profile(soup, url):
    """Build a performer dict from a "mini" profile page.

    Args:
        soup: Parsed page (a BeautifulSoup document).
        url: Page URL; stored under ``'url'`` and passed to ``get_gender``.

    Returns:
        A dict that always contains ``'url'`` plus whichever of ``'gender'``,
        ``'name'``, ``'aliases'``, ``'birthdate'`` (``YYYY-MM-DD``),
        ``'country'``, ``'ethnicity'``, ``'measurements'``, ``'height'`` and
        ``'image'`` could be extracted. None is returned when the page states
        that the performer is not in the database.
    """
    birthdate_prefix = 'birthdate: '
    birthplace_prefix = 'birthplace: '
    measurements_prefix = 'measurements: '
    height_prefix = 'height: '

    performer = {'url': url}
    if gender := get_gender(url):
        performer['gender'] = gender

    if soup.find(text=lambda t: 'pornstar is not yet in our database' in t):
        print('Performer not in database', file=sys.stderr)
        return None

    if profile := soup.find('div', {'id': 'casting-profil-mini-infos'}):
        if alphabet_name := profile.find('meta', {'itemprop': 'name'}):
            performer['name'] = process_name(alphabet_name.attrs['content'])
        if additional_name := profile.find('meta', {'itemprop': 'additionalName'}):
            performer['aliases'] = additional_name.attrs['content']

    if details_node := soup.find('div', {'id': 'casting-profil-mini-infos-details'}):

        def detail(prefix):
            """Return the text following *prefix* in its <p>, or None if absent."""
            # str(t) because a tag's string is None when it has several children.
            node = details_node.find('p', text=lambda t: prefix in str(t))
            if node is None:
                return None
            return node.text.split(prefix)[1]

        birthdate_full = detail(birthdate_prefix)
        if birthdate_full not in (None, 'unknown'):
            try:
                parsed = datetime.strptime(birthdate_full, '%B %d, %Y')
            except ValueError:
                # An unexpected date format should not discard the rest of
                # the profile; report it and carry on without a birthdate.
                print(f'Unrecognised birthdate format: {birthdate_full!r}', file=sys.stderr)
            else:
                performer['birthdate'] = parsed.strftime('%Y-%m-%d')

        birthplace_full = detail(birthplace_prefix)
        if birthplace_full is not None:
            # Keep only the part before the first ', '; when there is no
            # separator, split() hands back the whole string unchanged.
            birthplace = birthplace_full.split(', ')[0]
            if birthplace != 'unknown':
                performer['country'] = birthplace
                if birthplace == 'Japan':
                    performer['ethnicity'] = 'asian'

        measurements = detail(measurements_prefix)
        if measurements not in (None, 'unknown'):
            performer['measurements'] = measurements

        height_full = detail(height_prefix)
        if height_full is not None:
            # Only the first whitespace-separated token is kept; an empty
            # value is skipped instead of raising IndexError.
            height_words = height_full.split()
            if height_words and height_words[0] != 'unknown':
                performer['height'] = height_words[0]

    if image_container := soup.find('div', {'id': 'casting-profil-preview'}):
        # Guard against a preview block without an <img itemprop="image">,
        # mirroring the check done in scrape_full_profile.
        if image_node := image_container.find('img', {'itemprop': 'image'}):
            image_url = image_node.attrs['src']
            # Paths under /WAPdB-img/par-defaut/ are deliberately not reported.
            if '/WAPdB-img/par-defaut/' not in image_url:
                performer['image'] = f'http://warashi-asian-pornstars.fr{image_url}'

    return performer
def scrape_full_profile(soup, url):
    """Build a performer dict from a full profile page.

    Args:
        soup: Parsed page (a BeautifulSoup document).
        url: Page URL; stored under ``'url'`` and passed to ``get_gender``.

    Returns:
        A dict that always contains ``'url'`` and ``'aliases'`` (a
        ``', '``-joined string, possibly empty, in discovery order). The keys
        ``'name'``, ``'gender'``, ``'twitter'``, ``'birthdate'``, ``'height'``,
        ``'Weight'``, ``'measurements'``, ``'career_length'``, ``'image'``,
        ``'country'``, ``'ethnicity'``, ``'fake_tits'``, ``'tattoos'`` and
        ``'piercings'`` are added only when the page provides them.
    """
    measurements_prefix = 'measurements: '
    activity_prefix = 'porn/AV activity: '
    performer = {}

    # Keep both the raw and the processed name: the processed one is what is
    # reported, and both are used below to keep the name out of the aliases.
    raw_name = None
    name = None
    if name_node := soup.find('span', {'itemprop': 'name'}):
        raw_name = name_node.text
        name = process_name(raw_name)
        # Only set when found; process_name(None) would raise AttributeError.
        performer['name'] = name

    japanese_name = None
    if additional_name := soup.find('span', {'itemprop': 'additionalName'}):
        japanese_name = additional_name.text

    performer['url'] = url

    if gender := get_gender(url):
        performer['gender'] = gender
    # An explicit og:gender meta tag overrides the URL-derived guess.
    if gender_node := soup.find('meta', {'property': 'og:gender'}):
        performer['gender'] = gender_node.attrs['content']

    if twitter_node := soup.find(text='official Twitter'):
        performer['twitter'] = twitter_node.parent.attrs['href']

    if birthday_node := soup.find('time', {'itemprop': 'birthDate'}):
        performer['birthdate'] = birthday_node.attrs['content']

    if height_node := soup.find('p', {'itemprop': 'height'}):
        if height_value_node := height_node.find('span', {'itemprop': 'value'}):
            performer['height'] = height_value_node.text

    if weight_node := soup.find('p', {'itemprop': 'weight'}):
        if weight_value_node := weight_node.find('span', {'itemprop': 'value'}):
            # NOTE(review): this key is capitalised unlike every other key in
            # the dict; left unchanged because consumers may rely on it.
            performer['Weight'] = weight_value_node.text

    if measurements_node := soup.find(text=lambda t: measurements_prefix in str(t)):
        measurements = measurements_node.text.split(measurements_prefix)[1]
        if measurements != 'unknown':
            performer['measurements'] = measurements

    if activity_node := soup.find(text=lambda t: activity_prefix in str(t)):
        performer['career_length'] = activity_node.text.split(activity_prefix)[1].strip()

    if image_container_node := soup.find('div', {'id': 'pornostar-profil-photos-0'}):
        if image_node := image_container_node.find('img', {'itemprop': 'image'}):
            image_url = image_node.attrs['src']
            # Paths under /WAPdB-img/par-defaut/ are deliberately not reported.
            if '/WAPdB-img/par-defaut/' not in image_url:
                performer['image'] = f'http://warashi-asian-pornstars.fr{image_url}'

    # The second addressCountry span is the one used as the country.
    country_nodes = soup.find_all('span', {'itemprop': 'addressCountry'})
    if len(country_nodes) > 1:
        country = country_nodes[1].text
        performer['country'] = country
        if country == 'Japan':
            performer['ethnicity'] = 'asian'

    aliases = []
    if japanese_name:
        aliases.append(japanese_name)
    if alias_node := soup.find('div', {'id': 'pornostar-profil-noms-alternatifs'}):
        for item in alias_node.find_all('li'):
            alias = process_name(item.text)
            # Skip the performer's own name. The alias has been through
            # process_name, so it must be compared with the processed name too;
            # comparing with the raw text alone lets the name slip through.
            if alias in (raw_name, name):
                continue
            if alias not in aliases:
                aliases.append(alias)
    # The list is already de-duplicated; joining it directly keeps a stable
    # order (a set would shuffle the aliases from run to run).
    performer['aliases'] = ', '.join(aliases)

    if tags_node := soup.find('p', {'class': 'implode-tags'}):
        for tag in tags_node.find_all('a'):
            if tag.text == 'breast augmentation':
                performer['fake_tits'] = 'Y'
            if tag.text == 'tatoos':
                performer['tattoos'] = 'Y'
            if tag.text == 'piercings':
                performer['piercings'] = 'Y'

    if physical_characteristics := soup.find('p', text=lambda t: 'distinctive physical characteristics' in str(t)):
        dpc = physical_characteristics.text
        if 'breast augmentation' in dpc:
            performer['fake_tits'] = 'Y'
        if 'tattoo(s)' in dpc:
            performer['tattoos'] = 'Y'
        if 'piercing(s)' in dpc:
            performer['piercings'] = 'Y'

    return performer
def search_performer(frag):
    """Search the site for performers matching ``frag['name']``.

    Args:
        frag: Mapping with a ``'name'`` entry holding the search text.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, exact match first,
        with each URL appearing once. The list is empty when the request
        fails or nothing is found.
    """
    base_site = 'http://warashi-asian-pornstars.fr'
    data = {
        'recherche_critere': 'f',
        'recherche_valeur': frag['name'],
        'x': '20',
        'y': '17'
    }
    performers = []

    # Without a timeout a stalled connection would block the scraper forever.
    resp = requests.post(f'{base_site}/en/s-12/search', data=data, timeout=30)
    if not resp.ok:
        print(f'Search request failed with HTTP status {resp.status_code}', file=sys.stderr)
        return performers

    soup = BeautifulSoup(resp.text, 'html.parser')
    entries = []
    if exact_match := soup.find('div', {'class': 'correspondance_exacte'}):  # process exact matches first
        entries.append(exact_match)
    entries.extend(soup.find_all('div', {'class': 'resultat-pornostar'}))

    seen_urls = set()
    for entry in entries:
        # The link is either the parent of the 'correspondance-lien' span or,
        # failing that, the second anchor inside the entry.
        if marker := entry.find('span', {'class': 'correspondance-lien'}):
            link = marker.parent
        elif len(anchors := entry.find_all('a')) > 1:
            link = anchors[1]
        else:
            continue

        name = process_name(link.text.strip())
        url = f'{base_site}{link.attrs["href"]}'
        if url in seen_urls:
            continue
        seen_urls.add(url)
        performers.append({'name': name, 'url': url})

    return performers
def main():
    """Read a JSON fragment from stdin and print the result for the chosen mode.

    The mode is the last command-line argument: ``performerByName`` runs a
    name search, while ``performerByFragment`` and ``performerByURL`` scrape
    the page at ``frag['url']``. Any other value prints nothing.
    """
    check_compat()

    # workaround for cp1252
    sys.stdin = io.TextIOWrapper(sys.stdin.detach(), encoding='utf-8')
    frag = json.loads(sys.stdin.read())

    mode = sys.argv[-1]
    if mode == 'performerByName':
        print(json.dumps(search_performer(frag)))
    elif mode in ['performerByFragment', 'performerByURL']:
        print(json.dumps(scrape_performer(frag['url'])))


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,25 @@
# Scraper definition: every action below launches WAPdB.py through python,
# passing the action name as the final argument (the script reads its mode
# from the last command-line argument).
name: "WAPdB"
# Name search: runs the script in 'performerByName' mode.
performerByName:
action: script
script:
- python
- WAPdB.py
- performerByName
# Fragment scrape: runs the script in 'performerByFragment' mode.
performerByFragment:
action: script
script:
- python
- WAPdB.py
- performerByFragment
# URL scrape: runs the script in 'performerByURL' mode.
# NOTE(review): the url entries are presumably prefixes the host application
# matches against a performer URL before running this action — confirm.
performerByURL:
- action: script
url:
- http://warashi-asian-pornstars.fr/en/
- http://warashi-asian-pornstars.fr/fr/
- http://warashi-asian-pornstars.fr/ja/
script:
- python
- WAPdB.py
- performerByURL
# Last Updated January 11, 2022

View File

@@ -0,0 +1,10 @@
# Package manifest for the WAPdB scraper.
id: WAPdB
name: WAPdB
metadata: {}
# NOTE(review): version looks like a short commit hash and date like its
# timestamp — confirm against the index named in source_repository.
version: fbd81c5
date: "2023-11-22 00:31:17"
requires: []
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
# Files belonging to this package: the scraper script and its definition.
files:
- WAPdB.py
- WAPdB.yml