# Stash performer scraper for warashi-asian-pornstars.fr
# Reads a JSON fragment on stdin, prints scraped performer JSON on stdout.
import json
import io
import sys

from datetime import datetime

# Third-party modules are imported inside try/except so that a missing
# dependency produces actionable install instructions instead of a traceback.
try:
    import requests
except ModuleNotFoundError:
    print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
    sys.exit()

try:
    from bs4 import BeautifulSoup  # requires v4.10.0 and above
except ModuleNotFoundError:
    print("You need to install the BeautifulSoup module (v4.10.0+). (https://pypi.org/project/beautifulsoup4/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install beautifulsoup4", file=sys.stderr)
    sys.exit()
|
|
|
|
|
|
def check_compat():
    """Exit with an error message unless BeautifulSoup >= 4.10.0 is installed."""
    from bs4 import __version__ as ver
    # Version strings are not always three components ("4.13", "4.12.2.2"
    # both exist); unpacking into exactly three names would raise ValueError.
    # Only the first two components matter for the 4.10 check.
    parts = ver.split('.')
    major, minor = int(parts[0]), int(parts[1])
    if major > 4 or (major == 4 and minor >= 10):
        return
    print(f'This scraper requires BeautifulSoup 4.10.0 and above. Your version: {ver}', file=sys.stderr)
    sys.exit(1)
|
|
|
|
|
|
def process_name(name):
    """Romanize circumflexed long-vowel characters and title-case the name.

    The site writes Japanese long vowels with circumflexes (e.g. 'yûko');
    Stash convention is doubled vowels ('Yuuko').
    """
    romaji_table = str.maketrans({
        'Ô': 'ou', 'ô': 'ou',
        'Û': 'uu', 'û': 'uu',
        'Î': 'ii', 'î': 'ii',
    })
    return name.translate(romaji_table).title()
|
|
|
|
|
|
def get_gender(url):
    """Infer performer gender from the profile URL; None if no marker found.

    'female' must be tested first: 'male-pornstar' is a substring of
    'female-pornstar', so the reverse order would misclassify.
    """
    for gender in ('female', 'male'):
        if f'{gender}-pornstar' in url:
            return gender
    return None
|
|
|
|
|
|
def scrape_performer(url):
    """Fetch a performer page and dispatch to the matching profile parser.

    Returns a performer dict, or None when the HTTP request fails.
    """
    # timeout prevents the scraper from hanging forever on a dead server;
    # the original call had no timeout at all.
    resp = requests.get(url, timeout=30)
    if not resp.ok:
        return None
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Performers with sparse data get a reduced "mini" profile layout.
    if soup.find('div', {'id': 'casting-profil-mini-infos'}):
        return scrape_mini_profile(soup, url)
    return scrape_full_profile(soup, url)
|
|
|
|
|
|
def scrape_mini_profile(soup, url):
    """Parse the reduced "mini" profile page layout into a performer dict.

    soup: BeautifulSoup of the performer page; url: the page URL.
    Returns the performer dict, or None when the site reports the performer
    is not in its database.
    """
    performer = {}
    # Labels that prefix each value inside the details <p> elements.
    birthdate_prefix = 'birthdate: '
    birthplace_prefix = 'birthplace: '
    measurements_prefix = 'measurements: '
    height_prefix = 'height: '

    performer['url'] = url
    if gender := get_gender(url):
        performer['gender'] = gender

    # The site serves a placeholder page for unknown performers.
    if soup.find(text=lambda t: 'pornstar is not yet in our database' in t):
        print('Performer not in database', file=sys.stderr)
        return

    if profile := soup.find('div', {'id': 'casting-profil-mini-infos'}):
        if alphabet_name := profile.find('meta', {'itemprop': 'name'}):
            name = alphabet_name.attrs['content']
            performer['name'] = process_name(name)
        if additional_name := profile.find('meta', {'itemprop': 'additionalName'}):
            # additionalName holds the Japanese spelling; exposed as an alias.
            japanese_name = additional_name.attrs['content']
            performer['aliases'] = japanese_name

    if details_node := soup.find('div', {'id': 'casting-profil-mini-infos-details'}):
        if birthdate_node := details_node.find('p', text=lambda t: birthdate_prefix in str(t)):
            birthdate_full = birthdate_node.text.split(birthdate_prefix)[1]
            if birthdate_full != 'unknown':
                # Site format e.g. 'January 2, 1990' -> ISO 'YYYY-MM-DD'.
                performer['birthdate'] = datetime.strptime(birthdate_full, '%B %d, %Y').strftime('%Y-%m-%d')
        if birthplace_node := details_node.find('p', text=lambda t: birthplace_prefix in str(t)):
            birthplace_full = birthplace_node.text.split(birthplace_prefix)[1]
            # Birthplace may be 'Region, Country'; keep only the first part.
            if ', ' in birthplace_full:
                birthplace = birthplace_full.split(', ')[0]
            else:
                birthplace = birthplace_full
            if birthplace != 'unknown':
                performer['country'] = birthplace
                if birthplace == 'Japan':
                    performer['ethnicity'] = 'asian'
        if measurements_node := details_node.find('p', text=lambda t: measurements_prefix in str(t)):
            measurements = measurements_node.text.split(measurements_prefix)[1]
            if measurements != 'unknown':
                performer['measurements'] = measurements
        if height_node := details_node.find('p', text=lambda t: height_prefix in str(t)):
            # Value presumably looks like '160 cm'; keep the leading token.
            height = height_node.text.split(height_prefix)[1].split()[0]
            if height != 'unknown':
                performer['height'] = height

    if image_node := soup.find('div', {'id': 'casting-profil-preview'}):
        image_url = image_node.find('img', {'itemprop': 'image'}).attrs['src']
        # Skip the site's default placeholder image.
        if '/WAPdB-img/par-defaut/' not in image_url:
            performer['image'] = f'http://warashi-asian-pornstars.fr{image_url}'

    return performer
|
|
|
|
|
|
def scrape_full_profile(soup, url):
    """Parse the full performer profile page layout into a performer dict.

    soup: BeautifulSoup of the performer page; url: the page URL.
    Returns the performer dict (possibly sparse if nodes are missing).
    """
    performer = {}
    measurements_prefix = 'measurements: '
    activity_prefix = 'porn/AV activity: '

    alphabet_name = None
    if alphabet_name_node := soup.find('span', {'itemprop': 'name'}):
        alphabet_name = alphabet_name_node.text

    japanese_name = None
    if additional_name := soup.find('span', {'itemprop': 'additionalName'}):
        japanese_name = additional_name.text

    # Guard: the original called process_name(None) and crashed when the
    # name node was missing; now 'name' is simply omitted.
    if alphabet_name is not None:
        performer['name'] = process_name(alphabet_name)
    performer['url'] = url
    if gender := get_gender(url):
        performer['gender'] = gender
    # og:gender meta, when present, overrides the URL-derived gender.
    if gender_node := soup.find('meta', {'property': 'og:gender'}):
        performer['gender'] = gender_node.attrs['content']
    if twitter_node := soup.find(text='official Twitter'):
        performer['twitter'] = twitter_node.parent.attrs['href']
    if birthday_node := soup.find('time', {'itemprop': 'birthDate'}):
        performer['birthdate'] = birthday_node.attrs['content']
    if height_node := soup.find('p', {'itemprop': 'height'}):
        if height_value_node := height_node.find('span', {'itemprop': 'value'}):
            performer['height'] = height_value_node.text
    if weight_node := soup.find('p', {'itemprop': 'weight'}):
        if weight_value_node := weight_node.find('span', {'itemprop': 'value'}):
            # Fixed: key was 'Weight'; every other fragment key is lowercase.
            performer['weight'] = weight_value_node.text
    if measurements_node := soup.find(text=lambda t: measurements_prefix in str(t)):
        measurements = measurements_node.text.split(measurements_prefix)[1]
        if measurements != 'unknown':
            # Reuse the value computed above instead of re-splitting.
            performer['measurements'] = measurements
    if activity_node := soup.find(text=lambda t: activity_prefix in str(t)):
        performer['career_length'] = activity_node.text.split(activity_prefix)[1].strip()

    if image_container_node := soup.find('div', {'id': 'pornostar-profil-photos-0'}):
        if image_node := image_container_node.find('img', {'itemprop': 'image'}):
            image_url = image_node.attrs['src']
            # Skip the site's default placeholder image.
            if '/WAPdB-img/par-defaut/' not in image_url:
                performer['image'] = f'http://warashi-asian-pornstars.fr{image_url}'

    # The second addressCountry node carries the performer's country
    # (the first presumably belongs to another element on the page).
    if len(country_nodes := soup.find_all('span', {'itemprop': 'addressCountry'})) > 1:
        country = country_nodes[1].text
        performer['country'] = country
        if country == 'Japan':
            performer['ethnicity'] = 'asian'

    aliases = []
    if japanese_name:
        aliases.append(japanese_name)
    if alias_node := soup.find('div', {'id': 'pornostar-profil-noms-alternatifs'}):
        for couple in alias_node.find_all('li'):
            alias = process_name(couple.text)
            # Skip the primary and Japanese names; they are not aliases.
            if alias == alphabet_name or alias == str(japanese_name):
                continue
            if alias not in aliases:
                aliases.append(alias)
    # The list is already de-duplicated above; joining it directly keeps a
    # deterministic order (the old ', '.join(set(...)) shuffled it).
    performer['aliases'] = ', '.join(aliases)

    if tags_node := soup.find('p', {'class': 'implode-tags'}):
        for tag in tags_node.find_all('a'):
            if tag.text == 'breast augmentation':
                performer['fake_tits'] = 'Y'
            if tag.text == 'tatoos':  # sic: the site spells the tag 'tatoos'
                performer['tattoos'] = 'Y'
            if tag.text == 'piercings':
                performer['piercings'] = 'Y'

    if physical_characteristics := soup.find('p', text=lambda t: 'distinctive physical characteristics' in str(t)):
        dpc = physical_characteristics.text
        if 'breast augmentation' in dpc:
            performer['fake_tits'] = 'Y'
        if 'tattoo(s)' in dpc:
            performer['tattoos'] = 'Y'
        if 'piercing(s)' in dpc:
            performer['piercings'] = 'Y'

    return performer
|
|
|
|
|
|
def search_performer(frag):
    """POST the site's search form for frag['name'] and return candidates.

    Returns a list of {'name': ..., 'url': ...} dicts, exact matches first,
    de-duplicated by URL. Empty list when the request fails.
    """
    data = {
        'recherche_critere': 'f',
        'recherche_valeur': frag['name'],
        # x/y mimic the click coordinates of the form's image submit button.
        'x': '20',
        'y': '17'
    }
    base_site = 'http://warashi-asian-pornstars.fr'
    performers = []
    # timeout prevents hanging forever on a dead server (original had none).
    resp = requests.post(f'{base_site}/en/s-12/search', data=data, timeout=30)
    if resp.ok:
        soup = BeautifulSoup(resp.text, 'html.parser')
        entries = []
        already_seen = set()  # URL de-dupe; set gives O(1) membership tests
        if exact_match := soup.find('div', {'class': 'correspondance_exacte'}):  # process exact matches first
            entries.append(exact_match)
        entries.extend(soup.find_all('div', {'class': 'resultat-pornostar'}))
        for entry in entries:
            p = {}
            if n := entry.find('span', {'class': 'correspondance-lien'}):
                name = n.parent.text.strip()
                p['name'] = process_name(name)
                p['url'] = f'{base_site}{n.parent.attrs["href"]}'
            elif len(n := entry.find_all('a')) > 1:
                # Regular result rows: the second <a> is the performer link.
                p['name'] = process_name(n[1].text.strip())
                p['url'] = f'{base_site}{n[1].attrs["href"]}'
            if p and p['url'] not in already_seen:
                performers.append(p)
                already_seen.add(p['url'])
    return performers
|
|
|
|
|
|
def main():
    """Entry point: read a JSON fragment from stdin, print scrape results.

    The scrape mode is the last command-line argument, as passed by Stash.
    """
    check_compat()
    # Re-wrap stdin as UTF-8: Windows consoles default to cp1252, which
    # cannot decode the Japanese names in the fragment.
    sys.stdin = io.TextIOWrapper(sys.stdin.detach(), encoding='utf-8')
    fragment = json.loads(sys.stdin.read())
    mode = sys.argv[-1]
    if mode == 'performerByName':
        print(json.dumps(search_performer(fragment)))
    if mode in ('performerByFragment', 'performerByURL'):
        print(json.dumps(scrape_performer(fragment['url'])))
|
|
|
|
|
|
# Run as a script (Stash invokes this file directly with the mode in argv).
if __name__ == '__main__':
    main()
|