stash
This commit is contained in:
245
stash/config/scrapers/community/WAPdB/WAPdB.py
Normal file
245
stash/config/scrapers/community/WAPdB/WAPdB.py
Normal file
@@ -0,0 +1,245 @@
|
||||
import json
|
||||
import io
|
||||
import sys
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
import requests
|
||||
except ModuleNotFoundError:
|
||||
print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
|
||||
print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
|
||||
sys.exit()
|
||||
|
||||
try:
|
||||
from bs4 import BeautifulSoup # requires v4.10.0 and above
|
||||
except ModuleNotFoundError:
|
||||
print("You need to install the BeautifulSoup module (v4.10.0+). (https://pypi.org/project/beautifulsoup4/)", file=sys.stderr)
|
||||
print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install beautifulsoup4", file=sys.stderr)
|
||||
sys.exit()
|
||||
|
||||
|
||||
def check_compat():
|
||||
from bs4 import __version__ as ver
|
||||
major, minor, _ = ver.split('.')
|
||||
if (int(major) == 4 and int(minor) >= 10) or (int(major) > 4):
|
||||
return
|
||||
print(f'This scraper requires BeautifulSoup 4.10.0 and above. Your version: {ver}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def process_name(name):
|
||||
name_map = {
|
||||
'Ô': 'ou',
|
||||
'ô': 'ou',
|
||||
'û': 'uu',
|
||||
'Û': 'uu',
|
||||
'î': 'ii',
|
||||
'Î': 'ii'
|
||||
}
|
||||
for k, v in name_map.items():
|
||||
name = name.replace(k, v)
|
||||
return name.title()
|
||||
|
||||
|
||||
def get_gender(url):
|
||||
if 'female-pornstar' in url:
|
||||
return 'female'
|
||||
if 'male-pornstar' in url:
|
||||
return 'male'
|
||||
|
||||
|
||||
def scrape_performer(url):
|
||||
resp = requests.get(url)
|
||||
if resp.ok:
|
||||
soup = BeautifulSoup(resp.text, 'html.parser')
|
||||
if soup.find('div', {'id': 'casting-profil-mini-infos'}):
|
||||
return scrape_mini_profile(soup, url)
|
||||
else:
|
||||
return scrape_full_profile(soup, url)
|
||||
|
||||
|
||||
def scrape_mini_profile(soup, url):
|
||||
performer = {}
|
||||
birthdate_prefix = 'birthdate: '
|
||||
birthplace_prefix = 'birthplace: '
|
||||
measurements_prefix = 'measurements: '
|
||||
height_prefix = 'height: '
|
||||
|
||||
performer['url'] = url
|
||||
if gender := get_gender(url):
|
||||
performer['gender'] = gender
|
||||
|
||||
if soup.find(text=lambda t: 'pornstar is not yet in our database' in t):
|
||||
print('Performer not in database', file=sys.stderr)
|
||||
return
|
||||
|
||||
if profile := soup.find('div', {'id': 'casting-profil-mini-infos'}):
|
||||
if alphabet_name := profile.find('meta', {'itemprop': 'name'}):
|
||||
name = alphabet_name.attrs['content']
|
||||
performer['name'] = process_name(name)
|
||||
if additional_name := profile.find('meta', {'itemprop': 'additionalName'}):
|
||||
japanese_name = additional_name.attrs['content']
|
||||
performer['aliases'] = japanese_name
|
||||
|
||||
if details_node := soup.find('div', {'id': 'casting-profil-mini-infos-details'}):
|
||||
if birthdate_node := details_node.find('p', text=lambda t: birthdate_prefix in str(t)):
|
||||
birthdate_full = birthdate_node.text.split(birthdate_prefix)[1]
|
||||
if birthdate_full != 'unknown':
|
||||
performer['birthdate'] = datetime.strptime(birthdate_full, '%B %d, %Y').strftime('%Y-%m-%d')
|
||||
if birthplace_node := details_node.find('p', text=lambda t: birthplace_prefix in str(t)):
|
||||
birthplace_full = birthplace_node.text.split(birthplace_prefix)[1]
|
||||
if ', ' in birthplace_full:
|
||||
birthplace = birthplace_full.split(', ')[0]
|
||||
else:
|
||||
birthplace = birthplace_full
|
||||
if birthplace != 'unknown':
|
||||
performer['country'] = birthplace
|
||||
if birthplace == 'Japan':
|
||||
performer['ethnicity'] = 'asian'
|
||||
if measurements_node := details_node.find('p', text=lambda t: measurements_prefix in str(t)):
|
||||
measurements = measurements_node.text.split(measurements_prefix)[1]
|
||||
if measurements != 'unknown':
|
||||
performer['measurements'] = measurements
|
||||
if height_node := details_node.find('p', text=lambda t: height_prefix in str(t)):
|
||||
height = height_node.text.split(height_prefix)[1].split()[0]
|
||||
if height != 'unknown':
|
||||
performer['height'] = height
|
||||
if image_node := soup.find('div', {'id': 'casting-profil-preview'}):
|
||||
image_url = image_node.find('img', {'itemprop': 'image'}).attrs['src']
|
||||
if '/WAPdB-img/par-defaut/' not in image_url:
|
||||
performer['image'] = f'http://warashi-asian-pornstars.fr{image_url}'
|
||||
return performer
|
||||
|
||||
|
||||
def scrape_full_profile(soup, url):
|
||||
performer = {}
|
||||
measurements_prefix = 'measurements: '
|
||||
activity_prefix = 'porn/AV activity: '
|
||||
|
||||
if alphabet_name := soup.find('span', {'itemprop': 'name'}):
|
||||
alphabet_name = alphabet_name.text
|
||||
|
||||
japanese_name = None
|
||||
if additional_name := soup.find('span', {'itemprop': 'additionalName'}):
|
||||
japanese_name = additional_name.text
|
||||
performer['name'] = process_name(alphabet_name)
|
||||
performer['url'] = url
|
||||
if gender := get_gender(url):
|
||||
performer['gender'] = gender
|
||||
if gender_node := soup.find('meta', {'property': 'og:gender'}):
|
||||
performer['gender'] = gender_node.attrs['content']
|
||||
if twitter_node := soup.find(text='official Twitter'):
|
||||
performer['twitter'] = twitter_node.parent.attrs['href']
|
||||
if birthday_node := soup.find('time', {'itemprop': 'birthDate'}):
|
||||
performer['birthdate'] = birthday_node.attrs['content']
|
||||
if height_node := soup.find('p', {'itemprop': 'height'}):
|
||||
if height_value_node := height_node.find('span', {'itemprop': 'value'}):
|
||||
performer['height'] = height_value_node.text
|
||||
if weight_node := soup.find('p', {'itemprop': 'weight'}):
|
||||
if weight_value_node := weight_node.find('span', {'itemprop': 'value'}):
|
||||
performer['Weight'] = weight_value_node.text
|
||||
if measurements_node := soup.find(text=lambda t: measurements_prefix in str(t)):
|
||||
measurements = measurements_node.text.split(measurements_prefix)[1]
|
||||
if measurements != 'unknown':
|
||||
performer['measurements'] = measurements_node.text.split(measurements_prefix)[1]
|
||||
if activity_node := soup.find(text=lambda t: activity_prefix in str(t)):
|
||||
performer['career_length'] = activity_node.text.split(activity_prefix)[1].strip()
|
||||
|
||||
if image_container_node := soup.find('div', {'id': 'pornostar-profil-photos-0'}):
|
||||
if image_node := image_container_node.find('img', {'itemprop': 'image'}):
|
||||
image_url = image_node.attrs['src']
|
||||
if '/WAPdB-img/par-defaut/' not in image_url:
|
||||
performer['image'] = f'http://warashi-asian-pornstars.fr{image_url}'
|
||||
|
||||
if len(country_nodes := soup.find_all('span', {'itemprop': 'addressCountry'})) > 1:
|
||||
country = country_nodes[1].text
|
||||
performer['country'] = country
|
||||
if country == 'Japan':
|
||||
performer['ethnicity'] = 'asian'
|
||||
|
||||
aliases = []
|
||||
if japanese_name:
|
||||
aliases.append(japanese_name)
|
||||
if alias_node := soup.find('div', {'id': 'pornostar-profil-noms-alternatifs'}):
|
||||
for couple in alias_node.find_all('li'):
|
||||
alias = process_name(couple.text)
|
||||
if alias == alphabet_name or alias == str(japanese_name):
|
||||
continue
|
||||
if alias not in aliases:
|
||||
aliases.append(alias)
|
||||
performer['aliases'] = ', '.join(set(aliases))
|
||||
|
||||
if tags_node := soup.find('p', {'class': 'implode-tags'}):
|
||||
for tag in tags_node.find_all('a'):
|
||||
if tag.text == 'breast augmentation':
|
||||
performer['fake_tits'] = 'Y'
|
||||
if tag.text == 'tatoos':
|
||||
performer['tattoos'] = 'Y'
|
||||
if tag.text == 'piercings':
|
||||
performer['piercings'] = 'Y'
|
||||
|
||||
if physical_characteristics := soup.find('p', text=lambda t: 'distinctive physical characteristics' in str(t)):
|
||||
dpc = physical_characteristics.text
|
||||
if 'breast augmentation' in dpc:
|
||||
performer['fake_tits'] = 'Y'
|
||||
if 'tattoo(s)' in dpc:
|
||||
performer['tattoos'] = 'Y'
|
||||
if 'piercing(s)' in dpc:
|
||||
performer['piercings'] = 'Y'
|
||||
|
||||
return performer
|
||||
|
||||
|
||||
def search_performer(frag):
|
||||
data = {
|
||||
'recherche_critere': 'f',
|
||||
'recherche_valeur': frag['name'],
|
||||
'x': '20',
|
||||
'y': '17'
|
||||
}
|
||||
base_site = 'http://warashi-asian-pornstars.fr'
|
||||
performers = []
|
||||
resp = requests.post(f'{base_site}/en/s-12/search', data=data)
|
||||
if resp.ok:
|
||||
soup = BeautifulSoup(resp.text, 'html.parser')
|
||||
entries = []
|
||||
already_seen = []
|
||||
if exact_match := soup.find('div', {'class': 'correspondance_exacte'}): # process exact matches first
|
||||
entries.append(exact_match)
|
||||
for e in soup.find_all('div', {'class': 'resultat-pornostar'}):
|
||||
entries.append(e)
|
||||
for entry in entries:
|
||||
p = {}
|
||||
if n := entry.find('span', {'class': 'correspondance-lien'}):
|
||||
name = n.parent.text.strip()
|
||||
p['name'] = process_name(name)
|
||||
p['url'] = f'{base_site}{n.parent.attrs["href"]}'
|
||||
elif len(n := entry.find_all('a')) > 1:
|
||||
p['name'] = process_name(n[1].text.strip())
|
||||
p['url'] = f'{base_site}{n[1].attrs["href"]}'
|
||||
if p:
|
||||
if p['url'] not in already_seen:
|
||||
performers.append(p)
|
||||
already_seen.append(p['url'])
|
||||
return performers
|
||||
|
||||
|
||||
def main():
|
||||
check_compat()
|
||||
# workaround for cp1252
|
||||
sys.stdin = io.TextIOWrapper(sys.stdin.detach(), encoding='utf-8')
|
||||
frag = json.loads(sys.stdin.read())
|
||||
arg = sys.argv[-1]
|
||||
if arg == 'performerByName':
|
||||
performers = search_performer(frag)
|
||||
result = json.dumps(performers)
|
||||
print(result)
|
||||
if arg in ['performerByFragment', 'performerByURL']:
|
||||
performer = scrape_performer(frag['url'])
|
||||
result = json.dumps(performer)
|
||||
print(result)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
25
stash/config/scrapers/community/WAPdB/WAPdB.yml
Normal file
25
stash/config/scrapers/community/WAPdB/WAPdB.yml
Normal file
@@ -0,0 +1,25 @@
|
||||
name: "WAPdB"
|
||||
performerByName:
|
||||
action: script
|
||||
script:
|
||||
- python
|
||||
- WAPdB.py
|
||||
- performerByName
|
||||
performerByFragment:
|
||||
action: script
|
||||
script:
|
||||
- python
|
||||
- WAPdB.py
|
||||
- performerByFragment
|
||||
performerByURL:
|
||||
- action: script
|
||||
url:
|
||||
- http://warashi-asian-pornstars.fr/en/
|
||||
- http://warashi-asian-pornstars.fr/fr/
|
||||
- http://warashi-asian-pornstars.fr/ja/
|
||||
script:
|
||||
- python
|
||||
- WAPdB.py
|
||||
- performerByURL
|
||||
|
||||
# Last Updated January 11, 2022
|
||||
10
stash/config/scrapers/community/WAPdB/manifest
Executable file
10
stash/config/scrapers/community/WAPdB/manifest
Executable file
@@ -0,0 +1,10 @@
|
||||
id: WAPdB
|
||||
name: WAPdB
|
||||
metadata: {}
|
||||
version: fbd81c5
|
||||
date: "2023-11-22 00:31:17"
|
||||
requires: []
|
||||
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
|
||||
files:
|
||||
- WAPdB.py
|
||||
- WAPdB.yml
|
||||
Reference in New Issue
Block a user