stash/config/scrapers/community/SARJ-LLC/SARJ-LLC.py (new file)
@@ -0,0 +1,355 @@
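# Stash community scraper for the MetArt Network family of sites (SARJ, LLC).
# Stash feeds it a JSON fragment on stdin; the script queries the sites' JSON
# API and prints the scraped scene/gallery/performer back as JSON on stdout.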
import base64
import os
import json
import sys
import re
from urllib.parse import urlparse, urlencode

# To import from a parent directory we need to add that directory to the system path.
csd = os.path.dirname(os.path.realpath(__file__))  # current script directory
parent = os.path.dirname(csd)  # parent directory (should be the scrapers one)
sys.path.append(parent)  # add parent dir to sys.path so py_common can be imported from there

try:
    from py_common import log
except ModuleNotFoundError:
    print("You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)", file=sys.stderr)
    sys.exit(1)

try:
    import requests
except ModuleNotFoundError:
    print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
    sys.exit(1)

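# Route a site URL to the right scraper based on its path segments:
# /movie/<date>/<name>, /gallery/<date>/<name>, or /model/<name>.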
def scrape_url(url, scrape_type):
    parsed = urlparse(url)

    path = parsed.path.split('/')
    base_url = f"{parsed.scheme}://{parsed.netloc}"
    if scrape_type == 'scene':
        try:
            index = path.index('movie')
            scraped = scrape_movie(base_url, path[index + 1], path[index + 2])
        except ValueError:
            log.error(f"scene scraping not supported for {url}")
            return None
    elif scrape_type == 'gallery':
        try:
            index = path.index('gallery')
            scraped = scrape_gallery(base_url, path[index + 1], path[index + 2])
            # Stash galleries use "Photographer" where scenes use "Director".
            if scraped and (director := scraped.pop("Director", None)):
                scraped["Photographer"] = director
        except ValueError:
            log.error(f"gallery scraping not supported for {url}")
            return None
    elif scrape_type == 'performer':
        try:
            index = path.index('model')
            scraped = scrape_model(base_url, path[index + 1])
        except ValueError:
            log.error(f"performer scraping not supported for {url}")
            return None
    else:
        return None

    return scraped

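# Fragment fallback: when no usable URL is given, rebuild the API's name/date
# arguments from the title and date Stash already has and query the
# network-wide domain.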
def query(fragment, query_type):
    res = None
    if query_type in ('scene', 'gallery'):
        name = re.sub(r'\W', '_', fragment['title']).upper()
        if fragment.get('date') is None:
            log.error("Date is a required field when scraping by fragment")
            return None
        date = fragment['date'].replace('-', '')

        scraper = globals()['scrape_' + ('movie' if query_type == 'scene' else query_type)]
        res = scraper('https://metartnetwork.com', date, name)
    return res

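# Free-text search against the /api/search-results endpoint, paging until the
# reported total is exhausted; results are filtered to the requested type
# (MOVIE, GALLERY, or model) and mapped to the shape Stash expects.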
def search(s_type, name):
    search_type = {
        'scene': 'MOVIE',
        'gallery': 'GALLERY',
        'performer': 'model'
    }[s_type]
    page = 1
    page_size = 30
    args = {
        'searchPhrase': name,
        'pageSize': page_size,
        'sortBy': 'relevance'
    }

    if s_type == 'performer':
        def map_result(result):
            item = result['item']
            return {
                'name': item['name'],
                'url': f"https://www.metartnetwork.com{item['path']}",
            }
    elif s_type == 'scene':
        def map_result(result):
            item = result['item']
            studio = get_studio(item['siteUUID'])
            image = None  # fall back to no image when the studio is unknown
            if studio:
                image = f"https://www.{studio[1]}{item['thumbnailCoverPath']}"
            return {
                'title': item['name'],
                'url': f"https://www.metartnetwork.com{item['path']}",
                'date': item['publishedAt'][0:item['publishedAt'].find('T')],
                'performers': list(map(lambda m: {'name': m['name']}, item['models'])),
                'image': image,
            }
    else:
        return []

    results = []

    log.info(f"Searching for {s_type} '{name}'")
    while True:
        args['page'] = page
        response = fetch("https://metartnetwork.com", "search-results", args)
        if response is None:
            # network or API error; return what we have so far
            break

        results += list(
            map(
                map_result,
                filter(
                    lambda r: r['type'] == search_type,
                    response['items']
                )
            )
        )

        if page * page_size > response['total'] or len(response['items']) == 0:
            break

        page += 1

    return results

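# Thin GET wrapper for the sites' JSON API: returns the decoded payload, or
# None on a transport error or an HTTP error status.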
def fetch(base_url, fetch_type, arguments):
    url = f"{base_url}/api/{fetch_type}?{urlencode(arguments)}"
    log.debug(f"Fetching URL {url}")
    try:
        response = requests.get(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0'
        }, timeout=(3, 6))
    except requests.exceptions.RequestException as req_ex:
        log.error(f"Error fetching URL {url}: {req_ex}")
        return None

    if response.status_code >= 400:
        log.info(f"Fetching URL {url} resulted in error status: {response.status_code}")
        return None

    return response.json()

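# Model names arrive as URL slugs (e.g. "anna-mae", an illustrative value);
# the API expects a display name ("Anna Mae"), so title-case each
# hyphen/underscore-separated part.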
def scrape_model(base_url, name):
    transformed_name = ' '.join(
        re.sub(
            '[_-]', ' ',
            re.sub(r'\w\S*', lambda m: m.group(0).lower().capitalize(), part)
        )
        for part in name.split('-')
    )
    log.info(f"Scraping model '{name}' as '{transformed_name}'")
    data = fetch(base_url, 'model', {'name': transformed_name, 'order': 'DATE', 'direction': 'DESC'})
    if data is None:
        return None

    return map_model(base_url, data)

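# Shared scene/gallery mapper: turns an API media object into the dict Stash
# expects, resolving studio, URLs, date, tags, performers and director.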
def map_media(data, studio, base_url):
    urls = []
    studio_code = data["UUID"]
    studio_name = {'Name': ""}
    if studio is not None:
        studio_url = studio[1]
        urls = [f"https://www.{studio_url}{data['path']}"]
        studio_name = {'Name': studio[0]}

    directors = []

    # The director may be listed in both the `photographers` and `crew` sections.
    if data.get("photographers"):
        for photographer in data['photographers']:
            directors.append(photographer.get('name').strip())
    # Some sites only use the `photographers` section for the director, so skip
    # their crew lists. (Names here match the studios table below.)
    if data.get('crew') and studio_name["Name"] not in ("SexArt", "ALS Scan"):
        for crew in data['crew']:
            if crew.get('role') == "Still Photographer":
                for crew_name in crew.get('names') or []:
                    name = crew_name.strip()
                    if name not in directors:
                        directors.append(name)
    director = ", ".join(directors) if directors else None

    return {
        'Title': data['name'],
        'Details': data['description'],
        'URLs': urls,
        'Date': data['publishedAt'][0:data['publishedAt'].find('T')],
        'Tags': list(map(lambda t: {'Name': t}, data['tags'])),
        'Performers': list(map(lambda m: map_model(base_url, m), data['models'])),
        'Studio': studio_name,
        'Code': studio_code,
        'Director': director
    }

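# Scene scraper: fetch the movie metadata, then try each known cover-image
# field and inline the first image that downloads as a base64 data URI
# (a form Stash accepts for scraped images).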
def scrape_movie(base_url, date, name):
    log.info(f"Scraping movie '{name}' released on {date}")
    data = fetch(base_url, 'movie', {'name': name, 'date': date})
    if data is None:
        return None

    studio = get_studio(data['media']['siteUUID'])
    res = map_media(data, studio, base_url)
    image_types = ['splashImagePath', 'coverCleanImagePath', 'coverImagePath']
    for image_type in image_types:
        if image_type in data:
            image_part = data[image_type]
            res['Image'] = f"https://cdn.metartnetwork.com/{data['media']['siteUUID']}/{image_part}"
            try:
                response = requests.get(res['Image'], headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0'
                }, timeout=(3, 6))
                if response and response.status_code < 400:
                    mime = 'image/jpeg'
                    encoded = base64.b64encode(response.content).decode('utf-8')
                    res['Image'] = f'data:{mime};base64,{encoded}'
                    break
            except requests.exceptions.RequestException as req_ex:
                log.info(f"Error fetching URL {res['Image']}: {req_ex}")
                res['Image'] = None

    return res

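# Gallery scraper: same flow as scrape_movie, minus the cover-image handling.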
def scrape_gallery(base_url, date, name):
    log.info(f"Scraping gallery '{name}' released on {date}")
    data = fetch(base_url, 'gallery', {'name': name, 'date': date})
    if data is None:
        return None

    studio = get_studio(data['siteUUID'])
    return map_media(data, studio, base_url)

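# Performer mapper: converts an API model object into Stash's performer
# fields, folding physical attributes (hair, eyes, breasts, pubic hair) into
# extra tags.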
def map_model(base_url, model):
    tags = list(map(lambda t: {'Name': t}, model['tags']))

    def add_tag(key, tag_format):
        nonlocal tags
        if key in model and model[key] != "":
            tags.append({
                'Name': tag_format.format(model[key])
            })

    add_tag('hair', '{} hair')
    add_tag('pubicHair', '{} pussy')
    add_tag('eyes', '{} eyes')
    add_tag('breasts', '{} breasts')

    country_name = (model.get("country") or {}).get("name")
    # "Unknown" is not parsable by Stash; convert it to None.
    if country_name == "Unknown":
        country_name = None

    return {
        'Name': model.get("name"),
        'Gender': (model.get("gender") or "").upper(),
        'URL': f"{base_url}{model.get('path')}",
        'Ethnicity': model.get("ethnicity"),
        'Country': country_name,
        # `or ""` avoids emitting the literal string "None" for missing fields
        'Height': str(model.get("height") or ""),
        'Weight': str(model.get("weight") or ""),
        'Measurements': model.get("size"),
        'Details': model.get("biography"),
        'hair_color': (model.get("hair") or "").capitalize(),
        'eye_color': (model.get("eyes") or "").capitalize(),
        'Image': f"https://cdn.metartnetwork.com/{model.get('siteUUID')}{model.get('headshotImagePath')}",
        'Tags': tags
    }

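# Site UUID -> (studio name, domain). The UUIDs match the API's siteUUID
# field and also key the cdn.metartnetwork.com image URLs.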
studios = {
    '2163551D11D0439686AD9D291C8DFD71': ('ALS Scan', 'alsscan.com'),
    'D0E7E33329311E3BB6E0800200C93255': ('Domai', 'domai.com'),
    'FDA021004E3411DF98790800200C9A66': ('Erotic Beauty', 'eroticbeauty.com'),
    '15A9FFA04E3511DF98790800200C9A66': ('Errotica Archives', 'errotica-archives.com'),
    '706DF46B88884F7BB226097952427754': ('Eternal Desire', 'eternaldesire.com'),
    '5592E33324211E3FF640800200C93111': ('Goddess Nudes', 'goddessnudes.com'),
    '5A68E1D7B6E69E7401226779D559A10A': ('Love Hairy', 'lovehairy.com'),
    'E6B595104E3411DF98790800200C9A66': ('MetArt', 'metart.com'),
    '5C38C84F55841824817C19987F5447B0': ('MetArt Intimate', 'metart.com'),
    'E7DFB70DF31C45B3B5E0BF10D733D349': ('MetArt X', 'metartx.com'),
    'D99236C04DD011E1B86C0800200C9A66': ('Rylsky Art', 'rylskyart.com'),
    '94DB3D0036FC11E1B86C0800200C9A66': ('SexArt', 'sexart.com'),
    '3D345D1E156910B44DB5A80CDD746318': ('Straplez', 'straplez.com'),
    '18A2E47EAEFD45F29033A5FCAF1F5B91': ('Stunning 18', 'stunning18.com'),
    'FDAFDF209DC311E0AA820800200C9A66': ('The Life Erotic', 'thelifeerotic.com'),
    '4F23028982B542FA9C6DAAA747E9B5B3': ('Viv Thomas', 'vivthomas.com'),
}

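# A URL is scrapeable if it is HTTP(S) and belongs to a known studio domain
# or to the network-wide metartnetwork.com.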
def validate_url(url):
    if url is None or not re.match('^https?://', url):
        return False

    for (_, domain) in studios.values():
        if domain in url:
            return True

    if 'metartnetwork.com' in url:
        return True

    return False


def get_studio(site_uuid):
    return studios.get(site_uuid)

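# Entry point: Stash calls the script with a mode argument (scrape, query,
# search) plus a type argument, and passes a JSON fragment on stdin. An
# illustrative invocation (the URL and title values are made up):
#   echo '{"url": "https://www.metart.com/movie/20200101/EXAMPLE/"}' \
#     | python SARJ-LLC.py scrape scene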
scraper_input = sys.stdin.read()
i = json.loads(scraper_input)
log.debug(f"Started with input: {scraper_input}")

ret = {}
if sys.argv[1] == "scrape":
    ret = scrape_url(i['url'], sys.argv[2])
elif sys.argv[1] == "query":
    if 'url' in i and validate_url(i['url']):
        ret = scrape_url(i['url'], sys.argv[2])

    # Fall back to a fragment query when URL scraping yielded nothing.
    if ret is None or ret == {}:
        ret = query(i, sys.argv[2])
elif sys.argv[1] == 'search':
    if i.get('title') is not None or i.get('name') is not None:
        ret = search(sys.argv[2], i['title'] if 'title' in i else i['name'])

if ret is not None:
    output = json.dumps(ret)
    print(output)
    # log.debug(f"Sent output: {output}")
else:
    print("{}")