259 lines
8.1 KiB
Python
259 lines
8.1 KiB
Python
|
|
import json
|
|
import sys
|
|
|
|
try:
|
|
import requests
|
|
except ModuleNotFoundError:
|
|
print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
|
|
print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
|
|
sys.exit()
|
|
|
|
class multiscrape:
    """Merge the results of several Stash performer scrapers into one record.

    Every scraper is invoked through the GraphQL endpoint at ``url``.
    ``config`` decides, per performer field, which scrapers are consulted and
    in which order of preference.
    """

    # GraphQL endpoint every request is POSTed to.
    url = "http://localhost:9999/graphql"

    # HTTP headers sent with every GraphQL request.
    headers = {
        "Accept-Encoding": "gzip, deflate, br",
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Connection": "keep-alive",
        "DNT": "1",
    }

    # Update the config below with the preferred scraper order for each field.
    # If a scraper has no result for the performer or for the field, the next
    # scraper in the list is used. Each scraper's result is cached, so a
    # scraper runs at most once per fetch.
    #
    # NOTE(review): "tags", "details", "death_date", "hair_color" and "weight"
    # are not requested by the queries below, so they are never filled in
    # unless the selection in _PERFORMER_FIELDS is extended.
    config = {
        "gender": ['stash-sqlite'],
        "url": ['Babepedia', 'stash-sqlite', 'FreeonesCommunity', 'Brazzers', 'Pornhub'],
        "twitter": ['Babepedia', 'stash-sqlite'],
        "instagram": ['Babepedia'],
        "birthdate": ['IMBD', 'FreeonesCommunity', 'Babepedia', 'stash-sqlite'],
        "ethnicity": ['Babepedia', 'stash-sqlite'],
        "country": ['Babepedia', 'stash-sqlite'],
        "eye_color": ['Babepedia', 'stash-sqlite'],
        "height": ['Babepedia', 'Pornhub', 'stash-sqlite'],
        "measurements": ['Babepedia', 'Pornhub', 'FreeonesCommunity', 'stash-sqlite'],
        "fake_tits": ['Babepedia', 'stash-sqlite'],
        "career_length": ['Pornhub', 'Babepedia', 'stash-sqlite'],
        "tattoos": ['Babepedia', 'stash-sqlite'],
        "piercings": ['Babepedia', 'stash-sqlite'],
        "aliases": ['Babepedia', 'stash-sqlite'],
        "tags": ['Babepedia'],
        "details": ['FreeonesCommunity', 'Babepedia', 'Brazzers'],
        "death_date": ['Babepedia'],
        "hair_color": ['Babepedia'],
        "weight": ['Babepedia', 'FreeonesCommunity'],
        "image": ['performer-image-dir', 'Babepedia', 'FreeonesCommunity'],
    }

    # Field selection shared by both scrape queries so they cannot drift apart.
    _PERFORMER_FIELDS = """
            name
            url
            gender
            twitter
            instagram
            birthdate
            ethnicity
            country
            eye_color
            height
            measurements
            fake_tits
            career_length
            tattoos
            piercings
            aliases
            image
    """

    def __prefix(self, levelChar):
        """Return the log-level marker placed in front of a log line.

        The marker is SOH + level character + STX.
        NOTE(review): this mirrors the stderr log-level framing used by Stash
        plugins; confirm against the Stash plugin documentation.
        """
        if isinstance(levelChar, str):
            levelChar = levelChar.encode()
        return (b'\x01' + levelChar + b'\x02').decode()

    def __log(self, levelChar, s):
        """Write ``s`` to stderr tagged with ``levelChar``; an empty level is a no-op."""
        if not levelChar:
            return

        print(self.__prefix(levelChar) + str(s) + "\n", file=sys.stderr, flush=True)

    def trace(self, s):
        """Log ``s`` at trace level."""
        self.__log(b't', s)

    def debug(self, s):
        """Log ``s`` at debug level."""
        self.__log(b'd', s)

    def info(self, s):
        """Log ``s`` at info level."""
        self.__log(b'i', s)

    def warning(self, s):
        """Log ``s`` at warning level."""
        self.__log(b'w', s)

    def error(self, s):
        """Log ``s`` at error level."""
        self.__log(b'e', s)

    def __callGraphQL(self, query, variables=None):
        """POST a GraphQL query and return its ``data`` payload.

        Returns the ``data`` member of the response, or ``None`` when it is
        missing or empty.

        Raises:
            Exception: on a non-200 HTTP status, or when the response carries
                a truthy ``error`` member.
        """
        # Named "payload" so the module-level ``json`` import is not shadowed.
        payload = {'query': query}
        if variables is not None:
            payload['variables'] = variables

        # handle cookies
        response = requests.post(self.url, json=payload, headers=self.headers)

        if response.status_code != 200:
            raise Exception(
                "GraphQL query failed:{} - {}. Query: {}. Variables: {}".format(
                    response.status_code, response.content, query, variables))

        result = response.json()

        failure = result.get("error")
        if failure:
            details = failure.get("errors") if isinstance(failure, dict) else None
            raise Exception("GraphQL error: {}".format(details[0] if details else failure))

        # A top-level "errors" list is only reported, not raised: callers treat
        # a missing "data" payload as "this scraper has no result" and move on.
        if result.get("errors"):
            print("GraphQL returned errors: {}".format(result["errors"]), file=sys.stderr)

        return result.get("data") or None

    def list_scrapers(self, type):
        """Return the ids of performer scrapers whose supported scrapes include ``type``."""
        query = """query listPerformerScrapers {
          listPerformerScrapers {
            id
            name
            performer {
              supported_scrapes
            }
          }
        }"""
        result = self.__callGraphQL(query)
        if not result:
            return []

        ret = []
        # Read the same keys the query asks for (performer, not scene).
        for r in result.get("listPerformerScrapers") or []:
            supported = (r.get("performer") or {}).get("supported_scrapes") or []
            if type in supported:
                ret.append(r["id"])
        return ret

    def scrape_performer_list(self, scraper_id, performer):
        """Search scraper ``scraper_id`` for performers matching the name ``performer``.

        Returns the ``scrapePerformerList`` value of the response, or ``None``
        when the request produced no data.
        """
        query = (
            "query scrapePerformerList($scraper_id: ID!, $performer: String!) {\n"
            "  scrapePerformerList(scraper_id: $scraper_id, query: $performer) {"
            + self._PERFORMER_FIELDS +
            "  }\n"
            "}"
        )

        variables = {'scraper_id': scraper_id, 'performer': performer}
        result = self.__callGraphQL(query, variables)
        if result is not None:
            return result["scrapePerformerList"]
        return None

    def scrape_performer(self, scraper_id, performer):
        """Scrape the full details for ``performer`` using scraper ``scraper_id``.

        ``performer`` is the scraped-performer input dict (name and url are
        what this file passes). Returns the ``scrapePerformer`` value of the
        response, or ``None`` when the request produced no data.
        """
        query = (
            "query scrapePerformer($scraper_id: ID!, $performer: ScrapedPerformerInput!) {\n"
            "  scrapePerformer(scraper_id: $scraper_id, scraped_performer: $performer) {"
            + self._PERFORMER_FIELDS +
            "  }\n"
            "}"
        )
        variables = {'scraper_id': scraper_id, 'performer': performer}
        result = self.__callGraphQL(query, variables)
        # Guard against a missing data payload instead of indexing None.
        if result is not None:
            return result.get("scrapePerformer")
        return None

    def required_scrapers(self):
        """Return every scraper id named in ``config``, de-duplicated, in first-seen order."""
        return list(dict.fromkeys(
            scraper for scrapers in self.config.values() for scraper in scrapers))

    # Original (misspelled) name, kept so existing callers continue to work.
    requred_scrapers = required_scrapers

    def query_performers(self, name):
        """Search every configured scraper for ``name`` and merge the hits.

        When several scrapers return a performer with the same name, only the
        first one encountered is kept.
        """
        ret = []
        seen_names = set()

        for scraper in self.requred_scrapers():
            print("Querying performers " + scraper, file=sys.stderr)
            tmp = self.scrape_performer_list(scraper, name)
            if tmp is None:
                continue
            for s in tmp:
                if s["name"] not in seen_names:
                    seen_names.add(s["name"])
                    ret.append(s)
        return ret

    def __scrape_exact_match(self, scraper, name):
        """Return ``scraper``'s details for the performer named ``name``.

        Only search hits whose name equals ``name`` case-insensitively are
        considered. Returns ``{}`` when nothing usable is found.
        """
        for candidate in self.scrape_performer_list(scraper, name) or []:
            candidate_name = candidate.get("name")
            if candidate_name and candidate_name.lower() == name.lower():
                details = self.scrape_performer(
                    scraper, {"name": candidate_name, "url": candidate.get("url")})
                if details is not None:
                    return details
        return {}

    def fetch_performer(self, name):
        """Build a performer record for ``name`` following ``config``.

        For each configured field, the scrapers are tried in listed order and
        the first non-empty value wins. A scraper is run lazily the first time
        one of its fields is needed, and its result is cached for the remaining
        fields.

        Returns a dict that always contains ``"name"`` plus every field for
        which some scraper supplied a value.
        """
        ret = {"name": name}
        scraper_cache = {}

        for field, scrapers in self.config.items():
            for s in scrapers:
                if s in scraper_cache:
                    source = "updating field from cache using scraper: " + s + " for field: " + field
                else:
                    print("Running scraper: " + s + " " + field, file=sys.stderr)
                    scraper_cache[s] = self.__scrape_exact_match(s, name)
                    source = "Saving results from scraper: " + field + " " + s

                value = scraper_cache[s].get(field)
                # A null or empty value means this scraper has nothing for the
                # field, so fall through to the next preferred scraper.
                if value in (None, "", [], {}):
                    continue

                print(source, file=sys.stderr)
                ret[field] = value
                # Stop here so a lower-priority scraper cannot overwrite it.
                break
        return ret
|
|
|
|
|
|
|
|
def main(argv=None):
    """Command-line entry point.

    Modes (first argument):
      query - read a JSON fragment from stdin and print the performers that
              match its "name" across all configured scrapers.
      fetch - read a JSON fragment from stdin and print the merged performer
              record for its "name".
      test  - print the list of scrapers required by the configuration.

    ``argv`` defaults to ``sys.argv[1:]``. A missing or unknown mode prints a
    usage message to stderr and exits with status 1.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    mode = args[0] if args else None

    if mode == "test":
        scraper = multiscrape()
        scrapers = scraper.requred_scrapers()
        print(scrapers)
        return

    if mode not in ("query", "fetch"):
        print("usage: multiscrape.py query|fetch|test", file=sys.stderr)
        sys.exit(1)

    fragment = json.loads(sys.stdin.read())
    print("input: " + json.dumps(fragment), file=sys.stderr)
    scraper = multiscrape()

    if mode == "query":
        result = scraper.query_performers(fragment['name'])
    else:
        result = scraper.fetch_performer(fragment['name'])

    if not result:
        print(f"Could not determine details for performer: `{fragment['name']}`", file=sys.stderr)
        print("{}")
    else:
        print(json.dumps(result))


if __name__ == "__main__":
    main()
|
|
|
|
|