stash
This commit is contained in:
@@ -0,0 +1,267 @@
|
||||
import json
import os
import re
import sys
from datetime import datetime

# to import from a parent directory we need to add that directory to the system path
csd = os.path.dirname(os.path.realpath(__file__))  # get current script directory
parent = os.path.dirname(csd)  # parent directory (should be the scrapers one)
sys.path.append(
    parent
)  # add parent dir to sys path so that we can import py_common from there
|
||||
|
||||
# Third-party dependency guards: fail with an actionable message instead of a
# raw traceback when a required module is missing.
try:
    import cloudscraper
except ModuleNotFoundError:
    print("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper", file=sys.stderr)
    # CONSISTENCY FIX: was sys.exit() (exit status 0); a missing dependency is
    # an error, so exit nonzero like the py_common guard below.
    sys.exit(1)

try:
    import requests
except ModuleNotFoundError:
    print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
    sys.exit(1)  # was sys.exit(); exit nonzero on failure

try:
    from lxml import html
except ModuleNotFoundError:
    print("You need to install the lxml module. (https://lxml.de/installation.html#installation)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml", file=sys.stderr)
    sys.exit(1)  # was sys.exit(); exit nonzero on failure

try:
    import py_common.log as log
except ModuleNotFoundError:
    print(
        "You need to download the folder 'py_common' from the community repo (CommunityScrapers/tree/master/scrapers/py_common)",
        file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
# --------------------------------------

# This is a scraper for: animecharactersdatabase.com
#
# AnimeCharactersDatabase includes characters from:
# Anime, Hentai, (Mobile) Games, Eroge, Virtual Idols/YouTubers, Vocaloid
#
# These fields will be populated if available:
# Name, Gender, Birthdate, Country, Hair Color, Eye Color, Height, Measurements, URL, Details, Tags, Image
#
# A number of additional tags can be configured below.

# ---------------------------------------
# ---------- Tag Configuration ----------
# ---------------------------------------

# Maximum number of search results (between 1 and 30).
# Search by name includes the franchise for each result to make it easier to choose the correct one.
# Some (non ascii, very short) names require querying the API individually to get the franchise for each result.
# This might get you banned, since the API is rate limited.
# See: http://wiki.animecharactersdatabase.com/index.php?title=API_Access
limit = 15

# Prefix for performer tags.
prefix = "performer:"

# List of additional tags applied to every scraped performer. Set to [] for none.
additional_tags = [{"name": "fictional"}]  # []

# Tags mostly include appearance indicators like: ahoge, dress, hat, twintails, etc.
include_tag = True
tag_prefix = prefix

# Scrape the source material as tag (name of anime/game): Kantai Collection, Idolmaster: Cinderella Girls, etc.
include_parody = True
parody_prefix = "parody:"

# Scrape Zodiac Sign as tag: Libra ♎, Sagittarius ♐, etc.
include_sign = True
sign_prefix = prefix + "sign:"

# Scrape race of non-human characters as tag: Orc, Elf, etc.
include_race = True
race_prefix = prefix + "race:"

# Scrape ship class of ship girls as tag (kancolle, etc.): Destroyer, etc.
include_ship_class = True
ship_class_prefix = prefix + "ship:"

# Scrape blood type as tag: A, B, etc.
include_blood_type = True
blood_type_prefix = prefix + "Blood Type "

# Scrape apparent age as tag: Adult, Teen, etc.
# Might differ from canonical age.
# Canonical age will be ignored, since it would result in too many tags.
# Birthdate is sometimes available, but the resulting calculated age represents neither canonical age nor apparent age.
include_apparent_age = True
apparent_age_prefix = prefix + "Apparent "

# Scrape Hair Length as tag: To Shoulders, To Neck, Past Waist, etc.
include_hair_length = True
hair_length_prefix = prefix + "Hair "


# ---------------------------------------
# ---------------------------------------
# ---------------------------------------
|
||||
|
||||
def readJSONInput():
    """Read a JSON document from stdin (supplied by Stash) and return the parsed object."""
    # IDIOM FIX: the original bound the raw text to `input`, shadowing the builtin.
    raw = sys.stdin.read()
    return json.loads(raw)
|
||||
|
||||
|
||||
def scrapeURL(url):
    """Fetch *url* and return the response body parsed as an lxml HTML tree."""
    page_source = scrapeUrlToString(url)
    return html.fromstring(page_source)
|
||||
|
||||
|
||||
def scrapeUrlToString(url):
    """Fetch *url* through cloudscraper and return the raw response body (bytes).

    Exits the process (status 1) on a request failure or an HTTP status
    code >= 400, logging the reason via py_common.log.
    """
    scraper = cloudscraper.create_scraper()
    try:
        scraped = scraper.get(url)
    except Exception as e:
        # BUG FIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt and hid the actual failure reason from the log.
        log.error(f"scrape error: {e}")
        sys.exit(1)

    if scraped.status_code >= 400:
        log.error('HTTP Error: %s' % scraped.status_code)
        sys.exit(1)

    return scraped.content
|
||||
|
||||
|
||||
def performerByName(query):
    """Search the site for *query* and return candidate performers.

    Each result is a dict with "name", "id" (numeric character id as a
    string) and "url" keys.
    """
    cleanedQuery = requests.utils.quote(query)
    url = f"https://www.animecharactersdatabase.com/searchall.php?in=characters&sq={cleanedQuery}"
    tree = scrapeURL(url)
    names = tree.xpath("//li/div[@class='tile3top']/a/text()")
    hrefs = tree.xpath("//li/div[@class='tile3top']/a/@href")

    # Pair each character name with its link; the href carries the numeric id.
    results = [
        {
            "name": character_name,
            "id": href.replace("characters.php?id=", ""),
            "url": "https://www.animecharactersdatabase.com/" + href,
        }
        for character_name, href in zip(names, hrefs)
    ]
    log.info(f"scraped {len(results)} results on: {url}")
    return results
|
||||
|
||||
|
||||
def addFranchise(query, results):
    """Append each result's source franchise to its display name.

    First queries the site's search API once for *query* and matches results
    by id; any result not found there falls back to one API call per
    character (rate limited — may get you banned). The "id" key is removed
    from each result before returning, since Stash does not expect it.
    """
    cleanedQuery = requests.utils.quote(query)
    url = f"https://www.animecharactersdatabase.com/api_series_characters.php?character_q={cleanedQuery}"
    data = json.loads(scrapeUrlToString(url))
    count1 = 0
    count2 = 0
    for result in results:
        try:
            # Try to find the franchise in API search results.
            # These results are ordered by alphabet and limited to 100,
            # so short queries might not include the correct result.
            # The API query also does not seem to support any Kanji.
            franchise = next(e["anime_name"] for e in data["search_results"] if str(e["id"]) == result["id"])
            count1 += 1
        except (StopIteration, KeyError, TypeError):
            # BUG FIX: was a bare `except:`. Only the lookup failures we
            # expect (no match, missing key, non-list payload) should trigger
            # the per-character fallback.
            # Use separate API calls as a backup.
            # This might get you banned, since the API is rate limited.
            franchise = apiGetCharacter(result["id"])["origin"]
            count2 += 1
        # Append franchise to character name for easier differentiation.
        result["name"] = f"{result['name']} ({franchise})"
        result.pop("id")
    log.debug(f"scraped {count1} franchises by single API call")
    log.debug(f"scraped {count2} franchises by separate API calls")
    return results
|
||||
|
||||
|
||||
def apiGetCharacter(id):
    """Fetch a single character record from the site API by its numeric id."""
    endpoint = f"https://www.animecharactersdatabase.com/api_series_characters.php?character_id={id}"
    payload = scrapeUrlToString(endpoint)
    return json.loads(payload)
|
||||
|
||||
|
||||
def performerByURL(url, result=None):
    """Scrape a character page at *url* into a Stash performer fragment dict.

    Populates name, details, image, tags, country, birthdate, measurements,
    height, gender, eye_color and hair_color where available. An existing
    dict may be passed as *result* to be filled in place.
    """
    # BUG FIX: the original declared `result={}` — a mutable default argument
    # shared across calls, so earlier scrapes leaked into later ones.
    if result is None:
        result = {}
    log.debug("performerByURL: " + url)
    tree = scrapeURL(url)
    result["url"] = url
    result["name"] = next(iter(tree.xpath(
        "//h3[@id='section001_summary']/following-sibling::p/a[contains(@href,'character')]/text()")), "").strip()
    result["details"] = "\n".join([s.strip() for s in tree.xpath(
        "//div[@style='padding: 0 15px 15px 15px; text-align: left;']/text()")])
    if not result["details"]:
        # Fallback: stitch the summary paragraph together; the regex drops the
        # stray space left before the final period.
        result["details"] = re.sub(" .$", ".", " ".join([s.strip() for s in tree.xpath(
            "//h3[@id='section001_summary']/following-sibling::p[contains(a/@href,'character')]//text()") if
            s.strip()]))
    result["image"] = next(iter(tree.xpath("//meta[@property='og:image']/@content")), "")

    # left table, works for link and plain text fields, return result list
    def parse_left(field):
        template = "//table//th[text()='{0}' or a/text()='{0}']/following-sibling::td/a/text()"
        return tree.xpath(template.format(field))

    # BUG FIX: was `result["tags"] = additional_tags`, which aliased the
    # module-level list so the `+=` below grew it on every call. Copy it.
    result["tags"] = list(additional_tags)
    if include_tag:
        result["tags"] += [{"name": tag_prefix + tag.strip()} for tag in parse_left("Tags ")]
    if include_parody:
        result["tags"] += [{"name": parody_prefix + tag.strip()} for tag in parse_left("From")]
    if include_blood_type:
        result["tags"] += [{"name": blood_type_prefix + tag.strip()} for tag in parse_left("Blood Type")]
    if include_race:
        result["tags"] += [{"name": race_prefix + tag.strip()} for tag in parse_left("Race")]
    if include_sign:
        result["tags"] += [{"name": sign_prefix + tag.strip()} for tag in parse_left("Sign")]
    if include_ship_class:
        result["tags"] += [{"name": ship_class_prefix + tag.strip()} for tag in parse_left("Ship Class")]
    result["country"] = next(iter(parse_left("Nationality")), "")
    birthday = parse_left("Birthday")
    birthyear = parse_left("Birthyear")
    if birthday and birthyear:
        # Birthday is e.g. "April 1"; combine it with the year field.
        birthdate = datetime.strptime(birthday[0].strip(), "%B %d").replace(year=int(birthyear[0].strip()))
        result["birthdate"] = birthdate.strftime("%Y-%m-%d")
    bust = parse_left("Bust")
    waist = parse_left("Waist")
    hip = parse_left("Hip")
    if bust and waist and hip:
        bust = bust[0].strip().replace("cm", "")
        waist = waist[0].strip().replace("cm", "")
        hip = hip[0].strip().replace("cm", "")
        result["measurements"] = "{}-{}-{}".format(bust, waist, hip)
    result["height"] = next(iter(parse_left("Height")), "").strip().replace("cm", "")

    # middle/right table, reverse result list to prefer official appearance, return result or empty string
    def parse_right(field):
        template = "//table//th[text()='{}']/following-sibling::td/text()"
        return next(reversed(tree.xpath(template.format(field))), "").strip().replace("Unknown", "")

    # should be tagged anyway if yes
    # if parse_right("Animal Ears") == "Yes":
    #     result["tags"] += [{"name": "performer:animal ears"}]
    hair_length = parse_right("Hair Length")
    if include_hair_length and hair_length:
        result["tags"] += [{"name": hair_length_prefix + hair_length}]
    apparent_age = parse_right("Apparent Age")
    if include_apparent_age and apparent_age:
        result["tags"] += [{"name": apparent_age_prefix + apparent_age}]
    result["gender"] = parse_right("Gender")
    result["eye_color"] = parse_right("Eye Color")
    result["hair_color"] = parse_right("Hair Color")

    return result
|
||||
|
||||
|
||||
# read the input
|
||||
i = readJSONInput()
|
||||
|
||||
if sys.argv[1] == "performerByURL":
|
||||
url = i["url"]
|
||||
result = performerByURL(url)
|
||||
print(json.dumps(result))
|
||||
elif sys.argv[1] == "performerByName":
|
||||
name = i["name"]
|
||||
log.info(f"Searching for name: {name}")
|
||||
results = performerByName(name)[:limit]
|
||||
results = addFranchise(name, results)
|
||||
print(json.dumps(results))
|
||||
@@ -0,0 +1,20 @@
|
||||
# Stash scraper definition: routes performer lookups to the Python script.
name: AnimeCharactersDatabase
# requires: py_common

performerByURL:
  - action: script
    url:
      - animecharactersdatabase.com/characters.php
    script:
      - python
      - AnimeCharactersDatabase.py
      - performerByURL

performerByName:
  action: script
  script:
    - python
    - AnimeCharactersDatabase.py
    - performerByName

# Last Updated January 23, 2022
|
||||
10
stash/config/scrapers/community/AnimeCharactersDatabase/manifest
Executable file
10
stash/config/scrapers/community/AnimeCharactersDatabase/manifest
Executable file
@@ -0,0 +1,10 @@
|
||||
# Package manifest generated by the scraper-manager (version/date identify the
# installed snapshot from the community repository).
id: AnimeCharactersDatabase
name: AnimeCharactersDatabase
metadata: {}
version: 4fcb313
date: "2023-12-19 22:39:58"
requires: []
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
files:
- AnimeCharactersDatabase.yml
- AnimeCharactersDatabase.py
|
||||
Reference in New Issue
Block a user