import os
import sys
import json

# To import from a parent directory we need to add that directory to the system path.
csd = os.path.dirname(os.path.realpath(__file__))  # get current script directory
parent = os.path.dirname(csd)  # parent directory (should be the scrapers one)
sys.path.append(
    parent
)  # add parent dir to sys path so that we can import py_common from there

try:
    # Import the Stash logging system from py_common.
    from py_common import log
except ModuleNotFoundError:
    print(
        "You need to download the folder 'py_common' from the community repo. (CommunityScrapers/tree/master/scrapers/py_common)",
        file=sys.stderr,
    )
    sys.exit(1)

try:
    # Import necessary modules.
    import requests
    import re

# If one of these modules is not installed:
except ModuleNotFoundError:
    log.error("You need to install the python modules mentioned in requirements.txt")
    log.error(
        "If you have pip (normally installed with python), run this command in a terminal from the directory the scraper is located: pip install -r requirements.txt"
    )
    sys.exit(1)

# Lookup table for tag replacements. The tags come in as hashtags, often with
# multiple words mashed together. This is a quick and dirty way of turning them
# into meaningful data, and can be expanded on to taste.
TAG_REPLACEMENTS = {
    "Fin Dom": "Findom",
    "Fem Dom": "Femdom",
    "bigtits": "Big Tits",
    "titworship": "Tit Worship",
    "financialdomination": "Financial Domination",
    "R I P O F F": "ripoff",
    "pussydenial": "pussy denial",
}
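# Illustrative example (hypothetical hashtags): "#FinDom" is first split by the
# CamelCase regex in scrape_scene into "Fin Dom", which the table above then
# maps to "Findom". All-lowercase mashed tags such as "#financialdomination"
# are untouched by the CamelCase split, so they need their own entry here.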


def output_json_url(title, tags, url, image, studio, performers, description, date):
    # Create a list of tag dictionaries from the tag list, skipping "N/A" entries.
    tag_dicts = [{"name": tag.strip(". ")} for tag in tags if tag.strip() != "N/A"]
    # We're only using the value of 'performers' for our performer list.
    performer_dicts = [{"name": performer} for performer in performers]
    # Dump all of this as JSON data.
    return json.dumps(
        {
            "title": title,
            "tags": tag_dicts,
            "url": url,
            "image": image,
            "studio": {"name": studio},
            "performers": performer_dicts,
            "details": description,
            "date": date,
        },
        indent=2,
    )
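# Illustrative call (hypothetical values): output_json_url("My Scene", ["Findom"],
# "https://www.loyalfans.com/example/video/example-post", None, "Example Studio",
# ["Example Studio"], "A description.", "2023-06-18") returns an indented JSON
# string with "tags": [{"name": "Findom"}] and "studio": {"name": "Example Studio"}.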


def get_cookies(scene_url: str):
    # Establish a session.
    session = requests.Session()
    # Set the headers required for a successful POST query.
    headers = {
        "Accept": "application/json",
        "Accept-Language": "en-US,en;q=0.9",
        "Content-Type": "application/json",
        "Origin": "https://www.loyalfans.com",
        "Referer": scene_url,
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    }
    # URL of the system status API. From what I can tell, this is called when a
    # Loyalfans page is first loaded.
    url = "https://www.loyalfans.com/api/v2/system-status"
    # Perform a POST query to capture the initial cookies, and return them.
    response = session.post(url, headers=headers)
    return response.cookies
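# Design note: the Session above is only used to bootstrap cookies; get_json
# below performs a plain requests.get and passes these cookies along explicitly.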


def get_api_url(scene_url: str):
    # Extract the last component of the scene URL and append it to the API link.
    # As far as I can tell, post names in scene URLs are unique; I have yet to
    # encounter any data mismatches.
    end_segment = scene_url.split("/")[-1]
    return f"https://www.loyalfans.com/api/v1/social/post/{end_segment}"


def get_json(scene_url: str):
    # Set the headers required for a successful request.
    headers = {
        "Accept": "application/json",
        "Accept-Language": "en-US,en;q=0.9",
        "Content-Type": "application/json",
        "Origin": "https://www.loyalfans.com",
        "Referer": scene_url,
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    }
    # Fetch the initial cookies using the get_cookies function.
    cookie_set = get_cookies(scene_url)
    # Request the API URL of the scene in question, adding headers and cookies,
    # then return the response as JSON.
    response = requests.get(get_api_url(scene_url), headers=headers, cookies=cookie_set)
    return response.json()
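# For reference, the fields consumed in scrape_scene below imply a response
# shaped roughly like this (inferred from the code, not a documented schema):
#   {"post": {"title": ..., "content": ..., "video_object": {"poster": ...},
#             "owner": {"display_name": ...},
#             "created_at": {"date": "2023-06-18 12:00:00"},
#             "hashtags": ["#ExampleTag", ...]}}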


def scrape_scene(scene_url: str) -> None:
    # Capture the JSON relating to this scene from the Loyalfans API. Named
    # json_data rather than json so it doesn't shadow the json module.
    json_data = get_json(scene_url)
    # Extract the title from the JSON and strip any surrounding whitespace.
    title = json_data["post"]["title"].strip()
    # Use the video thumbnail/preview poster as the image.
    image = json_data["post"]["video_object"].get("poster")
    # Extract the description, fix apostrophes and remove HTML newline tags.
    description = json_data["post"]["content"].replace("\u2019", "'").replace("<br />", "")
    # Hashtags are sometimes included at the bottom of the description. Strip
    # all that junk out, as we're utilising the hashtags for the tags; also tidy
    # up double-spacing and spaced ellipses.
    description = (
        re.sub(r"#\w+\b", "", description)
        .strip()
        .replace("  ", " ")
        .replace(". . .", "...")
    )
    # Extract the studio name.
    studio = json_data["post"]["owner"]["display_name"]
    # Extract the date. The JSON returns it in the form '2023-06-18 12:00:00',
    # but we only need the date, so the time is stripped off.
    date = json_data["post"]["created_at"]["date"].split(" ")[0]
    # Extract the tags.
    tags_list = json_data["post"]["hashtags"]
    fixed_tags = []
    for tag in tags_list:
        # Remove the hash from the start (if present).
        tag = tag.lstrip("#")
        # Split CamelCase tags into separate words.
        modified_tag = re.sub(r"(?<!^)(?=[A-Z])", " ", tag).strip()
        # Perform replacements according to the lookup table above.
        for find, replace in TAG_REPLACEMENTS.items():
            modified_tag = re.sub(
                r"\b" + re.escape(find) + r"\b", replace, modified_tag
            )
        fixed_tags.append(modified_tag)

    # LoyalFans doesn't provide a cast list, so we just use the studio name as
    # the performer name.
    performers = [studio]

    # Convert into meaningful JSON that Stash can use, and print it to stdout
    # for Stash to pick up.
    json_dump = output_json_url(
        title, fixed_tags, scene_url, image, studio, performers, description, date
    )
    print(json_dump)


def main():
    # Stash passes this script a JSON fragment on stdin; the scene URL lives in
    # its "url" key.
    fragment = json.loads(sys.stdin.read())
    url = fragment.get("url")
    # If no URL was passed to the script:
    if url is None:
        log.error("No URL provided")
        sys.exit(1)
    # Otherwise, scrape the URL we've been given.
    scrape_scene(url)


if __name__ == "__main__":
    main()

# Last updated 2023-06-18
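
# Usage sketch (hypothetical URL and filename; Stash normally supplies the
# fragment itself when it runs the scraper):
#   echo '{"url": "https://www.loyalfans.com/example/video/example-post"}' | python loyalfans.py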