stash
160
stash/config/scrapers/community/FratX/FratX.py
Normal file
@@ -0,0 +1,160 @@
import base64
from datetime import datetime
import json
import re
import sys

# extra modules below need to be installed
try:
    from py_common import log as stash_log
except ModuleNotFoundError:
    print("You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)", file=sys.stderr)
    sys.exit()

try:
    from bs4 import BeautifulSoup
except ModuleNotFoundError:
    print("You need to install the BeautifulSoup4 package. (https://pypi.org/project/beautifulsoup4/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install beautifulsoup4", file=sys.stderr)
    sys.exit()

try:
    import requests
except ModuleNotFoundError:
    print("You need to install the requests package. (https://pypi.org/project/requests/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
    sys.exit()

# NOTES:
# This scraper both scrapes scenes from exact URLs and attempts to
# look up scenes based on title fragments.

# Scene by URL
# Items returned include:
# title: In many cases, the title listed on the current site is
# different from the scene's original title recorded on IAFD.com, etc.
# date: The dates listed on the site are almost all altered to give the
# appearance of a more regular update schedule. If uploading to
# StashDB, use a more reliable source for the scene date and confirm
# the original title.
# image: The background image from the video preview. This is usually, but
# not always, the same as the preview image on the episodes listing page.

# Scene by Fragment
# There isn't a search or API on the site, so a best-effort guess of the
# URL slug is made from the given title. Always confirm the returned
# scene matches your content. Many scenes (including all those before
# FX142A, 2017-10-11) have been removed from the site and can't be scraped.
# Many URL slugs are still based on the original titles, so search on
# the original title if you know it.
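
# Illustrative usage (not part of the scraper itself): stash invokes this
# script as configured in FratX.yml, passing a JSON fragment on stdin, e.g.
#   echo '{"url": "https://fratx.com/episode/some-slug"}' | python FratX.py scene_from_url
#   echo '{"title": "FratX - Some Title.mp4"}' | python FratX.py scene_query
# Either command prints a single JSON object with the scraped fields to stdout.
# ("some-slug" and "Some Title" are placeholders, not real episodes.)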


def log(msg):
    # Log the error via py_common, print an empty JSON object so stash still
    # receives valid output, then exit with a non-zero status.
    stash_log.error(msg)
    ret_null = {}
    print(json.dumps(ret_null))
    sys.exit(1)


def scene_from_url(url, page=None):
    ret = {
        "studio": {"name": "FratX"},
        "url": url
    }

    if not page:
        page = requests.get(url)
        if page.status_code != 200:
            log(f"HTTP Error: {page.status_code} returned when requesting {url}")

    page_soup = BeautifulSoup(page.text, "html.parser")

    # Try to get the image first: the episode page embeds a player iframe whose
    # script carries a token; the videostreamingsolutions API resolves that
    # token to video metadata, including the banner image used here as the cover.
    try:
        stream_link = page_soup.find("iframe").attrs["src"]
        stream = requests.get(stream_link)
        stream_soup = BeautifulSoup(stream.text, "html.parser")
        script_text = stream_soup.find("script").text
        match_obj = re.search(r"token:\s+[\'|\"](.*)[\'|\"],", script_text)
        token = match_obj.group(1)
        vss = "https://videostreamingsolutions.net/api:ov-embed/parseToken?token="

        video_data = requests.get(vss + token)
        video_json = json.loads(video_data.text)
        img_path = video_json['_video']['xdo']['banner']['path']
        img_url = f"https://videostreamingsolutions.net{img_path}?tpl=large.jpg"
        img_b64 = base64.b64encode(requests.get(img_url).content)
        ret["image"] = "data:image/jpeg;base64," + img_b64.decode('utf-8')
    except Exception as img_e:
        stash_log.error(f"Unable to retrieve cover image due to exception: {img_e}")
        img_b64 = ""
        stash_log.info("Attempting to collect other metadata.")

    scene_data = page_soup.find(class_="episode-description")
    ret["title"] = scene_data.find("h1").text.strip().title()

    date_and_details = scene_data.find("p").text
    try:
        # try to get the date from the paragraph text
        match_obj = re.search(r"(.*2\d{3})\s+-\s+(.*)", date_and_details)
        ret["details"] = match_obj.group(2).strip()

        date_str = match_obj.group(1)
        # Handle dates with 1st, 2nd, 3rd, 4th, etc.
        date_str = re.sub(r"(?<=\d)st|nd|rd|th", "", date_str).strip()
        ret["date"] = str(datetime.strptime(date_str, "%B %d, %Y").date())
    except AttributeError:
        # if no date is found, use all of the text as the details
        ret["details"] = date_and_details

    print(json.dumps(ret))
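
# For reference, a successful scrape prints JSON shaped like this (values are
# illustrative, not from a real episode):
# {"studio": {"name": "FratX"}, "url": "https://fratx.com/episode/some-slug",
#  "image": "data:image/jpeg;base64,...", "title": "Some Title",
#  "date": "2020-01-01", "details": "Episode description..."}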


def guess_url_from_title(title):
    title = title.strip().lower()
    # remove file extension
    title = re.sub(r"\.[\da-z]{2,4}$", "", title)
    # clean the title of punctuation not likely to be in the url slug
    title = "".join(c for c in title if c.isalnum() or c.isspace())
    tokens = title.split()
    # remove studio names and production numbers
    tokens = [
        t for t in tokens if
        (
            t not in ['fraternityx', 'fratx', 'fx'] and
            not re.search(r"^(?:fx)?\d{3}\w?$", t)
        )
    ]
    if not tokens:
        return (None, None)

    base_url = "https://fratx.com/episode/"
    for connector in ["_", "-", ""]:
        url = base_url + connector.join(tokens)
        page = requests.get(url)
        if page.status_code == 200:
            return (url, page)

    # Some episodes' url slugs are just the longest word
    longest_word = sorted(tokens, key=lambda t: len(t))[-1]
    url = base_url + longest_word
    page = requests.get(url)
    if page.status_code == 200:
        return (url, page)

    return (None, None)
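
# Illustration of the guess (hypothetical title, not a real episode):
# "FratX FX200 Pound It Harder.mp4" -> tokens ["pound", "it", "harder"] ->
# tries episode/pound_it_harder, episode/pound-it-harder,
# episode/pounditharder, and finally the longest word alone,
# episode/harder; the first URL that returns HTTP 200 wins.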


if sys.argv[1] == "scene_from_url":
    frag = json.loads(sys.stdin.read())
    if 'url' not in frag or not frag['url']:
        log('No URL entered.')
    scene_from_url(frag['url'])
elif sys.argv[1] == "scene_query":
    frag = json.loads(sys.stdin.read())
    if 'title' not in frag or not frag['title']:
        log('No title entered.')
    scene_url, scene_page = guess_url_from_title(frag['title'])
    if scene_url and scene_page:
        scene_from_url(scene_url, page=scene_page)
    else:
        log(f"Couldn't find scene URL from '{frag['title']}'")
18
stash/config/scrapers/community/FratX/FratX.yml
Normal file
@@ -0,0 +1,18 @@
name: FratX
# requires: py_common

sceneByURL:
  - action: script
    script:
      - python
      - FratX.py
      - scene_from_url
    url:
      - fratx.com/episode/
sceneByFragment:
  action: script
  script:
    - python
    - FratX.py
    - scene_query
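# Note: for sceneByFragment, stash pipes the scene fragment to the script as
# JSON on stdin; FratX.py reads its "title" field to guess the episode URL.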
# Last Updated June 07, 2022
10
stash/config/scrapers/community/FratX/manifest
Executable file
@@ -0,0 +1,10 @@
id: FratX
name: FratX
metadata: {}
version: 3479c8b
date: "2023-11-22 01:14:42"
requires: []
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
files:
- FratX.py
- FratX.yml