stash
This commit is contained in:
@@ -0,0 +1,270 @@
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.parse
|
||||
from itertools import chain, zip_longest
|
||||
|
||||
# to import from a parent directory we need to add that directory to the system path
|
||||
csd = os.path.dirname(
|
||||
os.path.realpath(__file__)) # get current script directory
|
||||
parent = os.path.dirname(csd) # parent directory (should be the scrapers one)
|
||||
sys.path.append(
|
||||
parent
|
||||
) # add parent dir to sys path so that we can import py_common from there
|
||||
|
||||
try:
|
||||
import py_common.log as log
|
||||
except ModuleNotFoundError:
|
||||
print(
|
||||
"You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit()
|
||||
try:
|
||||
import requests
|
||||
except ModuleNotFoundError:
|
||||
log.error(
|
||||
"You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)"
|
||||
)
|
||||
log.error(
|
||||
"If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests"
|
||||
)
|
||||
sys.exit()
|
||||
|
||||
try:
|
||||
from lxml import etree, html
|
||||
except ModuleNotFoundError:
|
||||
log.error(
|
||||
"You need to install the lxml module. (https://lxml.de/installation.html#installation)"
|
||||
)
|
||||
log.error(
|
||||
"If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml"
|
||||
)
|
||||
sys.exit()
|
||||
|
||||
|
||||
STUDIOS = {"Ultra Films": None, "All Fine Girls": 24,
|
||||
"WowGirls": 32, "WowPorn": 36}
|
||||
PROXIES = {}
|
||||
TIMEOUT = 10
|
||||
|
||||
|
||||
class WowVenus:
|
||||
def __init__(self):
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update(
|
||||
{"content-type": "application/x-www-form-urlencoded; charset=UTF-8"}
|
||||
)
|
||||
self.session.proxies.update(PROXIES)
|
||||
self.search_results = {}
|
||||
|
||||
def count_results_pages(self, studio_name):
|
||||
try:
|
||||
return len(self.search_results.get(studio_name))
|
||||
except:
|
||||
return 0
|
||||
|
||||
def wow_sub_studio_filter_toggle(self, studio_key, studio_name):
|
||||
query_studio_name = studio_name.replace(" ", "").lower()
|
||||
data = f"__operation=toggle&__state=sites%3D{studio_key}"
|
||||
scraped = None
|
||||
try:
|
||||
scraped = self.session.post(
|
||||
f"https://venus.{query_studio_name}.com/search/cf", data=data
|
||||
)
|
||||
except:
|
||||
log.error("scrape error")
|
||||
return None
|
||||
if scraped.status_code >= 400:
|
||||
log.error(f"HTTP Error: {scraped.status_code}")
|
||||
scraped = scraped.content.decode("utf-8")
|
||||
return scraped
|
||||
|
||||
def scrape_all_results_pages(self, page_content, studio_name):
|
||||
if not page_content.xpath('//div[@class="no_results"]'):
|
||||
if not self.search_results.get(studio_name):
|
||||
self.search_results[studio_name] = []
|
||||
self.search_results[studio_name].append(page_content)
|
||||
pagignator = page_content.xpath(
|
||||
"//div[@class='paginator']/div[@class='pages']//text()"
|
||||
)
|
||||
for pageNu in pagignator[1:]:
|
||||
page_content = html.fromstring(
|
||||
self.pageNu_scrape(studio_name, pageNu))
|
||||
self.search_results[studio_name].append(page_content)
|
||||
|
||||
def GET_req(self, url):
|
||||
scraped = None
|
||||
try:
|
||||
scraped = self.session.get(url, timeout=TIMEOUT)
|
||||
except:
|
||||
log.error("scrape error")
|
||||
return None
|
||||
if scraped.status_code >= 400:
|
||||
log.error(f"HTTP Error: {scraped.status_code}")
|
||||
return None
|
||||
return scraped.content
|
||||
|
||||
def set_video_filter(self, studio_name):
|
||||
query_studio_name = studio_name.replace(" ", "").lower()
|
||||
url = f"https://venus.{query_studio_name}.com/search/cf"
|
||||
data = "__state=contentTypes%3D%5Bvideo%5D"
|
||||
scraped = None
|
||||
try:
|
||||
scraped = self.session.post(url, data=data, timeout=TIMEOUT)
|
||||
except:
|
||||
log.error("scrape error")
|
||||
return None
|
||||
if scraped.status_code >= 400:
|
||||
log.error(f"HTTP Error: {scraped.status_code}")
|
||||
return None
|
||||
scraped = scraped.content.decode("utf-8")
|
||||
return scraped
|
||||
|
||||
def pageNu_scrape(self, studio_name, pageNu):
|
||||
query_studio_name = studio_name.replace(" ", "").lower()
|
||||
url = f"https://venus.{query_studio_name}.com/search/cf"
|
||||
data = f"__state=paginator.page%3D{pageNu}"
|
||||
try:
|
||||
scraped = self.session.post(url, data=data, timeout=TIMEOUT)
|
||||
except:
|
||||
log.error("scrape error")
|
||||
if scraped.status_code >= 400:
|
||||
log.error(f"HTTP Error: {scraped.status_code}")
|
||||
scraped = scraped.content.decode("utf-8")
|
||||
return scraped
|
||||
|
||||
def output_json(self, title, tags, url, b64img, studio, performers):
|
||||
return {
|
||||
"title": title,
|
||||
"tags": [{"name": x} for x in tags],
|
||||
"url": url,
|
||||
"image": "data:image/jpeg;base64," + b64img.decode("utf-8"),
|
||||
"studio": {"name": studio},
|
||||
"performers": [{"name": x.strip()} for x in performers],
|
||||
}
|
||||
|
||||
def scene_card_parse(self, scene_card):
|
||||
title = scene_card.xpath('./a[@class="title"]/text()')[0].strip()
|
||||
imgurl = scene_card.xpath(".//img[@title]/@src")[0]
|
||||
if URL:
|
||||
imgurl = re.sub("_\w*", "_1280x720", imgurl)
|
||||
img = self.GET_req(imgurl)
|
||||
b64img = base64.b64encode(img)
|
||||
performers = scene_card.xpath('.//*[@class="models"]/a/text()')
|
||||
tags = scene_card.xpath('.//span[@class="genres"]/a/text()')
|
||||
return title, b64img, performers, tags
|
||||
|
||||
def parse_results(self): # parse all scene elements, return all
|
||||
parsed_scenes = {}
|
||||
for studio_name, pages in self.search_results.items():
|
||||
query_studio_name = studio_name.replace(" ", "").lower()
|
||||
for page in pages:
|
||||
scene_cards = page.xpath(
|
||||
'//div[contains(@class, "ct_video")]//img[@title]/ancestor::div'
|
||||
)
|
||||
for scene_card in scene_cards:
|
||||
url = (
|
||||
f"https://venus.{query_studio_name}.com"
|
||||
+ scene_card.xpath("./a/@href")[0]
|
||||
)
|
||||
title, b64img, performers, tags = self.scene_card_parse(
|
||||
scene_card)
|
||||
if not parsed_scenes.get(query_studio_name):
|
||||
parsed_scenes[query_studio_name] = []
|
||||
parsed_scenes[query_studio_name].append(
|
||||
self.output_json(title, tags, url,
|
||||
b64img, studio_name, performers)
|
||||
)
|
||||
return parsed_scenes
|
||||
|
||||
def get_scene_with_id(
|
||||
self, scene_ID
|
||||
): # parse all scene elements, return single with matched id
|
||||
for studio_name, pages in self.search_results.items():
|
||||
query_studio_name = studio_name.replace(" ", "").lower()
|
||||
for page in pages:
|
||||
scene_cards_with_ID = page.xpath(
|
||||
f'//div[contains(@class, "ct_video")]//a[contains(@href,"{scene_ID}")]/ancestor::div'
|
||||
)
|
||||
if scene_cards_with_ID:
|
||||
scene_card = scene_cards_with_ID[0]
|
||||
url = (
|
||||
f"https://venus.{query_studio_name}.com"
|
||||
+ scene_card.xpath("./a/@href")[0]
|
||||
)
|
||||
title, b64img, performers, tags = self.scene_card_parse(
|
||||
scene_card)
|
||||
return self.output_json(
|
||||
title, tags, url, b64img, studio_name, performers
|
||||
)
|
||||
|
||||
def search(self, query_title, studio_name, studio_key):
|
||||
query_studio_name = studio_name.replace(" ", "").lower()
|
||||
url = f"https://venus.{query_studio_name}.com/search/?query={query_title}"
|
||||
self.GET_req(url) # send search request, needed for session data
|
||||
# set 'video only' filter for results
|
||||
scraped = self.set_video_filter(query_studio_name)
|
||||
page_content = html.fromstring(scraped)
|
||||
if studio_key: # use studio_key to filter search results by sub studio
|
||||
scraped = self.wow_sub_studio_filter_toggle(
|
||||
studio_key, query_studio_name) # toggle on
|
||||
page_content = html.fromstring(scraped)
|
||||
self.scrape_all_results_pages(page_content, studio_name)
|
||||
if studio_key:
|
||||
self.wow_sub_studio_filter_toggle(
|
||||
studio_key, query_studio_name) # toggle off
|
||||
log.debug(
|
||||
f"Searched {studio_name}, found {self.count_results_pages(studio_name)} pages"
|
||||
)
|
||||
|
||||
|
||||
def interleave_results(parsed_scenes): # interleave search results by studio
|
||||
interleaved = [
|
||||
x
|
||||
for x in chain.from_iterable(zip_longest(*parsed_scenes.values()))
|
||||
if x is not None
|
||||
]
|
||||
return interleaved
|
||||
|
||||
|
||||
def search_query_prep(string: str):
|
||||
string = string.replace("’", "'")
|
||||
a = [s for s in string if s.isalnum() or s.isspace() or s == "-" or s == "'"]
|
||||
string = "".join(a)
|
||||
return urllib.parse.quote(string)
|
||||
|
||||
|
||||
FRAGMENT = json.loads(sys.stdin.read())
|
||||
|
||||
NAME = FRAGMENT.get("name")
|
||||
URL = FRAGMENT.get("url")
|
||||
scraper = WowVenus()
|
||||
ret = {}
|
||||
|
||||
|
||||
if NAME:
|
||||
log.debug(f'Searching for "{NAME}"')
|
||||
query_title = search_query_prep(NAME)
|
||||
for studio_name, studio_key in STUDIOS.items():
|
||||
scraper.search(query_title, studio_name, studio_key)
|
||||
parsed_scenes = scraper.parse_results()
|
||||
ret = interleave_results(parsed_scenes)
|
||||
elif URL:
|
||||
query_title = URL.split("/")[-1].replace("-", " ")
|
||||
query_title = urllib.parse.unquote(query_title)
|
||||
scene_ID = URL.split("/")[4]
|
||||
log.debug(f'Searching for "{query_title}"')
|
||||
for studio_name, studio_key in STUDIOS.items():
|
||||
scraper.search(query_title, studio_name, studio_key)
|
||||
if ret := WowVenus.get_scene_with_id(scraper, scene_ID):
|
||||
log.debug("Scene found!")
|
||||
break
|
||||
if not ret:
|
||||
log.error(
|
||||
"Scene not found!\nSome scenes do not appear in search results unless you are logged in!"
|
||||
)
|
||||
sys.exit()
|
||||
print(json.dumps(ret))
|
||||
@@ -0,0 +1,24 @@
|
||||
name: WowNetworkVenus
|
||||
# requires: py_common
|
||||
|
||||
sceneByURL:
|
||||
- action: script
|
||||
script:
|
||||
- python
|
||||
- WowNetworkVenus.py
|
||||
url:
|
||||
- venus.ultrafilms.com
|
||||
- venus.wowgirls.com
|
||||
- venus.wowporn.com
|
||||
- venus.allfinegirls.com
|
||||
sceneByName:
|
||||
action: script
|
||||
script:
|
||||
- python
|
||||
- WowNetworkVenus.py
|
||||
sceneByQueryFragment:
|
||||
action: script
|
||||
script:
|
||||
- python
|
||||
- WowNetworkVenus.py
|
||||
# Last Updated September 21, 2022
|
||||
11
stash/config/scrapers/community/WowNetworkVenus/manifest
Executable file
11
stash/config/scrapers/community/WowNetworkVenus/manifest
Executable file
@@ -0,0 +1,11 @@
|
||||
id: WowNetworkVenus
|
||||
name: WowNetworkVenus
|
||||
metadata: {}
|
||||
version: 3479c8b
|
||||
date: "2023-11-22 01:14:42"
|
||||
requires: []
|
||||
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
|
||||
files:
|
||||
- WowNetworkVenus.py
|
||||
- WowNetworkVenus.yml
|
||||
- requirements.txt
|
||||
@@ -0,0 +1,2 @@
|
||||
lxml
|
||||
requests
|
||||
Reference in New Issue
Block a user