This commit is contained in:
Christoph Califice
2025-10-09 20:05:31 -03:00
parent ed22ef22bc
commit 0a5f88d75a
1442 changed files with 101562 additions and 0 deletions

View File

@@ -0,0 +1,98 @@
import base64
import datetime
import json
import string
import sys
from urllib.parse import urlparse
# extra modules below need to be installed
# Each non-stdlib import is guarded: when the module is missing, installation
# instructions are written to stderr and the script stops with a non-zero
# exit status so the caller can tell the run failed.
try:
    import cloudscraper
except ModuleNotFoundError:
    print("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper", file=sys.stderr)
    sys.exit(1)

try:
    from lxml import html
except ModuleNotFoundError:
    print("You need to install the lxml module. (https://lxml.de/installation.html#installation)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml", file=sys.stderr)
    sys.exit(1)

try:
    import py_common.graphql as graphql
    import py_common.log as log
except ModuleNotFoundError:
    print("You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)", file=sys.stderr)
    sys.exit(1)
# Studio name emitted with every scraped result.
STUDIO_NAME = 'Jacquie Et Michel TV'

# strptime patterns for the numeric date shown on the page, keyed by the
# value sent in the 'lang' cookie (day-first for 'fr', month-first for 'en').
DATE_FORMATS = {
    'fr': "%d/%m/%Y",
    'en': "%m/%d/%Y",
}

# XPath expressions used to pull each field out of the scene page.
TITLE_XPATH = "//h1/text()"
DATE_XPATH = "//span[@class='video-detail__date']/text()"
DETAILS_XPATH = "//meta[@property='og:description']/@content"
TAGS_XPATH = "//a[@class='video-detail__tag-list__link']/text()"
POSTER_XPATH = "//video[@id='video-player']/@poster"


def select_language(argv):
    """Return 'fr' when the first command-line argument is 'fr', else 'en'.

    ``argv`` is a ``sys.argv``-style list (program name at index 0).
    """
    return 'fr' if len(argv) > 1 and argv[1] == 'fr' else 'en'


def parse_release_date(token: str, lang: str):
    """Convert a numeric date token into an ISO ``YYYY-MM-DD`` string.

    ``lang`` selects the pattern from DATE_FORMATS; any value other than
    'fr' falls back to the 'en' (month-first) pattern.

    Returns None when ``token`` is empty or does not match the pattern, so
    an unexpected page layout costs only the date and not the whole result.
    """
    if not token:
        return None
    pattern = DATE_FORMATS.get(lang, DATE_FORMATS['en'])
    try:
        parsed = datetime.datetime.strptime(token, pattern)
    except ValueError:
        return None
    return parsed.strftime("%Y-%m-%d")


def build_data_uri(image_bytes: bytes):
    """Return ``image_bytes`` as a base64 ``data:image/jpeg`` URI.

    Returns None for empty input instead of a URI with no payload.
    """
    if not image_bytes:
        return None
    return "data:image/jpeg;base64," + base64.b64encode(image_bytes).decode('utf-8')


def build_result(title, tags, date, details, image) -> dict:
    """Assemble the JSON-serialisable result dictionary.

    ``tags`` is an iterable of raw tag strings; each one is stripped of
    surrounding whitespace. ``title``, ``date`` and ``image`` may be None
    when the corresponding element was not found on the page.
    """
    return {
        'title': title,
        'tags': [{'name': tag.strip()} for tag in tags],
        'date': date,
        'details': details,
        'image': image,
        'studio': {
            'name': STUDIO_NAME
        },
    }


def fetch_image(scraper, image_url):
    """Download ``image_url`` with ``scraper`` and return it as a data URI.

    The image is best-effort: a request failure or an HTTP status >= 400 is
    logged and None is returned so the remaining fields are still emitted.
    """
    try:
        response = scraper.get(image_url)
    except Exception as exc:  # network/anti-bot failures; non-fatal for the image
        log.error(f"image download failed: {exc}")
        return None
    if response.status_code >= 400:
        log.error(f'HTTP Error fetching image: {response.status_code}')
        return None
    return build_data_uri(response.content)


def main():
    """Read a JSON fragment with a 'url' key from stdin, scrape it, print JSON.

    Exits with status 1 when no URL is supplied, when the page request
    raises, or when the server answers with an HTTP status >= 400.
    """
    lang = select_language(sys.argv)

    frag = json.loads(sys.stdin.read())
    # .get() so that a missing key is reported the same way as an empty URL.
    url = frag.get('url')
    if not url:
        log.error('No URL entered.')
        sys.exit(1)

    scraper = cloudscraper.create_scraper()
    try:
        # The 'lang' cookie selects the page language, which in turn decides
        # the date format parsed below.
        scraped = scraper.get(url, cookies={'lang': lang})
    except Exception as exc:
        log.error(f"scrape error: {exc}")
        sys.exit(1)
    if scraped.status_code >= 400:
        log.error(f'HTTP Error: {scraped.status_code}')
        sys.exit(1)

    tree = html.fromstring(scraped.text)

    title_res = tree.xpath(TITLE_XPATH)
    title = title_res[0] if title_res else None

    date = None
    date_res = tree.xpath(DATE_XPATH)
    if date_res:
        # The last whitespace-separated token of the label is taken as the date.
        tokens = date_res[0].split()
        if tokens:
            raw_date = tokens[-1]
            log.debug(f"found date: {raw_date}")
            date = parse_release_date(raw_date, lang)
            if date is None:
                log.debug(f"unparseable date: {raw_date}")

    desc = tree.xpath(DETAILS_XPATH)
    details = desc[0] if desc else ""

    tags = tree.xpath(TAGS_XPATH)

    # Only fetch a poster when the page has one; otherwise 'image' stays None.
    image = None
    poster_res = tree.xpath(POSTER_XPATH)
    if poster_res:
        image = fetch_image(scraper, poster_res[0])

    print(json.dumps(build_result(title, tags, date, details, image)))


if __name__ == "__main__":
    main()