stash
This commit is contained in:
@@ -0,0 +1,98 @@
|
||||
import base64
|
||||
import datetime
|
||||
import json
|
||||
import string
|
||||
import sys
|
||||
from urllib.parse import urlparse
|
||||
# extra modules below need to be installed
|
||||
# Each non-stdlib dependency is imported behind a guard so that a missing
# package produces an actionable message on stderr instead of a raw traceback.
# The guards exit with a non-zero status: a bare sys.exit() reports success
# (status 0), which hides the failure from whatever launched the script.
try:
    import cloudscraper
except ModuleNotFoundError:
    print("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper", file=sys.stderr)
    sys.exit(1)

try:
    from lxml import html
except ModuleNotFoundError:
    print("You need to install the lxml module. (https://lxml.de/installation.html#installation)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml", file=sys.stderr)
    sys.exit(1)

try:
    import py_common.graphql as graphql
    import py_common.log as log
except ModuleNotFoundError:
    print("You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)", file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
def pick_language(argv):
    """Return the site language selected on the command line.

    Only ``'fr'`` as the first argument switches away from the default
    ``'en'``; any other value, or no argument at all, keeps English.
    """
    if len(argv) > 1 and argv[1] == 'fr':
        return 'fr'
    return 'en'


def first_or(values, default=None):
    """Return the first item of *values*, or *default* when it is empty."""
    return values[0] if values else default


def parse_release_date(text, lang):
    """Convert the date found in *text* to ``YYYY-MM-DD``.

    The date is taken from the last whitespace-separated token of *text*.
    It is read as day/month/year when *lang* is ``'fr'`` and as
    month/day/year otherwise.

    Returns ``None`` when *text* has no tokens or the token does not match
    the expected format, so an unexpected page layout cannot crash the
    whole scrape.
    """
    tokens = text.split()
    if not tokens:
        return None
    fmt = "%d/%m/%Y" if lang == 'fr' else "%m/%d/%Y"
    try:
        parsed = datetime.datetime.strptime(tokens[-1], fmt)
    except ValueError:
        return None
    return parsed.strftime("%Y-%m-%d")


def to_data_uri(img_bytes):
    """Encode raw image bytes as a base64 ``data:image/jpeg`` URI."""
    return "data:image/jpeg;base64," + base64.b64encode(img_bytes).decode('utf-8')


def fetch_cover(scraper, imgurl):
    """Download *imgurl* with *scraper* and return it as a data URI.

    Returns ``None`` when the download raises or answers with an HTTP error
    status; the cover is optional and must not abort the scrape.
    """
    try:
        response = scraper.get(imgurl)
    except Exception:
        # Network boundary: any failure here just means "no cover".
        log.debug(f"could not download image: {imgurl}")
        return None
    if response.status_code >= 400:
        log.debug(f"image HTTP Error: {response.status_code}")
        return None
    return to_data_uri(response.content)


def main():
    """Scrape the scene URL read from stdin and print the result as JSON.

    Reads a JSON object with a ``url`` key from stdin, fetches the page with
    a ``lang`` cookie matching the command-line language, extracts title,
    date, description, tags and poster image, and prints one JSON object to
    stdout. Exits with status 1 when no URL is given or the page cannot be
    fetched.
    """
    lang = pick_language(sys.argv)

    frag = json.loads(sys.stdin.read())
    # .get() so a fragment without a 'url' key is reported, not a KeyError.
    url = frag.get('url')
    if not url:
        log.error('No URL entered.')
        sys.exit(1)

    scraper = cloudscraper.create_scraper()
    try:
        scraped = scraper.get(url, cookies={'lang': lang})
    except Exception:
        # Not a bare except: SystemExit / KeyboardInterrupt must propagate.
        log.error("scrape error")
        sys.exit(1)

    if scraped.status_code >= 400:
        log.error(f'HTTP Error: {scraped.status_code}')
        sys.exit(1)

    tree = html.fromstring(scraped.text)

    title = first_or(tree.xpath("//h1/text()"))

    date = None
    date_text = first_or(tree.xpath("//span[@class='video-detail__date']/text()"))
    if date_text:
        words = date_text.split()
        if words:
            log.debug(f"found date: {words[-1]}")
            date = parse_release_date(words[-1], lang)
            if date is None:
                log.debug(f"could not parse date: {words[-1]}")

    details = first_or(tree.xpath("//meta[@property='og:description']/@content"), "")

    tags = tree.xpath("//a[@class='video-detail__tag-list__link']/text()")

    # The poster is optional: without one, 'image' is emitted as null rather
    # than failing while building the result.
    image = None
    imgurl = first_or(tree.xpath("//video[@id='video-player']/@poster"))
    if imgurl:
        image = fetch_cover(scraper, imgurl)

    ret = {
        'title': title,
        'tags': [{'name': x.strip()} for x in tags],
        'date': date,
        'details': details,
        'image': image,
        'studio': {
            'name': 'Jacquie Et Michel TV'
        },
    }

    print(json.dumps(ret))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user