# Jacquie Et Michel TV scraper (Stash community scraper).
# Standard-library imports first; the guarded blocks below pull in
# third-party / project modules and explain how to install any that
# are missing before bailing out.
import base64
import datetime
import json
import string
import sys

from urllib.parse import urlparse

# extra modules below need to be installed
try:
    import cloudscraper
except ModuleNotFoundError:
    print("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper", file=sys.stderr)
    sys.exit()

try:
    from lxml import html
except ModuleNotFoundError:
    print("You need to install the lxml module. (https://lxml.de/installation.html#installation)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml", file=sys.stderr)
    sys.exit()

try:
    import py_common.graphql as graphql
    import py_common.log as log
except ModuleNotFoundError:
    print("You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)", file=sys.stderr)
    sys.exit()
|
|
|
|
# Site language: defaults to English; passing 'fr' as the first CLI
# argument switches the site to French (this changes the date format
# parsed further down).
lang = 'en'
if len(sys.argv) > 1:
    if sys.argv[1] == 'fr':
        lang = 'fr'

# Stash passes the scrape fragment as a JSON object on stdin; the only
# field this scraper uses is 'url'.
frag = json.loads(sys.stdin.read())
# .get() avoids a KeyError when the fragment has no 'url' key at all;
# a missing or empty URL is reported the same way.
url = frag.get('url')
if not url:
    log.error('No URL entered.')
    sys.exit(1)
|
|
# Fetch the page through cloudscraper (bypasses Cloudflare checks).
scraper = cloudscraper.create_scraper()
try:
    # The 'lang' cookie selects the site language, which controls the
    # date format scraped below.
    cookies = {'lang': lang}
    scraped = scraper.get(url, cookies=cookies)
except Exception:
    # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
    # are not swallowed by the error handler.
    log.error("scrape error")
    sys.exit(1)

if scraped.status_code >= 400:
    log.error(f'HTTP Error: {scraped.status_code}')
    sys.exit(1)

# Parse the fetched HTML once; all metadata below is XPath'd from here.
tree = html.fromstring(scraped.text)
|
|
|
|
# Title: the page's first <h1>, if present.
title = None
title_res = tree.xpath("//h1/text()")
if title_res:
    title = title_res[0]

# Release date: the last whitespace-separated token of the
# 'video-detail__date' span is the date itself (the leading tokens are
# label text). The original unpacking required at least two tokens and
# let strptime errors crash the whole scrape; both are guarded now.
date = None
dt = tree.xpath("//span[@class='video-detail__date']/text()")
if dt:
    parts = dt[0].split()
    if parts:
        raw_date = parts[-1]
        log.debug(f"found date: {raw_date}")
        # French pages render day/month/year; English month/day/year.
        fmt = "%d/%m/%Y" if lang == 'fr' else "%m/%d/%Y"
        try:
            date = datetime.datetime.strptime(raw_date, fmt).strftime("%Y-%m-%d")
        except ValueError:
            # Unexpected date format: keep the rest of the metadata
            # instead of aborting the scrape.
            log.error(f"could not parse date: {raw_date}")
|
|
# Details: the og:description meta tag, empty string when absent.
desc = tree.xpath("//meta[@property='og:description']/@content")
details = ""
if desc:
    details = desc[0]

# Tag names as shown on the page (whitespace stripped below).
tags = tree.xpath("//a[@class='video-detail__tag-list__link']/text()")

# Cover image: download the <video> poster and embed it as a base64
# data URI. Bug fix: the original referenced b64img in the output dict
# even when no poster existed and b64img was never assigned (NameError);
# the image is now built entirely inside the guard and is None otherwise.
image = None
imgurl_res = tree.xpath("//video[@id='video-player']/@poster")
if imgurl_res:
    imgurl = imgurl_res[0]
    img = scraper.get(imgurl).content
    b64img = base64.b64encode(img)
    image = "data:image/jpeg;base64," + b64img.decode('utf-8')

# Scraped-scene result in the shape Stash expects, printed as JSON on
# stdout.
ret = {
    'title': title,
    'tags': [{
        'name': x.strip()
    } for x in tags],
    'date': date,
    'details': details,
    'image': image,
    'studio': {
        'name': 'Jacquie Et Michel TV'
    },
}

print(json.dumps(ret))
|