# Jacquie Et Michel TV scraper (Stash community scraper).
# Standard-library imports first; the guarded blocks below pull in
# third-party / project modules and explain how to install any that
# are missing before bailing out.
import base64
import datetime
import json
import string
import sys

from urllib.parse import urlparse

# extra modules below need to be installed
try:
    import cloudscraper
except ModuleNotFoundError:
    print("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper", file=sys.stderr)
    sys.exit()

try:
    from lxml import html
except ModuleNotFoundError:
    print("You need to install the lxml module. (https://lxml.de/installation.html#installation)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml", file=sys.stderr)
    sys.exit()

try:
    import py_common.graphql as graphql
    import py_common.log as log
except ModuleNotFoundError:
    print("You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)", file=sys.stderr)
    sys.exit()
|
|
|
|
# Site language: defaults to English; passing 'fr' as the first CLI
# argument switches the site to French (this changes the date format
# parsed further down).
lang = 'en'
if len(sys.argv) > 1:
    if sys.argv[1] == 'fr':
        lang = 'fr'

# Stash passes the scrape fragment as a JSON object on stdin; the only
# field this scraper uses is 'url'.
frag = json.loads(sys.stdin.read())
# .get() avoids a KeyError when the fragment has no 'url' key at all;
# a missing or empty URL is reported the same way.
url = frag.get('url')
if not url:
    log.error('No URL entered.')
    sys.exit(1)
|
|
# Fetch the page through cloudscraper (bypasses Cloudflare checks).
scraper = cloudscraper.create_scraper()
try:
    # The 'lang' cookie selects the site language, which controls the
    # date format scraped below.
    cookies = {'lang': lang}
    scraped = scraper.get(url, cookies=cookies)
except Exception:
    # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
    # are not swallowed by the error handler.
    log.error("scrape error")
    sys.exit(1)

if scraped.status_code >= 400:
    log.error(f'HTTP Error: {scraped.status_code}')
    sys.exit(1)

# Parse the fetched HTML once; all metadata below is XPath'd from here.
tree = html.fromstring(scraped.text)
|
|
|
|
# Title: the page's first <h1>, if present.
title = None
title_res = tree.xpath("//h1/text()")
if title_res:
    title = title_res[0]

# Release date: the last whitespace-separated token of the
# 'video-detail__date' span is the date itself (the leading tokens are
# label text). The original unpacking required at least two tokens and
# let strptime errors crash the whole scrape; both are guarded now.
date = None
dt = tree.xpath("//span[@class='video-detail__date']/text()")
if dt:
    parts = dt[0].split()
    if parts:
        raw_date = parts[-1]
        log.debug(f"found date: {raw_date}")
        # French pages render day/month/year; English month/day/year.
        fmt = "%d/%m/%Y" if lang == 'fr' else "%m/%d/%Y"
        try:
            date = datetime.datetime.strptime(raw_date, fmt).strftime("%Y-%m-%d")
        except ValueError:
            # Unexpected date format: keep the rest of the metadata
            # instead of aborting the scrape.
            log.error(f"could not parse date: {raw_date}")
|
|
# Details: the og:description meta tag, empty string when absent.
desc = tree.xpath("//meta[@property='og:description']/@content")
details = ""
if desc:
    details = desc[0]

# Tag names as shown on the page (whitespace stripped below).
tags = tree.xpath("//a[@class='video-detail__tag-list__link']/text()")

# Cover image: download the <video> poster and embed it as a base64
# data URI. Bug fix: the original referenced b64img in the output dict
# even when no poster existed and b64img was never assigned (NameError);
# the image is now built entirely inside the guard and is None otherwise.
image = None
imgurl_res = tree.xpath("//video[@id='video-player']/@poster")
if imgurl_res:
    imgurl = imgurl_res[0]
    img = scraper.get(imgurl).content
    b64img = base64.b64encode(img)
    image = "data:image/jpeg;base64," + b64img.decode('utf-8')

# Scraped-scene result in the shape Stash expects, printed as JSON on
# stdout.
ret = {
    'title': title,
    'tags': [{
        'name': x.strip()
    } for x in tags],
    'date': date,
    'details': details,
    'image': image,
    'studio': {
        'name': 'Jacquie Et Michel TV'
    },
}

print(json.dumps(ret))
|