# compose-projects-arr/stash/config/scrapers/community/IFeelMyself/IFeelMyself.py

import json
import re
import sys
from datetime import datetime
import unicodedata

# Unless you are logged in (and probably have an active subscription), scenes with certain tags
# (menstruation, pee) are hidden and cannot be found by the scraper.
# The performer scraper also cannot get the country and details fields without being logged in.
# Set the value of the ifeel_auth cookie here; it may change and need to be renewed periodically.
# If no account is available, leave the value empty: the scraper won't find some videos, and the
# country and details fields will be missing from performer scrapes.

ifeelauth = ""

try:
    from mechanicalsoup import StatefulBrowser
except ModuleNotFoundError:
    print("You need to install the mechanicalsoup module. (https://mechanicalsoup.readthedocs.io/en/stable/introduction.html#installation)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install MechanicalSoup", file=sys.stderr)
    sys.exit()

try:
    from requests.cookies import create_cookie
except ModuleNotFoundError:
    print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
    sys.exit()


def readJSONInput():
    """Read everything available on standard input and return it parsed as JSON."""
    raw_payload = sys.stdin.read()
    parsed = json.loads(raw_payload)
    return parsed

def extract_SceneInfo(table,cover_url=None):
    """Build the scene metadata dict from one scene entry element.

    Args:
        table: BeautifulSoup element holding a single scene entry.
        cover_url: Optional cover image URL. When omitted it is read from the
            entry's <img> tag, falling back to the poster of its <video> tag.

    Returns:
        A dict with the keys title, performers, studio, tags, date
        (formatted as YYYY-MM-DD), image, details and url.

    Raises:
        ValueError: if the entry has no cover image, if the cover URL does not
            contain the media/artist ids, or if the date text does not match
            the '%d %b %Y' format.
        Entries without a heading or date element still fail with the
        underlying lookup error (AttributeError / IndexError).
    """
    description = None
    blurb = table.find(class_=["blog_wide_new_text", "entryBlurb"])
    if blurb is not None:
        description = blurb.get_text(" ", strip=True)
        # Fold compatibility characters and drop anything that is not ASCII.
        description = unicodedata.normalize('NFKC', description).encode('ascii', 'ignore').decode('ascii')

    # New scenes carry the date under blog-title-right, older ones under entryDatestamp.
    date_text = table.find(class_=["blog-title-right", "entryDatestamp"]).get_text(strip=True)

    # The heading holds two links: the first is the title, the second the performer.
    heading_links = table.find(class_=["entryHeadingFlash", "entryHeading"]).find_all("a")
    performer = str(heading_links[1].get_text().replace("_", " "))
    debugPrint(f"performer:{performer}")
    title = heading_links[0].get_text().replace("\x92", "'")

    date = datetime.strptime(date_text, '%d %b %Y').date().strftime('%Y-%m-%d')  # Convert date to ISO format

    if cover_url is None:
        image = table.find("img")
        video = table.find("video")
        if image is not None:
            cover_url = str(image['src'])
        elif video is not None:
            cover_url = str(video['poster'])
        else:
            raise ValueError("scene entry has neither an <img> nor a <video> cover")

    # Both ids are embedded in the cover image URL.
    media_match = re.search(r"\/(\d{3,5})\/", cover_url, re.I)
    artist_match = re.search(r"\/(f\d{4,5})", cover_url, re.I)
    if media_match is None or artist_match is None:
        raise ValueError(f"could not find media/artist ids in cover url: {cover_url}")
    media_id = media_match.group(1)
    artist_id = artist_match.group(1)

    tag_list = [{"name": tag.get_text()} for tag in table.find_all(class_="tags-list-item-tag")]
    debugPrint(f"tags: {str(tag_list)}")

    json_info = {
        "title": title,
        "performers": [{"name": performer}],
        "studio": {"name": "I Feel Myself"},
        "tags": tag_list,
        "date": date,
        "image": cover_url,
        "details": description,
        "url": "https://ifeelmyself.com/public/main.php?page=flash_player&out=bkg&media_id="+media_id+"&artist_id="+artist_id,
    }
    return json_info

def debugPrint(t):
    """Write the message *t*, followed by a newline, to standard error."""
    line = t + "\n"
    sys.stderr.write(line)

def scrapeScene(filename,date,url):
    """Scrape the metadata of one scene, located by URL or by file name.

    Lookup strategy, in order:
      1. ``url`` is given: open it directly (a ``.jpg`` cover URL is first
         translated into the matching player page URL).
      2. ``filename`` contains an artist id (``f1234``): search for that
         artist and pick the entry whose cover is ``<artist>-<video>vg.jpg``
         or ``<artist>-<video>hs.jpg``, walking the result pages if needed.
      3. Otherwise the file was renamed: derive a title from the file name
         and take the first search result.

    Args:
        filename: Scene title / file name; only used when ``url`` is empty.
        date: Accepted for interface compatibility; not used.
        url: Scene page URL or cover image URL, may be empty.

    Returns:
        The dict produced by extract_SceneInfo, or an empty list when the
        scene could not be found.

    Raises:
        SystemExit: after printing ``{}`` when the title-based search has no
            results or the file name has an unsupported shape.
    """
    browser = StatefulBrowser(session=None)
    browser.open("https://ifeelmyself.com/public/main.php")
    browser.session.cookies.set_cookie(
        create_cookie(name='tags_popup_shown', value='true', domain='ifeelmyself.com'))

    if url:
        debugPrint("Url found, using that to scrape")
        return _scene_from_url(browser, url)

    if not filename:
        # Nothing to search with; avoids running regexes against None.
        debugPrint("No url or filename given, nothing to scrape")
        return []

    debugPrint("Analyzing filename...")
    artist_id_match = re.search(r"(f\d{3,5})", filename, re.I)
    if artist_id_match:
        return _scene_from_ids(browser, filename, artist_id_match.group(0))
    return _scene_from_title(browser, filename)


# CSS class values identifying one scene entry in a listing page.
_SCENE_ENTRY_CLASSES = ["blog_wide_news_tbl entry ppss-scene", "entry ppss-scene"]

_SCENE_SEARCH_URL = "https://ifeelmyself.com/public/main.php?page=search_results"

# Value of the ifm_prefs cookie sent with every search (URL-encoded, kept verbatim).
_SCENE_SEARCH_PREFS = "a%3A1%3A%7Bs%3A6%3A%22search%22%3Ba%3A17%3A%7Bs%3A8%3A%22category%22%3Ba%3A0%3A%7B%7Ds%3A7%3A%22view_by%22%3Bs%3A4%3A%22news%22%3Bs%3A7%3A%22date_by%22%3Bs%3A7%3A%22anytime%22%3Bs%3A10%3A%22from_month%22%3Bs%3A1%3A%221%22%3Bs%3A9%3A%22from_year%22%3Bs%3A4%3A%222006%22%3Bs%3A8%3A%22to_month%22%3Bs%3A2%3A%2212%22%3Bs%3A7%3A%22to_year%22%3Bs%3A4%3A%223000%22%3Bs%3A7%3A%22country%22%3Bs%3A3%3A%22all%22%3Bs%3A10%3A%22attributes%22%3Ba%3A0%3A%7B%7Ds%3A12%3A%22tags_logical%22%3Bs%3A3%3A%22AND%22%3Bs%3A13%3A%22tags_remember%22%3Bs%3A1%3A%22n%22%3Bs%3A4%3A%22tags%22%3Ba%3A0%3A%7B%7Ds%3A12%3A%22tags_exclude%22%3Bs%3A0%3A%22%22%3Bs%3A9%3A%22hide_tags%22%3Ba%3A0%3A%7B%7Ds%3A8%3A%22age_from%22%3Bs%3A2%3A%2218%22%3Bs%3A6%3A%22age_to%22%3Bs%3A2%3A%2299%22%3Bs%3A16%3A%22profilevid_limit%22%3Bs%3A0%3A%22%22%3B%7D%7D"


def _set_scene_search_cookies(browser, keyword):
    """Set the search keyword, search preferences and auth cookies on *browser*."""
    jar = browser.session.cookies
    jar.set_cookie(create_cookie(name='ifm_search_keyword', value=keyword, domain='ifeelmyself.com'))
    jar.set_cookie(create_cookie(name='ifm_prefs', value=_SCENE_SEARCH_PREFS, domain='.ifeelmyself.com'))
    jar.set_cookie(create_cookie(name='ifeel_auth', value=ifeelauth, domain='.ifeelmyself.com'))


def _match_scene_entry(page, artist_id, video_id):
    """Return the first scene entry of *page* whose cover matches the ids, else None."""
    wanted = (f"/{artist_id}-{video_id}vg.jpg", f"/{artist_id}-{video_id}hs.jpg")
    for table in page.find_all(class_=_SCENE_ENTRY_CLASSES):
        video = table.find('video')
        image = table.find('img')
        if video is not None:  # New scenes use the video tag
            img = str(video['poster'])
        elif image is not None:  # Old scenes still use the old format of an img tag
            img = str(image['src'])
        else:
            # No cover at all: nothing to compare against, skip the entry.
            continue
        debugPrint(f"Image:{img}")
        if any(name in img for name in wanted):
            return table
    return None


def _scene_from_url(browser, url):
    """Scrape a scene from its page URL or from its cover image URL."""
    cover_url = None
    if url.endswith(".jpg"):
        # Use the image url to extract the metadata.
        media_match = re.search(r"\/(\d{3,5})\/", url, re.I)
        artist_match = re.search(r"\/(f\d{4,5})", url, re.I)
        if media_match is None or artist_match is None:
            debugPrint("Could not find media/artist ids in the image url")
            return []
        media_id = media_match.group(1)
        artist_id = artist_match.group(1)
        debugPrint(f"Artist id found: {artist_id}")
        debugPrint(f"Media id found: {media_id}")
        cover_url = url
        url = "https://ifeelmyself.com/public/main.php?page=flash_player&out=bkg&media_id="+str(media_id)+"&artist_id="+str(artist_id)
    browser.open(url)
    page = browser.page
    if page is None:
        debugPrint("No parsable page returned for the url")
        return []
    table = page.find(class_=_SCENE_ENTRY_CLASSES)
    if table is None:
        return []
    return extract_SceneInfo(table, cover_url)


def _scene_from_ids(browser, filename, artist_id):
    """Search by artist id and return the entry matching the video id in *filename*."""
    video_match = re.search(r"-(\d+)", filename, re.I)
    if video_match is None:
        debugPrint("No video id found in filename")
        return []
    video_id = video_match.group(1)

    _set_scene_search_cookies(browser, artist_id)
    browser.open(_SCENE_SEARCH_URL)
    response = browser.page
    debugPrint("Searching for video_id")
    debugPrint(artist_id+"-"+video_id)

    table = _match_scene_entry(response, artist_id, video_id)
    if table is not None:
        debugPrint("Found a single match video!")
        return extract_SceneInfo(table)

    debugPrint("0 matches found! Checking offset")
    try:
        # The last non-selected paging link is read as the page count.
        pages = int(response.find_all("a", class_="pagging_nonsel")[-1].get_text())
    except (IndexError, ValueError):
        pages = 0
    debugPrint("Pages:  "+str(pages))

    for offset in range(0, pages*10, 10):
        browser.open(_SCENE_SEARCH_URL+"&offset="+str(offset))
        table = _match_scene_entry(browser.page, artist_id, video_id)
        if table is not None:
            debugPrint("FOUND")
            # Stop paging as soon as the scene is found.
            return extract_SceneInfo(table)

    debugPrint("0 matches found!, check your filename")
    return []


def _scene_from_title(browser, filename):
    """Search by a title derived from a renamed file and return the first result."""
    debugPrint("Name changed after downloading")
    filename = filename.lower()
    # Python's re module spells named groups (?P<name>...).
    extract_from_filename = re.match(r"^([0-9\.]{6,10})?(?P<title>.+)\s(?P<artist>\w+)(\.mp4)?$", filename)
    if not extract_from_filename:
        debugPrint("Not a supported filename")
        print("{}")
        sys.exit()

    title = extract_from_filename.group('title')
    title = title.replace("ifeelmyself", "")
    title = title.replace("-", "")
    # Drop "by" only as a whole word so titles such as "baby" stay intact.
    title = re.sub(r"\bby\b", "", title).strip()
    debugPrint(f"Title: {title}")

    _set_scene_search_cookies(browser, title)
    browser.open(_SCENE_SEARCH_URL)
    response = browser.page
    # Obtaining and counting the results. Ideally you only have a single result.
    matches = response.find_all("a", href='javascript:;')  # These links contain all the titles
    if not matches:
        debugPrint("0 matches found! Check filename")
        print("{}")
        sys.exit()
    if len(matches) == 1:
        debugPrint("Found a single match!")
    else:
        debugPrint("Multiple videos found, maybe refine search term?")

    # Either way the first listed entry is used.
    table = response.find(class_=_SCENE_ENTRY_CLASSES)
    if table is None:
        return []
    return extract_SceneInfo(table)


def extract_PerformerInfo(table,browser,cover_url=None):
    """Build a performer search result from one scene entry element.

    Args:
        table: BeautifulSoup element holding a single scene entry.
        browser: Accepted for interface compatibility; not used.
        cover_url: Optional cover image URL. When omitted it is read from the
            entry's <img> tag, falling back to the poster of its <video> tag
            (the tag newer scene entries use).

    Returns:
        A dict with the keys name, gender, url, image and remote_site_id.

    Raises:
        ValueError: if the entry has no cover image or the cover URL does not
            contain an artist id.
    """
    heading = table.find(class_=["entryHeadingFlash", "entryHeading"])
    # The second link of the heading is the performer; underscores stand in for spaces.
    performer = str(heading.find_all("a")[1].get_text().replace("_", " "))
    debugPrint(f"Extracting info for performer: {performer}")

    if cover_url is None:
        image = table.find("img")
        video = table.find("video")
        if image is not None:
            cover_url = str(image['src'])
        elif video is not None:
            cover_url = str(video['poster'])
        else:
            raise ValueError("scene entry has neither an <img> nor a <video> cover")
    debugPrint(cover_url)

    artist_match = re.search(r"\/((f|m)\d{4,5})", cover_url, re.I)
    if artist_match is None:
        raise ValueError(f"could not find an artist id in cover url: {cover_url}")
    artist_id = artist_match.group(1)
    artist_img = "https://bcdn.ifeelmyself.com/artists/" + artist_id + ".jpg"

    # The id is matched case-insensitively, so compare the prefix the same way.
    gender = "female" if artist_id.lower().startswith("f") else "male"

    json_info = {
        "name": performer,
        "gender": gender,
        "url": "https://ifeelmyself.com/public/main.php?page=artist_bio&artist_id="+artist_id,
        "image": artist_img,
        "remote_site_id": artist_id,
    }
    return json_info


def queryPerformer(perfname):
    """Search the site for *perfname* and return the distinct performers found.

    The name is lower-cased and used as the search keyword. When that yields
    nothing, the search is repeated with spaces replaced by underscores.
    Every scene entry of the result page is turned into a performer dict via
    extract_PerformerInfo; performers are de-duplicated by name, keeping the
    first occurrence.

    Args:
        perfname: Performer name to search for.

    Returns:
        A list of performer dicts (possibly empty).

    Raises:
        SystemExit: after printing ``{}`` when neither search has any result.
    """
    search_url = "https://ifeelmyself.com/public/main.php?page=search_results"
    entry_classes = ["blog_wide_news_tbl entry ppss-scene", "entry ppss-scene"]

    browser = StatefulBrowser(session=None)
    perfname = perfname.lower()
    browser.open("https://ifeelmyself.com/public/main.php")
    jar = browser.session.cookies
    jar.set_cookie(create_cookie(name='tags_popup_shown', value='true', domain='ifeelmyself.com'))
    jar.set_cookie(create_cookie(name='ifm_prefs', value="a%3A1%3A%7Bs%3A6%3A%22search%22%3Ba%3A17%3A%7Bs%3A8%3A%22category%22%3Ba%3A0%3A%7B%7Ds%3A7%3A%22view_by%22%3Bs%3A4%3A%22news%22%3Bs%3A7%3A%22date_by%22%3Bs%3A7%3A%22anytime%22%3Bs%3A10%3A%22from_month%22%3Bs%3A1%3A%221%22%3Bs%3A9%3A%22from_year%22%3Bs%3A4%3A%222006%22%3Bs%3A8%3A%22to_month%22%3Bs%3A2%3A%2212%22%3Bs%3A7%3A%22to_year%22%3Bs%3A4%3A%223000%22%3Bs%3A7%3A%22country%22%3Bs%3A3%3A%22all%22%3Bs%3A10%3A%22attributes%22%3Ba%3A0%3A%7B%7Ds%3A12%3A%22tags_logical%22%3Bs%3A3%3A%22AND%22%3Bs%3A13%3A%22tags_remember%22%3Bs%3A1%3A%22n%22%3Bs%3A4%3A%22tags%22%3Ba%3A0%3A%7B%7Ds%3A12%3A%22tags_exclude%22%3Bs%3A0%3A%22%22%3Bs%3A9%3A%22hide_tags%22%3Ba%3A0%3A%7B%7Ds%3A8%3A%22age_from%22%3Bs%3A2%3A%2218%22%3Bs%3A6%3A%22age_to%22%3Bs%3A2%3A%2299%22%3Bs%3A16%3A%22profilevid_limit%22%3Bs%3A0%3A%22%22%3B%7D%7D", domain='.ifeelmyself.com'))
    jar.set_cookie(create_cookie(name='ifm_search_keyword', value=perfname, domain='ifeelmyself.com'))
    jar.set_cookie(create_cookie(name='ifeel_auth', value=ifeelauth, domain='.ifeelmyself.com'))

    debugPrint("Analyzing perfname...")
    browser.open(search_url)
    response = browser.page
    # Obtaining and counting the results. Ideally you only have a single result.
    matches = response.find_all("a", href='javascript:;')  # These links contain all the titles
    debugPrint("Found: "+str(len(matches)))

    if not matches:
        # Often performer names use an underscore instead of a space, so replace spaces and try again.
        perfname = perfname.replace(" ", "_")
        jar.set_cookie(create_cookie(name='ifm_search_keyword', value=perfname, domain='ifeelmyself.com'))
        browser.open(search_url)
        response = browser.page
        matches = response.find_all("a", href='javascript:;')
        if not matches:
            debugPrint("0 matches found! Check performer name")
            print("{}")
            sys.exit()
        debugPrint("Multiple videos found, scraping multiple performers")

    # One pass over the result entries, keeping the first hit per performer name.
    ret = []
    seen_names = set()
    for table in response.find_all(class_=entry_classes):
        result = extract_PerformerInfo(table, browser)
        if result['name'] not in seen_names:
            seen_names.add(result['name'])
            ret.append(result)
    return ret


def scrapePerformer(artist_id):
    """Fetch the artist bio page and extract the country and details text.

    The parsing is positional: the first <td> of the first ``bioTable``
    element is rendered to HTML text, the country is the part after the first
    ``<br/>`` on its second line, and the details are its lines 4 to 9 with
    ``<strong>`` and ``<br/>`` markup removed.

    Args:
        artist_id: Site artist id used in the bio page URL.

    Returns:
        A dict with the keys country and details. A field is an empty string
        when the page does not contain the expected bio layout.
    """
    browser = StatefulBrowser(session=None)
    jar = browser.session.cookies
    jar.set_cookie(create_cookie(name='tags_popup_shown', value='true', domain='ifeelmyself.com'))
    jar.set_cookie(create_cookie(name='ifeel_auth', value=ifeelauth, domain='.ifeelmyself.com'))
    browser.open("https://ifeelmyself.com/public/main.php?page=artist_bio&artist_id="+artist_id)
    response = browser.page

    tables = response.find_all(class_=["bioTable"])
    if not tables:
        debugPrint("No bio table found on the artist page")
        return {"country": "", "details": ""}
    table = tables[0]
    debugPrint(str(table))

    bio = str(table.find("td"))
    lines = bio.splitlines(True)   # line endings kept, used for the details text
    plain_lines = bio.splitlines()  # line endings dropped, used for the country

    country = ""
    if len(plain_lines) > 1:
        country_parts = plain_lines[1].split("<br/>")
        if len(country_parts) > 1:
            country = country_parts[1]

    # Slicing tolerates a bio shorter than nine lines instead of raising IndexError.
    details = "".join(lines[3:9])
    details = details.replace("<strong>", "").replace("</strong>", "").replace("<br/>", "")

    json_info = {"country": country, "details": details}
    return json_info


# read the input
i = readJSONInput()
# Echo the received fragment on stderr (newline-terminated so later messages start on their own line).
debugPrint(json.dumps(i))

# The scraper mode comes from the command line:
#   query scene | query performer | url | scrape
if len(sys.argv) < 2:
    debugPrint("Missing scraper mode argument (query scene, query performer, url or scrape)")
    sys.exit(1)

mode = sys.argv[1]
# The second argument is only meaningful in "query" mode.
query_kind = sys.argv[2] if len(sys.argv) > 2 else None

if mode == "query" and query_kind == "scene":
    # Keys missing from the input are passed on as None instead of raising KeyError.
    ret = scrapeScene(i.get('title'), i.get('date'), i.get('url'))
    print(json.dumps(ret))

elif mode == "query" and query_kind == "performer":
    ret = queryPerformer(i['name'])
    print(json.dumps(ret))

elif mode == "url":
    ret = scrapeScene(filename=None, date=None, url=i.get('url'))
    print(json.dumps(ret))

elif mode == "scrape":
    country = ""
    details = ""
    # The bio page is only fetched when an auth cookie value is configured.
    if ifeelauth:
        ret = scrapePerformer(i['remote_site_id'])
        country = ret['country']
        details = ret['details']

    json_info = {
        "name": i.get('name'),
        "gender": i.get('gender'),
        "url": i.get('url'),
        "country": country,
        "details": details,
        "image": "https://bcdn.ifeelmyself.com/artists/" + i['remote_site_id'] + ".jpg",
    }
    print(json.dumps(json_info))

else:
    debugPrint("Unsupported scraper mode: " + " ".join(sys.argv[1:]))