299 lines
17 KiB
Python
299 lines
17 KiB
Python
import json
|
|
import re
|
|
import sys
|
|
from datetime import datetime
|
|
import unicodedata
|
|
|
|
# UNLESS logged in(and probably with an active subscription) scenes with certain tags(menstruation, pee) are hidden and can not be found by scraper.
|
|
# Also performer scraper will not be able to get country and details without being logged in.
|
|
# set value for ifeel_auth cookie here, may change and need to be renewed periodically.
|
|
# if no account available leave value empty and scraper won't find some videos and country and details fields will be missing from performer scrapes.
|
|
|
|
ifeelauth = ""
|
|
|
|
try:
|
|
from mechanicalsoup import StatefulBrowser
|
|
except ModuleNotFoundError:
|
|
print("You need to install the mechanicalsoup module. (https://mechanicalsoup.readthedocs.io/en/stable/introduction.html#installation)", file=sys.stderr)
|
|
print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install MechanicalSoup", file=sys.stderr)
|
|
sys.exit()
|
|
|
|
try:
|
|
from requests.cookies import create_cookie
|
|
except ModuleNotFoundError:
|
|
print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
|
|
print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
|
|
sys.exit()
|
|
|
|
|
|
def readJSONInput():
|
|
input = sys.stdin.read()
|
|
return json.loads(input)
|
|
|
|
def extract_SceneInfo(table,cover_url=None):
|
|
description = None
|
|
if table.find(class_= ["blog_wide_new_text","entryBlurb"]):
|
|
description=table.find(class_= ["blog_wide_new_text","entryBlurb"]).get_text(" ", strip=True)
|
|
description=unicodedata.normalize('NFKC', description).encode('ascii','ignore').decode('ascii')
|
|
date = table.find(class_=["blog-title-right","entryDatestamp"]).get_text(strip=True) #This is a BeautifulSoup element. New IFM scenes are under blog-title-right clase for date. Older videos use entryDatestamp class
|
|
performer = table.find(class_= ["entryHeadingFlash","entryHeading"]).find_all("a")[1].get_text().replace("_"," ")
|
|
performer = str(performer)
|
|
debugPrint(f"performer:{performer}")
|
|
date = datetime.strptime(date, '%d %b %Y').date().strftime('%Y-%m-%d') #Convert date to ISO format
|
|
if cover_url == None:
|
|
if table.find("img"):
|
|
cover_url=str(table.find("img")['src'])
|
|
else:
|
|
cover_url=str(table.find("video")['poster'])
|
|
title = table.find(class_= ["entryHeadingFlash","entryHeading"]).find('a').get_text().replace("\x92","'")
|
|
media_id = re.search(r"\/(\d{3,5})\/",cover_url,re.I).group(1)
|
|
artist_id = re.search(r"\/(f\d{4,5})",cover_url,re.I).group(1)
|
|
tags = table.find_all(class_="tags-list-item-tag")
|
|
tag_list = []
|
|
for tag in tags:
|
|
tag_list.append({"name": tag.get_text()})
|
|
debugPrint(f"tags: {str(tag_list)}")
|
|
json_info = {"title": title, "performers": [{"name": performer}], "studio": {"name": "I Feel Myself"}, "tags": tag_list, "date":date, "image": cover_url,"details": description, "url": "https://ifeelmyself.com/public/main.php?page=flash_player&out=bkg&media_id="+media_id+"&artist_id="+artist_id}
|
|
return json_info
|
|
|
|
def debugPrint(t):
|
|
sys.stderr.write(t + "\n")
|
|
|
|
def scrapeScene(filename,date,url):
|
|
ret = []
|
|
browser = StatefulBrowser(session=None)
|
|
browser.open("https://ifeelmyself.com/public/main.php")
|
|
cookie_obj = create_cookie(name='tags_popup_shown', value='true', domain='ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
cover_url = None
|
|
if url:
|
|
debugPrint("Url found, using that to scrape")
|
|
if url.endswith(".jpg"):
|
|
#use the image url to extract the metadeta
|
|
media_id = re.search(r"\/(\d{3,5})\/",url,re.I).group(1)
|
|
artist_id = re.search(r"\/(f\d{4,5})",url,re.I).group(1)
|
|
debugPrint(f"Artist id found: {artist_id}")
|
|
debugPrint(f"Media id found: {media_id}")
|
|
cover_url = url
|
|
url = "https://ifeelmyself.com/public/main.php?page=flash_player&out=bkg&media_id="+str(media_id)+"&artist_id="+str(artist_id)
|
|
browser.open(url)
|
|
response = browser.page
|
|
table = response.find(class_ = ["blog_wide_news_tbl entry ppss-scene","entry ppss-scene"])
|
|
if table:
|
|
ret = extract_SceneInfo(table,cover_url)
|
|
else:
|
|
debugPrint("Analyzing filename...")
|
|
artist_id_match=re.search(r"(f\d{3,5})",filename,re.I)
|
|
if artist_id_match:
|
|
artist_id = artist_id_match.group(0)
|
|
video_id = re.search(r"-(\d+)",filename,re.I).group(1)
|
|
cookie_obj = create_cookie(name='ifm_search_keyword', value=artist_id, domain='ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
cookie_obj = create_cookie(name='ifm_prefs', value="a%3A1%3A%7Bs%3A6%3A%22search%22%3Ba%3A17%3A%7Bs%3A8%3A%22category%22%3Ba%3A0%3A%7B%7Ds%3A7%3A%22view_by%22%3Bs%3A4%3A%22news%22%3Bs%3A7%3A%22date_by%22%3Bs%3A7%3A%22anytime%22%3Bs%3A10%3A%22from_month%22%3Bs%3A1%3A%221%22%3Bs%3A9%3A%22from_year%22%3Bs%3A4%3A%222006%22%3Bs%3A8%3A%22to_month%22%3Bs%3A2%3A%2212%22%3Bs%3A7%3A%22to_year%22%3Bs%3A4%3A%223000%22%3Bs%3A7%3A%22country%22%3Bs%3A3%3A%22all%22%3Bs%3A10%3A%22attributes%22%3Ba%3A0%3A%7B%7Ds%3A12%3A%22tags_logical%22%3Bs%3A3%3A%22AND%22%3Bs%3A13%3A%22tags_remember%22%3Bs%3A1%3A%22n%22%3Bs%3A4%3A%22tags%22%3Ba%3A0%3A%7B%7Ds%3A12%3A%22tags_exclude%22%3Bs%3A0%3A%22%22%3Bs%3A9%3A%22hide_tags%22%3Ba%3A0%3A%7B%7Ds%3A8%3A%22age_from%22%3Bs%3A2%3A%2218%22%3Bs%3A6%3A%22age_to%22%3Bs%3A2%3A%2299%22%3Bs%3A16%3A%22profilevid_limit%22%3Bs%3A0%3A%22%22%3B%7D%7D", domain='.ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
cookie_obj = create_cookie(name='ifeel_auth', value=ifeelauth, domain='.ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
browser.open("https://ifeelmyself.com/public/main.php?page=search_results")
|
|
response = browser.page
|
|
debugPrint("Searching for video_id")
|
|
debugPrint(artist_id+"-"+video_id)
|
|
tables = response.find_all(class_= ["blog_wide_news_tbl entry ppss-scene","entry ppss-scene"])
|
|
for table in tables:
|
|
if table.find('video'): #New scenes use the video tag
|
|
img=str(table.find("video")['poster'])
|
|
elif table.find('img'): #old scenes still use the old format of a img tag
|
|
img=str(table.find("img")['src'])
|
|
debugPrint(f"Image:{str(img)}")
|
|
if (f"/{artist_id}-{video_id}vg.jpg" in img) or (f"/{artist_id}-{video_id}hs.jpg" in img):
|
|
debugPrint("Found a single match video!")
|
|
# Extract data from this single result
|
|
ret = extract_SceneInfo(table)
|
|
break
|
|
else:
|
|
sys.stderr.write("0 matches found! Checking offset")
|
|
pages=int(response.find_all("a", class_="pagging_nonsel")[-1].get_text())
|
|
debugPrint("Pages: "+str(pages))
|
|
if pages:
|
|
for offset in range(0,pages*10,10):
|
|
browser.open("https://ifeelmyself.com/public/main.php?page=search_results&offset="+str(offset))
|
|
response = browser.page
|
|
tables = response.find_all(class_= ["blog_wide_news_tbl entry ppss-scene","entry ppss-scene"])
|
|
for table in tables:
|
|
if table.find('video'): #New scenes use the video tag
|
|
img=str(table.find("video")['poster'])
|
|
elif table.find('img'): #old scenes still use the old format of a img tag
|
|
img=str(table.find("img")['src'])
|
|
debugPrint(f"Image:{img}")
|
|
if (f"/{artist_id}-{video_id}vg.jpg" in img) or (f"/{artist_id}-{video_id}hs.jpg" in img):
|
|
sys.stderr.write("FOUND")
|
|
ret = extract_SceneInfo(table)
|
|
break
|
|
else:
|
|
sys.stderr.write("0 matches found!, check your filename")
|
|
|
|
else:
|
|
debugPrint("Name changed after downloading")
|
|
filename = filename.lower()
|
|
extract_from_filename = re.match(r"^([0-9\.]{6,10})?(?<title>.+)\s(?<artist>\w+)(\.mp4)?$",filename)
|
|
if extract_from_filename:
|
|
title = extract_from_filename.group('title')
|
|
if title:
|
|
title = title.lower().replace("ifeelmyself","")
|
|
title = title.replace("-","")
|
|
title = title.replace("by", "")
|
|
debugPrint(f"Title: {title}")
|
|
cookie_obj = create_cookie(name='ifm_search_keyword', value=title, domain='ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
cookie_obj = create_cookie(name='ifm_prefs', value="a%3A1%3A%7Bs%3A6%3A%22search%22%3Ba%3A17%3A%7Bs%3A8%3A%22category%22%3Ba%3A0%3A%7B%7Ds%3A7%3A%22view_by%22%3Bs%3A4%3A%22news%22%3Bs%3A7%3A%22date_by%22%3Bs%3A7%3A%22anytime%22%3Bs%3A10%3A%22from_month%22%3Bs%3A1%3A%221%22%3Bs%3A9%3A%22from_year%22%3Bs%3A4%3A%222006%22%3Bs%3A8%3A%22to_month%22%3Bs%3A2%3A%2212%22%3Bs%3A7%3A%22to_year%22%3Bs%3A4%3A%223000%22%3Bs%3A7%3A%22country%22%3Bs%3A3%3A%22all%22%3Bs%3A10%3A%22attributes%22%3Ba%3A0%3A%7B%7Ds%3A12%3A%22tags_logical%22%3Bs%3A3%3A%22AND%22%3Bs%3A13%3A%22tags_remember%22%3Bs%3A1%3A%22n%22%3Bs%3A4%3A%22tags%22%3Ba%3A0%3A%7B%7Ds%3A12%3A%22tags_exclude%22%3Bs%3A0%3A%22%22%3Bs%3A9%3A%22hide_tags%22%3Ba%3A0%3A%7B%7Ds%3A8%3A%22age_from%22%3Bs%3A2%3A%2218%22%3Bs%3A6%3A%22age_to%22%3Bs%3A2%3A%2299%22%3Bs%3A16%3A%22profilevid_limit%22%3Bs%3A0%3A%22%22%3B%7D%7D", domain='.ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
cookie_obj = create_cookie(name='ifeel_auth', value=ifeelauth, domain='.ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
browser.open("https://ifeelmyself.com/public/main.php?page=search_results")
|
|
response = browser.page
|
|
#Obtaining and counting the results. Ideally you only have a single result
|
|
matches=response.find_all("a", href='javascript:;') #This a href javascript contains all the titles
|
|
if len(matches)==1:
|
|
debugPrint("Found a single match!")
|
|
table = response.find(class_= ["blog_wide_news_tbl entry ppss-scene","entry ppss-scene"])
|
|
else:
|
|
if len(matches)==0:
|
|
sys.stderr.write("0 matches found! Check filename")
|
|
print("{}")
|
|
exit
|
|
if len(matches)>1:
|
|
debugPrint("Multiple videos found, maybe refine search term?")
|
|
tables = response.find_all(class_= ["blog_wide_news_tbl entry ppss-scene","entry ppss-scene"])
|
|
table=tables[0] #Getting first
|
|
if table:
|
|
ret = extract_SceneInfo(table)
|
|
else:
|
|
debugPrint("Not a supported filename")
|
|
print("{}")
|
|
exit
|
|
return ret
|
|
|
|
|
|
|
|
|
|
|
|
def extract_PerformerInfo(table,browser,cover_url=None):
|
|
performer = table.find(class_= ["entryHeadingFlash","entryHeading"]).find_all("a")[1].get_text().replace("_"," ")
|
|
performer = str(performer)
|
|
debugPrint(f"Extracting info for performer: {performer}")
|
|
if cover_url == None:
|
|
cover_url=str(table.find("img")['src'])
|
|
debugPrint(cover_url)
|
|
artist_id = re.search(r"\/((f|m)\d{4,5})",cover_url,re.I).group(1)
|
|
artist_img = (f"https://bcdn.ifeelmyself.com/artists/" + artist_id + ".jpg")
|
|
if artist_id.startswith("f"):
|
|
gender="female"
|
|
else:
|
|
gender="male"
|
|
json_info = {"name": performer, "gender": gender, "url": (f"https://ifeelmyself.com/public/main.php?page=artist_bio&artist_id="+artist_id), "image": artist_img, "remote_site_id": artist_id}
|
|
return json_info
|
|
|
|
|
|
def queryPerformer(perfname):
|
|
browser = StatefulBrowser(session=None)
|
|
perfname = perfname.lower()
|
|
browser.open("https://ifeelmyself.com/public/main.php")
|
|
cookie_obj = create_cookie(name='tags_popup_shown', value='true', domain='ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
cookie_obj = create_cookie(name='ifm_prefs', value="a%3A1%3A%7Bs%3A6%3A%22search%22%3Ba%3A17%3A%7Bs%3A8%3A%22category%22%3Ba%3A0%3A%7B%7Ds%3A7%3A%22view_by%22%3Bs%3A4%3A%22news%22%3Bs%3A7%3A%22date_by%22%3Bs%3A7%3A%22anytime%22%3Bs%3A10%3A%22from_month%22%3Bs%3A1%3A%221%22%3Bs%3A9%3A%22from_year%22%3Bs%3A4%3A%222006%22%3Bs%3A8%3A%22to_month%22%3Bs%3A2%3A%2212%22%3Bs%3A7%3A%22to_year%22%3Bs%3A4%3A%223000%22%3Bs%3A7%3A%22country%22%3Bs%3A3%3A%22all%22%3Bs%3A10%3A%22attributes%22%3Ba%3A0%3A%7B%7Ds%3A12%3A%22tags_logical%22%3Bs%3A3%3A%22AND%22%3Bs%3A13%3A%22tags_remember%22%3Bs%3A1%3A%22n%22%3Bs%3A4%3A%22tags%22%3Ba%3A0%3A%7B%7Ds%3A12%3A%22tags_exclude%22%3Bs%3A0%3A%22%22%3Bs%3A9%3A%22hide_tags%22%3Ba%3A0%3A%7B%7Ds%3A8%3A%22age_from%22%3Bs%3A2%3A%2218%22%3Bs%3A6%3A%22age_to%22%3Bs%3A2%3A%2299%22%3Bs%3A16%3A%22profilevid_limit%22%3Bs%3A0%3A%22%22%3B%7D%7D", domain='.ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
cookie_obj = create_cookie(name='ifm_search_keyword', value=perfname, domain='ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
cookie_obj = create_cookie(name='ifeel_auth', value=ifeelauth, domain='.ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
debugPrint("Analyzing perfname...")
|
|
browser.open("https://ifeelmyself.com/public/main.php?page=search_results")
|
|
response = browser.page
|
|
#Obtaining and counting the results. Ideally you only have a single result
|
|
matches=response.find_all("a", href='javascript:;') #This a href javascript contains all the titles
|
|
debugPrint("Found: "+str(len(matches)))
|
|
ret = []
|
|
foundList = []
|
|
if len(matches)==0:
|
|
# often performer names use a underscore instead of a space, so replace spaces and try again
|
|
perfname = perfname.replace(" ","_")
|
|
cookie_obj = create_cookie(name='ifm_search_keyword', value=perfname, domain='ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
browser.open("https://ifeelmyself.com/public/main.php?page=search_results")
|
|
response = browser.page
|
|
#Obtaining and counting the results. Ideally you only have a single result
|
|
matches=response.find_all("a", href='javascript:;') #This a href javascript contains all the titles
|
|
if len(matches)==0:
|
|
sys.stderr.write("0 matches found! Check performer name")
|
|
print("{}")
|
|
exit
|
|
if len(matches)>0:
|
|
debugPrint("Multiple videos found, scraping multiple performers")
|
|
tables = response.find_all(class_= ["blog_wide_news_tbl entry ppss-scene","entry ppss-scene"])
|
|
for table in tables:
|
|
result = extract_PerformerInfo(table,browser)
|
|
if not result['name'] in foundList:
|
|
foundList.append(result['name'])
|
|
ret.append(result)
|
|
if len(matches)>0:
|
|
tables = response.find_all(class_= ["blog_wide_news_tbl entry ppss-scene","entry ppss-scene"])
|
|
for table in tables:
|
|
result = extract_PerformerInfo(table,browser)
|
|
if not result['name'] in foundList:
|
|
foundList.append(result['name'])
|
|
ret.append(result)
|
|
return ret
|
|
|
|
|
|
|
|
|
|
|
|
def scrapePerformer(artist_id):
|
|
browser = StatefulBrowser(session=None)
|
|
cookie_obj = create_cookie(name='tags_popup_shown', value='true', domain='ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
cookie_obj = create_cookie(name='ifeel_auth', value=ifeelauth, domain='.ifeelmyself.com')
|
|
browser.session.cookies.set_cookie(cookie_obj)
|
|
browser.open(f"https://ifeelmyself.com/public/main.php?page=artist_bio&artist_id="+artist_id)
|
|
response = browser.page
|
|
tables = response.find_all(class_= ["bioTable"])
|
|
table=tables[0]
|
|
debugPrint(str(table))
|
|
bio = str(table.find("td"))
|
|
lines=bio.splitlines(True)
|
|
countryline=bio.splitlines(0)[1]
|
|
country=countryline.split("<br/>")[1]
|
|
details=lines[3]+lines[4]+lines[5]+lines[6]+lines[7]+lines[8]
|
|
details=details.replace("<strong>","").replace("</strong>","").replace("<br/>","")
|
|
json_info = {"country": country , "details": details}
|
|
return json_info
|
|
|
|
|
|
|
|
|
|
# read the input
|
|
i = readJSONInput()
|
|
sys.stderr.write(json.dumps(i))
|
|
|
|
if sys.argv[1] == "query" and sys.argv[2] == "scene":
|
|
ret = scrapeScene(i['title'],i['date'],i['url'])
|
|
print(json.dumps(ret))
|
|
|
|
if sys.argv[1] == "query" and sys.argv[2] == "performer":
|
|
ret = queryPerformer(i['name'])
|
|
print(json.dumps(ret))
|
|
|
|
if sys.argv[1] == "url":
|
|
ret = scrapeScene(filename=None,date=None,url=i['url'])
|
|
print(json.dumps(ret))
|
|
|
|
if sys.argv[1] == "scrape":
|
|
country = ""
|
|
details = ""
|
|
if not ifeelauth == "":
|
|
ret = scrapePerformer(i['remote_site_id'])
|
|
country = ret['country']
|
|
details = ret['details']
|
|
|
|
json_info = {"name": i['name'], "gender": i['gender'], "url": i['url'],"country": country ,"details": details , "image": "https://bcdn.ifeelmyself.com/artists/" + i['remote_site_id'] + ".jpg"}
|
|
print(json.dumps(json_info))
|