This commit is contained in:
Christoph Califice
2025-10-09 20:05:31 -03:00
parent ed22ef22bc
commit 0a5f88d75a
1442 changed files with 101562 additions and 0 deletions

View File

@@ -0,0 +1,143 @@
# Scraper definition for Pornhub.
name: Pornhub

# Performer lookup by name: fetch the pornstar search page and parse it with
# the `performerSearch` XPath scraper defined under xPathScrapers.
# NOTE(review): `{}` is presumably the placeholder for the search term.
performerByName:
  scraper: performerSearch
  action: scrapeXPath
  queryURL: 'https://www.pornhub.com/pornstars/search?search={}'

# Performer lookup by URL: URLs matching the pattern below are parsed with
# the `performerScraper` XPath scraper.
performerByURL:
  - url:
      - 'pornhub.com'
    action: scrapeXPath
    scraper: performerScraper
# Scene lookup by URL: video pages (view_video.php with a viewkey parameter)
# are parsed with the `sceneScraper` XPath scraper.
sceneByURL:
  - url:
      - 'pornhub.com/view_video.php?viewkey='
    action: scrapeXPath
    scraper: sceneScraper
# Scene lookup from a local file: build the video URL from a view key found in
# the filename, then parse the page with the `sceneScraper` XPath scraper.
sceneByFragment:
  action: scrapeXPath
  queryURL: https://www.pornhub.com/view_video.php?viewkey={filename}
  queryURLReplace:
    filename:
      # Reduce the filename to the embedded key: an optional "ph" prefix
      # followed by 13 alphanumeric characters.
      - regex: (?:.*[^a-zA-Z\d])?((?:ph)?(?:[a-zA-Z\d]{13})).+
        with: $1
      # If no ph id was found (the value still ends in a file extension),
      # clear the filename so that it doesn't leak to ph. The replacement is
      # an explicit empty string rather than a bare (null) value.
      - regex: .*\.[^\.]+$
        with: ""
  scraper: sceneScraper
# Scene lookup by name: fetch the video search page and parse it with the
# `sceneSearch` XPath scraper.
# NOTE(review): `{}` is presumably the placeholder for the search term.
sceneByName:
  scraper: sceneSearch
  action: scrapeXPath
  queryURL: 'https://www.pornhub.com/video/search?search={}'

# Scene lookup from a query fragment: the fragment's `url` value is used as
# the query URL verbatim and parsed with the `sceneScraper` XPath scraper.
sceneByQueryFragment:
  scraper: sceneScraper
  action: scrapeXPath
  queryURL: '{url}'
# XPath scraper definitions referenced by the `scraper:` keys of the actions
# above. Each `common` section declares `$name` fragments that are reused
# inside the selectors of the same scraper.
xPathScrapers:
  # Parses the video search results page.
  sceneSearch:
    common:
      $searchItem: //ul[contains(@class, "search-video-thumbs") and not(@id="bottomVideos")]
      $searchThumb: //ul[contains(@class, "search-video-thumbs") and not(@id="bottomVideos")]//div[contains(@class, "thumbnail-info-wrapper")]/span[@class="title"]/a
    scene:
      Title: $searchThumb/text()
      URL:
        selector: $searchThumb/@href
        postProcess:
          # Prepend the site origin to the scraped href.
          - replace:
              - regex: ^
                with: "https://www.pornhub.com"
      Image:
        selector: $searchItem//div[contains(@class, "phimage")]//img/@data-mediumthumb

  # Parses the pornstar search results page.
  performerSearch:
    performer:
      Name: //div[@class="wrap"]/div[@class="thumbnail-info-wrapper"]/a[@class="title"]/text()
      URL:
        selector: //div[@class="wrap"]/div[@class="thumbnail-info-wrapper"]/a[@class="title"]/@href
        postProcess:
          # Prepend the site origin to the scraped href.
          - replace:
              - regex: ^
                with: "https://www.pornhub.com"

  # Parses a performer profile page. Most fields are read from an "infoPiece"
  # div selected by the label text contained in its span.
  performerScraper:
    common:
      $infoPiece: //div[@class="infoPiece"]
      $infoContainer: //div[@class="infoContainer"]
      $smallInfo: span[@class="smallInfo"]
    performer:
      Name: //h1[@itemprop="name"]|$infoContainer//h1
      Birthdate:
        selector: //span[@itemprop="birthDate"]|$infoPiece[contains(span,"Born:")]/text()
        postProcess:
          # Two date layouts are tried in sequence. The second layout is
          # quoted so that YAML loaders do not type it as a date, matching
          # the quoted layout used by sceneScraper below.
          - parseDate: Jan 2, 2006
          - parseDate: "2006-01-02"
      Country:
        selector: $infoPiece[contains(span,"Birthplace:")]/text()|$infoPiece[contains(span,"City and Country:")]/$smallInfo|$infoPiece[contains(span,"Birth Place:")]/$smallInfo
        postProcess:
          # Keep only the text after the last comma, then normalise the
          # spellings listed in the map.
          - replace:
              - regex: .+,\s?([^,]+$)
                with: $1
          - map:
              US: "USA"
              United States of America: "USA"
      Gender: $infoPiece[contains(span,"Gender:")]/$smallInfo
      Twitter: //ul[contains(@class,"socialList")]//a[contains(@href,"twitter.com/")]/@href
      Instagram: //ul[contains(@class,"socialList")]//a[contains(@href,"instagram.com/")]/@href
      Measurements: $infoPiece[contains(span,"Measurements:")]/$smallInfo|$infoPiece[contains(span,"Measurements:")]/text()
      Weight:
        selector: $infoPiece[contains(span,"Weight:")]/$smallInfo|$infoPiece[contains(span,"Weight:")]/text()
        postProcess:
          # Keep the number found inside "(<n> kg)".
          - replace:
              - regex: .*\((\d+)\s*kg\)
                with: $1
      Height:
        selector: $infoPiece[contains(span,"Height:")]/$smallInfo|$infoPiece[contains(span,"Height:")]/text()
        postProcess:
          # Keep the number found inside "(<n> cm)".
          - replace:
              - regex: .*\((\d+)\s*cm\)
                with: $1
      Details: //div[@itemprop="description" or starts-with(@class,"text longBio")]
      Ethnicity: $infoPiece[contains(span,"Ethnicity:")]/$smallInfo|$infoPiece[contains(span,"Ethnicity:")]/text()
      FakeTits: $infoPiece[contains(span,"Fake Boobs:")]/$smallInfo
      Piercings: $infoPiece[contains(span,"Piercings:")]/$smallInfo
      Tattoos: $infoPiece[contains(span,"Tattoos:")]/$smallInfo
      HairColor: $infoPiece[contains(span,"Hair Color:")]/$smallInfo
      CareerLength:
        selector: $infoPiece[contains(span,"Career Start and End:")]/$smallInfo
        postProcess:
          # Rewrite "<start> to <end>" as "<start>-<end>".
          - replace:
              - regex: \s+to\s+
                with: "-"
      URL: //link[@rel="canonical"][1]/@href
      Image: //div[@class="thumbImage"]/img/@src|//img[@id="getAvatar"]/@src

  # Parses a single video page.
  sceneScraper:
    common:
      $datablob: //script[contains(., 'VideoObject')]/text()
      $videowrap: //div[@class="video-wrapper"]
    scene:
      Title: //meta[@property="og:title"]/@content
      URL: //meta[@property="og:url"]/@content
      Date:
        selector: $datablob
        postProcess:
          # Capture the "uploadDate" value up to its first "T", then parse
          # it as a date.
          - replace:
              - regex: .+(?:"uploadDate"\s*:\s*")([^T]+).+
                with: $1
          - parseDate: "2006-01-02"
      Tags:
        Name: $videowrap//div[contains(concat(" ",normalize-space(@class)," ")," categoriesWrapper ")]/a/text()|$videowrap//div[contains(concat(" ",normalize-space(@class)," ")," tagsWrapper ")]/a/text()
      Performers:
        Name: $videowrap//div[contains(concat(" ",normalize-space(@class)," ")," pornstarsWrapper ")]/a/@data-mxptext
      Image: //meta[@property="og:image"]/@content
      Studio:
        Name: $videowrap//div[contains(concat(" ",normalize-space(@class)," ")," usernameWrap ")]//a/text()
# Request driver settings: a cookie supplied for https://www.pornhub.com.
# NOTE(review): judging by its name, the cookie presumably pre-accepts the
# site's age disclaimer — confirm.
driver:
  cookies:
    - CookieURL: 'https://www.pornhub.com'
      Cookies:
        - Name: 'accessAgeDisclaimerPH'
          Value: '1'
          Domain: '.pornhub.com'
          Path: '/'
# Last Updated September 26, 2023

View File

@@ -0,0 +1,9 @@
# Package manifest for the Pornhub scraper.
id: Pornhub
name: Pornhub
metadata: {}
# Both values are quoted so they always load as strings.
version: '2a9764c'
date: '2024-03-07 18:09:51'
requires: []
# Index this package was taken from.
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
# Files that belong to this package.
files:
  - Pornhub.yml