This commit is contained in:
Christoph Califice
2025-10-09 20:05:31 -03:00
parent ed22ef22bc
commit 0a5f88d75a
1442 changed files with 101562 additions and 0 deletions

View File

@@ -0,0 +1,134 @@
# Scraper definition named "data18".
name: data18
# Pages listed under `url` are scraped with the XPath scraper named in
# `scraper` (defined under xPathScrapers).
movieByURL:
  - action: scrapeXPath
    url:
      - data18.com/movies
    scraper: movieScraper
# Scene lookups: data18.com/scenes pages use sceneScraper, while
# data18.com/movies pages are accepted as well and use movieScraper.
sceneByURL:
  - action: scrapeXPath
    url:
      - data18.com/scenes
    scraper: sceneScraper
  # Many people have single-file movies and want to scrape them
  # as scenes instead of making a single-scene Movie object
  - action: scrapeXPath
    url:
      - data18.com/movies
    scraper: movieScraper
# XPath scraper definitions referenced by name from movieByURL / sceneByURL.
xPathScrapers:
  # Scraper used for data18.com/scenes pages.
  sceneScraper:
    # Shared XPath fragments, referenced below as $performer, $studio, $movie.
    common:
      $performer: //div[h3[text()='Pornstars / Cast']]//a[@class='bold gen']
      $studio: //div[@id="body2div_b"]//a[contains(@href,"/studios/")]
      $movie: //b[text()="Movie:"]/following-sibling::a[1]
    scene:
      Title: //span/following-sibling::h1/a/text()
      Date:
        selector: //span[b[text()="Release date"]]
        postProcess:
          # Reduce the text to "Month DD, YYYY" or "Month, YYYY". The trailing
          # `.*` (rather than `.+`) also accepts text that ends right after
          # the year, so the "Release date:" prefix is always removed.
          - replace:
              - regex: '^Release date: ([a-zA-Z]+)(\s*\d\d)?,\s*(\d{4}).*'
                with: "$1$2, $3"
          # One layout per shape the normalized text can take.
          - parseDate: January 02, 2006
          - parseDate: January, 2006
      Details:
        selector: //div[b[text()="Story"]] | //b[contains(text(),"Movie Description")]/../text()
        concat: " "
        postProcess:
          # Remove the "Story - " label from the joined text.
          - replace:
              - regex: "Story - "
                with: ""
      Tags:
        Name: //b[text()='Categories:']/following-sibling::a
      Performers:
        Name: $performer
        URL: $performer/@href
      Studio:
        Name: $studio
        URL: $studio/@href
      Movies:
        Name: $movie/text()
        URL: $movie/@href
      Image: //img[@id="playpriimage"]/@src
  # Scraper used for data18.com/movies pages; it carries both a `movie`
  # mapping and a `scene` mapping because sceneByURL also points here.
  movieScraper:
    # Shared XPath fragments, referenced below as $movieInfo, $studio, $performer.
    common:
      $movieInfo: //div[@id="body2div_b"]
      $studio: //b[text()='Studio']/following-sibling::b/a
      $performer: //div[h3[contains(text(), 'Pornstars / Cast')]]//a[@class='bold gen']
    movie:
      Name:
        selector: //title
        postProcess:
          # Keep only the part of <title> before " (YYYY) Porn Movie | DATA18".
          - replace:
              - regex: (.+?)(?:\s\(\d{4}\)\sPorn\sMovie\s\|\sDATA18)
                with: $1
      Duration:
        selector: $movieInfo//b[contains(text(),"Length")]/following-sibling::span|$movieInfo//b[contains(text(),"Length")]/following-sibling::text()
        postProcess:
          - replace:
              - regex: ^\[(.+)\]$  # handle movies with proper [xx:xx:xx] duration
                with: $1
              - regex: ^[^\d]*(\d+)\s*min.*  # handle movies with only xx mins duration
                with: "$1:00"
      Date:
        selector: $movieInfo//span[contains(text(), "Release date")]/text()
        postProcess:
          # Strip the label, then parse the remaining "Month, YYYY" text.
          - replace:
              - regex: 'Release date:\s*'
                with: ""
          - parseDate: January, 2006
      Studio:
        Name: $studio/text()
        URL: $studio/@href
      Director: //p[b[contains(text(),'Director')]]//a[@class='bold']
      Synopsis:
        selector: //b[text()="Description"]/..
        concat: " "
        postProcess:
          # Remove the leading "Description - " label.
          - replace:
              - regex: '^Description\s*-\s*'
                with: ""
      FrontImage: //a[@id='enlargecover']/@data-featherlight
      BackImage: //a[text()='+Back']/@href
    # Scene view of the same movie page; selectors mirror the `movie` mapping.
    scene:
      Title:
        selector: //title
        postProcess:
          # Keep only the part of <title> before " (YYYY) Porn Movie | DATA18".
          - replace:
              - regex: (.+?)(?:\s\(\d{4}\)\sPorn\sMovie\s\|\sDATA18)
                with: $1
      Date:
        selector: $movieInfo//span[contains(text(), "Release date")]/text()
        postProcess:
          # Strip the label, then parse the remaining "Month, YYYY" text.
          - replace:
              - regex: 'Release date:\s*'
                with: ""
          - parseDate: January, 2006
      Studio:
        Name: $studio/text()
        URL: $studio/@href
      Director: //p[b[contains(text(),'Director')]]//a[@class='bold']
      Performers:
        Name: $performer
        URL: $performer/@href
      Details:
        selector: //b[text()="Description"]/..
        concat: " "
        postProcess:
          # Remove the leading "Description - " label.
          - replace:
              - regex: '^Description\s*-\s*'
                with: ""
      Image: //a[@id='enlargecover']/@data-featherlight
# Cookies and headers configured for requests to data18.com.
driver:
  cookies:
    - CookieURL: "https://data18.com"
      Cookies:
        # NOTE(review): presumably marks the captcha as already passed —
        # confirm against the site's behaviour.
        - Name: "data_user_captcha"
          Domain: ".data18.com"
          Value: "1"
          Path: "/"
  headers:
    # Firefox 79 on Windows 10 user agent; the stray trailing ")" that made
    # the string malformed has been removed.
    - Key: User-Agent
      Value: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0'
# Last Updated July 18, 2023