This commit is contained in:
Christoph Califice
2025-10-09 20:05:31 -03:00
parent ed22ef22bc
commit 0a5f88d75a
1442 changed files with 101562 additions and 0 deletions

View File

@@ -0,0 +1,75 @@
name: "caribbeancom"
# URL-based scene lookup: both listed movie-page URL prefixes are handled by
# the same XPath scraper (sceneScraper).
sceneByURL:
  - scraper: "sceneScraper"
    action: "scrapeXPath"
    url:
      - "en.caribbeancom.com/eng/moviepages"
      - "en.caribbeancompr.com/eng/moviepages"
sceneByFragment:
  scraper: "sceneScraper"
  action: "scrapeXPath"
  # The movie page URL is built from the filename; this only yields a usable
  # URL when the filename contains the movie id.
  queryURL: "https://en.caribbeancom{filename}"
  queryURLReplace:
    filename:
      # caribbeancom ids are hyphenated, e.g. 062212-055
      - regex: '.*(\d{6}-\d{3}).*'
        with: ".com/eng/moviepages/$1"
      # caribbeancompr ids use an underscore instead, e.g. 062212_055
      - regex: '.*(\d{6}_\d{3}).*'
        with: "pr.com/eng/moviepages/$1"
      # append the page name at the end of the string
      - regex: "$"
        with: "/index.html"
xPathScrapers:
  sceneScraper:
    common:
      # Shared XPath prefix; referenced below as $movieinfo.
      $movieinfo: //div[@class="movie-info section divider"]
    scene:
      Title: //div[contains(@class,"heading")]/h1/text()
      Details: $movieinfo/p
      URL:
        # Takes the ja-JP alternate link and rewrites its host to the English
        # site under /eng.
        selector: //link[@hreflang="ja-JP"]/@href
        postProcess:
          - replace:
              # Dots are escaped so they only match literal dots. The optional
              # (pr) group keeps the caribbeancompr host intact; for the plain
              # caribbeancom host $1 is empty and the result is unchanged.
              # NOTE(review): presumably caribbeancompr pages link to
              # https://www.caribbeancompr.com/... - confirm on a live page.
              - regex: 'https://www\.caribbeancom(pr)?\.com'
                with: "https://en.caribbeancom$1.com/eng"
      Date:
        selector: //ul/li/span[contains(.,"Release Date")]/../span[@class="spec-content"]
        postProcess:
          # Normalise any single-character separator between the date parts
          # to "-" before parsing.
          - replace:
              - regex: '(\d{4}).(\d{2}).(\d{2})'
                with: "$1-$2-$3"
          # Quoted so a YAML loader cannot type the layout as a timestamp.
          - parseDate: "2006-01-02"
      Performers:
        Name:
          selector: //ul/li/span[contains(.,"Starring")]/..//a
          concat: ","  # caribbeancom splits name/surname for some performers
          postProcess:
            - replace:
                # in some cases & is used instead of , to split performers
                - regex: "&"
                  with: ","
                # a single comma between two non-comma characters becomes a
                # space
                - regex: "([^,]),([^,])"
                  with: "$1 $2"
                # collapse any run of commas into a single separator; a
                # single-pass ",," replace would turn ",,," into ",," and leave
                # an empty entry after the split
                - regex: ",{2,}"
                  with: ","
          split: ","
      Tags:
        Name: //ul/li/span[contains(.,"Tags")]/../span/a[contains(@class,"spec")]
      Image:
        # Matches either the ja-JP alternate link or the inline script that
        # assigns posterImage; each replace rule below targets one of the two.
        selector: //link[@hreflang="ja-JP"]/@href|//script[contains(.,"posterImage = '/moviepages/'+movie_id+'/images/")]
        postProcess:
          - replace:
              # link form: swap the trailing index.html for the image path
              - regex: index\.html$
                with: images/l.jpg
              # script form: $1 is the image file name, $3 the underscore-style
              # movie id captured after MoviePlayer.setImage
              - regex: .*posterImage\s*=\s*\'/moviepages/\'\+movie_id\+\'/images/([^\']+)\'(.|\s)*MoviePlayer\.setImage..movie_id\s.*:\s\'(\d+_\d+)\'.*
                with: https://en.caribbeancompr.com/moviepages/$3/images/$1
      Studio:
        Name:
          selector: //ul[@class="footer-copyright"]/li[contains(.,"©")]
          postProcess:
            # Reduce the copyright line to the word in front of ".com", then
            # map that word to a display name.
            - replace:
                - regex: .*\s([\w]+)\.com.*
                  with: $1
            - map:
                caribbeancompr: Caribbeancom Premium
                caribbeancom: Caribbeancom
# Last Updated May 20, 2021