# XPath scraper definition for the English caribbeancom and caribbeancompr
# movie pages. Both entry points below feed the same `sceneScraper`.
name: caribbeancom

# Scrape a scene directly from a movie page URL on either site.
sceneByURL:
  - action: scrapeXPath
    url:
      - en.caribbeancom.com/eng/moviepages
      - en.caribbeancompr.com/eng/moviepages
    scraper: sceneScraper

# Scrape a scene from its file name.
sceneByFragment:
  action: scrapeXPath
  # Constructs the movie URL from the filename, provided that the filename
  # includes the movie id. `{filename}` is rewritten by queryURLReplace below.
  queryURL: https://en.caribbeancom{filename}
  queryURLReplace:
    filename:
      # caribbeancom uses ids with a form like 062212-055
      - regex: .*(\d{6}-\d{3}).*
        with: .com/eng/moviepages/$1
      # caribbeancompr uses 062212_055
      - regex: .*(\d{6}_\d{3}).*
        with: pr.com/eng/moviepages/$1
      # Append the page name to the end of whatever the rules above produced.
      - regex: $
        with: /index.html
  scraper: sceneScraper

xPathScrapers:
  sceneScraper:
    common:
      # Shared XPath prefix, referenced as $movieinfo in the selectors below.
      $movieinfo: //div[@class="movie-info section divider"]
    scene:
      Title: //div[contains(@class,"heading")]/h1/text()
      Details: $movieinfo/p
      URL:
        # Takes the href of the ja-JP alternate link and rewrites the
        # www.caribbeancom.com prefix to the English site prefix.
        selector: //link[@hreflang="ja-JP"]/@href
        postProcess:
          - replace:
              - regex: "https://www.caribbeancom.com"
                with: "https://en.caribbeancom.com/eng"
      Date:
        selector: //ul/li/span[contains(.,"Release Date")]/../span[@class="spec-content"]
        postProcess:
          # Normalise any single-character separator between the year, month
          # and day groups to "-" before parsing.
          - replace:
              - regex: (\d{4}).(\d{2}).(\d{2})
                with: $1-$2-$3
          # Quoted so that YAML loaders keep the layout as a string instead of
          # resolving it to a date value.
          - parseDate: "2006-01-02"
      Performers:
        Name:
          selector: //ul/li/span[contains(.,"Starring")]/..//a
          # caribbeancom splits name/surname for some performers, so the
          # matched links are joined first and cleaned up below.
          concat: ","
          postProcess:
            - replace:
                # in some cases & is used instead of , to split performers
                - regex: "&"
                  with: ","
                # A single comma between two non-comma characters becomes a
                # space; presumably this rejoins a split name/surname pair.
                - regex: "([^,]),([^,])"
                  with: "$1 $2"
                # Collapse a doubled comma into one separator.
                - regex: ",,"
                  with: ","
          split: ","
      Tags:
        Name: //ul/li/span[contains(.,"Tags")]/../span/a[contains(@class,"spec")]
      Image:
        # Matches either the ja-JP alternate link href or the inline script
        # that assigns posterImage.
        selector: //link[@hreflang="ja-JP"]/@href|//script[contains(.,"posterImage = '/moviepages/'+movie_id+'/images/")]
        postProcess:
          - replace:
              # Link case: swap the trailing index.html for the image path.
              - regex: index\.html$
                with: images/l.jpg
              # Script case: $1 is the image file name taken from the
              # posterImage assignment, $3 is the digits_digits id found after
              # MoviePlayer.setImage; both are placed into a caribbeancompr URL.
              - regex: .*posterImage\s*=\s*\'/moviepages/\'\+movie_id\+\'/images/([^\']+)\'(.|\s)*MoviePlayer\.setImage..movie_id\s.*:\s\'(\d+_\d+)\'.*
                with: https://en.caribbeancompr.com/moviepages/$3/images/$1
      Studio:
        Name:
          selector: //ul[@class="footer-copyright"]/li[contains(.,"©")]
          postProcess:
            # Keep only the word that precedes ".com" in the copyright text...
            - replace:
                - regex: .*\s([\w]+)\.com.*
                  with: $1
            # ...then translate that word into the studio's display name.
            - map:
                caribbeancompr: Caribbeancom Premium
                caribbeancom: Caribbeancom
# Last Updated May 20, 2021