# XPath scraper definition for Heyzo (en.heyzo.com) movie pages.
name: Heyzo

# Scrape a scene when given a movie page URL matching the pattern below.
sceneByURL:
  - action: scrapeXPath
    url:
      - en.heyzo.com/moviepages
    scraper: sceneScraper

# Scrape a scene from its file name by building the movie page URL from it.
sceneByFragment:
  action: scrapeXPath
  # constructs the movie URL from the filename, provided that the filename
  # includes the movie id
  queryURL: https://en.heyzo.com/moviepages/{filename}
  queryURLReplace:
    filename:
      # heyzo uses a 4 digit number for ids, here we take a series of numbers
      # just to be safe; capture group 2 (the digits) replaces the whole
      # filename and is followed by /index.html
      - regex: '(.*[^a-zA-Z\d])*(\d+)[^a-zA-Z\d].*'
        with: $2/index.html
  scraper: sceneScraper

xPathScrapers:
  sceneScraper:
    # Shared selector fragments, referenced below as $table / $movieObject.
    common:
      # rows of the info table
      $table: //div[@class="info-bg"]/table/tbody/tr
      # text of the inline script that mentions "movie_obj"
      $movieObject: //script[contains(.,"movie_obj")]/text()
    scene:
      Date:
        selector: $table/td[contains(.,"Released")]/following-sibling::td/text()
        postProcess:
          # quoted so the layout stays a string and is never read as a date
          - parseDate: '2006-01-02'
      Performers:
        Name: $table/td[contains(.,"Actress")]/following-sibling::td/a/text()
      Image:
        selector: $movieObject
        postProcess:
          - replace:
              # keep only the "thumbnail" value and prepend "https:" to it
              - regex: '.*thumbnail"\s*:\s*"([^"]+).*'
                with: https:$1
      Title:
        selector: $movieObject
        postProcess:
          - replace:
              # keep only the "name" value
              - regex: '.*name"\s*:\s*"([^"]+).*'
                with: $1
      Code:
        selector: //script[contains(.,"movieId")]/text()
        postProcess:
          - replace:
              # keep only the quoted value assigned to movieId
              - regex: '.*movieId\s*=\s*"([^"]+).*'
                with: $1
      Tags:
        # links from the "Type", "Sex Styles" and "Theme" rows; folded scalar,
        # lines are joined with single spaces into one XPath expression
        Name: >-
          $table/td[contains(.,"Type") or
          contains(.,"Sex Styles") or
          contains(.,"Theme") ]/following-sibling::td/a/text()
      Studio:
        Name:
          fixed: Heyzo
# Last Updated January 16, 2024