# Stash scraper definition for IMDb.
# Binds performer, scene and movie scrape actions to the XPath scrapers
# declared under `xPathScrapers` below.
name: IMDB

# --- Action bindings --------------------------------------------------------

# Performer name search; `{}` is the query placeholder in queryURL.
performerByName:
  action: scrapeXPath
  queryURL: https://www.imdb.com/search/name-text/?bio={}
  scraper: performerSearch

# Performer scrape for URLs matching imdb.com.
performerByURL:
  - action: scrapeXPath
    url:
      - imdb.com
    scraper: performerScraper

# Scene (title) name search.
sceneByName:
  action: scrapeXPath
  queryURL: https://www.imdb.com/find?q={}
  scraper: sceneSearch

# Scrapes the scene's own URL: queryURL is just the `{url}` placeholder.
sceneByQueryFragment:
  action: scrapeXPath
  queryURL: "{url}"
  scraper: sceneScraper

sceneByURL:
  - action: scrapeXPath
    url:
      - imdb.com/title/
    scraper: sceneScraper

# Same URL pattern as sceneByURL, but mapped onto movie fields.
movieByURL:
  - action: scrapeXPath
    url:
      - imdb.com/title/
    scraper: movieScraper

# --- XPath scrapers ---------------------------------------------------------

xPathScrapers:
  # Result list of the performer name search.
  performerSearch:
    common:
      $listAnchor: //div[@class="lister-list"]/div[@class="lister-item mode-detail"]/div[@class="lister-item-content"]/h3/a
    performer:
      Name:
        selector: $listAnchor/text()
      URL:
        selector: $listAnchor/@href
        postProcess:
          # Prefix the scraped href with the site origin.
          - replace:
              - regex: ^
                with: https://www.imdb.com

  # Performer detail page.
  performerScraper:
    performer:
      Name: //td[@id="overview-top" or @class="name-overview-widget__section"]//h1/span[1]/text()
      Birthdate:
        selector: //time/@datetime
        postProcess:
          - parseDate: 2006-1-2
      # Anchored so the scene and movie scrapers can reuse it via *imageAttr.
      Image: &imageAttr
        selector: //meta[@property="og:image"]/@content
        postProcess:
          # Blank out any URL ending in an /imdb*.png path (empty replacement);
          # presumably IMDb's generic logo rather than a real photo - confirm.
          - replace:
              - regex: '.*/imdb[^/]*\.png'
                with:
      Details:
        # selector: //div[@class="name-trivia-bio-text"]/div/text()[1]
        # Take the "see more" link, make it absolute, then scrape the linked
        # page with the subScraper selector.
        selector: //span[@class='see-more inline nobr-only']/a/@href
        postProcess:
          # NOTE(review): if the href is root-relative (starts with "/"), this
          # prefix yields a double slash after the host - confirm it resolves.
          - replace:
              - regex: ^
                with: "https://www.imdb.com/"
          - subScraper:
              selector: //div[@class="soda odd"]/p/text()
      URL: //div[@id='details-official-sites']/a[contains(text(),'Official Site')]/@href
      # Facebook: //div[@id='details-official-sites']/a[contains(text(),'Facebook')]/@href
      Instagram: //div[@id='details-official-sites']/a[contains(@href,"instagram.com/")]/@href
      Aliases:
        selector: //div[@id="details-akas"]/h4/following-sibling::text()
        # Matches are joined with "|" and each run of pipes is then turned
        # into ", ".
        concat: "|"
        postProcess:
          - replace:
              - regex: '\|+'
                with: ", "
      Height:
        selector: //div[@id="details-height"]/h4/following-sibling::text()
        postProcess:
          # Drop the trailing parenthesised part before converting.
          - replace:
              - regex: (.+)\s*\(.+$
                with: $1
          - feetToCm: true

  # Result table of the title search.
  sceneSearch:
    common:
      $scenerow: //h3[text() = "Titles"]/following-sibling::table//tr[contains(@class, 'findResult')]
    scene:
      Title: $scenerow//td[@class='result_text']
      URL:
        selector: $scenerow//td[@class='result_text']/a/@href
        postProcess:
          # Prefix the scraped href with the site origin.
          - replace:
              - regex: ^
                with: https://imdb.com
      Image: $scenerow//td[@class='primary_photo']//img/@src

  # Title page mapped onto scene fields. The anchors defined here (&title,
  # &url, &date, &desc, &studio) are reused by movieScraper below.
  sceneScraper:
    scene:
      Title: &title //section//h1
      URL: &url //meta[@property="og:url"]/@content
      Movies:
        Name: *title
        URL: *url
      Date: &date
        selector: //li[@data-testid='title-details-releasedate']/div/ul/li/a/text()
        postProcess:
          - replace:
              # Strip the trailing parenthesised suffix.
              - regex: '\s*\(.+$'
                with: ""
              # A bare four-digit year becomes January 1st of that year.
              - regex: '^(\d\d\d\d)$'
                with: $1-01-01
          # NOTE(review): a year-only value rewritten to YYYY-01-01 above does
          # not match this layout; presumably it passes through unchanged -
          # confirm against Stash's parseDate behaviour.
          - parseDate: January 2, 2006
      Details: &desc //span[@data-testid="plot-xl"]
      Tags:
        Name: //div[@data-testid="genres"]/a/span
      Performers:
        Name: //a[@data-testid="title-cast-item__actor"]
      Image: *imageAttr
      Studio: &studio
        # First link in the companies row only.
        Name: (//li[@data-testid="title-details-companies"]/div//a)[1]

  # Title page mapped onto movie fields.
  movieScraper:
    movie:
      Name: *title
      URL: *url
      Date: *date
      Director: //section[@data-testid="title-cast"]//li[span[text()="Director"]]//a
      Duration:
        selector: //ul[@data-testid="hero-title-block__metadata"]/li[last()]
        postProcess:
          # Rewrites "1h 30m" / "2h" / "45m" to "1:30:00" / "2:00:00" /
          # "00:45:00"; the rules are order-dependent.
          - replace:
              - regex: (\d+)h\s*
                with: "$1:"
              - regex: (\d+)m
                with: "$1:00"
              - regex: ":$" # only h
                with: ":00:00"
              - regex: ^(\d+:\d+)$ # only m
                with: "00:$1"
      Studio: *studio
      Synopsis: *desc
      FrontImage: *imageAttr

# --- HTTP driver ------------------------------------------------------------

driver:
  headers:
    # Firefox-style User-Agent sent with every request. The stray closing
    # parenthesis that used to trail the version token has been removed.
    - Key: User-Agent
      Value: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0
# Last Updated August 13, 2022