# 146 lines, 4.4 KiB, YAML
# Scraper definition for IMDB (presumably consumed by Stash's scraper
# loader -- confirm against the host application).
name: IMDB

# Performer lookup by name: the query URL is scraped with the
# `performerSearch` XPath scraper defined under `xPathScrapers`.
# NOTE(review): `{}` is presumably the placeholder for the search term.
performerByName:
  action: scrapeXPath
  scraper: performerSearch
  queryURL: https://www.imdb.com/search/name-text/?bio={}
# Performer scrape from a URL: pages whose URL matches `imdb.com` are
# handled by the `performerScraper` XPath scraper.
performerByURL:
  - action: scrapeXPath
    scraper: performerScraper
    url:
      - imdb.com
# Scene lookup by name: the IMDB "find" page is scraped with `sceneSearch`.
# NOTE(review): `{}` is presumably the placeholder for the search term.
sceneByName:
  action: scrapeXPath
  scraper: sceneSearch
  queryURL: https://www.imdb.com/find?q={}
# Scene scrape from a query fragment: the fragment's `url` field is used
# verbatim as the page to fetch, then scraped with `sceneScraper`.
# The value must stay quoted -- an unquoted `{` would start a YAML flow mapping.
sceneByQueryFragment:
  action: scrapeXPath
  scraper: sceneScraper
  queryURL: "{url}"
# Scene scrape from a URL: IMDB title pages are handled by `sceneScraper`.
sceneByURL:
  - action: scrapeXPath
    scraper: sceneScraper
    url:
      - imdb.com/title/
# Movie scrape from a URL: the same IMDB title pages as `sceneByURL`,
# but mapped onto movie fields by `movieScraper`.
movieByURL:
  - action: scrapeXPath
    scraper: movieScraper
    url:
      - imdb.com/title/
# XPath scraper definitions referenced by the `scraper:` keys of the
# actions above. Several field mappings are shared between scrapers via
# YAML anchors (&name) and aliases (*name), so the scrapers must stay in
# this one document and the anchors must be defined before they are used.
xPathScrapers:
  # Search-result list for performers; yields one Name/URL pair per hit.
  performerSearch:
    common:
      $listAnchor: //div[@class="lister-list"]/div[@class="lister-item mode-detail"]/div[@class="lister-item-content"]/h3/a
    performer:
      Name:
        selector: $listAnchor/text()
      URL:
        selector: $listAnchor/@href
        postProcess:
          # The href is site-relative, so prefix it with the host.
          - replace:
              - regex: ^
                with: https://www.imdb.com

  # Performer detail page.
  performerScraper:
    performer:
      Name: //td[@id="overview-top" or @class="name-overview-widget__section"]//h1/span[1]/text()
      Birthdate:
        selector: //time/@datetime
        postProcess:
          # NOTE(review): the layout looks like a Go reference-time layout
          # (non-zero-padded month/day) -- confirm against the consumer.
          - parseDate: 2006-1-2
      # Anchored so the scene and movie scrapers below can reuse it.
      Image: &imageAttr
        selector: //meta[@property="og:image"]/@content
        postProcess:
          - replace:
              # Blank out values ending in an `imdb*.png` file name
              # (presumably the site's placeholder logo -- confirm).
              - regex: '.*/imdb[^/]*\.png'
                with: ""
      Details:
        # selector: //div[@class="name-trivia-bio-text"]/div/text()[1]
        selector: //span[@class='see-more inline nobr-only']/a/@href
        postProcess:
          # Turn the link into an absolute URL. An optional leading slash
          # is consumed so the result never contains `//` after the host.
          - replace:
              - regex: '^/?'
                with: "https://www.imdb.com/"
          # Follow that URL and take the text from the linked page.
          - subScraper:
              selector: //div[@class="soda odd"]/p/text()

      URL: //div[@id='details-official-sites']/a[contains(text(),'Official Site')]/@href
      # Facebook: //div[@id='details-official-sites']/a[contains(text(),'Facebook')]/@href
      Instagram: //div[@id='details-official-sites']/a[contains(@href,"instagram.com/")]/@href
      Aliases:
        selector: //div[@id="details-akas"]/h4/following-sibling::text()
        concat: "|"
        postProcess:
          # Matches are joined with "|" above; collapse runs of that
          # separator into a comma-separated list.
          - replace:
              - regex: '\|+'
                with: ", "
      Height:
        selector: //div[@id="details-height"]/h4/following-sibling::text()
        postProcess:
          # Keep only the part before the parenthesised suffix, then
          # convert the remaining feet/inches value.
          - replace:
              - regex: (.+)\s*\(.+$
                with: $1
          - feetToCm: true

  # Title search results; one row per hit in the "Titles" table.
  sceneSearch:
    common:
      $scenerow: //h3[text() = "Titles"]/following-sibling::table//tr[contains(@class, 'findResult')]
    scene:
      Title: $scenerow//td[@class='result_text']
      URL:
        selector: $scenerow//td[@class='result_text']/a/@href
        postProcess:
          # Same host as every other URL in this file, so the generated
          # link does not rely on a redirect from the bare domain.
          - replace:
              - regex: ^
                with: https://www.imdb.com
      Image: $scenerow//td[@class='primary_photo']//img/@src

  # Title detail page mapped onto scene fields. The anchors defined here
  # are reused by `movieScraper`.
  sceneScraper:
    scene:
      Title: &title //section//h1
      URL: &url //meta[@property="og:url"]/@content
      Movies:
        Name: *title
        URL: *url
      Date: &date
        selector: //li[@data-testid='title-details-releasedate']/div/ul/li/a/text()
        postProcess:
          - replace:
              # Drop the trailing parenthesised part of the release date.
              - regex: '\s*\(.+$'
                with: ""
              # A bare four-digit year becomes January 1st of that year.
              - regex: '^(\d\d\d\d)$'
                with: $1-01-01
          # NOTE(review): a year-only value rewritten above no longer
          # matches this layout -- confirm the consumer passes it through.
          - parseDate: January 2, 2006
      Details: &desc //span[@data-testid="plot-xl"]
      Tags:
        Name: //div[@data-testid="genres"]/a/span
      Performers:
        Name: //a[@data-testid="title-cast-item__actor"]
      Image: *imageAttr
      Studio: &studio
        Name: (//li[@data-testid="title-details-companies"]/div//a)[1]

  # Title detail page mapped onto movie fields, reusing the scene anchors.
  movieScraper:
    movie:
      Name: *title
      URL: *url
      Date: *date
      Director: //section[@data-testid="title-cast"]//li[span[text()="Director"]]//a
      Duration:
        selector: //ul[@data-testid="hero-title-block__metadata"]/li[last()]
        postProcess:
          # Rewrites "1h 30m" / "2h" / "45m" into HH:MM:SS form; the
          # rules are applied in order and each works on the previous
          # rule's output.
          - replace:
              - regex: (\d+)h\s*
                with: "$1:"
              - regex: (\d+)m
                with: "$1:00"
              - regex: ":$" # only h
                with: ":00:00"
              - regex: ^(\d+:\d+)$ # only m
                with: "00:$1"
      Studio: *studio
      Synopsis: *desc
      FrontImage: *imageAttr
# HTTP request settings: send a desktop Firefox User-Agent header with
# every request. The value is a well-formed Firefox UA string (the
# previous value ended with a stray, unbalanced ")").
driver:
  headers:
    - Key: User-Agent
      Value: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0"
# Last Updated August 13, 2022