# compose-projects-arr/stash/config/scrapers/community/data18/data18.yml

# Scraper identifier.
name: data18
# Movie lookups by URL: the data18.com/movies URL pattern is handled by the
# `movieScraper` XPath definitions.
movieByURL:
  - action: scrapeXPath
    url:
      - 'data18.com/movies'
    scraper: movieScraper
# Scene lookups by URL.
sceneByURL:
  # Scene pages are handled by the `sceneScraper` XPath definitions.
  - action: scrapeXPath
    url:
      - 'data18.com/scenes'
    scraper: sceneScraper
  # Movie pages are accepted here as well: many people keep single-file
  # movies and prefer to scrape them as scenes rather than creating a
  # Movie object that holds only one scene.
  - action: scrapeXPath
    url:
      - 'data18.com/movies'
    scraper: movieScraper

xPathScrapers:
  # XPath definitions used for data18.com/scenes pages.
  sceneScraper:
    # Shared XPath fragments; each `$name` key is referenced by the selectors
    # in this scraper's `scene` section.
    common:
      # Links with class "bold gen" inside the div headed "Pornstars / Cast".
      $performer: //div[h3[text()='Pornstars / Cast']]//a[@class='bold gen']
      # Link whose href contains "/studios/" inside the #body2div_b div.
      $studio: //div[@id="body2div_b"]//a[contains(@href,"/studios/")]
      # First link following the "Movie:" label.
      $movie: //b[text()="Movie:"]/following-sibling::a[1]
    # Field mappings for a scene page.
    scene:
      Title: //span/following-sibling::h1/a/text()
      Date:
        selector: //span[b[text()="Release date"]]
        postProcess:
          # Reduce the text to "Month DD, YYYY" (or "Month, YYYY" when no day
          # is present), dropping the label and anything after the year.
          # The trailing `.*` also matches when nothing follows the year.
          - replace:
              - regex: '^Release date: ([a-zA-Z]+)(\s*\d\d)?,\s*(\d{4}).*'
                with: "$1$2, $3"
          # One layout per shape the replace above can produce.
          - parseDate: January 02, 2006
          - parseDate: January, 2006
      Details:
        selector: //div[b[text()="Story"]] | //b[contains(text(),"Movie Description")]/../text()
        concat: " "
        postProcess:
          # Strip the "Story - " label (replacement is the empty string).
          - replace:
              - regex: "Story - "
                with: ""
      Tags:
        Name: //b[text()='Categories:']/following-sibling::a
      # $performer, $studio and $movie come from this scraper's `common` section.
      Performers:
        Name: $performer
        URL: $performer/@href
      Studio:
        Name: $studio
        URL: $studio/@href
      Movies:
        Name: $movie/text()
        URL: $movie/@href
      Image: //img[@id="playpriimage"]/@src
  # XPath definitions used for data18.com/movies pages; provides both a
  # `movie` mapping and a `scene` mapping for the same page.
  movieScraper:
    # Shared XPath fragments referenced as `$name` by the selectors below.
    common:
      # The #body2div_b div, used as the root for the Duration/Date selectors.
      $movieInfo: //div[@id="body2div_b"]
      # Link inside the bold element that follows the "Studio" label.
      $studio: //b[text()='Studio']/following-sibling::b/a
      # Links with class "bold gen" inside the div whose h3 mentions
      # "Pornstars / Cast".
      $performer: //div[h3[contains(text(), 'Pornstars / Cast')]]//a[@class='bold gen']
    # Field mappings used when the page is scraped as a Movie.
    movie:
      Name:
        selector: //title
        postProcess:
          # Keep only the part of the page title that precedes
          # " (YYYY) Porn Movie | DATA18".
          - replace:
              - regex: '(.+?)(?:\s\(\d{4}\)\sPorn\sMovie\s\|\sDATA18)'
                with: $1
      Duration:
        # Either the span or the bare text node following the "Length" label.
        selector: $movieInfo//b[contains(text(),"Length")]/following-sibling::span|$movieInfo//b[contains(text(),"Length")]/following-sibling::text()
        postProcess:
          - replace:
              # Handle movies with a proper [xx:xx:xx] duration: drop the brackets.
              - regex: '^\[(.+)\]$'
                with: $1
              # Handle movies with only an "xx min" duration: emit "xx:00".
              - regex: '^[^\d]*(\d+)\s*min.*'
                with: "$1:00"
      Date:
        selector: $movieInfo//span[contains(text(), "Release date")]/text()
        postProcess:
          # Remove the "Release date:" label, then parse "Month, YYYY".
          - replace:
              - regex: 'Release date:\s*'
                with: ""
          - parseDate: January, 2006
      Studio:
        Name: $studio/text()
        URL: $studio/@href
      Director: //p[b[contains(text(),'Director')]]//a[@class='bold']
      Synopsis:
        # Parent element of the "Description" label, joined with spaces.
        selector: //b[text()="Description"]/..
        concat: " "
        postProcess:
          # Strip the leading "Description - " label.
          - replace:
              - regex: '^Description\s*-\s*'
                with: ""
      # Cover images: front from the #enlargecover link's data-featherlight
      # attribute, back from the href of the "+Back" link.
      FrontImage: //a[@id='enlargecover']/@data-featherlight
      BackImage: //a[text()='+Back']/@href
    # Field mappings used when the same movie page is scraped as a Scene.
    scene:
      Title:
        selector: //title
        postProcess:
          # Keep only the part of the page title that precedes
          # " (YYYY) Porn Movie | DATA18".
          - replace:
              - regex: '(.+?)(?:\s\(\d{4}\)\sPorn\sMovie\s\|\sDATA18)'
                with: $1
      Date:
        selector: $movieInfo//span[contains(text(), "Release date")]/text()
        postProcess:
          # Remove the "Release date:" label, then parse "Month, YYYY".
          - replace:
              - regex: 'Release date:\s*'
                with: ""
          - parseDate: January, 2006
      Studio:
        Name: $studio/text()
        URL: $studio/@href
      Director: //p[b[contains(text(),'Director')]]//a[@class='bold']
      Performers:
        Name: $performer
        URL: $performer/@href
      Details:
        # Parent element of the "Description" label, joined with spaces.
        selector: //b[text()="Description"]/..
        concat: " "
        postProcess:
          # Strip the leading "Description - " label.
          - replace:
              - regex: '^Description\s*-\s*'
                with: ""
      # Front cover, taken from the #enlargecover link's data-featherlight attribute.
      Image: //a[@id='enlargecover']/@data-featherlight
# Request settings applied when fetching pages.
driver:
  cookies:
    # Cookie set for https://data18.com.
    # NOTE(review): presumably marks the site's captcha check as passed;
    # confirm against the site.
    - CookieURL: "https://data18.com"
      Cookies:
        - Name: "data_user_captcha"
          Domain: ".data18.com"
          Value: "1"
          Path: "/"
  headers:
    # Desktop Firefox User-Agent string with balanced parentheses.
    - Key: User-Agent
      Value: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0'
# Last Updated July 18, 2023