# compose-projects-arr/stash/config/scrapers/community/DLsite/DLsite.yml

# Scraper definition for DLsite. Every action declared in this file uses
# scrapeXPath and delegates field extraction to one of the xPathScrapers.
name: DLsite
sceneByFragment:
  # The scene title has to be the product code (an RJ code); it is substituted
  # for {title} to form the work-page address that sceneScraper then parses.
  action: scrapeXPath
  scraper: sceneScraper
  queryURL: "https://www.dlsite.com/maniax/work/=/product_id/{title}.html"
  # Disabled alternative that adds a locale=en_US query parameter:
  # queryURL: "https://www.dlsite.com/maniax/work/=/product_id/{title}.html/?locale=en_US"
sceneByURL:
  # Scene pages under the maniax work path: the URL is fetched unchanged and
  # parsed by sceneScraper.
  - url:
      - "dlsite.com/maniax/work/"
    action: scrapeXPath
    queryURL: '{url}'
    scraper: sceneScraper
sceneByName:
  # Keyword search against the maniax listing; the result page is parsed by
  # sceneQueryScraper.
  # NOTE(review): {} is presumably where the search text is inserted - confirm.
  scraper: sceneQueryScraper
  action: scrapeXPath
  queryURL: "https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category%5B0%5D/male/keyword/{}/order%5B0%5D/trend/per_page/30/from/fs.header"
sceneByQueryFragment:
  # Fetches the fragment's own URL as-is and parses it with sceneScraper.
  scraper: sceneScraper
  action: scrapeXPath
  queryURL: '{url}'
  # Disabled rewrite that would append "/?locale=en_US" to the URL:
  # queryURLReplace:
  #   url:
  #     - regex: $
  #       with: "/?locale=en_US"
galleryByFragment:
  # Gallery counterpart of the scene fragment lookup: {title} is substituted
  # into the maniax work-page address, parsed by galleryManiaxScraper.
  action: scrapeXPath
  scraper: galleryManiaxScraper
  queryURL: "https://www.dlsite.com/maniax/work/=/product_id/{title}.html"
  # Disabled alternative that adds a locale=en_US query parameter:
  # queryURL: "https://www.dlsite.com/maniax/work/=/product_id/{title}.html/?locale=en_US"
galleryByURL:
  # maniax work pages -> galleryManiaxScraper
  - url:
      - "dlsite.com/maniax/work/"
    action: scrapeXPath
    queryURL: '{url}'
    scraper: galleryManiaxScraper
  # books work pages -> galleryBookScraper
  - url:
      - "dlsite.com/books/work/"
    action: scrapeXPath
    queryURL: '{url}'
    scraper: galleryBookScraper

xPathScrapers:
  # Used by sceneByName: pulls title, link and thumbnail out of the listing
  # page returned by the keyword search.
  sceneQueryScraper:
    scene:
      Image: "//a[@class='work_thumb_inner']/img/@src"
      URL: "//dd[@class='work_name']/div[@class='multiline_truncate']/a/@href"
      Title: "//dd[@class='work_name']/div[@class='multiline_truncate']/a/text()"
  # Extracts scene metadata from a single work page. The anchors defined here
  # (titleSel, dateSel, detailsSel, tagsSel, studioSel, urlSel) are aliased by
  # the gallery scrapers further down, so keep the anchor names stable.
  sceneScraper:
    scene:
      # Work title: text of the <h1 id="work_name"> heading.
      Title: &titleSel //h1[@id='work_name']/text()
      # Release date: text of the table cell next to the header labelled
      # '販売日' or 'Release date'.
      Date: &dateSel
        selector: "//th[text() = '販売日']/following-sibling::td//text()|//th[text() = 'Release date']/following-sibling::td//text()"
        postProcess:
          # The replacements run in the order listed; do not reorder them.
          - replace:
              # Step 1: turn English month abbreviations into two-digit numbers.
              - regex: "Jan"
                with: "01"
              - regex: "Feb"
                with: "02"
              - regex: "Mar"
                with: "03"
              - regex: "Apr"
                with: "04"
              - regex: "May"
                with: "05"
              - regex: "Jun"
                with: "06"
              - regex: "Jul"
                with: "07"
              - regex: "Aug"
                with: "08"
              - regex: "Sep"
                with: "09"
              - regex: "Oct"
                with: "10"
              - regex: "Nov"
                with: "11"
              - regex: "Dec"
                with: "12"
              # Step 2: drop a whitespace character and everything after it.
              - regex: \s.+$
                with: ""
              # Step 3: rewrite NN/NN/NNNN as NNNN年NN月NN日 (third group first,
              # then the first and second), matching the layout parsed below.
              - regex: (\d{2})/(\d{2})/(\d{4})
                with: "${3}年${1}月${2}日"
          # Parse the normalised string using the year年month月day日 layout.
          - parseDate: 2006年01月02日
      # Description: every div whose class contains "work_parts_area".
      # NOTE(review): concat presumably joins multiple matches with a blank
      # line between them - confirm against the scraper documentation.
      Details: &detailsSel
        selector: //div[contains(@class,"work_parts_area")]
        concat: "\n\n"
      # Tags: link texts inside the div with class "main_genre".
      Tags: &tagsSel
        Name: //div[@class='main_genre']/a/text()
      Performers:
        # only illustrators and voice actors, writers and musicians are not included
        Name: "//th[text() = '声優']/following-sibling::td/a/text()|//th[text() = 'Voice Actor']/following-sibling::td/a/text()|//th[text() = 'イラスト']/following-sibling::td/a/text()|//th[text() = 'Illustration']/following-sibling::td/a/text()"
      # Studio: link text inside the span with class "maker_name".
      Studio: &studioSel
        Name: //span[@class='maker_name']/a/text()
      # Cover image: srcset of the image in the active slider item, with
      # "https:" inserted at the start of the value.
      # NOTE(review): presumably the attribute is protocol-relative ("//...")
      # - confirm.
      Image:
        selector: //li[@class='slider_item active']/picture/img/@srcset
        postProcess:
          - replace:
              - regex: ^
                with: "https:"
      # Page URL taken from the og:url meta tag.
      URL: &urlSel //meta[@property='og:url']/@content
  # Gallery metadata for maniax work pages. Every field except Performers
  # reuses a selector anchored in sceneScraper.
  galleryManiaxScraper:
    gallery:
      URL: *urlSel
      Title: *titleSel
      Studio: *studioSel
      Date: *dateSel
      Tags: *tagsSel
      Details: *detailsSel
      Performers:
        # Illustrators only (Japanese and English column labels).
        Name: "//th[text() = 'イラスト']/following-sibling::td/a/text()|//th[text() = 'Illustration']/following-sibling::td/a/text()"
  # Gallery metadata for books work pages. Every field except Performers
  # reuses a selector anchored in sceneScraper.
  galleryBookScraper:
    gallery:
      URL: *urlSel
      Title: *titleSel
      Studio: *studioSel
      Date: *dateSel
      Tags: *tagsSel
      Details: *detailsSel
      Performers:
        # On maniax pages the author column lists the writer; on books pages
        # it lists the illustrators, so it is used as the performer source.
        Name: "//th[text() = '著者']/following-sibling::td/a/text()|//th[text() = 'Author']/following-sibling::td/a/text()"
# Last Updated June 19, 2022