compose-projects-arr/stash/config/scrapers/community/WikiData/WikiData.yml

# Scraper definition for performer data published on Wikidata.
name: Wikidata

# Search performers by name through the Wikidata SPARQL endpoint.
# "{}" in the URL is the placeholder for the search term. The URL-encoded
# query decodes to:
#
#   SELECT ?pornographic_actor ?pornographic_actorLabel WHERE {
#     SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
#     ?pornographic_actor wdt:P106 wd:Q488111;
#               rdfs:label ?label.
#     FILTER(LANG(?label) = "en").
#     FILTER(STRSTARTS(lcase(?label), lcase("{}")))}
#
# i.e. a case-insensitive prefix match on the English label of items whose
# P106 value is Q488111.
performerByName:
  action: scrapeJson
  scraper: performerSearch
  queryURL: 'https://query.wikidata.org/sparql?query=SELECT%20%3Fpornographic_actor%20%3Fpornographic_actorLabel%20WHERE%20%7B%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22.%20%7D%0A%20%20%3Fpornographic_actor%20wdt%3AP106%20wd%3AQ488111%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20rdfs%3Alabel%20%3Flabel.%0A%20%20FILTER(LANG(%3Flabel)%20%3D%20%22en%22).%0A%20%20FILTER(STRSTARTS(lcase(%3Flabel)%2C%20lcase(%22{}%22)))%7D&format=json'
# Scrape a performer from a Wikidata item page URL. The page URL is rewritten
# into the matching Special:EntityData JSON URL, which performerScraper reads.
performerByURL:
  - action: scrapeJson
    url:
      - https://www.wikidata.org/wiki/Q
    queryURL: "{url}"
    queryURLReplace:
      url:
        # Drop any "?query" or "#fragment" suffix first; otherwise ".json"
        # would be appended after it instead of after the item id.
        - regex: '[?#].*$'
          with: ""
        # Dots are escaped so they match only a literal ".".
        - regex: 'https://www\.wikidata\.org/wiki/'
          with: https://www.wikidata.org/wiki/Special:EntityData/
        # Append the extension that selects the JSON representation.
        - regex: $
          with: .json
    scraper: performerScraper
jsonScrapers:
  # Maps the SPARQL JSON result of performerByName onto search results:
  # one entry per element of results.bindings.
  performerSearch:
    performer:
      Name: results.bindings.#.pornographic_actorLabel.value
      # The query returns entity URIs (http://www.wikidata.org/entity/Q...);
      # rewrite them to the item page URL form that performerByURL matches.
      URL:
        selector: results.bindings.#.pornographic_actor.value
        postProcess:
          - replace:
              - regex: 'http:\/\/www.wikidata.org\/entity\/'
                with: 'https://www.wikidata.org/wiki/'
  # Maps a Special:EntityData JSON document onto performer fields.
  # "entities.*" addresses the entity object without naming its id.
  performerScraper:
    performer:
      Name: entities.*.labels.en.value
      # English aliases, joined into a single comma-separated string.
      Aliases:
        concat: ', '
        selector: entities.*.aliases.en.#.value
      # P18 values are file names; turn each one into a Special:Redirect/file
      # URL on Wikimedia Commons.
      Image:
        selector: entities.*.claims.P18.#.mainsnak.datavalue.value
        postProcess:
          - replace:
              - regex: '\s'
                with: '%20' # spaces cause 400 error
              - regex: '^'
                with: 'https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/'
      # Weight (P2067). Only the amount is read; the unit of the claim is not
      # inspected, so the value is passed through in whatever unit Wikidata
      # stores -- NOTE(review): presumably kilograms, confirm.
      Weight:
        selector: entities.*.claims.P2067.#.mainsnak.datavalue.value.amount
        postProcess:
          - replace:
              # Strip the "+" sign from the amount. The replacement is written
              # as an explicit empty string instead of a bare (null) value.
              - regex: '\+'
                with: ""
      # Birth date (P569): keep only the YYYY-MM-DD part of the time value.
      Birthdate:
        selector: entities.*.claims.P569.#.mainsnak.datavalue.value.time
        postProcess:
          - replace:
              - regex: '.*(\d{4}-\d{1,2}-\d{1,2}).*'
                with: '$1'
      # Death date (P570): same extraction as Birthdate.
      DeathDate:
        selector: entities.*.claims.P570.#.mainsnak.datavalue.value.time
        postProcess:
          - replace:
              - regex: '.*(\d{4}-\d{1,2}-\d{1,2}).*'
                with: '$1'
      # Height (P2048). Only the amount is read, so the claim's unit is never
      # checked. The rules below assume the amount is in metres ("+1.65") or
      # already in centimetres ("+165") and produce a whole centimetre figure
      # -- TODO confirm behaviour for items that use other units.
      Height:
        selector: entities.*.claims.P2048.#.mainsnak.datavalue.value.amount
        postProcess:
          - replace:
              # Strip the "+" sign from the amount.
              - regex: '\+'
                with: ""
              # "1.7" -> "1.70": pad one-decimal values so that removing the
              # point below yields 170 rather than 17.
              - regex: '^(\d)\.(\d)$'
                with: '${1}.${2}0'
              # "1.6764" -> "1.67": keep two decimals only (truncates).
              - regex: '^(\d\.\d\d)\d+$'
                with: '$1'
              # "165.1" -> "165": drop the fraction of centimetre values.
              - regex: '^(\d{2,3})\.\d+$'
                with: '$1'
              # "1.65" -> "165": remove the decimal point.
              - regex: '\.'
                with: ""
      # Career length is filled from P2031; only a four-digit year is kept
      # from the time value.
      CareerLength:
        selector: entities.*.claims.P2031.#.mainsnak.datavalue.value.time
        postProcess:
          - replace:
              - regex: '.*(\d{4}).*'
                with: '$1'
      # The next five fields reference other Wikidata items by numeric id.
      # Each one wraps the id into that item's Special:EntityData JSON URL
      # ("Q" + id + ".json") and uses a sub-scraper to read the English label.

      # Gender (P21).
      Gender:
        selector: entities.*.claims.P21.#.mainsnak.datavalue.value.numeric-id
        postProcess:
          - replace:
              - regex: '^'
                with: 'https://www.wikidata.org/wiki/Special:EntityData/Q'
              - regex: '$'
                with: '.json'
          - subScraper:
              selector: entities.*.labels.en.value
      # Hair colour (P1884).
      HairColor:
        selector: entities.*.claims.P1884.#.mainsnak.datavalue.value.numeric-id
        postProcess:
          - replace:
              - regex: '^'
                with: 'https://www.wikidata.org/wiki/Special:EntityData/Q'
              - regex: '$'
                with: '.json'
          - subScraper:
              selector: entities.*.labels.en.value
      # Eye colour (P1340).
      EyeColor:
        selector: entities.*.claims.P1340.#.mainsnak.datavalue.value.numeric-id
        postProcess:
          - replace:
              - regex: '^'
                with: 'https://www.wikidata.org/wiki/Special:EntityData/Q'
              - regex: '$'
                with: '.json'
          - subScraper:
              selector: entities.*.labels.en.value
      # Ethnicity (P172).
      Ethnicity:
        selector: entities.*.claims.P172.#.mainsnak.datavalue.value.numeric-id
        postProcess:
          - replace:
              - regex: '^'
                with: 'https://www.wikidata.org/wiki/Special:EntityData/Q'
              - regex: '$'
                with: '.json'
          - subScraper:
              selector: entities.*.labels.en.value
      # Country (P27).
      Country:
        selector: entities.*.claims.P27.#.mainsnak.datavalue.value.numeric-id
        postProcess:
          - replace:
              - regex: '^'
                with: 'https://www.wikidata.org/wiki/Special:EntityData/Q'
              - regex: '$'
                with: '.json'
          - subScraper:
              selector: entities.*.labels.en.value
# Personal preference: keep the Wikidata URL instead of the performer's official website.
#      URL:
#        selector: entities.*.claims.P856.#.mainsnak.datavalue.value
      # P2002 holds a bare account name; prefix it to form a profile URL.
      Twitter:
        selector: entities.*.claims.P2002.#.mainsnak.datavalue.value
        postProcess:
          - replace:
              - regex: '^'
                with: 'https://twitter.com/'
      # P2003 holds a bare account name; prefix it to form a profile URL.
      Instagram:
        selector: entities.*.claims.P2003.#.mainsnak.datavalue.value
        postProcess:
          - replace:
              - regex: '^'
                with: 'https://www.instagram.com/'
      # Details come from the English Wikipedia article linked to the item:
      # the sitelink title is turned into a Wikipedia API "extracts" request
      # and the plain-text extract is read from the response by a sub-scraper.
      Details:
        selector: entities.*.sitelinks.enwiki.title
        postProcess:
          - replace:
              # Use underscores in place of spaces in the title.
              - regex: ' '
                with: '_'
              - regex: '^'
                with: 'https://en.wikipedia.org/w/api.php?action=query&origin=*&prop=extracts&explaintext&titles='
              - regex: '$'
                with: '&format=json'
          - subScraper:
              selector: query.pages.*.extract


# Last Updated August 29, 2021