Files
compose-projects-arr/stash/config/scrapers/community/Boobpedia/Boobpedia.yml
Christoph Califice 0a5f88d75a stash
2025-10-10 09:50:30 -03:00

448 lines
17 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Scraper identifier.
name: Boobpedia

# URL lookup: pages whose address contains one of the `url` fragments are
# handled by the XPath scraper named in `scraper`.
performerByURL:
  - action: scrapeXPath
    url:
      - boobpedia.com/boobs/
    scraper: performerScraper
# Name lookup: queries the wiki's api.php search endpoint (JSON output,
# title search) and passes the response to the `performerSearch` JSON scraper.
# `{}` in queryURL is presumably the placeholder for the search term — confirm
# against the scraper documentation.
performerByName:
  action: scrapeJson
  queryURL: "https://www.boobpedia.com/wiki/api.php?action=query&format=json&list=search&srwhat=title&srsearch={}"
  scraper: performerSearch
# JSON scraper referenced by performerByName.
jsonScrapers:
  performerSearch:
    performer:
      # Title of every entry in query.search.
      Name: query.search.#.title
      # Build the page URL from the same title.
      URL:
        selector: query.search.#.title
        postProcess:
          - replace:
              # Whitespace in the title becomes "_".
              - regex: \s
                with: "_"
              # Prepend the page base URL.
              - regex: ^
                with: "https://www.boobpedia.com/boobs/"
# XPath scraper referenced by performerByURL. Most fields locate a table row by
# its bold label (e.g. <b>Born</b>) and read the sibling cell(s) that follow.
xPathScrapers:
  performerScraper:
    performer:
      Name: //h1
      Twitter: //table//tr/td/a[b[text()='Twitter']]/@href
      Instagram: //table//tr/td/a[b[text()='Instagram']]/@href
      Birthdate:
        selector: //table//tr/td//b[text()='Born']/../following-sibling::td/a
        concat: " "
        postProcess:
          - replace:
              # Cut everything after the first (greedy: last) four-digit run.
              - regex: (.*\d\d\d\d).*
                with: $1
          - parseDate: January 2 2006
      Ethnicity:
        selector: //table//tr/td/b[text()='Ethnicity']/../following-sibling::td/a
        postProcess:
          - replace:
              # Remove reference markers such as [1]
              - regex: \[\d*\]
                with: ""
      EyeColor:
        selector: //table//tr/td/b[text()='Eye color']/../following-sibling::td
        postProcess:
          - replace:
              # Remove reference markers such as [1]
              - regex: \[\d*\]
                with: ""
      Height:
        selector: //table//tr/td/b[text()='Height']/../following-sibling::td
        postProcess:
          - replace:
              # Keep the decimal figure that precedes "m" (e.g. 1.70) ...
              - regex: (?:.+\D)?(\d+\.\d+)\Dm.+
                with: $1
              # ... then drop the decimal point (1.70 -> 170).
              - regex: \.
                with: ""
      Weight:
        selector: //table//tr/td/b[text()='Weight']/../following-sibling::td
        postProcess:
          - replace:
              # Keep the integer that precedes "kg".
              - regex: (?:.+\D)?(\d+)\Dkg.+
                with: $1
      Measurements:
        # Two cells are selected (measurements row and the "cup" row) and
        # joined with "|" so one regex can combine them.
        selector: //table//tr/td/b[text()='Measurements']/../following-sibling::td|//table//tr/td[contains(b,'cup')]/following-sibling::td
        concat: "|"
        postProcess:
          - replace:
              - regex: (\d+)-(\d+)-(\d+)[^|]*\|(\d+\S+).+ # get measurements + cup
                with: $4-$2-$3
              - regex: \|.+$ # fallback to clear non matching regexes
                with: ""
              - regex: \[\d*\] # Remove References
                with: ""
              - regex: ( in) # Remove Unit Inches
                with: ""
      FakeTits:
        selector: //table//tr/td/b[text()='Boobs']/../following-sibling::td/a
        postProcess:
          - replace:
              - regex: \[\d*\] # Remove References
                with: ""
          - map:
              "Enhanced": "Fake"
              "Natural": "Natural"
      HairColor:
        selector: //table//tr/td[contains(b,'Hair')]/following-sibling::td//text()
        concat: ", "
        postProcess:
          - replace:
              # Tidy up separators produced by joining the text nodes.
              - regex: (,,)
                with: ","
              - regex: ( , )
                with: " "
              - regex: \[\d*\]
                with: ""
      # nbsp; screws up the parsing, so use contains instead
      CareerLength:
        selector: //table//tr/td/b[text()[contains(.,'active')]]/../following-sibling::td
        postProcess:
          - replace:
              - regex: \[\d*\] # Remove References
                with: ""
              # Normalise em dash (U+2014) and en dash (U+2013) to "-".
              # Written as escapes so the characters cannot be lost or confused
              # with a plain hyphen. The previous pattern "(—|)" had an empty
              # alternative, which matches at every position.
              # NOTE(review): the en dash is assumed to be the character that
              # went missing from the old pattern - confirm.
              - regex: "[\u2014\u2013]"
                with: "-"
              # Put spaces around a hyphen that sits between two non-space chars.
              - regex: (\S)-(\S)
                with: $1 - $2
              - regex: (?i)(present|current)
                with: ""
      Aliases:
        selector: //table//tr/td/b[text()[contains(.,'known')]]/../following-sibling::td
        postProcess:
          - replace:
              - regex: \[\d*\]
                with: ""
      Image:
        # selector: //table[@class="infobox"]//img/@src  # alternative image, no need for subScraper but gets lq image
        selector: //table[@class="infobox plainlinks"]//a[img[@src]]/@href
        postProcess:
          # Make the file-page link absolute ...
          - replace:
              - regex: ^
                with: https://www.boobpedia.com
          # ... then read the image link from that page and make it absolute too.
          - subScraper:
              selector: //div[@id="file"]/a/@href
              postProcess:
                - replace:
                    - regex: ^
                      with: https://www.boobpedia.com
      URL:
        # Rebuild the page URL from the wgPageName value found in a <script>.
        selector: //script[contains(.,"wgPageName")]
        postProcess:
          - replace:
              - regex: '.+wgPageName":"([^"]+)".+'
                with: "https://www.boobpedia.com/boobs/$1"
      Details:
        selector: //div[@class="mw-parser-output"]/p
        concat: "\n\n"
        postProcess:
          - replace:
              # Remove References
              - regex: \[\d*\]
                with: ""
              # Remove <protect> and </protect>, which appears in the details of some performers (e.g. Jenna Jameson)
              - regex: </?protect>
                with: ""
              # Remove triple line breaks
              - regex: \n\n\n
                with: "\n"
      Country:
        # Maps the nationality adjective shown on the page to a country name.
        selector: //table//tr/td/b[text()='Nationality']/../following-sibling::td/a
        postProcess:
          - map:
              "Abkhaz": "Abkhazia"
              "Abkhazian": "Abkhazia"
              "Afghan": "Afghanistan"
              "Albanian": "Albania"
              "Algerian": "Algeria"
              "American Samoan": "American Samoa"
              "American": "United States of America"
              "Andorran": "Andorra"
              "Angolan": "Angola"
              "Anguillan": "Anguilla"
              "Antarctic": "Antarctica"
              "Antiguan": "Antigua and Barbuda"
              "Argentine": "Argentina"
              "Argentinian": "Argentina"
              "Armenian": "Armenia"
              "Aruban": "Aruba"
              "Australian": "Australia"
              "Austrian": "Austria"
              "Azerbaijani": "Azerbaijan"
              "Azeri": "Azerbaijan"
              "Bahamian": "Bahamas"
              "Bahraini": "Bahrain"
              "Bangladeshi": "Bangladesh"
              "Barbadian": "Barbados"
              "Barbudan": "Antigua and Barbuda"
              "Basotho": "Lesotho"
              "Belarusian": "Belarus"
              "Belgian": "Belgium"
              "Belizean": "Belize"
              "Beninese": "Benin"
              "Beninois": "Benin"
              "Bermudan": "Bermuda"
              "Bermudian": "Bermuda"
              "Bhutanese": "Bhutan"
              "BIOT": "British Indian Ocean Territory"
              "Bissau-Guinean": "Guinea-Bissau"
              "Bolivian": "Bolivia"
              "Bonaire": "Bonaire"
              "Bonairean": "Bonaire"
              "Bosnian": "Bosnia and Herzegovina"
              "Botswanan": "Botswana"
              "Bouvet Island": "Bouvet Island"
              "Brazilian": "Brazil"
              "British Virgin Island": "Virgin Islands British"
              "British": "United Kingdom"
              "Bruneian": "Brunei"
              "Bulgarian": "Bulgaria"
              "Burkinabé": "Burkina Faso"
              # NOTE(review): "Burmese" -> "Burma" while "Myanma" -> "Myanmar"
              # further down; confirm which name the consumer expects.
              "Burmese": "Burma"
              "Burundian": "Burundi"
              "Cabo Verdean": "Cabo Verde"
              "Cambodian": "Cambodia"
              "Cameroonian": "Cameroon"
              "Canadian": "Canada"
              "Cantonese": "Hong Kong"
              "Caymanian": "Cayman Islands"
              "Central African": "Central African Republic"
              "Chadian": "Chad"
              "Channel Island": "Guernsey"
              # "Channel Island": "Jersey"  (kept disabled: it would duplicate the key above)
              "Chilean": "Chile"
              "Chinese": "China"
              "Christmas Island": "Christmas Island"
              "Cocos Island": "Cocos (Keeling) Islands"
              "Colombian": "Colombia"
              "Comoran": "Comoros"
              "Comorian": "Comoros"
              "Congolese": "Congo"
              "Cook Island": "Cook Islands"
              "Costa Rican": "Costa Rica"
              "Croatian": "Croatia"
              "Cuban": "Cuba"
              "Curaçaoan": "Curaçao"
              "Cypriot": "Cyprus"
              "Czech": "Czech Republic"
              "Danish": "Denmark"
              "Djiboutian": "Djibouti"
              # NOTE(review): "Dominican" is also the adjective for the
              # Dominican Republic; confirm "Dominica" is the intended target.
              "Dominican": "Dominica"
              "Dutch": "Netherlands"
              "Ecuadorian": "Ecuador"
              "Egyptian": "Egypt"
              "Emirati": "United Arab Emirates"
              "Emiri": "United Arab Emirates"
              "Emirian": "United Arab Emirates"
              "English people": "England"
              "English": "England"
              "Equatoguinean": "Equatorial Guinea"
              "Equatorial Guinean": "Equatorial Guinea"
              "Eritrean": "Eritrea"
              "Estonian": "Estonia"
              "Ethiopian": "Ethiopia"
              "European": "European Union"
              "Falkland Island": "Falkland Islands"
              "Faroese": "Faroe Islands"
              "Fijian": "Fiji"
              "Filipino": "Philippines"
              "Finnish": "Finland"
              "Formosan": "Taiwan"
              "French Guianese": "French Guiana"
              "French Polynesian": "French Polynesia"
              "French Southern Territories": "French Southern Territories"
              "French": "France"
              "Futunan": "Wallis and Futuna"
              "Gabonese": "Gabon"
              "Gambian": "Gambia"
              "Georgian": "Georgia"
              "German": "Germany"
              "Ghanaian": "Ghana"
              "Gibraltar": "Gibraltar"
              "Greek": "Greece"
              "Greenlandic": "Greenland"
              "Grenadian": "Grenada"
              "Guadeloupe": "Guadeloupe"
              "Guamanian": "Guam"
              "Guatemalan": "Guatemala"
              "Guinean": "Guinea"
              "Guyanese": "Guyana"
              "Haitian": "Haiti"
              "Heard Island": "Heard Island and McDonald Islands"
              "Hellenic": "Greece"
              "Herzegovinian": "Bosnia and Herzegovina"
              "Honduran": "Honduras"
              "Hong Kong": "Hong Kong"
              "Hong Konger": "Hong Kong"
              "Hungarian": "Hungary"
              "Icelandic": "Iceland"
              "Indian": "India"
              "Indonesian": "Indonesia"
              "Iranian": "Iran"
              "Iraqi": "Iraq"
              "Irish": "Ireland"
              "Israeli": "Israel"
              "Israelite": "Israel"
              "Italian": "Italy"
              "Ivorian": "Ivory Coast"
              "Jamaican": "Jamaica"
              "Jan Mayen": "Jan Mayen"
              "Japanese": "Japan"
              "Jordanian": "Jordan"
              "Kazakh": "Kazakhstan"
              "Kazakhstani": "Kazakhstan"
              "Kenyan": "Kenya"
              "Kirghiz": "Kyrgyzstan"
              "Kirgiz": "Kyrgyzstan"
              "Kiribati": "Kiribati"
              "Korean": "South Korea"
              "Kosovan": "Kosovo"
              "Kosovar": "Kosovo"
              "Kuwaiti": "Kuwait"
              "Kyrgyz": "Kyrgyzstan"
              "Kyrgyzstani": "Kyrgyzstan"
              "Lao": "Lao People's Democratic Republic"
              "Laotian": "Lao People's Democratic Republic"
              "Latvian": "Latvia"
              "Lebanese": "Lebanon"
              "Lettish": "Latvia"
              "Liberian": "Liberia"
              "Libyan": "Libya"
              "Liechtensteiner": "Liechtenstein"
              "Lithuanian": "Lithuania"
              "Luxembourg": "Luxembourg"
              "Luxembourgish": "Luxembourg"
              "Macanese": "Macau"
              "Macedonian": "North Macedonia"
              "Magyar": "Hungary"
              "Mahoran": "Mayotte"
              "Malagasy": "Madagascar"
              "Malawian": "Malawi"
              "Malaysian": "Malaysia"
              "Maldivian": "Maldives"
              "Malian": "Mali"
              "Malinese": "Mali"
              "Maltese": "Malta"
              "Manx": "Isle of Man"
              "Marshallese": "Marshall Islands"
              "Martinican": "Martinique"
              "Martiniquais": "Martinique"
              "Mauritanian": "Mauritania"
              "Mauritian": "Mauritius"
              "McDonald Islands": "Heard Island and McDonald Islands"
              "Mexican": "Mexico"
              "Moldovan": "Moldova"
              "Monacan": "Monaco"
              "Mongolian": "Mongolia"
              "Montenegrin": "Montenegro"
              "Montserratian": "Montserrat"
              "Monégasque": "Monaco"
              "Moroccan": "Morocco"
              "Motswana": "Botswana"
              "Mozambican": "Mozambique"
              "Myanma": "Myanmar"
              "Namibian": "Namibia"
              "Nauruan": "Nauru"
              "Nepalese": "Nepal"
              "Nepali": "Nepal"
              "Netherlandic": "Netherlands"
              "New Caledonian": "New Caledonia"
              "New Zealand": "New Zealand"
              "Ni-Vanuatu": "Vanuatu"
              "Nicaraguan": "Nicaragua"
              "Nigerian": "Nigeria"
              "Nigerien": "Niger"
              "Niuean": "Niue"
              "Norfolk Island": "Norfolk Island"
              "Northern Irish": "Northern Ireland"
              "Northern Marianan": "Northern Mariana Islands"
              "Norwegian": "Norway"
              "Omani": "Oman"
              "Pakistani": "Pakistan"
              "Palauan": "Palau"
              "Palestinian": "Palestine"
              "Panamanian": "Panama"
              "Papua New Guinean": "Papua New Guinea"
              "Papuan": "Papua New Guinea"
              "Paraguayan": "Paraguay"
              "Persian": "Iran"
              "Peruvian": "Peru"
              "Philippine": "Philippines"
              "Pitcairn Island": "Pitcairn Islands"
              "Polish": "Poland"
              "Portuguese": "Portugal"
              "Puerto Rican": "Puerto Rico"
              "Qatari": "Qatar"
              "Romanian": "Romania"
              "Russian": "Russia"
              "Rwandan": "Rwanda"
              "Saba": "Saba"
              "Saban": "Saba"
              "Sahraouian": "Western Sahara"
              "Sahrawi": "Western Sahara"
              "Sahrawian": "Western Sahara"
              "Salvadoran": "El Salvador"
              "Sammarinese": "San Marino"
              "Samoan": "Samoa"
              "Saudi Arabian": "Saudi Arabia"
              "Saudi": "Saudi Arabia"
              "Scottish": "Scotland"
              "Senegalese": "Senegal"
              "Serbian": "Serbia"
              "Seychellois": "Seychelles"
              "Sierra Leonean": "Sierra Leone"
              "Singapore": "Singapore"
              "Singaporean": "Singapore"
              "Slovak": "Slovakia"
              "Slovene": "Slovenia"
              "Slovenian": "Slovenia"
              "Solomon Island": "Solomon Islands"
              "Somali": "Somalia"
              "Somalilander": "Somaliland"
              "South African": "South Africa"
              "South Georgia Island": "South Georgia and the South Sandwich Islands"
              "South Ossetian": "South Ossetia"
              "South Sandwich Island": "South Georgia and the South Sandwich Islands"
              "South Sudanese": "South Sudan"
              "Spanish": "Spain"
              "Sri Lankan": "Sri Lanka"
              "Sudanese": "Sudan"
              "Surinamese": "Suriname"
              "Svalbard resident": "Svalbard"
              "Swati": "Eswatini"
              "Swazi": "Eswatini"
              "Swedish": "Sweden"
              "Swiss": "Switzerland"
              "Syrian": "Syrian Arab Republic"
              "Taiwanese": "Taiwan"
              "Tajikistani": "Tajikistan"
              "Tanzanian": "Tanzania"
              "Thai": "Thailand"
              "Timorese": "Timor-Leste"
              "Tobagonian": "Trinidad and Tobago"
              "Togolese": "Togo"
              "Tokelauan": "Tokelau"
              "Tongan": "Tonga"
              "Trinidadian": "Trinidad and Tobago"
              "Tunisian": "Tunisia"
              "Turkish": "Turkey"
              "Turkmen": "Turkmenistan"
              "Turks and Caicos Island": "Turks and Caicos Islands"
              "Tuvaluan": "Tuvalu"
              "Ugandan": "Uganda"
              "Ukrainian": "Ukraine"
              "Uruguayan": "Uruguay"
              "Uzbek": "Uzbekistan"
              "Uzbekistani": "Uzbekistan"
              "Vanuatuan": "Vanuatu"
              "Vatican": "Vatican City State"
              "Venezuelan": "Venezuela"
              "Vietnamese": "Vietnam"
              "Wallis and Futuna": "Wallis and Futuna"
              "Wallisian": "Wallis and Futuna"
              "Welsh": "Wales"
              "Yemeni": "Yemen"
              "Zambian": "Zambia"
              "Zimbabwean": "Zimbabwe"
              "Åland Island": "Åland Islands"
# Last Updated December 30, 2022