261 lines
9.7 KiB
YAML
261 lines
9.7 KiB
YAML
name: Carnal+ / FTM+
# Routes a scene URL to one of the XPath scrapers defined under xPathScrapers.
# Every pattern ends in "/videos/" so that only scene pages are matched.
sceneByURL:
  # These studios have standalone scenes
  - action: scrapeXPath
    url:
      - americanmusclehunks.com/videos/
      - bangbangboys.com/videos/
      - cumdumpsluts.com/videos/
      - dirtyboysociety.com/videos/
      - edwardjames.com/videos/
      - ftmmen.com/videos/
      - hungfuckers.com/videos/
      - jalifstudio.com/videos/
      - jasonsparkslive.com/videos/
      - jockbreeders.com/videos/
      - jockpussy.com/videos/
      - staghomme.com/videos/
      - teensandtwinks.com/videos/
      - twinks.com/videos/
    scraper: sceneScraper
  # These studios organize their scenes into series with chapters.
  # You can tell a studio belongs in this category
  # if there's a "SERIES" link in the main navbar for their site
  - action: scrapeXPath
    url:
      - boundtwinks.com/videos/
      - boyforsale.com/videos/
      - funsizeboys.com/videos/
      - gaycest.com/videos/
      - masonicboys.com/videos/
      - rawfuckboys.com/videos/
      - scoutboys.com/videos/
      - transcest.com/videos/
      - twinkloads.com/videos/
      - twinktop.com/videos/
    scraper: chapterSceneScraper
  # The network site has all scenes from the standalone sites
  - action: scrapeXPath
    url:
      - barebackplus.com/videos/
      - carnalplus.com/videos/
      - ftmplus.com/videos/
    scraper: networkScraper
xPathScrapers:
  # Scraper for studio sites whose scenes are standalone pages.
  # Several nodes are anchored here (&title, &details, &image, ...) and
  # reused by networkScraper and chapterSceneScraper further down.
  sceneScraper:
    common:
      $scene: &sceneContainer //body/div[contains(@class, "mainContainer")]
    scene:
      Title: &title
        selector: //title/text()
        postProcess:
          - replace:
              # Strip the first "|" (with any whitespace before it) and everything after it
              - regex: \s*\|.*$
                with: ""
      Details: &details
        selector: $scene//div[@class="full-txt"]//text()
        concat: "\n\n"
      Image: &image $scene//video/@poster | $scene//img[contains(@class, "hiddenImg")]/@src0_1x | $scene//img[contains(@class, "hiddenImg")]/@src | $scene//img[contains(@class, "hiddenImg")]/@data-src
      URL: &url //link[@rel="canonical"]/@href
      Date: &dateSubscraper
        # We need to scrape the network site to get the date, but this scraper
        # has to work for multiple networks so we can't hardcode the network site.
        # Instead we fetch the network name from the shortcut icon and combine it
        # with the canonical URL to construct the correct URL to scrape
        # see https://regex101.com/r/QaZLIY/1 for an example
        selector: //link[@rel="shortcut icon"]/@href | //link[@rel="canonical"]/@href
        concat: __SEPARATOR__
        postProcess:
          - replace:
              - regex: (?P<networkSite>.+\.com).*__SEPARATOR__.*(?P<path>\/videos.*).html
                # We'd love to append `_vids` here but _ triggers the submatch in the regexp
                # so we use the URL encoded version of an underscore instead: %5f
                with: $networkSite$path%5fvids.html
          - subScraper: //div[@class="releasedate"]
          - replace: &cleanDate
              # Remove the trailing "| Full length video : XX min YY sec" part
              - regex: \s*\|.*
                with: ""
          - parseDate: January 02, 2006
      Code: &studioCode
        selector: //meta[@property="og:image"]/@content
        postProcess:
          - replace:
              # Keep only the path segment that follows "content/"
              - regex: .*content\/([^\/]+).*
                with: $1
              # Some of these image URLs will not contain the studio code
              # so we need to remove those manually here
              - regex: ^https.*
                with: ""
      Studio: &studio
        Name:
          selector: //base/@href
          postProcess:
            - replace:
                # Reduce the base URL to the domain label in front of ".com"
                # https://regex101.com/r/JxFd9a/1
                - regex: ^(?:https:\/\/[\w\.]*?)([^.]+)\.com.*$
                  with: $1
            - map:
                # The canonical list of studio names is based on what
                # they are called on their respective network sites.
                # Keep this list in sync with the sites routed to sceneScraper and
                # chapterSceneScraper in sceneByURL: a site without an entry here
                # does not get a human-readable studio name.
                americanmusclehunks: American Muscle Hunks
                bangbangboys: Bang Bang Boys
                boundtwinks: Bound Twinks
                boyforsale: Boy For Sale
                cumdumpsluts: Cum Dump Sluts
                dirtyboysociety: Dirty Boy Society
                edwardjames: Edward James
                ftmmen: FTM Men
                funsizeboys: Funsize Boys
                gaycest: Gaycest
                hungfuckers: Hung Fuckers
                jalifstudio: Jalif Studio
                jasonsparkslive: Jason Sparks Live
                jockbreeders: Jock Breeders
                jockpussy: Jock Pussy
                masonicboys: Masonic Boys
                rawfuckboys: Raw Fuck Boys
                scoutboys: Scout Boys
                staghomme: Stag Homme
                teensandtwinks: Teens And Twinks
                transcest: Transcest
                twinks: Twinks
                twinkloads: Twink Loads
                twinktop: Twink Top
        URL: //base/@href
      Tags: &tags
        Name: $scene//div[@id="catMovie"]//text()
      Performers: &performers
        Name: $scene//div[contains(@class, "modelProfile")]//h2 | $scene//div[contains(@class, "modelProfile")]//h3

  # Scraper for the network sites, which use a different page layout.
  # Title and Image reuse the sceneScraper definitions; everything else is
  # selected from the network page itself.
  networkScraper:
    common:
      $scene: ((//div[contains(@class, "main")])[1]/div)[1]
    scene:
      Title: *title
      Details:
        selector: $scene//div[@class='textDescription']//text()[not(parent::span[@id='firstWords' or @id='readmore'])]
        concat: "\n\n"
      Image: *image
      Code:
        selector: (//source/@src)[1]
        postProcess:
          - replace:
              # Three word characters followed by four digits, right before "trailer"
              - regex: .*(\w{3}\d{4}).trailer.*
                with: $1
              # Some of these trailer URLs will not contain the studio code
              # so we need to remove those manually here
              - regex: ^https.*
                with: ""
      Date:
        selector: $scene//div[@class="releasedate"]
        postProcess:
          - replace: *cleanDate
          - parseDate: January 02, 2006
      URL:
        # All scenes on network sites should be available from their subsites as well
        # so we construct a valid link to the subsite both to encourage people
        # to scrape from the canonical source as well as submitting both links to StashDB:
        # the network site will list the duration, which is helpful when evaluating
        # the submitted fingerprints for the scene
        selector: //link[@rel="canonical"]/@href | //div[@class="logoSubsites"]//img/@alt
        concat: __SEPARATOR__
        postProcess:
          - replace:
              - regex: .*(?P<path>videos/.*)__SEPARATOR__(?P<domain>.*)
                with: https://$domain.com/$path
              - regex: _vids
                with: ""
              # This table should contain the same sites as the Studio Name map in sceneScraper
              - regex: AmericanMuscleHunks
                with: americanmusclehunks
              - regex: BangBangBoys
                with: bangbangboys
              - regex: BoundTwinks
                with: boundtwinks
              - regex: BoyForSale
                with: boyforsale
              - regex: CumDumpSluts
                with: cumdumpsluts
              - regex: DirtyBoySociety
                with: dirtyboysociety
              - regex: EdwardJames
                with: edwardjames
              - regex: FTMmen
                with: ftmmen
              - regex: FunsizeBoys
                with: funsizeboys
              - regex: Gaycest
                with: gaycest
              - regex: HungFuckers
                with: hungfuckers
              - regex: JasonSparksLive
                with: jasonsparkslive
              - regex: JalifStudio
                with: jalifstudio
              - regex: JockBreeders
                with: jockbreeders
              - regex: JockPussy
                with: jockpussy
              - regex: MasonicBoys
                with: masonicboys
              - regex: RawFuckBoys
                with: rawfuckboys
              - regex: ScoutBoys
                with: scoutboys
              - regex: StagHomme
                with: staghomme
              - regex: TeensAndTwinks
                with: teensandtwinks
              - regex: Transcest
                with: transcest
              - regex: Twinks
                with: twinks
              - regex: Twinkloads
                with: twinkloads
              - regex: TwinkTop
                with: twinktop
      Tags:
        Name: $scene//div[@class="update_tags"]//text()
      Performers:
        Name: $scene//div[@id="models"]//h4
      Studio:
        Name:
          selector: //div[@class="logoSubsites"]//img/@alt
          postProcess:
            - replace:
                # Turn PascalCaseWords to Pascal Case Words
                - regex: ([a-z])([A-Z])
                  with: $1 $2
                # These two spellings (taken from the URL table above) have no
                # lower->Upper boundary where the word break belongs, so the rule
                # above leaves them untouched; align them with the canonical names
                # used by the Studio Name map in sceneScraper
                - regex: ^FTMmen$
                  with: FTM Men
                - regex: ^Twinkloads$
                  with: Twink Loads
        URL:
          selector: //div[@class="logoSubsites"]//img/@alt
          postProcess:
            - replace:
                - regex: (?P<domain>.*)
                  with: https://$domain.com

  # Scraper for studio sites that organize their scenes into series with chapters.
  chapterSceneScraper:
    common:
      $scene: *sceneContainer
    # The differences from the regular sceneScraper is that we
    # shuffle the title around a little to reflect what's shown
    # on the page instead of in the title bar (also makes them easier to sort)
    # and we use the series as a movie so that people can group their scenes together
    scene:
      Title:
        selector: //title/text()
        postProcess:
          - replace:
              # https://regex101.com/r/y1Clkp/2
              - regex: (?P<title>.*?) - (?P<series>.*?) - (?P<chapter>.*?) \|.*
                with: $series - $chapter - $title
      Details: *details
      Date: *dateSubscraper
      Code: *studioCode
      Image: *image
      URL: *url
      Studio: *studio
      Movies:
        Name: //span[contains(@class, "dvdTitleScene")]
        URL: //span[contains(@class, "dvdTitleScene")]//a/@href
      Tags: *tags
      Performers: *performers
# Last Updated September 21, 2023
|