# XPath scene scraper definition for pkfstudios.com.
# URLs containing "pkfstudios.com" are handled by the "sceneScraper"
# XPath scraper defined below.
name: PKF Studios
sceneByURL:
  - action: scrapeXPath
    url:
      - pkfstudios.com
    scraper: sceneScraper
xPathScrapers:
  sceneScraper:
    scene:
      Title: //h1[@class="entry-title"]/text()
      Details:
        # The description is a sequence of <p> elements that contain ONLY
        # text (no child elements); they are joined with a blank line.
        selector: //div[@class="entry-content"]/p[not(*)]/text()
        concat: "\n\n"
        # Remove the "_ _ _ _ _" separator paragraph (a run of underscores
        # and spaces) together with the blank line that precedes it.
        postProcess:
          - replace:
              - regex: "(\n\n[_ ]+)"
                with: ""
      Date:
        selector: //span[@class="entry-date"]//text()
        # Layout matches dates written like "December 5, 2023".
        postProcess:
          - parseDate: January 2, 2006
      Image:
        # Images are loaded with javascript, so the URL is taken from the
        # lazy-load srcset attribute. Only the last "<url> <width>w" entry
        # is kept because it is usually the largest image.
        selector: //div[contains(@class, "post-thumbnail")]/img/@data-lazy-srcset
        postProcess:
          - replace:
              - regex: ^.*\s+(https://.*)\s+\d+w$
                with: $1
      Studio:
        Name:
          fixed: "PKF Studios"
      Tags:
        Name:
          # The first selector matches when the tags are outside of the
          # <strong> tag (text following it); the second matches when they
          # are inside the <strong> element itself.
          selector: >-
            //div[@class="entry-content"]//strong[starts-with(text(), "Role")]/following-sibling::text()
            | //div[@class="entry-content"]//strong[starts-with(text(), "Role")]/text()
          # Drop an optional leading "<label>: " prefix and the period that
          # ends the list, then split the remainder on ", ".
          postProcess:
            - replace:
                - regex: (?:.*:\s+)?([^.]*).?
                  with: $1
          split: ", "
      Performers:
        Name:
          # Sometimes the performers are listed in a separate tag, sometimes
          # they're in a paragraph mixed in with the description.
          selector: >-
            //div[@class="entry-content"]//*[contains(text(), "Starring")]/text()
            | //p[contains(., "Starring")]
          # 1. Keep only what follows "Starring ".
          # 2. Strip the "directed by johnm" credit (case-insensitive).
          # 3. Normalise ", " and ", and" separators to " and " so a single
          #    split on " and " yields one entry per performer.
          postProcess:
            - replace:
                - regex: ".*Starring (.*)"
                  with: $1
                - regex: "(?i)directed by johnm"
                  with: ""
                - regex: ", (and)?"
                  with: " and "
          split: " and "
# Last Updated December 05, 2023