Files
compose-projects-arr/stash/config/scrapers/community/AyloAPI/slugger.py
Christoph Califice 0a5f88d75a stash
2025-10-10 09:50:30 -03:00

83 lines
3.0 KiB
Python

import re
"""
This ports the kebabCase function from lodash to Python. It is used to generate
slugs for the URLs for scenes, performers and movies scraped from the Aylo API.
https://github.com/lodash/lodash/blob/main/src/kebabCase.ts
"""
# ---------------------------------------------------------------------------
# Bare character ranges. These carry no surrounding brackets so that several
# of them can be concatenated inside a single [...] character class.
#
# NOTE(review): the fragments below are expressed in terms of UTF-16
# surrogates, as in the JavaScript original. Python ``str`` is indexed by code
# point, so confirm that astral characters (e.g. emoji) are split as intended.
# ---------------------------------------------------------------------------
rsAstralRange = r"\ud800-\udfff"
rsComboMarksRange = r"\u0300-\u036f"
reComboHalfMarksRange = r"\ufe20-\ufe2f"
rsComboSymbolsRange = r"\u20d0-\u20ff"
rsComboMarksExtendedRange = r"\u1ab0-\u1aff"
rsComboMarksSupplementRange = r"\u1dc0-\u1dff"
rsComboRange = "".join(
    (
        rsComboMarksRange,
        reComboHalfMarksRange,
        rsComboSymbolsRange,
        rsComboMarksExtendedRange,
        rsComboMarksSupplementRange,
    )
)
rsDingbatRange = r"\u2700-\u27bf"
rsLowerRange = r"a-z\xdf-\xf6\xf8-\xff"
rsMathOpRange = r"\xac\xb1\xd7\xf7"
rsNonCharRange = r"\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\xbf"
rsPunctuationRange = r"\u2000-\u206f"
# The first character of this range is a literal space.
rsSpaceRange = (
    r" \t\x0b\f\xa0\ufeff\n\r\u2028\u2029\u1680\u180e"
    r"\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a"
    r"\u202f\u205f\u3000"
)
rsUpperRange = r"A-Z\xc0-\xd6\xd8-\xde"
rsVarRange = r"\ufe0e\ufe0f"
# Everything that separates words: math operators, ASCII/Latin-1
# non-alphanumerics, general punctuation and whitespace.
rsBreakRange = "".join(
    (rsMathOpRange, rsNonCharRange, rsPunctuationRange, rsSpaceRange)
)

# ---------------------------------------------------------------------------
# Regex fragments built from the ranges above.
# ---------------------------------------------------------------------------
# Deliberately NOT a raw string: the class holds the literal U+2019 character.
rsApos = "['\u2019]"
rsBreak = "[" + rsBreakRange + "]"
rsCombo = "[" + rsComboRange + "]"
rsDigit = r"\d"
rsDingbat = "[" + rsDingbatRange + "]"
rsLower = "[" + rsLowerRange + "]"
# "Misc" is whatever is left once every other category has been excluded.
rsMisc = (
    "[^"
    + rsAstralRange
    + rsBreakRange
    + rsDigit
    + rsDingbatRange
    + rsLowerRange
    + rsUpperRange
    + "]"
)
rsFitz = r"\ud83c[\udffb-\udfff]"
rsModifier = "(?:" + rsCombo + "|" + rsFitz + ")"
rsNonAstral = "[^" + rsAstralRange + "]"
rsRegional = r"(?:\ud83c[\udde6-\uddff]){2}"
rsSurrPair = r"[\ud800-\udbff][\udc00-\udfff]"
rsUpper = "[" + rsUpperRange + "]"
rsZWJ = r"\u200d"

# ---------------------------------------------------------------------------
# Composite fragments.
# ---------------------------------------------------------------------------
rsMiscLower = "(?:" + rsLower + "|" + rsMisc + ")"
rsMiscUpper = "(?:" + rsUpper + "|" + rsMisc + ")"
# Optional contraction suffixes (apostrophe followed by d/ll/m/re/s/t/ve).
rsOptContrLower = "(?:" + rsApos + "(?:d|ll|m|re|s|t|ve))?"
rsOptContrUpper = "(?:" + rsApos + "(?:D|LL|M|RE|S|T|VE))?"
reOptMod = rsModifier + "?"
rsOptVar = "[" + rsVarRange + "]?"
rsOptJoin = (
    "(?:"
    + rsZWJ
    + "(?:"
    + "|".join((rsNonAstral, rsRegional, rsSurrPair))
    + ")"
    + rsOptVar
    + reOptMod
    + ")*"
)
# Ordinals such as 1st / 2nd / 3rd / 4th, in lower- and upper-case spellings.
rsOrdLower = r"\d*(?:1st|2nd|3rd|(?![123])\dth)(?=\b|[A-Z_])"
rsOrdUpper = r"\d*(?:1ST|2ND|3RD|(?![123])\dTH)(?=\b|[a-z_])"
rsSeq = "".join((rsOptVar, reOptMod, rsOptJoin))
rsEmoji = "(?:" + "|".join((rsDingbat, rsRegional, rsSurrPair)) + ")" + rsSeq

# Lookaheads that terminate a lower-case word and an upper-case run.
_lower_word_end = "(?=" + "|".join((rsBreak, rsUpper, "$")) + ")"
_upper_word_end = "(?=" + "|".join((rsBreak, rsUpper + rsMiscLower, "$")) + ")"

# Alternatives are tried left to right, so their order matters.
_unicode_word_alternatives = (
    rsUpper + "?" + rsLower + "+" + rsOptContrLower + _lower_word_end,
    rsMiscUpper + "+" + rsOptContrUpper + _upper_word_end,
    rsUpper + "?" + rsMiscLower + "+" + rsOptContrLower,
    rsUpper + "+" + rsOptContrUpper,
    rsOrdUpper,
    rsOrdLower,
    rsDigit + "+",
    rsEmoji,
)
reUnicodeWords = re.compile("|".join(_unicode_word_alternatives))

# Fallback splitter: runs of characters outside the ASCII non-alphanumeric
# ranges.
reAsciiWords = re.compile(
    r"[^\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\x7f]+"
)
def slugify(string):
    """Turn ``string`` into a lower-case, hyphen-separated slug.

    Apostrophes (ASCII ``'`` and U+2019) are removed first, the remaining
    text is split into words, and the words are joined with ``-`` and
    lower-cased.

    Args:
        string: The text to convert; must be a ``str`` (it is passed
            straight to ``re.sub``).

    Returns:
        The slug, or an empty string when no words are found.
    """
    without_apostrophes = re.sub("['\u2019]", "", string)
    # Prefer the Unicode-aware splitter; fall back to the ASCII splitter only
    # when the former finds no words at all.
    words = reUnicodeWords.findall(without_apostrophes)
    if not words:
        words = reAsciiWords.findall(without_apostrophes)
    return "-".join(words).lower()