This commit is contained in:
Christoph Califice
2025-10-09 20:05:31 -03:00
parent ed22ef22bc
commit 0a5f88d75a
1442 changed files with 101562 additions and 0 deletions

View File

@@ -0,0 +1,82 @@
import re
"""
This ports the kebabCase function from lodash to Python. It is used to generate
slugs for the URLs for scenes, performers and movies scraped from the Aylo API.
https://github.com/lodash/lodash/blob/main/src/kebabCase.ts
"""
# Character ranges, written as regex source text meant to be placed inside a
# [...] character class. Backslashes are doubled so that sequences such as
# \uXXXX and \xXX stay literal in the Python string and are interpreted by the
# `re` module when the final pattern is compiled.
rsAstralRange = "\\ud800-\\udfff"  # surrogate code points U+D800-U+DFFF
rsComboMarksRange = "\\u0300-\\u036f"  # Combining Diacritical Marks
reComboHalfMarksRange = "\\ufe20-\\ufe2f"  # Combining Half Marks
rsComboSymbolsRange = "\\u20d0-\\u20ff"  # Combining Diacritical Marks for Symbols
rsComboMarksExtendedRange = "\\u1ab0-\\u1aff"  # Combining Diacritical Marks Extended
rsComboMarksSupplementRange = "\\u1dc0-\\u1dff"  # Combining Diacritical Marks Supplement
# Concatenation of the five combining-mark ranges defined above.
rsComboRange = (
rsComboMarksRange
+ reComboHalfMarksRange
+ rsComboSymbolsRange
+ rsComboMarksExtendedRange
+ rsComboMarksSupplementRange
)
rsDingbatRange = "\\u2700-\\u27bf"  # Dingbats
# ASCII a-z plus the Latin-1 lowercase letters; U+00F7 (division sign) is skipped.
rsLowerRange = "a-z\\xdf-\\xf6\\xf8-\\xff"
# U+00AC not sign, U+00B1 plus-minus, U+00D7 multiplication, U+00F7 division.
rsMathOpRange = "\\xac\\xb1\\xd7\\xf7"
# Everything in U+0000-U+00BF except the ASCII digits and ASCII letters.
rsNonCharRange = "\\x00-\\x2f\\x3a-\\x40\\x5b-\\x60\\x7b-\\xbf"
rsPunctuationRange = "\\u2000-\\u206f"  # General Punctuation
# Individual whitespace characters: space, tab, vertical tab, form feed,
# no-break space, U+FEFF, line feed, carriage return, the line and paragraph
# separators, and a list of further Unicode space characters.
rsSpaceRange = " \\t\\x0b\\f\\xa0\\ufeff\\n\\r\\u2028\\u2029\\u1680\\u180e\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2007\\u2008\\u2009\\u200a\\u202f\\u205f\\u3000"
# ASCII A-Z plus the Latin-1 uppercase letters; U+00D7 (multiplication sign) is skipped.
rsUpperRange = "A-Z\\xc0-\\xd6\\xd8-\\xde"
rsVarRange = "\\ufe0e\\ufe0f"  # variation selectors U+FE0E and U+FE0F
# Union of the math-operator, non-character, punctuation and space ranges; it
# backs the rsBreak class and is excluded from rsMisc.
rsBreakRange = rsMathOpRange + rsNonCharRange + rsPunctuationRange + rsSpaceRange
# Regex fragments (still plain strings) assembled from the ranges above. They
# are combined into the compiled reUnicodeWords pattern.
# NOTE(review): several fragments are spelled with UTF-16 surrogates
# (\ud83c..., rsSurrPair). A Python str holds characters above U+FFFF as single
# code points, so these presumably only match input that literally contains
# surrogate code points -- confirm whether emoji handling matters to callers.
rsApos = "['\u2019]"  # straight apostrophe or U+2019 right single quotation mark
rsBreak = f"[{rsBreakRange}]"  # one character from the break set
rsCombo = f"[{rsComboRange}]"  # one combining mark
rsDigit = "\\d"
rsDingbat = f"[{rsDingbatRange}]"
rsLower = f"[{rsLowerRange}]"
# Any single character that is not a surrogate, break character, digit,
# dingbat, lowercase letter or uppercase letter (as defined by the ranges).
rsMisc = f"[^{rsAstralRange}{rsBreakRange + rsDigit + rsDingbatRange + rsLowerRange + rsUpperRange}]"
# Surrogate-pair spelling of U+1F3FB-U+1F3FF.
rsFitz = "\\ud83c[\\udffb-\\udfff]"
rsModifier = f"(?:{rsCombo}|{rsFitz})"  # a combining mark or an rsFitz modifier
rsNonAstral = f"[^{rsAstralRange}]"  # any character outside the surrogate range
# Two consecutive surrogate pairs, each spelling a code point in U+1F1E6-U+1F1FF.
rsRegional = "(?:\\ud83c[\\udde6-\\uddff]){2}"
rsSurrPair = "[\\ud800-\\udbff][\\udc00-\\udfff]"  # high surrogate followed by low surrogate
rsUpper = f"[{rsUpperRange}]"
rsZWJ = "\\u200d"  # U+200D zero width joiner
rsMiscLower = f"(?:{rsLower}|{rsMisc})"  # lowercase letter or "misc" character
rsMiscUpper = f"(?:{rsUpper}|{rsMisc})"  # uppercase letter or "misc" character
# Optional contraction suffix: an apostrophe followed by d, ll, m, re, s, t or ve
# (lowercase and uppercase variants respectively).
rsOptContrLower = f"(?:{rsApos}(?:d|ll|m|re|s|t|ve))?"
rsOptContrUpper = f"(?:{rsApos}(?:D|LL|M|RE|S|T|VE))?"
reOptMod = f"{rsModifier}?"  # optional modifier
rsOptVar = f"[{rsVarRange}]?"  # optional variation selector
# Zero or more repetitions of: a zero width joiner, then a non-surrogate
# character, regional pair or surrogate pair, then an optional variation
# selector and an optional modifier.
rsOptJoin = f"(?:{rsZWJ}(?:{('|').join([rsNonAstral, rsRegional, rsSurrPair])}){rsOptVar + reOptMod})*"
# Ordinals: optional leading digits, then 1st / 2nd / 3rd, or a digit other
# than 1-3 followed by "th". The lookahead requires a word boundary or, for
# the lowercase form, an ASCII uppercase letter or underscore (for the
# uppercase form, an ASCII lowercase letter or underscore) to follow.
rsOrdLower = "\\d*(?:1st|2nd|3rd|(?![123])\\dth)(?=\\b|[A-Z_])"
rsOrdUpper = "\\d*(?:1ST|2ND|3RD|(?![123])\\dTH)(?=\\b|[a-z_])"
# Tail that may follow an emoji base: optional variation selector, optional
# modifier, then any number of joined parts (rsOptJoin).
rsSeq = rsOptVar + reOptMod + rsOptJoin
# A dingbat, regional pair or surrogate pair followed by the rsSeq tail.
rsEmoji = rf"(?:{('|').join([rsDingbat, rsRegional, rsSurrPair])}){rsSeq}"
# Compiled pattern that matches one "word" per match. The alternatives are
# joined with "|", so the regex engine tries them in the order listed and the
# first one that matches at a position wins. All groups are non-capturing, so
# findall() returns the matched substrings themselves.
reUnicodeWords = re.compile(
"|".join(
[
# Optional uppercase letter, lowercase letters, optional contraction; only
# when followed by a break character, an uppercase letter, or "$".
f"{rsUpper}?{rsLower}+{rsOptContrLower}(?={('|').join([rsBreak, rsUpper, '$'])})",
# Run of uppercase/misc characters with optional uppercase contraction; only
# when followed by a break character, an uppercase letter plus a
# lowercase/misc character, or "$".
f"{rsMiscUpper}+{rsOptContrUpper}(?={('|').join([rsBreak, rsUpper + rsMiscLower, '$'])})",
# Optional uppercase letter, run of lowercase/misc characters, optional
# contraction (no lookahead restriction).
f"{rsUpper}?{rsMiscLower}+{rsOptContrLower}",
# Run of uppercase letters with optional uppercase contraction.
f"{rsUpper}+{rsOptContrUpper}",
# Uppercase and lowercase ordinals (e.g. "1ST", "2nd").
rsOrdUpper,
rsOrdLower,
# Run of digits.
f"{rsDigit}+",
# Emoji sequence as defined by rsEmoji.
rsEmoji,
]
)
)
# Fallback word pattern: a run of characters outside the excluded ranges
# U+0000-U+002F, U+003A-U+0040, U+005B-U+0060 and U+007B-U+007F. What remains
# is ASCII digits, ASCII letters, and every character at or above U+0080.
reAsciiWords = re.compile(r"[^\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\x7f]+")
def slugify(string):
    """Return a lowercase, hyphen-joined slug made from the words in *string*.

    Straight apostrophes and U+2019 right single quotation marks are removed
    before splitting. Words are taken from ``reUnicodeWords``; if that pattern
    finds nothing, ``reAsciiWords`` is used instead. The words are joined with
    "-" and the result is lowercased.
    """
    without_apostrophes = re.sub("['\u2019]", "", string)
    words = reUnicodeWords.findall(without_apostrophes)
    if not words:
        # Nothing matched the Unicode-aware pattern; fall back to ASCII runs.
        words = reAsciiWords.findall(without_apostrophes)
    return "-".join(words).lower()