Change the regex to have lookaheads and lookbehinds so the match is more 'pure'

remotes/1693045480750635534/spooky-22
Yo Mama 2021-10-19 00:46:10 +02:00
parent ff76a4d688
commit 10c8d7def6
2 changed files with 18 additions and 23 deletions

View File

@ -43,12 +43,13 @@ def create_slur_regex() -> Pattern[str]:
# words that can have suffixes and prefixes # words that can have suffixes and prefixes
words = "|".join([slur.lower() for slur in SLURS.keys() if not slur.startswith(" ")]) words = "|".join([slur.lower() for slur in SLURS.keys() if not slur.startswith(" ")])
regex = rf"(\s|>)({words})|({words})(\s|<)" # to understand the weird groups see: https://www.regular-expressions.info/lookaround.html
regex = rf"(?<=\s|>)({words})|({words})(?=\s|<)"
# words that need to match exactly # words that need to match exactly
single_words = "|".join([slur.strip().lower() for slur in SLURS.keys() if slur.startswith(" ")]) single_words = "|".join([slur.strip().lower() for slur in SLURS.keys() if slur.startswith(" ")])
return re.compile(rf"(?i){regex}|(\s|>)({single_words})(\s|<)") return re.compile(rf"(?i){regex}|(?<=\s|>)({single_words})(?=\s|<)")
def create_replace_map() -> Dict[str, str]: def create_replace_map() -> Dict[str, str]:
@ -65,23 +66,13 @@ REPLACE_MAP = create_replace_map()
def sub_matcher(match: Match) -> str: def sub_matcher(match: Match) -> str:
"""given a match returns the correct replacer string""" """given a match returns the correct replacer string"""
found = match.group(0)
# base regex: (?i)(\s|>)(words)|(words)(\s|<)|(\s|>)(words)(\s|<) # if it does not find the correct capitalization, it tries the all lower, or return the original word
if match.group(2) is not None: return REPLACE_MAP.get(found) or REPLACE_MAP.get(found.lower()) or found
found = match.group(2)
elif match.group(3) is not None:
found = match.group(3)
else:
found = match.group(6)
# if it does not find the correct capitalization, it tries the all lower
replacer = REPLACE_MAP.get(found) or REPLACE_MAP.get(found.lower())
return (match.group(1) or match.group(5) or '') + replacer + (match.group(4) or match.group(7) or '')
def censor_slurs(body: str, logged_user) -> str: def censor_slurs(body: str, logged_user) -> str:
"""Censors all the slurs in the body if the user is not logged in or if they have the slurreplacer active""" """Censors all the slurs in the body if the user is not logged-in or if they have the slurreplacer active"""
if not logged_user or logged_user.slurreplacer: if not logged_user or logged_user.slurreplacer:
try: try:

View File

@ -52,7 +52,7 @@ def test_get_permutations_slur_wiht_link_replacer():
" retard ": "r-slur", " retard ": "r-slur",
}) })
def test_create_slur_regex(): def test_create_slur_regex():
expected = r"(?i)(\s|>)(kill yourself|faggot)|(kill yourself|faggot)(\s|<)|(\s|>)(nig|retard)(\s|<)" expected = r"(?i)(?<=\s|>)(kill yourself|faggot)|(kill yourself|faggot)(?=\s|<)|(?<=\s|>)(nig|retard)(?=\s|<)"
assert_that(create_slur_regex()).is_equal_to(re.compile(expected)) assert_that(create_slur_regex()).is_equal_to(re.compile(expected))
@ -91,22 +91,23 @@ def test_create_replace_map():
@patch("files.helpers.word_censor.REPLACE_MAP", {'retard': 'r-slur', 'Faggot': 'Cute twink', 'NIG': '🏀'}) @patch("files.helpers.word_censor.REPLACE_MAP", {'retard': 'r-slur', 'Faggot': 'Cute twink', 'NIG': '🏀'})
def test_sub_matcher(): def test_sub_matcher():
regex = re.compile(r"(?i)(\s|>)(kill yourself|retard)|(kill yourself|retard)(\s|<)|(\s|>)(nig|faggot)(\s|<)") regex = re.compile(
r"(?i)(?<=\s|>)(kill yourself|retard)|(kill yourself|retard)(?=\s|<)|(?<=\s|>)(nig|faggot)(?=\s|<)")
match = regex.search("<p>retard</p>") match = regex.search("<p>retard</p>")
assert_that(sub_matcher(match)).is_equal_to(">r-slur") assert_that(sub_matcher(match)).is_equal_to("r-slur")
match = regex.search("<p>noretard</p>") match = regex.search("<p>noretard</p>")
assert_that(sub_matcher(match)).is_equal_to("r-slur<") assert_that(sub_matcher(match)).is_equal_to("r-slur")
match = regex.search("<p>ReTaRdEd</p>") match = regex.search("<p>ReTaRdEd</p>")
assert_that(sub_matcher(match)).is_equal_to(">r-slur") assert_that(sub_matcher(match)).is_equal_to("r-slur")
match = regex.search("<p>NIG</p>") match = regex.search("<p>NIG</p>")
assert_that(sub_matcher(match)).is_equal_to(">🏀<") assert_that(sub_matcher(match)).is_equal_to("🏀")
match = regex.search("<p>Faggot </p>") match = regex.search("<p>Faggot </p>")
assert_that(sub_matcher(match)).is_equal_to(">Cute twink ") assert_that(sub_matcher(match)).is_equal_to("Cute twink")
@patch("files.helpers.word_censor.SLURS", { @patch("files.helpers.word_censor.SLURS", {
@ -163,6 +164,8 @@ def test_censor_slurs():
@patch("files.helpers.word_censor.SLURS", {'retard': 'r-slur', 'manlet': 'little king', ' nig ': '🏀'}) @patch("files.helpers.word_censor.SLURS", {'retard': 'r-slur', 'manlet': 'little king', ' nig ': '🏀'})
def test_censor_slurs_does_not_error_out_on_exception(): def test_censor_slurs_does_not_error_out_on_exception():
word_censor.REPLACE_MAP = create_replace_map() word_censor.REPLACE_MAP = create_replace_map()
word_censor.SLUR_REGEX = create_slur_regex()
word_censor.REPLACE_MAP["manlet"] = None
word_censor.REPLACE_MAP["Manlet"] = None word_censor.REPLACE_MAP["Manlet"] = None
assert_that(censor_slurs(">retarded SuperManlet NIG<", None)).is_equal_to(">r-slured SuperManlet 🏀<") assert_that(censor_slurs(">retarded SuperManlet NIG<", None)).is_equal_to(">r-slured SuperManlet 🏀<")
@ -171,6 +174,7 @@ def test_censor_slurs_does_not_error_out_on_exception():
@patch("files.helpers.word_censor.SLURS", {'retard': 'r-slur', 'manlet': 'little king'}) @patch("files.helpers.word_censor.SLURS", {'retard': 'r-slur', 'manlet': 'little king'})
def test_censor_slurs_does_not_censor_on_flag_disabled(): def test_censor_slurs_does_not_censor_on_flag_disabled():
word_censor.REPLACE_MAP = create_replace_map() word_censor.REPLACE_MAP = create_replace_map()
word_censor.SLUR_REGEX = create_slur_regex()
class User: class User:
def __init__(self, slurreplacer): def __init__(self, slurreplacer):