From 10c8d7def601c1c4e618ee298e70f3c787391441 Mon Sep 17 00:00:00 2001 From: Yo Mama Date: Tue, 19 Oct 2021 00:46:10 +0200 Subject: [PATCH] Change the regex to have lookaheads and lookbehinds so the match is more 'pure' --- files/helpers/word_censor.py | 23 +++++++---------------- test/files/helpers/test_word_censor.py | 18 +++++++++++------- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/files/helpers/word_censor.py b/files/helpers/word_censor.py index b43324f09..beaa88ed1 100644 --- a/files/helpers/word_censor.py +++ b/files/helpers/word_censor.py @@ -43,12 +43,13 @@ def create_slur_regex() -> Pattern[str]: # words that can have suffixes and prefixes words = "|".join([slur.lower() for slur in SLURS.keys() if not slur.startswith(" ")]) - regex = rf"(\s|>)({words})|({words})(\s|<)" + # to understand the weird groups see: https://www.regular-expressions.info/lookaround.html + regex = rf"(?<=\s|>)({words})|({words})(?=\s|<)" # words that need to match exactly single_words = "|".join([slur.strip().lower() for slur in SLURS.keys() if slur.startswith(" ")]) - return re.compile(rf"(?i){regex}|(\s|>)({single_words})(\s|<)") + return re.compile(rf"(?i){regex}|(?<=\s|>)({single_words})(?=\s|<)") def create_replace_map() -> Dict[str, str]: @@ -65,23 +66,13 @@ REPLACE_MAP = create_replace_map() def sub_matcher(match: Match) -> str: """given a match returns the correct replacer string""" - - # base regex: (?i)(\s|>)(words)|(words)(\s|<)|(\s|>)(words)(\s|<) - if match.group(2) is not None: - found = match.group(2) - elif match.group(3) is not None: - found = match.group(3) - else: - found = match.group(6) - - # if it does not find the correct capitalization, it tries the all lower - replacer = REPLACE_MAP.get(found) or REPLACE_MAP.get(found.lower()) - - return (match.group(1) or match.group(5) or '') + replacer + (match.group(4) or match.group(7) or '') + found = match.group(0) + # if it does not find the correct capitalization, it tries the all lower, or return 
the original word + return REPLACE_MAP.get(found) or REPLACE_MAP.get(found.lower()) or found def censor_slurs(body: str, logged_user) -> str: - """Censors all the slurs in the body if the user is not logged in or if they have the slurreplacer active""" + """Censors all the slurs in the body if the user is not logged-in or if they have the slurreplacer active""" if not logged_user or logged_user.slurreplacer: try: diff --git a/test/files/helpers/test_word_censor.py b/test/files/helpers/test_word_censor.py index b97dca411..1a24c8f93 100644 --- a/test/files/helpers/test_word_censor.py +++ b/test/files/helpers/test_word_censor.py @@ -52,7 +52,7 @@ def test_get_permutations_slur_wiht_link_replacer(): " retard ": "r-slur", }) def test_create_slur_regex(): - expected = r"(?i)(\s|>)(kill yourself|faggot)|(kill yourself|faggot)(\s|<)|(\s|>)(nig|retard)(\s|<)" + expected = r"(?i)(?<=\s|>)(kill yourself|faggot)|(kill yourself|faggot)(?=\s|<)|(?<=\s|>)(nig|retard)(?=\s|<)" assert_that(create_slur_regex()).is_equal_to(re.compile(expected)) @@ -91,22 +91,23 @@ def test_create_replace_map(): @patch("files.helpers.word_censor.REPLACE_MAP", {'retard': 'r-slur', 'Faggot': 'Cute twink', 'NIG': '🏀'}) def test_sub_matcher(): - regex = re.compile(r"(?i)(\s|>)(kill yourself|retard)|(kill yourself|retard)(\s|<)|(\s|>)(nig|faggot)(\s|<)") + regex = re.compile( + r"(?i)(?<=\s|>)(kill yourself|retard)|(kill yourself|retard)(?=\s|<)|(?<=\s|>)(nig|faggot)(?=\s|<)") match = regex.search("

retard

") - assert_that(sub_matcher(match)).is_equal_to(">r-slur") + assert_that(sub_matcher(match)).is_equal_to("r-slur") match = regex.search("

noretard

") - assert_that(sub_matcher(match)).is_equal_to("r-slur<") + assert_that(sub_matcher(match)).is_equal_to("r-slur") match = regex.search("

ReTaRdEd

") - assert_that(sub_matcher(match)).is_equal_to(">r-slur") + assert_that(sub_matcher(match)).is_equal_to("r-slur") match = regex.search("

NIG

") - assert_that(sub_matcher(match)).is_equal_to(">🏀<") + assert_that(sub_matcher(match)).is_equal_to("🏀") match = regex.search("

Faggot

") - assert_that(sub_matcher(match)).is_equal_to(">Cute twink ") + assert_that(sub_matcher(match)).is_equal_to("Cute twink") @patch("files.helpers.word_censor.SLURS", { @@ -163,6 +164,8 @@ def test_censor_slurs(): @patch("files.helpers.word_censor.SLURS", {'retard': 'r-slur', 'manlet': 'little king', ' nig ': '🏀'}) def test_censor_slurs_does_not_error_out_on_exception(): word_censor.REPLACE_MAP = create_replace_map() + word_censor.SLUR_REGEX = create_slur_regex() + word_censor.REPLACE_MAP["manlet"] = None word_censor.REPLACE_MAP["Manlet"] = None assert_that(censor_slurs(">retarded SuperManlet NIG<", None)).is_equal_to(">r-slured SuperManlet 🏀<") @@ -171,6 +174,7 @@ def test_censor_slurs_does_not_error_out_on_exception(): @patch("files.helpers.word_censor.SLURS", {'retard': 'r-slur', 'manlet': 'little king'}) def test_censor_slurs_does_not_censor_on_flag_disabled(): word_censor.REPLACE_MAP = create_replace_map() + word_censor.SLUR_REGEX = create_slur_regex() class User: def __init__(self, slurreplacer):