From ff76a4d6882449a0beca4fe787d847772f4d4fe9 Mon Sep 17 00:00:00 2001
From: Yo Mama
Date: Tue, 19 Oct 2021 00:24:15 +0200
Subject: [PATCH] Do the replacement in a single regex

---
 files/helpers/word_censor.py           | 61 ++++++++++++-----------
 test/files/helpers/test_word_censor.py | 68 ++++++++++----------------
 2 files changed, 60 insertions(+), 69 deletions(-)

diff --git a/files/helpers/word_censor.py b/files/helpers/word_censor.py
index bae26fc674..b43324f095 100644
--- a/files/helpers/word_censor.py
+++ b/files/helpers/word_censor.py
@@ -1,7 +1,7 @@
 from collections import ChainMap
 import re
 from re import Match
-from typing import List, Dict
+from typing import Dict, Pattern
 
 from files.helpers.const import SLURS
 
@@ -39,6 +39,18 @@ def get_permutations_slur(slur: str, replacer: str = "_") -> Dict[str, str]:
     return result
 
 
+def create_slur_regex() -> Pattern[str]:
+    # words that can have suffixes and prefixes
+    words = "|".join([slur.lower() for slur in SLURS.keys() if not slur.startswith(" ")])
+
+    regex = rf"(\s|>)({words})|({words})(\s|<)"
+
+    # words that need to match exactly
+    single_words = "|".join([slur.strip().lower() for slur in SLURS.keys() if slur.startswith(" ")])
+
+    return re.compile(rf"(?i){regex}|(\s|>)({single_words})(\s|<)")
+
+
 def create_replace_map() -> Dict[str, str]:
     """Creates the map that will be used to get the mathing replaced for the given slur"""
     dicts = [get_permutations_slur(slur, replacer) for (slur, replacer) in SLURS.items()]
@@ -47,41 +59,34 @@ def create_replace_map() -> Dict[str, str]:
     return dict(ChainMap(*dicts))
 
 
+SLUR_REGEX = create_slur_regex()
 REPLACE_MAP = create_replace_map()
 
 
-def create_variations_slur_regex(slur: str) -> List[str]:
-    """For a given match generates the corresponding replacer"""
-    permutations = get_permutations_slur(slur)
-
-    if slur.startswith(" ") and slur.endswith(" "):
-        return [rf"(\s|>)({perm})(\s|<)" for perm in permutations.keys()]
-    else:
-        return [rf"(\s|>)({perm})|({perm})(\s|<)" for perm in permutations.keys()]
-
-
 def sub_matcher(match: Match) -> str:
-    # special case when it should match exact word
-    if len(match.groups()) == 3:
-        found = match.group(2)
-        replacer = REPLACE_MAP[found]
-        return match.group(1) + replacer + match.group(3)
+    """Given a match, returns the correct replacer string"""
 
-    else:  # normal case with prefix or suffix
-        found = match.group(2) if (match.group(2) is not None) else match.group(3)
-        replacer = REPLACE_MAP[found]
-        return (match.group(1) or '') + replacer + (match.group(4) or '')
+    # base regex: (?i)(\s|>)(words)|(words)(\s|<)|(\s|>)(words)(\s|<)
+    if match.group(2) is not None:
+        found = match.group(2)
+    elif match.group(3) is not None:
+        found = match.group(3)
+    else:
+        found = match.group(6)
+
+    # if the exact capitalization is not in the map, fall back to the all-lowercase key
+    replacer = REPLACE_MAP.get(found) or REPLACE_MAP.get(found.lower())
+
+    return (match.group(1) or match.group(5) or '') + replacer + (match.group(4) or match.group(7) or '')
 
 
 def censor_slurs(body: str, logged_user) -> str:
-    if logged_user and not logged_user.slurreplacer:
-        return body
+    """Censors all the slurs in the body if the user is not logged in or has the slur replacer enabled"""
 
-    for (slur, replace) in SLURS.items():
-        for variation in create_variations_slur_regex(slur):
-            try:
-                body = re.sub(variation, sub_matcher, body)
-            except Exception as e:
-                print(e)
+    if not logged_user or logged_user.slurreplacer:
+        try:
+            body = SLUR_REGEX.sub(sub_matcher, body)
+        except Exception as e:
+            print(e)
 
     return body
diff --git a/test/files/helpers/test_word_censor.py b/test/files/helpers/test_word_censor.py
index 1ee2c89229..b97dca411b 100644
--- a/test/files/helpers/test_word_censor.py
+++ b/test/files/helpers/test_word_censor.py
@@ -4,8 +4,8 @@
 from unittest.mock import patch
 from assertpy import assert_that
 from files.helpers import word_censor
-from files.helpers.word_censor import create_variations_slur_regex, create_replace_map, censor_slurs, sub_matcher, \
-    get_permutations_slur, first_upper, first_all_upper
+from files.helpers.word_censor import create_replace_map, censor_slurs, sub_matcher, \
+    get_permutations_slur, first_upper, first_all_upper, create_slur_regex
 
 
 def test_first_upper():
@@ -45,34 +45,16 @@ def test_get_permutations_slur_wiht_link_replacer():
     assert_that(result).is_equal_to(expected)
 
 
-def test_create_variations_slur_regex_for_slur_with_spaces():
-    expected = [r"(\s|>)(retard)(\s|<)",
-                r"(\s|>)(Retard)(\s|<)",
-                r"(\s|>)(RETARD)(\s|<)"]
+@patch("files.helpers.word_censor.SLURS", {
+    "kill yourself": "keep yourself safe",
+    "faggot": "cute twink",
+    " nig ": "🏀",
+    " retard ": "r-slur",
+})
+def test_create_slur_regex():
+    expected = r"(?i)(\s|>)(kill yourself|faggot)|(kill yourself|faggot)(\s|<)|(\s|>)(nig|retard)(\s|<)"
 
-    result = create_variations_slur_regex(" retard ")
-
-    assert_that(result).is_length(3).contains_only(*expected)
-
-
-def test_create_variations_slur_regex_single_word():
-    expected = [r"(\s|>)(retard)|(retard)(\s|<)",
-                r"(\s|>)(Retard)|(Retard)(\s|<)",
-                r"(\s|>)(RETARD)|(RETARD)(\s|<)"]
-
-    result = create_variations_slur_regex("retard")
-
-    assert_that(result).is_length(3).contains_only(*expected)
-
-
-def test_create_variations_slur_regex_multiple_word():
-    expected = [r"(\s|>)(kill yourself)|(kill yourself)(\s|<)",
-                r"(\s|>)(Kill yourself)|(Kill yourself)(\s|<)",
-                r"(\s|>)(Kill Yourself)|(Kill Yourself)(\s|<)",
-                r"(\s|>)(KILL YOURSELF)|(KILL YOURSELF)(\s|<)"]
-    result = create_variations_slur_regex("kill yourself")
-
-    assert_that(result).is_length(4).contains_only(*expected)
+    assert_that(create_slur_regex()).is_equal_to(re.compile(expected))
 
 
 @patch("files.helpers.word_censor.SLURS", {
@@ -107,19 +89,24 @@ def test_create_replace_map():
     assert_that(result).is_equal_to(expected)
 
 
-@patch("files.helpers.word_censor.REPLACE_MAP", {'retard': 'r-slur', 'NIG': '🏀'})
+@patch("files.helpers.word_censor.REPLACE_MAP", {'retard': 'r-slur', 'Faggot': 'Cute twink', 'NIG': '🏀'})
 def test_sub_matcher():
-    match = re.search(r"(\s|>)(retard)|(retard)(\s|<)", "<p>retard</p>")
+    regex = re.compile(r"(?i)(\s|>)(kill yourself|retard)|(kill yourself|retard)(\s|<)|(\s|>)(nig|faggot)(\s|<)")
+
+    match = regex.search("<p>retard</p>")
     assert_that(sub_matcher(match)).is_equal_to(">r-slur")
 
-    match = re.search(r"(\s|>)(retard)|(retard)(\s|<)", "<p>noretard</p>")
+    match = regex.search("<p>noretard</p>")
    assert_that(sub_matcher(match)).is_equal_to("r-slur<")
 
-    match = re.search(r"(\s|>)(NIG)(\s|<)", "<p>NIG</p>")
+    match = regex.search("<p>ReTaRdEd</p>")
+    assert_that(sub_matcher(match)).is_equal_to(">r-slur")
+
+    match = regex.search("<p>NIG</p>")
     assert_that(sub_matcher(match)).is_equal_to(">🏀<")
 
-    match = re.search(r"(\s|>)(NIG)(\s|<)", "<p>NIG </p>")
-    assert_that(sub_matcher(match)).is_equal_to(">🏀 ")
+    match = regex.search("<p>Faggot </p>")
+    assert_that(sub_matcher(match)).is_equal_to(">Cute twink ")
 
 
 @patch("files.helpers.word_censor.SLURS", {
@@ -131,15 +118,16 @@
 })
 def test_censor_slurs():
     word_censor.REPLACE_MAP = create_replace_map()
+    word_censor.SLUR_REGEX = create_slur_regex()
 
     assert_that(censor_slurs("<p>retard</p>", None)).is_equal_to("<p>r-slur</p>")
     assert_that(censor_slurs("<p>preretard</p>", None)).is_equal_to("<p>prer-slur</p>")
     assert_that(censor_slurs("that is Retarded like", None)).is_equal_to("that is R-slured like")
     assert_that(censor_slurs("that is SUPERRETARD like", None)).is_equal_to("that is SUPERR-SLUR like")
+    assert_that(censor_slurs('... ReTaRd ...', None)).is_equal_to('... r-slur ...')
     assert_that(censor_slurs("<p>Manlets get out!</p>", None)).is_equal_to("<p>Little kings get out!</p>")
 
     assert_that(censor_slurs('... "retard" ...', None)).is_equal_to('... "retard" ...')
-    assert_that(censor_slurs('... ReTaRd ...', None)).is_equal_to('... ReTaRd ...')
     assert_that(censor_slurs('... xretardx ...', None)).is_equal_to('... xretardx ...')
 
     assert_that(censor_slurs("LLM is a manlet hehe", None)).is_equal_to("LLM is a little king hehe")
@@ -155,11 +143,9 @@
     assert_that(censor_slurs('... I hate Carp ...', None)).is_equal_to('... I love Carp ...')
     assert_that(censor_slurs('... I Hate Carp ...', None)).is_equal_to('... I Love Carp ...')
     assert_that(censor_slurs('... I HATE CARP ...', None)).is_equal_to('... I LOVE CARP ...')
-
-    # Not covered:
-    assert_that(censor_slurs('... I Hate carp ...', None)).is_equal_to('... I Hate carp ...')
-    assert_that(censor_slurs('... i Hate Carp ...', None)).is_equal_to('... i Hate Carp ...')
-    assert_that(censor_slurs('... i Hate carp ...', None)).is_equal_to('... i Hate carp ...')
+    assert_that(censor_slurs('... I Hate carp ...', None)).is_equal_to('... i love Carp ...')
+    assert_that(censor_slurs('... i Hate Carp ...', None)).is_equal_to('... i love Carp ...')
+    assert_that(censor_slurs('... i Hate carp ...', None)).is_equal_to('... i love Carp ...')
     assert_that(censor_slurs('... i hate a carp ...', None)).is_equal_to('... i hate a carp ...')