Do the replacement in a single regex

2021-10-19 00:24:15 +02:00 · 2021-10-19 00:24:15 +02:00 · ff76a4d688
parent af8da42c73
commit ff76a4d688
2 changed files with 60 additions and 69 deletions
--- a/files/helpers/word_censor.py
+++ b/files/helpers/word_censor.py
@@ -1,7 +1,7 @@
 from collections import ChainMap
 import re
 from re import Match
-from typing import List, Dict
+from typing import Dict, Pattern

 from files.helpers.const import SLURS

@@ -39,6 +39,18 @@ def get_permutations_slur(slur: str, replacer: str = "_") -> Dict[str, str]:
    return result


+def create_slur_regex() -> Pattern[str]:
+    # words that can have suffixes and prefixes
+    words = "|".join([slur.lower() for slur in SLURS.keys() if not slur.startswith(" ")])
+
+    regex = rf"(\s|>)({words})|({words})(\s|<)"
+
+    # words that need to match exactly
+    single_words = "|".join([slur.strip().lower() for slur in SLURS.keys() if slur.startswith(" ")])
+
+    return re.compile(rf"(?i){regex}|(\s|>)({single_words})(\s|<)")
+
+
 def create_replace_map() -> Dict[str, str]:
    """Creates the map that will be used to get the mathing replaced for the given slur"""
    dicts = [get_permutations_slur(slur, replacer) for (slur, replacer) in SLURS.items()]
@@ -47,41 +59,34 @@ def create_replace_map() -> Dict[str, str]:
    return dict(ChainMap(*dicts))


+SLUR_REGEX = create_slur_regex()
 REPLACE_MAP = create_replace_map()


-def create_variations_slur_regex(slur: str) -> List[str]:
-    """For a given match generates the corresponding replacer"""
-    permutations = get_permutations_slur(slur)
-
-    if slur.startswith(" ") and slur.endswith(" "):
-        return [rf"(\s|>)({perm})(\s|<)" for perm in permutations.keys()]
-    else:
-        return [rf"(\s|>)({perm})|({perm})(\s|<)" for perm in permutations.keys()]
-
-
 def sub_matcher(match: Match) -> str:
-    # special case when it should match exact word
-    if len(match.groups()) == 3:
-        found = match.group(2)
-        replacer = REPLACE_MAP[found]
-        return match.group(1) + replacer + match.group(3)
+    """Given a regex match, returns the replacer wrapped in the delimiters captured around the slur"""

-    else:  # normal case with prefix or suffix
-        found = match.group(2) if (match.group(2) is not None) else match.group(3)
-        replacer = REPLACE_MAP[found]
-        return (match.group(1) or '') + replacer + (match.group(4) or '')
+    # base regex: (?i)(\s|>)(words)|(words)(\s|<)|(\s|>)(single_words)(\s|<)
+    if match.group(2) is not None:
+        found = match.group(2)
+    elif match.group(3) is not None:
+        found = match.group(3)
+    else:
+        found = match.group(6)
+
+    # if the exact capitalization is not in the map, fall back to the all-lowercase key
+    replacer = REPLACE_MAP.get(found) or REPLACE_MAP.get(found.lower())
+
+    return (match.group(1) or match.group(5) or '') + replacer + (match.group(4) or match.group(7) or '')


 def censor_slurs(body: str, logged_user) -> str:
-    if logged_user and not logged_user.slurreplacer:
-        return body
+    """Censors all the slurs in the body if the user is not logged in or if they have the slurreplacer active"""

-    for (slur, replace) in SLURS.items():
-        for variation in create_variations_slur_regex(slur):
-            try:
-                body = re.sub(variation, sub_matcher, body)
-            except Exception as e:
-                print(e)
+    if not logged_user or logged_user.slurreplacer:
+        try:
+            body = SLUR_REGEX.sub(sub_matcher, body)
+        except Exception as e:
+            print(e)

    return body
--- a/test/files/helpers/test_word_censor.py
+++ b/test/files/helpers/test_word_censor.py
@@ -4,8 +4,8 @@ from unittest.mock import patch
 from assertpy import assert_that

 from files.helpers import word_censor
-from files.helpers.word_censor import create_variations_slur_regex, create_replace_map, censor_slurs, sub_matcher, \
-    get_permutations_slur, first_upper, first_all_upper
+from files.helpers.word_censor import create_replace_map, censor_slurs, sub_matcher, \
+    get_permutations_slur, first_upper, first_all_upper, create_slur_regex


 def test_first_upper():
@@ -45,34 +45,16 @@ def test_get_permutations_slur_wiht_link_replacer():
    assert_that(result).is_equal_to(expected)


-def test_create_variations_slur_regex_for_slur_with_spaces():
-    expected = [r"(\s|>)(retard)(\s|<)",
-                r"(\s|>)(Retard)(\s|<)",
-                r"(\s|>)(RETARD)(\s|<)"]
+@patch("files.helpers.word_censor.SLURS", {
+    "kill yourself": "keep yourself safe",
+    "faggot": "cute twink",
+    " nig ": "🏀",
+    " retard ": "r-slur",
+})
+def test_create_slur_regex():
+    expected = r"(?i)(\s|>)(kill yourself|faggot)|(kill yourself|faggot)(\s|<)|(\s|>)(nig|retard)(\s|<)"

-    result = create_variations_slur_regex(" retard ")
-
-    assert_that(result).is_length(3).contains_only(*expected)
-
-
-def test_create_variations_slur_regex_single_word():
-    expected = [r"(\s|>)(retard)|(retard)(\s|<)",
-                r"(\s|>)(Retard)|(Retard)(\s|<)",
-                r"(\s|>)(RETARD)|(RETARD)(\s|<)"]
-
-    result = create_variations_slur_regex("retard")
-
-    assert_that(result).is_length(3).contains_only(*expected)
-
-
-def test_create_variations_slur_regex_multiple_word():
-    expected = [r"(\s|>)(kill yourself)|(kill yourself)(\s|<)",
-                r"(\s|>)(Kill yourself)|(Kill yourself)(\s|<)",
-                r"(\s|>)(Kill Yourself)|(Kill Yourself)(\s|<)",
-                r"(\s|>)(KILL YOURSELF)|(KILL YOURSELF)(\s|<)"]
-    result = create_variations_slur_regex("kill yourself")
-
-    assert_that(result).is_length(4).contains_only(*expected)
+    assert_that(create_slur_regex()).is_equal_to(re.compile(expected))


 @patch("files.helpers.word_censor.SLURS", {
@@ -107,19 +89,24 @@ def test_create_replace_map():
    assert_that(result).is_equal_to(expected)


-@patch("files.helpers.word_censor.REPLACE_MAP", {'retard': 'r-slur', 'NIG': '🏀'})
+@patch("files.helpers.word_censor.REPLACE_MAP", {'retard': 'r-slur', 'Faggot': 'Cute twink', 'NIG': '🏀'})
 def test_sub_matcher():
-    match = re.search(r"(\s|>)(retard)|(retard)(\s|<)", "<p>retard</p>")
+    regex = re.compile(r"(?i)(\s|>)(kill yourself|retard)|(kill yourself|retard)(\s|<)|(\s|>)(nig|faggot)(\s|<)")
+
+    match = regex.search("<p>retard</p>")
    assert_that(sub_matcher(match)).is_equal_to(">r-slur")

-    match = re.search(r"(\s|>)(retard)|(retard)(\s|<)", "<p>noretard</p>")
+    match = regex.search("<p>noretard</p>")
    assert_that(sub_matcher(match)).is_equal_to("r-slur<")

-    match = re.search(r"(\s|>)(NIG)(\s|<)", "<p>NIG</p>")
+    match = regex.search("<p>ReTaRdEd</p>")
+    assert_that(sub_matcher(match)).is_equal_to(">r-slur")
+
+    match = regex.search("<p>NIG</p>")
    assert_that(sub_matcher(match)).is_equal_to(">🏀<")

-    match = re.search(r"(\s|>)(NIG)(\s|<)", "<p>NIG </p>")
-    assert_that(sub_matcher(match)).is_equal_to(">🏀 ")
+    match = regex.search("<p>Faggot </p>")
+    assert_that(sub_matcher(match)).is_equal_to(">Cute twink ")


 @patch("files.helpers.word_censor.SLURS", {
@@ -131,15 +118,16 @@ def test_sub_matcher():
 })
 def test_censor_slurs():
    word_censor.REPLACE_MAP = create_replace_map()
+    word_censor.SLUR_REGEX = create_slur_regex()

    assert_that(censor_slurs("<p>retard</p>", None)).is_equal_to("<p>r-slur</p>")
    assert_that(censor_slurs("<p>preretard</p>", None)).is_equal_to("<p>prer-slur</p>")
    assert_that(censor_slurs("that is Retarded like", None)).is_equal_to("that is R-slured like")
    assert_that(censor_slurs("that is SUPERRETARD like", None)).is_equal_to("that is SUPERR-SLUR like")
+    assert_that(censor_slurs('... ReTaRd ...', None)).is_equal_to('... r-slur ...')
    assert_that(censor_slurs("<p>Manlets get out!</p>", None)).is_equal_to("<p>Little kings get out!</p>")

    assert_that(censor_slurs('... "retard" ...', None)).is_equal_to('... "retard" ...')
-    assert_that(censor_slurs('... ReTaRd ...', None)).is_equal_to('... ReTaRd ...')
    assert_that(censor_slurs('... xretardx ...', None)).is_equal_to('... xretardx ...')

    assert_that(censor_slurs("LLM is a manlet hehe", None)).is_equal_to("LLM is a little king hehe")
@@ -155,11 +143,9 @@ def test_censor_slurs():
    assert_that(censor_slurs('... I hate Carp ...', None)).is_equal_to('... I love Carp ...')
    assert_that(censor_slurs('... I Hate Carp ...', None)).is_equal_to('... I Love Carp ...')
    assert_that(censor_slurs('... I HATE CARP ...', None)).is_equal_to('... I LOVE CARP ...')
-
-    # Not covered:
-    assert_that(censor_slurs('... I Hate carp ...', None)).is_equal_to('... I Hate carp ...')
-    assert_that(censor_slurs('... i Hate Carp ...', None)).is_equal_to('... i Hate Carp ...')
-    assert_that(censor_slurs('... i Hate carp ...', None)).is_equal_to('... i Hate carp ...')
+    assert_that(censor_slurs('... I Hate carp ...', None)).is_equal_to('... i love Carp ...')
+    assert_that(censor_slurs('... i Hate Carp ...', None)).is_equal_to('... i love Carp ...')
+    assert_that(censor_slurs('... i Hate carp ...', None)).is_equal_to('... i love Carp ...')

    assert_that(censor_slurs('... i hate a carp ...', None)).is_equal_to('... i hate a carp ...')