Merge pull request #77 from Aevann1/only_matches_full_words

Now the slur replacer only matches full words
2021-10-19 23:13:50 +02:00 · 2021-10-19 23:13:50 +02:00 · b1e2648d3e
parent e2c7a74cdd 3602a6446e
commit b1e2648d3e
3 changed files with 21 additions and 48 deletions
--- a/files/helpers/const.py
+++ b/files/helpers/const.py
@ -5,35 +5,20 @@ site = environ.get("DOMAIN", '').strip()
 #####################
 # Formatting rules: #
 #####################
-#
-# on the slur side, they will match prefixes and suffixes and not middle of words, so for example
-#   "retard" will match:
-#       - "retard"
-#       - "retarded"
-#       - "superretard"
-#       But not "superretarded"
-#
 # If all letters are lowercase then it will match lowercase, first letter up in first or all the words and all letters up
-#    "dancing israelis" will match (with prefixes and suffixes omitted for brevity):
+#    "dancing israelis" will match:
 #       - "dancing israelis"
 #       - "Dancing israelis"
 #       - "Dancing Israelis"
 #       - "DANCING ISRAELIS"
 #
 # If some letters are Uppercase, the same, but with the additional option of the original casing, and respecting already existing uppercase
-#    "NoNewNormal" will match (with prefixes and suffixes omitted for brevity):
+#    "NoNewNormal" will match:
 #       - "NoNewNormal"
 #       - "nonewnormal"
 #       - "Nonewnormal"
 #       - "NONEWNORMAL"
 #
-# If the slur has a space before and after then the match is limited to the exact word, no prefixes or suffixes
-# (previous rules about capitalization still apply)
-#   " neg " will match only:
-#       - "neg"
-#       - "Neg"
-#       - "NEG"
-#
 # Now on the replacement side, The replacement will have the same capitalization as the slur if the replacement is lowercase
 #   "kill yourself" -> "keep yourself safe"
 #   "Kill yourself" -> "Keep yourself safe"
@ -48,7 +33,6 @@ site = environ.get("DOMAIN", '').strip()
 #
 # There is a super special case that if the replacer starts with "http" then it never changes capitalization
 #
-#
 # TL;DR: Just read the above once, or don't, and try to guess!
 SLURS = {
    "faggot": "cute twink",
@ -113,8 +97,6 @@ SLURS = {
    "china flu": "SARS-CoV-2 syndemic",
    "china virus": "SARS-CoV-2 syndemic",
    "kung flu": "SARS-CoV-2 syndemic",
-
-    # if  the word has spaces in the beginning and the end it will only censor this word without prefixes or suffixes
    "nig": "🏀",
    "nigs": "🏀s",
 }
--- a/files/helpers/word_censor.py
+++ b/files/helpers/word_censor.py
@ -40,20 +40,14 @@ def get_permutations_slur(slur: str, replacer: str = "_") -> Dict[str, str]:


 def create_slur_regex() -> Pattern[str]:
-    # words that can have suffixes and prefixes
-    words = "|".join([slur.lower() for slur in SLURS.keys() if not slur.startswith(" ")])
+    """Creates the regex that will find the slurs"""
+    single_words = "|".join([slur.strip().lower() for slur in SLURS.keys()])

-    # to understand the weird groups see: https://www.regular-expressions.info/lookaround.html
-    regex = rf"(?<=\s|>)({words})|({words})(?=\s|<)"
-
-    # words that need to match exactly
-    single_words = "|".join([slur.strip().lower() for slur in SLURS.keys() if slur.startswith(" ")])
-
-    return re.compile(rf"(?i){regex}|(?<=\s|>)({single_words})(?=\s|<)")
+    return re.compile(rf"(?i)(?<=\s|>)({single_words})(?=\s|<)")


 def create_replace_map() -> Dict[str, str]:
-    """Creates the map that will be used to get the mathing replaced for the given slur"""
+    """Creates the map that will be used to get the matching replacement for the given slur"""
    dicts = [get_permutations_slur(slur, replacer) for (slur, replacer) in SLURS.items()]

    # flattens the list of dict to a single dict
--- a/test/files/helpers/test_word_censor.py
+++ b/test/files/helpers/test_word_censor.py
@ -52,7 +52,7 @@ def test_get_permutations_slur_wiht_link_replacer():
    "retard": "r-slur",
 })
 def test_create_slur_regex():
-    expected = r"(?i)(?<=\s|>)(kill yourself|faggot)|(kill yourself|faggot)(?=\s|<)|(?<=\s|>)(nig|retard)(?=\s|<)"
+    expected = r"(?i)(?<=\s|>)(kill yourself|faggot|nig|retard)(?=\s|<)"

    assert_that(create_slur_regex()).is_equal_to(re.compile(expected))

@ -97,10 +97,7 @@ def test_sub_matcher():
    match = regex.search("<p>retard</p>")
    assert_that(sub_matcher(match)).is_equal_to("r-slur")

-    match = regex.search("<p>noretard</p>")
-    assert_that(sub_matcher(match)).is_equal_to("r-slur")
-
-    match = regex.search("<p>ReTaRdEd</p>")
+    match = regex.search("<p>ReTaRd</p>")
    assert_that(sub_matcher(match)).is_equal_to("r-slur")

    match = regex.search("<p>NIG</p>")
@ -122,12 +119,13 @@ def test_censor_slurs():
    word_censor.SLUR_REGEX = create_slur_regex()

    assert_that(censor_slurs("<p>retard</p>", None)).is_equal_to("<p>r-slur</p>")
-    assert_that(censor_slurs("<p>preretard</p>", None)).is_equal_to("<p>prer-slur</p>")
-    assert_that(censor_slurs("that is Retarded like", None)).is_equal_to("that is R-slured like")
-    assert_that(censor_slurs("that is SUPERRETARD like", None)).is_equal_to("that is SUPERR-SLUR like")
    assert_that(censor_slurs('... ReTaRd ...', None)).is_equal_to('... r-slur ...')
-    assert_that(censor_slurs("<p>Manlets get out!</p>", None)).is_equal_to("<p>Little kings get out!</p>")
+    assert_that(censor_slurs("<p>Manlet get out!</p>", None)).is_equal_to("<p>Little king get out!</p>")

+    # intentionally not censored (only full words delimited by whitespace or tags are matched):
+    assert_that(censor_slurs("<p>preretard</p>", None)).is_equal_to("<p>preretard</p>")
+    assert_that(censor_slurs("that is Retarded like", None)).is_equal_to("that is Retarded like")
+    assert_that(censor_slurs("that is SUPERRETARD like", None)).is_equal_to("that is SUPERRETARD like")
    assert_that(censor_slurs('... "retard" ...', None)).is_equal_to('... "retard" ...')
    assert_that(censor_slurs('... xretardx ...', None)).is_equal_to('... xretardx ...')

@ -150,8 +148,7 @@ def test_censor_slurs():

    assert_that(censor_slurs('... i hate a carp ...', None)).is_equal_to('... i hate a carp ...')

-    assert_that(censor_slurs("<p>retarded SuperManlet NIG</p>", None)) \
-        .is_equal_to("<p>r-slured SuperLittle king 🏀</p>")
+    assert_that(censor_slurs("<p>retard Manlet NIG</p>", None)).is_equal_to("<p>r-slur Little king 🏀</p>")

    assert_that(censor_slurs('... kike ...', None)) \
        .is_equal_to('... https://sciencedirect.com/science/article/abs/pii/S016028960600033X ...')
@ -168,7 +165,7 @@ def test_censor_slurs_does_not_error_out_on_exception():
    word_censor.REPLACE_MAP["manlet"] = None
    word_censor.REPLACE_MAP["Manlet"] = None

-    assert_that(censor_slurs(">retarded SuperManlet NIG<", None)).is_equal_to(">r-slured SuperManlet 🏀<")
+    assert_that(censor_slurs(">retard Manlet NIG<", None)).is_equal_to(">r-slur Manlet 🏀<")


@patch("files.helpers.word_censor.SLURS", {'retard': 'r-slur', 'manlet': 'little king'})