From da82ea81d8eb0f29bffda1ce8ad2cbc30844e7b6 Mon Sep 17 00:00:00 2001
From: TLSM <duolsm@outlook.com>
Date: Tue, 7 Jun 2022 21:16:58 -0400
Subject: [PATCH] Improve slur replacer performance.

h/t @official-techsupport for digging into the regex performance and
coming up with a pattern that greatly reduces backtracking. We see an
approximately 2x speedup under typical loads, which is a major overall
performance saving. Previously, censor_slurs was by far the most
time-consuming function in the codebase under typical loads, second
only to ORM DB accesses. It's still not ideal, but it is much better.

Future options to further improve this critical path:
  1) Precompute the slur-replaced HTML once, rather than recomputing
     it on each pageload. Storage is cheap.
  2) Tokenize the HTML and replace plaintext words using O(1)
     exact-match lookups in a dict.
---
 files/helpers/const.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/files/helpers/const.py b/files/helpers/const.py
index 00e55170b..be60601f6 100644
--- a/files/helpers/const.py
+++ b/files/helpers/const.py
@@ -899,17 +899,21 @@ email_regex = re.compile('[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}', flags=re.A|re.
 utm_regex = re.compile('utm_[a-z]+=[a-z0-9_]+&', flags=re.A)
 utm_regex2 = re.compile('[?&]utm_[a-z]+=[a-z0-9_]+', flags=re.A)
 
-slur_regex = re.compile(f"({single_words})(?![^<]*>)", flags=re.I|re.A)
-slur_regex_upper = re.compile(f"({single_words.upper()})(?![^<]*>)", flags=re.A)
+slur_regex = re.compile(f"(<[^>]*>)|({single_words})", flags=re.I|re.A)
+slur_regex_upper = re.compile(f"(<[^>]*>)|({single_words.upper()})", flags=re.A)
 torture_regex = re.compile('(^|\s)(i|me) ', flags=re.I|re.A)
 torture_regex2 = re.compile("(^|\s)i'm ", flags=re.I|re.A)
 torture_regex_exclude = re.compile('^\s*>', flags=re.A)
 
-def sub_matcher(match):
-	return SLURS[match.group(0).lower()]
+def sub_matcher(match, upper=False):
+	if match.group(1):
+		return match.group(1)
+	else: # implies match.group(2)
+		repl = SLURS[match.group(2).lower()]
+		return repl if not upper else repl.upper()
 
 def sub_matcher_upper(match):
-	return SLURS[match.group(0).lower()].upper()
+	return sub_matcher(match, upper=True)
 
 def censor_slurs(body, logged_user):
 	if not logged_user or logged_user == 'chat' or logged_user.slurreplacer: