From da82ea81d8eb0f29bffda1ce8ad2cbc30844e7b6 Mon Sep 17 00:00:00 2001
From: TLSM
Date: Tue, 7 Jun 2022 21:16:58 -0400
Subject: [PATCH] Improve slur replacer performance.

h/t @official-techsupport for digging into the regex performance and
coming up with one that greatly reduces backtracking.

We see an approximately 2x speedup under typical loads, which proves to
be a major overall savings in performance. Previously, censor_slurs
was, second to ORM DB accesses, by far the most time-consuming function
in the codebase under typical loads.

It's still not ideal, but it is much better. Future options to improve
this critical path further would be:

1) Precompute a slur-replaced HTML, rather than recomputing each
   pageload. Storage is cheap.
2) Tokenize the HTML and replace plaintext words using O(1) exact-match
   lookups to a dict.
---
 files/helpers/const.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/files/helpers/const.py b/files/helpers/const.py
index 00e55170b..be60601f6 100644
--- a/files/helpers/const.py
+++ b/files/helpers/const.py
@@ -899,17 +899,21 @@ email_regex = re.compile('[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}', flags=re.A|re.
 utm_regex = re.compile('utm_[a-z]+=[a-z0-9_]+&', flags=re.A)
 utm_regex2 = re.compile('[?&]utm_[a-z]+=[a-z0-9_]+', flags=re.A)
 
-slur_regex = re.compile(f"({single_words})(?![^<]*>)", flags=re.I|re.A)
-slur_regex_upper = re.compile(f"({single_words.upper()})(?![^<]*>)", flags=re.A)
+slur_regex = re.compile(f"(<[^>]*>)|({single_words})", flags=re.I|re.A)
+slur_regex_upper = re.compile(f"(<[^>]*>)|({single_words.upper()})", flags=re.A)
 torture_regex = re.compile('(^|\s)(i|me) ', flags=re.I|re.A)
 torture_regex2 = re.compile("(^|\s)i'm ", flags=re.I|re.A)
 torture_regex_exclude = re.compile('^\s*>', flags=re.A)
 
-def sub_matcher(match):
-	return SLURS[match.group(0).lower()]
+def sub_matcher(match, upper=False):
+	if match.group(1):
+		return match.group(1)
+	else: # group(1), the <...> tag alternative, did not match, so group(2) (the slur) did
+		repl = SLURS[match.group(2).lower()]
+		return repl if not upper else repl.upper()
 
 def sub_matcher_upper(match):
-	return SLURS[match.group(0).lower()].upper()
+	return sub_matcher(match, upper=True)
 
 def censor_slurs(body, logged_user):
 	if not logged_user or logged_user == 'chat' or logged_user.slurreplacer: