Improve slur replacer performance.

h/t @official-techsupport for digging into the regex performance and
coming up with a pattern that greatly reduces backtracking. We see an
approximately 2x speedup under typical loads, which is a major
overall performance saving: previously, censor_slurs was by far the
most time-consuming function in the codebase under typical loads,
second only to ORM DB accesses. It's still not ideal, but it is
much better.

Future options to improve this critical path further would be:
  1) Precompute the slur-replaced HTML, rather than recomputing it
     on each pageload. Storage is cheap.
  2) Tokenize the HTML and replace plaintext words using O(1)
     exact-match lookups to a dict.
remotes/1693045480750635534/spooky-22
Snakes 2022-06-07 21:16:58 -04:00
parent e85d76947a
commit da82ea81d8
1 changed files with 9 additions and 5 deletions

View File

@ -899,17 +899,21 @@ email_regex = re.compile('[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}', flags=re.A|re.
utm_regex = re.compile('utm_[a-z]+=[a-z0-9_]+&', flags=re.A)
utm_regex2 = re.compile('[?&]utm_[a-z]+=[a-z0-9_]+', flags=re.A)
slur_regex = re.compile(f"({single_words})(?![^<]*>)", flags=re.I|re.A)
slur_regex_upper = re.compile(f"({single_words.upper()})(?![^<]*>)", flags=re.A)
slur_regex = re.compile(f"(<[^>]*>)|({single_words})", flags=re.I|re.A)
slur_regex_upper = re.compile(f"(<[^>]*>)|({single_words.upper()})", flags=re.A)
torture_regex = re.compile('(^|\s)(i|me) ', flags=re.I|re.A)
torture_regex2 = re.compile("(^|\s)i'm ", flags=re.I|re.A)
torture_regex_exclude = re.compile('^\s*>', flags=re.A)
def sub_matcher(match):
return SLURS[match.group(0).lower()]
def sub_matcher(match, upper=False):
    """Return the replacement text for a single two-group regex match.

    Written for patterns shaped like ``slur_regex``: group 1 captures a
    whole ``<...>`` tag and group 2 captures a word from ``single_words``.
    A captured tag is handed back unchanged; otherwise the group-2 word is
    lowercased and looked up in ``SLURS``.

    NOTE(review): presumably passed as the ``repl`` callback to
    ``slur_regex.sub`` -- the call site is not visible here; confirm.

    Args:
        match: Match object exposing groups 1 (tag) and 2 (word).
        upper: When true, the looked-up replacement is upper-cased.

    Returns:
        The original tag text, or the (optionally upper-cased) SLURS entry.

    Raises:
        KeyError: If the lowercased word has no entry in ``SLURS``.
    """
    tag = match.group(1)
    if tag:
        return tag

    # No tag was captured, so the match came from the word alternative.
    replacement = SLURS[match.group(2).lower()]
    if upper:
        return replacement.upper()
    return replacement
def sub_matcher_upper(match):
return SLURS[match.group(0).lower()].upper()
return sub_matcher(match, upper=True)
def censor_slurs(body, logged_user):
if not logged_user or logged_user == 'chat' or logged_user.slurreplacer: