From aaf718c78c6e79b2c285e4cadfa2d13e6ec0e478 Mon Sep 17 00:00:00 2001
From: TLSM
Date: Sat, 25 Jun 2022 01:28:43 -0400
Subject: [PATCH] Fix timeout in sanitize from link_fix_regex.

h/t to @official-techsupport for finding and helping fix this bug.

When given certain pathological input, `sanitize` would time out
(notably only on posts, rather than comments, perhaps due to the longer
maximum input length). For example, using as input the result of:

    with open("test.txt", "w") as f:
        for i in range(26):
            f.write(f":{chr(ord('a') + i)}: ")
        f.write('x' * 20_000)

We believe this was caused by some combination of the greedy quantifiers
and the negative lookahead preceding the match. The regex was rewritten
to have (in theory) roughly linear performance.
---
 files/helpers/regex.py    | 2 +-
 files/helpers/sanitize.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/files/helpers/regex.py b/files/helpers/regex.py
index ff94f92ca..45e949e73 100644
--- a/files/helpers/regex.py
+++ b/files/helpers/regex.py
@@ -76,7 +76,7 @@ yt_id_regex = re.compile('[a-z0-9-_]{5,20}', flags=re.I|re.A)
 
 image_regex = re.compile("(^|\s)(https:\/\/[\w\-.#&/=\?@%;+,:]{5,250}(\.png|\.jpg|\.jpeg|\.gif|\.webp)(\?[\w\-.#&/=\?@%;+,:]*)?)($|\s)", flags=re.I|re.A)
 
-link_fix_regex = re.compile("(?!.*(http|\/))(.*\[[^\]]+\]\()([^)]+\))", flags=re.A)
+link_fix_regex = re.compile("(\[.*?\]\()(?!http|/)(.*?\))", flags=re.A)
 
 css_regex = re.compile('https?:\/\/[\w:~,()\-.#&\/=?@%;+]*', flags=re.I|re.A)
 
diff --git a/files/helpers/sanitize.py b/files/helpers/sanitize.py
index a9cff03c0..f524897eb 100644
--- a/files/helpers/sanitize.py
+++ b/files/helpers/sanitize.py
@@ -182,7 +182,7 @@ def sanitize(sanitized, edit=False):
 
 	sanitized = image_check_regex.sub(r'\1', sanitized)
 
-	sanitized = link_fix_regex.sub(r'\2https://\3', sanitized)
+	sanitized = link_fix_regex.sub(r'\1https://\2', sanitized)
 
 	sanitized = markdown(sanitized)
 
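
Note (not part of the patch): the scaling claim can be sanity-checked with a
small benchmark. The sketch below is illustrative; the `link_fix_regex_old`
and `link_fix_regex_new` names, the `pathological_input` helper, and the input
sizes are our own, with the two patterns copied verbatim from the diff above.
Under the commit's analysis, the old pattern re-scans the remainder of the
string at every starting offset (once for the `(?!.*(http|\/))` lookahead and
again while backtracking the greedy `.*`), so its failed-match work should grow
roughly quadratically with input length, while the new pattern gives up after
a bounded amount of work at each offset that does not begin with `[`.

    import re
    import timeit

    # Patterns copied verbatim from the diff; the _old/_new names are ours.
    link_fix_regex_old = re.compile(r"(?!.*(http|\/))(.*\[[^\]]+\]\()([^)]+\))", flags=re.A)
    link_fix_regex_new = re.compile(r"(\[.*?\]\()(?!http|/)(.*?\))", flags=re.A)

    def pathological_input(n):
        # Same shape as the commit-message reproducer: 26 emoji-style
        # tokens followed by a long run of filler that never matches.
        head = "".join(f":{chr(ord('a') + i)}: " for i in range(26))
        return head + "x" * n

    for n in (2_000, 4_000, 8_000):
        text = pathological_input(n)
        # Group numbers differ between the two patterns (hence \2/\3 vs
        # \1/\2), mirroring the sanitize.py change above.
        t_old = timeit.timeit(lambda: link_fix_regex_old.sub(r"\2https://\3", text), number=1)
        t_new = timeit.timeit(lambda: link_fix_regex_new.sub(r"\1https://\2", text), number=1)
        print(f"n={n}: old={t_old:.3f}s new={t_new:.5f}s")

If the analysis holds, doubling n should roughly quadruple the old pattern's
time while leaving the new pattern's nearly flat; exact timings will vary by
machine and Python version. The sizes here are kept well below the
20,000-character reproducer so the old pattern slows down measurably without
hanging outright.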