Fix timeout in `sanitize` caused by `link_fix_regex`.

h/t to @official-techsupport for finding this bug and helping to fix it.
When given certain pathological input, `sanitize` would time out
(notably only on posts, rather than comments, perhaps due to the
longer maximum length of input). For example, it timed out when given
the contents of the file written by:

    with open("test.txt", "w") as f:
        for i in range(26):
            f.write(f":{chr(ord('a') + i)}: ")
        f.write('x' * 20_000)

We believe this was caused by some combination of the greedy
quantifiers and the negative lookahead at the start of the pattern.
The regex was rewritten so that its running time should (in theory)
scale much closer to linearly with the length of the input.
master
Snakes 2022-06-25 01:28:43 -04:00
parent 70c2b2cffa
commit aaf718c78c
2 changed files with 2 additions and 2 deletions

View File

@@ -76,7 +76,7 @@ yt_id_regex = re.compile('[a-z0-9-_]{5,20}', flags=re.I|re.A)
 image_regex = re.compile("(^|\s)(https:\/\/[\w\-.#&/=\?@%;+,:]{5,250}(\.png|\.jpg|\.jpeg|\.gif|\.webp)(\?[\w\-.#&/=\?@%;+,:]*)?)($|\s)", flags=re.I|re.A)
-link_fix_regex = re.compile("(?!.*(http|\/))(.*\[[^\]]+\]\()([^)]+\))", flags=re.A)
+link_fix_regex = re.compile("(\[.*?\]\()(?!http|/)(.*?\))", flags=re.A)
 css_regex = re.compile('https?:\/\/[\w:~,()\-.#&\/=?@%;+]*', flags=re.I|re.A)

View File

@@ -182,7 +182,7 @@ def sanitize(sanitized, edit=False):
 sanitized = image_check_regex.sub(r'\1', sanitized)
-sanitized = link_fix_regex.sub(r'\2https://\3', sanitized)
+sanitized = link_fix_regex.sub(r'\1https://\2', sanitized)
 sanitized = markdown(sanitized)