diff --git a/files/classes/submission.py b/files/classes/submission.py index bff89c6e0..cedb8594c 100644 --- a/files/classes/submission.py +++ b/files/classes/submission.py @@ -473,13 +473,14 @@ class Submission(Base): @property @lazy def is_video(self): - return self.url and any((self.url.lower().endswith(x) for x in ('.mp4','.webm','.mov'))) + return self.url and any((self.url.lower().endswith(x) for x in ('.mp4','.webm','.mov'))) and video_regex.fullmatch(self.url) @property @lazy def is_image(self): - if self.url: return self.url.lower().endswith('.webp') or self.url.lower().endswith('.jpg') or self.url.lower().endswith('.png') or self.url.lower().endswith('.gif') or self.url.lower().endswith('.jpeg') or self.url.lower().endswith('?maxwidth=9999') or self.url.lower().endswith('&fidelity=high') - else: return False + if self.url and (self.url.lower().endswith('.webp') or self.url.lower().endswith('.jpg') or self.url.lower().endswith('.png') or self.url.lower().endswith('.gif') or self.url.lower().endswith('.jpeg') or self.url.lower().endswith('?maxwidth=9999') or self.url.lower().endswith('&fidelity=high')) and embed_check_regex.fullmatch(self.url): + return True + return False @lazy def active_flags(self, v): return len(self.flags(v)) \ No newline at end of file diff --git a/files/helpers/const.py b/files/helpers/const.py index 3b17fd32f..6c7811624 100644 --- a/files/helpers/const.py +++ b/files/helpers/const.py @@ -692,10 +692,6 @@ poll_regex = re.compile("\s*\$\$([^\$\n]+)\$\$\s*", flags=re.A) bet_regex = re.compile("\s*\$\$\$([^\$\n]+)\$\$\$\s*", flags=re.A) choice_regex = re.compile("\s*&&([^\$\n]+)&&\s*", flags=re.A) -embed_removing_regex = re.compile('!\[\]\((.*?)\)', flags=re.A) - -image_check_regex = re.compile('!\[\]\(((?![^?\n]*\.(png|jpg|jpeg|gif|webp)).*?)\)', flags=re.I|re.A) - title_regex = re.compile("[^\w ]", flags=re.A) based_regex = re.compile("based and (.{1,20}?)(-| )pilled", flags=re.I|re.A) @@ -705,13 +701,11 @@ controversial_regex = re.compile('["> ](https:\/\/old\.reddit\.com/r/[a-zA-Z0-9_ fishylinks_regex = re.compile("https?://\S+", flags=re.A) spoiler_regex = re.compile('''\|\|(.+)\|\|''', flags=re.A) -video_regex = re.compile('

(https:\/\/[\w\-.#&/=\?@%;+]{5,250}\.(mp4|webm|mov))<\/a><\/p>', flags=re.I|re.A) -unlinked_regex = re.compile('''(^|\s|

)(https:\/\/[\w\-.#&/=\?@%;+]{5,250})''', flags=re.A) -imgur_regex = re.compile('(https://i\.imgur\.com/([a-z0-9]+))\.(jpg|png|jpeg|webp)(?!)', flags=re.I|re.A) reddit_regex = re.compile('(^|\s|

)\/?((r|u)\/(\w|-){3,25})', flags=re.A) sub_regex = re.compile('(^|\s|

)\/?(h\/(\w|-){3,25})', flags=re.A) -youtube_regex = regex.compile('(?)https:\/\/youtube\.com\/watch\?v\=([a-z0-9-_]{5,20})[\w\-.#&/=\?@%+]*', flags=regex.I|regex.A) +imgur_regex = re.compile('(https://i\.imgur\.com/([a-z0-9]+))\.(jpg|png|jpeg|webp)(?!<\/(code|pre|a)>)', flags=re.I|re.A) +youtube_regex = regex.compile('(?)https:\/\/youtube\.com\/watch\?v\=([a-z0-9-_]{5,20})[\w\-.#&/=\?@%+]*', flags=regex.I|regex.A) yt_id_regex = re.compile('[a-z0-9-_]{5,20}', flags=re.I|re.A) strikethrough_regex = re.compile('''~{1,2}([^~]+)~{1,2}''', flags=re.A) @@ -760,4 +754,47 @@ ADMIGGERS = (37696,37697,37749,37833,37838) proxies = {"http":"http://127.0.0.1:18080","https":"http://127.0.0.1:18080"} -blackjack = environ.get("BLACKJACK", "").strip() \ No newline at end of file +blackjack = environ.get("BLACKJACK", "").strip() + +approved_embed_hosts = [ + 'i.imgur.com', + 'i\.ibb\.co', + 'pomf2\.lain\.la', + 'pngfind\.com', + 'i\.kym-cdn\.com', + 'i2\.kym-cdn\.com', + 'i\.redd\.it', + 'cdn\.substack\.com', + 'cdn\.discordapp\.com', + '2\.bp\.blogspot\.com', + 'files\.catbox\.moe', + 'i\.pinimg\.com', + 'kindpng\.com', + 'cdn\.shopify\.com', + 'media\.discordapp\.net', + 'pbs\.twimg\.com', + 'upload\.wikimedia\.org', + 'i0\.wp\.com', + 'seekpng\.com', + 'i\.dailymail\.co\.uk', + 'de\.catbox\.moe', + 'www\.cdc\.gov', + 'm\.media-amazon\.com', + 'www\.washingtonpost\.com', + 'i\.imgflip\.com', + 'farm2\.static\.flickr\.com', + 'img-9gag-fun\.9cache\.com', + 'i\.ytimg\.com', + 'a57\.foxnews\.com', + 'external-content\.duckduckgo\.com', + 'blogs-images\.forbes\.com', + 'images\.gr-assets\.com' + ] + +hosts = "|".join(approved_embed_hosts) + +image_check_regex = re.compile(f'!\[\]\(((?!https:\/\/({hosts})\/).*?)\)', flags=re.A) + +embed_check_regex = regex.compile(f'(?)https:\/\/({hosts})\/[\w:~,()\-.#&\/=?@%;+]*', flags=regex.A) + +video_regex = regex.compile(f'((?)https:\/\/({hosts})\/[\w:~,()\-.#&\/=?@%;+]*?\.(mp4|webm|mov))', flags=regex.A) \ No newline at end of file diff --git a/files/helpers/sanitize.py b/files/helpers/sanitize.py index 71428d0f6..166a8af6b 100644 --- a/files/helpers/sanitize.py +++ b/files/helpers/sanitize.py @@ -1,6 +1,6 @@ import bleach from bs4 import BeautifulSoup -from bleach.linkifier import LinkifyFilter +from bleach.linkifier import LinkifyFilter, build_url_re from functools import partial from .get import * from .patter import pat @@ -13,6 +13,59 @@ import signal import time import requests +TLDS = ['ac','ad','ae','aero','af','ag','ai','al','am','an','ao','aq','ar','arpa','as','asia','at','au','aw','ax','az','ba','bb','bd','be','bf','bg','bh','bi','biz','bj','bm','bn','bo','br','bs','bt','bv','bw','by','bz','ca','cafe','cat','cc','cd','cf','cg','ch','ci','ck','cl','club','cm','cn','co','com','coop','cr','cu','cv','cx','cy','cz','de','dj','dk','dm','do','dz','ec','edu','ee','eg','er','es','et','eu','fi','fj','fk','fm','fo','fr','ga','gb','gd','ge','gf','gg','gh','gi','gl','gm','gn','gov','gp','gq','gr','gs','gt','gu','gw','gy','hk','hm','hn','hr','ht','hu','id','ie','il','im','in','info','int','io','iq','ir','is','it','je','jm','jo','jobs','jp','ke','kg','kh','ki','km','kn','kp','kr','kw','ky','kz','la','lb','lc','li','lk','lr','ls','lt','lu','lv','ly','ma','mc','md','me','mg','mh','mil','mk','ml','mm','mn','mo','mobi','mp','mq','mr','ms','mt','mu','museum','mv','mw','mx','my','mz','na','name','nc','ne','net','nf','ng','ni','nl','no','np','nr','nu','nz','om','org','pa','pe','pf','pg','ph','pk','pl','pm','pn','post','pr','pro','ps','pt','pw','py','qa','re','ro','rs','ru','rw','sa','sb','sc','sd','se','sg','sh','si','sj','sk','sl','sm','sn','so','social','sr','ss','st','su','sv','sx','sy','sz','tc','td','tel','tf','tg','th','tj','tk','tl','tm','tn','to','tp','tr','travel','tt','tv','tw','tz','ua','ug','uk','us','uy','uz','va','vc','ve','vg','vi','vn','vu','wf','win','ws','xn','xxx','ye','yt','yu','za','zm','zw'] + +allowed_tags = ['b','blockquote','br','code','del','em','h1','h2','h3','h4','h5','h6','hr','i','li','ol','p','pre','strong','sub','sup','table','tbody','th','thead','td','tr','ul','marquee','a','span','ruby','rp','rt','spoiler','img','lite-youtube','video','source'] + +def allowed_attributes(tag, name, value): + + if name == 'style': return True + + if tag == 'marquee': + if name in ['direction', 'behavior', 'scrollamount']: return True + if name in {'height', 'width'}: + try: value = int(value.replace('px', '')) + except: return False + if 0 < value <= 250: return True + return False + + if tag == 'a': + if name == 'href': return True + if name == 'rel' and value == 'nofollow noopener noreferrer': return True + if name == 'target' and value == '_blank': return True + return False + + if tag == 'img': + if name in ['src','data-src']: + if value.startswith('/') or embed_check_regex.fullmatch(value): return True + else: return False + + if name == 'loading' and value == 'lazy': return True + if name == 'referrpolicy' and value == 'no-referrer': return True + if name == 'data-bs-toggle' and value == 'tooltip': return True + if name in ['alt','title','g','b']: return True + return False + + if tag == 'lite-youtube': + if name == 'params' and value.startswith('autoplay=1&modestbranding=1'): return True + if name == 'videoid': return True + return False + + if tag == 'video': + if name == 'controls' and value == '': return True + if name == 'preload' and value == 'none': return True + return False + + if tag == 'source': + return True + return False + + if tag == 'p': + if name == 'class' and value == 'mb-0': return True + return False + + +url_re = build_url_re(tlds=TLDS, protocols=['http', 'https']) def callback(attrs, new=False): href = attrs[(None, "href")] @@ -29,7 +82,7 @@ def handler(signum, frame): raise Exception("Timeout") -def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False): +def sanitize(sanitized, alert=False, comment=False, edit=False): signal.signal(signal.SIGALRM, handler) signal.alarm(1) @@ -176,11 +229,7 @@ def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False): sanitized = sanitized.replace(url, htmlsource) - - sanitized = unlinked_regex.sub(r'\1\2', sanitized) - - if not noimages: - sanitized = video_regex.sub(r'

', sanitized) + sanitized = video_regex.sub(r'', sanitized) if comment: for marsey in g.db.query(Marsey).filter(Marsey.name.in_(marseys_used)).all(): @@ -199,61 +248,12 @@ def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False): sanitized = sanitized.replace('','').replace('','') - allowed_tags = ['b','blockquote','br','code','del','em','h1','h2','h3','h4','h5','h6','hr','i','li','ol','p','pre','strong','sub','sup','table','tbody','th','thead','td','tr','ul','marquee','a','span','ruby','rp','rt','spoiler','img','lite-youtube'] - if not noimages: allowed_tags += ['video','source'] - - - def allowed_attributes(tag, name, value): - - if name == 'style': return True - - if tag == 'marquee': - if name in ['direction', 'behavior', 'scrollamount']: return True - if name in {'height', 'width'}: - try: value = int(value.replace('px', '')) - except: return False - if 0 < value <= 250: return True - return False - - if tag == 'a': - if name == 'href': return True - if name == 'rel' and value == 'nofollow noopener noreferrer': return True - if name == 'target' and value == '_blank': return True - return False - - if tag == 'img': - if name in ['src','data-src'] and not value.startswith('/') and noimages: return False - - if name == 'loading' and value == 'lazy': return True - if name == 'referrpolicy' and value == 'no-referrer': return True - if name == 'data-bs-toggle' and value == 'tooltip': return True - if name in ['src','data-src','alt','title','g','b']: return True - return False - - if tag == 'lite-youtube': - if name == 'params' and value.startswith('autoplay=1&modestbranding=1'): return True - if name == 'videoid': return True - return False - - if tag == 'video': - if name == 'controls' and value == '': return True - if name == 'preload' and value == 'none': return True - return False - - if tag == 'source': - if name == 'src': return True - return False - - if tag == 'p': - if name == 'class' and value == 'mb-0': return True - return False - sanitized = bleach.Cleaner(tags=allowed_tags, attributes=allowed_attributes, protocols=['http', 'https'], styles=['color', 'background-color', 'font-weight', 'text-align'], - filters=[partial(LinkifyFilter, skip_tags=["pre"], parse_email=False, callbacks=[callback])] + filters=[partial(LinkifyFilter, skip_tags=["pre"], parse_email=False, callbacks=[callback], url_re=url_re)] ).clean(sanitized) @@ -266,13 +266,13 @@ def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False): -def allowed_attributes(tag, name, value): +def allowed_attributes_emojis(tag, name, value): if tag == 'img': if name == 'loading' and value == 'lazy': return True if name == 'data-bs-toggle' and value == 'tooltip': return True if name in ['src','alt','title','g']: return True - return False + return False def filter_emojis_only(title, edit=False, graceful=False): @@ -308,7 +308,7 @@ def filter_emojis_only(title, edit=False, graceful=False): title = strikethrough_regex.sub(r'\1', title) - sanitized = bleach.clean(title, tags=['img','del'], attributes=allowed_attributes, protocols=['http','https']) + sanitized = bleach.clean(title, tags=['img','del'], attributes=allowed_attributes_emojis, protocols=['http','https']) signal.alarm(0) diff --git a/files/routes/admin.py b/files/routes/admin.py index 39a3768b4..afdf5b076 100644 --- a/files/routes/admin.py +++ b/files/routes/admin.py @@ -541,7 +541,7 @@ def change_settings(v, setting): body = f"@{v.username} has {word}d `{setting}` in the [admin dashboard](/admin)!" - body_html = sanitize(body, noimages=True) + body_html = sanitize(body) new_comment = Comment(author_id=NOTIFICATIONS_ID, parent_submission=None, diff --git a/files/routes/oauth.py b/files/routes/oauth.py index 51c44cc3f..028ba8c6f 100644 --- a/files/routes/oauth.py +++ b/files/routes/oauth.py @@ -54,7 +54,7 @@ def request_api_keys(v): body = f"@{v.username} has requested API keys for `{request.values.get('name')}`. You can approve or deny the request [here](/admin/apps)." - body_html = sanitize(body, noimages=True) + body_html = sanitize(body) new_comment = Comment(author_id=NOTIFICATIONS_ID, diff --git a/files/routes/posts.py b/files/routes/posts.py index 0db192a6b..1f268bc9a 100644 --- a/files/routes/posts.py +++ b/files/routes/posts.py @@ -727,7 +727,7 @@ def thumbnail_thread(pid): if i["subreddit"] == 'PokemonGoRaids': continue - body_html = sanitize(f'New site mention: https://old.reddit.com{i["permalink"]}?context=89', noimages=True) + body_html = sanitize(f'New site mention: https://old.reddit.com{i["permalink"]}?context=89') existing_comment = db.query(Comment.id).filter_by(author_id=NOTIFICATIONS_ID, parent_submission=None, body_html=body_html).one_or_none() if existing_comment: break @@ -755,7 +755,7 @@ def thumbnail_thread(pid): except: break for i in data: - body_html = sanitize(f'New mention of you: https://old.reddit.com{i["permalink"]}?context=89', noimages=True) + body_html = sanitize(f'New mention of you: https://old.reddit.com{i["permalink"]}?context=89') existing_comment = db.query(Comment.id).filter_by(author_id=NOTIFICATIONS_ID, parent_submission=None,body_html=body_html).one_or_none() if existing_comment: break @@ -784,7 +784,7 @@ def thumbnail_thread(pid): except: break for i in data: - body_html = sanitize(f'New site mention: https://old.reddit.com{i["permalink"]}?context=89', noimages=True) + body_html = sanitize(f'New site mention: https://old.reddit.com{i["permalink"]}?context=89') existing_comment = db.query(Comment.id).filter_by(author_id=NOTIFICATIONS_ID, parent_submission=None, body_html=body_html).one_or_none() @@ -960,7 +960,7 @@ def submit_post(v, sub=None): Submission.deleted_utc == 0, Submission.is_banned == False ).first() - if repost: return redirect(repost.permalink) + if repost and SITE != 'localhost': return redirect(repost.permalink) domain_obj = get_domain(domain) if not domain_obj: domain_obj = get_domain(domain+parsed_url.path) @@ -1012,7 +1012,7 @@ def submit_post(v, sub=None): Submission.body == body ).one_or_none() - if dup: return redirect(dup.permalink) + if dup and SITE != 'localhost': return redirect(dup.permalink) now = int(time.time()) cutoff = now - 60 * 60 * 24 diff --git a/files/routes/static.py b/files/routes/static.py index 568371b48..67e272ccb 100644 --- a/files/routes/static.py +++ b/files/routes/static.py @@ -301,7 +301,7 @@ def submit_contact(v): if not body: abort(400) body = f'This message has been sent automatically to all admins via [/contact](/contact)\n\nMessage:\n\n' + body - body_html = sanitize(body, noimages=True) + body_html = sanitize(body) if request.files.get("file") and request.headers.get("cf-ipcountry") != "T1": file=request.files["file"] diff --git a/files/routes/users.py b/files/routes/users.py index caf8bb508..e158e3826 100644 --- a/files/routes/users.py +++ b/files/routes/users.py @@ -606,9 +606,7 @@ def message2(v, username): if 'linkedin.com' in message: return {"error": "This domain 'linkedin.com' is banned."}, 403 - message = embed_removing_regex.sub(r'\1', message) - - body_html = sanitize(message, noimages=True) + body_html = sanitize(message) existing = g.db.query(Comment.id).filter(Comment.author_id == v.id, Comment.sentto == user.id, @@ -666,8 +664,6 @@ def messagereply(v): if 'linkedin.com' in message: return {"error": "this domain 'linkedin.com' is banned"} - message = embed_removing_regex.sub(r'\1', message) - id = int(request.values.get("parent_id")) parent = get_comment(id, v=v) user_id = parent.author.id @@ -675,7 +671,7 @@ def messagereply(v): if parent.sentto == 2: user_id = None elif v.id == user_id: user_id = parent.sentto - body_html = sanitize(message, noimages=True) + body_html = sanitize(message) if request.files.get("file") and request.headers.get("cf-ipcountry") != "T1": file=request.files["file"]