diff --git a/files/classes/submission.py b/files/classes/submission.py index bff89c6e0..cedb8594c 100644 --- a/files/classes/submission.py +++ b/files/classes/submission.py @@ -473,13 +473,14 @@ class Submission(Base): @property @lazy def is_video(self): - return self.url and any((self.url.lower().endswith(x) for x in ('.mp4','.webm','.mov'))) + return self.url and any((self.url.lower().endswith(x) for x in ('.mp4','.webm','.mov'))) and video_regex.fullmatch(self.url) @property @lazy def is_image(self): - if self.url: return self.url.lower().endswith('.webp') or self.url.lower().endswith('.jpg') or self.url.lower().endswith('.png') or self.url.lower().endswith('.gif') or self.url.lower().endswith('.jpeg') or self.url.lower().endswith('?maxwidth=9999') or self.url.lower().endswith('&fidelity=high') - else: return False + if self.url and (self.url.lower().endswith('.webp') or self.url.lower().endswith('.jpg') or self.url.lower().endswith('.png') or self.url.lower().endswith('.gif') or self.url.lower().endswith('.jpeg') or self.url.lower().endswith('?maxwidth=9999') or self.url.lower().endswith('&fidelity=high')) and embed_check_regex.fullmatch(self.url): + return True + return False @lazy def active_flags(self, v): return len(self.flags(v)) \ No newline at end of file diff --git a/files/helpers/const.py b/files/helpers/const.py index 3b17fd32f..6c7811624 100644 --- a/files/helpers/const.py +++ b/files/helpers/const.py @@ -692,10 +692,6 @@ poll_regex = re.compile("\s*\$\$([^\$\n]+)\$\$\s*", flags=re.A) bet_regex = re.compile("\s*\$\$\$([^\$\n]+)\$\$\$\s*", flags=re.A) choice_regex = re.compile("\s*&&([^\$\n]+)&&\s*", flags=re.A) -embed_removing_regex = re.compile('!\[\]\((.*?)\)', flags=re.A) - -image_check_regex = re.compile('!\[\]\(((?![^?\n]*\.(png|jpg|jpeg|gif|webp)).*?)\)', flags=re.I|re.A) - title_regex = re.compile("[^\w ]", flags=re.A) based_regex = re.compile("based and (.{1,20}?)(-| )pilled", flags=re.I|re.A) @@ -705,13 +701,11 @@ controversial_regex = 
re.compile('["> ](https:\/\/old\.reddit\.com/r/[a-zA-Z0-9_ fishylinks_regex = re.compile("https?://\S+", flags=re.A) spoiler_regex = re.compile('''\|\|(.+)\|\|''', flags=re.A) -video_regex = re.compile('
(https:\/\/[\w\-.#&/=\?@%;+]{5,250}\.(mp4|webm|mov))<\/a><\/p>', flags=re.I|re.A)
-unlinked_regex = re.compile('''(^|\s| )(https:\/\/[\w\-.#&/=\?@%;+]{5,250})''', flags=re.A)
-imgur_regex = re.compile('(https://i\.imgur\.com/([a-z0-9]+))\.(jpg|png|jpeg|webp)(?!(code|pre)>)', flags=re.I|re.A)
reddit_regex = re.compile('(^|\s| )\/?((r|u)\/(\w|-){3,25})', flags=re.A)
sub_regex = re.compile('(^|\s| )\/?(h\/(\w|-){3,25})', flags=re.A)
-youtube_regex = regex.compile('(?)https:\/\/youtube\.com\/watch\?v\=([a-z0-9-_]{5,20})[\w\-.#&/=\?@%+]*', flags=regex.I|regex.A)
+imgur_regex = re.compile('(https://i\.imgur\.com/([a-z0-9]+))\.(jpg|png|jpeg|webp)(?!<\/(code|pre|a)>)', flags=re.I|re.A)
+youtube_regex = regex.compile('(?)https:\/\/youtube\.com\/watch\?v\=([a-z0-9-_]{5,20})[\w\-.#&/=\?@%+]*', flags=regex.I|regex.A)
yt_id_regex = re.compile('[a-z0-9-_]{5,20}', flags=re.I|re.A)
strikethrough_regex = re.compile('''~{1,2}([^~]+)~{1,2}''', flags=re.A)
@@ -760,4 +754,47 @@ ADMIGGERS = (37696,37697,37749,37833,37838)
proxies = {"http":"http://127.0.0.1:18080","https":"http://127.0.0.1:18080"}
-blackjack = environ.get("BLACKJACK", "").strip()
\ No newline at end of file
+blackjack = environ.get("BLACKJACK", "").strip()
+
+approved_embed_hosts = [ # regex fragments, "|"-joined into the embed patterns below: every literal dot must be escaped
+ 'i\.imgur\.com',
+ 'i\.ibb\.co',
+ 'pomf2\.lain\.la',
+ 'pngfind\.com',
+ 'i\.kym-cdn\.com',
+ 'i2\.kym-cdn\.com',
+ 'i\.redd\.it',
+ 'cdn\.substack\.com',
+ 'cdn\.discordapp\.com',
+ '2\.bp\.blogspot\.com',
+ 'files\.catbox\.moe',
+ 'i\.pinimg\.com',
+ 'kindpng\.com',
+ 'cdn\.shopify\.com',
+ 'media\.discordapp\.net',
+ 'pbs\.twimg\.com',
+ 'upload\.wikimedia\.org',
+ 'i0\.wp\.com',
+ 'seekpng\.com',
+ 'i\.dailymail\.co\.uk',
+ 'de\.catbox\.moe',
+ 'www\.cdc\.gov',
+ 'm\.media-amazon\.com',
+ 'www\.washingtonpost\.com',
+ 'i\.imgflip\.com',
+ 'farm2\.static\.flickr\.com',
+ 'img-9gag-fun\.9cache\.com',
+ 'i\.ytimg\.com',
+ 'a57\.foxnews\.com',
+ 'external-content\.duckduckgo\.com',
+ 'blogs-images\.forbes\.com',
+ 'images\.gr-assets\.com'
+ ]
+
+hosts = "|".join(approved_embed_hosts)
+
+image_check_regex = re.compile(f'!\[\]\(((?!https:\/\/({hosts})\/).*?)\)', flags=re.A)
+
+embed_check_regex = regex.compile(f'(?)https:\/\/({hosts})\/[\w:~,()\-.#&\/=?@%;+]*', flags=regex.A)
+
+video_regex = regex.compile(f'((?)https:\/\/({hosts})\/[\w:~,()\-.#&\/=?@%;+]*?\.(mp4|webm|mov))', flags=regex.A)
\ No newline at end of file
diff --git a/files/helpers/sanitize.py b/files/helpers/sanitize.py
index 71428d0f6..166a8af6b 100644
--- a/files/helpers/sanitize.py
+++ b/files/helpers/sanitize.py
@@ -1,6 +1,6 @@
import bleach
from bs4 import BeautifulSoup
-from bleach.linkifier import LinkifyFilter
+from bleach.linkifier import LinkifyFilter, build_url_re
from functools import partial
from .get import *
from .patter import pat
@@ -13,6 +13,59 @@ import signal
import time
import requests
+TLDS = ['ac','ad','ae','aero','af','ag','ai','al','am','an','ao','aq','ar','arpa','as','asia','at','au','aw','ax','az','ba','bb','bd','be','bf','bg','bh','bi','biz','bj','bm','bn','bo','br','bs','bt','bv','bw','by','bz','ca','cafe','cat','cc','cd','cf','cg','ch','ci','ck','cl','club','cm','cn','co','com','coop','cr','cu','cv','cx','cy','cz','de','dj','dk','dm','do','dz','ec','edu','ee','eg','er','es','et','eu','fi','fj','fk','fm','fo','fr','ga','gb','gd','ge','gf','gg','gh','gi','gl','gm','gn','gov','gp','gq','gr','gs','gt','gu','gw','gy','hk','hm','hn','hr','ht','hu','id','ie','il','im','in','info','int','io','iq','ir','is','it','je','jm','jo','jobs','jp','ke','kg','kh','ki','km','kn','kp','kr','kw','ky','kz','la','lb','lc','li','lk','lr','ls','lt','lu','lv','ly','ma','mc','md','me','mg','mh','mil','mk','ml','mm','mn','mo','mobi','mp','mq','mr','ms','mt','mu','museum','mv','mw','mx','my','mz','na','name','nc','ne','net','nf','ng','ni','nl','no','np','nr','nu','nz','om','org','pa','pe','pf','pg','ph','pk','pl','pm','pn','post','pr','pro','ps','pt','pw','py','qa','re','ro','rs','ru','rw','sa','sb','sc','sd','se','sg','sh','si','sj','sk','sl','sm','sn','so','social','sr','ss','st','su','sv','sx','sy','sz','tc','td','tel','tf','tg','th','tj','tk','tl','tm','tn','to','tp','tr','travel','tt','tv','tw','tz','ua','ug','uk','us','uy','uz','va','vc','ve','vg','vi','vn','vu','wf','win','ws','xn','xxx','ye','yt','yu','za','zm','zw']
+
+allowed_tags = ['b','blockquote','br','code','del','em','h1','h2','h3','h4','h5','h6','hr','i','li','ol','p','pre','strong','sub','sup','table','tbody','th','thead','td','tr','ul','marquee','a','span','ruby','rp','rt','spoiler','img','lite-youtube','video','source']
+
+def allowed_attributes(tag, name, value):
+    """Attribute filter in the (tag, name, value) form bleach accepts for `attributes=`: truthy keeps the attribute, falsy drops it."""
+    # NOTE(review): any `style` value is accepted on every tag -- confirm CSS is constrained elsewhere.
+    if name == 'style': return True
+    if tag == 'marquee':
+        if name in ['direction', 'behavior', 'scrollamount']: return True
+        if name in {'height', 'width'}:
+            try: value = int(value.replace('px', ''))
+            except ValueError: return False
+            if 0 < value <= 250: return True
+        return False
+
+    if tag == 'a':
+        if name == 'href': return True
+        if name == 'rel' and value == 'nofollow noopener noreferrer': return True
+        if name == 'target' and value == '_blank': return True
+        return False
+
+    if tag == 'img':
+        if name in ['src','data-src']:
+            if value.startswith('/') or embed_check_regex.fullmatch(value): return True
+            else: return False
+
+        if name == 'loading' and value == 'lazy': return True
+        if name in ('referrerpolicy', 'referrpolicy') and value == 'no-referrer': return True # real attribute name, plus the legacy misspelling
+        if name == 'data-bs-toggle' and value == 'tooltip': return True
+        if name in ['alt','title','g','b']: return True
+        return False
+
+    if tag == 'lite-youtube':
+        if name == 'params' and value.startswith('autoplay=1&modestbranding=1'): return True
+        if name == 'videoid': return True
+        return False
+
+    if tag == 'video':
+        if name == 'controls' and value == '': return True
+        if name == 'preload' and value == 'none': return True
+        return False
+
+    # Was an unconditional `return True`, which also let event-handler attributes through on <source>.
+    if tag == 'source':
+        if name in ('src', 'type'): return True
+        return False
+
+    if tag == 'p':
+        if name == 'class' and value == 'mb-0': return True
+        return False
+    return False # tags without a rule above keep no attributes other than `style`
+
+url_re = build_url_re(tlds=TLDS, protocols=['http', 'https'])
def callback(attrs, new=False):
href = attrs[(None, "href")]
@@ -29,7 +82,7 @@ def handler(signum, frame):
raise Exception("Timeout")
-def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False):
+def sanitize(sanitized, alert=False, comment=False, edit=False):
signal.signal(signal.SIGALRM, handler)
signal.alarm(1)
@@ -176,11 +229,7 @@ def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False):
sanitized = sanitized.replace(url, htmlsource)
-
- sanitized = unlinked_regex.sub(r'\1\2', sanitized)
-
- if not noimages:
- sanitized = video_regex.sub(r' ', sanitized)
+ sanitized = video_regex.sub(r'', sanitized)
if comment:
for marsey in g.db.query(Marsey).filter(Marsey.name.in_(marseys_used)).all():
@@ -199,61 +248,12 @@ def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False):
sanitized = sanitized.replace('\1', title)
- sanitized = bleach.clean(title, tags=['img','del'], attributes=allowed_attributes, protocols=['http','https'])
+ sanitized = bleach.clean(title, tags=['img','del'], attributes=allowed_attributes_emojis, protocols=['http','https'])
signal.alarm(0)
diff --git a/files/routes/admin.py b/files/routes/admin.py
index 39a3768b4..afdf5b076 100644
--- a/files/routes/admin.py
+++ b/files/routes/admin.py
@@ -541,7 +541,7 @@ def change_settings(v, setting):
body = f"@{v.username} has {word}d `{setting}` in the [admin dashboard](/admin)!"
- body_html = sanitize(body, noimages=True)
+ body_html = sanitize(body)
new_comment = Comment(author_id=NOTIFICATIONS_ID,
parent_submission=None,
diff --git a/files/routes/oauth.py b/files/routes/oauth.py
index 51c44cc3f..028ba8c6f 100644
--- a/files/routes/oauth.py
+++ b/files/routes/oauth.py
@@ -54,7 +54,7 @@ def request_api_keys(v):
body = f"@{v.username} has requested API keys for `{request.values.get('name')}`. You can approve or deny the request [here](/admin/apps)."
- body_html = sanitize(body, noimages=True)
+ body_html = sanitize(body)
new_comment = Comment(author_id=NOTIFICATIONS_ID,
diff --git a/files/routes/posts.py b/files/routes/posts.py
index 0db192a6b..1f268bc9a 100644
--- a/files/routes/posts.py
+++ b/files/routes/posts.py
@@ -727,7 +727,7 @@ def thumbnail_thread(pid):
if i["subreddit"] == 'PokemonGoRaids': continue
- body_html = sanitize(f'New site mention: https://old.reddit.com{i["permalink"]}?context=89', noimages=True)
+ body_html = sanitize(f'New site mention: https://old.reddit.com{i["permalink"]}?context=89')
existing_comment = db.query(Comment.id).filter_by(author_id=NOTIFICATIONS_ID, parent_submission=None, body_html=body_html).one_or_none()
if existing_comment: break
@@ -755,7 +755,7 @@ def thumbnail_thread(pid):
except: break
for i in data:
- body_html = sanitize(f'New mention of you: https://old.reddit.com{i["permalink"]}?context=89', noimages=True)
+ body_html = sanitize(f'New mention of you: https://old.reddit.com{i["permalink"]}?context=89')
existing_comment = db.query(Comment.id).filter_by(author_id=NOTIFICATIONS_ID, parent_submission=None,body_html=body_html).one_or_none()
if existing_comment: break
@@ -784,7 +784,7 @@ def thumbnail_thread(pid):
except: break
for i in data:
- body_html = sanitize(f'New site mention: https://old.reddit.com{i["permalink"]}?context=89', noimages=True)
+ body_html = sanitize(f'New site mention: https://old.reddit.com{i["permalink"]}?context=89')
existing_comment = db.query(Comment.id).filter_by(author_id=NOTIFICATIONS_ID, parent_submission=None, body_html=body_html).one_or_none()
@@ -960,7 +960,7 @@ def submit_post(v, sub=None):
Submission.deleted_utc == 0,
Submission.is_banned == False
).first()
- if repost: return redirect(repost.permalink)
+ if repost and SITE != 'localhost': return redirect(repost.permalink)
domain_obj = get_domain(domain)
if not domain_obj: domain_obj = get_domain(domain+parsed_url.path)
@@ -1012,7 +1012,7 @@ def submit_post(v, sub=None):
Submission.body == body
).one_or_none()
- if dup: return redirect(dup.permalink)
+ if dup and SITE != 'localhost': return redirect(dup.permalink)
now = int(time.time())
cutoff = now - 60 * 60 * 24
diff --git a/files/routes/static.py b/files/routes/static.py
index 568371b48..67e272ccb 100644
--- a/files/routes/static.py
+++ b/files/routes/static.py
@@ -301,7 +301,7 @@ def submit_contact(v):
if not body: abort(400)
body = f'This message has been sent automatically to all admins via [/contact](/contact)\n\nMessage:\n\n' + body
- body_html = sanitize(body, noimages=True)
+ body_html = sanitize(body)
if request.files.get("file") and request.headers.get("cf-ipcountry") != "T1":
file=request.files["file"]
diff --git a/files/routes/users.py b/files/routes/users.py
index caf8bb508..e158e3826 100644
--- a/files/routes/users.py
+++ b/files/routes/users.py
@@ -606,9 +606,7 @@ def message2(v, username):
if 'linkedin.com' in message: return {"error": "This domain 'linkedin.com' is banned."}, 403
- message = embed_removing_regex.sub(r'\1', message)
-
- body_html = sanitize(message, noimages=True)
+ body_html = sanitize(message)
existing = g.db.query(Comment.id).filter(Comment.author_id == v.id,
Comment.sentto == user.id,
@@ -666,8 +664,6 @@ def messagereply(v):
if 'linkedin.com' in message: return {"error": "this domain 'linkedin.com' is banned"}
- message = embed_removing_regex.sub(r'\1', message)
-
id = int(request.values.get("parent_id"))
parent = get_comment(id, v=v)
user_id = parent.author.id
@@ -675,7 +671,7 @@ def messagereply(v):
if parent.sentto == 2: user_id = None
elif v.id == user_id: user_id = parent.sentto
- body_html = sanitize(message, noimages=True)
+ body_html = sanitize(message)
if request.files.get("file") and request.headers.get("cf-ipcountry") != "T1":
file=request.files["file"]