fds

2022-04-17 22:20:40 +02:00 · 2022-04-17 22:20:40 +02:00 · 78ea56837f
parent 2de411fe23
commit 78ea56837f
8 changed files with 120 additions and 86 deletions
--- a/files/classes/submission.py
+++ b/files/classes/submission.py
@ -473,13 +473,14 @@ class Submission(Base):
 	@property
 	@lazy
 	def is_video(self):
-		return self.url and any((self.url.lower().endswith(x) for x in ('.mp4','.webm','.mov')))
+		# Truthy only when the url has a video extension (case-insensitive) AND
+		# the whole url matches video_regex; the result is not a strict bool.
+		url = self.url
+		return url and url.lower().endswith(('.mp4','.webm','.mov')) and video_regex.fullmatch(url)

 	@property
 	@lazy
 	def is_image(self):
-		if self.url: return self.url.lower().endswith('.webp') or self.url.lower().endswith('.jpg') or self.url.lower().endswith('.png') or self.url.lower().endswith('.gif') or self.url.lower().endswith('.jpeg') or self.url.lower().endswith('?maxwidth=9999') or self.url.lower().endswith('&fidelity=high')
-		else: return False
+		# True only when the url ends with a known image suffix (case-insensitive)
+		# and the whole url matches embed_check_regex.
+		if not self.url: return False
+		image_suffixes = ('.webp', '.jpg', '.png', '.gif', '.jpeg', '?maxwidth=9999', '&fidelity=high')
+		if self.url.lower().endswith(image_suffixes) and embed_check_regex.fullmatch(self.url):
+			return True
+		return False

 	@lazy
 	def active_flags(self, v): return len(self.flags(v))
--- a/files/helpers/const.py
+++ b/files/helpers/const.py
@ -692,10 +692,6 @@ poll_regex = re.compile("\s*\$\$([^\$\n]+)\$\$\s*", flags=re.A)
 bet_regex = re.compile("\s*\$\$\$([^\$\n]+)\$\$\$\s*", flags=re.A)
 choice_regex = re.compile("\s*&&([^\$\n]+)&&\s*", flags=re.A)

-embed_removing_regex = re.compile('!\[\]\((.*?)\)', flags=re.A)
-
-image_check_regex = re.compile('!\[\]\(((?![^?\n]*\.(png|jpg|jpeg|gif|webp)).*?)\)', flags=re.I|re.A)
-
 title_regex = re.compile("[^\w ]", flags=re.A)

 based_regex = re.compile("based and (.{1,20}?)(-| )pilled", flags=re.I|re.A)
@ -705,13 +701,11 @@ controversial_regex = re.compile('["> ](https:\/\/old\.reddit\.com/r/[a-zA-Z0-9_
 fishylinks_regex = re.compile("https?://\S+", flags=re.A)

 spoiler_regex = re.compile('''\|\|(.+)\|\|''', flags=re.A)
-video_regex = re.compile('<p><a href="(https:\/\/[\w\-.#&/=\?@%;+]{5,250}\.(mp4|webm|mov))" rel="nofollow noopener noreferrer" target="_blank">(https:\/\/[\w\-.#&/=\?@%;+]{5,250}\.(mp4|webm|mov))<\/a><\/p>', flags=re.I|re.A)
-unlinked_regex = re.compile('''(^|\s|<p>)(https:\/\/[\w\-.#&/=\?@%;+]{5,250})''', flags=re.A)
-imgur_regex = re.compile('(https://i\.imgur\.com/([a-z0-9]+))\.(jpg|png|jpeg|webp)(?!</(code|pre)>)', flags=re.I|re.A)
 reddit_regex = re.compile('(^|\s|<p>)\/?((r|u)\/(\w|-){3,25})', flags=re.A)
 sub_regex = re.compile('(^|\s|<p>)\/?(h\/(\w|-){3,25})', flags=re.A)

-youtube_regex = regex.compile('(?<!<(code|pre)>)https:\/\/youtube\.com\/watch\?v\=([a-z0-9-_]{5,20})[\w\-.#&/=\?@%+]*', flags=regex.I|regex.A)
+imgur_regex = re.compile('(https://i\.imgur\.com/([a-z0-9]+))\.(jpg|png|jpeg|webp)(?!<\/(code|pre|a)>)', flags=re.I|re.A)
+youtube_regex = regex.compile('(?<!<(code|pre|a)>)https:\/\/youtube\.com\/watch\?v\=([a-z0-9-_]{5,20})[\w\-.#&/=\?@%+]*', flags=regex.I|regex.A)
 yt_id_regex = re.compile('[a-z0-9-_]{5,20}', flags=re.I|re.A)

 strikethrough_regex = re.compile('''~{1,2}([^~]+)~{1,2}''', flags=re.A)
@ -760,4 +754,47 @@ ADMIGGERS = (37696,37697,37749,37833,37838)

 proxies = {"http":"http://127.0.0.1:18080","https":"http://127.0.0.1:18080"}

-blackjack = environ.get("BLACKJACK", "").strip()
+blackjack = environ.get("BLACKJACK", "").strip()
+
+# Hosts from which embeds are accepted. Every entry is a regex fragment,
+# so literal dots MUST be escaped: the list is joined with "|" and
+# interpolated into the embed regexes defined after it, where an unescaped
+# "." would match any character and let look-alike hosts through.
+approved_embed_hosts = [
+	r'i\.imgur\.com',
+	r'i\.ibb\.co',
+	r'pomf2\.lain\.la',
+	r'pngfind\.com',
+	r'i\.kym-cdn\.com',
+	r'i2\.kym-cdn\.com',
+	r'i\.redd\.it',
+	r'cdn\.substack\.com',
+	r'cdn\.discordapp\.com',
+	r'2\.bp\.blogspot\.com',
+	r'files\.catbox\.moe',
+	r'i\.pinimg\.com',
+	r'kindpng\.com',
+	r'cdn\.shopify\.com',
+	r'media\.discordapp\.net',
+	r'pbs\.twimg\.com',
+	r'upload\.wikimedia\.org',
+	r'i0\.wp\.com',
+	r'seekpng\.com',
+	r'i\.dailymail\.co\.uk',
+	r'de\.catbox\.moe',
+	r'www\.cdc\.gov',
+	r'm\.media-amazon\.com',
+	r'www\.washingtonpost\.com',
+	r'i\.imgflip\.com',
+	r'farm2\.static\.flickr\.com',
+	r'img-9gag-fun\.9cache\.com',
+	r'i\.ytimg\.com',
+	r'a57\.foxnews\.com',
+	r'external-content\.duckduckgo\.com',
+	r'blogs-images\.forbes\.com',
+	r'images\.gr-assets\.com'
+	]
+
+# Alternation of all approved hosts, ready to drop into a regex group.
+hosts = "|".join(approved_embed_hosts)
+
+# Matches markdown image syntax `![](target)` whose target does NOT start with
+# `https://<approved host>/`; group 1 is the target.
+image_check_regex = re.compile(f'!\[\]\(((?!https:\/\/({hosts})\/).*?)\)', flags=re.A)
+
+# Matches an https URL on an approved host made only of the listed URL
+# characters, and not immediately preceded by `<code>`, `<pre>` or `<a>`.
+embed_check_regex = regex.compile(f'(?<!<(code|pre|a)>)https:\/\/({hosts})\/[\w:~,()\-.#&\/=?@%;+]*', flags=regex.A)
+
+# Same shape as embed_check_regex but the URL must end in .mp4/.webm/.mov;
+# group 1 is the whole URL. Note: no IGNORECASE flag, so the extension match
+# is case-sensitive.
+video_regex = regex.compile(f'((?<!<(code|pre|a)>)https:\/\/({hosts})\/[\w:~,()\-.#&\/=?@%;+]*?\.(mp4|webm|mov))', flags=regex.A)
--- a/files/helpers/sanitize.py
+++ b/files/helpers/sanitize.py
@ -1,6 +1,6 @@
 import bleach
 from bs4 import BeautifulSoup
-from bleach.linkifier import LinkifyFilter
+from bleach.linkifier import LinkifyFilter, build_url_re
 from functools import partial
 from .get import *
 from .patter import pat
@ -13,6 +13,59 @@ import signal
 import time
 import requests

+TLDS = ['ac','ad','ae','aero','af','ag','ai','al','am','an','ao','aq','ar','arpa','as','asia','at','au','aw','ax','az','ba','bb','bd','be','bf','bg','bh','bi','biz','bj','bm','bn','bo','br','bs','bt','bv','bw','by','bz','ca','cafe','cat','cc','cd','cf','cg','ch','ci','ck','cl','club','cm','cn','co','com','coop','cr','cu','cv','cx','cy','cz','de','dj','dk','dm','do','dz','ec','edu','ee','eg','er','es','et','eu','fi','fj','fk','fm','fo','fr','ga','gb','gd','ge','gf','gg','gh','gi','gl','gm','gn','gov','gp','gq','gr','gs','gt','gu','gw','gy','hk','hm','hn','hr','ht','hu','id','ie','il','im','in','info','int','io','iq','ir','is','it','je','jm','jo','jobs','jp','ke','kg','kh','ki','km','kn','kp','kr','kw','ky','kz','la','lb','lc','li','lk','lr','ls','lt','lu','lv','ly','ma','mc','md','me','mg','mh','mil','mk','ml','mm','mn','mo','mobi','mp','mq','mr','ms','mt','mu','museum','mv','mw','mx','my','mz','na','name','nc','ne','net','nf','ng','ni','nl','no','np','nr','nu','nz','om','org','pa','pe','pf','pg','ph','pk','pl','pm','pn','post','pr','pro','ps','pt','pw','py','qa','re','ro','rs','ru','rw','sa','sb','sc','sd','se','sg','sh','si','sj','sk','sl','sm','sn','so','social','sr','ss','st','su','sv','sx','sy','sz','tc','td','tel','tf','tg','th','tj','tk','tl','tm','tn','to','tp','tr','travel','tt','tv','tw','tz','ua','ug','uk','us','uy','uz','va','vc','ve','vg','vi','vn','vu','wf','win','ws','xn','xxx','ye','yt','yu','za','zm','zw']
+
+allowed_tags = ['b','blockquote','br','code','del','em','h1','h2','h3','h4','h5','h6','hr','i','li','ol','p','pre','strong','sub','sup','table','tbody','th','thead','td','tr','ul','marquee','a','span','ruby','rp','rt','spoiler','img','lite-youtube','video','source']
+
+def allowed_attributes(tag, name, value):
+	"""Decide whether attribute `name` (with `value`) may stay on `tag`.
+
+	Passed as the `attributes` callable to bleach.Cleaner in sanitize().
+	Returns True to keep the attribute and False to strip it.
+	"""
+
+	# `style` is accepted on every tag.
+	if name == 'style': return True
+
+	if tag == 'marquee':
+		if name in ['direction', 'behavior', 'scrollamount']: return True
+		if name in {'height', 'width'}:
+			# Accept only sizes from 1 to 250, with or without a `px` suffix.
+			try: value = int(value.replace('px', ''))
+			except ValueError: return False
+			if 0 < value <= 250: return True
+		return False
+
+	if tag == 'a':
+		if name == 'href': return True
+		if name == 'rel' and value == 'nofollow noopener noreferrer': return True
+		if name == 'target' and value == '_blank': return True
+		return False
+
+	if tag == 'img':
+		if name in ['src','data-src']:
+			# Only site-relative paths or URLs on an approved embed host.
+			if value.startswith('/') or embed_check_regex.fullmatch(value): return True
+			else: return False
+
+		if name == 'loading' and value == 'lazy': return True
+		# NOTE(review): 'referrpolicy' looks like a misspelling of the standard
+		# 'referrerpolicy' attribute; left as-is because the code emitting it is
+		# not visible here -- confirm before changing.
+		if name == 'referrpolicy' and value == 'no-referrer': return True
+		if name == 'data-bs-toggle' and value == 'tooltip': return True
+		if name in ['alt','title','g','b']: return True
+		return False
+
+	if tag == 'lite-youtube':
+		if name == 'params' and value.startswith('autoplay=1&modestbranding=1'): return True
+		if name == 'videoid': return True
+		return False
+
+	if tag == 'video':
+		if name == 'controls' and value == '': return True
+		if name == 'preload' and value == 'none': return True
+		return False
+
+	if tag == 'source':
+		# Only `src` is allowed. Accepting every attribute here would let
+		# event-handler attributes (e.g. `onerror`) through the sanitizer.
+		if name == 'src': return True
+		return False
+
+	if tag == 'p':
+		if name == 'class' and value == 'mb-0': return True
+		return False
+
+	# Anything not explicitly allowed above is stripped.
+	return False
+
+
+url_re = build_url_re(tlds=TLDS, protocols=['http', 'https'])

 def callback(attrs, new=False):
 	href = attrs[(None, "href")]
@ -29,7 +82,7 @@ def handler(signum, frame):
 	raise Exception("Timeout")


-def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False):
+def sanitize(sanitized, alert=False, comment=False, edit=False):

 	signal.signal(signal.SIGALRM, handler)
 	signal.alarm(1)
@ -176,11 +229,7 @@ def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False):

 		sanitized = sanitized.replace(url, htmlsource)

-
-	sanitized = unlinked_regex.sub(r'\1<a href="\2" rel="nofollow noopener noreferrer" target="_blank">\2</a>', sanitized)
-
-	if not noimages:
-		sanitized = video_regex.sub(r'<p><video controls preload="none"><source src="\1"></video>', sanitized)
+	sanitized = video_regex.sub(r'<video controls preload="none"><source src="\1"></video>', sanitized)

 	if comment:
 		for marsey in g.db.query(Marsey).filter(Marsey.name.in_(marseys_used)).all():
@ -199,61 +248,12 @@ def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False):
 	sanitized = sanitized.replace('<html><body>','').replace('</body></html>','')


-	allowed_tags = ['b','blockquote','br','code','del','em','h1','h2','h3','h4','h5','h6','hr','i','li','ol','p','pre','strong','sub','sup','table','tbody','th','thead','td','tr','ul','marquee','a','span','ruby','rp','rt','spoiler','img','lite-youtube']
-	if not noimages: allowed_tags += ['video','source']
-
-
-	def allowed_attributes(tag, name, value):
-
-		if name == 'style': return True
-
-		if tag == 'marquee':
-			if name in ['direction', 'behavior', 'scrollamount']: return True
-			if name in {'height', 'width'}:
-				try: value = int(value.replace('px', ''))
-				except: return False
-				if 0 < value <= 250: return True
-			return False
-		
-		if tag == 'a':
-			if name == 'href': return True
-			if name == 'rel' and value == 'nofollow noopener noreferrer': return True
-			if name == 'target' and value == '_blank': return True
-			return False
-
-		if tag == 'img':
-			if name in ['src','data-src'] and not value.startswith('/') and noimages: return False
-
-			if name == 'loading' and value == 'lazy': return True
-			if name == 'referrpolicy' and value == 'no-referrer': return True
-			if name == 'data-bs-toggle' and value == 'tooltip': return True
-			if name in ['src','data-src','alt','title','g','b']: return True
-			return False
-
-		if tag == 'lite-youtube':
-			if name == 'params' and value.startswith('autoplay=1&modestbranding=1'): return True
-			if name == 'videoid': return True
-			return False
-
-		if tag == 'video':
-			if name == 'controls' and value == '': return True
-			if name == 'preload' and value == 'none': return True
-			return False
-
-		if tag == 'source':
-			if name == 'src': return True
-			return False
-
-		if tag == 'p':
-			if name == 'class' and value == 'mb-0': return True
-			return False
-

 	sanitized = bleach.Cleaner(tags=allowed_tags,
 								attributes=allowed_attributes,
 								protocols=['http', 'https'],
 								styles=['color', 'background-color', 'font-weight', 'text-align'],
-								filters=[partial(LinkifyFilter, skip_tags=["pre"], parse_email=False, callbacks=[callback])]
+								filters=[partial(LinkifyFilter, skip_tags=["pre"], parse_email=False, callbacks=[callback], url_re=url_re)]
 								).clean(sanitized)


@ -266,13 +266,13 @@ def sanitize(sanitized, noimages=False, alert=False, comment=False, edit=False):



-def allowed_attributes(tag, name, value):
+def allowed_attributes_emojis(tag, name, value):
+	"""Attribute filter handed to bleach.clean in filter_emojis_only: keeps a small set of attributes on `img` and rejects everything else."""

 	if tag == 'img':
 		if name == 'loading' and value == 'lazy': return True
 		if name == 'data-bs-toggle' and value == 'tooltip': return True
 		if name in ['src','alt','title','g']: return True
-		return False
+	return False


 def filter_emojis_only(title, edit=False, graceful=False):
@ -308,7 +308,7 @@ def filter_emojis_only(title, edit=False, graceful=False):

 	title = strikethrough_regex.sub(r'<del>\1</del>', title)

-	sanitized = bleach.clean(title, tags=['img','del'], attributes=allowed_attributes, protocols=['http','https'])
+	sanitized = bleach.clean(title, tags=['img','del'], attributes=allowed_attributes_emojis, protocols=['http','https'])

 	signal.alarm(0)

--- a/files/routes/admin.py
+++ b/files/routes/admin.py
@ -541,7 +541,7 @@ def change_settings(v, setting):

 	body = f"@{v.username} has {word}d `{setting}` in the [admin dashboard](/admin)!"

-	body_html = sanitize(body, noimages=True)
+	body_html = sanitize(body)

 	new_comment = Comment(author_id=NOTIFICATIONS_ID,
 						  parent_submission=None,
--- a/files/routes/oauth.py
+++ b/files/routes/oauth.py
@ -54,7 +54,7 @@ def request_api_keys(v):

 	body = f"@{v.username} has requested API keys for `{request.values.get('name')}`. You can approve or deny the request [here](/admin/apps)."

-	body_html = sanitize(body, noimages=True)
+	body_html = sanitize(body)


 	new_comment = Comment(author_id=NOTIFICATIONS_ID,
--- a/files/routes/posts.py
+++ b/files/routes/posts.py
@ -727,7 +727,7 @@ def thumbnail_thread(pid):

 				if i["subreddit"] == 'PokemonGoRaids': continue

-				body_html = sanitize(f'New site mention: https://old.reddit.com{i["permalink"]}?context=89', noimages=True)
+				body_html = sanitize(f'New site mention: https://old.reddit.com{i["permalink"]}?context=89')

 				existing_comment = db.query(Comment.id).filter_by(author_id=NOTIFICATIONS_ID, parent_submission=None, body_html=body_html).one_or_none()
 				if existing_comment: break
@ -755,7 +755,7 @@ def thumbnail_thread(pid):
 			except: break

 			for i in data:
-				body_html = sanitize(f'New mention of you: https://old.reddit.com{i["permalink"]}?context=89', noimages=True)
+				body_html = sanitize(f'New mention of you: https://old.reddit.com{i["permalink"]}?context=89')

 				existing_comment = db.query(Comment.id).filter_by(author_id=NOTIFICATIONS_ID, parent_submission=None,body_html=body_html).one_or_none()
 				if existing_comment: break
@ -784,7 +784,7 @@ def thumbnail_thread(pid):
 			except: break

 			for i in data:
-				body_html = sanitize(f'New site mention: https://old.reddit.com{i["permalink"]}?context=89', noimages=True)
+				body_html = sanitize(f'New site mention: https://old.reddit.com{i["permalink"]}?context=89')

 				existing_comment = db.query(Comment.id).filter_by(author_id=NOTIFICATIONS_ID, parent_submission=None, body_html=body_html).one_or_none()

@ -960,7 +960,7 @@ def submit_post(v, sub=None):
 			Submission.deleted_utc == 0,
 			Submission.is_banned == False
 		).first()
-		if repost: return redirect(repost.permalink)
+		if repost and SITE != 'localhost': return redirect(repost.permalink)

 		domain_obj = get_domain(domain)
 		if not domain_obj: domain_obj = get_domain(domain+parsed_url.path)
@ -1012,7 +1012,7 @@ def submit_post(v, sub=None):
 		Submission.body == body
 	).one_or_none()

-	if dup: return redirect(dup.permalink)
+	if dup and SITE != 'localhost': return redirect(dup.permalink)

 	now = int(time.time())
 	cutoff = now - 60 * 60 * 24
--- a/files/routes/static.py
+++ b/files/routes/static.py
@ -301,7 +301,7 @@ def submit_contact(v):
 	if not body: abort(400)

 	body = f'This message has been sent automatically to all admins via [/contact](/contact)\n\nMessage:\n\n' + body
-	body_html = sanitize(body, noimages=True)
+	body_html = sanitize(body)

 	if request.files.get("file") and request.headers.get("cf-ipcountry") != "T1":
 		file=request.files["file"]
--- a/files/routes/users.py
+++ b/files/routes/users.py
@ -606,9 +606,7 @@ def message2(v, username):

 	if 'linkedin.com' in message: return {"error": "This domain 'linkedin.com' is banned."}, 403

-	message = embed_removing_regex.sub(r'\1', message)
-
-	body_html = sanitize(message, noimages=True)
+	body_html = sanitize(message)

 	existing = g.db.query(Comment.id).filter(Comment.author_id == v.id,
 															Comment.sentto == user.id,
@ -666,8 +664,6 @@ def messagereply(v):

 	if 'linkedin.com' in message: return {"error": "this domain 'linkedin.com' is banned"}

-	message = embed_removing_regex.sub(r'\1', message)
-
 	id = int(request.values.get("parent_id"))
 	parent = get_comment(id, v=v)
 	user_id = parent.author.id
@ -675,7 +671,7 @@ def messagereply(v):
 	if parent.sentto == 2: user_id = None
 	elif v.id == user_id: user_id = parent.sentto

-	body_html = sanitize(message, noimages=True)
+	body_html = sanitize(message)

 	if request.files.get("file") and request.headers.get("cf-ipcountry") != "T1":
 		file=request.files["file"]