import functools
import bleach
from bs4 import BeautifulSoup
from bleach.css_sanitizer import CSSSanitizer
from bleach.linkifier import LinkifyFilter
from functools import partial
from .get import *
from os import path, environ
import re
from mistletoe import markdown
from json import loads, dump
from random import random, choice
import signal
import time
import requests

TLDS = ('aaa', 'aarp', 'abarth', 'abb', 'abbott', 'abbvie', 'abc', 'able', 'abogado', 'abudhabi', 'ac', 'academy', 'accenture', 'accountant', 'accountants', 'aco', 'actor', 'ad', 'adac', 'ads', 'adult', 'ae', 'aeg', 'aero', 'aetna', 'af', 'afl', 'africa', 'ag', 'agakhan', 'agency', 'ai', 'aig', 'airbus', 'airforce', 'airtel', 'akdn', 'al', 'alfaromeo',
	'alibaba', 'alipay', 'allfinanz', 'allstate', 'ally', 'alsace', 'alstom', 'am', 'amazon', 'americanexpress', 'americanfamily', 'amex', 'amfam', 'amica', 'amsterdam', 'analytics', 'android', 'anquan', 'anz', 'ao', 'aol', 'apartments', 'app', 'apple', 'aq', 'aquarelle', 'ar', 'arab', 'aramco', 'archi', 'army', 'arpa', 'art', 'arte', 'as', 'asda', 'asia', 'associates',
	'at', 'athleta', 'attorney', 'au', 'auction', 'audi', 'audible', 'audio', 'auspost', 'author', 'auto', 'autos', 'avianca', 'aw', 'aws', 'ax', 'axa', 'az', 'azure', 'ba', 'baby', 'baidu', 'banamex', 'bananarepublic', 'band', 'bank', 'bar', 'barcelona', 'barclaycard', 'barclays', 'barefoot', 'bargains', 'baseball', 'basketball', 'bauhaus', 'bayern', 'bb', 'bbc',
	'bbt', 'bbva', 'bcg', 'bcn', 'bd', 'be', 'beats', 'beauty', 'beer', 'bentley', 'berlin', 'best', 'bestbuy', 'bet', 'bf', 'bg', 'bh', 'bharti', 'bi', 'bible', 'bid', 'bike', 'bing', 'bingo', 'bio', 'biz', 'bj', 'black', 'blackfriday', 'blockbuster', 'blog', 'bloomberg', 'blue', 'bm', 'bms', 'bmw', 'bn', 'bnpparibas', 'bo', 'boats', 'boehringer', 'bofa', 'bom',
	'bond', 'boo', 'book', 'booking', 'bosch', 'bostik', 'boston', 'bot', 'boutique', 'box', 'br', 'bradesco', 'bridgestone', 'broadway', 'broker', 'brother', 'brussels', 'bs', 'bt', 'bugatti', 'build', 'builders', 'business', 'buy', 'buzz', 'bv', 'bw', 'by', 'bz', 'bzh', 'ca', 'cab', 'cafe', 'cal', 'call', 'calvinklein', 'cam', 'camera', 'camp', 'cancerresearch',
	'canon', 'capetown', 'capital', 'capitalone', 'car', 'caravan', 'cards', 'care', 'career', 'careers', 'cars', 'casa', 'case', 'cash', 'casino', 'cat', 'catering', 'catholic', 'cba', 'cbn', 'cbre', 'cbs', 'cc', 'cd', 'center', 'ceo', 'cern', 'cf', 'cfa', 'cfd', 'cg', 'ch', 'chanel', 'channel', 'charity', 'chase', 'chat', 'cheap', 'chintai', 'christmas', 'chrome',
	'church', 'ci', 'cipriani', 'circle', 'cisco', 'citadel', 'citi', 'citic', 'city', 'cityeats', 'ck', 'cl', 'claims', 'cleaning', 'click', 'clinic', 'clinique', 'clothing', 'cloud', 'club', 'clubmed', 'cm', 'cn', 'co', 'coach', 'codes', 'coffee', 'college', 'cologne', 'com', 'comcast', 'commbank', 'community', 'company', 'compare', 'computer', 'comsec', 'condos',
	'construction', 'consulting', 'contact', 'contractors', 'cooking', 'cookingchannel', 'cool', 'coop', 'corsica', 'country', 'coupon', 'coupons', 'courses', 'cpa', 'cr', 'credit', 'creditcard', 'creditunion', 'cricket', 'crown', 'crs', 'cruise', 'cruises', 'cu', 'cuisinella', 'cv', 'cw', 'cx', 'cy', 'cymru', 'cyou', 'cz', 'dabur', 'dad', 'dance', 'data', 'date',
	'dating', 'datsun', 'day', 'dclk', 'dds', 'de', 'deal', 'dealer', 'deals', 'degree', 'delivery', 'dell', 'deloitte', 'delta', 'democrat', 'dental', 'dentist', 'desi', 'design', 'dev', 'dhl', 'diamonds', 'diet', 'digital', 'direct', 'directory', 'discount', 'discover', 'dish', 'diy', 'dj', 'dk', 'dm', 'dnp', 'do', 'docs', 'doctor', 'dog', 'domains', 'dot',
	'download', 'drive', 'dtv', 'dubai', 'dunlop', 'dupont', 'durban', 'dvag', 'dvr', 'dz', 'earth', 'eat', 'ec', 'eco', 'edeka', 'edu', 'education', 'ee', 'eg', 'email', 'emerck', 'energy', 'engineer', 'engineering', 'enterprises', 'epson', 'equipment', 'er', 'ericsson', 'erni', 'es', 'esq', 'estate', 'et', 'etisalat', 'eu', 'eurovision', 'eus', 'events',
	'exchange', 'expert', 'exposed', 'express', 'extraspace', 'fage', 'fail', 'fairwinds', 'faith', 'family', 'fan', 'fans', 'farm', 'farmers', 'fashion', 'fast', 'fedex', 'feedback', 'ferrari', 'ferrero', 'fi', 'fiat', 'fidelity', 'fido', 'film', 'final', 'finance', 'financial', 'fire', 'firestone', 'firmdale', 'fish', 'fishing', 'fit', 'fitness', 'fj', 'fk',
	'flickr', 'flights', 'flir', 'florist', 'flowers', 'fly', 'fm', 'fo', 'foo', 'food', 'foodnetwork', 'football', 'ford', 'forex', 'forsale', 'forum', 'foundation', 'fox', 'fr', 'free', 'fresenius', 'frl', 'frogans', 'frontdoor', 'frontier', 'ftr', 'fujitsu', 'fun', 'fund', 'furniture', 'futbol', 'fyi', 'ga', 'gal', 'gallery', 'gallo', 'gallup', 'game', 'games',
	'gap', 'garden', 'gay', 'gb', 'gbiz', 'gd', 'gdn', 'ge', 'gea', 'gent', 'genting', 'george', 'gf', 'gg', 'ggee', 'gh', 'gi', 'gift', 'gifts', 'gives', 'giving', 'gl', 'glass', 'gle', 'global', 'globo', 'gm', 'gmail', 'gmbh', 'gmo', 'gmx', 'gn', 'godaddy',
	# ... (TLD list truncated in this excerpt)
)

allowed_tags = ('b', 'blockquote', 'br', 'code', 'del', 'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
	'li', 'ol', 'p', 'pre', 'strong', 'sub', 'sup', 'table', 'tbody', 'th', 'thead', 'td', 'tr', 'ul',
	'marquee', 'a', 'span', 'ruby', 'rp', 'rt', 'spoiler', 'img', 'lite-youtube', 'video', 'source', 'audio', 'g')

allowed_styles = ['color', 'background-color', 'font-weight', 'text-align', 'filter']

def allowed_attributes(tag, name, value):

	if name == 'style': return True

	if tag == 'marquee':
		if name in ['direction', 'behavior', 'scrollamount']: return True
		if name in {'height', 'width'}:
			try: value = int(value.replace('px', ''))
			except: return False
			if 0 < value <= 250: return True
		return False

	if tag == 'a':
		if name == 'href' and '\\' not in value and 'xn--' not in value:
			return True
		if name == 'rel' and value == 'nofollow noopener noreferrer': return True
		if name == 'target' and value == '_blank': return True
		return False

	if tag == 'img':
		if name in ['src', 'data-src']: return is_safe_url(value)
		if name == 'loading' and value == 'lazy': return True
		if name == 'data-bs-toggle' and value == 'tooltip': return True
		if name in ['g', 'b', 'glow'] and not value: return True
		if name in ['alt', 'title']: return True
		if name == 'referrerpolicy' and value == 'no-referrer': return True
		return False

	if tag == 'lite-youtube':
		if name == 'params' and value.startswith('autoplay=1&modestbranding=1'): return True
		if name == 'videoid': return True
		return False

	if tag == 'video':
		if name == 'controls' and value == '': return True
		if name == 'preload' and value == 'none': return True
		return False

	if tag == 'source':
		if name == 'src': return is_safe_url(value)
		return False

	if tag == 'audio':
		if name == 'src': return is_safe_url(value)
		if name == 'controls' and value == '': return True
		if name == 'preload' and value == 'none': return True
		return False

	if tag == 'p':
		if name == 'class' and value == 'mb-0': return True
		return False

	if tag == 'span':
		if name == 'data-bs-toggle' and value == 'tooltip': return True
		if name == 'title': return True
		if name == 'alt': return True
		return False

	return False
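
# Illustrative sketch, not part of the original module: bleach calls an
# `attributes` callable once per (tag, attribute, value) and drops anything it
# returns falsy for. The markup below is a made-up example; with the rules
# above, an oversized marquee keeps its direction but loses height and onclick.
def _demo_allowed_attributes():
	cleaner = bleach.Cleaner(tags={'marquee'}, attributes=allowed_attributes)
	dirty = '<marquee direction="left" height="9999" onclick="evil()">hi</marquee>'
	return cleaner.clean(dirty)  # -> <marquee direction="left">hi</marquee>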

def build_url_re(tlds, protocols):
	"""Builds the url regex used by linkifier

	If you want a different set of tlds or allowed protocols, pass those in
	and stomp on the existing ``url_re``::

		from bleach import linkifier

		my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

		linker = LinkifyFilter(url_re=my_url_re)

	"""
	return re.compile(
		r"""\(*  # Match any opening parentheses.
		\b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
		([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
		(?:[/?][^#\s\{{\}}\|\\\^\[\]`<>"]*)?
			# /path/zz (excluding "unsafe" chars from RFC 1738,
			# except for ~, which happens in practice)
		(?:\#[^#\s\|\\\^\[\]`<>"]*)?
			# #hash (excluding "unsafe" chars from RFC 1738,
			# except for ~, which happens in practice)
		""".format(
			"|".join(sorted(protocols)), "|".join(sorted(tlds))
		),
		re.IGNORECASE | re.VERBOSE | re.UNICODE,
	)

url_re = build_url_re(tlds=TLDS, protocols=['http', 'https'])
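
# Illustrative sketch, not part of the original module: the compiled url_re is
# what LinkifyFilter uses to spot bare URLs in text. 'com' is in the (truncated)
# TLDS tuple above, so a quick check looks like this:
def _demo_url_re():
	m = url_re.search('see https://example.com/page for details')
	return m.group(0) if m else None  # -> 'https://example.com/page'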

def callback(attrs, new=False):
	if (None, "href") not in attrs:
		return  # Incorrect <a> tag

	href = attrs[(None, "href")]

	# \ in href right after / makes most browsers ditch the site hostname and allows
	# for a host injection bypassing the check, see <a href="/\google.com">cool</a>
	if "\\" in href or not ascii_only_regex.fullmatch(href):
		attrs["_text"] = href  # Laugh at this user
		del attrs[(None, "href")]  # Make unclickable and reset harmful payload
		return attrs

	if not href.startswith('/') and not href.startswith(f'{SITE_FULL}/'):
		attrs[(None, "target")] = "_blank"
		attrs[(None, "rel")] = "nofollow noopener noreferrer"

	return attrs
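
# Illustrative sketch, not part of the original module: LinkifyFilter passes
# each link's attribute dict through the callbacks above, keyed by
# (namespace, name) tuples. The dicts below are hypothetical inputs;
# ascii_only_regex and SITE_FULL are module globals pulled in via `from .get import *`.
def _demo_callback():
	external = callback({(None, 'href'): 'https://example.com', '_text': 'https://example.com'})
	# -> target="_blank" and rel="nofollow noopener noreferrer" added (assuming
	#    the URL passes ascii_only_regex and is not a SITE_FULL link)
	fishy = callback({(None, 'href'): '/\\google.com', '_text': 'cool'})
	# -> href deleted, link text replaced with the raw payload
	return external, fishy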

def render_emoji(html, regexp, golden, marseys_used, b=False):
	emojis = list(regexp.finditer(html))
	captured = set()

	for i in emojis:
		if i.group(0) in captured: continue
		captured.add(i.group(0))

		emoji = i.group(1).lower()
		attrs = ''
		if b: attrs += ' b'

		if golden and len(emojis) <= 20 and ('marsey' in emoji or emoji in marseys_const2):
			if random() < 0.0025: attrs += ' g'
			elif random() < 0.00125: attrs += ' glow'

		old = emoji
		emoji = emoji.replace('!', '').replace('#', '')
		if emoji == 'marseyrandom': emoji = choice(marseys_const2)

		emoji_partial_pat = '<img loading="lazy" alt=":{0}:" src="{1}"{2}>'
		emoji_partial = '<img loading="lazy" data-bs-toggle="tooltip" alt=":{0}:" title=":{0}:" src="{1}"{2}>'
		emoji_html = None

		if emoji.endswith('pat') and emoji != 'marseyunpettablepat':
			if path.isfile(f"files/assets/images/emojis/{emoji.replace('pat', '')}.webp"):
				emoji_html = f'<span data-bs-toggle="tooltip" alt=":{old}:" title=":{old}:"><img src="/i/hand.webp">{emoji_partial_pat.format(old, f"/e/{emoji[:-3]}.webp", attrs)}</span>'
			elif emoji.startswith('@'):
				if u := get_user(emoji[1:-3], graceful=True):
					emoji_html = f'<span data-bs-toggle="tooltip" alt=":{old}:" title=":{old}:"><img src="/i/hand.webp">{emoji_partial_pat.format(old, f"/pp/{u.id}", attrs)}</span>'
		elif path.isfile(f'files/assets/images/emojis/{emoji}.webp'):
			emoji_html = emoji_partial.format(old, f'/e/{emoji}.webp', attrs)

		if emoji_html:
			marseys_used.add(emoji)
			html = re.sub(f'(?<!"){i.group(0)}', emoji_html, html)

	return html
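
# Illustrative sketch, not part of the original module: render_emoji rewrites
# ':marseylove:'-style tokens into <img> tags when a matching webp exists under
# files/assets/images/emojis/. The regex here is a simplified stand-in for the
# real emoji_regex2; on a machine without the asset files the input is returned
# unchanged, since no emoji_html gets built.
def _demo_render_emoji():
	toy_regex = re.compile(r':([!#@\w\-]+):')
	used = set()
	return render_emoji(':marseylove:', toy_regex, False, used)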

def with_sigalrm_timeout(timeout: int):
	'Use SIGALRM to raise an exception if the function executes for longer than timeout seconds'

	# While trying to test this using time.sleep I discovered that gunicorn does in fact do some
	# async, so if we time out on that (or on a db op) then the process is crashed without
	# returning a proper 500 error. Oh well.
	def sig_handler(signum, frame):
		print("Timeout!", flush=True)
		raise Exception("Timeout")

	def inner(func):
		@functools.wraps(func)
		def wrapped(*args, **kwargs):
			signal.signal(signal.SIGALRM, sig_handler)
			signal.alarm(timeout)
			try:
				return func(*args, **kwargs)
			finally:
				signal.alarm(0)
		return wrapped

	return inner
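
# Illustrative sketch, not part of the original module: what the decorator does
# to a deliberately slow function. SIGALRM only works on the main thread of a
# Unix process, which also holds for the real usage below.
@with_sigalrm_timeout(1)
def _demo_slow():
	time.sleep(5)  # never finishes: sig_handler raises after ~1 second

# Calling _demo_slow() raises Exception("Timeout"), and the finally block
# clears the alarm so later code is not interrupted.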

@with_sigalrm_timeout(2)
def sanitize(sanitized, golden=True, limit_pings=0, showmore=True, count_marseys=False, torture=False):
	sanitized = sanitized.strip()

	if torture:
		sanitized = torture_ap(sanitized, g.v.username)
		sanitized += '\n:#trumpjaktalking:'

	sanitized = normalize_url(sanitized)

	if '```' not in sanitized and '<pre>' not in sanitized:
		sanitized = linefeeds_regex.sub(r'\1\n\n\2', sanitized)

	sanitized = greentext_regex.sub(r'\1<g>\>\2</g>', sanitized)

	sanitized = image_regex.sub(r'\1![](\2)\5', sanitized)

	sanitized = image_check_regex.sub(r'\1', sanitized)

	sanitized = link_fix_regex.sub(r'\1https://\2', sanitized)

	if FEATURES['MARKUP_COMMANDS']:
		sanitized = command_regex.sub(command_regex_matcher, sanitized)

	sanitized = markdown(sanitized)

	sanitized = strikethrough_regex.sub(r'\1<del>\2</del>', sanitized)

	# strip invisible characters (U+200E, U+200B, U+FEFF) and the oversized cuneiform glyph
	sanitized = sanitized.replace('\u200e', '').replace('\u200b', '').replace('\ufeff', '').replace('𒐪', '')

	sanitized = reddit_regex.sub(r'\1<a href="https://old.reddit.com/\2" rel="nofollow noopener noreferrer" target="_blank">/\2</a>', sanitized)

	sanitized = sub_regex.sub(r'\1<a href="/\2">/\2</a>', sanitized)

	v = getattr(g, 'v', None)

	names = set(m.group(2) for m in mention_regex.finditer(sanitized))
	if limit_pings and len(names) > limit_pings and not v.admin_level: abort(406)

	users_list = get_users(names, graceful=True)
	users_dict = {}
	for u in users_list:
		users_dict[u.username.lower()] = u
		if u.original_username:
			users_dict[u.original_username.lower()] = u

	def replacer(m):
		u = users_dict.get(m.group(2).lower())
		if not u:
			return m.group(0)
		return f'{m.group(1)}<a href="/id/{u.id}"><img loading="lazy" src="/pp/{u.id}">@{u.username}</a>'

	sanitized = mention_regex.sub(replacer, sanitized)

	soup = BeautifulSoup(sanitized, 'lxml')

	for tag in soup.find_all("img"):
		if tag.get("src") and not tag["src"].startswith('/pp/'):
			if not is_safe_url(tag["src"]):
				a = soup.new_tag("a", href=tag["src"], rel="nofollow noopener noreferrer", target="_blank")
				a.string = tag["src"]
				tag.replace_with(a)
				continue

			tag["loading"] = "lazy"
			tag["data-src"] = tag["src"]
			tag["src"] = "/i/l.webp"
			tag['alt'] = f'![]({tag["data-src"]})'

			if not is_site_url(tag["data-src"]):
				tag['referrerpolicy'] = "no-referrer"

			if tag.parent.name != 'a':
				a = soup.new_tag("a", href=tag["data-src"])
				if not is_site_url(a["href"]):
					a["rel"] = "nofollow noopener noreferrer"
					a["target"] = "_blank"
				tag = tag.replace_with(a)
				a.append(tag)

	for tag in soup.find_all("a"):
		if not tag.contents or not str(tag.contents[0]).strip():
			tag.extract()
		if tag.get("href") and fishylinks_regex.fullmatch(str(tag.string)):
			tag.string = tag["href"]

	sanitized = str(soup)

	sanitized = spoiler_regex.sub(r'<spoiler>\1</spoiler>', sanitized)

	marseys_used = set()

	emojis = list(emoji_regex.finditer(sanitized))
	if len(emojis) > 20: golden = False

	captured = []
	for i in emojis:
		if i.group(0) in captured: continue
		captured.append(i.group(0))

		old = i.group(0)
		if 'marseylong1' in old or 'marseylong2' in old or 'marseyllama1' in old or 'marseyllama2' in old: new = old.lower().replace(">", " class='mb-0'>")
		else: new = old.lower()

		new = render_emoji(new, emoji_regex2, golden, marseys_used, True)

		sanitized = sanitized.replace(old, new)

	emojis = list(emoji_regex2.finditer(sanitized))
	if len(emojis) > 20: golden = False

	sanitized = render_emoji(sanitized, emoji_regex2, golden, marseys_used)

	sanitized = sanitized.replace('&amp;', '&')

	if "https://youtube.com/watch?v=" in sanitized: sanitized = sanitized.replace("?t=", "&t=")

	captured = []
	for i in youtube_regex.finditer(sanitized):
		if i.group(0) in captured: continue
		captured.append(i.group(0))

		params = parse_qs(urlparse(i.group(2)).query, keep_blank_values=True)
		t = params.get('t', params.get('start', [0]))[0]
		if isinstance(t, str): t = t.replace('s', '')

		htmlsource = f'{i.group(1)}<lite-youtube videoid="{i.group(3)}" params="autoplay=1&modestbranding=1'
		if t: htmlsource += f'&start={t}'
		htmlsource += '"></lite-youtube>'

		sanitized = sanitized.replace(i.group(0), htmlsource)

	sanitized = video_sub_regex.sub(r'\1<video controls preload="metadata"><source src="\2"></video>', sanitized)
	sanitized = audio_sub_regex.sub(r'\1<audio controls preload="metadata" src="\2"></audio>', sanitized)

	if count_marseys:
		for marsey in g.db.query(Marsey).filter(Marsey.submitter_id == None, Marsey.name.in_(marseys_used)).all():
			marsey.count += 1
			g.db.add(marsey)

	sanitized = sanitized.replace('<p></p>', '')

	sanitized = utm_regex.sub('', sanitized)
	sanitized = utm_regex2.sub('', sanitized)

	sanitized = sanitized.replace('<html><body>', '').replace('</body></html>', '')

	css_sanitizer = CSSSanitizer(allowed_css_properties=allowed_styles)

	sanitized = bleach.Cleaner(tags=allowed_tags,
		attributes=allowed_attributes,
		protocols=['http', 'https'],
		css_sanitizer=css_sanitizer,
		filters=[partial(LinkifyFilter, skip_tags=["pre"],
			parse_email=False, callbacks=[callback], url_re=url_re)]
	).clean(sanitized)

	soup = BeautifulSoup(sanitized, 'lxml')

	links = soup.find_all("a")

	domain_list = set()

	for link in links:
		href = link.get("href")
		if not href: continue

		url = urlparse(href)
		domain = url.netloc
		url_path = url.path
		domain_list.add(domain + url_path)

		parts = domain.split(".")
		for i in range(len(parts)):
			new_domain = parts[i]
			for j in range(i + 1, len(parts)):
				new_domain += "." + parts[j]
			domain_list.add(new_domain)

	bans = g.db.query(BannedDomain.domain).filter(BannedDomain.domain.in_(list(domain_list))).all()
	if bans: abort(403, description=f"Remove the banned domains {bans} and try again!")

	if '<pre>' not in sanitized:
		sanitized = sanitized.replace('\n', '')

	if showmore and len(sanitized) > 5000:
		sanitized = showmore_regex.sub(r'\1<p><button class="showmore" onclick="showmore()">SHOW MORE</button></p><d class="d-none">\2</d>', sanitized, count=1)

	return sanitized.strip()

def allowed_attributes_emojis(tag, name, value):

	if tag == 'img':
		if name == 'src' and value.startswith('/') and '\\' not in value: return True
		if name == 'loading' and value == 'lazy': return True
		if name == 'data-bs-toggle' and value == 'tooltip': return True
		if name in ['g', 'glow'] and not value: return True
		if name in ['alt', 'title']: return True

	if tag == 'span':
		if name == 'data-bs-toggle' and value == 'tooltip': return True
		if name == 'title': return True
		if name == 'alt': return True
		return False

	return False

@with_sigalrm_timeout(1)
def filter_emojis_only(title, golden=True, count_marseys=False, graceful=False, torture=False):

	title = title.strip()

	if torture:
		title = torture_ap(title, g.v.username)

	# strip invisible characters, then HTML-escape the remainder
	title = title.replace('\u200e', '').replace('\u200b', '').replace('\ufeff', '').replace('𒐪', '').replace("\n", "").replace("\r", "").replace("\t", "").replace("&", "&amp;").replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", "&#039;").strip()

	marseys_used = set()

	title = render_emoji(title, emoji_regex3, golden, marseys_used)

	if count_marseys:
		for marsey in g.db.query(Marsey).filter(Marsey.submitter_id == None, Marsey.name.in_(marseys_used)).all():
			marsey.count += 1
			g.db.add(marsey)

	title = strikethrough_regex.sub(r'\1<del>\2</del>', title)

	title = bleach.clean(title, tags=['img', 'del', 'span'], attributes=allowed_attributes_emojis, protocols=['http', 'https'])

	if len(title) > 1500 and not graceful: abort(400)
	else: return title.replace('\n', '').strip()

def normalize_url(url):
	url = reddit_domain_regex.sub(r'\1https://old.reddit.com/\3/', url)

	url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=") \
			 .replace("https://music.youtube.com/watch?v=", "https://youtube.com/watch?v=") \
			 .replace("https://www.youtube.com", "https://youtube.com") \
			 .replace("https://youtube.com/shorts/", "https://youtube.com/watch?v=") \
			 .replace("https://youtube.com/v/", "https://youtube.com/watch?v=") \
			 .replace("https://mobile.twitter.com", "https://twitter.com") \
			 .replace("https://m.facebook.com", "https://facebook.com") \
			 .replace("https://m.wikipedia.org", "https://wikipedia.org") \
			 .replace("https://m.youtube.com", "https://youtube.com") \
			 .replace("https://www.twitter.com", "https://twitter.com") \
			 .replace("https://www.instagram.com", "https://instagram.com") \
			 .replace("https://www.tiktok.com", "https://tiktok.com") \
			 .replace("https://www.streamable.com", "https://streamable.com") \
			 .replace("https://streamable.com/", "https://streamable.com/e/") \
			 .replace("https://streamable.com/e/e/", "https://streamable.com/e/") \
			 .replace("https://search.marsey.cat/#", "https://camas.unddit.com/#") \
			 .replace("https://imgur.com/", "https://i.imgur.com/")

	url = imgur_regex.sub(r'\1_d.webp?maxwidth=9999&fidelity=high', url)
	url = giphy_regex.sub(r'\1.webp', url)

	return url
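
# Illustrative sketch, not part of the original module: a couple of inputs and
# what they normalize to. Expected result in both cases is
# 'https://youtube.com/watch?v=abc', assuming the reddit/imgur/giphy regexes
# (defined elsewhere in the app) don't match these URLs.
def _demo_normalize_url():
	return normalize_url("https://m.youtube.com/watch?v=abc"), normalize_url("https://youtu.be/abc")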

def validate_css(css):
	if '@import' in css:
		return False, "@import statements not allowed."

	for i in css_url_regex.finditer(css):
		url = i.group(1)
		if not is_safe_url(url):
			domain = tldextract.extract(url).registered_domain
			return False, f"The domain '{domain}' is not allowed, please use one of these domains\n\n{approved_embed_hosts}."

	return True, ""
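
# Illustrative sketch, not part of the original module: validate_css returns an
# (ok, error_message) pair, so callers branch on the first element. The CSS
# below is hypothetical; css_url_regex and is_safe_url come from elsewhere in
# the app, and abort() is the Flask helper used throughout this file.
def _demo_validate_css(css='body { background: url(https://example.com/bg.webp) }'):
	ok, error = validate_css(css)
	if not ok:
		abort(400, description=error)
	return css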