import bleach
from bs4 import BeautifulSoup
from bleach.linkifier import LinkifyFilter, build_url_re
from functools import partial
from .get import *
from .patter import pat
from os import path, environ
import re
from mistletoe import markdown
from json import loads, dump
from random import random, choice
import signal
import time
import requests
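# TLDs the linkifier will recognize when autolinking bare domains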
TLDS = ('ac','ad','ae','aero','af','ag','ai','al','am','an','ao','aq','ar','arpa','as','asia','at','au','aw','ax','az','ba','bb','bd','be','bf','bg','bh','bi','biz','bj','bm','bn','bo','br','bs','bt','bv','bw','by','bz','ca','cafe','cat','cc','cd','cf','cg','ch','ci','ck','cl','club','cm','cn','co','com','coop','cr','cu','cv','cx','cy','cz','de','dj','dk','dm','do','dz','ec','edu','ee','eg','er','es','et','eu','fi','fj','fk','fm','fo','fr','ga','gb','gd','ge','gf','gg','gh','gi','gl','gm','gn','gov','gp','gq','gr','gs','gt','gu','gw','gy','hk','hm','hn','hr','ht','hu','id','ie','il','im','in','info','int','io','iq','ir','is','it','je','jm','jo','jobs','jp','ke','kg','kh','ki','km','kn','kp','kr','kw','ky','kz','la','lb','lc','li','lk','lr','ls','lt','lu','lv','ly','ma','mc','md','me','mg','mh','mil','mk','ml','mm','mn','mo','mobi','mp','mq','mr','ms','mt','mu','museum','mv','mw','mx','my','mz','na','name','nc','ne','net','nf','ng','ni','nl','no','np','nr','nu','nz','om','org','pa','pe','pf','pg','ph','pk','pl','pm','pn','post','pr','pro','ps','pt','pw','py','qa','re','ro','rs','ru','rw','sa','sb','sc','sd','se','sg','sh','si','sj','sk','sl','sm','sn','so','social','sr','ss','st','su','sv','sx','sy','sz','tc','td','tel','tf','tg','th','tj','tk','tl','tm','tn','to','tp','tr','travel','tt','tv','tw','tz','ua','ug','uk','us','uy','uz','va','vc','ve','vg','vi','vn','vu','wf','win','ws','xn','xxx','ye','yt','yu','za','zm','zw')
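# HTML tags allowed to survive bleach cleaning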
allowed_tags = ('b','blockquote','br','code','del','em','h1','h2','h3','h4','h5','h6','hr','i','li','ol','p','pre','strong','sub','sup','table','tbody','th','thead','td','tr','ul','marquee','a','span','ruby','rp','rt','spoiler','img','lite-youtube','video','source')
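# bleach attribute callback: return True to keep attribute `name` with `value`
# on `tag`, False to drop it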
def allowed_attributes(tag, name, value):
if name == 'style': return True
if tag == 'marquee':
if name in ['direction', 'behavior', 'scrollamount']: return True
if name in {'height', 'width'}:
try: value = int(value.replace('px', ''))
except: return False
if 0 < value <= 250: return True
return False
if tag == 'a':
if name == 'href': return True
if name == 'rel' and value == 'nofollow noopener noreferrer': return True
if name == 'target' and value == '_blank': return True
return False
if tag == 'img':
if name in ['src','data-src']:
if value.startswith('/') or value.startswith(f'{SITE_FULL}/') or embed_fullmatch_regex.fullmatch(value): return True
else: return False
if name == 'loading' and value == 'lazy': return True
		if name == 'referrerpolicy' and value == 'no-referrer': return True
if name == 'data-bs-toggle' and value == 'tooltip': return True
if name in ['alt','title','g','b']: return True
return False
if tag == 'lite-youtube':
if name == 'params' and value.startswith('autoplay=1&modestbranding=1'): return True
if name == 'videoid': return True
return False
if tag == 'video':
if name == 'controls' and value == '': return True
if name == 'preload' and value == 'none': return True
return False
if tag == 'source':
if name == 'src' and embed_fullmatch_regex.fullmatch(value): return True
return False
if tag == 'p':
if name == 'class' and value == 'mb-0': return True
return False
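# URL pattern for the linkifier, restricted to http(s) and the TLD list above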
url_re = build_url_re(tlds=TLDS, protocols=['http', 'https'])
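# linkify callback: make external links open in a new tab without leaking the
# referrer; site-local links are left untouched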
def callback(attrs, new=False):
href = attrs[(None, "href")]
if not href.startswith('/') and not href.startswith(f'{SITE_FULL}/'):
attrs[(None, "target")] = "_blank"
attrs[(None, "rel")] = "nofollow noopener noreferrer"
return attrs
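# SIGALRM handler backing the one-second watchdog set around sanitization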
def handler(signum, frame):
print("Timeout!")
raise Exception("Timeout")
def sanitize(sanitized, alert=False, comment=False, edit=False):
signal.signal(signal.SIGALRM, handler)
signal.alarm(1)
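	# pre-render cleanup: normalize linefeeds into paragraph breaks and turn
	# bare image URLs into markdown image syntax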
sanitized = linefeeds_regex.sub(r'\1\n\n\2', sanitized)
sanitized = image_regex.sub(r'\1![](\2)\4', sanitized)
sanitized = image_check_regex.sub(r'\1', sanitized)
sanitized = markdown(sanitized)
	# mistletoe's markdown() does not render ~~strikethrough~~, so handle it here;
	# the <del> wrapper (an allowed tag above) was lost in extraction and is assumed
	sanitized = strikethrough_regex.sub(r'<del>\1</del>', sanitized)
	# strip invisible characters; the first two literals (assumed U+200E and
	# U+200B) did not survive extraction
	sanitized = sanitized.replace('\u200e', '').replace('\u200b', '').replace("\ufeff", "").replace("𒐪", "")
if alert:
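		# notification text: link each unique @mention to the user it names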
captured = []
for i in mention_regex2.finditer(sanitized):
if i.group(0) in captured: continue
captured.append(i.group(0))
u = get_user(i.group(1), graceful=True)
if u:
				# the anchor markup was lost in extraction; a profile link carrying
				# the user's /pp/ avatar (see the image pass below) is an assumption
				sanitized = sanitized.replace(i.group(0), f'''<p><a href="/id/{u.id}"><img loading="lazy" src="/pp/{u.id}">@{u.username}</a></p>''')
else:
		# the anchor markup in these substitutions was lost in extraction; linking
		# r/ and u/ references to old.reddit.com and hole references locally is assumed
		sanitized = reddit_regex.sub(r'\1<a href="https://old.reddit.com/\2" rel="nofollow noopener noreferrer" target="_blank">/\2</a>', sanitized)
		sanitized = sub_regex.sub(r'\1<a href="/\2">/\2</a>', sanitized)
captured = []
for i in mention_regex.finditer(sanitized):
if i.group(0) in captured: continue
captured.append(i.group(0))
u = get_user(i.group(2), graceful=True)
if u and (not (g.v and g.v.any_block_exists(u)) or g.v.admin_level > 1):
				# as in the alert branch, the anchor markup here was lost in
				# extraction and is reconstructed as a profile link
				sanitized = sanitized.replace(i.group(0), f'''{i.group(1)}<a href="/id/{u.id}"><img loading="lazy" src="/pp/{u.id}">@{u.username}</a>''')
sanitized = imgur_regex.sub(r'\1_d.webp?maxwidth=9999&fidelity=high', sanitized)
soup = BeautifulSoup(sanitized, 'lxml')
for tag in soup.find_all("img"):
if tag.get("src") and not tag["src"].startswith('/pp/'):
tag["loading"] = "lazy"
tag["data-src"] = tag["src"]
tag["src"] = "/assets/images/loading.webp"
tag['alt'] = f'![]({tag["data-src"]})'
tag['referrerpolicy'] = "no-referrer"
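	# if a link's visible text itself looks like a URL, replace the text with
	# the real destination so it cannot spoof a different target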
for tag in soup.find_all("a"):
if tag.get("href") and fishylinks_regex.fullmatch(str(tag.string)):
tag.string = tag["href"]
sanitized = str(soup)
	sanitized = spoiler_regex.sub(r'<spoiler>\1</spoiler>', sanitized)

	# the trigger token and the markup around the fortune reply were lost in
	# extraction; the '!fortune' token and the <p> wrapper are assumptions
	if '!fortune' in sanitized:
		sanitized = sanitized.replace('!fortune', '<p>' + choice(FORTUNE_REPLIES) + '</p>')

	# str(soup) re-escaped ampersands; undo that before the utm regexes run
	# (the '&amp;' literal was decoded during extraction)
	sanitized = sanitized.replace('&amp;', '&')

	# strip utm tracking parameters from links
	sanitized = utm_regex.sub('', sanitized)
	sanitized = utm_regex2.sub('', sanitized)

	# drop the document wrapper the lxml parser adds; both literals were lost in
	# extraction and '<html><body>' / '</body></html>' are assumptions
	sanitized = sanitized.replace('<html><body>', '').replace('</body></html>', '')

	sanitized = bleach.Cleaner(tags=allowed_tags,
							attributes=allowed_attributes,
							protocols=['http', 'https'],
							styles=['color', 'background-color', 'font-weight', 'text-align'],
							filters=[partial(LinkifyFilter, skip_tags=["pre"],
											parse_email=False, callbacks=[callback], url_re=url_re)]
							).clean(sanitized)

	# collect every linked domain, each of its parent domains, and domain+path,
	# then reject the whole submission if any of them is banned
	soup = BeautifulSoup(sanitized, 'lxml')
	links = soup.find_all("a")

	domain_list = set()
	for link in links:
		href = link.get("href")
		if not href: continue

		url = urlparse(href)
		domain = url.netloc
		url_path = url.path
		domain_list.add(domain + url_path)

		parts = domain.split(".")
		for i in range(len(parts)):
			new_domain = parts[i]
			for j in range(i + 1, len(parts)):
				new_domain += "." + parts[j]
			domain_list.add(new_domain)

	bans = g.db.query(BannedDomain.domain).filter(BannedDomain.domain.in_(list(domain_list))).all()
	if bans: abort(403, description=f"Remove the banned domains {bans} and try again!")

	signal.alarm(0)
	return sanitized

# attribute whitelist for emoji-only fields: only the <img> attributes the
# emoji substitutions below produce are kept
def allowed_attributes_emojis(tag, name, value):
	if tag == 'img':
		if name == 'loading' and value == 'lazy': return True
		if name == 'data-bs-toggle' and value == 'tooltip': return True
		if name in ['src','alt','title','g']: return True
	return False

# sanitizer for fields that may only contain emojis: escapes all HTML, then
# substitutes :emoji: codes with <img> tags
def filter_emojis_only(title, edit=False, graceful=False):
	signal.signal(signal.SIGALRM, handler)
	signal.alarm(1)

	# strip invisible characters and whitespace, then HTML-escape; the escape
	# entities were decoded during extraction, and the first two stripped
	# literals are assumed to be U+200E and U+200B
	title = title.replace('\u200e', '').replace('\u200b', '').replace("\ufeff", "").replace("𒐪", "").replace("\n", "").replace("\r", "").replace("\t", "").replace("&", "&amp;").replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", "&#039;").strip()

	emojis = list(emoji_regex4.finditer(title))
	if len(emojis) > 20: edit = True

	captured = []
	for i in emojis:
		if i.group(0) in captured: continue
		captured.append(i.group(0))

		emoji = i.group(1).lower()

		# roughly 1 in 400 marsey emojis in fresh (non-edit) text gets the
		# whitelisted 'g' (golden) attribute
		golden = ' '
		if not edit and random() < 0.0025 and ('marsey' in emoji or emoji in marseys_const2):
			golden = 'g '

		old = emoji
		emoji = emoji.replace('!','').replace('#','')
		if emoji == 'marseyrandom': emoji = choice(marseys_const2)

		# the pattern tail and the <img> replacement were lost in extraction; this
		# reconstruction uses only attributes whitelisted by allowed_attributes_emojis
		if path.isfile(f'files/assets/images/emojis/{emoji}.webp'):
			title = re.sub(f'(?<!"):{old}:', f'<img loading="lazy" data-bs-toggle="tooltip" alt=":{old}:" title=":{old}:" {golden}src="/e/{emoji}.webp">', title, flags=re.I|re.A)
		elif emoji.endswith('pat') and path.isfile(f"files/assets/images/emojis/{emoji.replace('pat','')}.webp"):
			# generate the patted variant on demand, substitute it, then purge any
			# stale copy from the Cloudflare cache
			pat(emoji.replace('pat',''))
			title = re.sub(f'(?<!"):{old}:', f'<img loading="lazy" data-bs-toggle="tooltip" alt=":{old}:" title=":{old}:" {golden}src="/e/{emoji}.webp">', title, flags=re.I|re.A)
			requests.post(f'https://api.cloudflare.com/client/v4/zones/{CF_ZONE}/purge_cache', headers=CF_HEADERS, data={'files': [f"https://{request.host}/e/{emoji}.webp"]}, timeout=5)

	# the original ends mid-statement here; mirroring the <del> substitution
	# used in sanitize() above is an assumption
	title = strikethrough_regex.sub(r'<del>\1</del>', title)