import functools
import bleach
from bs4 import BeautifulSoup
from bleach.css_sanitizer import CSSSanitizer
from bleach.linkifier import LinkifyFilter
from functools import partial
from .get import *
from os import path, environ
import re
from mistletoe import markdown
from json import loads, dump
from random import random, choice
import signal
import time
import requests

TLDS = ('aaa', 'aarp', 'abarth', 'abb', 'abbott', 'abbvie', 'abc', 'able', 'abogado', 'abudhabi', 'ac', 'academy', 'accenture', 'accountant', 'accountants', 'aco', 'actor', 'ad', 'adac', 'ads', 'adult', 'ae', 'aeg', 'aero', 'aetna', 'af', 'afl', 'africa', 'ag', 'agakhan', 'agency', 'ai', 'aig', 'airbus', 'airforce', 'airtel', 'akdn', 'al', 'alfaromeo',
	'alibaba', 'alipay', 'allfinanz', 'allstate', 'ally', 'alsace', 'alstom', 'am', 'amazon', 'americanexpress', 'americanfamily', 'amex', 'amfam', 'amica', 'amsterdam', 'analytics', 'android', 'anquan', 'anz', 'ao', 'aol', 'apartments', 'app', 'apple', 'aq', 'aquarelle', 'ar', 'arab', 'aramco', 'archi', 'army', 'arpa', 'art', 'arte', 'as', 'asda', 'asia', 'associates',
	'at', 'athleta', 'attorney', 'au', 'auction', 'audi', 'audible', 'audio', 'auspost', 'author', 'auto', 'autos', 'avianca', 'aw', 'aws', 'ax', 'axa', 'az', 'azure', 'ba', 'baby', 'baidu', 'banamex', 'bananarepublic', 'band', 'bank', 'bar', 'barcelona', 'barclaycard', 'barclays', 'barefoot', 'bargains', 'baseball', 'basketball', 'bauhaus', 'bayern', 'bb', 'bbc',
	'bbt', 'bbva', 'bcg', 'bcn', 'bd', 'be', 'beats', 'beauty', 'beer', 'bentley', 'berlin', 'best', 'bestbuy', 'bet', 'bf', 'bg', 'bh', 'bharti', 'bi', 'bible', 'bid', 'bike', 'bing', 'bingo', 'bio', 'biz', 'bj', 'black', 'blackfriday', 'blockbuster', 'blog', 'bloomberg', 'blue', 'bm', 'bms', 'bmw', 'bn', 'bnpparibas', 'bo', 'boats', 'boehringer', 'bofa', 'bom',
	'bond', 'boo', 'book', 'booking', 'bosch', 'bostik', 'boston', 'bot', 'boutique', 'box', 'br', 'bradesco', 'bridgestone', 'broadway', 'broker', 'brother', 'brussels', 'bs', 'bt', 'bugatti', 'build', 'builders', 'business', 'buy', 'buzz', 'bv', 'bw', 'by', 'bz', 'bzh', 'ca', 'cab', 'cafe', 'cal', 'call', 'calvinklein', 'cam', 'camera', 'camp', 'cancerresearch',
	'canon', 'capetown', 'capital', 'capitalone', 'car', 'caravan', 'cards', 'care', 'career', 'careers', 'cars', 'casa', 'case', 'cash', 'casino', 'cat', 'catering', 'catholic', 'cba', 'cbn', 'cbre', 'cbs', 'cc', 'cd', 'center', 'ceo', 'cern', 'cf', 'cfa', 'cfd', 'cg', 'ch', 'chanel', 'channel', 'charity', 'chase', 'chat', 'cheap', 'chintai', 'christmas', 'chrome',
	'church', 'ci', 'cipriani', 'circle', 'cisco', 'citadel', 'citi', 'citic', 'city', 'cityeats', 'ck', 'cl', 'claims', 'cleaning', 'click', 'clinic', 'clinique', 'clothing', 'cloud', 'club', 'clubmed', 'cm', 'cn', 'co', 'coach', 'codes', 'coffee', 'college', 'cologne', 'com', 'comcast', 'commbank', 'community', 'company', 'compare', 'computer', 'comsec', 'condos',
	'construction', 'consulting', 'contact', 'contractors', 'cooking', 'cookingchannel', 'cool', 'coop', 'corsica', 'country', 'coupon', 'coupons', 'courses', 'cpa', 'cr', 'credit', 'creditcard', 'creditunion', 'cricket', 'crown', 'crs', 'cruise', 'cruises', 'cu', 'cuisinella', 'cv', 'cw', 'cx', 'cy', 'cymru', 'cyou', 'cz', 'dabur', 'dad', 'dance', 'data', 'date',
	'dating', 'datsun', 'day', 'dclk', 'dds', 'de', 'deal', 'dealer', 'deals', 'degree', 'delivery', 'dell', 'deloitte', 'delta', 'democrat', 'dental', 'dentist', 'desi', 'design', 'dev', 'dhl', 'diamonds', 'diet', 'digital', 'direct', 'directory', 'discount', 'discover', 'dish', 'diy', 'dj', 'dk', 'dm', 'dnp', 'do', 'docs', 'doctor', 'dog', 'domains', 'dot',
	'download', 'drive', 'dtv', 'dubai', 'dunlop', 'dupont', 'durban', 'dvag', 'dvr', 'dz', 'earth', 'eat', 'ec', 'eco', 'edeka', 'edu', 'education', 'ee', 'eg', 'email', 'emerck', 'energy', 'engineer', 'engineering', 'enterprises', 'epson', 'equipment', 'er', 'ericsson', 'erni', 'es', 'esq', 'estate', 'et', 'etisalat', 'eu', 'eurovision', 'eus', 'events',
	'exchange', 'expert', 'exposed', 'express', 'extraspace', 'fage', 'fail', 'fairwinds', 'faith', 'family', 'fan', 'fans', 'farm', 'farmers', 'fashion', 'fast', 'fedex', 'feedback', 'ferrari', 'ferrero', 'fi', 'fiat', 'fidelity', 'fido', 'film', 'final', 'finance', 'financial', 'fire', 'firestone', 'firmdale', 'fish', 'fishing', 'fit', 'fitness', 'fj', 'fk',
	'flickr', 'flights', 'flir', 'florist', 'flowers', 'fly', 'fm', 'fo', 'foo', 'food', 'foodnetwork', 'football', 'ford', 'forex', 'forsale', 'forum', 'foundation', 'fox', 'fr', 'free', 'fresenius', 'frl', 'frogans', 'frontdoor', 'frontier', 'ftr', 'fujitsu', 'fun', 'fund', 'furniture', 'futbol', 'fyi', 'ga', 'gal', 'gallery', 'gallo', 'gallup', 'game', 'games',
	'gap', 'garden', 'gay', 'gb', 'gbiz', 'gd', 'gdn', 'ge', 'gea', 'gent', 'genting', 'george', 'gf', 'gg', 'ggee', 'gh', 'gi', 'gift', 'gifts', 'gives', 'giving', 'gl', 'glass', 'gle', 'global', 'globo', 'gm', 'gmail', 'gmbh', 'gmo', 'gmx', 'gn', 'godaddy',
	# ... (TLD list truncated in this excerpt)
)

allowed_tags = ('b', 'blockquote', 'br', 'code', 'del', 'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
	'li', 'ol', 'p', 'pre', 'strong', 'sub', 'sup', 'table', 'tbody', 'th', 'thead', 'td', 'tr', 'ul',
	'marquee', 'a', 'span', 'ruby', 'rp', 'rt', 'spoiler', 'img', 'lite-youtube', 'video', 'source', 'audio', 'g')

allowed_styles = ['color', 'background-color', 'font-weight', 'text-align', 'filter']

def allowed_attributes(tag, name, value):

	if name == 'style': return True

	if tag == 'marquee':
		if name in ['direction', 'behavior', 'scrollamount']: return True
		if name in {'height', 'width'}:
			try: value = int(value.replace('px', ''))
			except: return False
			if 0 < value <= 250: return True
		return False

	if tag == 'a':
		if name == 'href' and '\\' not in value and 'xn--' not in value:
			return True
		if name == 'rel' and value == 'nofollow noopener noreferrer': return True
		if name == 'target' and value == '_blank': return True
		return False

	if tag == 'img':
		if name in ['src', 'data-src']: return is_safe_url(value)
		if name == 'loading' and value == 'lazy': return True
		if name == 'data-bs-toggle' and value == 'tooltip': return True
		if name in ['g', 'b', 'glow'] and not value: return True
		if name in ['alt', 'title']: return True
		if name == 'referrerpolicy' and value == 'no-referrer': return True
		return False

	if tag == 'lite-youtube':
		if name == 'params' and value.startswith('autoplay=1&modestbranding=1'): return True
		if name == 'videoid': return True
		return False

	if tag == 'video':
		if name == 'controls' and value == '': return True
		if name == 'preload' and value == 'none': return True
		return False

	if tag == 'source':
		if name == 'src': return is_safe_url(value)
		return False

	if tag == 'audio':
		if name == 'src': return is_safe_url(value)
		if name == 'controls' and value == '': return True
		if name == 'preload' and value == 'none': return True
		return False

	if tag == 'p':
		if name == 'class' and value == 'mb-0': return True
		return False

	if tag == 'span':
		if name == 'data-bs-toggle' and value == 'tooltip': return True
		if name == 'title': return True
		if name == 'alt': return True
		return False

	return False
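
# Illustrative sketch, not part of the original module: bleach calls an
# `attributes` callable once per (tag, attribute, value) and drops anything it
# returns falsy for. The markup below is a made-up example; with the rules
# above, an oversized marquee keeps its direction but loses height and onclick.
def _demo_allowed_attributes():
	cleaner = bleach.Cleaner(tags={'marquee'}, attributes=allowed_attributes)
	dirty = '<marquee direction="left" height="9999" onclick="evil()">hi</marquee>'
	return cleaner.clean(dirty)  # -> <marquee direction="left">hi</marquee>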

def build_url_re(tlds, protocols):
	"""Builds the url regex used by linkifier

	If you want a different set of tlds or allowed protocols, pass those in
	and stomp on the existing ``url_re``::

		from bleach import linkifier

		my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

		linker = LinkifyFilter(url_re=my_url_re)

	"""
	return re.compile(
		r"""\(*  # Match any opening parentheses.
		\b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
		([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
		(?:[/?][^#\s\{{\}}\|\\\^\[\]`<>"]*)?
			# /path/zz (excluding "unsafe" chars from RFC 1738,
			# except for ~, which happens in practice)
		(?:\#[^#\s\|\\\^\[\]`<>"]*)?
			# #hash (excluding "unsafe" chars from RFC 1738,
			# except for ~, which happens in practice)
		""".format(
			"|".join(sorted(protocols)), "|".join(sorted(tlds))
		),
		re.IGNORECASE | re.VERBOSE | re.UNICODE,
	)

url_re = build_url_re(tlds=TLDS, protocols=['http', 'https'])
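
# Illustrative sketch, not part of the original module: the compiled url_re is
# what LinkifyFilter uses to spot bare URLs in text. 'com' is in the (truncated)
# TLDS tuple above, so a quick check looks like this:
def _demo_url_re():
	m = url_re.search('see https://example.com/page for details')
	return m.group(0) if m else None  # -> 'https://example.com/page'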

def callback(attrs, new=False):
	if (None, "href") not in attrs:
		return  # Incorrect <a> tag

	href = attrs[(None, "href")]

	# \ in href right after / makes most browsers ditch the site hostname and allows
	# for a host injection bypassing the check, see <a href="/\google.com">cool</a>
	if "\\" in href or not ascii_only_regex.fullmatch(href):
		attrs["_text"] = href  # Laugh at this user
		del attrs[(None, "href")]  # Make unclickable and reset harmful payload
		return attrs

	if not href.startswith('/') and not href.startswith(f'{SITE_FULL}/'):
		attrs[(None, "target")] = "_blank"
		attrs[(None, "rel")] = "nofollow noopener noreferrer"

	return attrs
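
# Illustrative sketch, not part of the original module: LinkifyFilter passes
# each link's attribute dict through the callbacks above, keyed by
# (namespace, name) tuples. The dicts below are hypothetical inputs;
# ascii_only_regex and SITE_FULL are module globals pulled in via `from .get import *`.
def _demo_callback():
	external = callback({(None, 'href'): 'https://example.com', '_text': 'https://example.com'})
	# -> target="_blank" and rel="nofollow noopener noreferrer" added (assuming
	#    the URL passes ascii_only_regex and is not a SITE_FULL link)
	fishy = callback({(None, 'href'): '/\\google.com', '_text': 'cool'})
	# -> href deleted, link text replaced with the raw payload
	return external, fishy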

def render_emoji(html, regexp, golden, marseys_used, b=False):
	emojis = list(regexp.finditer(html))
	captured = set()

	for i in emojis:
		if i.group(0) in captured: continue
		captured.add(i.group(0))

		emoji = i.group(1).lower()
		attrs = ''
		if b: attrs += ' b'

		if golden and len(emojis) <= 20 and ('marsey' in emoji or emoji in marseys_const2):
			if random() < 0.0025: attrs += ' g'
			elif random() < 0.00125: attrs += ' glow'

		old = emoji
		emoji = emoji.replace('!', '').replace('#', '')
		if emoji == 'marseyrandom': emoji = choice(marseys_const2)

		emoji_partial_pat = '<img loading="lazy" alt=":{0}:" src="{1}"{2}>'
		emoji_partial = '<img loading="lazy" data-bs-toggle="tooltip" alt=":{0}:" title=":{0}:" src="{1}"{2}>'
		emoji_html = None

		if emoji.endswith('pat') and emoji != 'marseyunpettablepat':
			if path.isfile(f"files/assets/images/emojis/{emoji.replace('pat', '')}.webp"):
				emoji_html = f'<span data-bs-toggle="tooltip" alt=":{old}:" title=":{old}:"><img src="/i/hand.webp">{emoji_partial_pat.format(old, f"/e/{emoji[:-3]}.webp", attrs)}</span>'
			elif emoji.startswith('@'):
				if u := get_user(emoji[1:-3], graceful=True):
					emoji_html = f'<span data-bs-toggle="tooltip" alt=":{old}:" title=":{old}:"><img src="/i/hand.webp">{emoji_partial_pat.format(old, f"/pp/{u.id}", attrs)}</span>'
		elif path.isfile(f'files/assets/images/emojis/{emoji}.webp'):
			emoji_html = emoji_partial.format(old, f'/e/{emoji}.webp', attrs)

		if emoji_html:
			marseys_used.add(emoji)
			html = re.sub(f'(?<!"){i.group(0)}', emoji_html, html)

	return html
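
# Illustrative sketch, not part of the original module: render_emoji rewrites
# ':marseylove:'-style tokens into <img> tags when a matching webp exists under
# files/assets/images/emojis/. The regex here is a simplified stand-in for the
# real emoji_regex2; on a machine without the asset files the input is returned
# unchanged, since no emoji_html gets built.
def _demo_render_emoji():
	toy_regex = re.compile(r':([!#@\w\-]+):')
	used = set()
	return render_emoji(':marseylove:', toy_regex, False, used)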

def with_sigalrm_timeout(timeout: int):
	'Use SIGALRM to raise an exception if the function executes for longer than timeout seconds'

	# While trying to test this using time.sleep I discovered that gunicorn does in fact do some
	# async, so if we time out on that (or on a db op) then the process is crashed without
	# returning a proper 500 error. Oh well.
	def sig_handler(signum, frame):
		print("Timeout!", flush=True)
		raise Exception("Timeout")

	def inner(func):
		@functools.wraps(func)
		def wrapped(*args, **kwargs):
			signal.signal(signal.SIGALRM, sig_handler)
			signal.alarm(timeout)
			try:
				return func(*args, **kwargs)
			finally:
				signal.alarm(0)
		return wrapped

	return inner
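
# Illustrative sketch, not part of the original module: what the decorator does
# to a deliberately slow function. SIGALRM only works on the main thread of a
# Unix process, which also holds for the real usage below.
@with_sigalrm_timeout(1)
def _demo_slow():
	time.sleep(5)  # never finishes: sig_handler raises after ~1 second

# Calling _demo_slow() raises Exception("Timeout"), and the finally block
# clears the alarm so later code is not interrupted.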

@with_sigalrm_timeout(2)
def sanitize(sanitized, golden=True, limit_pings=0, showmore=True, count_marseys=False, torture=False):
	sanitized = sanitized.strip()

	if torture:
		sanitized = torture_ap(sanitized, g.v.username)
		sanitized += '\n:#trumpjaktalking:'

	sanitized = normalize_url(sanitized)

	if '```' not in sanitized and '<pre>' not in sanitized:
		sanitized = linefeeds_regex.sub(r'\1\n\n\2', sanitized)

	sanitized = greentext_regex.sub(r'\1<g>\>\2</g>', sanitized)

	sanitized = image_regex.sub(r'\1![](\2)\5', sanitized)

	sanitized = image_check_regex.sub(r'\1', sanitized)

	sanitized = link_fix_regex.sub(r'\1https://\2', sanitized)

	if FEATURES['MARKUP_COMMANDS']:
		sanitized = command_regex.sub(command_regex_matcher, sanitized)

	sanitized = markdown(sanitized)

	sanitized = strikethrough_regex.sub(r'\1<del>\2</del>', sanitized)

	# strip invisible characters (U+200E, U+200B, U+FEFF) and the oversized cuneiform glyph
	sanitized = sanitized.replace('\u200e', '').replace('\u200b', '').replace('\ufeff', '').replace('𒐪', '')

	sanitized = reddit_regex.sub(r'\1<a href="https://old.reddit.com/\2" rel="nofollow noopener noreferrer" target="_blank">/\2</a>', sanitized)

	sanitized = sub_regex.sub(r'\1<a href="/\2">/\2</a>', sanitized)

	v = getattr(g, 'v', None)

	names = set(m.group(2) for m in mention_regex.finditer(sanitized))
	if limit_pings and len(names) > limit_pings and not v.admin_level: abort(406)

	users_list = get_users(names, graceful=True)
	users_dict = {}
	for u in users_list:
		users_dict[u.username.lower()] = u
		if u.original_username:
			users_dict[u.original_username.lower()] = u

	def replacer(m):
		u = users_dict.get(m.group(2).lower())
		if not u:
			return m.group(0)
		return f'{m.group(1)}<a href="/id/{u.id}"><img loading="lazy" src="/pp/{u.id}">@{u.username}</a>'

	sanitized = mention_regex.sub(replacer, sanitized)

	soup = BeautifulSoup(sanitized, 'lxml')

	for tag in soup.find_all("img"):
		if tag.get("src") and not tag["src"].startswith('/pp/'):
			if not is_safe_url(tag["src"]):
				a = soup.new_tag("a", href=tag["src"], rel="nofollow noopener noreferrer", target="_blank")
				a.string = tag["src"]
				tag.replace_with(a)
				continue

			tag["loading"] = "lazy"
			tag["data-src"] = tag["src"]
			tag["src"] = "/i/l.webp"
			tag['alt'] = f'![]({tag["data-src"]})'

			if not is_site_url(tag["data-src"]):
				tag['referrerpolicy'] = "no-referrer"

			if tag.parent.name != 'a':
				a = soup.new_tag("a", href=tag["data-src"])
				if not is_site_url(a["href"]):
					a["rel"] = "nofollow noopener noreferrer"
					a["target"] = "_blank"
				tag = tag.replace_with(a)
				a.append(tag)

	for tag in soup.find_all("a"):
		if not tag.contents or not str(tag.contents[0]).strip():
			tag.extract()
		if tag.get("href") and fishylinks_regex.fullmatch(str(tag.string)):
			tag.string = tag["href"]

	sanitized = str(soup)

	sanitized = spoiler_regex.sub(r'<spoiler>\1</spoiler>', sanitized)

	marseys_used = set()

	emojis = list(emoji_regex.finditer(sanitized))
	if len(emojis) > 20: golden = False

	captured = []
	for i in emojis:
		if i.group(0) in captured: continue
		captured.append(i.group(0))

		old = i.group(0)
		if 'marseylong1' in old or 'marseylong2' in old or 'marseyllama1' in old or 'marseyllama2' in old: new = old.lower().replace(">", " class='mb-0'>")
		else: new = old.lower()

		new = render_emoji(new, emoji_regex2, golden, marseys_used, True)

		sanitized = sanitized.replace(old, new)

	emojis = list(emoji_regex2.finditer(sanitized))
	if len(emojis) > 20: golden = False

	sanitized = render_emoji(sanitized, emoji_regex2, golden, marseys_used)

	sanitized = sanitized.replace('&amp;', '&')

	if "https://youtube.com/watch?v=" in sanitized: sanitized = sanitized.replace("?t=", "&t=")

	captured = []
	for i in youtube_regex.finditer(sanitized):
		if i.group(0) in captured: continue
		captured.append(i.group(0))

		params = parse_qs(urlparse(i.group(2)).query, keep_blank_values=True)
		t = params.get('t', params.get('start', [0]))[0]
		if isinstance(t, str): t = t.replace('s', '')

		htmlsource = f'{i.group(1)}<lite-youtube videoid="{i.group(3)}" params="autoplay=1&modestbranding=1'
		if t: htmlsource += f'&start={t}'
		htmlsource += '"></lite-youtube>'

		sanitized = sanitized.replace(i.group(0), htmlsource)

	sanitized = video_sub_regex.sub(r'\1<video controls preload="metadata"><source src="\2"></video>', sanitized)
	sanitized = audio_sub_regex.sub(r'\1<audio controls preload="metadata" src="\2"></audio>', sanitized)

	if count_marseys:
		for marsey in g.db.query(Marsey).filter(Marsey.submitter_id == None, Marsey.name.in_(marseys_used)).all():
			marsey.count += 1
			g.db.add(marsey)

	sanitized = sanitized.replace('<p></p>', '')

	sanitized = utm_regex.sub('', sanitized)
	sanitized = utm_regex2.sub('', sanitized)

	sanitized = sanitized.replace('<html><body>', '').replace('</body></html>', '')

	css_sanitizer = CSSSanitizer(allowed_css_properties=allowed_styles)

	sanitized = bleach.Cleaner(tags=allowed_tags,
		attributes=allowed_attributes,
		protocols=['http', 'https'],
		css_sanitizer=css_sanitizer,
		filters=[partial(LinkifyFilter, skip_tags=["pre"],
			parse_email=False, callbacks=[callback], url_re=url_re)]
	).clean(sanitized)

	soup = BeautifulSoup(sanitized, 'lxml')

	links = soup.find_all("a")

	domain_list = set()

	for link in links:
		href = link.get("href")
		if not href: continue

		url = urlparse(href)
		domain = url.netloc
		url_path = url.path
		domain_list.add(domain + url_path)

		parts = domain.split(".")
		for i in range(len(parts)):
			new_domain = parts[i]
			for j in range(i + 1, len(parts)):
				new_domain += "." + parts[j]
			domain_list.add(new_domain)

	bans = g.db.query(BannedDomain.domain).filter(BannedDomain.domain.in_(list(domain_list))).all()
	if bans: abort(403, description=f"Remove the banned domains {bans} and try again!")

	if '<pre>' not in sanitized:
		sanitized = sanitized.replace('\n', '')

	if showmore and len(sanitized) > 5000:
		sanitized = showmore_regex.sub(r'\1<p><button class="showmore" onclick="showmore()">SHOW MORE</button></p><d class="d-none">\2</d>', sanitized, count=1)

	return sanitized.strip()

def allowed_attributes_emojis(tag, name, value):

	if tag == 'img':
		if name == 'src' and value.startswith('/') and '\\' not in value: return True
		if name == 'loading' and value == 'lazy': return True
		if name == 'data-bs-toggle' and value == 'tooltip': return True
		if name in ['g', 'glow'] and not value: return True
		if name in ['alt', 'title']: return True

	if tag == 'span':
		if name == 'data-bs-toggle' and value == 'tooltip': return True
		if name == 'title': return True
		if name == 'alt': return True
		return False

	return False

@with_sigalrm_timeout(1)
def filter_emojis_only(title, golden=True, count_marseys=False, graceful=False, torture=False):

	title = title.strip()

	if torture:
		title = torture_ap(title, g.v.username)

	# strip invisible characters, then HTML-escape the remainder
	title = title.replace('\u200e', '').replace('\u200b', '').replace('\ufeff', '').replace('𒐪', '').replace("\n", "").replace("\r", "").replace("\t", "").replace("&", "&amp;").replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", "&#039;").strip()

	marseys_used = set()

	title = render_emoji(title, emoji_regex3, golden, marseys_used)

	if count_marseys:
		for marsey in g.db.query(Marsey).filter(Marsey.submitter_id == None, Marsey.name.in_(marseys_used)).all():
			marsey.count += 1
			g.db.add(marsey)

	title = strikethrough_regex.sub(r'\1<del>\2</del>', title)

	title = bleach.clean(title, tags=['img', 'del', 'span'], attributes=allowed_attributes_emojis, protocols=['http', 'https'])

	if len(title) > 1500 and not graceful: abort(400)
	else: return title.replace('\n', '').strip()

def normalize_url(url):
	url = reddit_domain_regex.sub(r'\1https://old.reddit.com/\3/', url)

	url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=") \
			 .replace("https://music.youtube.com/watch?v=", "https://youtube.com/watch?v=") \
			 .replace("https://www.youtube.com", "https://youtube.com") \
			 .replace("https://youtube.com/shorts/", "https://youtube.com/watch?v=") \
			 .replace("https://youtube.com/v/", "https://youtube.com/watch?v=") \
			 .replace("https://mobile.twitter.com", "https://twitter.com") \
			 .replace("https://m.facebook.com", "https://facebook.com") \
			 .replace("https://m.wikipedia.org", "https://wikipedia.org") \
			 .replace("https://m.youtube.com", "https://youtube.com") \
			 .replace("https://www.twitter.com", "https://twitter.com") \
			 .replace("https://www.instagram.com", "https://instagram.com") \
			 .replace("https://www.tiktok.com", "https://tiktok.com") \
			 .replace("https://www.streamable.com", "https://streamable.com") \
			 .replace("https://streamable.com/", "https://streamable.com/e/") \
			 .replace("https://streamable.com/e/e/", "https://streamable.com/e/") \
			 .replace("https://search.marsey.cat/#", "https://camas.unddit.com/#") \
			 .replace("https://imgur.com/", "https://i.imgur.com/")

	url = imgur_regex.sub(r'\1_d.webp?maxwidth=9999&fidelity=high', url)
	url = giphy_regex.sub(r'\1.webp', url)

	return url
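
# Illustrative sketch, not part of the original module: a couple of inputs and
# what they normalize to. Expected result in both cases is
# 'https://youtube.com/watch?v=abc', assuming the reddit/imgur/giphy regexes
# (defined elsewhere in the app) don't match these URLs.
def _demo_normalize_url():
	return normalize_url("https://m.youtube.com/watch?v=abc"), normalize_url("https://youtu.be/abc")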

def validate_css(css):
	if '@import' in css:
		return False, "@import statements not allowed."

	for i in css_url_regex.finditer(css):
		url = i.group(1)
		if not is_safe_url(url):
			domain = tldextract.extract(url).registered_domain
			return False, f"The domain '{domain}' is not allowed, please use one of these domains\n\n{approved_embed_hosts}."

	return True, ""
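
# Illustrative sketch, not part of the original module: validate_css returns an
# (ok, error_message) pair, so callers branch on the first element. The CSS
# below is hypothetical; css_url_regex and is_safe_url come from elsewhere in
# the app, and abort() is the Flask helper used throughout this file.
def _demo_validate_css(css='body { background: url(https://example.com/bg.webp) }'):
	ok, error = validate_css(css)
	if not ok:
		abort(400, description=error)
	return css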