1. unify all link processing logic in one place
2. fix this https://rdrama.net/post/168836/texas-shooter-identified-as-mauricio-garcia/4113391#context
3. fix this https://rdrama.net/post/168836/texas-shooter-identified-as-mauricio-garcia/4142945#context
parent 8d2eca46e4
commit 09fd7a1bf7
@@ -40,8 +40,6 @@ title_regex = re.compile("[^\w ]", flags=re.A)
 controversial_regex = re.compile('["> ](https:\/\/old\.reddit\.com/r/\w{3,20}\/comments\/[\w\-.#&/=\?@%+]{5,250})["< ]', flags=re.A)
 
-fishylinks_regex = re.compile("(https?:\/\/)?[\w\-.#&/=\?@%;+,:]{2,10}\.[\w\-.#&/=\?@%;+,:]{2,250}", flags=re.A)
-
 spoiler_regex = re.compile('\|\|(.+?)\|\|(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)
 
 reddit_regex = re.compile('(^|\s|<p>|\()\/?(([ruRU])\/(\w|-){3,25})(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)
 
 sub_regex = re.compile('(^|\s|<p>|\()\/?([hH]\/(\w|-){3,25})(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)
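For reference, `spoiler_regex` pairs with the `spoiler_regex.sub()` call later in `sanitize()`. A minimal sketch of its behavior, using the pattern verbatim from above:

```python
import re

spoiler_regex = re.compile('\|\|(.+?)\|\|(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)

# "||text||" outside of code/pre/a blocks becomes a <spoiler> element,
# the same substitution sanitize() performs further down.
print(spoiler_regex.sub(r'<spoiler>\1</spoiler>', 'a ||hidden|| word'))
# -> a <spoiler>hidden</spoiler> word
```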
@@ -98,7 +96,7 @@ linefeeds_regex = re.compile("([^\n])\n([^\n])", flags=re.A)
 greentext_regex = re.compile("(\n|^)>([^ >][^\n]*)", flags=re.A)
 
-ascii_only_regex = re.compile("[ -~]+", flags=re.A)
+allowed_domain_regex = re.compile("[a-z0-9\-.]+", flags=re.I|re.A)
 
 reddit_to_vreddit_regex = re.compile('(^|>|")https:\/\/old.reddit.com\/(r|u)\/', flags=re.A)
 
 twitter_to_nitter_regex = re.compile('(^|>|")https:\/\/twitter.com\/(?!i\/)', flags=re.A)
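A minimal sketch of what this swap changes, assuming only the two regexes above: the old check validated the entire href as printable ASCII, while the new one validates just the registered domain against a hostname alphabet:

```python
import re

ascii_only_regex = re.compile("[ -~]+", flags=re.A)            # old: whole href
allowed_domain_regex = re.compile("[a-z0-9\-.]+", flags=re.I|re.A)  # new: domain only

print(bool(ascii_only_regex.fullmatch("https://rdrama.net/path")))  # True
print(bool(allowed_domain_regex.fullmatch("rdrama.net")))           # True
print(bool(allowed_domain_regex.fullmatch("rԁrama.net")))           # False: Cyrillic "ԁ" homoglyph
```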
@@ -136,24 +136,6 @@ def build_url_re(tlds, protocols):
 url_re = build_url_re(tlds=TLDS, protocols=['http', 'https'])
 
-def callback(attrs, new=False):
-	if (None, "href") not in attrs:
-		return # Incorrect <a> tag
-
-	href = attrs[(None, "href")]
-
-	# \ in href right after / makes most browsers ditch site hostname and allows for a host injection bypassing the check, see <a href="/\google.com">cool</a>
-	if "\\" in href or not ascii_only_regex.fullmatch(href):
-		attrs["_text"] = href # Laugh at this user
-		del attrs[(None, "href")] # Make unclickable and reset harmful payload
-		return attrs
-
-	if not href.startswith('/') and not href.startswith(f'{SITE_FULL}/'):
-		attrs[(None, "target")] = "_blank"
-		attrs[(None, "rel")] = "nofollow noopener"
-
-	return attrs
-
 def create_comment_duplicated(text_html):
 	new_comment = Comment(author_id=AUTOJANNY_ID,
 		parent_submission=None,
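For context on what is being deleted: bleach's linkify callbacks receive each link's attributes as a dict keyed by `(namespace, name)`, plus the special `_text` key for the link text, and returning `None` drops the link. A standalone sketch of the removed logic (simplified: no `SITE_FULL` or `ascii_only_regex`), run through `bleach.linkify` rather than the site's full `Cleaner` pipeline:

```python
import bleach

def callback(attrs, new=False):
    if (None, "href") not in attrs:
        return  # no href: returning None drops the link

    href = attrs[(None, "href")]

    if "\\" in href:
        attrs["_text"] = href         # show the raw payload as the link text
        del attrs[(None, "href")]     # make it unclickable
        return attrs

    attrs[(None, "target")] = "_blank"
    attrs[(None, "rel")] = "nofollow noopener"
    return attrs

print(bleach.linkify("see https://example.com", callbacks=[callback]))
# roughly: see <a href="https://example.com" rel="nofollow noopener"
#               target="_blank">https://example.com</a>
```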
@@ -448,14 +430,7 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=True, count_emojis=
 			tag["data-src"] = tag["data-src"].replace('/giphy.webp', '/200w.webp')
 
-	for tag in soup.find_all("a"):
-		if not tag.contents or not str(tag.contents[0]).strip():
-			tag.extract()
-		if not snappy and tag.get("href") and fishylinks_regex.fullmatch(str(tag.string)):
-			tag.string = tag["href"]
-
-	sanitized = str(soup)
+	sanitized = str(soup).replace('<html><body>','').replace('</body></html>','')
 
 	sanitized = spoiler_regex.sub(r'<spoiler>\1</spoiler>', sanitized)
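The `.replace('<html><body>','')` dance exists because BeautifulSoup's lxml parser wraps fragments in a full document; a quick illustration:

```python
from bs4 import BeautifulSoup

# lxml promotes any fragment to a complete document, which is why
# sanitize() strips the wrappers every time it serializes the soup.
soup = BeautifulSoup('<p>hello</p>', 'lxml')
print(str(soup))
# -> <html><body><p>hello</p></body></html>
print(str(soup).replace('<html><body>', '').replace('</body></html>', ''))
# -> <p>hello</p>
```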
@@ -503,7 +478,6 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=True, count_emojis=
 			g.db.add(emoji)
 
 	sanitized = sanitized.replace('<p></p>', '')
-	sanitized = sanitized.replace('<html><body>','').replace('</body></html>','')
 
 	if g.v and g.v.agendaposter:
 		allowed_css_properties = allowed_styles
@@ -516,10 +490,10 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=True, count_emojis=
 		protocols=['http', 'https'],
 		css_sanitizer=css_sanitizer,
 		filters=[partial(LinkifyFilter, skip_tags=["pre"],
-			parse_email=False, callbacks=[callback], url_re=url_re)]
+			parse_email=False, url_re=url_re)]
 		).clean(sanitized)
 
-	#doing it again cuz of the linkifyfilter right above it
+	#doing this here cuz of the linkifyfilter right above it (therefore unifying all link processing logic)
 	soup = BeautifulSoup(sanitized, 'lxml')
 
 	links = soup.find_all("a")
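A minimal sketch of this `Cleaner` + `LinkifyFilter` composition, following bleach's documented pattern of passing a `partial` so the filter is instantiated over the cleaner's token stream; the tag and attribute whitelists here are placeholders, not rDrama's real ones:

```python
from functools import partial
from bleach.sanitizer import Cleaner
from bleach.linkifier import LinkifyFilter

# Linkification runs inside the same sanitizing pass, so newly created
# <a> tags are subject to the same whitelists as everything else.
cleaner = Cleaner(
    tags=["a", "p", "pre"],                        # placeholder whitelist
    attributes={"a": ["href", "target", "rel"]},   # placeholder whitelist
    protocols=["http", "https"],
    filters=[partial(LinkifyFilter, skip_tags=["pre"], parse_email=False)],
)
print(cleaner.clean("<p>go to https://example.com</p>"))
```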
@@ -527,11 +501,38 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=True, count_emojis=
 	domain_list = set()
 
 	for link in links:
-		href = link.get("href")
-		if not href: continue
-		url = urlparse(href)
-		d = tldextract.extract(href).registered_domain + url.path
-		domain_list.add(d.lower())
+		#remove empty links
+		if not link.contents or not str(link.contents[0]).strip():
+			link.extract()
+			continue
+
+		href = link.get("href")
+		if not href: continue
+
+		domain = tldextract.extract(href).registered_domain
+
+		#\ in href right after / makes most browsers ditch site hostname and allows for a host injection bypassing the check, see <a href="/\google.com">cool</a>
+		if ("\\" in href
+			#https://rdrama.net/post/78376/reminder-of-the-fact-that-our/2150032#context
+			or not allowed_domain_regex.fullmatch(domain)):
+			link.string = href
+			del link["href"]
+			continue
+
+		#insert target="_blank" and rel="nofollow noopener" for external links
+		if not href.startswith('/') and not href.startswith(f'{SITE_FULL}/'):
+			link["target"] = "_blank"
+			link["rel"] = "nofollow noopener"
+
+		#don't allow something like this [https://rԁrama.net/leaderboard](https://iplogger.org/1fRKk7)
+		if not snappy and tldextract.extract(link.string).registered_domain:
+			link.string = href
+
+		#add to set to check for banned domains later
+		combined = domain + urlparse(href).path
+		domain_list.add(combined.lower())
+
+	sanitized = str(soup).replace('<html><body>','').replace('</body></html>','')
 
 	def error(error):
 		if chat:
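A runnable sketch of the two link attacks this unified loop guards against; the sample anchors are hypothetical:

```python
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import tldextract

# 1. Host injection: browsers read "/\google.com" like "//google.com",
#    a protocol-relative URL, while a naive check sees a same-site path.
print(urlparse("/\\google.com").path)    # "/\google.com" -- looks like a local path
print(urlparse("//google.com").netloc)   # "google.com"   -- the browser-style reading

# 2. Bait text: link text that itself parses to a registered domain
#    (e.g. the Cyrillic-homoglyph "https://rԁrama.net/leaderboard" pointing
#    at an iplogger) gets overwritten with the real destination.
link = BeautifulSoup(
    '<a href="https://iplogger.org/1fRKk7">https://rdrama.net/leaderboard</a>',
    'lxml').find("a")
if tldextract.extract(link.string).registered_domain:
    link.string = link["href"]
print(link)  # the text now shows where the link actually goes
```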