forked from rDrama/rDrama
Aevann 2023-05-12 22:12:02 +03:00
parent 8d2eca46e4
commit 09fd7a1bf7
2 changed files with 34 additions and 35 deletions

View File

@@ -40,8 +40,6 @@ title_regex = re.compile("[^\w ]", flags=re.A)
controversial_regex = re.compile('["> ](https:\/\/old\.reddit\.com/r/\w{3,20}\/comments\/[\w\-.#&/=\?@%+]{5,250})["< ]', flags=re.A)
fishylinks_regex = re.compile("(https?:\/\/)?[\w\-.#&/=\?@%;+,:]{2,10}\.[\w\-.#&/=\?@%;+,:]{2,250}", flags=re.A)
spoiler_regex = re.compile('\|\|(.+?)\|\|(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)
reddit_regex = re.compile('(^|\s|<p>|\()\/?(([ruRU])\/(\w|-){3,25})(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)
sub_regex = re.compile('(^|\s|<p>|\()\/?([hH]\/(\w|-){3,25})(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)
@@ -98,7 +96,7 @@ linefeeds_regex = re.compile("([^\n])\n([^\n])", flags=re.A)
greentext_regex = re.compile("(\n|^)>([^ >][^\n]*)", flags=re.A)
ascii_only_regex = re.compile("[ -~]+", flags=re.A)
allowed_domain_regex = re.compile("[a-z0-9\-.]+", flags=re.I|re.A)
reddit_to_vreddit_regex = re.compile('(^|>|")https:\/\/old.reddit.com\/(r|u)\/', flags=re.A)
twitter_to_nitter_regex = re.compile('(^|>|")https:\/\/twitter.com\/(?!i\/)', flags=re.A)
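
For reference, a minimal sketch (not part of this commit) of how two of the patterns defined above behave on sample input; the compiled regexes are copied from the lines shown here.

import re

spoiler_regex = re.compile(r'\|\|(.+?)\|\|(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)
allowed_domain_regex = re.compile(r"[a-z0-9\-.]+", flags=re.I | re.A)

print(spoiler_regex.sub(r'<spoiler>\1</spoiler>', 'this is ||hidden|| text'))
# this is <spoiler>hidden</spoiler> text

print(bool(allowed_domain_regex.fullmatch("example.com")))   # True
print(bool(allowed_domain_regex.fullmatch("exa mple.com")))  # False: space is not a valid domain character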

View File

@@ -136,24 +136,6 @@ def build_url_re(tlds, protocols):
url_re = build_url_re(tlds=TLDS, protocols=['http', 'https'])
def callback(attrs, new=False):
if (None, "href") not in attrs:
return # Incorrect <a> tag
href = attrs[(None, "href")]
# \ in href right after / makes most browsers ditch site hostname and allows for a host injection bypassing the check, see <a href="/\google.com">cool</a>
if "\\" in href or not ascii_only_regex.fullmatch(href):
attrs["_text"] = href # Laugh at this user
del attrs[(None, "href")] # Make unclickable and reset harmful payload
return attrs
if not href.startswith('/') and not href.startswith(f'{SITE_FULL}/'):
attrs[(None, "target")] = "_blank"
attrs[(None, "rel")] = "nofollow noopener"
return attrs
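
The callback removed above follows bleach's linkify-callback protocol: it receives the link attributes as a dict keyed by (namespace, name) tuples plus the special "_text" key, and whatever it returns replaces those attributes (returning None drops the link). A hedged sketch of that protocol, using bleach.linkify directly for illustration rather than the LinkifyFilter wiring further down; demo_callback is a hypothetical name, not code from this repo.

import bleach

def demo_callback(attrs, new=False):
    # attrs is keyed by (namespace, attribute-name) tuples
    href = attrs.get((None, "href"), "")
    if not href.startswith("/"):
        attrs[(None, "target")] = "_blank"
        attrs[(None, "rel")] = "nofollow noopener"
    return attrs

print(bleach.linkify("see https://example.com", callbacks=[demo_callback]))
# roughly: see <a href="https://example.com" rel="nofollow noopener" target="_blank">https://example.com</a>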
def create_comment_duplicated(text_html):
new_comment = Comment(author_id=AUTOJANNY_ID,
parent_submission=None,
@@ -448,14 +430,7 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=True, count_emojis=
tag["data-src"] = tag["data-src"].replace('/giphy.webp', '/200w.webp')
for tag in soup.find_all("a"):
if not tag.contents or not str(tag.contents[0]).strip():
tag.extract()
if not snappy and tag.get("href") and fishylinks_regex.fullmatch(str(tag.string)):
tag.string = tag["href"]
sanitized = str(soup)
sanitized = str(soup).replace('<html><body>','').replace('</body></html>','')
sanitized = spoiler_regex.sub(r'<spoiler>\1</spoiler>', sanitized)
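
The loop being removed in this hunk rewrote suspicious link text before cleaning: if the visible text of an <a> looked like a URL, it was replaced with the real href so the text could not lie about the destination. A self-contained sketch of that check, on sample markup that is not from the repo:

import re
from bs4 import BeautifulSoup

fishylinks_regex = re.compile(r"(https?:\/\/)?[\w\-.#&/=\?@%;+,:]{2,10}\.[\w\-.#&/=\?@%;+,:]{2,250}", flags=re.A)

soup = BeautifulSoup('<a href="https://evil.example">https://rdrama.net</a>', 'html.parser')
tag = soup.find("a")
# visible text looks like a URL, so show the real destination instead
if tag.get("href") and fishylinks_regex.fullmatch(str(tag.string)):
    tag.string = tag["href"]
print(soup)  # <a href="https://evil.example">https://evil.example</a>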
@@ -503,7 +478,6 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=True, count_emojis=
g.db.add(emoji)
sanitized = sanitized.replace('<p></p>', '')
sanitized = sanitized.replace('<html><body>','').replace('</body></html>','')
if g.v and g.v.agendaposter:
allowed_css_properties = allowed_styles
@@ -516,10 +490,10 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=True, count_emojis=
protocols=['http', 'https'],
css_sanitizer=css_sanitizer,
filters=[partial(LinkifyFilter, skip_tags=["pre"],
parse_email=False, callbacks=[callback], url_re=url_re)]
parse_email=False, url_re=url_re)]
).clean(sanitized)
#doing it again cuz of the linkifyfilter right above it
#doing this here cuz of the linkifyfilter right above it (therefore unifying all link processing logic)
soup = BeautifulSoup(sanitized, 'lxml')
links = soup.find_all("a")
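
A minimal, hedged sketch of the Cleaner/LinkifyFilter construction shown above, now without the callbacks argument; the tag and attribute lists here are placeholders, not the project's real allow-lists.

from functools import partial

import bleach
from bleach.linkifier import LinkifyFilter

cleaner = bleach.Cleaner(
    tags=["a", "p", "pre"],
    attributes={"a": ["href", "rel", "target"]},
    protocols=["http", "https"],
    filters=[partial(LinkifyFilter, skip_tags=["pre"], parse_email=False)],
)
print(cleaner.clean("plain link https://example.com"))
# roughly: plain link <a href="https://example.com" rel="nofollow">https://example.com</a>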
@@ -527,11 +501,38 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=True, count_emojis=
domain_list = set()
for link in links:
#remove empty links
if not link.contents or not str(link.contents[0]).strip():
link.extract()
continue
href = link.get("href")
if not href: continue
url = urlparse(href)
d = tldextract.extract(href).registered_domain + url.path
domain_list.add(d.lower())
domain = tldextract.extract(href).registered_domain
#\ in href right after / makes most browsers ditch site hostname and allows for a host injection bypassing the check, see <a href="/\google.com">cool</a>
if ("\\" in href
#https://rdrama.net/post/78376/reminder-of-the-fact-that-our/2150032#context
or not allowed_domain_regex.fullmatch(domain)):
link.string = href
del link["href"]
continue
#insert target="_blank" and rel="nofollow noopener" for external links
if not href.startswith('/') and not href.startswith(f'{SITE_FULL}/'):
link["target"] = "_blank"
link["rel"] = "nofollow noopener"
#don't allow something like this [https://rԁrama.net/leaderboard](https://iplogger.org/1fRKk7)
if not snappy and tldextract.extract(link.string).registered_domain:
link.string = href
#add to set to check for banned domains later
combined = domain + urlparse(href).path
domain_list.add(combined.lower())
sanitized = str(soup).replace('<html><body>','').replace('</body></html>','')
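
Pulling the new pass together, a hedged, self-contained sketch of the unified link processing added in this hunk; process_links, SITE_FULL, and the snappy flag are stand-ins for the real module-level values, not code from the repo.

import re
from urllib.parse import urlparse

import tldextract
from bs4 import BeautifulSoup

SITE_FULL = "https://rdrama.net"
allowed_domain_regex = re.compile(r"[a-z0-9\-.]+", flags=re.I | re.A)

def process_links(sanitized, snappy=False):
    soup = BeautifulSoup(sanitized, 'lxml')
    domain_list = set()
    for link in soup.find_all("a"):
        # drop links with no visible text
        if not link.contents or not str(link.contents[0]).strip():
            link.extract()
            continue
        href = link.get("href")
        if not href:
            continue
        domain = tldextract.extract(href).registered_domain
        # neutralise "/\host" injection tricks and anything without a clean domain
        if "\\" in href or not allowed_domain_regex.fullmatch(domain):
            link.string = href
            del link["href"]
            continue
        # external links open in a new tab and don't pass the referrer
        if not href.startswith('/') and not href.startswith(f'{SITE_FULL}/'):
            link["target"] = "_blank"
            link["rel"] = "nofollow noopener"
        # if the visible text is itself a domain, show the real destination
        if not snappy and link.string and tldextract.extract(link.string).registered_domain:
            link.string = href
        # collect domain + path for the banned-domain check done later
        domain_list.add((domain + urlparse(href).path).lower())
    return str(soup).replace('<html><body>', '').replace('</body></html>', ''), domain_list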
def error(error):
if chat: