From 3330c2517fd3eb7824408d0b3cfe3fdc1e948e48 Mon Sep 17 00:00:00 2001
From: Aevann
Date: Sat, 5 Aug 2023 21:51:05 +0300
Subject: [PATCH] total tracking parameter death

---
 files/helpers/regex.py    |  3 ---
 files/helpers/sanitize.py | 40 ++++++++++++++++++++++++------
 files/routes/posts.py     | 51 +--------------------------------------
 3 files changed, 34 insertions(+), 60 deletions(-)

diff --git a/files/helpers/regex.py b/files/helpers/regex.py
index 6dcb7c5e1..7b4eab64c 100644
--- a/files/helpers/regex.py
+++ b/files/helpers/regex.py
@@ -58,9 +58,6 @@ snappy_youtube_regex = re.compile('<lite-youtube videoid="(.+?)"')
 
 
-utm_regex = re.compile('utm_[a-z]+=[a-z0-9_]+&?', flags=re.A)
-utm_regex2 = re.compile('(&|\?)[a-z]+=utm_[a-z0-9_]+', flags=re.A)
-
 slur_regex = re.compile(f"<[^>]*>|{slur_single_words}", flags=re.I|re.A)
 slur_regex_upper = re.compile(f"<[^>]*>|{slur_single_words.upper()}", flags=re.A)
 profanity_regex = re.compile(f"<[^>]*>|{profanity_single_words}", flags=re.I|re.A)
diff --git a/files/helpers/sanitize.py b/files/helpers/sanitize.py
index 38a87c517..0bbf8f372 100644
--- a/files/helpers/sanitize.py
+++ b/files/helpers/sanitize.py
@@ -378,11 +378,6 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=False, count_emojis
 	if blackjack and execute_blackjack(g.v, None, sanitized, blackjack):
 		sanitized = 'g'
 
-	sanitized = utm_regex.sub('', sanitized)
-	sanitized = utm_regex2.sub('', sanitized)
-
-	sanitized = normalize_url(sanitized)
-
 	if '```' not in sanitized and '<pre>' not in sanitized:
 		sanitized = linefeeds_regex.sub(r'\1\n\n\2', sanitized)
 
@@ -559,6 +554,9 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=False, count_emojis
 		href = link.get("href")
 		if not href: continue
 
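+		# run each link's href through the shared URL normalizer defined below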
+		href = normalize_url(href)
+
 		def unlinkfy():
 			link.string = href
 			del link["href"]
@@ -665,7 +663,24 @@ def filter_emojis_only(title, golden=True, count_emojis=False, graceful=False):
 	else:
 		return title.strip()
 
+def is_whitelisted(domain, k):
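+	# decide which query params survive normalization; everything else is stripped as tracking noise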
+	if k.lower().endswith('id'):
+		return True
+	if 'sort' in k.lower() or 'query' in k.lower():
+		return True
+	if k in {'v','context','q','page','time_continue','title','scrollToComments','u','url'}:
+		return True
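+	# 't' is a timestamp on YouTube links; on most other sites it is a share-tracking token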
+	if k == 't' and domain == 'youtube.com':
+		return True
+	return False
+
+
 def normalize_url(url):
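+	# undo percent-encoding up front so the literal replacements below can match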
+	url = unquote(url)
+
 	url = reddit_domain_regex.sub(r'\1https://old.reddit.com/\3/', url)
 
 	url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=") \
@@ -685,10 +700,29 @@ def normalize_url(url):
 			 .replace("https://nitter.42l.fr/", "https://twitter.com/") \
 			 .replace("https://nitter.lacontrevoie.fr/", "https://twitter.com/") \
 			 .replace("/giphy.gif", "/giphy.webp") \
-			 .replace('https://old.reddit.com/r/place/?', 'https://new.reddit.com/r/place/?') \
+
+	url = giphy_regex.sub(r'\1.webp', url)
+
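+	# leave site-relative links untouched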
+	if not url.startswith('/'):
+		parsed_url = urlparse(url)
+		domain = parsed_url.netloc
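+		# keep only the query params the whitelist approves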
+		qd = parse_qs(parsed_url.query, keep_blank_values=True)
+		filtered = {k: val for k, val in qd.items() if is_whitelisted(domain, k)}
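+		# rebuild the URL, forcing https and re-encoding the surviving params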
+		new_url = ParseResult(scheme="https",
+							netloc=parsed_url.netloc,
+							path=parsed_url.path,
+							params=parsed_url.params,
+							query=urlencode(filtered, doseq=True),
+							fragment=parsed_url.fragment)
+		url = urlunparse(new_url)
+
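+	# trim trailing slashes so equivalent URLs compare equal (e.g. for repost checks)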
+	url = url.rstrip('/')
 
 	url = imgur_regex.sub(r'\1_d.webp?maxwidth=9999&fidelity=grand', url)
-	url = giphy_regex.sub(r'\1.webp', url)
 
 	return url
 
diff --git a/files/routes/posts.py b/files/routes/posts.py
index 733cafdcd..1436b8884 100644
--- a/files/routes/posts.py
+++ b/files/routes/posts.py
@@ -5,7 +5,7 @@ from io import BytesIO
 from os import path
 from shutil import copyfile
 from sys import stdout
-from urllib.parse import ParseResult, urlparse, urlunparse, unquote
+from urllib.parse import urlparse
 
 import gevent
 import requests
@@ -397,30 +397,6 @@ def is_repost(v):
 		abort(400)
 
 	url = normalize_url(url)
-	url = unquote(url)
-	parsed_url = urlparse(url)
-
-	domain = parsed_url.netloc
-	if domain in {'old.reddit.com','twitter.com','instagram.com','tiktok.com'} and '/search' not in url:
-		new_url = ParseResult(scheme="https",
-				netloc=parsed_url.netloc,
-				path=parsed_url.path,
-				params=parsed_url.params,
-				query=None,
-				fragment=parsed_url.fragment)
-	else:
-		qd = parse_qs(parsed_url.query, keep_blank_values=True)
-		filtered = {k: val for k, val in qd.items() if not k.startswith('utm_') and not k.startswith('ref_')}
-
-		new_url = ParseResult(scheme="https",
-							netloc=parsed_url.netloc,
-							path=parsed_url.path,
-							params=parsed_url.params,
-							query=urlencode(filtered, doseq=True),
-							fragment=parsed_url.fragment)
-
-	url = urlunparse(new_url)
-	url = url.rstrip('/')
 
 	search_url = url.replace('%', '').replace('\\', '').replace('_', '\_').strip()
 	repost = g.db.query(Post).filter(
@@ -491,31 +467,7 @@ def submit_post(v, sub=None):
 
 	if url:
 		url = normalize_url(url)
+		parsed_url = urlparse(url)  # still needed below for the domain-ban path check
-		url = unquote(url)
-		parsed_url = urlparse(url)
-
-		domain = parsed_url.netloc
-		if domain in {'old.reddit.com','twitter.com','instagram.com','tiktok.com'} and '/search' not in url:
-			new_url = ParseResult(scheme="https",
-					netloc=parsed_url.netloc,
-					path=parsed_url.path,
-					params=parsed_url.params,
-					query=None,
-					fragment=parsed_url.fragment)
-		else:
-			qd = parse_qs(parsed_url.query, keep_blank_values=True)
-			filtered = {k: val for k, val in qd.items() if not k.startswith('utm_') and not k.startswith('ref_')}
-
-			new_url = ParseResult(scheme="https",
-								netloc=parsed_url.netloc,
-								path=parsed_url.path,
-								params=parsed_url.params,
-								query=urlencode(filtered, doseq=True),
-								fragment=parsed_url.fragment)
-
-		url = urlunparse(new_url)
-
-		url = url.rstrip('/')
 
 		if v.admin_level < PERMS["IGNORE_DOMAIN_BAN"]:
 			y = tldextract.extract(url).registered_domain + parsed_url.path