total tracking parameter death

master
Aevann 2023-08-05 21:51:05 +03:00
parent d4653f824a
commit 3330c2517f
3 changed files with 34 additions and 60 deletions

View File

@@ -58,9 +58,6 @@ snappy_youtube_regex = re.compile('<lite-youtube videoid="(.+?)" params="autopla
 email_regex = re.compile(EMAIL_REGEX_PATTERN, flags=re.A)
-utm_regex = re.compile('utm_[0-z]+=[0-z_.]+&', flags=re.A)
-utm_regex2 = re.compile('[?&]utm_[0-z]+=[0-z_.]+', flags=re.A)
 slur_regex = re.compile(f"<[^>]*>|{slur_single_words}", flags=re.I|re.A)
 slur_regex_upper = re.compile(f"<[^>]*>|{slur_single_words.upper()}", flags=re.A)
 profanity_regex = re.compile(f"<[^>]*>|{profanity_single_words}", flags=re.I|re.A)
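
For reference, a minimal standalone sketch of what the two deleted patterns did, on a hypothetical URL (not part of the commit). `[0-z]` is an ASCII range spanning the digits, some punctuation, and upper- and lowercase letters, so the two passes caught `utm_*=*` pairs both in the middle and at the end of a query string:

import re

# the two patterns removed above, recompiled in isolation for illustration
utm_regex = re.compile('utm_[0-z]+=[0-z_.]+&', flags=re.A)
utm_regex2 = re.compile('[?&]utm_[0-z]+=[0-z_.]+', flags=re.A)

url = 'https://example.com/post?utm_source=share&utm_medium=web2x&context=3'
stripped = utm_regex.sub('', url)        # drops 'utm_*=*&' pairs followed by another parameter
stripped = utm_regex2.sub('', stripped)  # drops a trailing '?utm_*=*' or '&utm_*=*'
print(stripped)  # https://example.com/post?context=3

The rest of the commit replaces this blacklist with whitelist-based query filtering inside normalize_url.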

View File

@@ -378,11 +378,6 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=False, count_emojis
     if blackjack and execute_blackjack(g.v, None, sanitized, blackjack):
         sanitized = 'g'
-    sanitized = utm_regex.sub('', sanitized)
-    sanitized = utm_regex2.sub('', sanitized)
-    sanitized = normalize_url(sanitized)
     if '```' not in sanitized and '<pre>' not in sanitized:
         sanitized = linefeeds_regex.sub(r'\1\n\n\2', sanitized)
@@ -559,6 +554,8 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=False, count_emojis
         href = link.get("href")
         if not href: continue
+        href = normalize_url(href)
         def unlinkfy():
             link.string = href
             del link["href"]
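
The surrounding loop is not shown in the hunk; a rough sketch of the shape it plugs into, assuming BeautifulSoup and assuming the normalized href is written back (the write-back is an assumption, not visible in the diff):

from bs4 import BeautifulSoup

html = '<a href="https://example.com/page?utm_source=share&v=abc">text</a>'
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all("a"):
    href = link.get("href")
    if not href:
        continue
    href = normalize_url(href)  # the call this hunk adds
    link["href"] = href         # assumed write-back, for the sketch only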
@@ -665,7 +662,21 @@ def filter_emojis_only(title, golden=True, count_emojis=False, graceful=False):
     else:
         return title.strip()
 
+def is_whitelisted(domain, k):
+    if k.lower().endswith('id'):
+        return True
+    if 'sort' in k.lower() or 'query' in k.lower():
+        return True
+    if k in {'v','context','q','page','time_continue','title','scrollToComments','u','url'}:
+        return True
+    if k == 't' and domain != 'youtube.com':
+        return True
+    return False
+
 def normalize_url(url):
+    url = unquote(url)
     url = reddit_domain_regex.sub(r'\1https://old.reddit.com/\3/', url)
     url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=") \
@@ -685,10 +696,25 @@
.replace("https://nitter.42l.fr/", "https://twitter.com/") \
.replace("https://nitter.lacontrevoie.fr/", "https://twitter.com/") \
.replace("/giphy.gif", "/giphy.webp") \
.replace('https://old.reddit.com/r/place/?', 'https://new.reddit.com/r/place/?') \
url = giphy_regex.sub(r'\1.webp', url)
if not url.startswith('/'):
parsed_url = urlparse(url)
domain = parsed_url.netloc
qd = parse_qs(parsed_url.query, keep_blank_values=True)
filtered = {k: val for k, val in qd.items() if is_whitelisted(domain, k)}
new_url = ParseResult(scheme="https",
netloc=parsed_url.netloc,
path=parsed_url.path,
params=parsed_url.params,
query=urlencode(filtered, doseq=True),
fragment=parsed_url.fragment)
url = urlunparse(new_url)
url = url.rstrip('/')
url = imgur_regex.sub(r'\1_d.webp?maxwidth=9999&fidelity=grand', url)
url = giphy_regex.sub(r'\1.webp', url)
return url
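
A quick sketch of the net effect of the new query filtering, on a hypothetical URL and assuming the is_whitelisted above is in scope. Note the keys-ending-in-'id' rule means click trackers such as fbclid would also pass the whitelist:

from urllib.parse import parse_qs, urlencode, urlparse

url = 'https://youtube.com/watch?v=dQw4w9WgXcQ&utm_source=share&feature=share'
parsed = urlparse(url)
qd = parse_qs(parsed.query, keep_blank_values=True)
# keep only keys the whitelist approves for this domain
filtered = {k: val for k, val in qd.items() if is_whitelisted(parsed.netloc, k)}
print(urlencode(filtered, doseq=True))  # v=dQw4w9WgXcQ ('utm_source' and 'feature' are dropped)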

View File

@@ -5,7 +5,7 @@ from io import BytesIO
 from os import path
 from shutil import copyfile
 from sys import stdout
-from urllib.parse import ParseResult, urlparse, urlunparse, unquote
+from urllib.parse import urlparse
 import gevent
 import requests
@@ -397,30 +397,6 @@ def is_repost(v):
         abort(400)
 
     url = normalize_url(url)
-    url = unquote(url)
-    parsed_url = urlparse(url)
-    domain = parsed_url.netloc
-    if domain in {'old.reddit.com','twitter.com','instagram.com','tiktok.com'} and '/search' not in url:
-        new_url = ParseResult(scheme="https",
-                              netloc=parsed_url.netloc,
-                              path=parsed_url.path,
-                              params=parsed_url.params,
-                              query=None,
-                              fragment=parsed_url.fragment)
-    else:
-        qd = parse_qs(parsed_url.query, keep_blank_values=True)
-        filtered = {k: val for k, val in qd.items() if not k.startswith('utm_') and not k.startswith('ref_')}
-        new_url = ParseResult(scheme="https",
-                              netloc=parsed_url.netloc,
-                              path=parsed_url.path,
-                              params=parsed_url.params,
-                              query=urlencode(filtered, doseq=True),
-                              fragment=parsed_url.fragment)
-    url = urlunparse(new_url)
-    url = url.rstrip('/')
 
     search_url = url.replace('%', '').replace('\\', '').replace('_', '\_').strip()
     repost = g.db.query(Post).filter(
@@ -491,31 +467,6 @@ def submit_post(v, sub=None):
     if url:
         url = normalize_url(url)
-        url = unquote(url)
-        parsed_url = urlparse(url)
-        domain = parsed_url.netloc
-        if domain in {'old.reddit.com','twitter.com','instagram.com','tiktok.com'} and '/search' not in url:
-            new_url = ParseResult(scheme="https",
-                                  netloc=parsed_url.netloc,
-                                  path=parsed_url.path,
-                                  params=parsed_url.params,
-                                  query=None,
-                                  fragment=parsed_url.fragment)
-        else:
-            qd = parse_qs(parsed_url.query, keep_blank_values=True)
-            filtered = {k: val for k, val in qd.items() if not k.startswith('utm_') and not k.startswith('ref_')}
-            new_url = ParseResult(scheme="https",
-                                  netloc=parsed_url.netloc,
-                                  path=parsed_url.path,
-                                  params=parsed_url.params,
-                                  query=urlencode(filtered, doseq=True),
-                                  fragment=parsed_url.fragment)
-        url = urlunparse(new_url)
-        url = url.rstrip('/')
 
     if v.admin_level < PERMS["IGNORE_DOMAIN_BAN"]:
         y = tldextract.extract(url).registered_domain + parsed_url.path
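
For contrast, a standalone sketch of the blacklist filter just deleted from both routes, on a hypothetical URL exercising the removed else-branch: only utm_ and ref_ prefixes were caught, so share trackers like ?s=20 slipped through, which the whitelist in normalize_url now drops.

from urllib.parse import ParseResult, parse_qs, urlencode, urlparse, urlunparse

url = 'https://example.com/thread?utm_source=share&s=20'
parsed_url = urlparse(url)
qd = parse_qs(parsed_url.query, keep_blank_values=True)
# the removed blacklist: drop only utm_* and ref_* keys
filtered = {k: val for k, val in qd.items()
            if not k.startswith('utm_') and not k.startswith('ref_')}
url = urlunparse(ParseResult(scheme="https", netloc=parsed_url.netloc,
                             path=parsed_url.path, params=parsed_url.params,
                             query=urlencode(filtered, doseq=True),
                             fragment=parsed_url.fragment))
print(url)  # https://example.com/thread?s=20 ('s' passes the old blacklist)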