total tracking parameter death

pull/181/head
Aevann 2023-08-05 21:51:05 +03:00
parent d4653f824a
commit 3330c2517f
3 changed files with 34 additions and 60 deletions


@@ -58,9 +58,6 @@ snappy_youtube_regex = re.compile('<lite-youtube videoid="(.+?)" params="autopla
 email_regex = re.compile(EMAIL_REGEX_PATTERN, flags=re.A)
-utm_regex = re.compile('utm_[0-z]+=[0-z_.]+&', flags=re.A)
-utm_regex2 = re.compile('[?&]utm_[0-z]+=[0-z_.]+', flags=re.A)
 slur_regex = re.compile(f"<[^>]*>|{slur_single_words}", flags=re.I|re.A)
 slur_regex_upper = re.compile(f"<[^>]*>|{slur_single_words.upper()}", flags=re.A)
 profanity_regex = re.compile(f"<[^>]*>|{profanity_single_words}", flags=re.I|re.A)
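The two regexes deleted above were the old blacklist: strip anything that looks like a utm_* query parameter. A minimal sketch of what they matched, with the regexes verbatim from the deleted lines and a made-up sample URL. One quirk worth knowing: the [0-z] class spans ASCII 0x30-0x7A, so it also matches :;<=>?@, uppercase letters, and [\]^_` rather than just digits and lowercase.

import re

# Regexes copied verbatim from the deleted lines; the URL is hypothetical.
utm_regex = re.compile('utm_[0-z]+=[0-z_.]+&', flags=re.A)
utm_regex2 = re.compile('[?&]utm_[0-z]+=[0-z_.]+', flags=re.A)

url = 'https://example.com/post?utm_source=share&utm_medium=web&id=42'
url = utm_regex.sub('', url)   # drops 'utm_source=share&' and 'utm_medium=web&'
url = utm_regex2.sub('', url)  # catches a final '?utm_...' or '&utm_...' with no '&' after it
print(url)  # https://example.com/post?id=42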


@@ -378,11 +378,6 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=False, count_emojis
 	if blackjack and execute_blackjack(g.v, None, sanitized, blackjack):
 		sanitized = 'g'
-	sanitized = utm_regex.sub('', sanitized)
-	sanitized = utm_regex2.sub('', sanitized)
-	sanitized = normalize_url(sanitized)
 	if '```' not in sanitized and '<pre>' not in sanitized:
 		sanitized = linefeeds_regex.sub(r'\1\n\n\2', sanitized)

@@ -559,6 +554,8 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=False, count_emojis
 		href = link.get("href")
 		if not href: continue
+		href = normalize_url(href)
 		def unlinkfy():
 			link.string = href
 			del link["href"]
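With the whole-text passes gone, sanitize() now normalizes URLs per link: the second hunk runs each anchor's href through normalize_url() as the parsed tree is walked. A hedged sketch of that pattern, assuming BeautifulSoup (which the link.get/link.string calls imply); the stub and sample HTML are illustrative, not the real code.

from bs4 import BeautifulSoup

def normalize_url(href):
	# Stand-in stub; the real function is defined later in this diff.
	return href.rstrip('/')

soup = BeautifulSoup('<a href="https://example.com/x/">x</a>', 'html.parser')
for link in soup.find_all("a"):
	href = link.get("href")
	if not href: continue
	link["href"] = normalize_url(href)  # illustrative; the diff assigns to a local first
print(soup)  # <a href="https://example.com/x">x</a>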
@@ -665,7 +662,21 @@ def filter_emojis_only(title, golden=True, count_emojis=False, graceful=False):
 	else:
 		return title.strip()
+
+def is_whitelisted(domain, k):
+	if k.lower().endswith('id'):
+		return True
+	if 'sort' in k.lower() or 'query' in k.lower():
+		return True
+	if k in {'v','context','q','page','time_continue','title','scrollToComments','u','url'}:
+		return True
+	if k == 't' and domain != 'youtube.com':
+		return True
+	return False
+
 def normalize_url(url):
+	url = unquote(url)
 	url = reddit_domain_regex.sub(r'\1https://old.reddit.com/\3/', url)
 	url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=") \
@@ -685,10 +696,25 @@ def normalize_url(url):
 		.replace("https://nitter.42l.fr/", "https://twitter.com/") \
 		.replace("https://nitter.lacontrevoie.fr/", "https://twitter.com/") \
 		.replace("/giphy.gif", "/giphy.webp") \
+		.replace('https://old.reddit.com/r/place/?', 'https://new.reddit.com/r/place/?')
+	url = giphy_regex.sub(r'\1.webp', url)
+
+	if not url.startswith('/'):
+		parsed_url = urlparse(url)
+		domain = parsed_url.netloc
+		qd = parse_qs(parsed_url.query, keep_blank_values=True)
+		filtered = {k: val for k, val in qd.items() if is_whitelisted(domain, k)}
+		new_url = ParseResult(scheme="https",
+						netloc=parsed_url.netloc,
+						path=parsed_url.path,
+						params=parsed_url.params,
+						query=urlencode(filtered, doseq=True),
+						fragment=parsed_url.fragment)
+		url = urlunparse(new_url)
+		url = url.rstrip('/')
 	url = imgur_regex.sub(r'\1_d.webp?maxwidth=9999&fidelity=grand', url)
-	url = giphy_regex.sub(r'\1.webp', url)
 	return url
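The new tail of normalize_url() is the heart of the commit: rather than blacklisting known trackers, the query string is round-tripped through parse_qs/urlencode and every parameter is dropped unless is_whitelisted() approves it. A self-contained sketch, with is_whitelisted() and the filtering logic copied from the diff; the sample URL is hypothetical.

from urllib.parse import (ParseResult, parse_qs, unquote, urlencode,
                          urlparse, urlunparse)

def is_whitelisted(domain, k):
	# Copied from the function added in this commit.
	if k.lower().endswith('id'):
		return True
	if 'sort' in k.lower() or 'query' in k.lower():
		return True
	if k in {'v','context','q','page','time_continue','title','scrollToComments','u','url'}:
		return True
	if k == 't' and domain != 'youtube.com':
		return True
	return False

url = unquote('https://youtube.com/watch?v=dQw4w9WgXcQ&utm_source=share&feature=related')
parsed_url = urlparse(url)
qd = parse_qs(parsed_url.query, keep_blank_values=True)
filtered = {k: val for k, val in qd.items() if is_whitelisted(parsed_url.netloc, k)}
clean = urlunparse(ParseResult(scheme="https", netloc=parsed_url.netloc,
                               path=parsed_url.path, params=parsed_url.params,
                               query=urlencode(filtered, doseq=True),
                               fragment=parsed_url.fragment)).rstrip('/')
print(clean)  # https://youtube.com/watch?v=dQw4w9WgXcQ

A side effect of the round-trip is that even whitelisted parameters come back re-encoded and in dict order, so the cleaned URL is canonical but not always byte-identical to the input.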


@@ -5,7 +5,7 @@ from io import BytesIO
 from os import path
 from shutil import copyfile
 from sys import stdout
-from urllib.parse import ParseResult, urlparse, urlunparse, unquote
+from urllib.parse import urlparse
 import gevent
 import requests
@@ -397,30 +397,6 @@ def is_repost(v):
 		abort(400)
 	url = normalize_url(url)
-	url = unquote(url)
-	parsed_url = urlparse(url)
-	domain = parsed_url.netloc
-	if domain in {'old.reddit.com','twitter.com','instagram.com','tiktok.com'} and '/search' not in url:
-		new_url = ParseResult(scheme="https",
-						netloc=parsed_url.netloc,
-						path=parsed_url.path,
-						params=parsed_url.params,
-						query=None,
-						fragment=parsed_url.fragment)
-	else:
-		qd = parse_qs(parsed_url.query, keep_blank_values=True)
-		filtered = {k: val for k, val in qd.items() if not k.startswith('utm_') and not k.startswith('ref_')}
-		new_url = ParseResult(scheme="https",
-						netloc=parsed_url.netloc,
-						path=parsed_url.path,
-						params=parsed_url.params,
-						query=urlencode(filtered, doseq=True),
-						fragment=parsed_url.fragment)
-	url = urlunparse(new_url)
-	url = url.rstrip('/')
 	search_url = url.replace('%', '').replace('\\', '').replace('_', '\_').strip()
 	repost = g.db.query(Post).filter(
@@ -491,31 +467,6 @@ def submit_post(v, sub=None):
 	if url:
 		url = normalize_url(url)
-		url = unquote(url)
-		parsed_url = urlparse(url)
-		domain = parsed_url.netloc
-		if domain in {'old.reddit.com','twitter.com','instagram.com','tiktok.com'} and '/search' not in url:
-			new_url = ParseResult(scheme="https",
-							netloc=parsed_url.netloc,
-							path=parsed_url.path,
-							params=parsed_url.params,
-							query=None,
-							fragment=parsed_url.fragment)
-		else:
-			qd = parse_qs(parsed_url.query, keep_blank_values=True)
-			filtered = {k: val for k, val in qd.items() if not k.startswith('utm_') and not k.startswith('ref_')}
-			new_url = ParseResult(scheme="https",
-							netloc=parsed_url.netloc,
-							path=parsed_url.path,
-							params=parsed_url.params,
-							query=urlencode(filtered, doseq=True),
-							fragment=parsed_url.fragment)
-		url = urlunparse(new_url)
-		url = url.rstrip('/')
 		if v.admin_level < PERMS["IGNORE_DOMAIN_BAN"]:
 			y = tldextract.extract(url).registered_domain + parsed_url.path
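For contrast with the new whitelist, this is the filter both deleted route blocks applied: a prefix blacklist that only knew about utm_ and ref_. The comprehension is verbatim from the removed code; the query string is a made-up demo.

from urllib.parse import parse_qs

qd = parse_qs('utm_source=share&ref_src=twsrc&page=2', keep_blank_values=True)
filtered = {k: val for k, val in qd.items() if not k.startswith('utm_') and not k.startswith('ref_')}
print(filtered)  # {'page': ['2']}

Centralizing this in normalize_url() also removes the copy-paste drift between is_repost() and submit_post(), which carried two identical twenty-line blocks. One thing to watch in the diff itself: the surviving context line above still references parsed_url, which appears to no longer be assigned in submit_post() after this change.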