Sanitize: modularize normalize_url, fix streamable.

Originally prompted by https://rdrama.net/post/18459/-/1984609 which
noticed that streamable.com/e/ links as posts would have another e/
added to them. This was in spite of logic in posts.py api_is_repost
and submit_post designed to specifically counteract this.
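The proximate cause was a copy-pasted url.replace(...) chain which
unconditionally rewrote "https://streamable.com/" to
"https://streamable.com/e/" before the streamable-specific check ran,
so links that were already in embed form gained a second e/.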

Solution: remove the streamable replacement from the chained statement
and create `helpers.normalize_url(url)` to get rid of the copypasta.
master
Snakes 2022-05-25 04:43:16 -04:00
parent 3a2c16a696
commit 1c7458e111
2 changed files with 31 additions and 17 deletions

View File

@ -251,7 +251,7 @@ def sanitize(sanitized, alert=False, comment=False, edit=False):
for rd in ["://reddit.com", "://new.reddit.com", "://www.reddit.com", "://redd.it", "://libredd.it", "://teddit.net"]:
sanitized = sanitized.replace(rd, "://old.reddit.com")
sanitized = sanitized.replace("nitter.net", "twitter.com").replace("old.reddit.com/gallery", "reddit.com/gallery").replace("https://youtu.be/", "https://youtube.com/watch?v=").replace("https://music.youtube.com/watch?v=", "https://youtube.com/watch?v=").replace("https://streamable.com/", "https://streamable.com/e/").replace("https://youtube.com/shorts/", "https://youtube.com/watch?v=").replace("https://mobile.twitter", "https://twitter").replace("https://m.facebook", "https://facebook").replace("m.wikipedia.org", "wikipedia.org").replace("https://m.youtube", "https://youtube").replace("https://www.youtube", "https://youtube").replace("https://www.twitter", "https://twitter").replace("https://www.instagram", "https://instagram").replace("https://www.tiktok", "https://tiktok")
sanitized = normalize_url(sanitized)
sanitized = sanitized.replace('&','&')
@ -380,3 +380,31 @@ def filter_emojis_only(title, edit=False, graceful=False):
if len(title) > 1500 and not graceful: abort(400)
else: return title
# Ordered (old, new) substring rewrites. Order matters: each pair is applied
# to the output of the previous one.
# NOTE(review): because the "www."/"m." YouTube hosts are stripped *after* the
# shorts rewrite, "https://www.youtube.com/shorts/<id>" ends up as
# "https://youtube.com/shorts/<id>" rather than a watch URL. Left as-is so
# the produced URLs stay the same as before; confirm whether that is intended.
_URL_REWRITES = (
    ("nitter.net", "twitter.com"),
    ("old.reddit.com/gallery", "reddit.com/gallery"),
    ("https://youtu.be/", "https://youtube.com/watch?v="),
    ("https://music.youtube.com/watch?v=", "https://youtube.com/watch?v="),
    ("https://youtube.com/shorts/", "https://youtube.com/watch?v="),
    ("https://mobile.twitter", "https://twitter"),
    ("https://m.facebook", "https://facebook"),
    ("m.wikipedia.org", "wikipedia.org"),
    ("https://m.youtube", "https://youtube"),
    ("https://www.youtube", "https://youtube"),
    ("https://www.twitter", "https://twitter"),
    ("https://www.instagram", "https://instagram"),
    ("https://www.tiktok", "https://tiktok"),
)

# (host markers, extensions) pairs: when any marker occurs in the URL, every
# listed extension is rewritten to ".webp". Only the first matching entry is
# applied.
_WEBP_REWRITES = (
    (("/i.imgur.com/",), (".png", ".jpg", ".jpeg")),
    (("/media.giphy.com/", "/c.tenor.com/"), (".gif",)),
    (("/i.ibb.co/",), (".png", ".jpg", ".jpeg", ".gif")),
)

_STREAMABLE_PLAIN = "https://streamable.com/"
_STREAMABLE_EMBED = "https://streamable.com/e/"


def normalize_url(url):
    """Rewrite known alternate link forms in *url* to one canonical form.

    Three steps are applied in order:

    1. The ordered substring rewrites in ``_URL_REWRITES`` (mirror and
       mobile hosts, short links, and so on).
    2. For the image hosts in ``_WEBP_REWRITES``, the listed file
       extensions are replaced with ``.webp``.
    3. If the string starts with a non-embed streamable link, streamable
       links are rewritten to their ``/e/`` embed form.

    All rewrites are plain substring replacements, so the function also
    works on a larger piece of text that contains links.

    Args:
        url: The URL (or text containing URLs) to normalize.

    Returns:
        The normalized string.
    """
    for old, new in _URL_REWRITES:
        url = url.replace(old, new)

    for host_markers, extensions in _WEBP_REWRITES:
        if any(marker in url for marker in host_markers):
            for extension in extensions:
                url = url.replace(extension, ".webp")
            break

    if url.startswith(_STREAMABLE_PLAIN) and not url.startswith(_STREAMABLE_EMBED):
        # Collapse any links that are already in embed form back to the plain
        # form first, then add the embed prefix everywhere. A bare
        # replace(plain, embed) would turn an existing ".../e/<id>" later in
        # the string into ".../e/e/<id>".
        url = url.replace(_STREAMABLE_EMBED, _STREAMABLE_PLAIN)
        url = url.replace(_STREAMABLE_PLAIN, _STREAMABLE_EMBED)

    return url

View File

@ -812,14 +812,7 @@ def api_is_repost():
for rd in ("://reddit.com", "://new.reddit.com", "://www.reddit.com", "://redd.it", "://libredd.it", "://teddit.net"):
url = url.replace(rd, "://old.reddit.com")
url = url.replace("nitter.net", "twitter.com").replace("old.reddit.com/gallery", "reddit.com/gallery").replace("https://youtu.be/", "https://youtube.com/watch?v=").replace("https://music.youtube.com/watch?v=", "https://youtube.com/watch?v=").replace("https://streamable.com/", "https://streamable.com/e/").replace("https://youtube.com/shorts/", "https://youtube.com/watch?v=").replace("https://mobile.twitter", "https://twitter").replace("https://m.facebook", "https://facebook").replace("m.wikipedia.org", "wikipedia.org").replace("https://m.youtube", "https://youtube").replace("https://www.youtube", "https://youtube").replace("https://www.twitter", "https://twitter").replace("https://www.instagram", "https://instagram").replace("https://www.tiktok", "https://tiktok")
if "/i.imgur.com/" in url: url = url.replace(".png", ".webp").replace(".jpg", ".webp").replace(".jpeg", ".webp")
elif "/media.giphy.com/" in url or "/c.tenor.com/" in url: url = url.replace(".gif", ".webp")
elif "/i.ibb.co/" in url: url = url.replace(".png", ".webp").replace(".jpg", ".webp").replace(".jpeg", ".webp").replace(".gif", ".webp")
if url.startswith("https://streamable.com/") and not url.startswith("https://streamable.com/e/"): url = url.replace("https://streamable.com/", "https://streamable.com/e/")
url = normalize_url(url)
parsed_url = urlparse(url)
domain = parsed_url.netloc
@ -908,14 +901,7 @@ def submit_post(v, sub=None):
for rd in ("://reddit.com", "://new.reddit.com", "://www.reddit.com", "://redd.it", "://libredd.it", "://teddit.net"):
url = url.replace(rd, "://old.reddit.com")
url = url.replace("nitter.net", "twitter.com").replace("old.reddit.com/gallery", "reddit.com/gallery").replace("https://youtu.be/", "https://youtube.com/watch?v=").replace("https://music.youtube.com/watch?v=", "https://youtube.com/watch?v=").replace("https://streamable.com/", "https://streamable.com/e/").replace("https://youtube.com/shorts/", "https://youtube.com/watch?v=").replace("https://mobile.twitter", "https://twitter").replace("https://m.facebook", "https://facebook").replace("m.wikipedia.org", "wikipedia.org").replace("https://m.youtube", "https://youtube").replace("https://www.youtube", "https://youtube").replace("https://www.twitter", "https://twitter").replace("https://www.instagram", "https://instagram").replace("https://www.tiktok", "https://tiktok")
if "/i.imgur.com/" in url: url = url.replace(".png", ".webp").replace(".jpg", ".webp").replace(".jpeg", ".webp")
elif "/media.giphy.com/" in url or "/c.tenor.com/" in url: url = url.replace(".gif", ".webp")
elif "/i.ibb.co/" in url: url = url.replace(".png", ".webp").replace(".jpg", ".webp").replace(".jpeg", ".webp").replace(".gif", ".webp")
if url.startswith("https://streamable.com/") and not url.startswith("https://streamable.com/e/"): url = url.replace("https://streamable.com/", "https://streamable.com/e/")
url = normalize_url(url)
parsed_url = urlparse(url)
domain = parsed_url.netloc