MarseyWorld/files/helpers/regex.py

188 lines
8.3 KiB
Python
Raw Normal View History

import random
import re
2022-07-14 14:43:05 +00:00
from random import choice, choices
[DO NOT MERGE] import detanglation (#442) * move Base definition to files.classes.__init__.py * fix ImportError * move userpage listing to users.py * don't import the app from classes * consts: set default values to avoid crashes consts: warn if the secret key is the default config value * card view: sneed (user db schema) * cloudflare: use DEFAULT_CONFIG_VALUE * const: set default values * decouple media.py from __main__ * pass database to avoid imports * import cleanup and import request not in const, but in the requests mega import * move asset_submissions site check to __init__ * asset submissions feature flag * flag * g.is_tor * don't import request where it's not needed * i think this is fine * mail: move to own routes and helper * wrappers * required wrappers move * unfuck wrappers a bit * move snappy quotes and marseys to stateful consts * marsify * :pepodrool: * fix missing import * import cache * ...and settings.py * and static.py * static needs cache * route * lmao all of the jinja shit was in feeds.py amazing * classes should only import what they need from flask * import Response * hdjbjdhbhjf * ... 
* dfdfdfdf * make get a non-required import * isort imports (mostly) * but actually * configs * reload config on import * fgfgfgfg * config * config * initialize snappy and test * cookie of doom debug * edfjnkf * xikscdfd * debug config * set session cookie domain, i think this fixes the can't login bug * sdfbgnhvfdsghbnjfbdvvfghnn * hrsfxgf * dump the entire config on a request * kyskyskyskyskyskyskyskyskys * duifhdskfjdfd * dfdfdfdfdfdfdfdfdfdfdfdf * dfdfdfdf * imoprt all of the consts beacuse fuck it * 😭 * dfdfdfdfdfdfsdasdf * print the entire session * rffdfdfjkfksj * fgbhffh * not the secret keys * minor bug fixes * be helpful in the warning * gfgfgfg * move warning lower * isort main imports (i hope this doesn't fuck something up) * test * session cookie domain redux * dfdfdfd * try only importing Flask * formkeys fix * y * :pepodrool: * route helper * remove before flight * dfdfdfdfdf * isort classes * isort helpers * move check_for_alts to routehelpers and also sort imports and get rid of unused ones * that previous commit but actkally * readd the cache in a dozen places they were implicitly imported * use g.is_tor instead of request.headers. bla bla bla * upgrade streamers to their own route file * get rid of unused imports in __main__ * fgfgf * don't pull in the entire ORM where we don't need it * features * explicit imports for the get helper * explicit imports for the get helper redux * testing allroutes * remove unused import * decouple flask from classes * syntax fix also remember these have side fx for some reason (why?) 
* move side effects out of the class * posts * testing on devrama * settings * reloading * settingssdsdsds * streamer features * site settings * testing settings on devrama * import * fix modlog * remove debug stuff * revert commit 67275b21ab6e2f2520819e84d10bfc1c746a15b6 * archiveorg to _archiveorg * skhudkfkjfd * fix cron for PCM * fix bugs that snekky wants me to * Fix call to realbody passing db, standardize kwarg * test * import check_for_alts from the right place * cloudflare * testing on devrama * fix cron i think * shadow properly * tasks * Remove print which will surely be annoying in prod. * v and create new session * use files.classes * make errors import little and fix rare 500 in /allow_nsfw * Revert "use files.classes" This reverts commit 98c10b876cf86ce058b7fb955cf1ec0bfb9996c6. * pass v to media functions rather than using g * fix * dfdfdfdfd * cleanup, py type checking is dumb so don't use it where it causes issues * Fix some merge bugs, add DEFAULT_RATELIMIT to main. * Fix imports on sqlalchemy expressions. * `from random import random` is an error. * Fix replies db param. * errors: fix missing import * fix rare 500: only send to GIFT_NOTIF_ID if it exists, and send them the right text * Fix signup formkey. * fix 2 500s * propagate db to submissions * fix replies * dfdfdfdf * Fix verifiedcolor. * is_manual * can't use getters outside of an app context * don't attempt to do gumroad on sites where it's not enabled * don't attempt to do gumraod on sites's where it's unnecessary * Revert "don't attempt to do gumroad on sites where it's not enabled" This reverts commit 6f8a6331878655492dfaf1907b27f8be513c14d3. * fix 500 * validate media type Co-authored-by: TLSM <duolsm@outlook.com>
2022-11-15 09:19:08 +00:00
from typing import List, Optional, Union
from .config.const import *
2023-04-23 13:22:41 +00:00
# NOTE: regex literals below are raw strings so that sequences such as "\w",
# "\-" and "\/" reach the regex engine without relying on Python's handling of
# invalid string escapes (a SyntaxWarning on Python 3.12+). The resulting
# pattern text is unchanged.

# Whole string: 3-25 ASCII word characters or hyphens.
valid_username_regex = re.compile(r"^[\w\-]{3,25}$", flags=re.A)

# "@name" / "!name" not directly preceded by one of / ; $ #.
# The trailing negative lookahead rejects a candidate when the next "<" in the
# remaining text opens a closing </code>, </pre> or </a> tag, or when a
# backtick occurs anywhere later in the text.
mention_regex = re.compile(r'(?<![/;$#])@([\w\-]{1,30})(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)
group_mention_regex = re.compile(r'(?<![/;$#])!([\w\-]{3,25})(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A|re.I)

# Literal "!everyone" at start of text or after whitespace / ">", with the same
# trailing lookahead as the mention patterns.
everyone_regex = re.compile(r'(^|\s|>)!(everyone)(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)

# Whole string: 8-100 characters (any character except a newline).
valid_password_regex = re.compile("^.{8,100}$", flags=re.A)

# A character other than "<", whitespace or "+" right after ">", or a character
# other than ">", whitespace or "+" right before "<".
marseyaward_body_regex = re.compile(r">[^<\s+]|[^>\s+]<", flags=re.A)

# One or more <img ...> tags, each optionally preceded by spaces.
marseyaward_title_regex = re.compile("( *<img[^>]+>)+", flags=re.A)
2023-03-18 14:53:00 +00:00
# NOTE: patterns containing regex escapes ("\w", "\-", "\S") are raw strings so
# they do not rely on Python passing invalid string escapes through unchanged
# (a SyntaxWarning on Python 3.12+). The pattern text itself is unchanged.

# 1-30 lowercase ASCII letters or digits.
emoji_name_regex = re.compile("[a-z0-9]{1,30}", flags=re.A)

# 1-200 lowercase ASCII letters, digits, colons or spaces.
tags_regex = re.compile("[a-z0-9: ]{1,200}", flags=re.A)

# 1-50 word characters, hyphens, parentheses, spaces or commas.
hat_regex = re.compile(r"[\w\-() ,]{1,50}", flags=re.A)

# 1-300 characters excluding "<", ">", "&", newline and tab.
# (Deliberately not a raw string: "\n" and "\t" are valid string escapes.)
description_regex = re.compile("[^<>&\n\t]{1,300}", flags=re.A)

# One or more characters excluding "/" and ".".
badge_name_regex = re.compile(r"[^\/.]+", flags=re.A)

# Whole string: 3-25 ASCII word characters or hyphens.
valid_sub_regex = re.compile(r"^[\w\-]{3,25}$", flags=re.A)

# "key:value" pairs: group 1 is the word before the colon, group 2 the run of
# non-whitespace after it.
query_regex = re.compile(r"(\w+):(\S+)", flags=re.A)
2023-03-12 18:40:18 +00:00
# NOTE: these literals are raw strings so that "\$", "\s", "\/" and "\w" do not
# depend on Python's handling of invalid string escapes (SyntaxWarning on
# Python 3.12+). As a consequence "\n" is now interpreted by the regex engine
# instead of by the string literal; both forms match a newline character.

# "$$text$$", "##text##" and "&&text&&" markers located at the very start of
# the text or right after a newline. Group 2 is the text between the markers
# (it may not contain the marker character or a newline).
# The trailing negative lookahead rejects a candidate when the next "<" in the
# remaining text opens a closing </code>, </pre> or </a> tag, or when a
# backtick occurs anywhere later in the text.
poll_regex = re.compile(r"(^|\n)\$\$([^\$\n]+)\$\$\s*?(?!([^<]*<\/(code|pre|a)>|[^`]*`))", flags=re.A)
bet_regex = re.compile(r"(^|\n)##([^#\n]+)##\s*?(?!([^<]*<\/(code|pre|a)>|[^`]*`))", flags=re.A)
choice_regex = re.compile(r"(^|\n)&&([^&\n]+)&&\s*?(?!([^<]*<\/(code|pre|a)>|[^`]*`))", flags=re.A)

# A single-line HTML comment. The body is matched non-greedily so that two
# comments on the same line are matched separately instead of as one span that
# also covers the text between them.
html_comment_regex = re.compile("<!--.*?-->", flags=re.A)

# Any character that is neither an ASCII word character nor a space.
title_regex = re.compile(r"[^\w ]", flags=re.A)
2023-04-23 13:22:41 +00:00
# NOTE: regex literals are raw strings so that "\/", "\w", "\s", "\|" etc. do
# not rely on Python passing invalid string escapes through unchanged
# (SyntaxWarning on Python 3.12+). The emoji patterns were f-strings without
# any placeholder (only "{{ }}" escapes); they are now plain raw strings with
# the same pattern text.
#
# Several patterns end with the negative lookahead
#   (?!([^<]*<\/(code|pre|a)>|[^`]*`))
# which rejects a candidate when the next "<" in the remaining text opens a
# closing </code>, </pre> or </a> tag, or when a backtick occurs later on.

# An old.reddit.com comments URL (group 1) delimited by a quote, angle bracket
# or space on each side.
controversial_regex = re.compile(r'["> ](https:\/\/old\.reddit\.com/r/\w{3,20}\/comments\/[\w\-.#&/=\?@%+]{5,250})["< ]', flags=re.A)

# Optional http(s) scheme, 2-10 URL-ish characters, a dot, then 2-250 more.
fishylinks_regex = re.compile(r"(https?:\/\/)?[\w\-.#&/=\?@%;+,:]{2,10}\.[\w\-.#&/=\?@%;+,:]{2,250}", flags=re.A)

# "||text||" (group 1 is the text, matched lazily).
spoiler_regex = re.compile(r'\|\|(.+?)\|\|(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)

# "r/name" or "u/name" (either case, optional leading slash) at the start of
# the text, after whitespace or after "<p>". Group 2 is e.g. "r/name".
reddit_regex = re.compile(r'(^|\s|<p>)\/?(([ruRU])\/(\w|-){3,25})(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)

# Same shape for "h/name" (group 2).
sub_regex = re.compile(r'(^|\s|<p>)\/?([hH]\/(\w|-){3,25})(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)

# "~text~" or "~~text~~" at the start of the text or after whitespace / ">".
strikethrough_regex = re.compile(r'(^|\s|>)~{1,2}([^~]+)~{1,2}(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)

# "/mute name number" with an optional "@" before the name; case-insensitive.
mute_regex = re.compile(r"\/mute @?([\w\-]{3,30}) ([0-9]+)", flags=re.A|re.I)

# A <p> element containing nothing but ":name:" tokens and whitespace.
emoji_regex = re.compile(r"<p>\s*(:[!#@\w\-]{1,36}:\s*)+<\/p>", flags=re.A)

# A ":name:" token not directly preceded by a double quote (group 1 = name).
emoji_regex2 = re.compile(r'(?<!"):([!#@\w\-]{1,36}?):(?!([^<]*<\/(code|pre|a)>|[^`]*`))', flags=re.A)

# Like emoji_regex2 but without "#" in the name, at most 35 characters, and
# without the trailing lookahead.
emoji_regex3 = re.compile(r'(?<!"):([!@\w\-]{1,35}?):', flags=re.A)

# An <a href="http(s)://..."> element: group 1 is the href, group 2 the inner
# text.
snappy_url_regex = re.compile(r'<a href="(https?:\/\/.+?)".*?>(.+?)<\/a>', flags=re.A)

# The videoid attribute (group 1) of a <lite-youtube> tag whose params start
# with "autoplay=1".
snappy_youtube_regex = re.compile('<lite-youtube videoid="(.+?)" params="autoplay=1', flags=re.A)
2022-10-19 10:39:03 +00:00
# Compiled form of the project-wide EMAIL_REGEX_PATTERN (ASCII-only classes).
email_regex = re.compile(EMAIL_REGEX_PATTERN, flags=re.ASCII)
2022-09-17 21:13:14 +00:00
# "utm_<name>=<value>" query parameters.
#
# The name and value were previously written as "[0-z]" / "[0-z_]". That range
# is not "alphanumeric": it spans every ASCII code point from "0" to "z" and
# therefore also matched : ; < = > ? @ [ \ ] ^ _ and the backtick. In
# particular a value could run into a following "<" and swallow it. "\w"
# (ASCII letters, digits and underscore under re.A) expresses the intended
# character set.

# A utm parameter followed by "&" (i.e. not the last parameter).
utm_regex = re.compile(r'utm_\w+=\w+&', flags=re.A)

# A utm parameter together with its leading "?" or "&".
utm_regex2 = re.compile(r'[?&]utm_\w+=\w+', flags=re.A)
# Each pattern matches either a complete "<...>" tag or one of the words in the
# corresponding *_single_words alternation. The plain variants ignore case; the
# *_upper variants are case-sensitive and built from the upper-cased word list.
# NOTE(review): .upper() is applied to the pattern text itself, so this
# presumably relies on the word lists containing no regex escapes - confirm.
slur_regex = re.compile("<[^>]*>|" + slur_single_words, flags=re.A | re.I)
slur_regex_upper = re.compile("<[^>]*>|" + slur_single_words.upper(), flags=re.A)
profanity_regex = re.compile("<[^>]*>|" + profanity_single_words, flags=re.A | re.I)
profanity_regex_upper = re.compile("<[^>]*>|" + profanity_single_words.upper(), flags=re.A)
# First-person words standing alone, i.e. delimited by whitespace or the
# start/end of the text; case-insensitive. Group 2 is the word itself.
# Raw strings keep "\s" from being treated as an (invalid) string escape.
torture_regex = re.compile(r'(^|\s)(i|me)($|\s)', flags=re.I|re.A)
torture_regex2 = re.compile(r"(^|\s)(i'm)($|\s)", flags=re.I|re.A)
torture_regex3 = re.compile(r"(^|\s)(my|mine)($|\s)", flags=re.I|re.A)
# Markdown image with empty alt text, "![](target)", whose target (group 1)
# starts neither with "https://<hosts>/" nor with "/".
# `hosts` is interpolated verbatim as a regex fragment - presumably an
# alternation of host names; confirm where it is defined.
# Raw f-string: "\[", "\(" and "\/" are no longer invalid string escapes; the
# resulting pattern text is unchanged.
image_check_regex = re.compile(rf'!\[\]\(((?!(https:\/\/({hosts})\/|\/)).*?)\)', flags=re.A)
2022-10-30 14:55:43 +00:00
# Regex alternations of the configured media file extensions.
video_regex_extensions = '|'.join(VIDEO_FORMATS)
audio_regex_extensions = '|'.join(AUDIO_FORMATS)

# Inside a <p> (group 1 = "<p>" plus any text before the link), an
# "https://<hosts>/..." URL (group 2) ending in one of the extensions above.
# `hosts` is interpolated verbatim as a regex fragment.
# Raw f-strings: "\/", "\w", "\-" and "\." are no longer invalid string
# escapes; the resulting pattern text is unchanged.
video_sub_regex = re.compile(rf'(<p>[^<]*)(https:\/\/({hosts})\/[\w:~,()\-.#&\/=?@%;+]*?\.({video_regex_extensions}))', flags=re.A)
audio_sub_regex = re.compile(rf'(<p>[^<]*)(https:\/\/({hosts})\/[\w:~,()\-.#&\/=?@%;+]*?\.({audio_regex_extensions}))', flags=re.A)
2022-10-30 14:55:43 +00:00
# Regex alternation of the configured image file extensions.
image_regex_extensions = '|'.join(IMAGE_FORMATS)

# A standalone https image URL (group 2), delimited by whitespace or the
# start/end of the text. The extension follows either "." or "?format=" and may
# be followed by a query string. Case-insensitive.
# Raw f-string: regex escapes are no longer invalid string escapes; "{{ }}"
# still produces the literal quantifier braces.
image_regex = re.compile(rf"(^|\s)(https:\/\/[\w\-.#&/=\?@%;+,:]{{5,250}}(\.|\?format=)({image_regex_extensions})((\?|&)[\w\-.#&/=\?@%;+,:]*)?)(?=$|\s)", flags=re.I|re.A)

# Same alternation without "gif". Built from the list rather than by removing
# the substring "|gif" from the joined string: that approach left "gif" in
# place when it was the first entry and mangled entries that merely start with
# "gif" (e.g. "gifv").
image_regex_extensions_no_gif = '|'.join(fmt for fmt in IMAGE_FORMATS if fmt != 'gif')

# An i.imgur.com URL: group 1 is everything before the extension, group 2 the
# (non-gif) extension. Case-insensitive.
imgur_regex = re.compile(rf'(https:\/\/i\.imgur\.com\/[a-z0-9]+)\.({image_regex_extensions_no_gif})', flags=re.I|re.A)
# NOTE: regex literals are raw strings so that "\/", "\.", "\w", "\[" etc. do
# not rely on Python passing invalid string escapes through unchanged
# (SyntaxWarning on Python 3.12+). Pattern text is unchanged. Literals whose
# only escapes are "\n" stay as ordinary strings.

# A media.giphy.com ".../giphy.gif" URL; group 1 is everything before ".gif".
giphy_regex = re.compile(r'(https:\/\/media\.giphy\.com\/media\/[a-z0-9]+\/giphy)\.gif', flags=re.I|re.A)

# Inside a <p> (group 1 = "<p>" plus preceding text), a youtube.com/watch URL
# (group 2) with at least 7 query characters.
youtube_regex = re.compile(r'(<p>[^<]*)(https:\/\/youtube\.com\/watch\?[\w\-.#&/=?@%+]{7,})', flags=re.I|re.A)

# 5-20 word characters or hyphens.
yt_id_regex = re.compile(r'[\w\-]{5,20}', flags=re.A)

# Markdown link "[text](target)" whose target starts with neither "http" nor
# "/". Group 1 is "[text](" and group 2 is "target)". The trailing negative
# lookahead rejects a candidate when the next "<" in the remaining text opens a
# closing </code>, </pre> or </a> tag, or when a backtick occurs later on.
link_fix_regex = re.compile(r"(\[.*?\]\()(?!http|\/)(.*?\))(?!([^<]*<\/(code|pre|a)>|[^`]*`))", flags=re.A)

# CSS url(...) with optional quotes and surrounding whitespace; group 1 is the
# URL. Case-insensitive.
css_url_regex = re.compile(r"""url\(\s*['"]?(.*?)['"]?\s*\)""", flags=re.I|re.A)

# A single newline between two non-newline characters.
linefeeds_regex = re.compile("([^\n])\n([^\n])", flags=re.A)

# A line starting with ">" that is not followed by a space or another ">";
# group 2 is the rest of the line.
greentext_regex = re.compile("(\n|^)>([^ >][^\n]*)", flags=re.A)

# One or more printable ASCII characters (space through "~").
ascii_only_regex = re.compile("[ -~]+", flags=re.A)
2022-08-14 03:14:36 +00:00
# An "https://old.reddit.com/r/" or ".../u/" prefix at the start of the text or
# right after ">" or a double quote. Group 2 is "r" or "u".
# The dots of the host name are escaped: unescaped, "." matched any character,
# so hosts such as "oldxreddit.com" were accepted as well.
reddit_to_vreddit_regex = re.compile(r'(^|>|")https:\/\/old\.reddit\.com\/(r|u)\/', flags=re.A)

# An http(s) link to reddit.com, one of the listed reddit.com subdomains
# (including two-letter and "xx-yy" language prefixes), libredd.it or
# teddit.net, followed by "/r/" or "/u/". Group 1 is the leading delimiter,
# group 2 the host, group 3 "r" or "u".
# The language prefix uses [A-Za-z]: the former [A-z] range also matched the
# punctuation characters [ \ ] ^ _ and the backtick.
reddit_domain_regex = re.compile(
    r"(^|\s|\()https?:\/\/(reddit\.com|(?:(?:[A-Za-z]{2})(?:-[A-Za-z]{2})"
    r"?|beta|i|m|pay|ssl|www|new|alpha)\.reddit\.com|libredd\.it|teddit\.net)\/(r|u)\/",
    flags=re.A,
)

# Six lowercase hexadecimal digits.
color_regex = re.compile("[a-f0-9]{6}", flags=re.A)
2022-06-24 17:32:31 +00:00
# Splits HTML after its first paragraph: group 1 lazily takes everything up to
# the first "</p>" (plus a directly following "</li></ul>"), group 2 is the
# rest and must begin with another "<p>" - so there is no match unless
# something follows the first paragraph.
# This specifically targets Snappy's way of formatting and might break some
# losers' comments.
showmore_regex = re.compile(
    r"^(.*?</p>(?:</li></ul>)?)(\s*<p>.*)",
    flags=re.DOTALL | re.ASCII,
)
2022-07-06 11:49:13 +00:00
# Search tokens: either a double-quoted phrase (group 1, quotes stripped) or a
# run of non-whitespace (group 2). Raw string so "\S" is not an invalid string
# escape; the pattern text is unchanged.
search_token_regex = re.compile(r'"([^"]*)"|(\S+)', flags=re.A)

# "ref: refs/..." - group 1 is the ref path.
git_regex = re.compile("ref: (refs/.+)", flags=re.A)

# Two or three slash-separated groups of 1-5 ASCII letters; case-insensitive.
pronouns_regex = re.compile("([a-z]{1,5})/[a-z]{1,5}(/[a-z]{1,5})?", flags=re.A|re.I)

# Contents (1-200 characters, no newline) of an HTML <title> element;
# case-insensitive. Unlike the other patterns this one is not ASCII-restricted.
html_title_regex = re.compile("<title>(.{1,200})</title>", flags=re.I)
2023-01-23 06:22:01 +00:00
def sub_matcher(match:re.Match, upper=False, replace_with:Union[dict[str, str], dict[str, List[str]]]=SLURS_FOR_REPLACING):
    """Replacement callback for ``re.sub`` that swaps a matched word.

    :param match: the regex match; the whole match (group 0) is used.
    :param upper: when true, the replacement is upper-cased unless it
        contains ``"<img"``.
    :param replace_with: mapping from the lower-cased matched text to its
        replacement.
    :returns: the matched text unchanged if it starts with ``"<"``, otherwise
        the replacement looked up in ``replace_with``.
    :raises KeyError: if the lower-cased match is not a key of
        ``replace_with``.
    """
    matched_text = match.group(0)

    # Anything starting with "<" is passed through untouched.
    if matched_text.startswith('<'):
        return matched_text

    replacement = replace_with[matched_text.lower()]

    # Replacements embedding an <img> tag are never upper-cased.
    if upper and "<img" not in replacement:
        return replacement.upper()
    return replacement
2023-01-23 06:22:01 +00:00
def sub_matcher_upper(match, replace_with:Union[dict[str, str], dict[str, List[str]]]=SLURS_FOR_REPLACING):
    """Call ``sub_matcher`` with ``upper=True`` and the given mapping."""
    return sub_matcher(match, True, replace_with)
# TODO: make censoring a bit better
def sub_matcher_slurs(match, upper=False):
    """``re.sub`` callback: ``sub_matcher`` bound to ``SLURS_FOR_REPLACING``."""
    return sub_matcher(match, upper=upper, replace_with=SLURS_FOR_REPLACING)
def sub_matcher_slurs_upper(match):
    """``re.sub`` callback: ``sub_matcher_slurs`` with ``upper=True``."""
    return sub_matcher_slurs(match, True)
def sub_matcher_profanities(match, upper=False):
    """``re.sub`` callback: ``sub_matcher`` bound to ``PROFANITIES``."""
    return sub_matcher(match, upper=upper, replace_with=PROFANITIES)
def sub_matcher_profanities_upper(match):
    """``re.sub`` callback: ``sub_matcher_profanities`` with ``upper=True``."""
    return sub_matcher_profanities(match, True)
2022-11-03 23:01:12 +00:00
def censor_slurs(body:Optional[str], logged_user):
    """Apply the slur (and, on rDrama, profanity) replacements to ``body``.

    :param body: text to process; a falsy value yields ``""``.
    :param logged_user: a falsy value, the string ``'chat'``, or an object
        exposing ``slurreplacer`` and ``profanityreplacer``. Replacement is
        applied for falsy users and ``'chat'``, otherwise only when the
        respective attribute is truthy.
    :returns: the processed text.
    """
    if not body:
        return ""

    def replace_re(body:str, regex:re.Pattern, regex_upper:re.Pattern, sub_func, sub_func_upper):
        # The case-sensitive upper-case pass runs first, then the
        # case-insensitive pass handles whatever is left.
        uppercase_done = regex_upper.sub(sub_func_upper, body)
        return regex.sub(sub_func, uppercase_done)

    # No user object to consult: always censor.
    always_censor = not logged_user or logged_user == 'chat'

    if always_censor or logged_user.slurreplacer:
        body = replace_re(body, slur_regex, slur_regex_upper, sub_matcher_slurs, sub_matcher_slurs_upper)

    # Profanity replacement only exists on rDrama.
    if SITE_NAME == 'rDrama' and (always_censor or logged_user.profanityreplacer):
        body = replace_re(body, profanity_regex, profanity_regex_upper, sub_matcher_profanities, sub_matcher_profanities_upper)

    return body
2022-07-11 12:14:18 +00:00
# Maps a command name to the sequence a random result is drawn from.
commands = {
    "fortune": FORTUNE_REPLIES,
    "factcheck": FACTCHECK_REPLIES,
    "8ball": EIGHTBALL_REPLIES,
    "roll": range(1, 10000),  # integers 1..9999
}

# "#<command>" at the start of the text or after whitespace; case-insensitive.
# Group 1 is the leading delimiter, group 2 the command name.
# Raw string so "\s" is not an invalid string escape (SyntaxWarning on Python
# 3.12+); the pattern text is unchanged.
command_regex = re.compile(r"(\s|^)#(fortune|factcheck|8ball|roll)", flags=re.A|re.I)
2022-07-11 12:14:18 +00:00
def command_regex_matcher(match, upper=False):
    """Replacement callback for ``command_regex``: expand a "#command".

    A random entry of ``commands[<name>]`` replaces the command; for "roll"
    the number is wrapped in a bold tag with a random RGB colour.

    The command name is lower-cased once and used for both the lookup and the
    "roll" check. ``command_regex`` is case-insensitive, and previously only
    the lookup was normalised, so "#ROLL" or "#Roll" produced a bare number
    without the styled wrapper.

    :param match: match whose group 1 is the leading delimiter and group 2
        the command name.
    :param upper: accepted but not used by this function.
    :returns: group 1 followed by the generated result.
    """
    command = match.group(2).lower()
    result = str(choice(commands[command]))

    if command == 'roll':
        color = tuple(choices(range(256), k=3))
        result = f'<b style="color:rgb{color}">Your roll: {result}</b>'

    return match.group(1) + result
2022-12-30 16:10:29 +00:00
# "/post/<digits>" and "/comment/<digits>" paths (group 1).
reason_regex_post = re.compile('(/post/[0-9]+)', flags=re.A)
reason_regex_comment = re.compile('(/comment/[0-9]+)', flags=re.A)

# NOTE: the two patterns below are raw strings so that "\s" and "\." are not
# invalid string escapes (SyntaxWarning on Python 3.12+); pattern text is
# unchanged.

# 2-32 characters followed by "#" and exactly four digits (not followed by a
# fifth digit), at the start of the text or after whitespace / ">".
discord_username_regex = re.compile(r"(\s|^|>).{2,32}#[0-9]{4}(?=[^0-9]|$)", flags=re.A)

# "<digits>. " at the start of the text or after whitespace; group 1 is the
# delimiter plus the digits.
numbered_list_regex = re.compile(r'((\s|^)[0-9]+)\. ', flags=re.A)

# A trailing "/<digits>" at the end of the text.
comment_link_regex = re.compile("/[0-9]+$", flags=re.A)
2023-02-19 19:31:26 +00:00
2023-03-25 15:07:12 +00:00
# Links to media hosted on this site: ".../images/<11-17 digits>.webp"
# (optionally under "i." and/or with a "chat_" prefix) and
# ".../videos/<11-17 digits>.<video extension>" (optionally under "video.").
#
# SITE is passed through re.escape() before interpolation: inserted verbatim,
# the dots of the domain name acted as regex wildcards and matched any
# character. str() keeps the previous f-string behaviour should SITE not be a
# str. Raw f-strings keep "\." and "\/" from being invalid string escapes.
image_link_regex = re.compile(rf"https://(i\.)?{re.escape(str(SITE))}\/(chat_)?images\/[0-9]{{11,17}}\.webp", flags=re.A)
video_link_regex = re.compile(rf"https://(video\.)?{re.escape(str(SITE))}\/videos\/[0-9]{{11,17}}\.({video_regex_extensions})", flags=re.A)