From 182ee41d6aa1877c5494737af347b70540acb873 Mon Sep 17 00:00:00 2001 From: transbitch <> Date: Sun, 8 Oct 2023 20:49:40 -0400 Subject: [PATCH] Finish up the checklist --- files/helpers/marseyfx/modifiers.py | 526 ++++++++++++++-------------- files/helpers/marseyfx/parser.py | 231 ++++++------ files/helpers/marseyfx/tokenizer.py | 370 +++++++++---------- files/helpers/sanitize.py | 77 ++-- 4 files changed, 623 insertions(+), 581 deletions(-) diff --git a/files/helpers/marseyfx/modifiers.py b/files/helpers/marseyfx/modifiers.py index 0bac7a635..faebdff48 100644 --- a/files/helpers/marseyfx/modifiers.py +++ b/files/helpers/marseyfx/modifiers.py @@ -9,337 +9,337 @@ import files.helpers.marseyfx.parser as parser modifier_whitelist = [] class Modifier: - name: str - args: list[Token] + name: str + args: list[Token] - def __init__(self, name: str, args: list[Token]): - self.name = name - self.args = args + def __init__(self, name: str, args: list[Token]): + self.name = name + self.args = args def modifier(fn): - modifier_whitelist.append(fn.__name__) + modifier_whitelist.append(fn.__name__) - def wrapper(*args, **kwargs): - slf = args[0] - ctx = ModifierContextFrame(fn.__name__) - slf.context_frames.insert(0, ctx) - slf.child = slf.container - slf.container = slf.child.wrap(slf.soup.new_tag('div', attrs={'class': f'marseyfx-modifier marseyfx-modifier-{ctx.name}'})) - slf.add_child_class(f'marseyfx-modifier-{ctx.name}-self') - res = fn(*args, **kwargs) - slf.context_frames.pop(0) - return res - return wrapper + def wrapper(*args, **kwargs): + slf = args[0] + ctx = ModifierContextFrame(fn.__name__) + slf.context_frames.insert(0, ctx) + slf.child = slf.container + slf.container = slf.child.wrap(slf.soup.new_tag('div', attrs={'class': f'marseyfx-modifier marseyfx-modifier-{ctx.name}'})) + slf.add_child_class(f'marseyfx-modifier-{ctx.name}-self') + res = fn(*args, **kwargs) + slf.context_frames.pop(0) + return res + return wrapper def heavy(fn): - def wrapper(*args, **kwargs): - slf = args[0] - slf.heavy_count += 1 - return fn(*args, **kwargs) - return wrapper + def wrapper(*args, **kwargs): + slf = args[0] + slf.heavy_count += 1 + return fn(*args, **kwargs) + return wrapper class ModifierContextFrame: - name: str - wrap_depth: int = 0 - def __init__(self, name: str): - self.name = name + name: str + wrap_depth: int = 0 + def __init__(self, name: str): + self.name = name class Modified: - soup: BeautifulSoup - container: Tag - child: Tag - tokenizer: Tokenizer - heavy_count = 0 - context_frames: list[ModifierContextFrame] + soup: BeautifulSoup + container: Tag + child: Tag + tokenizer: Tokenizer + heavy_count = 0 + context_frames: list[ModifierContextFrame] - def __init__(self, el, tokenizer): - self.soup = BeautifulSoup() - self.container = el - self.tokenizer = tokenizer - self.context_frames = [] + def __init__(self, el, tokenizer): + self.soup = BeautifulSoup() + self.container = el + self.tokenizer = tokenizer + self.context_frames = [] - def ctx(self): - return self.context_frames[0] if len(self.context_frames) > 0 else None + def ctx(self): + return self.context_frames[0] if len(self.context_frames) > 0 else None - def add_class(self, class_: str): - if not 'class' in self.container.attrs: - self.container.attrs['class'] = [class_] - else: - self.container.attrs['class'].append(' ' + class_) + def add_class(self, class_: str): + if not 'class' in self.container.attrs: + self.container.attrs['class'] = [class_] + else: + self.container.attrs['class'].append(' ' + class_) - def 
add_child_class(self, class_: str): - if not 'class' in self.child.attrs: - self.child.attrs['class'] = [class_] - else: - self.child.attrs['class'].append(' ' + class_) + def add_child_class(self, class_: str): + if not 'class' in self.child.attrs: + self.child.attrs['class'] = [class_] + else: + self.child.attrs['class'].append(' ' + class_) - def apply_modifiers(self, modifiers: list[Modifier]): - for modifier in modifiers: - if modifier.name in modifier_whitelist: - getattr(self, modifier.name)(*map(GroupToken.unwrap, modifier.args)) + def apply_modifiers(self, modifiers: list[Modifier]): + for modifier in modifiers: + if modifier.name in modifier_whitelist: + getattr(self, modifier.name)(*map(GroupToken.unwrap, modifier.args)) - # Using this instead of throwing everything in a string and then parsing it helps - # mitigate the risk of XSS attacks - def image(self, name: str): + # Using this instead of throwing everything in a string and then parsing it helps + # mitigate the risk of XSS attacks + def image(self, name: str): - filename = name + filename = name - if not '.' in filename: - filename += '.webp' + if not '.' in filename: + filename += '.webp' - image = self.soup.new_tag( - 'img', - loading='lazy', - src=f'{SITE_FULL_IMAGES}/i/{filename}', - attrs={'class': f'marseyfx-image marseyfx-image-{name}'} - ) + image = self.soup.new_tag( + 'img', + loading='lazy', + src=f'{SITE_FULL_IMAGES}/i/{filename}', + attrs={'class': f'marseyfx-image marseyfx-image-{name}'} + ) - container = self.soup.new_tag( - 'div', - attrs={'class': f'marseyfx-image-container marseyfx-image-container-{name}'} - ) + container = self.soup.new_tag( + 'div', + attrs={'class': f'marseyfx-image-container marseyfx-image-container-{name}'} + ) - container.append(image) - return container - - def underlay(self, underlay: Tag): - self.container.insert(0, underlay) + container.append(image) + return container + + def underlay(self, underlay: Tag): + self.container.insert(0, underlay) - def overlay(self, overlay: Tag): - self.container.append(overlay) + def overlay(self, overlay: Tag): + self.container.append(overlay) - def add_style(self, style: str): - if 'style' in self.container.attrs: - style = self.container.attrs['style'] + style + def add_style(self, style: str): + if 'style' in self.container.attrs: + style = self.container.attrs['style'] + style - self.container.attrs['style'] = style + self.container.attrs['style'] = style - def meme_text(self, text: str, class_: Optional[str] = None): - attrs = {} - if class_ is not None: - attrs = {'class': f'marseyfx-memetext-{class_}'} + def meme_text(self, text: str, class_: Optional[str] = None): + attrs = {} + if class_ is not None: + attrs = {'class': f'marseyfx-memetext-{class_}'} - tag = self.soup.new_tag( - 'span', - attrs=attrs - ) + tag = self.soup.new_tag( + 'span', + attrs=attrs + ) - tag.string = text + tag.string = text - self.overlay(tag) + self.overlay(tag) - def create_other(self, other: GroupToken = None): - wrapper = self.soup.new_tag('div', attrs={'class': f'marseyfx-modifier-{self.ctx().name}-other'}) + def create_other(self, other: GroupToken = None): + wrapper = self.soup.new_tag('div', attrs={'class': f'marseyfx-modifier-{self.ctx().name}-other'}) - if other is None: - return wrapper - - other = other.wrap() - other_emoji = parser.parse_from_token(self.tokenizer, other) + if other is None: + return wrapper + + other = other.wrap() + other_emoji = parser.parse_from_token(self.tokenizer, other) - if other_emoji is None: - return wrapper - - 
other_emoji.is_primary = False + if other_emoji is None: + return wrapper + + other_emoji.is_primary = False - return other_emoji.create_el(self.tokenizer).wrap(wrapper) - - def wrap_child(self, class_: str = ''): - ctx = self.ctx() - wrap_insert = '' - if ctx.wrap_depth > 0: - wrap_insert = f'-{ctx.wrap_depth + 1}' - - self.child = self.child.wrap(self.soup.new_tag('div', attrs={'class': f'marseyfx-modifier-{self.ctx().name}-wrapper{wrap_insert} {class_}'})) + return other_emoji.create_el(self.tokenizer).wrap(wrapper) + + def wrap_child(self, class_: str = ''): + ctx = self.ctx() + wrap_insert = '' + if ctx.wrap_depth > 0: + wrap_insert = f'-{ctx.wrap_depth + 1}' + + self.child = self.child.wrap(self.soup.new_tag('div', attrs={'class': f'marseyfx-modifier-{self.ctx().name}-wrapper{wrap_insert} {class_}'})) - ctx.wrap_depth += 1 + ctx.wrap_depth += 1 - @modifier - def pat(self): - self.overlay(self.image('hand')) + @modifier + def pat(self): + self.overlay(self.image('hand')) - @modifier - def love(self): - self.overlay(self.image('love-foreground')) - self.underlay(self.image('love-background')) + @modifier + def love(self): + self.overlay(self.image('love-foreground')) + self.underlay(self.image('love-background')) - @modifier - def talking(self): - self.overlay(self.image('talking')) + @modifier + def talking(self): + self.overlay(self.image('talking')) - @modifier - def genocide(self): - pass + @modifier + def genocide(self): + pass - @modifier - def party(self): - pass + @modifier + def party(self): + pass - @modifier - def says(self, msg): - if not isinstance(msg, StringLiteralToken): - return - - container = self.soup.new_tag( - 'div', - attrs={'class': 'marseyfx-modifier-says-container'} - ) - self.container.append(container) + @modifier + def says(self, msg): + if not isinstance(msg, StringLiteralToken): + return + + container = self.soup.new_tag( + 'div', + attrs={'class': 'marseyfx-modifier-says-container'} + ) + self.container.append(container) - container.append(self.soup.new_tag( - 'div', - attrs={'class': 'marseyfx-modifier-says-nub'} - )) + container.append(self.soup.new_tag( + 'div', + attrs={'class': 'marseyfx-modifier-says-nub'} + )) - tag = self.soup.new_tag( - 'span', - attrs={'class': 'marseyfx-modifier-says-text'} - ) - tag.string = msg.value - container.append(tag) + tag = self.soup.new_tag( + 'span', + attrs={'class': 'marseyfx-modifier-says-text'} + ) + tag.string = msg.value + container.append(tag) - @modifier - def fallover(self): - self.container = self.container.wrap(self.soup.new_tag( - 'div', - attrs={'class': 'marseyfx-modifier-fallover-container'} - )) + @modifier + def fallover(self): + self.container = self.container.wrap(self.soup.new_tag( + 'div', + attrs={'class': 'marseyfx-modifier-fallover-container'} + )) - @modifier - def transform(self, transformstyle: StringLiteralToken): - if not re.fullmatch(r'[\w()\s%\.,]*', transformstyle.value): - print(f'Evil transform detected: {transformstyle.value}') - return - - self.add_style(f'transform: {transformstyle.value};') - - @heavy - @modifier - def enraged(self): - self.underlay(self.soup.new_tag( - 'div', - attrs={'class': 'marseyfx-modifier-enraged-underlay'} - )) + @modifier + def transform(self, transformstyle: StringLiteralToken): + if not re.fullmatch(r'[\w()\s%\.,]*', transformstyle.value): + print(f'Evil transform detected: {transformstyle.value}') + return + + self.add_style(f'transform: {transformstyle.value};') + + @heavy + @modifier + def enraged(self): + self.underlay(self.soup.new_tag( + 
'div', + attrs={'class': 'marseyfx-modifier-enraged-underlay'} + )) - @modifier - def meme(self, toptext: Optional[StringLiteralToken] = None, bottomtext: Optional[StringLiteralToken] = None): - if isinstance(toptext, StringLiteralToken): - self.meme_text(toptext.value, 'toptext') + @modifier + def meme(self, toptext: Optional[StringLiteralToken] = None, bottomtext: Optional[StringLiteralToken] = None): + if isinstance(toptext, StringLiteralToken): + self.meme_text(toptext.value, 'toptext') - if isinstance(bottomtext, StringLiteralToken): - self.meme_text(bottomtext.value, 'bottomtext') + if isinstance(bottomtext, StringLiteralToken): + self.meme_text(bottomtext.value, 'bottomtext') - def bottomtext(self, text: StringLiteralToken): - if not isinstance(text, StringLiteralToken): - return + def bottomtext(self, text: StringLiteralToken): + if not isinstance(text, StringLiteralToken): + return - tag = self.soup.new_tag( - 'span', - attrs={'class': 'marseyfx-modifier-bottomtext-text'} - ) + tag = self.soup.new_tag( + 'span', + attrs={'class': 'marseyfx-modifier-bottomtext-text'} + ) - tag.string = text.value + tag.string = text.value - self.overlay(tag) + self.overlay(tag) - @modifier - def spin(self, speed=None): - if not isinstance(speed, NumberLiteralToken): - return + @modifier + def spin(self, speed=None): + if not isinstance(speed, NumberLiteralToken): + return - self.add_style(f'animation-duration: {1/speed.value}s;') + self.add_style(f'animation-duration: {1/speed.value}s;') - @modifier - def triumphs(self, other: GroupToken): - other = other.wrap() - other_emoji = parser.parse_from_token(self.tokenizer, other) - print(f'Other emoji: {other_emoji} / Token: {other}') + @modifier + def triumphs(self, other: GroupToken): + other = other.wrap() + other_emoji = parser.parse_from_token(self.tokenizer, other) + print(f'Other emoji: {other_emoji} / Token: {other}') - if other_emoji is None: - return - - self.add_child_class('marseyfx-modifier-triumphs-self') + if other_emoji is None: + return + + self.add_child_class('marseyfx-modifier-triumphs-self') - other_emoji.is_primary = False + other_emoji.is_primary = False - other = other_emoji.create_el(self.tokenizer).wrap( - self.soup.new_tag('div', attrs={'class': 'marseyfx-modifier-triumphs-other'}) - ) - self.underlay(other) + other = other_emoji.create_el(self.tokenizer).wrap( + self.soup.new_tag('div', attrs={'class': 'marseyfx-modifier-triumphs-other'}) + ) + self.underlay(other) - @modifier - def nested(self, inside: GroupToken): - inside = inside.wrap() - inside_emoji = parser.parse_from_token(self.tokenizer, inside) + @modifier + def nested(self, inside: GroupToken): + inside = inside.wrap() + inside_emoji = parser.parse_from_token(self.tokenizer, inside) - if inside_emoji is None: - return - - inside_emoji.is_primary = False + if inside_emoji is None: + return + + inside_emoji.is_primary = False - inside = inside_emoji.create_el(self.tokenizer).wrap( - self.soup.new_tag('div', attrs={'class': 'marseyfx-modifier-nested-other'}) - ) + inside = inside_emoji.create_el(self.tokenizer).wrap( + self.soup.new_tag('div', attrs={'class': 'marseyfx-modifier-nested-other'}) + ) - self.underlay(inside) + self.underlay(inside) - self.add_child_class('marseyfx-modifier-nested-side') - child = self.child - self.child = child.wrap(self.soup.new_tag('div', attrs={'class': 'marseyfx-modifier-nested-outer-container'})) - other_side = copy.copy(child) - self.child.append(other_side) + self.add_child_class('marseyfx-modifier-nested-side') + child = self.child 
+ self.child = child.wrap(self.soup.new_tag('div', attrs={'class': 'marseyfx-modifier-nested-outer-container'})) + other_side = copy.copy(child) + self.child.append(other_side) - @modifier - def morph(self, other: GroupToken): - self.add_child_class('marseyfx-modifier-morph-self') + @modifier + def morph(self, other: GroupToken): + self.add_child_class('marseyfx-modifier-morph-self') - other = other.wrap() - other_emoji = parser.parse_from_token(self.tokenizer, other) + other = other.wrap() + other_emoji = parser.parse_from_token(self.tokenizer, other) - if other_emoji is None: - return - - other_emoji.is_primary = False - other = other_emoji.create_el(self.tokenizer).wrap( - self.soup.new_tag('div', attrs={'class': 'marseyfx-modifier-morph-other'}) - ) + if other_emoji is None: + return + + other_emoji.is_primary = False + other = other_emoji.create_el(self.tokenizer).wrap( + self.soup.new_tag('div', attrs={'class': 'marseyfx-modifier-morph-other'}) + ) - self.container.append(other) + self.container.append(other) - @heavy - @modifier - def bulge(self, strength: NumberLiteralToken = None): - self.child = self.child.wrap(self.soup.new_tag('svg', attrs={'class': 'marseyfx-modifier-bulge-container'})) + @heavy + @modifier + def bulge(self, strength: NumberLiteralToken = None): + self.child = self.child.wrap(self.soup.new_tag('svg', attrs={'class': 'marseyfx-modifier-bulge-container'})) - @modifier - def prohibition(self): - self.overlay(self.image('prohibition.svg')) + @modifier + def prohibition(self): + self.overlay(self.image('prohibition.svg')) - @modifier - def scope(self): - self.overlay(self.image('scope.svg')) - self.add_child_class('marseyfx-modifier-scope-target') + @modifier + def scope(self): + self.overlay(self.image('scope.svg')) + self.add_child_class('marseyfx-modifier-scope-target') - @modifier - def fucks(self, other: GroupToken): - other = self.create_other(other) - self.container.append(other) + @modifier + def fucks(self, other: GroupToken): + other = self.create_other(other) + self.container.append(other) - @heavy - @modifier - def glow(self): - pass + @heavy + @modifier + def glow(self): + pass - @heavy - @modifier - def echo(self): - for i in range(1, 4): - tag = copy.copy(self.child) - tag.attrs['class'] = tag.attrs['class'].copy() - tag.attrs['class'].append(f'marseyfx-modifier-echo-clone marseyfx-modifier-echo-clone-{i}') - self.container.append(tag) + @heavy + @modifier + def echo(self): + for i in range(1, 4): + tag = copy.copy(self.child) + tag.attrs['class'] = tag.attrs['class'].copy() + tag.attrs['class'].append(f'marseyfx-modifier-echo-clone marseyfx-modifier-echo-clone-{i}') + self.container.append(tag) - @modifier - def rentfree(self): - self.wrap_child() - self.overlay(self.image('rentfree.png')) \ No newline at end of file + @modifier + def rentfree(self): + self.wrap_child() + self.overlay(self.image('rentfree.png')) \ No newline at end of file diff --git a/files/helpers/marseyfx/parser.py b/files/helpers/marseyfx/parser.py index 7f7b7710b..aec6f0427 100644 --- a/files/helpers/marseyfx/parser.py +++ b/files/helpers/marseyfx/parser.py @@ -1,145 +1,162 @@ +import random from tokenize import Token from bs4 import BeautifulSoup -from files.helpers.config.const import SITE_FULL_IMAGES +from files.helpers.config.const import EMOJI_KINDS, SITE_FULL_IMAGES from files.helpers.get import get_user from files.helpers.marseyfx.tokenizer import ArgsToken, DotToken, GroupToken, NumberLiteralToken, Tokenizer, WordToken from files.helpers.marseyfx.modifiers import 
Modified, Modifier, modifier_whitelist +from sqlalchemy.sql import func + emoji_replacers = { - '!': 'is_flipped', - '#': 'is_big', - '@': 'is_user' + '!': 'is_flipped', + '#': 'is_big', + '@': 'is_user' } class Emoji: - name: str - token: Token - is_big = False - is_flipped = False - is_user = False - modifiers: list[Modifier] - is_primary = True + name: str + token: Token + is_big = False + is_flipped = False + is_user = False + modifiers: list[Modifier] + is_primary = True + is_golden = False - def __init__(self, name: str, modifiers, token: Token, **args): - for symbol, value in emoji_replacers.items(): - if symbol in name: - name = name.replace(symbol, '') - setattr(self, value, True) + def __init__(self, name: str, modifiers, token: Token, **args): + for symbol, value in emoji_replacers.items(): + if symbol in name: + name = name.replace(symbol, '') + setattr(self, value, True) - self.name = name - self.modifiers = modifiers - self.token = token - self.is_primary = args.get('is_primary', True) + if name.endswith('random'): + kind = name.split('random')[0].title() + if kind == 'Donkeykong': kind = 'Donkey Kong' + elif kind == 'Marseyflag': kind = 'Marsey Flags' + elif kind == 'Marseyalphabet': kind = 'Marsey Alphabet' - def create_el(self, tokenizer: Tokenizer): - soup = BeautifulSoup() - el = None - if (self.is_user): - user = get_user(self.name, graceful=True) - src = None - if user: - src = f'/pp/{user.id}' - - el = soup.new_tag( - 'img', - loading='lazy', - src=src, - attrs={ - 'class': f'marseyfx-emoji marseyfx-image marseyfx-user', - } - ) - else: - el = soup.new_tag( - 'img', - loading='lazy', - src=f'{SITE_FULL_IMAGES}/e/{self.name}.webp', - attrs={ - 'class': f'marseyfx-emoji marseyfx-image', - } - ) + if kind in EMOJI_KINDS: + name = g.db.query(Emoji.name).filter_by(kind=kind).order_by(func.random()).first()[0] - soup.append(el) - el = el.wrap( - soup.new_tag('div', attrs={ - 'class': 'marseyfx-emoji-container' - }) - ) + self.name = name + self.modifiers = modifiers + self.token = token + self.is_primary = args.get('is_primary', True) + if random.random() < 0.004: + self.is_golden = True - mod = Modified(el, tokenizer) - mod.apply_modifiers(self.modifiers) + def create_el(self, tokenizer: Tokenizer): + soup = BeautifulSoup() + el = None + if (self.is_user): + user = get_user(self.name, graceful=True) + src = None + if user: + src = f'/pp/{user.id}' + + el = soup.new_tag( + 'img', + loading='lazy', + src=src, + attrs={ + 'class': f'marseyfx-emoji marseyfx-image marseyfx-user', + } + ) + else: + el = soup.new_tag( + 'img', + loading='lazy', + src=f'{SITE_FULL_IMAGES}/e/{self.name}.webp', + attrs={ + 'class': f'marseyfx-emoji marseyfx-image', + } + ) + + if self.is_golden: + el['class'].append(' golden') + + soup.append(el) + el = el.wrap( + soup.new_tag('div', attrs={ + 'class': 'marseyfx-emoji-container' + }) + ) + + mod = Modified(el, tokenizer) + mod.apply_modifiers(self.modifiers) - container_attrs = { - 'class': 'marseyfx-container', - } + container_attrs = { + 'class': 'marseyfx-container', + } - if self.is_primary: - container_attrs |= { - 'data-bs-toggle': 'tooltip', - 'title': tokenizer.str - } + if self.is_primary: + container_attrs |= { + 'data-bs-toggle': 'tooltip', + 'title': tokenizer.str + } - container = soup.new_tag('div', attrs=container_attrs) + container = soup.new_tag('div', attrs=container_attrs) - if (self.is_big): - container['class'].append(' marseyfx-big') + if (self.is_big): + container['class'].append(' marseyfx-big') - if (self.is_flipped): - 
container['class'].append(' marseyfx-flipped') + if (self.is_flipped): + container['class'].append(' marseyfx-flipped') - return mod.container.wrap(container) + return mod.container.wrap(container), mod.heavy_count def parse_emoji(tokenizer: Tokenizer): - token = tokenizer.parse_next_tokens() + token = tokenizer.parse_next_tokens() - if len(tokenizer.errors) > 0 or token is None: - return False, None, token + if len(tokenizer.errors) > 0 or token is None: + return False, None, token - emoji = parse_from_token(tokenizer, token) - print(f'Here! {emoji}') + emoji = parse_from_token(tokenizer, token) - if not emoji: - return False, None, token + if not emoji: + return False, None, token - return True, emoji, token + return True, emoji, token def parse_from_token(tokenizer: Tokenizer, token: GroupToken): - if not isinstance(token, GroupToken): - tokenizer.error('Malformed token -- Expected a group token') - return + if not isinstance(token, GroupToken): + tokenizer.error('Malformed token -- Expected a group token') + return - emoji = token.children[0] + emoji = token.children[0] - if not isinstance(emoji, WordToken) and not isinstance(emoji, NumberLiteralToken): - tokenizer.error('Malformed token -- Expected an emoji (word token) or number literal token') - return - - modifiers = [] + if not isinstance(emoji, WordToken) and not isinstance(emoji, NumberLiteralToken): + tokenizer.error('Malformed token -- Expected an emoji (word token) or number literal token') + return + + modifiers = [] - i = 1 - while i + 1 < len(token.children): - t = token.children[i] + i = 1 + while i + 1 < len(token.children): + t = token.children[i] - if not isinstance(t, DotToken): - tokenizer.error('Malformed token -- Expected a dot') - return + if not isinstance(t, DotToken): + tokenizer.error('Malformed token -- Expected a dot') + return - modifier = token.children[i + 1] - if not isinstance(modifier, WordToken): - tokenizer.error('Malformed token -- Expected a modifier name (word token)') - return - - if not modifier.value in modifier_whitelist: - tokenizer.error(f'Unknown modifier: {modifier.value}') - return + modifier = token.children[i + 1] + if not isinstance(modifier, WordToken): + tokenizer.error('Malformed token -- Expected a modifier name (word token)') + return + + if not modifier.value in modifier_whitelist: + tokenizer.error(f'Unknown modifier: {modifier.value}') + return - if not i + 2 < len(token.children) or not isinstance(token.children[i + 2], ArgsToken): - modifiers.append(Modifier(modifier.value, [])) - i += 2 - else: - args = token.children[i + 2] - modifiers.append(Modifier(modifier.value, args.children)) - i += 3 + if not i + 2 < len(token.children) or not isinstance(token.children[i + 2], ArgsToken): + modifiers.append(Modifier(modifier.value, [])) + i += 2 + else: + args = token.children[i + 2] + modifiers.append(Modifier(modifier.value, args.children)) + i += 3 - return Emoji(tokenizer.str[emoji.span[0]:emoji.span[1]], modifiers, token) \ No newline at end of file + return Emoji(tokenizer.str[emoji.span[0]:emoji.span[1]], modifiers, token) \ No newline at end of file diff --git a/files/helpers/marseyfx/tokenizer.py b/files/helpers/marseyfx/tokenizer.py index 7c4ce7dcc..16c9a739d 100644 --- a/files/helpers/marseyfx/tokenizer.py +++ b/files/helpers/marseyfx/tokenizer.py @@ -2,235 +2,235 @@ from abc import abstractmethod import re class TokenizerError: - index: int - error: str + index: int + error: str - def __init__(self, tokenizer, index: int, error: str): - self.tokenizer = tokenizer - 
self.index = index - self.error = error + def __init__(self, tokenizer, index: int, error: str): + self.tokenizer = tokenizer + self.index = index + self.error = error - def __str__(self): - return f'{self.error}\n {self.tokenizer.str}\n {" " * self.index}^' + def __str__(self): + return f'{self.error}\n {self.tokenizer.str}\n {" " * self.index}^' class Tokenizer: - str: str - index: int - errors: list[TokenizerError] + str: str + index: int + errors: list[TokenizerError] - def __init__(self, str: str): - self.str = str - self.index = 0 - self.errors = [] + def __init__(self, str: str): + self.str = str + self.index = 0 + self.errors = [] - def has_next(self): - return self.index < len(self.str) + def has_next(self): + return self.index < len(self.str) - def peek(self): - if not self.has_next(): - self.error('Unexpected end of input') - return None - return self.str[self.index] - - def eat(self): - c = self.peek() - self.index += 1 - return c - - def barf(self): - self.index -= 1 - - def error(self, error: str): - self.errors.append(TokenizerError(self, self.index, error)) + def peek(self): + if not self.has_next(): + self.error('Unexpected end of input') + return None + return self.str[self.index] + + def eat(self): + c = self.peek() + self.index += 1 + return c + + def barf(self): + self.index -= 1 + + def error(self, error: str): + self.errors.append(TokenizerError(self, self.index, error)) - def token_to_string(self, token): - return self.str[token.span[0]:token.span[1]] + def token_to_string(self, token): + return self.str[token.span[0]:token.span[1]] - def parse_next_tokens(self): - start = self.index - tokens = [] - while self.has_next(): - if self.peek() == ' ': - self.eat() - elif NumberLiteralToken.can_parse(self): - tokens.append(NumberLiteralToken.parse(self)) - elif WordToken.can_parse(self): - tokens.append(WordToken.parse(self)) - elif DotToken.can_parse(self): - tokens.append(DotToken.parse(self)) - elif ArgsToken.can_parse(self): - tokens.append(ArgsToken.parse(self)) - elif StringLiteralToken.can_parse(self): - tokens.append(StringLiteralToken.parse(self)) - else: - break + def parse_next_tokens(self): + start = self.index + tokens = [] + while self.has_next(): + if self.peek() == ' ': + self.eat() + elif NumberLiteralToken.can_parse(self): + tokens.append(NumberLiteralToken.parse(self)) + elif WordToken.can_parse(self): + tokens.append(WordToken.parse(self)) + elif DotToken.can_parse(self): + tokens.append(DotToken.parse(self)) + elif ArgsToken.can_parse(self): + tokens.append(ArgsToken.parse(self)) + elif StringLiteralToken.can_parse(self): + tokens.append(StringLiteralToken.parse(self)) + else: + break - if len(tokens) == 0: - self.error('Expected a token') - return None + if len(tokens) == 0: + self.error('Expected a token') + return None - return GroupToken((start, self.index), tokens) + return GroupToken((start, self.index), tokens) class Token: - span: tuple[int, int] + span: tuple[int, int] - def wrap(self): - if isinstance(self, GroupToken): - return self - else: - return GroupToken(self.span, [self]) + def wrap(self): + if isinstance(self, GroupToken): + return self + else: + return GroupToken(self.span, [self]) - @staticmethod - @abstractmethod - def can_parse(tokenizer: Tokenizer) -> bool: - pass + @staticmethod + @abstractmethod + def can_parse(tokenizer: Tokenizer) -> bool: + pass - @staticmethod - @abstractmethod - def parse(tokenizer: Tokenizer): - pass + @staticmethod + @abstractmethod + def parse(tokenizer: Tokenizer): + pass class WordToken(Token): - 
value: str + value: str - def __init__(self, span: tuple[int, int], value: str): - self.value = value - self.span = span + def __init__(self, span: tuple[int, int], value: str): + self.value = value + self.span = span - @staticmethod - def can_parse(tokenizer: Tokenizer): - return re.fullmatch(r'[!#@a-zA-Z]', tokenizer.peek()) + @staticmethod + def can_parse(tokenizer: Tokenizer): + return re.fullmatch(r'[!#@a-zA-Z]', tokenizer.peek()) - @staticmethod - def parse(tokenizer: Tokenizer): - start = tokenizer.index - value = '' - while tokenizer.has_next(): - if re.fullmatch(r'[!#@a-zA-Z\d]', tokenizer.peek()): - value += tokenizer.eat() - else: - break + @staticmethod + def parse(tokenizer: Tokenizer): + start = tokenizer.index + value = '' + while tokenizer.has_next(): + if re.fullmatch(r'[!#@a-zA-Z\d]', tokenizer.peek()): + value += tokenizer.eat() + else: + break - return WordToken((start, tokenizer.index), value) + return WordToken((start, tokenizer.index), value) class StringLiteralToken(Token): - value: str + value: str - def __init__(self, span: tuple[int, int], value: str): - self.value = value - self.span = span + def __init__(self, span: tuple[int, int], value: str): + self.value = value + self.span = span - @staticmethod - def can_parse(tokenizer: Tokenizer): - return tokenizer.peek() == '"' - - # i was cuddling with my fwb while writing this ;3 - @staticmethod - def parse(tokenizer: Tokenizer): - start = tokenizer.index - tokenizer.eat() - value = '' - next_escaped = False - while tokenizer.has_next(): - if tokenizer.peek() == '"' and not next_escaped: - tokenizer.eat() - break - elif tokenizer.peek() == '\\' and not next_escaped: - next_escaped = True - tokenizer.eat() - else: - value += tokenizer.eat() - next_escaped = False + @staticmethod + def can_parse(tokenizer: Tokenizer): + return tokenizer.peek() == '"' + + # i was cuddling with my fwb while writing this ;3 + @staticmethod + def parse(tokenizer: Tokenizer): + start = tokenizer.index + tokenizer.eat() + value = '' + next_escaped = False + while tokenizer.has_next(): + if tokenizer.peek() == '"' and not next_escaped: + tokenizer.eat() + break + elif tokenizer.peek() == '\\' and not next_escaped: + next_escaped = True + tokenizer.eat() + else: + value += tokenizer.eat() + next_escaped = False - return StringLiteralToken((start, tokenizer.index), value) - + return StringLiteralToken((start, tokenizer.index), value) + class NumberLiteralToken(Token): - value: float + value: float - def __init__(self, span: tuple[int, int], value: float): - self.value = value - self.span = span + def __init__(self, span: tuple[int, int], value: float): + self.value = value + self.span = span - @staticmethod - def can_parse(tokenizer: Tokenizer): - return re.fullmatch(r'[-\d]', tokenizer.peek()) - - @staticmethod - def can_parse_next(tokenizer: Tokenizer): - return re.fullmatch(r'[-\d\.]', tokenizer.peek()) + @staticmethod + def can_parse(tokenizer: Tokenizer): + return re.fullmatch(r'[-\d]', tokenizer.peek()) + + @staticmethod + def can_parse_next(tokenizer: Tokenizer): + return re.fullmatch(r'[-\d\.]', tokenizer.peek()) - @staticmethod - def parse(tokenizer: Tokenizer): - start = tokenizer.index - value = '' - while tokenizer.has_next(): - if NumberLiteralToken.can_parse_next(tokenizer): - value += tokenizer.eat() - else: - break + @staticmethod + def parse(tokenizer: Tokenizer): + start = tokenizer.index + value = '' + while tokenizer.has_next(): + if NumberLiteralToken.can_parse_next(tokenizer): + value += tokenizer.eat() + else: + break - 
try: - value = float(value) - except ValueError: - tokenizer.error('Invalid number literal') - value = 0.0 + try: + value = float(value) + except ValueError: + tokenizer.error('Invalid number literal') + value = 0.0 - return NumberLiteralToken((start, tokenizer.index), value) - - def get_float(self): - return float(self.value) + return NumberLiteralToken((start, tokenizer.index), value) + + def get_float(self): + return float(self.value) class DotToken(Token): - def __init__(self, span: tuple[int, int]): - self.span = span + def __init__(self, span: tuple[int, int]): + self.span = span - @staticmethod - def can_parse(tokenizer: Tokenizer): - return tokenizer.peek() == '.' + @staticmethod + def can_parse(tokenizer: Tokenizer): + return tokenizer.peek() == '.' - @staticmethod - def parse(tokenizer: Tokenizer): - tokenizer.eat() - return DotToken((tokenizer.index, tokenizer.index + 1)) + @staticmethod + def parse(tokenizer: Tokenizer): + tokenizer.eat() + return DotToken((tokenizer.index, tokenizer.index + 1)) class GroupToken(Token): - children: list[Token] + children: list[Token] - def __init__(self, span: tuple[int, int], children: list[Token]): - self.children = children + def __init__(self, span: tuple[int, int], children: list[Token]): + self.children = children - # this span is probably wrong tbh but idc - self.span = span + # this span is probably wrong tbh but idc + self.span = span - def unwrap(self): - if len(self.children) == 1: - return self.children[0] - else: - return self + def unwrap(self): + if len(self.children) == 1: + return self.children[0] + else: + return self class ArgsToken(Token): - children: list[GroupToken] - def __init__(self, span: tuple[int, int], children: list[Token]): - self.children = children - self.span = span + children: list[GroupToken] + def __init__(self, span: tuple[int, int], children: list[Token]): + self.children = children + self.span = span - @staticmethod - def can_parse(tokenizer: Tokenizer): - return tokenizer.peek() == '(' + @staticmethod + def can_parse(tokenizer: Tokenizer): + return tokenizer.peek() == '(' - @staticmethod - def parse(tokenizer: Tokenizer): - start = tokenizer.index - tokens = [] - while tokenizer.has_next(): - if tokenizer.peek() == '(': - tokenizer.eat() - if tokenizer.peek() == ')': - tokenizer.eat() - break - elif tokenizer.peek() == ',': - tokenizer.eat() - else: - tokens.append(tokenizer.parse_next_tokens()) + @staticmethod + def parse(tokenizer: Tokenizer): + start = tokenizer.index + tokens = [] + while tokenizer.has_next(): + if tokenizer.peek() == '(': + tokenizer.eat() + if tokenizer.peek() == ')': + tokenizer.eat() + break + elif tokenizer.peek() == ',': + tokenizer.eat() + else: + tokens.append(tokenizer.parse_next_tokens()) - return ArgsToken((start, tokenizer.index), tokens) \ No newline at end of file + return ArgsToken((start, tokenizer.index), tokens) \ No newline at end of file diff --git a/files/helpers/sanitize.py b/files/helpers/sanitize.py index 4190988e9..077aa44b6 100644 --- a/files/helpers/sanitize.py +++ b/files/helpers/sanitize.py @@ -1,9 +1,11 @@ +import copy import functools import random import re import signal from functools import partial from os import path, listdir +from typing import Union from typing_extensions import deprecated from urllib.parse import parse_qs, urlparse, unquote, ParseResult, urlencode, urlunparse import time @@ -29,6 +31,8 @@ from files.helpers.const_stateful import * from files.helpers.regex import * from files.helpers.get import * +from bs4 import Tag + TLDS = ( # 
Original gTLDs and ccTLDs 'ac','ad','ae','aero','af','ag','ai','al','am','an','ao','aq','ar','arpa','as','asia','at', 'au','aw','ax','az','ba','bb','bd','be','bf','bg','bh','bi','biz','bj','bm','bn','bo','br', @@ -272,7 +276,7 @@ def find_all_emote_endings(word): class RenderEmojisResult: emojis_used: set[str] heavy_count = 0 - tags: list[str] + tags: list[Union[str, Tag]] def __init__(self): self.emojis_used = set() @@ -283,11 +287,33 @@ class RenderEmojisResult: self.heavy_count += other.heavy_count self.tags.extend(other.tags) -def render_emojis(markup: str, **kwargs): + def db_update_count(self): + for emoji in g.db.query(Emoji).filter(Emoji.submitter_id==None, Emoji.name.in_(self.emojis_used)): + emoji.count += 1 + g.db.add(emoji) + +def render_emojis_tag(tag: Tag, **kwargs): + result = RenderEmojisResult() + tag = copy.copy(tag) + + for text_el in tag.find_all(text=True): + if not text_el.parent or text_el.parent.name in {'code', 'pre'}: + continue + res = render_emojis(text_el.text) + text_el.replace_with(*res.tags) + result.update(res) + + result.tags = [tag] + + return result + +def render_emojis(markup: Union[str, Tag], **kwargs): + if isinstance(markup, Tag): + return render_emojis_tag(markup, **kwargs) result = RenderEmojisResult() last_match_end = 0 - golden = kwargs.get('golden', True) + permit_golden = kwargs.get('permit_golden', True) permit_big = kwargs.get('permit_big', True) for emoji_match in marseyfx_emoji_regex.finditer(markup): @@ -306,8 +332,12 @@ def render_emojis(markup: str, **kwargs): if not permit_big: emoji.is_big = False - emoji_html = emoji.create_el(tokenizer) + if not permit_golden: + emoji.is_golden = False + + emoji_html, heavy_count = emoji.create_el(tokenizer) result.tags.append(emoji_html) + result.heavy_count += heavy_count if len(tokenizer.errors) > 0: soup = BeautifulSoup() @@ -595,19 +625,14 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=False, count_emojis soup = BeautifulSoup(sanitized, 'lxml') # -- EMOJI RENDERING -- - emojis_used = set() + emoji_render = render_emojis(soup) + soup = emoji_render.tags[0] - for text_el in soup.find_all(text=True): - if not text_el.parent or text_el.parent.name in {'code', 'pre'}: - continue - res = render_emojis(text_el.text) - text_el.replace_with(*res.tags) - emojis_used.update(res.emojis_used) + if emoji_render.heavy_count > 5: + error("Too many heavy emojis! 
(Max 5)") if count_emojis: - for emoji in g.db.query(Emoji).filter(Emoji.submitter_id==None, Emoji.name.in_(emojis_used)): - emoji.count += 1 - g.db.add(emoji) + emoji_render.db_update_count() # -- @ MENTIONS -- ping_count = 0 @@ -770,32 +795,32 @@ def allowed_attributes_emojis(tag, name, value): if name == 'cide' and not value: return True return False - @with_sigalrm_timeout(2) def filter_emojis_only(title, golden=True, count_emojis=False): - + # XSS warning: do not allow any html tags, otherwise someone could do something like this: + # `` because when :marsey: is rendered, it will include quotes that + # will end the attribute and allow someone to inject an evil attribute like onerror title = title.replace("\n", "").replace("\r", "").replace("\t", "").replace('<','<').replace('>','>') - title = remove_cuniform(title) - res = render_emojis(title) #old_render_emoji(title, emoji_regex2, golden, emojis_used, is_title=True) + title = strikethrough_regex.sub(r'\1\2', title) + title = bleach.clean(title, tags=['img','del','span'], attributes=allowed_attributes_emojis, protocols=['http','https']).replace('\n','') + + res = render_emojis(title, permit_big=False) #old_render_emoji(title, emoji_regex2, golden, emojis_used, is_title=True) + + if res.heavy_count > 0: + abort(400, "You can't have heavy/filter emojis in the title!") title = ''.join(map(str, res.tags)) if count_emojis: - for emoji in g.db.query(Emoji).filter(Emoji.submitter_id==None, Emoji.name.in_(emojis_used)): - emoji.count += 1 - g.db.add(emoji) + res.db_update_count() - title = strikethrough_regex.sub(r'\1\2', title) - - title = bleach.clean(title, tags=['img','del','span'], attributes=allowed_attributes_emojis, protocols=['http','https']).replace('\n','') + title = title.strip() if len(title) > POST_TITLE_HTML_LENGTH_LIMIT: abort(400, "Rendered title is too big!") - title = title.strip() - return title def is_whitelisted(domain, k):