import bleach
from bs4 import BeautifulSoup
from bleach.linkifier import LinkifyFilter
from urllib.parse import urlparse, ParseResult, urlunparse
from functools import partial
from .get import *
import os.path
import re

# --- cleaner configuration --------------------------------------------------

# Tags the HTML cleaner lets through; everything else is escaped by bleach.
_allowed_tags = [
    'b', 'blockquote', 'br', 'code', 'del', 'em',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
    'li', 'ol', 'p', 'pre', 'strong', 'sub', 'sup',
    'table', 'tbody', 'th', 'thead', 'td', 'tr', 'ul',
    'marquee', 'a', 'img', 'span',
]
# Historical alias: the original bound the same list to both names
# (`_allowed_tags = tags = [...]`), so keep `tags` for any external importer.
tags = _allowed_tags

# Per-tag attribute whitelist.
_allowed_attributes = {
    'a': ['href', 'title', 'rel', 'data-original-name'],
    'i': [],
    'img': ['src', 'class'],
    'span': ['style'],
}

_allowed_protocols = ['http', 'https']

# Inline CSS properties allowed on <span style="..."> — only passed to the
# link-generating cleaner below.
_allowed_styles = ['color', 'font-weight']


# filter to make all links show domain on hover
def a_modify(attrs, new=False):
    """bleach linkifier callback applied to every <a> tag.

    Opens all links in a new tab, marks off-site links (anything whose
    host is not *.rdrama.net) as nofollow/noopener, and forces https on
    absolute URLs.

    `attrs` is bleach's attribute dict keyed by (namespace, name) tuples;
    the (possibly mutated) dict is returned, per the callback contract.
    """
    raw_url = attrs.get((None, "href"), None)
    if raw_url:
        parsed_url = urlparse(raw_url)
        domain = parsed_url.netloc
        attrs[(None, "target")] = "_blank"
        if domain and not domain.endswith("rdrama.net"):
            attrs[(None, "rel")] = "nofollow noopener"

        # Force https for all external links in comments
        # (Drama already forces its own https).
        # Only rewrite absolute URLs: rebuilding a relative href through
        # ParseResult would yield a broken "https:///path" link.
        if parsed_url.netloc:
            new_url = ParseResult(scheme="https",
                                  netloc=parsed_url.netloc,
                                  path=parsed_url.path,
                                  params=parsed_url.params,
                                  query=parsed_url.query,
                                  fragment=parsed_url.fragment)
            attrs[(None, "href")] = urlunparse(new_url)
    return attrs


# Cleaner for linkgen=False input (e.g. flair). Note it receives no
# `styles`, so inline style values are stripped here.
_clean_wo_links = bleach.Cleaner(tags=_allowed_tags,
                                 attributes=_allowed_attributes,
                                 protocols=_allowed_protocols,
                                 )

# Cleaner for linkgen=True input: same whitelist plus allowed inline styles
# and a linkify pass (skipping <pre>, no email autolinking) that runs
# a_modify on every anchor.
_clean_w_links = bleach.Cleaner(tags=_allowed_tags,
                                attributes=_allowed_attributes,
                                protocols=_allowed_protocols,
                                styles=_allowed_styles,
                                filters=[partial(LinkifyFilter,
                                                 skip_tags=["pre"],
                                                 parse_email=False,
                                                 callbacks=[a_modify],
                                                 )],
                                )


def sanitize(text, linkgen=False, flair=False):
    """Sanitize user-submitted HTML and post-process it for display.

    Parameters:
        text    -- raw user-submitted HTML/markdown-rendered text.
        linkgen -- when True, autolink URLs and run the BeautifulSoup
                   post-pass (image embedding/whitelisting, disguised-link
                   prevention, table styling); when False only the plain
                   cleaner runs.
        flair   -- when True, render emojis at 20px instead of 30px.

    Returns the sanitized HTML string.
    """
    # Strip BOMs and normalize mobile YouTube links up front.
    text = text.replace("\ufeff", "").replace("m.youtube.com", "youtube.com")

    if linkgen:
        sanitized = _clean_w_links.clean(text)

        # soupify
        soup = BeautifulSoup(sanitized, features="html.parser")

        # img elements - embed whitelisted images, link-ify the rest
        for tag in soup.find_all("img"):
            url = tag.get("src", "")
            if not url:
                continue
            netloc = urlparse(url).netloc
            domain = get_domain(netloc)
            # Relative URLs (no netloc) are ours; external domains must be
            # whitelisted for thumbnails.
            if not netloc or (domain and domain.show_thumbnail):
                # Profile pictures keep their own class; everything else is
                # restyled and wrapped in a modal-expanding link.
                if "profile-pic-20" not in tag.get("class", ""):
                    tag["rel"] = "nofollow"
                    tag["style"] = "max-height: 100px; max-width: 100%;"
                    tag["class"] = "in-comment-image rounded-sm my-2"
                    link = soup.new_tag("a")
                    link["href"] = tag["src"]
                    link["rel"] = "nofollow noopener"
                    link["target"] = "_blank"
                    link["onclick"] = f"expandDesktopImage('{tag['src']}');"
                    link["data-toggle"] = "modal"
                    link["data-target"] = "#expandImageModal"
                    tag.wrap(link)
            else:
                # non-whitelisted images get replaced with links
                new_tag = soup.new_tag("a")
                new_tag.string = tag["src"]
                new_tag["href"] = tag["src"]
                new_tag["rel"] = "nofollow noopener"
                tag.replace_with(new_tag)

        # disguised link preventer: if the anchor text itself looks like a
        # URL, force it to display the real href.
        for tag in soup.find_all("a"):
            if re.match(r"https?://\S+", str(tag.string)):
                try:
                    tag.string = tag["href"]
                except KeyError:
                    tag.string = ""

        # clean up tags in code blocks: keep only plain string children
        for tag in soup.find_all("code"):
            tag.contents = [x.string for x in tag.contents if x.string]

        # whatever else happens with images, there are only two sets of
        # classes allowed
        for tag in soup.find_all("img"):
            if 'profile-pic-20' not in tag.attrs.get("class", ""):
                tag.attrs['class'] = "in-comment-image rounded-sm my-2"

        # table format
        for tag in soup.find_all("table"):
            tag.attrs['class'] = "table table-striped"
        for tag in soup.find_all("thead"):
            tag.attrs['class'] = "bg-primary text-white"

        sanitized = str(soup)
    else:
        sanitized = _clean_wo_links.clean(text)

    # Strip a properly paired <s>...</s> (escaped strike markers) — the
    # guards ensure an opening marker precedes the closing one.
    start = '<s>'
    end = '</s>'
    if start in sanitized and end in sanitized and start in sanitized.split(end)[0] and end in sanitized.split(start)[1]:
        sanitized = sanitized.replace(start, '').replace(end, '')

    # Emoji substitution: :name: becomes an inline <img> when the emoji file
    # exists on disk. Flair emojis render smaller.
    if flair:
        emojisize = 20
    else:
        emojisize = 30
    for i in re.finditer(r':(.{1,30}?):', sanitized):
        if os.path.isfile(f'/d/drama/assets/images/emojis/{i.group(1)}.gif'):
            # NOTE(review): the original replacement markup was lost in this
            # copy of the file (the f-string body is missing); reconstructed
            # from context (emojisize is otherwise unused) — confirm against
            # version control.
            sanitized = sanitized.replace(
                f':{i.group(1)}:',
                f'<img data-toggle="tooltip" title=":{i.group(1)}:" height={emojisize} src="/assets/images/emojis/{i.group(1)}.gif">')

    # Rewrite media URLs into their embeddable forms.
    sanitized = sanitized.replace("https://www.", "https://").replace("https://youtu.be/", "https://youtube.com/embed/").replace("https://music.youtube.com/watch?v=", "https://youtube.com/embed/").replace("/watch?v=", "/embed/").replace("https://open.spotify.com/", "https://open.spotify.com/embed/").replace("https://streamable.com/", "https://streamable.com/e/").replace("https://youtube.com/shorts/", "https://youtube.com/embed/")

    # Turn paragraph-wrapped embed URLs into iframes.
    # NOTE(review): both regexes and iframe templates below were lost in this
    # copy of the file; only `{url}`, `htmlsource = f'...'` and the final
    # replace survived the mangling. Reconstructed to match the URL rewrites
    # above — confirm against version control.
    for i in re.finditer(r'<p>(https://youtube\.com/embed/.*?)</p>', sanitized):
        url = i.group(1)
        replacing = f'<p>{url}</p>'
        htmlsource = f'<iframe frameborder="0" allowfullscreen src="{url}"></iframe>'
        sanitized = sanitized.replace(replacing, htmlsource)

    for i in re.finditer(r'<p>(https://(?:open\.spotify\.com/embed/|streamable\.com/e/).*?)</p>', sanitized):
        url = i.group(1)
        replacing = f'<p>{url}</p>'
        htmlsource = f'<iframe frameborder="0" allowfullscreen src="{url}"></iframe>'
        sanitized = sanitized.replace(replacing, htmlsource)

    # Normalize Twitter and Reddit URLs (old.reddit for all reddit links).
    sanitized = sanitized.replace("https://mobile.twitter.com", "https://twitter.com")
    for rd in ["https://reddit.com/", "https://new.reddit.com/", "https://www.reddit.com/", "https://redd.it/"]:
        sanitized = sanitized.replace(rd, "https://old.reddit.com/")

    # Default reddit comment links to controversial sort.
    for i in re.finditer(r'(/comments/.*?)"', sanitized):
        url = i.group(1)
        if "sort=" not in url:
            sanitized = sanitized.replace(url, f"{url}?sort=controversial")

    return sanitized