From bbe412b81c452b07f4669c4fc9c2b6e4d24943a0 Mon Sep 17 00:00:00 2001 From: Hey Moon Date: Sun, 8 May 2022 10:21:05 -0500 Subject: [PATCH] remove markdown --- autodrama.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/autodrama.py b/autodrama.py index cb13630..d35dedc 100644 --- a/autodrama.py +++ b/autodrama.py @@ -12,6 +12,8 @@ from psaw import PushshiftAPI from os.path import exists, join, realpath, split import langdetect from RDramaAPIInterface import RDramaAPIInterface +from bs4 import BeautifulSoup +from markdown import markdown BANNED_WORDS_IN_POST = ['comment', 'promotion'] LANGUAGE_DETECTION_ACCURACY_THRESHOLD = 10 @@ -97,6 +99,12 @@ def get_based_submissions(subreddit, time_frame, limit): print(f"Error while processing {submission} : {e}") return submissions +def strip_markdown(markdown_string): + html = markdown(markdown_string) + soup = BeautifulSoup(html, "html.parser") + text = ''.join(soup.findAll(text=True)) + return text + def analyze_comments(submission : 'Submission'): print(f"[{submission.id}]Retrieving Comments") comments = pushshift_api.search_comments(subreddit=submission.subreddit.display_name, link_id=submission.id) @@ -208,7 +216,9 @@ def generate_comment_display_section(submissions : 'Tuple[float, Submission]', s if (parent != None): parent_body = remove_quoted_text(parent.body) + parent_body = strip_markdown(parent_body) parent_body = parent_body.replace("\n", "") + if len(parent_body) > max_len: parent_body = parent_body[0:max_len-3] + "..." markdown_lines.append(f"> {parent_body} ({parent.score})") @@ -217,6 +227,7 @@ def generate_comment_display_section(submissions : 'Tuple[float, Submission]', s comment_indent = ">" comment_body = remove_quoted_text(comment.body) + comment_body = strip_markdown(comment_body) comment_body = comment_body.replace("\n", "") if len(comment_body) > max_len: comment_body = comment_body[0:max_len-3] + "..."