remove markdown

master
Hey Moon 2022-05-08 10:21:05 -05:00
parent 39d1aac958
commit bbe412b81c
1 changed files with 11 additions and 0 deletions

View File

@ -12,6 +12,8 @@ from psaw import PushshiftAPI
from os.path import exists, join, realpath, split from os.path import exists, join, realpath, split
import langdetect import langdetect
from RDramaAPIInterface import RDramaAPIInterface from RDramaAPIInterface import RDramaAPIInterface
from bs4 import BeautifulSoup
from markdown import markdown
BANNED_WORDS_IN_POST = ['comment', 'promotion'] BANNED_WORDS_IN_POST = ['comment', 'promotion']
LANGUAGE_DETECTION_ACCURACY_THRESHOLD = 10 LANGUAGE_DETECTION_ACCURACY_THRESHOLD = 10
@ -97,6 +99,12 @@ def get_based_submissions(subreddit, time_frame, limit):
print(f"Error while processing {submission} : {e}") print(f"Error while processing {submission} : {e}")
return submissions return submissions
def strip_markdown(markdown_string):
    """Return the plain text of ``markdown_string`` with its markup removed.

    The input is rendered to HTML with ``markdown`` and parsed by
    BeautifulSoup using the ``html.parser`` backend; the strings found in
    the parsed tree are then concatenated with no separator between them.
    """
    rendered_html = markdown(markdown_string)
    # text=True selects the string nodes of the tree rather than the tags.
    text_nodes = BeautifulSoup(rendered_html, "html.parser").find_all(text=True)
    return ''.join(text_nodes)
def analyze_comments(submission : 'Submission'): def analyze_comments(submission : 'Submission'):
print(f"[{submission.id}]Retrieving Comments") print(f"[{submission.id}]Retrieving Comments")
comments = pushshift_api.search_comments(subreddit=submission.subreddit.display_name, link_id=submission.id) comments = pushshift_api.search_comments(subreddit=submission.subreddit.display_name, link_id=submission.id)
@ -208,7 +216,9 @@ def generate_comment_display_section(submissions : 'Tuple[float, Submission]', s
if (parent != None): if (parent != None):
parent_body = remove_quoted_text(parent.body) parent_body = remove_quoted_text(parent.body)
parent_body = strip_markdown(parent_body)
parent_body = parent_body.replace("\n", "") parent_body = parent_body.replace("\n", "")
if len(parent_body) > max_len: if len(parent_body) > max_len:
parent_body = parent_body[0:max_len-3] + "..." parent_body = parent_body[0:max_len-3] + "..."
markdown_lines.append(f"> {parent_body} ({parent.score})") markdown_lines.append(f"> {parent_body} ({parent.score})")
@ -217,6 +227,7 @@ def generate_comment_display_section(submissions : 'Tuple[float, Submission]', s
comment_indent = ">" comment_indent = ">"
comment_body = remove_quoted_text(comment.body) comment_body = remove_quoted_text(comment.body)
comment_body = strip_markdown(comment_body)
comment_body = comment_body.replace("\n", "") comment_body = comment_body.replace("\n", "")
if len(comment_body) > max_len: if len(comment_body) > max_len:
comment_body = comment_body[0:max_len-3] + "..." comment_body = comment_body[0:max_len-3] + "..."