remove markdown

master
Hey Moon 2022-05-08 10:21:05 -05:00
parent 39d1aac958
commit bbe412b81c
1 changed file with 11 additions and 0 deletions

View File

@ -12,6 +12,8 @@ from psaw import PushshiftAPI
from os.path import exists, join, realpath, split
import langdetect
from RDramaAPIInterface import RDramaAPIInterface
from bs4 import BeautifulSoup
from markdown import markdown
# Words treated as banned in a post. NOTE(review): the filtering code is not
# visible in this hunk; presumably posts containing any of these are skipped.
BANNED_WORDS_IN_POST = ['comment', 'promotion']
# Threshold applied to language detection results. NOTE(review): units and the
# comparison direction are not visible here; confirm at the use site.
LANGUAGE_DETECTION_ACCURACY_THRESHOLD = 10
@ -97,6 +99,12 @@ def get_based_submissions(subreddit, time_frame, limit):
print(f"Error while processing {submission} : {e}")
return submissions
def strip_markdown(markdown_string: str) -> str:
    """Return the plain text of a Markdown string with the markup removed.

    The input is rendered to HTML with ``markdown`` and the result is parsed
    with BeautifulSoup's ``html.parser``; every text node of the parsed tree
    is then concatenated in document order with no separator.

    Args:
        markdown_string: Markdown-formatted text.

    Returns:
        The concatenated text nodes of the rendered HTML.
    """
    html = markdown(markdown_string)
    soup = BeautifulSoup(html, "html.parser")
    # find_all(string=True) is the current spelling of the legacy
    # findAll(text=True) call; it matches the same set of text nodes.
    return "".join(soup.find_all(string=True))
def analyze_comments(submission : 'Submission'):
print(f"[{submission.id}]Retrieving Comments")
comments = pushshift_api.search_comments(subreddit=submission.subreddit.display_name, link_id=submission.id)
@ -208,7 +216,9 @@ def generate_comment_display_section(submissions : 'Tuple[float, Submission]', s
if (parent != None):
parent_body = remove_quoted_text(parent.body)
parent_body = strip_markdown(parent_body)
parent_body = parent_body.replace("\n", "")
if len(parent_body) > max_len:
parent_body = parent_body[0:max_len-3] + "..."
markdown_lines.append(f"> {parent_body} ({parent.score})")
@ -217,6 +227,7 @@ def generate_comment_display_section(submissions : 'Tuple[float, Submission]', s
comment_indent = ">"
comment_body = remove_quoted_text(comment.body)
comment_body = strip_markdown(comment_body)
comment_body = comment_body.replace("\n", "")
if len(comment_body) > max_len:
comment_body = comment_body[0:max_len-3] + "..."