English only, improve lolcow

master
Hey Moon 2022-05-07 19:19:13 -05:00
parent 76cc5a8530
commit 5a4c8e81f6
1 changed files with 43 additions and 12 deletions

View File

@ -10,10 +10,11 @@ from praw.models import Comment, Submission
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from psaw import PushshiftAPI from psaw import PushshiftAPI
from os.path import exists, join, realpath, split from os.path import exists, join, realpath, split
import langdetect
from RDramaAPIInterface import RDramaAPIInterface from RDramaAPIInterface import RDramaAPIInterface
BANNED_WORDS_IN_POST = ['comment', 'promotion'] BANNED_WORDS_IN_POST = ['comment', 'promotion']
# Word-count cutoff used by is_submission_english: when a text that was not
# detected as English has at most this many words, the check falls back to
# the next text source instead of treating that result as final.
LANGUAGE_DETECTION_ACCURACY_THRESHOLD = 10
def get_real_filename(filename : str): def get_real_filename(filename : str):
path_to_script = realpath(__file__) path_to_script = realpath(__file__)
@ -35,22 +36,51 @@ reddit = praw.Reddit(
pushshift_api = PushshiftAPI(reddit) pushshift_api = PushshiftAPI(reddit)
def is_english(string : str) -> bool:
    """Return True when langdetect classifies *string* as English.

    Empty, missing, or whitespace-only input is reported as not English
    without calling the detector. A detector failure is also reported as
    not English instead of propagating to the caller.

    Note: langdetect is documented as non-deterministic unless
    ``DetectorFactory.seed`` is set, so borderline texts may flip between runs.
    """
    if not string or not string.strip():
        return False
    try:
        return langdetect.detect(string) == 'en'
    except langdetect.lang_detect_exception.LangDetectException:
        # Raised when the text has no usable features (e.g. only digits or
        # punctuation); such text cannot be called English.
        return False
def string_split(string):
    """Return the non-empty, space-separated words of the sanitized *string*."""
    pieces = sanitize_sentence(string).split(" ")
    # Consecutive spaces produce empty pieces; filter(None, ...) drops them.
    return list(filter(None, pieces))
def is_submission_english(submission : 'Submission') -> bool:
    """Decide whether *submission* should be treated as English.

    The sanitized title is checked first. If it is not detected as English
    but has at most LANGUAGE_DETECTION_ACCURACY_THRESHOLD words, the
    subreddit description is checked instead; if that is also not English
    and equally short, the subreddit display name decides.

    Every path returns an explicit bool (the previous if/elif/else chain
    could fall through and return None).
    """
    title = sanitize_sentence(submission.title)
    if title != "" and is_english(title):
        return True
    if len(string_split(title)) > LANGUAGE_DETECTION_ACCURACY_THRESHOLD:
        # Long enough that the non-English verdict is taken as final.
        return False

    # Treat a missing description like an empty one so the fallback chain
    # continues instead of failing on None.
    description = submission.subreddit.description or ""
    if is_english(description):
        return True
    if len(string_split(description)) > LANGUAGE_DETECTION_ACCURACY_THRESHOLD:
        return False

    # Last resort: the subreddit's display name.
    return is_english(submission.subreddit.display_name)
def sanitize_sentence(sentence):
    """Lower-case *sentence* and keep only the letters a-z and spaces."""
    allowed = 'abcdefghijklmnopqrstuvwxyz '
    kept = []
    for char in sentence.lower():
        if char in allowed:
            kept.append(char)
    return ''.join(kept)
def contains_banned_words(sentence):
    """Return True if any word of *sentence* is in BANNED_WORDS_IN_POST.

    The sentence is normalised with sanitize_sentence (lower-cased, only
    a-z and spaces kept) and split on single spaces before comparison, so
    matching is per whole word.
    """
    words = sanitize_sentence(sentence).split(" ")
    # Empty pieces from repeated spaces are harmless: they never match a banned word.
    return not set(BANNED_WORDS_IN_POST).isdisjoint(words)
def has_banned_submitter(submission : 'Submission') -> bool:
    """Return True when *submission* was posted by the "AutoModerator" account.

    A submission whose ``author`` is None is not considered banned.
    """
    author = submission.author
    # Identity check: `== None` would go through the author object's __eq__.
    if author is None:
        return False
    return author.name == "AutoModerator"
def get_based_submissions(subreddit, time_frame, limit): def get_based_submissions(subreddit, time_frame, limit):
subscriber_cache = {} subscriber_cache = {}
submissions = [] submissions = []
most_based_submission = None
most_based_score = 0 most_based_score = 0
most_relatively_based_submission = None
most_relatively_based_score = 0 most_relatively_based_score = 0
for submission in reddit.subreddit(subreddit).controversial(time_frame, limit=limit): for submission in reddit.subreddit(subreddit).controversial(time_frame, limit=limit):
try: try:
basedness = (1-submission.upvote_ratio)*submission.num_comments basedness = (1-submission.upvote_ratio)*submission.num_comments
if (submission.author.name == "AutoModerator" or contains_banned_words(submission.title)): if (has_banned_submitter(submission) or contains_banned_words(submission.title)):
continue
if (not is_submission_english(submission)):
print(f"Disregarding \"{submission.title}\" ({submission.id}): Not english")
continue continue
if (submission.subreddit not in subscriber_cache): if (submission.subreddit not in subscriber_cache):
subscriber_cache[submission.subreddit] = submission.subreddit.subscribers subscriber_cache[submission.subreddit] = submission.subreddit.subscribers
@ -63,9 +93,8 @@ def get_based_submissions(subreddit, time_frame, limit):
most_relatively_based_submission = submission most_relatively_based_submission = submission
submissions.append((basedness, relative_basedness, submission)) submissions.append((basedness, relative_basedness, submission))
print(f"(B: {basedness} RB: {relative_basedness}){submission.title}") print(f"(B: {basedness} RB: {relative_basedness}){submission.title}")
except: except Exception as e:
print(f"Error while processing {submission}") print(f"Error while processing {submission} : {e}")
return submissions return submissions
def analyze_comments(submission : 'Submission'): def analyze_comments(submission : 'Submission'):
@ -240,11 +269,11 @@ def generate_lolcow_display_section(lolcows):
markdown_lines.append(f"Number of comments: {number_of_comments}") markdown_lines.append(f"Number of comments: {number_of_comments}")
comment_angryness_scores = [-1*a[0] for a in biggest_lolcow_info[1]['comments']] comment_angryness_scores = [-1*a[0] for a in biggest_lolcow_info[1]['comments']]
average_angriness = average(comment_angryness_scores) average_angriness = average(comment_angryness_scores)
maximum_angry_info = max(lolcow_comments, key=lambda a : a[0]) maximum_angry_info = min(lolcow_comments, key=lambda a : a[0])
minimum_angry_info = min(lolcow_comments, key=lambda a : a[0]) minimum_angry_info = max(lolcow_comments, key=lambda a : a[0])
markdown_lines.append(f"Average angriness: {angriness_score_string(average_angriness)}") markdown_lines.append(f"Average angriness: {angriness_score_string(average_angriness)}")
markdown_lines.append(f"Maximum angriness: [{angriness_score_string(-1*maximum_angry_info[0])}]({get_reddit_full_url(maximum_angry_info[1]['comment'].permalink)})") markdown_lines.append(f"Maximum angriness: [{angriness_score_string(maximum_angry_info[0])}]({get_reddit_full_url(maximum_angry_info[1]['comment'].permalink)})")
markdown_lines.append(f"Minimum angriness: [{angriness_score_string(-1*minimum_angry_info[0])}]({get_reddit_full_url(minimum_angry_info[1]['comment'].permalink)})") markdown_lines.append(f"Minimum angriness: [{angriness_score_string(minimum_angry_info[0])}]({get_reddit_full_url(minimum_angry_info[1]['comment'].permalink)})")
return markdown_lines return markdown_lines
@ -373,4 +402,6 @@ else:
rdrama = RDramaAPIInterface(auth, website, timeout, https=https) rdrama = RDramaAPIInterface(auth, website, timeout, https=https)
#daily_drama_post(rdrama) #daily_drama_post(rdrama)
print(generate_submission_report(Submission(reddit, "uesnfd"), True)) print(generate_submission_report(Submission(reddit, "ukgxe8"), True))
#get_based_submissions("all", "day", 150)
#is_submission_english(Submission(reddit, "uk817z"))