English only, improve lolcow
parent
76cc5a8530
commit
5a4c8e81f6
55
autodrama.py
55
autodrama.py
|
@ -10,10 +10,11 @@ from praw.models import Comment, Submission
|
||||||
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
||||||
from psaw import PushshiftAPI
|
from psaw import PushshiftAPI
|
||||||
from os.path import exists, join, realpath, split
|
from os.path import exists, join, realpath, split
|
||||||
|
import langdetect
|
||||||
from RDramaAPIInterface import RDramaAPIInterface
|
from RDramaAPIInterface import RDramaAPIInterface
|
||||||
|
|
||||||
BANNED_WORDS_IN_POST = ['comment', 'promotion']
|
BANNED_WORDS_IN_POST = ['comment', 'promotion']
|
||||||
|
LANGUAGE_DETECTION_ACCURACY_THRESHOLD = 10
|
||||||
|
|
||||||
def get_real_filename(filename : str):
|
def get_real_filename(filename : str):
|
||||||
path_to_script = realpath(__file__)
|
path_to_script = realpath(__file__)
|
||||||
|
@ -35,22 +36,51 @@ reddit = praw.Reddit(
|
||||||
|
|
||||||
pushshift_api = PushshiftAPI(reddit)
|
pushshift_api = PushshiftAPI(reddit)
|
||||||
|
|
||||||
|
def is_english(string : str) -> bool:
    """Best-effort check that *string* is English text.

    Returns False for empty input and for text that langdetect cannot
    classify, instead of letting the exception escape.
    """
    if string == '':
        return False
    try:
        return langdetect.detect(string) == 'en'
    except langdetect.LangDetectException:
        # detect() raises LangDetectException ("No features in text.")
        # for strings with no recognizable language features, e.g. text
        # that is all digits or punctuation. Treat those as not-English.
        return False
|
||||||
|
|
||||||
|
def string_split(string):
    """Tokenize *string* into its alphabetic words after sanitizing."""
    # str.split() with no separator splits on runs of whitespace and
    # drops empty tokens, so no explicit filtering is needed.
    return sanitize_sentence(string).split()
|
||||||
|
|
||||||
|
def is_submission_english(submission : Submission):
    """Heuristically decide whether a submission is in English.

    langdetect is unreliable on very short strings, so this falls back
    through progressively broader signals: submission title -> subreddit
    description -> subreddit display name, moving on whenever the
    current text is too short (word count at or below
    LANGUAGE_DETECTION_ACCURACY_THRESHOLD) for a confident non-English
    verdict. Always returns a bool (the original chain could fall
    through and return None on one path).
    """
    title = sanitize_sentence(submission.title)
    if title != "" and is_english(title):
        return True
    if len(string_split(title)) > LANGUAGE_DETECTION_ACCURACY_THRESHOLD:
        # Long title confidently judged non-English.
        return False
    # Title too short to trust; consult the subreddit's description.
    description = submission.subreddit.description
    if is_english(description):
        return True
    if len(string_split(description)) > LANGUAGE_DETECTION_ACCURACY_THRESHOLD:
        # Long description confidently judged non-English.
        return False
    # Description also too short -- last resort: the subreddit's name.
    return is_english(submission.subreddit.display_name)
|
||||||
|
|
||||||
|
def sanitize_sentence(sentence):
    """Lower-case *sentence* and keep only ASCII letters and spaces."""
    keep = 'abcdefghijklmnopqrstuvwxyz '
    return ''.join(ch for ch in sentence.lower() if ch in keep)
|
||||||
|
|
||||||
def contains_banned_words(sentence):
    """Return True if *sentence* contains any word in BANNED_WORDS_IN_POST.

    Matching is case-insensitive and ignores punctuation, via the shared
    sanitize/tokenize helpers (the previous version duplicated their
    logic inline, typo'd as "santitized_sentence").
    """
    words = string_split(sentence)
    return bool(set(BANNED_WORDS_IN_POST).intersection(words))
|
||||||
|
|
||||||
|
def has_banned_submitter(submission : Submission):
    """True when the submission was posted by AutoModerator.

    A deleted author surfaces as None in praw; treat that as not banned.
    """
    # `is None` (identity), not `== None`: PEP 8, and avoids invoking
    # any custom equality on praw's lazy model objects.
    if submission.author is None:
        return False
    return submission.author.name == "AutoModerator"
|
||||||
|
|
||||||
def get_based_submissions(subreddit, time_frame, limit):
|
def get_based_submissions(subreddit, time_frame, limit):
|
||||||
subscriber_cache = {}
|
subscriber_cache = {}
|
||||||
|
|
||||||
submissions = []
|
submissions = []
|
||||||
most_based_submission = None
|
|
||||||
most_based_score = 0
|
most_based_score = 0
|
||||||
most_relatively_based_submission = None
|
|
||||||
most_relatively_based_score = 0
|
most_relatively_based_score = 0
|
||||||
for submission in reddit.subreddit(subreddit).controversial(time_frame, limit=limit):
|
for submission in reddit.subreddit(subreddit).controversial(time_frame, limit=limit):
|
||||||
try:
|
try:
|
||||||
basedness = (1-submission.upvote_ratio)*submission.num_comments
|
basedness = (1-submission.upvote_ratio)*submission.num_comments
|
||||||
if (submission.author.name == "AutoModerator" or contains_banned_words(submission.title)):
|
if (has_banned_submitter(submission) or contains_banned_words(submission.title)):
|
||||||
|
continue
|
||||||
|
if (not is_submission_english(submission)):
|
||||||
|
print(f"Disregarding \"{submission.title}\" ({submission.id}): Not english")
|
||||||
continue
|
continue
|
||||||
if (submission.subreddit not in subscriber_cache):
|
if (submission.subreddit not in subscriber_cache):
|
||||||
subscriber_cache[submission.subreddit] = submission.subreddit.subscribers
|
subscriber_cache[submission.subreddit] = submission.subreddit.subscribers
|
||||||
|
@ -63,9 +93,8 @@ def get_based_submissions(subreddit, time_frame, limit):
|
||||||
most_relatively_based_submission = submission
|
most_relatively_based_submission = submission
|
||||||
submissions.append((basedness, relative_basedness, submission))
|
submissions.append((basedness, relative_basedness, submission))
|
||||||
print(f"(B: {basedness} RB: {relative_basedness}){submission.title}")
|
print(f"(B: {basedness} RB: {relative_basedness}){submission.title}")
|
||||||
except:
|
except Exception as e:
|
||||||
print(f"Error while processing {submission}")
|
print(f"Error while processing {submission} : {e}")
|
||||||
|
|
||||||
return submissions
|
return submissions
|
||||||
|
|
||||||
def analyze_comments(submission : 'Submission'):
|
def analyze_comments(submission : 'Submission'):
|
||||||
|
@ -240,11 +269,11 @@ def generate_lolcow_display_section(lolcows):
|
||||||
markdown_lines.append(f"Number of comments: {number_of_comments}")
|
markdown_lines.append(f"Number of comments: {number_of_comments}")
|
||||||
comment_angryness_scores = [-1*a[0] for a in biggest_lolcow_info[1]['comments']]
|
comment_angryness_scores = [-1*a[0] for a in biggest_lolcow_info[1]['comments']]
|
||||||
average_angriness = average(comment_angryness_scores)
|
average_angriness = average(comment_angryness_scores)
|
||||||
maximum_angry_info = max(lolcow_comments, key=lambda a : a[0])
|
maximum_angry_info = min(lolcow_comments, key=lambda a : a[0])
|
||||||
minimum_angry_info = min(lolcow_comments, key=lambda a : a[0])
|
minimum_angry_info = max(lolcow_comments, key=lambda a : a[0])
|
||||||
markdown_lines.append(f"Average angriness: {angriness_score_string(average_angriness)}")
|
markdown_lines.append(f"Average angriness: {angriness_score_string(average_angriness)}")
|
||||||
markdown_lines.append(f"Maximum angriness: [{angriness_score_string(-1*maximum_angry_info[0])}]({get_reddit_full_url(maximum_angry_info[1]['comment'].permalink)})")
|
markdown_lines.append(f"Maximum angriness: [{angriness_score_string(maximum_angry_info[0])}]({get_reddit_full_url(maximum_angry_info[1]['comment'].permalink)})")
|
||||||
markdown_lines.append(f"Minimum angriness: [{angriness_score_string(-1*minimum_angry_info[0])}]({get_reddit_full_url(minimum_angry_info[1]['comment'].permalink)})")
|
markdown_lines.append(f"Minimum angriness: [{angriness_score_string(minimum_angry_info[0])}]({get_reddit_full_url(minimum_angry_info[1]['comment'].permalink)})")
|
||||||
|
|
||||||
return markdown_lines
|
return markdown_lines
|
||||||
|
|
||||||
|
@ -373,4 +402,6 @@ else:
|
||||||
rdrama = RDramaAPIInterface(auth, website, timeout, https=https)
|
rdrama = RDramaAPIInterface(auth, website, timeout, https=https)
|
||||||
|
|
||||||
#daily_drama_post(rdrama)
|
#daily_drama_post(rdrama)
|
||||||
print(generate_submission_report(Submission(reddit, "uesnfd"), True))
|
print(generate_submission_report(Submission(reddit, "ukgxe8"), True))
|
||||||
|
#get_based_submissions("all", "day", 150)
|
||||||
|
#is_submission_english(Submission(reddit, "uk817z"))
|
Loading…
Reference in New Issue