class MediaUsage(Base):
    """A single reference from a post or a comment to a stored media file.

    Each row carries its own ``deleted_utc`` / ``removed_utc`` stamps; a row
    with both unset is treated as a live usage by the cleanup cron.
    """

    __tablename__ = "media_usages"

    id = Column(Integer, primary_key=True)
    # Referenced file, keyed on Media.filename.
    filename = Column(String, ForeignKey("media.filename"))
    # Referencing content.
    # NOTE(review): presumably exactly one of these two is set -- confirm.
    post_id = Column(Integer, ForeignKey("posts.id"))
    comment_id = Column(Integer, ForeignKey("comments.id"))
    # Unix timestamps; deleted_utc / removed_utc stay NULL while the
    # referencing content is neither deleted nor removed.
    created_utc = Column(Integer)
    deleted_utc = Column(Integer)
    removed_utc = Column(Integer)

    media = relationship("Media")

    def __init__(self, *args, **kwargs):
        # Stamp the creation time unless the caller supplied one explicitly.
        kwargs.setdefault("created_utc", int(time.time()))
        super().__init__(*args, **kwargs)

    def __repr__(self):
        return f"<{type(self).__name__}(id={self.id})>"
def _cleanup_videos():
    """Run the video cleanup script, then report media that is no longer used.

    A file is reported when it has no live usage (a usage with neither
    ``deleted_utc`` nor ``removed_utc`` set) and at least one of its usages
    was deleted or removed more than 30 days ago.

    This only prints the candidates and the total size that would be
    reclaimed; nothing is deleted here.
    """
    subprocess.call("scripts/cleanup_videos.sh", timeout=3000)

    db = db_session()
    try:
        # Filenames that still have at least one live usage. Left as a
        # query so it is rendered as a subquery instead of being fetched
        # into Python and bound back as one parameter per filename.
        in_use = db.query(MediaUsage.filename).filter_by(
            deleted_utc=None,
            removed_utc=None,
        )

        one_month_ago = time.time() - 2592000  # 30 days, in seconds

        # distinct(): the join yields one row per usage, so a file with
        # several stale usages would otherwise be printed and counted
        # more than once in the total.
        to_delete = db.query(MediaUsage.filename, Media.size).join(MediaUsage.media).filter(
            MediaUsage.filename.notin_(in_use),
            or_(
                MediaUsage.deleted_utc < one_month_ago,
                MediaUsage.removed_utc < one_month_ago,
            ),
        ).distinct().order_by(Media.size.desc())

        total_saved = 0
        for filename, size in to_delete:
            total_saved += size
            print(filename, humanize.naturalsize(size, binary=True), flush=True)
    finally:
        # Release the session's connection even when the query fails.
        db.close()

    total_saved = humanize.naturalsize(total_saved, binary=True)
    print(f"Total saved: {total_saved}", flush=True)

' + media = g.db.get(Media, filename) + if media: + if obj: + if not obj.id: raise Exception("The thing that never happens happened again") + if str(obj.__class__) == "": + existing = g.db.query(MediaUsage.id).filter_by(filename=filename, post_id=obj.id).one_or_none() + if not existing: + media_usage = MediaUsage(filename=filename) + media_usage.post_id = obj.id + g.db.add(media_usage) + else: + existing = g.db.query(MediaUsage.id).filter_by(filename=filename, comment_id=obj.id).one_or_none() + if not existing: + media_usage = MediaUsage(filename=filename) + media_usage.comment_id = obj.id + g.db.add(media_usage) + + if media.posterurl: + return f'

' return f'

' audio_regex_extensions = '|'.join(AUDIO_FORMATS) diff --git a/files/helpers/sanitize.py b/files/helpers/sanitize.py index 339ec435c..e81fab1fe 100644 --- a/files/helpers/sanitize.py +++ b/files/helpers/sanitize.py @@ -487,7 +487,7 @@ def sanitize(sanitized, golden=True, limit_pings=0, showmore=False, count_emojis sanitized = sanitized.replace('&','&') - sanitized = video_sub_regex.sub(video_sub_regex_matcher, sanitized) + sanitized = video_sub_regex.sub(lambda match: video_sub_regex_matcher(match, obj), sanitized) sanitized = audio_sub_regex.sub(r'', sanitized) if count_emojis: diff --git a/files/routes/admin.py b/files/routes/admin.py index 2241a8727..c6d1d492e 100644 --- a/files/routes/admin.py +++ b/files/routes/admin.py @@ -245,6 +245,11 @@ def revert_actions(v, username): for item in posts + comments: item.is_banned = False + + for media_usage in item.media_usages: + media_usage.removed_utc = None + g.db.add(media_usage) + item.ban_reason = None item.is_approved = v.id g.db.add(item) @@ -1377,6 +1382,11 @@ def unprogstack_comment(comment_id, v): def remove_post(post_id, v): post = get_post(post_id) post.is_banned = True + + for media_usage in post.media_usages: + media_usage.removed_utc = time.time() + g.db.add(media_usage) + post.is_approved = None if not FEATURES['AWARDS'] or not post.pinned or not post.pinned.endswith(PIN_AWARD_TEXT): @@ -1423,6 +1433,11 @@ def approve_post(post_id, v): g.db.add(ma) post.is_banned = False + + for media_usage in post.media_usages: + media_usage.removed_utc = None + g.db.add(media_usage) + post.ban_reason = None post.is_approved = v.id @@ -1595,6 +1610,11 @@ def remove_comment(c_id, v): comment = get_comment(c_id) comment.is_banned = True + + for media_usage in comment.media_usages: + media_usage.removed_utc = time.time() + g.db.add(media_usage) + comment.is_approved = None comment.ban_reason = v.username g.db.add(comment) @@ -1633,6 +1653,11 @@ def approve_comment(c_id, v): g.db.add(ma) comment.is_banned = False + + for 
media_usage in comment.media_usages: + media_usage.removed_utc = None + g.db.add(media_usage) + comment.ban_reason = None comment.is_approved = v.id @@ -1721,6 +1746,11 @@ def admin_nuke_user(v): continue post.is_banned = True + + for media_usage in post.media_usages: + media_usage.removed_utc = time.time() + g.db.add(media_usage) + post.ban_reason = v.username g.db.add(post) @@ -1729,6 +1759,11 @@ def admin_nuke_user(v): continue comment.is_banned = True + + for media_usage in comment.media_usages: + media_usage.removed_utc = time.time() + g.db.add(media_usage) + comment.ban_reason = v.username g.db.add(comment) @@ -1757,6 +1792,11 @@ def admin_nunuke_user(v): continue post.is_banned = False + + for media_usage in post.media_usages: + media_usage.removed_utc = None + g.db.add(media_usage) + post.ban_reason = None post.is_approved = v.id g.db.add(post) @@ -1766,6 +1806,11 @@ def admin_nunuke_user(v): continue comment.is_banned = False + + for media_usage in comment.media_usages: + media_usage.removed_utc = None + g.db.add(media_usage) + comment.ban_reason = None comment.is_approved = v.id g.db.add(comment) diff --git a/files/routes/comments.py b/files/routes/comments.py index a1b5cce04..dba1283a5 100644 --- a/files/routes/comments.py +++ b/files/routes/comments.py @@ -232,6 +232,7 @@ def comment(v): c.upvotes = 1 + g.db.add(c) body_html = sanitize(body, limit_pings=5, showmore=(not v.hieroglyphs), count_emojis=not v.marsify, commenters_ping_post_id=commenters_ping_post_id, obj=c, author=v) if post_target.id not in ADMIGGER_THREADS and not (v.chud and v.chud_phrase.lower() in body.lower()): @@ -256,7 +257,6 @@ def comment(v): c.body_html = body_html - g.db.add(c) g.db.flush() if not posting_to_post and v.admin_level >= PERMS['ADMIN_NOTES'] and request.values.get('admin_note') == 'true' : @@ -277,6 +277,11 @@ def comment(v): if not complies_with_chud(c): c.is_banned = True + + for media_usage in c.media_usages: + media_usage.removed_utc = time.time() + 
g.db.add(media_usage) + c.ban_reason = "AutoJanny for lack of chud phrase" g.db.add(c) @@ -425,6 +430,10 @@ def delete_comment(cid, v): v.comment_count -= 1 g.db.add(v) + for media_usage in c.media_usages: + media_usage.deleted_utc = c.deleted_utc + g.db.add(media_usage) + cache.delete_memoized(comment_idlist) if c.parent_post: @@ -459,6 +468,10 @@ def undelete_comment(cid, v): v.comment_count += 1 g.db.add(v) + for media_usage in c.media_usages: + media_usage.deleted_utc = None + g.db.add(media_usage) + cache.delete_memoized(comment_idlist) if c.parent_post: diff --git a/files/routes/posts.py b/files/routes/posts.py index beffb2822..e42e10152 100644 --- a/files/routes/posts.py +++ b/files/routes/posts.py @@ -641,6 +641,7 @@ def submit_post(v, hole=None): p.title_html = title_html + g.db.add(p) body_html = sanitize(body, count_emojis=True, limit_pings=100, obj=p, author=v) if v.hieroglyphs and not p.distinguished and marseyaward_body_regex.search(body_html): @@ -684,7 +685,7 @@ def submit_post(v, hole=None): copyfile(name, name2) p.thumburl = process_image(name2, v, resize=199) elif file.content_type.startswith('video/'): - p.url, p.posterurl, name = process_video(file, v) + p.url, p.posterurl, name = process_video(file, v, post=p) if p.posterurl: name2 = name.replace('.webp', 'r.webp') copyfile(name, name2) @@ -693,9 +694,27 @@ def submit_post(v, hole=None): p.url = process_audio(file, v) else: stop(415) + elif p.url and p.url.startswith(SITE_FULL_VIDEOS): + filename = p.url.split(SITE_FULL_VIDEOS)[0] + print(filename, flush=True) + media = g.db.get(Media, filename) + print(media, flush=True) + if media: + media_usage = MediaUsage( + filename=filename, + post_id=p.id, + ) + g.db.add(media_usage) + if media.posterurl: + p.posterurl = media.posterurl if not p.draft and not complies_with_chud(p): p.is_banned = True + + for media_usage in p.media_usages: + media_usage.removed_utc = time.time() + g.db.add(media_usage) + p.ban_reason = "AutoJanny for lack of chud 
-- Tracks which posts and comments reference each media file, together with
-- when the referencing content was deleted or removed, so the cleanup cron
-- can find media that is no longer used.
create table media_usages (
	id integer primary key,
	filename character varying(55) NOT NULL,
	post_id integer,
	comment_id integer,
	created_utc integer not null,
	deleted_utc integer,
	removed_utc integer
);

CREATE SEQUENCE public.media_usages_id_seq
	AS integer
	START WITH 1
	INCREMENT BY 1
	NO MINVALUE
	NO MAXVALUE
	CACHE 1;

ALTER SEQUENCE public.media_usages_id_seq OWNED BY public.media_usages.id;

ALTER TABLE ONLY public.media_usages ALTER COLUMN id SET DEFAULT nextval('public.media_usages_id_seq'::regclass);


-- The MediaUsage model declares filename as a foreign key to media.filename;
-- enforce the same constraint at the database level.
alter table media_usages
	add constraint media_usages_media_fkey foreign key (filename) references media(filename);

alter table media_usages
	add constraint media_usages_post_fkey foreign key (post_id) references posts(id);

alter table media_usages
	add constraint media_usages_comment_fkey foreign key (comment_id) references comments(id);

-- Usages are looked up per post / per comment whenever content is deleted,
-- removed or approved, and per filename by the cleanup cron.
create index media_usages_filename_idx on media_usages using btree (filename);
create index media_usages_post_id_idx on media_usages using btree (post_id);
create index media_usages_comment_id_idx on media_usages using btree (comment_id);