diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/umabot/bot.py | 12 | ||||
| -rw-r--r-- | src/umabot/rules/spam_detector.py | 50 |
2 files changed, 35 insertions, 27 deletions
diff --git a/src/umabot/bot.py b/src/umabot/bot.py index 2519962..89b876a 100644 --- a/src/umabot/bot.py +++ b/src/umabot/bot.py @@ -2,8 +2,8 @@ import time import threading +from collections import OrderedDict import praw -from typing import List from http.server import HTTPServer, BaseHTTPRequestHandler from socketserver import ThreadingMixIn from loguru import logger @@ -93,7 +93,7 @@ class UmaBot: ] # Track processed submissions to avoid processing old posts - self.processed_submissions = set() + self.processed_submissions = OrderedDict() self.initialized = False self.logger.info(f"Bot initialized for r/{config.subreddit_name}") @@ -140,7 +140,7 @@ class UmaBot: if not self.initialized: self.logger.info("Initializing bot - marking existing posts as processed") for submission in new_submissions: - self.processed_submissions.add(submission.id) + self.processed_submissions[submission.id] = None self.initialized = True self.logger.info(f"Bot initialized with {len(self.processed_submissions)} existing posts marked as processed") return @@ -150,7 +150,7 @@ class UmaBot: for submission in new_submissions: if submission.id not in self.processed_submissions: truly_new_submissions.append(submission) - self.processed_submissions.add(submission.id) + self.processed_submissions[submission.id] = None if not truly_new_submissions: self.logger.debug("No truly new submissions found") @@ -224,6 +224,6 @@ class UmaBot: # Keep only the last 1000 processed submissions if len(self.processed_submissions) > 1000: # Convert to list, keep last 1000, convert back to set - submissions_list = list(self.processed_submissions) - self.processed_submissions = set(submissions_list[-1000:]) + while len(self.processed_submissions) > 1000: + self.processed_submissions.popitem(last=False) self.logger.debug(f"Cleaned up processed submissions, keeping {len(self.processed_submissions)} most recent") diff --git a/src/umabot/rules/spam_detector.py b/src/umabot/rules/spam_detector.py index 2de48f2..861616c 100644 --- a/src/umabot/rules/spam_detector.py +++ b/src/umabot/rules/spam_detector.py @@ -1,8 +1,7 @@ """Spam detection rule for limiting posts per user per day.""" -import time from datetime import datetime, timedelta, timezone -from typing import Dict, List +from typing import Dict, Set import praw.models from .base import Rule @@ -13,7 +12,7 @@ class SpamDetector(Rule): def __init__(self, config): """Initialize the spam detector.""" super().__init__(config) - self.user_posts: Dict[str, List[tuple[float, str]]] = {} # (timestamp, post_id) + self.user_posts: Dict[str, Dict[str, float]] = {} self.max_posts = config.max_posts_per_day def should_remove(self, submission: praw.models.Submission) -> bool: @@ -23,22 +22,23 @@ class SpamDetector(Rule): username = submission.author.name current_utc = datetime.now(timezone.utc) + submission_utc = self._get_submission_utc(submission) - # Clean old posts from tracking (remove posts from previous days) self._clean_old_posts(username, current_utc) + + if submission_utc.date() != current_utc.date(): + return False - # Count current active posts in today's UTC day if username not in self.user_posts: - self.user_posts[username] = [] + self.user_posts[username] = {} - # Filter out removed posts and count active ones active_posts = self._get_active_posts(username, current_utc) - post_count = len(active_posts) - - # Add current post to tracking - self.user_posts[username].append((current_utc.timestamp(), submission.id)) + if submission.id in active_posts: + return False - # Check if this post exceeds the limit + post_count = len(active_posts) + self.user_posts[username][submission.id] = submission_utc.timestamp() + if post_count >= self.max_posts: self.logger.info( f"User {username} has posted {post_count + 1} active times today (UTC) " @@ -81,28 +81,36 @@ class SpamDetector(Rule): today_timestamp = today_start.timestamp() # Keep only posts from today - self.user_posts[username] = [ - (post_time, post_id) for post_time, post_id in self.user_posts[username] + self.user_posts[username] = { + post_id: post_time + for post_id, post_time in self.user_posts[username].items() if post_time >= today_timestamp - ] + } - def _get_active_posts(self, username: str, current_utc: datetime) -> List[tuple[float, str]]: + def _get_active_posts(self, username: str, current_utc: datetime) -> Set[str]: """Get active (non-removed) posts for a user.""" if username not in self.user_posts: - return [] + return set() # Get start of current UTC day today_start = current_utc.replace(hour=0, minute=0, second=0, microsecond=0) today_timestamp = today_start.timestamp() - active_posts = [] - for post_time, post_id in self.user_posts[username]: + active_posts = set() + for post_id, post_time in self.user_posts[username].items(): if post_time >= today_timestamp: - # Check if the post is still active (not removed) if self._is_post_active(post_id): - active_posts.append((post_time, post_id)) + active_posts.add(post_id) return active_posts + + def _get_submission_utc(self, submission: praw.models.Submission) -> datetime: + """Get the submission timestamp in UTC.""" + created_utc = getattr(submission, "created_utc", None) + if created_utc is None: + return datetime.now(timezone.utc) + + return datetime.fromtimestamp(created_utc, timezone.utc) def _is_post_active(self, post_id: str) -> bool: """Check if a post is still active (not removed) by checking its status.""" |