diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/umabot/bot.py | 12 |
| -rw-r--r-- | src/umabot/rules/spam_detector.py | 50 |
| -rw-r--r-- | src/umabot/rules/umaddit_removal.py | 56 |
3 files changed, 72 insertions, 46 deletions
diff --git a/src/umabot/bot.py b/src/umabot/bot.py index 2519962..89b876a 100644 --- a/src/umabot/bot.py +++ b/src/umabot/bot.py @@ -2,8 +2,8 @@ import time import threading +from collections import OrderedDict import praw -from typing import List from http.server import HTTPServer, BaseHTTPRequestHandler from socketserver import ThreadingMixIn from loguru import logger @@ -93,7 +93,7 @@ class UmaBot: ] # Track processed submissions to avoid processing old posts - self.processed_submissions = set() + self.processed_submissions = OrderedDict() self.initialized = False self.logger.info(f"Bot initialized for r/{config.subreddit_name}") @@ -140,7 +140,7 @@ class UmaBot: if not self.initialized: self.logger.info("Initializing bot - marking existing posts as processed") for submission in new_submissions: - self.processed_submissions.add(submission.id) + self.processed_submissions[submission.id] = None self.initialized = True self.logger.info(f"Bot initialized with {len(self.processed_submissions)} existing posts marked as processed") return @@ -150,7 +150,7 @@ class UmaBot: for submission in new_submissions: if submission.id not in self.processed_submissions: truly_new_submissions.append(submission) - self.processed_submissions.add(submission.id) + self.processed_submissions[submission.id] = None if not truly_new_submissions: self.logger.debug("No truly new submissions found") @@ -224,6 +224,6 @@ class UmaBot: # Keep only the last 1000 processed submissions if len(self.processed_submissions) > 1000: # Convert to list, keep last 1000, convert back to set - submissions_list = list(self.processed_submissions) - self.processed_submissions = set(submissions_list[-1000:]) + while len(self.processed_submissions) > 1000: + self.processed_submissions.popitem(last=False) self.logger.debug(f"Cleaned up processed submissions, keeping {len(self.processed_submissions)} most recent") diff --git a/src/umabot/rules/spam_detector.py b/src/umabot/rules/spam_detector.py index 
2de48f2..861616c 100644 --- a/src/umabot/rules/spam_detector.py +++ b/src/umabot/rules/spam_detector.py @@ -1,8 +1,7 @@ """Spam detection rule for limiting posts per user per day.""" -import time from datetime import datetime, timedelta, timezone -from typing import Dict, List +from typing import Dict, Set import praw.models from .base import Rule @@ -13,7 +12,7 @@ class SpamDetector(Rule): def __init__(self, config): """Initialize the spam detector.""" super().__init__(config) - self.user_posts: Dict[str, List[tuple[float, str]]] = {} # (timestamp, post_id) + self.user_posts: Dict[str, Dict[str, float]] = {} self.max_posts = config.max_posts_per_day def should_remove(self, submission: praw.models.Submission) -> bool: @@ -23,22 +22,23 @@ class SpamDetector(Rule): username = submission.author.name current_utc = datetime.now(timezone.utc) + submission_utc = self._get_submission_utc(submission) - # Clean old posts from tracking (remove posts from previous days) self._clean_old_posts(username, current_utc) + + if submission_utc.date() != current_utc.date(): + return False - # Count current active posts in today's UTC day if username not in self.user_posts: - self.user_posts[username] = [] + self.user_posts[username] = {} - # Filter out removed posts and count active ones active_posts = self._get_active_posts(username, current_utc) - post_count = len(active_posts) - - # Add current post to tracking - self.user_posts[username].append((current_utc.timestamp(), submission.id)) + if submission.id in active_posts: + return False - # Check if this post exceeds the limit + post_count = len(active_posts) + self.user_posts[username][submission.id] = submission_utc.timestamp() + if post_count >= self.max_posts: self.logger.info( f"User {username} has posted {post_count + 1} active times today (UTC) " @@ -81,28 +81,36 @@ class SpamDetector(Rule): today_timestamp = today_start.timestamp() # Keep only posts from today - self.user_posts[username] = [ - (post_time, post_id) for 
post_time, post_id in self.user_posts[username] + self.user_posts[username] = { + post_id: post_time + for post_id, post_time in self.user_posts[username].items() if post_time >= today_timestamp - ] + } - def _get_active_posts(self, username: str, current_utc: datetime) -> List[tuple[float, str]]: + def _get_active_posts(self, username: str, current_utc: datetime) -> Set[str]: """Get active (non-removed) posts for a user.""" if username not in self.user_posts: - return [] + return set() # Get start of current UTC day today_start = current_utc.replace(hour=0, minute=0, second=0, microsecond=0) today_timestamp = today_start.timestamp() - active_posts = [] - for post_time, post_id in self.user_posts[username]: + active_posts = set() + for post_id, post_time in self.user_posts[username].items(): if post_time >= today_timestamp: - # Check if the post is still active (not removed) if self._is_post_active(post_id): - active_posts.append((post_time, post_id)) + active_posts.add(post_id) return active_posts + + def _get_submission_utc(self, submission: praw.models.Submission) -> datetime: + """Get the submission timestamp in UTC.""" + created_utc = getattr(submission, "created_utc", None) + if created_utc is None: + return datetime.now(timezone.utc) + + return datetime.fromtimestamp(created_utc, timezone.utc) def _is_post_active(self, post_id: str) -> bool: """Check if a post is still active (not removed) by checking its status.""" diff --git a/src/umabot/rules/umaddit_removal.py b/src/umabot/rules/umaddit_removal.py index 759c784..91562d4 100644 --- a/src/umabot/rules/umaddit_removal.py +++ b/src/umabot/rules/umaddit_removal.py @@ -1,4 +1,4 @@ -"""Rule to remove posts containing umaddit references.""" +"""Rule to remove posts containing umaddit and umagusher references.""" import re import praw.models @@ -6,25 +6,31 @@ from .base import Rule class UmadditRemovalRule(Rule): - """Removes posts containing umaddit subreddit references (r/umaddit, /umaddit).""" + """Removes 
posts containing umaddit or umagusher subreddit references (r/umaddit, /umaddit, r/umagusher, /umagusher).""" def __init__(self, config): - """Initialize the umaddit removal rule.""" + """Initialize the umaddit/umagusher removal rule.""" super().__init__(config) - # Regex pattern to match subreddit references to umaddit - # Matches: r/umaddit, /umaddit (case-insensitive) - # Word boundary after umaddit ensures we don't match "umaddit123" or similar + # Regex patterns to match subreddit references + # Umaddit: only matches with subreddit prefix (r/umaddit, /umaddit) + # Umagusher: matches with or without prefix, and handles whitespace variations (uma gusher, uma-gusher, etc.) self.umaddit_pattern = re.compile(r'(?:r/|/)umaddit\b', re.IGNORECASE) + # Match umagusher with prefix, without prefix, or with whitespace variations + # Pattern allows: r/umagusher, /umagusher, umagusher, uma gusher, uma-gusher, uma_gusher, umagusher123 + # Uses negative lookbehind to prevent false matches like "forumagusher" + # No word boundary at end to catch variations like "umagusher123" + self.umagusher_pattern = re.compile(r'(?<![a-z])(?:r/|/)?uma[\s\-_]?gusher', re.IGNORECASE) def should_remove(self, submission: praw.models.Submission) -> bool: - """Check if a post contains umaddit subreddit references.""" + """Check if a post contains umaddit or umagusher subreddit references.""" if not submission.author: return False - # Check if the post contains umaddit subreddit references - if self._contains_umaddit_reference(submission): + # Check if the post contains umaddit or umagusher subreddit references + reference_type = self._contains_forbidden_reference(submission) + if reference_type: self.logger.info( - f"Post by {submission.author.name} silently removed for containing umaddit subreddit reference " + f"Post by {submission.author.name} silently removed for containing {reference_type} subreddit reference " f"(post ID: {submission.id})" ) return True @@ -32,11 +38,15 @@ class 
UmadditRemovalRule(Rule): return False def get_removal_message(self, submission: praw.models.Submission) -> str: - """Get the umaddit removal message - silent removal.""" + """Get the umaddit/umagusher removal message - silent removal.""" return "" # Silent removal - no message sent - def _contains_umaddit_reference(self, submission: praw.models.Submission) -> bool: - """Check if a submission contains any umaddit subreddit references.""" + def _contains_forbidden_reference(self, submission: praw.models.Submission) -> str: + """Check if a submission contains any umaddit or umagusher subreddit references. + + Returns: + str: The type of reference found ('umaddit' or 'umagusher'), or empty string if none found. + """ try: # Get all text content to check content_to_check = [] @@ -53,19 +63,27 @@ class UmadditRemovalRule(Rule): if submission.url: content_to_check.append(submission.url) - # Check each piece of content for umaddit subreddit references + # Check each piece of content for forbidden subreddit references for content in content_to_check: if content: - # Use regex search + # Check for umaddit references match = self.umaddit_pattern.search(content) if match: self.logger.info( f"Found umaddit subreddit reference '{match.group()}' in post {submission.id}" ) - return True + return "umaddit" + + # Check for umagusher references + match = self.umagusher_pattern.search(content) + if match: + self.logger.info( + f"Found umagusher subreddit reference '{match.group()}' in post {submission.id}" + ) + return "umagusher" - return False + return "" except Exception as e: - self.logger.error(f"Error checking umaddit references for submission {submission.id}: {e}") - return False
\ No newline at end of file + self.logger.error(f"Error checking forbidden references for submission {submission.id}: {e}") + return ""
\ No newline at end of file