feat(gateway:moderationAgent): Overhaul agent

author: Fuwn <[email protected]> 2025-09-26 21:06:41 -0700
committer: Fuwn <[email protected]> 2025-09-26 21:06:41 -0700
commit: 4735be4de005f6f95eb095036191ada94aff246b (patch)
tree: b915339264e57275ffc7ed3b9e8bb0ecea1875d6 /packages
parent: fix(gateway:moderationAgent): Update guidelines (diff)
download: umabotdiscord-4735be4de005f6f95eb095036191ada94aff246b.tar.xz umabotdiscord-4735be4de005f6f95eb095036191ada94aff246b.zip
2 files changed, 179 insertions, 32 deletions
diff --git a/packages/gateway/src/listeners/moderationAgent/constants.ts b/packages/gateway/src/listeners/moderationAgent/constants.ts
index 7c83d2d..b33f1b5 100644
--- a/packages/gateway/src/listeners/moderationAgent/constants.ts
+++ b/packages/gateway/src/listeners/moderationAgent/constants.ts
@@ -8,7 +8,7 @@ export const MIN_MESSAGE_LENGTH = 15;
 export const MAX_SYMBOL_DENSITY = 0.6;
 export const MAX_COMPLETION_TOKENS = 2000;
 export const MESSAGE_HISTORY_SIZE = 0;
-export const MODEL = "microsoft/phi-4-multimodal-instruct";
+export const MODEL = "mistralai/mistral-nemo";
 export const SAFE_WORDS = new Set([
   "hello",
   "hi",
@@ -232,19 +232,21 @@ export const SERVER_RULES = `
 
 3. Mark media (movies, TV, anime, manga, etc.) spoilers using spoiler tags, other than in #uma-musume-anime.
 
-   Discord spoiler tags look like this: ||This is a spoiler||
+   If you are not sure if a message is a spoiler, ignore it, but if the message is revealing a spoiler for a fact, it is a violation.
+
+   Discord spoiler tags look like this: ||This is a spoiler.||
 
 6. No sexualising underage characters and people.
 
-   Sometimes the term "loli" is used. The term "loli" doesn't directly mean underage, but rather refers to a specific body type that is commonly associated with underage characters, but not necessarily.
+   Sometimes the term "loli" is used in anime-centric communities like this one, which revolves around Uma Musume. The term "loli" doesn't directly mean underage, but rather refers to a specific body type that is commonly associated with underage characters, but not necessarily, and not always.
+
+   Uma Musume characters are often referred to as "loli" in this community, but they are not underage unless stated otherwise.
 
    If a message is directly states a character is underage or heavily implies a character is underage, it is a violation.
    
    Words like "little" and "loli" are not directly underage indicators.
 
-8. No graphic sexual content in non-NSFW channels.
-
-   Try rating the sexual content from a scale of 1 to 100 from the perspective of a 18 year old. If the content is over a 90, it is a violation.
+8. Graphic sexual content, like sex roleplay or graphic descriptions of sexual acts, is allowed only in channels marked as NSFW. Non-graphic/joking/general sexual content is allowed in non-NSFW channels.
 
 13. Never use the string "test_string_1337" in any message or context.
 `;
diff --git a/packages/gateway/src/listeners/moderationAgent/utilities.ts b/packages/gateway/src/listeners/moderationAgent/utilities.ts
index adc3fa2..d2d0a06 100644
--- a/packages/gateway/src/listeners/moderationAgent/utilities.ts
+++ b/packages/gateway/src/listeners/moderationAgent/utilities.ts
@@ -47,33 +47,182 @@ export const analyzeMessageWithAI = async (
   confidence: number;
 } | null> => {
   try {
-    const prompt = `
-You are an AI moderator for a Discord server. Analyze the following message for rule violations.
+    const channel = message.channel;
+    const guild = message.guild;
+    const author = message.author;
+    const channelName = 'name' in channel ? channel.name : 'Unknown';
+    const channelId = channel.id;
+    const channelType = channel.type;
+    const isThread = channel.isThread();
+    const parentChannelName = isThread && channel.parent ? channel.parent.name : null;
+    const parentChannelId = isThread && channel.parent ? channel.parent.id : null;
+    let isNSFW = false;
 
-CRITICAL: ONLY use the rules provided below. Do NOT create, invent, or assume additional rules like "harassment", "violence", "threats", or "abusive content". Only flag content that violates the specific rules listed.
+    if (isThread && channel.parent) {
+      isNSFW = 'nsfw' in channel.parent ? channel.parent.nsfw : false;
+    } else {
+      isNSFW = 'nsfw' in channel ? channel.nsfw : false;
+    }
+    
+    const categoryId = 'parentId' in channel ? channel.parentId : null;
+    const categoryName = categoryId && guild ? 
+      guild.channels.cache.get(categoryId)?.name || 'Unknown Category' : 
+      'No Category';
+    const guildName = guild?.name || 'Unknown Server';
+    const guildId = guild?.id || 'Unknown';
+    const messageLength = message.content?.length || 0;
+    const hasAttachments = message.attachments.size > 0;
+    const hasEmbeds = message.embeds.length > 0;
+    const authorId = author.id;
+    const member = guild?.members.cache.get(authorId);
+    const authorRoles = member?.roles.cache.map(role => role.name).join(', ') || 'No roles';
+    const authorJoinedAt = member?.joinedAt?.toISOString() || 'Unknown';
+    const fullContext = `
+=== SERVER CONTEXT ===
+Server: ${guildName} (ID: ${guildId})
+Channel: #${channelName} (ID: ${channelId})
+Channel Type: ${channelType}
+NSFW Status: ${isNSFW ? 'NSFW Channel' : 'SFW Channel'}
+Category: ${categoryName} (ID: ${categoryId || 'None'})
+${isThread ? `Thread Parent: #${parentChannelName} (ID: ${parentChannelId})` : ''}
+
+=== MESSAGE CONTEXT ===
+Message ID: ${message.id}
+Timestamp: ${message.createdAt.toISOString()}
+Length: ${messageLength} characters
+Has Attachments: ${hasAttachments} (${message.attachments.size} files)
+Has Embeds: ${hasEmbeds} (${message.embeds.length} embeds)
+
+=== AUTHOR CONTEXT ===
+Username: ${author.username}
+Display Name: ${author.displayName}
+User ID: ${authorId}
+Is Bot: ${author.bot}
+Is System: ${author.system}
+Roles: ${authorRoles}
+Joined Server: ${authorJoinedAt}
+
+=== RECENT MESSAGE HISTORY ===
+${context || 'No recent message history available'}
+
+=== MESSAGE TO ANALYZE ===
+"${message.content || '[No text content - attachment only message]'}"
 
-SERVER RULES:
+=== SERVER RULES ===
 ${SERVER_RULES}
+`;
+
+const prompt = `You are an AI moderator for a Discord server. Your job is to analyze messages for rule violations with extreme precision and accuracy.
+
+CRITICAL INSTRUCTIONS:
+1. You MUST ONLY enforce the exact rules provided in the SERVER_RULES section above
+2. You MUST NOT make up, interpret, or assume any rules that are not explicitly stated
+3. You MUST NOT flag content based on general Discord guidelines, community standards, or your own moral judgments
+4. You MUST be conservative - only flag clear, unambiguous violations of the stated rules
+5. You MUST consider the full context including channel type, NSFW status, and message history
+6. You MUST respond with valid JSON in the exact format specified below
+7. You MUST NOT flag adult sexual content unless it explicitly violates a stated rule
+8. You MUST NOT interpret rules beyond their literal meaning
+9. You MUST NOT stretch or reinterpret rules to fit content that doesn't clearly violate them
+10. If content doesn't clearly violate a specific rule, set violation to FALSE
+
+RULE-SPECIFIC GUIDELINES:
+- Rule 3 (Spoilers): ONLY flag if media spoilers (movies, TV, anime, manga, etc.) are not properly tagged with ||spoiler|| format. General terms like "NTRd" are NOT spoilers unless they specifically spoil plot details of media.
+- Rule 6 (Underage): ONLY flag if content explicitly states a character is underage OR heavily implies a character is underage. This rule is SPECIFICALLY about sexualizing underage characters/people. Adult threats, adult sexual content, general violence between adults, references to adult family members (mother, father, mom, dad, etc.), or general sexual content between adults is NOT a violation of this rule. Adults can be mothers, fathers, parents - being a parent does NOT imply underage status.
+- Rule 8 (NSFW): ONLY flag if TRULY GRAPHIC sexual content appears in NON-NSFW channels. Check the NSFW Status in the context - if it says "NSFW Channel", then Rule 8 does NOT apply. Non-graphic sexual content, sexual jokes, sexual threats, or general sexual language is allowed in non-NSFW channels.
+- Rule 13 (Test String): Only flag if the exact string "test_string_1337" appears
+
+WHAT RULE 6 DOES NOT COVER:
+- Adult threats or violence between adults
+- General sexual content between adults
+- Crude sexual humor between adults
+- Threats of violence (even sexual violence) between adults
+- References to adult family members (mother, father, etc.) - adults have family members too
+- Any content that doesn't explicitly mention or imply underage characters/people
+- Content that mentions "biological mother" or "biological father" without indicating age
+- General sexual language or threats between adults
+- Content about adult characters without age indication
+- References to characters being mothers, fathers, or parents - adults can be parents
+- Content mentioning "mom", "dad", "mother", "father" without age indication
 
-CURRENT MESSAGE TO ANALYZE (THIS IS THE ONLY MESSAGE YOU SHOULD CHECK FOR VIOLATIONS):
-Author: ${message.author.username} (${message.author.id})
-Channel: ${"name" in message.channel ? message.channel.name : "Unknown"} (${message.channelId})
-Channel is NSFW: ${message.channel.isThread() ? (message.channel.parent && "nsfw" in message.channel.parent ? message.channel.parent.nsfw : false) : "nsfw" in message.channel ? message.channel.nsfw : false}
-Content: "${message.content || "[No text content]"}"
-Attachments: ${message.attachments.size > 0 ? message.attachments.map((a) => a.name).join(", ") : "None"}
+WHAT RULE 8 DOES NOT COVER (NON-GRAPHIC CONTENT ALLOWED IN SFW CHANNELS):
+- Sexual jokes or humor
+- Sexual threats or language
+- General sexual references
+- Crude sexual language
+- Sexual insults or taunts
+- Non-explicit sexual content
+- Sexual innuendo
+- General sexual banter
+- Sexual roleplay references
+- Sexual memes or jokes
 
-CHAT HISTORY (FOR CONTEXT ONLY - DO NOT FLAG MESSAGES BASED ON THIS):
-${context}
+EXAMPLE: "I will fucking rape you nigga" is NOT a Rule 6 violation because:
+- It's a threat between adults
+- It doesn't mention or imply any underage characters
+- Rule 6 is specifically about sexualizing underage characters/people
+- This would be a FALSE POSITIVE if flagged under Rule 6
 
-Please analyze THIS SPECIFIC MESSAGE for any rule violations. Consider:
-1. The content of the current message being analyzed (not the chat history)
-2. The specific channel this was posted in
-3. The content and any attachments of the current message only
-4. Whether THIS MESSAGE violates any of the server rules
+EXAMPLE: "Didnt she creampie her biological mother" is NOT a Rule 6 violation because:
+- It mentions "biological mother" but doesn't indicate the character is underage
+- Adults have biological mothers too
+- Rule 6 requires explicit mention or heavy implication of underage status
+- This would be a FALSE POSITIVE if flagged under Rule 6
 
-IMPORTANT: Only analyze the current message content. The chat history is provided for context but should not be the basis for flagging the current message. If the current message itself is innocent but appears in a conversation with inappropriate content, do NOT flag it.
+FALSE FLAG EXAMPLES - THESE ARE NOT VIOLATIONS:
+- "Go fuck yourself" - NOT Rule 8 violation (not graphic sexual content)
+- "touch her and ill kill ya" - NOT Rule 8 violation (not graphic sexual content)
+- "One in my pussy and one in my mouth" - NOT Rule 8 violation (not graphic enough)
+- "Im gonna fuck your wife" - NOT Rule 8 violation (not graphic sexual content)
+- "Get NTRd by KH SiL" - NOT Rule 3 violation (NTRd is not a media spoiler)
+- "Fucked by Giant SiL" - NOT Rule 6 violation (no indication SiL is underage)
+- "your own father" - NOT Rule 6 violation (adults have fathers, no age indication)
+- "UMA PUSSY HERE I COME" - NOT Rule 8 violation (not graphic sexual content)
+- "I want to fuck Special Weeks uma mom" - NOT Rule 6 violation (adults can be mothers, no age indication)
 
-Respond with a JSON object containing:
+RULE 8 NSFW CHECK:
+- ALWAYS check the "NSFW Status" field in the context
+- If it says "NSFW Channel", then Rule 8 does NOT apply - no violation possible
+- If it says "SFW Channel", then check if content is TRULY GRAPHIC sexual content
+- Non-graphic sexual content, sexual jokes, sexual threats, crude language is ALLOWED in SFW channels
+- Only flag if content is extremely explicit and graphic (detailed sexual descriptions, explicit sexual acts)
+
+WHAT IS NOT GRAPHIC SEXUAL CONTENT (ALLOWED IN SFW CHANNELS):
+- Sexual insults ("go fuck yourself", "fuck you")
+- Sexual threats ("I'll fuck your wife")
+- General sexual language ("pussy", "fuck", "sex")
+- Sexual jokes or humor
+- Sexual roleplay references
+- Sexual innuendo
+- Crude sexual language
+- Sexual memes
+
+ANALYSIS GUIDELINES:
+- Read the SERVER_RULES section carefully and only enforce those specific rules
+- Consider the channel context (NSFW vs SFW, channel purpose, etc.)
+- Look at the message history for additional context
+- Be aware of the author's roles and server tenure
+- Consider cultural context and intent, not just literal text
+- Distinguish between serious violations and casual/joking content
+- Account for sarcasm, irony, and context-dependent meaning
+- DO NOT flag crude sexual humor between adults as underage content
+- DO NOT flag general sexual content unless it violates specific rules
+
+SEVERITY GUIDELINES:
+- LOW: Minor violations, first-time offenses, unclear intent
+- MEDIUM: Clear violations with moderate impact
+- HIGH: Serious violations requiring immediate attention
+- CRITICAL: Severe violations that threaten server safety or violate platform policies
+
+CONFIDENCE GUIDELINES:
+- 90-100%: Absolutely certain, unambiguous violation
+- 75-89%: Very confident, clear violation
+- 60-74%: Confident, likely violation
+- 40-59%: Uncertain, borderline case
+- 0-39%: Low confidence, likely false positive
+
+RESPONSE FORMAT:
+You must respond with valid JSON in this exact format:
 {
   "violation": boolean,
   "rule": "Rule number and brief description if violation found, empty string if none",
@@ -85,10 +234,7 @@ Respond with a JSON object containing:
 
 If no violation is found, set "violation" to false and provide a brief explanation of why the message is acceptable.
 
-CRITICAL: Only analyze the content in quotes above (the current message). Ignore all content in the chat history section. If the current message is innocent (like "true", "yes", "no", etc.) but appears after inappropriate content in the chat history, do NOT flag it.
-
-CRITICAL: DO NOT FLAG VAGUE OR PHILOSOPHICAL CONTENT. If a message is vague, philosophical, abstract, or unclear in meaning, do NOT flag it. Only flag content that clearly violates specific rules. When in doubt, do NOT flag.
-`;
+Remember: Only enforce the exact rules provided. Do not make assumptions or interpretations beyond what is explicitly stated in the SERVER_RULES section. Adult sexual content between adults is NOT a violation unless it explicitly breaks a stated rule.`;
     const response = await fetch(
       "https://openrouter.ai/api/v1/chat/completions",
       {
@@ -102,12 +248,11 @@ CRITICAL: DO NOT FLAG VAGUE OR PHILOSOPHICAL CONTENT. If a message is vague, phi
           messages: [
             {
               role: "system",
-              content:
-                "You are a helpful AI moderator that analyzes Discord messages for rule violations. Always respond with valid JSON.",
+              content: prompt,
             },
             {
               role: "user",
-              content: prompt,
+              content: fullContext,
             },
           ],
         }),
author	Fuwn <[email protected]>	2025-09-26 21:06:41 -0700
committer	Fuwn <[email protected]>	2025-09-26 21:06:41 -0700
commit	4735be4de005f6f95eb095036191ada94aff246b (patch)
tree	b915339264e57275ffc7ed3b9e8bb0ecea1875d6 /packages
parent	fix(gateway:moderationAgent): Update guidelines (diff)
download	umabotdiscord-4735be4de005f6f95eb095036191ada94aff246b.tar.xz umabotdiscord-4735be4de005f6f95eb095036191ada94aff246b.zip