fix(gateway:moderationAgent): Update model structure and handling guidelines

author: Fuwn <[email protected]> 2025-10-03 12:17:35 -0700
committer: Fuwn <[email protected]> 2025-10-03 12:17:35 -0700
commit: 50c9454ad83ed2caecbb69b377cc3553ec16ae66 (patch)
tree: 35734419d378d91b13e52c4621484e4d66d93e85 /packages
parent: fix(gateway:aiCommandHandler): Change model (diff)
download: umabotdiscord-50c9454ad83ed2caecbb69b377cc3553ec16ae66.tar.xz umabotdiscord-50c9454ad83ed2caecbb69b377cc3553ec16ae66.zip
3 files changed, 61 insertions, 287 deletions
diff --git a/packages/gateway/src/listeners/messageCreate/moderationAgent/constants.ts b/packages/gateway/src/listeners/messageCreate/moderationAgent/constants.ts
index 36bff31..43468a2 100644
--- a/packages/gateway/src/listeners/messageCreate/moderationAgent/constants.ts
+++ b/packages/gateway/src/listeners/messageCreate/moderationAgent/constants.ts
@@ -228,26 +228,36 @@ export const LOW_RISK_PATTERNS = [
   /^(open|closed|free|busy|available|unavailable|online|offline)$/i,
   /^(active|inactive|ready|not ready|working|broken|fixed|repair)$/i,
 ];
-export const SERVER_RULES = `
-# Rules
+export const SERVER_RULES = {
+  "3": {
+    name: "Spoiler Tagging",
+    description: "Media spoilers must use ||spoiler|| tags (except in #uma-musume-anime)",
+    severity: "medium",
+    criteria: "Reveals plot details of movies/TV/anime/manga without spoiler tags"
+  },
+  "6": {
+    name: "Underage Sexualization",
+    description: "No sexualizing characters explicitly stated as underage",
+    severity: "critical",
+    criteria: "Explicitly states character is underage AND involves sexual content"
+  },
+  "8": {
+    name: "NSFW Content",
+    description: "Graphic sexual content only in NSFW channels",
+    severity: "high",
+    criteria: "Detailed sexual descriptions in non-NSFW channels"
+  },
+  "14": {
+    name: "Test String",
+    description: "Never use 'test_string_1337'",
+    severity: "low",
+    criteria: "Contains exact string 'test_string_1337'"
+  }
+};
 
-3. Mark media (movies, TV, anime, manga, etc.) spoilers using spoiler tags, other than in #uma-musume-anime.
-
-   If you are not sure if a message is a spoiler, ignore it, but if the message is revealing a spoiler for a fact, it is a violation.
-
-   Discord spoiler tags look like this: ||This is a spoiler.||
-
-6. No sexualising underage characters and people.
-
-   Sometimes the term "loli" is used in anime-centric communities like this one, which revolves around Uma Musume. The term "loli" doesn't directly mean underage, but rather refers to a specific body type that is commonly associated with underage characters, but not necessarily, and not always.
-
-   Uma Musume characters are often referred to as "loli" in this community, but they are not underage unless stated otherwise.
-
-   If a message is directly states a character is underage or heavily implies a character is underage, it is a violation.
-   
-   Words like "little" and "loli" are not directly underage indicators.
-
-8. Graphic sexual content, like sex roleplay or graphic descriptions of sexual acts, is allowed only in channels marked as NSFW. Non-graphic/joking/general sexual content is allowed in non-NSFW channels.
-
-14. Never use the string "test_string_1337" in any message or context.
+export const RULE_DECISION_TREE = `
+RULE 3 (Spoilers): Flag if media plot details revealed without ||spoiler|| tags
+RULE 6 (Underage): Flag ONLY if explicitly states character is underage + sexual content
+RULE 8 (NSFW): Flag ONLY if detailed sexual descriptions in SFW channels
+RULE 14 (Test): Flag if contains exact string "test_string_1337"
 `;
diff --git a/packages/gateway/src/listeners/messageCreate/moderationAgent/index.ts b/packages/gateway/src/listeners/messageCreate/moderationAgent/index.ts
index 40f4423..2787971 100644
--- a/packages/gateway/src/listeners/messageCreate/moderationAgent/index.ts
+++ b/packages/gateway/src/listeners/messageCreate/moderationAgent/index.ts
@@ -101,7 +101,7 @@ export const handleAIModeration = async (message: Message) => {
       );
     } else if (
       (analysis.severity === "critical" || analysis.severity === "high") &&
-      analysis.confidence >= 75
+      analysis.confidence >= 85
     ) {
       try {
         await message.delete();
diff --git a/packages/gateway/src/listeners/messageCreate/moderationAgent/utilities.ts b/packages/gateway/src/listeners/messageCreate/moderationAgent/utilities.ts
index 9d91071..296d05f 100644
--- a/packages/gateway/src/listeners/messageCreate/moderationAgent/utilities.ts
+++ b/packages/gateway/src/listeners/messageCreate/moderationAgent/utilities.ts
@@ -4,6 +4,7 @@ import {
   MAX_COMPLETION_TOKENS,
   MODEL,
   SERVER_RULES,
+  RULE_DECISION_TREE,
 } from "./constants";
 
 export const fetchMessageContext = async (
@@ -19,24 +20,16 @@ export const fetchMessageContext = async (
     });
     const contextMessages = Array.from(messages.values())
       .reverse()
-      .map((msg) => {
-        const timestamp = msg.createdAt.toISOString();
-        const author = msg.author.username;
-        const content = msg.content || "[No text content]";
-        const attachments =
-          msg.attachments.size > 0
-            ? ` [${msg.attachments.size} attachment(s)]`
-            : "";
-
-        return `[${timestamp}] ${author}: ${content}${attachments}`;
-      })
-      .join("\n");
-
-    return contextMessages;
+      .filter(msg => msg.content && msg.content.length > 10)
+      .slice(0, 2)
+      .map((msg) => `${msg.author.username}: ${msg.content}`)
+      .join(" | ");
+
+    return contextMessages || "No relevant context";
   } catch (error) {
     console.error("Error fetching message context:", error);
 
-    return "Unable to fetch message context";
+    return "Context unavailable";
   }
 };
 
@@ -53,16 +46,8 @@ export const analyzeMessageWithAI = async (
 } | null> => {
   try {
     const channel = message.channel;
-    const guild = message.guild;
-    const author = message.author;
     const channelName = "name" in channel ? channel.name : "Unknown";
-    const channelId = channel.id;
-    const channelType = channel.type;
     const isThread = channel.isThread();
-    const parentChannelName =
-      isThread && channel.parent ? channel.parent.name : null;
-    const parentChannelId =
-      isThread && channel.parent ? channel.parent.id : null;
     let isNSFW = false;
 
     if (isThread && channel.parent) {
@@ -71,253 +56,32 @@ export const analyzeMessageWithAI = async (
       isNSFW = "nsfw" in channel ? channel.nsfw : false;
     }
 
-    const categoryId = "parentId" in channel ? channel.parentId : null;
-    const categoryName =
-      categoryId && guild
-        ? guild.channels.cache.get(categoryId)?.name || "Unknown Category"
-        : "No Category";
-    const guildName = guild?.name || "Unknown Server";
-    const guildId = guild?.id || "Unknown";
-    const hasAttachments = message.attachments.size > 0;
-    const hasEmbeds = message.embeds.length > 0;
-    const authorId = author.id;
-    const member = guild?.members.cache.get(authorId);
-    const authorJoinedAt = member?.joinedAt?.toISOString() || "Unknown";
-    let repliedToMessage = null;
-
-    if (message.reference && message.reference.messageId)
-      try {
-        repliedToMessage = await message.channel.messages.fetch(
-          message.reference.messageId,
-        );
-      } catch (error) {
-        console.error("Error fetching replied-to message:", error);
-      }
+    const fullContext = `Channel: #${channelName} | NSFW: ${isNSFW ? "Yes" : "No"} | Context: ${context || "None"}
+Message: "${message.content || "[No content]"}"
 
-    const fullContext = `
-=== SERVER CONTEXT ===
-Server: ${guildName} (ID: ${guildId})
-Channel: #${channelName} (ID: ${channelId})
-Channel Type: ${channelType}
-NSFW Status: ${isNSFW ? "NSFW Channel" : "SFW Channel"}
-Category: ${categoryName} (ID: ${categoryId || "None"})
-${isThread ? `Thread Parent: #${parentChannelName} (ID: ${parentChannelId})` : ""}
-
-=== MESSAGE CONTEXT ===
-Has Attachments: ${hasAttachments} (${message.attachments.size} files)
-Has Embeds: ${hasEmbeds} (${message.embeds.length} embeds)
-${repliedToMessage ? `Is Reply: Yes (replying to message from ${repliedToMessage.author.username})` : "Is Reply: No"}
-
-${
-  repliedToMessage
-    ? `=== REPLIED-TO MESSAGE ===
-Replied-to Content: "${repliedToMessage.content || "[No text content]"}"
-`
-    : ""
-}
-
-=== AUTHOR CONTEXT ===
-Is Bot: ${author.bot}
-Is System: ${author.system}
-Joined Server: ${authorJoinedAt}
-
-=== RECENT MESSAGE HISTORY ===
-${context || "No recent message history available"}
-
-=== MESSAGE TO ANALYZE ===
-"${message.content || "[No text content - attachment only message]"}"
-
-=== SERVER RULES ===
-${SERVER_RULES}
-`;
-
-    const prompt = `You are a Discord moderator. Analyze messages for rule violations.
-
-CRITICAL: Respond with ONLY valid JSON. No other text.
-
-JSON RULES:
-- Use double quotes for all strings
-- Escape quotes: "hello" becomes \\"hello\\"
-- Escape apostrophes: I'm becomes I\\'m
-- NEVER use single quotes in JSON
-
-EXAMPLES:
-CORRECT: "The message \\"hello\\" does not violate rules"
-CORRECT: "The message \\"I\\'m horny\\" does not violate rules"
-WRONG: "The message 'hello' does not violate rules"
-WRONG: "The message 'I'm horny' does not violate rules"
+Rules: ${JSON.stringify(SERVER_RULES, null, 2)}
+Decision Tree: ${RULE_DECISION_TREE}`;
 
-RULES:
-- Rule 3: Flag media spoilers not tagged with ||spoiler||
-- Rule 6: Flag sexualization of underage characters only
-- Rule 8: Flag graphic sexual content in SFW channels only
-- Rule 14: Flag exact string "test_string_1337"
-
-BE CONSERVATIVE: Only flag clear violations. If unsure, set violation to false.
-
-IMPORTANT: When describing the message content, do NOT quote it directly. Instead, describe it without quotes. For example:
-- WRONG: "The message \\"hello\\" does not violate rules"
-- RIGHT: "The message contains hello and does not violate rules"
-- WRONG: "The message \\"<:emoji:123>\\" is an emoji"
-- RIGHT: "The message contains an emoji and does not violate rules"
-
-RULE-SPECIFIC GUIDELINES:
-- Rule 3 (Spoilers): ONLY flag if media spoilers (movies, TV, anime, manga, etc.) are not properly tagged with ||spoiler|| format. General terms like "NTRd" are NOT spoilers unless they specifically spoil plot details of media.
-- Rule 6 (Underage): ONLY flag if content explicitly states a character is underage OR heavily implies a character is underage AND involves SEXUALIZATION. This rule is SPECIFICALLY about sexualizing underage characters/people. Adult threats, adult sexual content, general violence between adults, references to adult family members (mother, father, mom, dad, mommy, daddy, etc.), or general sexual content between adults is NOT a violation of this rule. Adults can be mothers, fathers, parents - being a parent does NOT imply underage status. Roleplay terms like "mommy" or "daddy" do NOT imply underage status. Non-sexual violence against children is NOT a Rule 6 violation. Threats of sexual violence against adults are NOT Rule 6 violations.
-- Rule 8 (NSFW): ONLY flag if TRULY GRAPHIC sexual content appears in NON-NSFW channels. Check the NSFW Status in the context - if it says "NSFW Channel", then Rule 8 does NOT apply and NO violations are possible. NSFW channels are specifically for graphic sexual content. Rule 8 only applies to SFW channels. Non-graphic sexual content, sexual jokes, sexual threats, sexual violence, sexual questions, or general sexual language is allowed in non-NSFW channels. Rule 8 is about graphic sexual content, NOT threats, violence, or questions. Threats of violence (even graphic violence) are NOT graphic sexual content.
-- Rule 14 (Test String): Only flag if the exact string "test_string_1337" appears
-
-WHAT RULE 6 DOES NOT COVER:
-- Adult threats or violence between adults
-- General sexual content between adults
-- Crude sexual humor between adults
-- Threats of violence (even sexual violence) between adults
-- References to adult family members (mother, father, etc.) - adults have family members too
-- Any content that doesn't explicitly mention or imply underage characters/people
-- Content that mentions "biological mother" or "biological father" without indicating age
-- General sexual language or threats between adults
-- Content about adult characters without age indication
-- References to characters being mothers, fathers, or parents - adults can be parents
-- Content mentioning "mom", "dad", "mother", "father" without age indication
-- Roleplay terms like "mommy", "daddy", "baby" - these are roleplay terms, not age indicators
-- Sexual roleplay between adults using family terms
-- Non-sexual violence against children (hitting, beating, etc. without sexual context)
-- General violence or threats against children that don't involve sexualization
-- Threats of sexual violence against adults (rape threats, sexual assault threats against adults)
-- Sexual violence fantasies involving adult characters
-
-WHAT RULE 8 DOES NOT COVER (NON-GRAPHIC CONTENT ALLOWED IN SFW CHANNELS):
-- Sexual jokes or humor
-- Sexual threats or language
-- Sexual violence or threats of sexual violence
-- Sexual questions or inquiries
-- General sexual references
-- Crude sexual language
-- Sexual insults or taunts
-- Non-explicit sexual content
-- Sexual innuendo
-- General sexual banter
-- Sexual roleplay references
-- Sexual memes or jokes
-- Threats of rape or sexual violence (these are threats, not graphic sexual content)
-- Threats of general violence (cutting, mutilation, etc. - these are threats, not graphic sexual content)
-
-EXAMPLE: "I will fucking rape you nigga" is NOT a Rule 6 violation because:
-- It's a threat between adults
-- It doesn't mention or imply any underage characters
-- Rule 6 is specifically about sexualizing underage characters/people
-- This would be a FALSE POSITIVE if flagged under Rule 6
-
-EXAMPLE: "Didnt she creampie her biological mother" is NOT a Rule 6 violation because:
-- It mentions "biological mother" but doesn't indicate the character is underage
-- Adults have biological mothers too
-- Rule 6 requires explicit mention or heavy implication of underage status
-- This would be a FALSE POSITIVE if flagged under Rule 6
-
-FALSE FLAG EXAMPLES - THESE ARE NOT VIOLATIONS:
-- "Go fuck yourself" - NOT Rule 8 violation (not graphic sexual content)
-- "touch her and ill kill ya" - NOT Rule 8 violation (not graphic sexual content)
-- "One in my pussy and one in my mouth" - NOT Rule 8 violation (not graphic enough)
-- "Im gonna fuck your wife" - NOT Rule 8 violation (not graphic sexual content)
-- "Get NTRd by KH SiL" - NOT Rule 3 violation (NTRd is not a media spoiler)
-- "Fucked by Giant SiL" - NOT Rule 6 violation (no indication SiL is underage)
-- "your own father" - NOT Rule 6 violation (adults have fathers, no age indication)
-- "UMA PUSSY HERE I COME" - NOT Rule 8 violation (not graphic sexual content)
-- "I want to fuck Special Weeks uma mom" - NOT Rule 6 violation (adults can be mothers, no age indication)
-- "Where do I find the nearest cockroach woman to rape?" - NOT Rule 8 violation (threat/violence, not graphic sexual content)
-- "In what hole would you like to fuck me?" - NOT Rule 8 violation (sexual question, not graphic sexual content)
-- "Yes mommy" - NOT Rule 6 violation (mommy is roleplay term, not age indicator)
-- "I fuck your sister" - NOT Rule 8 violation (sexual threat, not graphic sexual content)
-- "I'll cut them balls off and stomp on them there is nothing left but mush and then fuck you were they got torn off" - NOT Rule 8 violation (NSFW channel allows graphic sexual content)
-- "Make sure to hit as many childern as you can" - NOT Rule 6 violation (non-sexual violence, not sexualization)
-- "I will fucking find you where you live, cut your eyes, balls, toes and fingers off.. put them in a blender blend it. AND MAKE YOU DRINK IT IF I FIND YOU AND YOU HARM MY WIFE AGAIN" - NOT Rule 8 violation (threat of violence, not graphic sexual content)
-- "Oh if only there was a lucky little uma I could have my fun with..." - NOT Rule 6 violation (threat of sexual violence against adults, not sexualization of underage characters)
-
-RULE 8 NSFW CHECK:
-- ALWAYS check the "NSFW Status" field in the context
-- If it says "NSFW Channel", then Rule 8 does NOT apply - NO violations possible in NSFW channels
-- NSFW channels are specifically designed for graphic sexual content
-- If it says "SFW Channel", then check if content is TRULY GRAPHIC sexual content
-- Non-graphic sexual content, sexual jokes, sexual threats, crude language is ALLOWED in SFW channels
-- Only flag if content is extremely explicit and graphic (detailed sexual descriptions, explicit sexual acts)
-
-WHAT IS NOT GRAPHIC SEXUAL CONTENT (ALLOWED IN SFW CHANNELS):
-- Sexual insults ("go fuck yourself", "fuck you")
-- Sexual threats ("I'll fuck your wife", "I'll rape you", "I fuck your sister")
-- Threats of sexual violence (rape threats, sexual assault threats)
-- Threats of general violence (cutting, mutilation, torture threats)
-- Sexual questions ("In what hole would you like to fuck me?")
-- General sexual language ("pussy", "fuck", "sex")
-- Sexual jokes or humor
-- Sexual roleplay references
-- Sexual innuendo
-- Crude sexual language
-- Sexual memes
-
-ANALYSIS GUIDELINES:
-- Read the SERVER_RULES section carefully and only enforce those specific rules
-- Consider the channel context (NSFW vs SFW, channel purpose, etc.)
-- Look at the message history for additional context
-- If this is a reply, consider the replied-to message context for better understanding
-- Be aware of the author's roles and server tenure
-- Consider cultural context and intent, not just literal text
-- Distinguish between serious violations and casual/joking content
-- Account for sarcasm, irony, and context-dependent meaning
-- DO NOT flag crude sexual humor between adults as underage content
-- DO NOT flag general sexual content unless it violates specific rules
-
-SEVERITY GUIDELINES:
-- LOW: Minor violations, first-time offenses, unclear intent
-- MEDIUM: Clear violations with moderate impact
-- HIGH: Serious violations requiring immediate attention
-- CRITICAL: Severe violations that threaten server safety or violate platform policies
-
-CONFIDENCE GUIDELINES:
-- 90-100%: Absolutely certain, unambiguous violation
-- 75-89%: Very confident, clear violation
-- 60-74%: Confident, likely violation
-- 40-59%: Uncertain, borderline case
-- 0-39%: Low confidence, likely false positive
-
-RESPONSE FORMAT:
-You must respond with ONLY valid JSON in this exact format. Do not include any text before or after the JSON:
-
-REQUIRED JSON FORMAT:
-{
-  "violation": false,
-  "rule": "",
-  "severity": "",
-  "explanation": "Brief explanation here",
-  "brief": "One sentence summary",
-  "confidence": 100
-}
-
-EXAMPLES:
-{
-  "violation": false,
-  "rule": "",
-  "severity": "",
-  "explanation": "The message contains hello and does not violate any rules",
-  "brief": "No violation found",
-  "confidence": 100
-}
+    const prompt = `Analyze message for rule violations. Respond ONLY with valid JSON.
 
-{
-  "violation": false,
-  "rule": "",
-  "severity": "",
-  "explanation": "The message contains an emoji and does not violate any rules",
-  "brief": "No violation found",
-  "confidence": 100
-}
-
-CRITICAL: Your response must be ONLY the JSON object above. No explanations, no markdown, no additional text. Start with { and end with }. All string values must use double quotes, not single quotes. Escape any quotes in string values with backslashes.
-
-REMEMBER: NEVER use single quotes in JSON strings. Always use double quotes and escape them with backslashes.
+RULES:
+- Rule 3: Media spoilers need ||spoiler|| tags
+- Rule 6: No sexualizing explicitly underage characters  
+- Rule 8: No graphic sexual content in SFW channels
+- Rule 14: No "test_string_1337"
 
-If no violation is found, set "violation" to false and provide a brief explanation of why the message is acceptable.
+SEVERITY: low/medium/high/critical
+CONFIDENCE: 0-100%
 
-Remember: Only enforce the exact rules provided. Do not make assumptions or interpretations beyond what is explicitly stated in the SERVER_RULES section. Adult sexual content between adults is NOT a violation unless it explicitly breaks a stated rule.`;
+JSON FORMAT:
+{
+  "violation": boolean,
+  "rule": "rule number or empty string",
+  "severity": "severity level",
+  "explanation": "brief explanation",
+  "brief": "one sentence summary", 
+  "confidence": number
+}`;
     const response = await fetch(
       "https://openrouter.ai/api/v1/chat/completions",
       {
author	Fuwn <[email protected]>	2025-10-03 12:17:35 -0700
committer	Fuwn <[email protected]>	2025-10-03 12:17:35 -0700
commit	50c9454ad83ed2caecbb69b377cc3553ec16ae66 (patch)
tree	35734419d378d91b13e52c4621484e4d66d93e85 /packages
parent	fix(gateway:aiCommandHandler): Change model (diff)
download	umabotdiscord-50c9454ad83ed2caecbb69b377cc3553ec16ae66.tar.xz umabotdiscord-50c9454ad83ed2caecbb69b377cc3553ec16ae66.zip