diff options
| field | value | date |
|---|---|---|
| author | Fuwn <[email protected]> | 2025-10-03 12:17:35 -0700 |
| committer | Fuwn <[email protected]> | 2025-10-03 12:17:35 -0700 |
| commit | 50c9454ad83ed2caecbb69b377cc3553ec16ae66 (patch) | |
| tree | 35734419d378d91b13e52c4621484e4d66d93e85 | |
| parent | fix(gateway:aiCommandHandler): Change model (diff) | |
| download | umabotdiscord-50c9454ad83ed2caecbb69b377cc3553ec16ae66.tar.xz umabotdiscord-50c9454ad83ed2caecbb69b377cc3553ec16ae66.zip | |
fix(gateway:moderationAgent): Update model structure and handling guidelines
3 files changed, 61 insertions, 287 deletions
diff --git a/packages/gateway/src/listeners/messageCreate/moderationAgent/constants.ts b/packages/gateway/src/listeners/messageCreate/moderationAgent/constants.ts index 36bff31..43468a2 100644 --- a/packages/gateway/src/listeners/messageCreate/moderationAgent/constants.ts +++ b/packages/gateway/src/listeners/messageCreate/moderationAgent/constants.ts @@ -228,26 +228,36 @@ export const LOW_RISK_PATTERNS = [ /^(open|closed|free|busy|available|unavailable|online|offline)$/i, /^(active|inactive|ready|not ready|working|broken|fixed|repair)$/i, ]; -export const SERVER_RULES = ` -# Rules +export const SERVER_RULES = { + "3": { + name: "Spoiler Tagging", + description: "Media spoilers must use ||spoiler|| tags (except in #uma-musume-anime)", + severity: "medium", + criteria: "Reveals plot details of movies/TV/anime/manga without spoiler tags" + }, + "6": { + name: "Underage Sexualization", + description: "No sexualizing characters explicitly stated as underage", + severity: "critical", + criteria: "Explicitly states character is underage AND involves sexual content" + }, + "8": { + name: "NSFW Content", + description: "Graphic sexual content only in NSFW channels", + severity: "high", + criteria: "Detailed sexual descriptions in non-NSFW channels" + }, + "14": { + name: "Test String", + description: "Never use 'test_string_1337'", + severity: "low", + criteria: "Contains exact string 'test_string_1337'" + } +}; -3. Mark media (movies, TV, anime, manga, etc.) spoilers using spoiler tags, other than in #uma-musume-anime. - - If you are not sure if a message is a spoiler, ignore it, but if the message is revealing a spoiler for a fact, it is a violation. - - Discord spoiler tags look like this: ||This is a spoiler.|| - -6. No sexualising underage characters and people. - - Sometimes the term "loli" is used in anime-centric communities like this one, which revolves around Uma Musume. 
The term "loli" doesn't directly mean underage, but rather refers to a specific body type that is commonly associated with underage characters, but not necessarily, and not always. - - Uma Musume characters are often referred to as "loli" in this community, but they are not underage unless stated otherwise. - - If a message is directly states a character is underage or heavily implies a character is underage, it is a violation. - - Words like "little" and "loli" are not directly underage indicators. - -8. Graphic sexual content, like sex roleplay or graphic descriptions of sexual acts, is allowed only in channels marked as NSFW. Non-graphic/joking/general sexual content is allowed in non-NSFW channels. - -14. Never use the string "test_string_1337" in any message or context. +export const RULE_DECISION_TREE = ` +RULE 3 (Spoilers): Flag if media plot details revealed without ||spoiler|| tags +RULE 6 (Underage): Flag ONLY if explicitly states character is underage + sexual content +RULE 8 (NSFW): Flag ONLY if detailed sexual descriptions in SFW channels +RULE 14 (Test): Flag if contains exact string "test_string_1337" `; diff --git a/packages/gateway/src/listeners/messageCreate/moderationAgent/index.ts b/packages/gateway/src/listeners/messageCreate/moderationAgent/index.ts index 40f4423..2787971 100644 --- a/packages/gateway/src/listeners/messageCreate/moderationAgent/index.ts +++ b/packages/gateway/src/listeners/messageCreate/moderationAgent/index.ts @@ -101,7 +101,7 @@ export const handleAIModeration = async (message: Message) => { ); } else if ( (analysis.severity === "critical" || analysis.severity === "high") && - analysis.confidence >= 75 + analysis.confidence >= 85 ) { try { await message.delete(); diff --git a/packages/gateway/src/listeners/messageCreate/moderationAgent/utilities.ts b/packages/gateway/src/listeners/messageCreate/moderationAgent/utilities.ts index 9d91071..296d05f 100644 --- 
a/packages/gateway/src/listeners/messageCreate/moderationAgent/utilities.ts +++ b/packages/gateway/src/listeners/messageCreate/moderationAgent/utilities.ts @@ -4,6 +4,7 @@ import { MAX_COMPLETION_TOKENS, MODEL, SERVER_RULES, + RULE_DECISION_TREE, } from "./constants"; export const fetchMessageContext = async ( @@ -19,24 +20,16 @@ export const fetchMessageContext = async ( }); const contextMessages = Array.from(messages.values()) .reverse() - .map((msg) => { - const timestamp = msg.createdAt.toISOString(); - const author = msg.author.username; - const content = msg.content || "[No text content]"; - const attachments = - msg.attachments.size > 0 - ? ` [${msg.attachments.size} attachment(s)]` - : ""; - - return `[${timestamp}] ${author}: ${content}${attachments}`; - }) - .join("\n"); - - return contextMessages; + .filter(msg => msg.content && msg.content.length > 10) + .slice(0, 2) + .map((msg) => `${msg.author.username}: ${msg.content}`) + .join(" | "); + + return contextMessages || "No relevant context"; } catch (error) { console.error("Error fetching message context:", error); - return "Unable to fetch message context"; + return "Context unavailable"; } }; @@ -53,16 +46,8 @@ export const analyzeMessageWithAI = async ( } | null> => { try { const channel = message.channel; - const guild = message.guild; - const author = message.author; const channelName = "name" in channel ? channel.name : "Unknown"; - const channelId = channel.id; - const channelType = channel.type; const isThread = channel.isThread(); - const parentChannelName = - isThread && channel.parent ? channel.parent.name : null; - const parentChannelId = - isThread && channel.parent ? channel.parent.id : null; let isNSFW = false; if (isThread && channel.parent) { @@ -71,253 +56,32 @@ export const analyzeMessageWithAI = async ( isNSFW = "nsfw" in channel ? channel.nsfw : false; } - const categoryId = "parentId" in channel ? channel.parentId : null; - const categoryName = - categoryId && guild - ? 
guild.channels.cache.get(categoryId)?.name || "Unknown Category" - : "No Category"; - const guildName = guild?.name || "Unknown Server"; - const guildId = guild?.id || "Unknown"; - const hasAttachments = message.attachments.size > 0; - const hasEmbeds = message.embeds.length > 0; - const authorId = author.id; - const member = guild?.members.cache.get(authorId); - const authorJoinedAt = member?.joinedAt?.toISOString() || "Unknown"; - let repliedToMessage = null; - - if (message.reference && message.reference.messageId) - try { - repliedToMessage = await message.channel.messages.fetch( - message.reference.messageId, - ); - } catch (error) { - console.error("Error fetching replied-to message:", error); - } + const fullContext = `Channel: #${channelName} | NSFW: ${isNSFW ? "Yes" : "No"} | Context: ${context || "None"} +Message: "${message.content || "[No content]"}" - const fullContext = ` -=== SERVER CONTEXT === -Server: ${guildName} (ID: ${guildId}) -Channel: #${channelName} (ID: ${channelId}) -Channel Type: ${channelType} -NSFW Status: ${isNSFW ? "NSFW Channel" : "SFW Channel"} -Category: ${categoryName} (ID: ${categoryId || "None"}) -${isThread ? `Thread Parent: #${parentChannelName} (ID: ${parentChannelId})` : ""} - -=== MESSAGE CONTEXT === -Has Attachments: ${hasAttachments} (${message.attachments.size} files) -Has Embeds: ${hasEmbeds} (${message.embeds.length} embeds) -${repliedToMessage ? `Is Reply: Yes (replying to message from ${repliedToMessage.author.username})` : "Is Reply: No"} - -${ - repliedToMessage - ? 
`=== REPLIED-TO MESSAGE === -Replied-to Content: "${repliedToMessage.content || "[No text content]"}" -` - : "" -} - -=== AUTHOR CONTEXT === -Is Bot: ${author.bot} -Is System: ${author.system} -Joined Server: ${authorJoinedAt} - -=== RECENT MESSAGE HISTORY === -${context || "No recent message history available"} - -=== MESSAGE TO ANALYZE === -"${message.content || "[No text content - attachment only message]"}" - -=== SERVER RULES === -${SERVER_RULES} -`; - - const prompt = `You are a Discord moderator. Analyze messages for rule violations. - -CRITICAL: Respond with ONLY valid JSON. No other text. - -JSON RULES: -- Use double quotes for all strings -- Escape quotes: "hello" becomes \\"hello\\" -- Escape apostrophes: I'm becomes I\\'m -- NEVER use single quotes in JSON - -EXAMPLES: -CORRECT: "The message \\"hello\\" does not violate rules" -CORRECT: "The message \\"I\\'m horny\\" does not violate rules" -WRONG: "The message 'hello' does not violate rules" -WRONG: "The message 'I'm horny' does not violate rules" +Rules: ${JSON.stringify(SERVER_RULES, null, 2)} +Decision Tree: ${RULE_DECISION_TREE}`; -RULES: -- Rule 3: Flag media spoilers not tagged with ||spoiler|| -- Rule 6: Flag sexualization of underage characters only -- Rule 8: Flag graphic sexual content in SFW channels only -- Rule 14: Flag exact string "test_string_1337" - -BE CONSERVATIVE: Only flag clear violations. If unsure, set violation to false. - -IMPORTANT: When describing the message content, do NOT quote it directly. Instead, describe it without quotes. For example: -- WRONG: "The message \\"hello\\" does not violate rules" -- RIGHT: "The message contains hello and does not violate rules" -- WRONG: "The message \\"<:emoji:123>\\" is an emoji" -- RIGHT: "The message contains an emoji and does not violate rules" - -RULE-SPECIFIC GUIDELINES: -- Rule 3 (Spoilers): ONLY flag if media spoilers (movies, TV, anime, manga, etc.) are not properly tagged with ||spoiler|| format. 
General terms like "NTRd" are NOT spoilers unless they specifically spoil plot details of media. -- Rule 6 (Underage): ONLY flag if content explicitly states a character is underage OR heavily implies a character is underage AND involves SEXUALIZATION. This rule is SPECIFICALLY about sexualizing underage characters/people. Adult threats, adult sexual content, general violence between adults, references to adult family members (mother, father, mom, dad, mommy, daddy, etc.), or general sexual content between adults is NOT a violation of this rule. Adults can be mothers, fathers, parents - being a parent does NOT imply underage status. Roleplay terms like "mommy" or "daddy" do NOT imply underage status. Non-sexual violence against children is NOT a Rule 6 violation. Threats of sexual violence against adults are NOT Rule 6 violations. -- Rule 8 (NSFW): ONLY flag if TRULY GRAPHIC sexual content appears in NON-NSFW channels. Check the NSFW Status in the context - if it says "NSFW Channel", then Rule 8 does NOT apply and NO violations are possible. NSFW channels are specifically for graphic sexual content. Rule 8 only applies to SFW channels. Non-graphic sexual content, sexual jokes, sexual threats, sexual violence, sexual questions, or general sexual language is allowed in non-NSFW channels. Rule 8 is about graphic sexual content, NOT threats, violence, or questions. Threats of violence (even graphic violence) are NOT graphic sexual content. -- Rule 14 (Test String): Only flag if the exact string "test_string_1337" appears - -WHAT RULE 6 DOES NOT COVER: -- Adult threats or violence between adults -- General sexual content between adults -- Crude sexual humor between adults -- Threats of violence (even sexual violence) between adults -- References to adult family members (mother, father, etc.) 
- adults have family members too -- Any content that doesn't explicitly mention or imply underage characters/people -- Content that mentions "biological mother" or "biological father" without indicating age -- General sexual language or threats between adults -- Content about adult characters without age indication -- References to characters being mothers, fathers, or parents - adults can be parents -- Content mentioning "mom", "dad", "mother", "father" without age indication -- Roleplay terms like "mommy", "daddy", "baby" - these are roleplay terms, not age indicators -- Sexual roleplay between adults using family terms -- Non-sexual violence against children (hitting, beating, etc. without sexual context) -- General violence or threats against children that don't involve sexualization -- Threats of sexual violence against adults (rape threats, sexual assault threats against adults) -- Sexual violence fantasies involving adult characters - -WHAT RULE 8 DOES NOT COVER (NON-GRAPHIC CONTENT ALLOWED IN SFW CHANNELS): -- Sexual jokes or humor -- Sexual threats or language -- Sexual violence or threats of sexual violence -- Sexual questions or inquiries -- General sexual references -- Crude sexual language -- Sexual insults or taunts -- Non-explicit sexual content -- Sexual innuendo -- General sexual banter -- Sexual roleplay references -- Sexual memes or jokes -- Threats of rape or sexual violence (these are threats, not graphic sexual content) -- Threats of general violence (cutting, mutilation, etc. 
- these are threats, not graphic sexual content) - -EXAMPLE: "I will fucking rape you nigga" is NOT a Rule 6 violation because: -- It's a threat between adults -- It doesn't mention or imply any underage characters -- Rule 6 is specifically about sexualizing underage characters/people -- This would be a FALSE POSITIVE if flagged under Rule 6 - -EXAMPLE: "Didnt she creampie her biological mother" is NOT a Rule 6 violation because: -- It mentions "biological mother" but doesn't indicate the character is underage -- Adults have biological mothers too -- Rule 6 requires explicit mention or heavy implication of underage status -- This would be a FALSE POSITIVE if flagged under Rule 6 - -FALSE FLAG EXAMPLES - THESE ARE NOT VIOLATIONS: -- "Go fuck yourself" - NOT Rule 8 violation (not graphic sexual content) -- "touch her and ill kill ya" - NOT Rule 8 violation (not graphic sexual content) -- "One in my pussy and one in my mouth" - NOT Rule 8 violation (not graphic enough) -- "Im gonna fuck your wife" - NOT Rule 8 violation (not graphic sexual content) -- "Get NTRd by KH SiL" - NOT Rule 3 violation (NTRd is not a media spoiler) -- "Fucked by Giant SiL" - NOT Rule 6 violation (no indication SiL is underage) -- "your own father" - NOT Rule 6 violation (adults have fathers, no age indication) -- "UMA PUSSY HERE I COME" - NOT Rule 8 violation (not graphic sexual content) -- "I want to fuck Special Weeks uma mom" - NOT Rule 6 violation (adults can be mothers, no age indication) -- "Where do I find the nearest cockroach woman to rape?" - NOT Rule 8 violation (threat/violence, not graphic sexual content) -- "In what hole would you like to fuck me?" 
- NOT Rule 8 violation (sexual question, not graphic sexual content) -- "Yes mommy" - NOT Rule 6 violation (mommy is roleplay term, not age indicator) -- "I fuck your sister" - NOT Rule 8 violation (sexual threat, not graphic sexual content) -- "I'll cut them balls off and stomp on them there is nothing left but mush and then fuck you were they got torn off" - NOT Rule 8 violation (NSFW channel allows graphic sexual content) -- "Make sure to hit as many childern as you can" - NOT Rule 6 violation (non-sexual violence, not sexualization) -- "I will fucking find you where you live, cut your eyes, balls, toes and fingers off.. put them in a blender blend it. AND MAKE YOU DRINK IT IF I FIND YOU AND YOU HARM MY WIFE AGAIN" - NOT Rule 8 violation (threat of violence, not graphic sexual content) -- "Oh if only there was a lucky little uma I could have my fun with..." - NOT Rule 6 violation (threat of sexual violence against adults, not sexualization of underage characters) - -RULE 8 NSFW CHECK: -- ALWAYS check the "NSFW Status" field in the context -- If it says "NSFW Channel", then Rule 8 does NOT apply - NO violations possible in NSFW channels -- NSFW channels are specifically designed for graphic sexual content -- If it says "SFW Channel", then check if content is TRULY GRAPHIC sexual content -- Non-graphic sexual content, sexual jokes, sexual threats, crude language is ALLOWED in SFW channels -- Only flag if content is extremely explicit and graphic (detailed sexual descriptions, explicit sexual acts) - -WHAT IS NOT GRAPHIC SEXUAL CONTENT (ALLOWED IN SFW CHANNELS): -- Sexual insults ("go fuck yourself", "fuck you") -- Sexual threats ("I'll fuck your wife", "I'll rape you", "I fuck your sister") -- Threats of sexual violence (rape threats, sexual assault threats) -- Threats of general violence (cutting, mutilation, torture threats) -- Sexual questions ("In what hole would you like to fuck me?") -- General sexual language ("pussy", "fuck", "sex") -- Sexual jokes or 
humor -- Sexual roleplay references -- Sexual innuendo -- Crude sexual language -- Sexual memes - -ANALYSIS GUIDELINES: -- Read the SERVER_RULES section carefully and only enforce those specific rules -- Consider the channel context (NSFW vs SFW, channel purpose, etc.) -- Look at the message history for additional context -- If this is a reply, consider the replied-to message context for better understanding -- Be aware of the author's roles and server tenure -- Consider cultural context and intent, not just literal text -- Distinguish between serious violations and casual/joking content -- Account for sarcasm, irony, and context-dependent meaning -- DO NOT flag crude sexual humor between adults as underage content -- DO NOT flag general sexual content unless it violates specific rules - -SEVERITY GUIDELINES: -- LOW: Minor violations, first-time offenses, unclear intent -- MEDIUM: Clear violations with moderate impact -- HIGH: Serious violations requiring immediate attention -- CRITICAL: Severe violations that threaten server safety or violate platform policies - -CONFIDENCE GUIDELINES: -- 90-100%: Absolutely certain, unambiguous violation -- 75-89%: Very confident, clear violation -- 60-74%: Confident, likely violation -- 40-59%: Uncertain, borderline case -- 0-39%: Low confidence, likely false positive - -RESPONSE FORMAT: -You must respond with ONLY valid JSON in this exact format. Do not include any text before or after the JSON: - -REQUIRED JSON FORMAT: -{ - "violation": false, - "rule": "", - "severity": "", - "explanation": "Brief explanation here", - "brief": "One sentence summary", - "confidence": 100 -} - -EXAMPLES: -{ - "violation": false, - "rule": "", - "severity": "", - "explanation": "The message contains hello and does not violate any rules", - "brief": "No violation found", - "confidence": 100 -} + const prompt = `Analyze message for rule violations. Respond ONLY with valid JSON. 
-{ - "violation": false, - "rule": "", - "severity": "", - "explanation": "The message contains an emoji and does not violate any rules", - "brief": "No violation found", - "confidence": 100 -} - -CRITICAL: Your response must be ONLY the JSON object above. No explanations, no markdown, no additional text. Start with { and end with }. All string values must use double quotes, not single quotes. Escape any quotes in string values with backslashes. - -REMEMBER: NEVER use single quotes in JSON strings. Always use double quotes and escape them with backslashes. +RULES: +- Rule 3: Media spoilers need ||spoiler|| tags +- Rule 6: No sexualizing explicitly underage characters +- Rule 8: No graphic sexual content in SFW channels +- Rule 14: No "test_string_1337" -If no violation is found, set "violation" to false and provide a brief explanation of why the message is acceptable. +SEVERITY: low/medium/high/critical +CONFIDENCE: 0-100% -Remember: Only enforce the exact rules provided. Do not make assumptions or interpretations beyond what is explicitly stated in the SERVER_RULES section. Adult sexual content between adults is NOT a violation unless it explicitly breaks a stated rule.`; +JSON FORMAT: +{ + "violation": boolean, + "rule": "rule number or empty string", + "severity": "severity level", + "explanation": "brief explanation", + "brief": "one sentence summary", + "confidence": number +}`; const response = await fetch( "https://openrouter.ai/api/v1/chat/completions", { |