summary | refs | log | tree | commit | diff
path: root/packages
diff options
context:
space:
mode:
author: Fuwn <[email protected]> 2025-10-03 12:17:35 -0700
committer: Fuwn <[email protected]> 2025-10-03 12:17:35 -0700
commit: 50c9454ad83ed2caecbb69b377cc3553ec16ae66 (patch)
tree: 35734419d378d91b13e52c4621484e4d66d93e85 /packages
parent: fix(gateway:aiCommandHandler): Change model (diff)
download: umabotdiscord-50c9454ad83ed2caecbb69b377cc3553ec16ae66.tar.xz
umabotdiscord-50c9454ad83ed2caecbb69b377cc3553ec16ae66.zip
fix(gateway:moderationAgent): Update model structure and handling guidelines
Diffstat (limited to 'packages')
-rw-r--r-- packages/gateway/src/listeners/messageCreate/moderationAgent/constants.ts 52
-rw-r--r-- packages/gateway/src/listeners/messageCreate/moderationAgent/index.ts 2
-rw-r--r-- packages/gateway/src/listeners/messageCreate/moderationAgent/utilities.ts 294
3 files changed, 61 insertions, 287 deletions
diff --git a/packages/gateway/src/listeners/messageCreate/moderationAgent/constants.ts b/packages/gateway/src/listeners/messageCreate/moderationAgent/constants.ts
index 36bff31..43468a2 100644
--- a/packages/gateway/src/listeners/messageCreate/moderationAgent/constants.ts
+++ b/packages/gateway/src/listeners/messageCreate/moderationAgent/constants.ts
@@ -228,26 +228,36 @@ export const LOW_RISK_PATTERNS = [
/^(open|closed|free|busy|available|unavailable|online|offline)$/i,
/^(active|inactive|ready|not ready|working|broken|fixed|repair)$/i,
];
-export const SERVER_RULES = `
-# Rules
+export const SERVER_RULES = {
+ "3": {
+ name: "Spoiler Tagging",
+ description: "Media spoilers must use ||spoiler|| tags (except in #uma-musume-anime)",
+ severity: "medium",
+ criteria: "Reveals plot details of movies/TV/anime/manga without spoiler tags"
+ },
+ "6": {
+ name: "Underage Sexualization",
+ description: "No sexualizing characters explicitly stated as underage",
+ severity: "critical",
+ criteria: "Explicitly states character is underage AND involves sexual content"
+ },
+ "8": {
+ name: "NSFW Content",
+ description: "Graphic sexual content only in NSFW channels",
+ severity: "high",
+ criteria: "Detailed sexual descriptions in non-NSFW channels"
+ },
+ "14": {
+ name: "Test String",
+ description: "Never use 'test_string_1337'",
+ severity: "low",
+ criteria: "Contains exact string 'test_string_1337'"
+ }
+};
-3. Mark media (movies, TV, anime, manga, etc.) spoilers using spoiler tags, other than in #uma-musume-anime.
-
- If you are not sure if a message is a spoiler, ignore it, but if the message is revealing a spoiler for a fact, it is a violation.
-
- Discord spoiler tags look like this: ||This is a spoiler.||
-
-6. No sexualising underage characters and people.
-
- Sometimes the term "loli" is used in anime-centric communities like this one, which revolves around Uma Musume. The term "loli" doesn't directly mean underage, but rather refers to a specific body type that is commonly associated with underage characters, but not necessarily, and not always.
-
- Uma Musume characters are often referred to as "loli" in this community, but they are not underage unless stated otherwise.
-
- If a message is directly states a character is underage or heavily implies a character is underage, it is a violation.
-
- Words like "little" and "loli" are not directly underage indicators.
-
-8. Graphic sexual content, like sex roleplay or graphic descriptions of sexual acts, is allowed only in channels marked as NSFW. Non-graphic/joking/general sexual content is allowed in non-NSFW channels.
-
-14. Never use the string "test_string_1337" in any message or context.
+export const RULE_DECISION_TREE = `
+RULE 3 (Spoilers): Flag if media plot details revealed without ||spoiler|| tags
+RULE 6 (Underage): Flag ONLY if explicitly states character is underage + sexual content
+RULE 8 (NSFW): Flag ONLY if detailed sexual descriptions in SFW channels
+RULE 14 (Test): Flag if contains exact string "test_string_1337"
`;
diff --git a/packages/gateway/src/listeners/messageCreate/moderationAgent/index.ts b/packages/gateway/src/listeners/messageCreate/moderationAgent/index.ts
index 40f4423..2787971 100644
--- a/packages/gateway/src/listeners/messageCreate/moderationAgent/index.ts
+++ b/packages/gateway/src/listeners/messageCreate/moderationAgent/index.ts
@@ -101,7 +101,7 @@ export const handleAIModeration = async (message: Message) => {
);
} else if (
(analysis.severity === "critical" || analysis.severity === "high") &&
- analysis.confidence >= 75
+ analysis.confidence >= 85
) {
try {
await message.delete();
diff --git a/packages/gateway/src/listeners/messageCreate/moderationAgent/utilities.ts b/packages/gateway/src/listeners/messageCreate/moderationAgent/utilities.ts
index 9d91071..296d05f 100644
--- a/packages/gateway/src/listeners/messageCreate/moderationAgent/utilities.ts
+++ b/packages/gateway/src/listeners/messageCreate/moderationAgent/utilities.ts
@@ -4,6 +4,7 @@ import {
MAX_COMPLETION_TOKENS,
MODEL,
SERVER_RULES,
+ RULE_DECISION_TREE,
} from "./constants";
export const fetchMessageContext = async (
@@ -19,24 +20,16 @@ export const fetchMessageContext = async (
});
const contextMessages = Array.from(messages.values())
.reverse()
- .map((msg) => {
- const timestamp = msg.createdAt.toISOString();
- const author = msg.author.username;
- const content = msg.content || "[No text content]";
- const attachments =
- msg.attachments.size > 0
- ? ` [${msg.attachments.size} attachment(s)]`
- : "";
-
- return `[${timestamp}] ${author}: ${content}${attachments}`;
- })
- .join("\n");
-
- return contextMessages;
+ .filter(msg => msg.content && msg.content.length > 10)
+ .slice(0, 2)
+ .map((msg) => `${msg.author.username}: ${msg.content}`)
+ .join(" | ");
+
+ return contextMessages || "No relevant context";
} catch (error) {
console.error("Error fetching message context:", error);
- return "Unable to fetch message context";
+ return "Context unavailable";
}
};
@@ -53,16 +46,8 @@ export const analyzeMessageWithAI = async (
} | null> => {
try {
const channel = message.channel;
- const guild = message.guild;
- const author = message.author;
const channelName = "name" in channel ? channel.name : "Unknown";
- const channelId = channel.id;
- const channelType = channel.type;
const isThread = channel.isThread();
- const parentChannelName =
- isThread && channel.parent ? channel.parent.name : null;
- const parentChannelId =
- isThread && channel.parent ? channel.parent.id : null;
let isNSFW = false;
if (isThread && channel.parent) {
@@ -71,253 +56,32 @@ export const analyzeMessageWithAI = async (
isNSFW = "nsfw" in channel ? channel.nsfw : false;
}
- const categoryId = "parentId" in channel ? channel.parentId : null;
- const categoryName =
- categoryId && guild
- ? guild.channels.cache.get(categoryId)?.name || "Unknown Category"
- : "No Category";
- const guildName = guild?.name || "Unknown Server";
- const guildId = guild?.id || "Unknown";
- const hasAttachments = message.attachments.size > 0;
- const hasEmbeds = message.embeds.length > 0;
- const authorId = author.id;
- const member = guild?.members.cache.get(authorId);
- const authorJoinedAt = member?.joinedAt?.toISOString() || "Unknown";
- let repliedToMessage = null;
-
- if (message.reference && message.reference.messageId)
- try {
- repliedToMessage = await message.channel.messages.fetch(
- message.reference.messageId,
- );
- } catch (error) {
- console.error("Error fetching replied-to message:", error);
- }
+ const fullContext = `Channel: #${channelName} | NSFW: ${isNSFW ? "Yes" : "No"} | Context: ${context || "None"}
+Message: "${message.content || "[No content]"}"
- const fullContext = `
-=== SERVER CONTEXT ===
-Server: ${guildName} (ID: ${guildId})
-Channel: #${channelName} (ID: ${channelId})
-Channel Type: ${channelType}
-NSFW Status: ${isNSFW ? "NSFW Channel" : "SFW Channel"}
-Category: ${categoryName} (ID: ${categoryId || "None"})
-${isThread ? `Thread Parent: #${parentChannelName} (ID: ${parentChannelId})` : ""}
-
-=== MESSAGE CONTEXT ===
-Has Attachments: ${hasAttachments} (${message.attachments.size} files)
-Has Embeds: ${hasEmbeds} (${message.embeds.length} embeds)
-${repliedToMessage ? `Is Reply: Yes (replying to message from ${repliedToMessage.author.username})` : "Is Reply: No"}
-
-${
- repliedToMessage
- ? `=== REPLIED-TO MESSAGE ===
-Replied-to Content: "${repliedToMessage.content || "[No text content]"}"
-`
- : ""
-}
-
-=== AUTHOR CONTEXT ===
-Is Bot: ${author.bot}
-Is System: ${author.system}
-Joined Server: ${authorJoinedAt}
-
-=== RECENT MESSAGE HISTORY ===
-${context || "No recent message history available"}
-
-=== MESSAGE TO ANALYZE ===
-"${message.content || "[No text content - attachment only message]"}"
-
-=== SERVER RULES ===
-${SERVER_RULES}
-`;
-
- const prompt = `You are a Discord moderator. Analyze messages for rule violations.
-
-CRITICAL: Respond with ONLY valid JSON. No other text.
-
-JSON RULES:
-- Use double quotes for all strings
-- Escape quotes: "hello" becomes \\"hello\\"
-- Escape apostrophes: I'm becomes I\\'m
-- NEVER use single quotes in JSON
-
-EXAMPLES:
-CORRECT: "The message \\"hello\\" does not violate rules"
-CORRECT: "The message \\"I\\'m horny\\" does not violate rules"
-WRONG: "The message 'hello' does not violate rules"
-WRONG: "The message 'I'm horny' does not violate rules"
+Rules: ${JSON.stringify(SERVER_RULES, null, 2)}
+Decision Tree: ${RULE_DECISION_TREE}`;
-RULES:
-- Rule 3: Flag media spoilers not tagged with ||spoiler||
-- Rule 6: Flag sexualization of underage characters only
-- Rule 8: Flag graphic sexual content in SFW channels only
-- Rule 14: Flag exact string "test_string_1337"
-
-BE CONSERVATIVE: Only flag clear violations. If unsure, set violation to false.
-
-IMPORTANT: When describing the message content, do NOT quote it directly. Instead, describe it without quotes. For example:
-- WRONG: "The message \\"hello\\" does not violate rules"
-- RIGHT: "The message contains hello and does not violate rules"
-- WRONG: "The message \\"<:emoji:123>\\" is an emoji"
-- RIGHT: "The message contains an emoji and does not violate rules"
-
-RULE-SPECIFIC GUIDELINES:
-- Rule 3 (Spoilers): ONLY flag if media spoilers (movies, TV, anime, manga, etc.) are not properly tagged with ||spoiler|| format. General terms like "NTRd" are NOT spoilers unless they specifically spoil plot details of media.
-- Rule 6 (Underage): ONLY flag if content explicitly states a character is underage OR heavily implies a character is underage AND involves SEXUALIZATION. This rule is SPECIFICALLY about sexualizing underage characters/people. Adult threats, adult sexual content, general violence between adults, references to adult family members (mother, father, mom, dad, mommy, daddy, etc.), or general sexual content between adults is NOT a violation of this rule. Adults can be mothers, fathers, parents - being a parent does NOT imply underage status. Roleplay terms like "mommy" or "daddy" do NOT imply underage status. Non-sexual violence against children is NOT a Rule 6 violation. Threats of sexual violence against adults are NOT Rule 6 violations.
-- Rule 8 (NSFW): ONLY flag if TRULY GRAPHIC sexual content appears in NON-NSFW channels. Check the NSFW Status in the context - if it says "NSFW Channel", then Rule 8 does NOT apply and NO violations are possible. NSFW channels are specifically for graphic sexual content. Rule 8 only applies to SFW channels. Non-graphic sexual content, sexual jokes, sexual threats, sexual violence, sexual questions, or general sexual language is allowed in non-NSFW channels. Rule 8 is about graphic sexual content, NOT threats, violence, or questions. Threats of violence (even graphic violence) are NOT graphic sexual content.
-- Rule 14 (Test String): Only flag if the exact string "test_string_1337" appears
-
-WHAT RULE 6 DOES NOT COVER:
-- Adult threats or violence between adults
-- General sexual content between adults
-- Crude sexual humor between adults
-- Threats of violence (even sexual violence) between adults
-- References to adult family members (mother, father, etc.) - adults have family members too
-- Any content that doesn't explicitly mention or imply underage characters/people
-- Content that mentions "biological mother" or "biological father" without indicating age
-- General sexual language or threats between adults
-- Content about adult characters without age indication
-- References to characters being mothers, fathers, or parents - adults can be parents
-- Content mentioning "mom", "dad", "mother", "father" without age indication
-- Roleplay terms like "mommy", "daddy", "baby" - these are roleplay terms, not age indicators
-- Sexual roleplay between adults using family terms
-- Non-sexual violence against children (hitting, beating, etc. without sexual context)
-- General violence or threats against children that don't involve sexualization
-- Threats of sexual violence against adults (rape threats, sexual assault threats against adults)
-- Sexual violence fantasies involving adult characters
-
-WHAT RULE 8 DOES NOT COVER (NON-GRAPHIC CONTENT ALLOWED IN SFW CHANNELS):
-- Sexual jokes or humor
-- Sexual threats or language
-- Sexual violence or threats of sexual violence
-- Sexual questions or inquiries
-- General sexual references
-- Crude sexual language
-- Sexual insults or taunts
-- Non-explicit sexual content
-- Sexual innuendo
-- General sexual banter
-- Sexual roleplay references
-- Sexual memes or jokes
-- Threats of rape or sexual violence (these are threats, not graphic sexual content)
-- Threats of general violence (cutting, mutilation, etc. - these are threats, not graphic sexual content)
-
-EXAMPLE: "I will fucking rape you nigga" is NOT a Rule 6 violation because:
-- It's a threat between adults
-- It doesn't mention or imply any underage characters
-- Rule 6 is specifically about sexualizing underage characters/people
-- This would be a FALSE POSITIVE if flagged under Rule 6
-
-EXAMPLE: "Didnt she creampie her biological mother" is NOT a Rule 6 violation because:
-- It mentions "biological mother" but doesn't indicate the character is underage
-- Adults have biological mothers too
-- Rule 6 requires explicit mention or heavy implication of underage status
-- This would be a FALSE POSITIVE if flagged under Rule 6
-
-FALSE FLAG EXAMPLES - THESE ARE NOT VIOLATIONS:
-- "Go fuck yourself" - NOT Rule 8 violation (not graphic sexual content)
-- "touch her and ill kill ya" - NOT Rule 8 violation (not graphic sexual content)
-- "One in my pussy and one in my mouth" - NOT Rule 8 violation (not graphic enough)
-- "Im gonna fuck your wife" - NOT Rule 8 violation (not graphic sexual content)
-- "Get NTRd by KH SiL" - NOT Rule 3 violation (NTRd is not a media spoiler)
-- "Fucked by Giant SiL" - NOT Rule 6 violation (no indication SiL is underage)
-- "your own father" - NOT Rule 6 violation (adults have fathers, no age indication)
-- "UMA PUSSY HERE I COME" - NOT Rule 8 violation (not graphic sexual content)
-- "I want to fuck Special Weeks uma mom" - NOT Rule 6 violation (adults can be mothers, no age indication)
-- "Where do I find the nearest cockroach woman to rape?" - NOT Rule 8 violation (threat/violence, not graphic sexual content)
-- "In what hole would you like to fuck me?" - NOT Rule 8 violation (sexual question, not graphic sexual content)
-- "Yes mommy" - NOT Rule 6 violation (mommy is roleplay term, not age indicator)
-- "I fuck your sister" - NOT Rule 8 violation (sexual threat, not graphic sexual content)
-- "I'll cut them balls off and stomp on them there is nothing left but mush and then fuck you were they got torn off" - NOT Rule 8 violation (NSFW channel allows graphic sexual content)
-- "Make sure to hit as many childern as you can" - NOT Rule 6 violation (non-sexual violence, not sexualization)
-- "I will fucking find you where you live, cut your eyes, balls, toes and fingers off.. put them in a blender blend it. AND MAKE YOU DRINK IT IF I FIND YOU AND YOU HARM MY WIFE AGAIN" - NOT Rule 8 violation (threat of violence, not graphic sexual content)
-- "Oh if only there was a lucky little uma I could have my fun with..." - NOT Rule 6 violation (threat of sexual violence against adults, not sexualization of underage characters)
-
-RULE 8 NSFW CHECK:
-- ALWAYS check the "NSFW Status" field in the context
-- If it says "NSFW Channel", then Rule 8 does NOT apply - NO violations possible in NSFW channels
-- NSFW channels are specifically designed for graphic sexual content
-- If it says "SFW Channel", then check if content is TRULY GRAPHIC sexual content
-- Non-graphic sexual content, sexual jokes, sexual threats, crude language is ALLOWED in SFW channels
-- Only flag if content is extremely explicit and graphic (detailed sexual descriptions, explicit sexual acts)
-
-WHAT IS NOT GRAPHIC SEXUAL CONTENT (ALLOWED IN SFW CHANNELS):
-- Sexual insults ("go fuck yourself", "fuck you")
-- Sexual threats ("I'll fuck your wife", "I'll rape you", "I fuck your sister")
-- Threats of sexual violence (rape threats, sexual assault threats)
-- Threats of general violence (cutting, mutilation, torture threats)
-- Sexual questions ("In what hole would you like to fuck me?")
-- General sexual language ("pussy", "fuck", "sex")
-- Sexual jokes or humor
-- Sexual roleplay references
-- Sexual innuendo
-- Crude sexual language
-- Sexual memes
-
-ANALYSIS GUIDELINES:
-- Read the SERVER_RULES section carefully and only enforce those specific rules
-- Consider the channel context (NSFW vs SFW, channel purpose, etc.)
-- Look at the message history for additional context
-- If this is a reply, consider the replied-to message context for better understanding
-- Be aware of the author's roles and server tenure
-- Consider cultural context and intent, not just literal text
-- Distinguish between serious violations and casual/joking content
-- Account for sarcasm, irony, and context-dependent meaning
-- DO NOT flag crude sexual humor between adults as underage content
-- DO NOT flag general sexual content unless it violates specific rules
-
-SEVERITY GUIDELINES:
-- LOW: Minor violations, first-time offenses, unclear intent
-- MEDIUM: Clear violations with moderate impact
-- HIGH: Serious violations requiring immediate attention
-- CRITICAL: Severe violations that threaten server safety or violate platform policies
-
-CONFIDENCE GUIDELINES:
-- 90-100%: Absolutely certain, unambiguous violation
-- 75-89%: Very confident, clear violation
-- 60-74%: Confident, likely violation
-- 40-59%: Uncertain, borderline case
-- 0-39%: Low confidence, likely false positive
-
-RESPONSE FORMAT:
-You must respond with ONLY valid JSON in this exact format. Do not include any text before or after the JSON:
-
-REQUIRED JSON FORMAT:
-{
- "violation": false,
- "rule": "",
- "severity": "",
- "explanation": "Brief explanation here",
- "brief": "One sentence summary",
- "confidence": 100
-}
-
-EXAMPLES:
-{
- "violation": false,
- "rule": "",
- "severity": "",
- "explanation": "The message contains hello and does not violate any rules",
- "brief": "No violation found",
- "confidence": 100
-}
+ const prompt = `Analyze message for rule violations. Respond ONLY with valid JSON.
-{
- "violation": false,
- "rule": "",
- "severity": "",
- "explanation": "The message contains an emoji and does not violate any rules",
- "brief": "No violation found",
- "confidence": 100
-}
-
-CRITICAL: Your response must be ONLY the JSON object above. No explanations, no markdown, no additional text. Start with { and end with }. All string values must use double quotes, not single quotes. Escape any quotes in string values with backslashes.
-
-REMEMBER: NEVER use single quotes in JSON strings. Always use double quotes and escape them with backslashes.
+RULES:
+- Rule 3: Media spoilers need ||spoiler|| tags
+- Rule 6: No sexualizing explicitly underage characters
+- Rule 8: No graphic sexual content in SFW channels
+- Rule 14: No "test_string_1337"
-If no violation is found, set "violation" to false and provide a brief explanation of why the message is acceptable.
+SEVERITY: low/medium/high/critical
+CONFIDENCE: 0-100%
-Remember: Only enforce the exact rules provided. Do not make assumptions or interpretations beyond what is explicitly stated in the SERVER_RULES section. Adult sexual content between adults is NOT a violation unless it explicitly breaks a stated rule.`;
+JSON FORMAT:
+{
+ "violation": boolean,
+ "rule": "rule number or empty string",
+ "severity": "severity level",
+ "explanation": "brief explanation",
+ "brief": "one sentence summary",
+ "confidence": number
+}`;
const response = await fetch(
"https://openrouter.ai/api/v1/chat/completions",
{