summary refs log tree commit diff
path: root/packages
diff options
context:
space:
mode:
author Fuwn <[email protected]> 2025-09-25 19:15:37 -0700
committer Fuwn <[email protected]> 2025-09-25 19:15:37 -0700
commit 4a6d6f305b6a7bac53d17f882338798369c68f00 (patch)
tree c4fc6187b5966d13b20d0f7f57ef87922bbb0752 /packages
parentrefactor(gateway:moderationAgent): Seperate constants (diff)
downloadumabotdiscord-4a6d6f305b6a7bac53d17f882338798369c68f00.tar.xz
umabotdiscord-4a6d6f305b6a7bac53d17f882338798369c68f00.zip
refactor(gateway:moderationAgent): Seperate utilities
Diffstat (limited to 'packages')
-rw-r--r-- packages/gateway/src/listeners/index.ts 2
-rw-r--r-- packages/gateway/src/listeners/moderationAgent/index.ts 216
-rw-r--r-- packages/gateway/src/listeners/moderationAgent/utilities.ts 218
3 files changed, 220 insertions, 216 deletions
diff --git a/packages/gateway/src/listeners/index.ts b/packages/gateway/src/listeners/index.ts
index 8863e00..5224da8 100644
--- a/packages/gateway/src/listeners/index.ts
+++ b/packages/gateway/src/listeners/index.ts
@@ -2,7 +2,7 @@ import { Client } from "discord.js";
import { handleIqdbModeration } from "./iqdbModeration";
import { handleRoleplayUmagram } from "./roleplayUmagram";
import { handleArtMediaModeration } from "./artMediaModeration";
-import { handleAIModeration } from "./moderationAgent/aiModeration";
+import { handleAIModeration } from "./moderationAgent";
import { handleAnnouncementReaction } from "./announcementReaction";
import { handleRoleProtection } from "./roleProtection";
import { handleChannelDeletion } from "./channelDeletion";
diff --git a/packages/gateway/src/listeners/moderationAgent/index.ts b/packages/gateway/src/listeners/moderationAgent/index.ts
index 18d46f7..17e65c3 100644
--- a/packages/gateway/src/listeners/moderationAgent/index.ts
+++ b/packages/gateway/src/listeners/moderationAgent/index.ts
@@ -9,227 +9,13 @@ import { sendAuditLog } from "../../commands/utilities";
import {
EXCLUDED_CATEGORIES,
LOW_RISK_PATTERNS,
- MAX_COMPLETION_TOKENS,
MAX_SYMBOL_DENSITY,
- MESSAGE_HISTORY_SIZE,
MIN_MESSAGE_LENGTH,
MODERATION_LOG_CHANNEL_ID,
SAFE_WORDS,
- SERVER_RULES,
SKIP_ACTION,
} from "./constants";
-
-const fetchMessageContext = async (
- channel: TextChannel | ThreadChannel,
- messageId: string,
-): Promise<string> => {
- try {
- const messages = await channel.messages.fetch({
- limit: MESSAGE_HISTORY_SIZE,
- before: messageId,
- });
- const contextMessages = Array.from(messages.values())
- .reverse()
- .map((msg) => {
- const timestamp = msg.createdAt.toISOString();
- const author = msg.author.username;
- const content = msg.content || "[No text content]";
- const attachments =
- msg.attachments.size > 0
- ? ` [${msg.attachments.size} attachment(s)]`
- : "";
-
- return `[${timestamp}] ${author}: ${content}${attachments}`;
- })
- .join("\n");
-
- return contextMessages;
- } catch (error) {
- console.error("Error fetching message context:", error);
-
- return "Unable to fetch message context";
- }
-};
-
-const analyzeMessageWithAI = async (
- message: Message,
- context: string,
-): Promise<{
- violation: boolean;
- rule: string;
- severity: "low" | "medium" | "high" | "critical";
- explanation: string;
- brief: string;
- confidence: number;
-} | null> => {
- try {
- const prompt = `
-You are an AI moderator for a Discord server. Analyze the following message for rule violations.
-
-ABSOLUTE RULE: NEVER flag messages for hate speech, slurs, or offensive language. Words like "faggot", "fag", "nigger", "nigga", "bitch", etc. are ALL acceptable. Do not flag ANY content based on language or slurs.
-
-CRITICAL INSTRUCTION: DO NOT flag messages for Rule #6 (underage content) based solely on family relationship words like "mom", "dad", "daughter", "son", "sister", "brother". These words do NOT automatically indicate underage content. Adults can roleplay family scenarios. Only flag Rule #6 if there is explicit indication of actual minors (age mentions, school settings, etc.).
-
-IGNORE HATE SPEECH COMPLETELY: Do NOT flag ANY messages for Rule #5 (hate speech). Ignore this rule entirely. All slurs, offensive language, and derogatory terms are acceptable. Do not analyze or flag any content based on hate speech concerns.
-
-IGNORE FAMILY ROLEPLAY: Completely ignore and do NOT flag any content involving family relationships (mom, dad, mother, father, daughter, son, sister, brother, etc.). Treat all family roleplay as acceptable adult content. Do not analyze or flag these messages at all.
-
-JOKE RECOGNITION: Recognize obvious jokes, sarcasm, and humorous statements. Do not flag:
-- Obvious jokes like "No more males in the near future" (clearly humorous)
-- Sarcastic statements like "Just send a nuke to our location" (obviously not serious)
-- Casual service offers like "I got contacts" (not serious self-promotion)
-- Hyperbolic statements like "Bro's gonna blow up Malaysia" (obviously joking)
-- Exaggerated threats like "I'll destroy you" (clearly not serious)
-- Over-the-top statements that are clearly meant to be funny
-- Game/fantasy content like "You gain the ability to kill an orphan" (obviously fictional)
-- Fictional scenarios, game mechanics, or roleplay content
-- Use common sense to distinguish between genuine violations and humour
-- When in doubt, assume it's a joke rather than a serious violation
-
-SERVER RULES:
-${SERVER_RULES}
-
-CURRENT MESSAGE TO ANALYZE (THIS IS THE ONLY MESSAGE YOU SHOULD CHECK FOR VIOLATIONS):
-Author: ${message.author.username} (${message.author.id})
-Channel: ${"name" in message.channel ? message.channel.name : "Unknown"} (${message.channelId})
-Channel is NSFW: ${message.channel.isThread() ? (message.channel.parent && "nsfw" in message.channel.parent ? message.channel.parent.nsfw : false) : "nsfw" in message.channel ? message.channel.nsfw : false}
-Content: "${message.content || "[No text content]"}"
-Attachments: ${message.attachments.size > 0 ? message.attachments.map((a) => a.name).join(", ") : "None"}
-
-CHAT HISTORY (FOR CONTEXT ONLY - DO NOT FLAG MESSAGES BASED ON THIS):
-${context}
-
-Please analyze THIS SPECIFIC MESSAGE for any rule violations. Consider:
-1. The content of the current message being analyzed (not the chat history)
-2. The specific channel this was posted in
-3. The content and any attachments of the current message only
-4. Whether THIS MESSAGE violates any of the server rules
-
-IMPORTANT: Only analyze the current message content. The chat history is provided for context but should not be the basis for flagging the current message. If the current message itself is innocent but appears in a conversation with inappropriate content, do NOT flag it.
-
-SLUR GUIDELINES:
-- HARD SLURS (with hard R, full spelling): These are prohibited and should be flagged for deletion
-- SOFT SLURS (with soft R, censored, or abbreviated): These are generally acceptable in casual conversation
-- Context matters: Consider if the word is being used in a hateful manner vs. casual/reclaimed usage
-- When in doubt, err on the side of caution for hard slurs but be more lenient with soft slurs
-
-HATE SPEECH GUIDELINES:
-- ALLOWED: All slurs, offensive language, and derogatory terms are acceptable EXCEPT:
-- PROHIBITED: "nigger" (hard R) and any bypass attempts (n*gger, n1gger, etc.)
-- ALLOWED: "nigga" (soft R) and other forms are acceptable
-- ALLOWED: All other slurs (fag, faggot, bitch, etc.) are acceptable
-- ALLOWED: Single words, abbreviations, or any form of offensive language
-- Only flag the hard R n-word and bypass attempts, everything else is acceptable
-
-RULE #6 CLARIFICATION:
-- Rule #6 (zero-tolerance) ONLY applies to underage/loli/cunning content
-- General adult sexual content is ALLOWED in NSFW channels
-- Phrases like "squeezing balls" are general sexual content, NOT underage content
-- Only flag Rule #6 violations if the content specifically involves underage characters
-- ADULT FAMILY ROLEPLAY: Content involving "mom", "dad", "sister", etc. with adults is NOT underage content
-- Do not assume family relationships imply minors - adults can roleplay family scenarios
-- Only flag if there is explicit indication of underage characters (age mentions, school settings, etc.)
-- EXAMPLES: "M-mom...you're gonna make me cum" = ADULT roleplay, NOT underage content
-- The word "mom" alone does NOT indicate a minor - it could be stepmom, adoptive mom, or adult roleplay
-- Rule #6 is ONLY for actual underage characters, not family roleplay between adults
-
-NSFW CONTENT GUIDELINES:
-- LIGHT SEXUAL CONTENT: Phrases like "sex proposal", "dick down", "boobs free", "big tits", etc. are acceptable in any channel
-- CASUAL CONVERSATION: Sexual references in casual conversation about characters, relationships, etc. are acceptable
-- LIGHT SEXUAL DISCUSSION: General sexual topics, proposals, mentions are acceptable in any channel
-- EXPLICIT SEXUAL CONTENT: Only flag detailed sexual descriptions, graphic language, or explicit sexual acts in non-NSFW channels
-- Use common sense: Light sexual content vs. explicit sexual content
-- When in doubt, be lenient with light sexual content and conversation
-
-Respond with a JSON object containing:
-{
- "violation": boolean,
- "rule": "Rule number and brief description if violation found, empty string if none",
- "severity": "low|medium|high|critical",
- "explanation": "Detailed explanation of the violation or why it's acceptable",
- "brief": "Short one-sentence explanation for why it was flagged (if violation) or why it's acceptable (if no violation)",
- "confidence": number (0-100, how confident you are in this assessment)
-}
-
-If no violation is found, set "violation" to false and provide a brief explanation of why the message is acceptable.
-
-CRITICAL: Only analyze the content in quotes above (the current message). Ignore all content in the chat history section. If the current message is innocent (like "true", "yes", "no", etc.) but appears after inappropriate content in the chat history, do NOT flag it.
-
-CONTEXT UNDERSTANDING:
-- Read sentences carefully and understand their actual meaning
-- "pregnant with my daughter" means the baby will be the speaker's daughter, NOT that the daughter is pregnant
-- "pregnant with my son" means the baby will be the speaker's son, NOT that the son is pregnant
-- Do not misinterpret family relationships or pregnancy announcements
-- Only flag if there is genuinely inappropriate content, not innocent family/pregnancy references
-`;
- const response = await fetch("https://api.openai.com/v1/chat/completions", {
- method: "POST",
- headers: {
- Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
- "Content-Type": "application/json",
- },
- body: JSON.stringify({
- model: "gpt-5-nano",
- messages: [
- {
- role: "system",
- content:
- "You are a helpful AI moderator that analyzes Discord messages for rule violations. Always respond with valid JSON.",
- },
- {
- role: "user",
- content: prompt,
- },
- ],
- max_completion_tokens: MAX_COMPLETION_TOKENS,
- }),
- });
-
- if (!response.ok) {
- const errorText = await response.text();
-
- console.error("OpenAI API error:", response.status, response.statusText);
- console.error("Error response body:", errorText);
-
- return null;
- }
-
- const data = await response.json();
-
- console.log("OpenAI API response:", JSON.stringify(data, null, 2));
-
- if (data.usage)
- console.log("Token usage:", {
- prompt_tokens: data.usage.prompt_tokens,
- completion_tokens: data.usage.completion_tokens,
- reasoning_tokens:
- data.usage.completion_tokens_details?.reasoning_tokens || 0,
- total_tokens: data.usage.total_tokens,
- });
-
- const content = data.choices[0]?.message?.content;
-
- if (!content) {
- console.error("No content in OpenAI response");
- console.error("Finish reason:", data.choices[0]?.finish_reason);
-
- return null;
- }
-
- try {
- return JSON.parse(content);
- } catch (parseError) {
- console.error("Failed to parse OpenAI response as JSON:", content);
- console.error("Parse error:", parseError);
-
- return null;
- }
- } catch (error) {
- console.error("Error in AI analysis:", error);
-
- return null;
- }
-};
+import { analyzeMessageWithAI, fetchMessageContext } from "./utilities";
export const handleAIModeration = (client: Client) => {
client.on(Events.MessageCreate, async (message: Message) => {
diff --git a/packages/gateway/src/listeners/moderationAgent/utilities.ts b/packages/gateway/src/listeners/moderationAgent/utilities.ts
new file mode 100644
index 0000000..2296b06
--- /dev/null
+++ b/packages/gateway/src/listeners/moderationAgent/utilities.ts
@@ -0,0 +1,218 @@
+import { Message, TextChannel, ThreadChannel } from "discord.js";
+import {
+ MAX_COMPLETION_TOKENS,
+ MESSAGE_HISTORY_SIZE,
+ SERVER_RULES,
+} from "./constants";
+
+/**
+ * Renders the messages that precede `messageId` in `channel` as a plain-text
+ * transcript, one line per message.
+ *
+ * Each line has the form `[ISO timestamp] username: content`, followed by
+ * ` [N attachment(s)]` when the message carries attachments. Messages with
+ * empty content are shown as "[No text content]".
+ *
+ * @param channel - Channel or thread whose history is read.
+ * @param messageId - Only messages before this id are fetched (at most
+ *   MESSAGE_HISTORY_SIZE of them).
+ * @returns The transcript joined with newlines, or the fixed string
+ *   "Unable to fetch message context" when anything fails. Errors are logged,
+ *   never rethrown.
+ */
+export const fetchMessageContext = async (
+  channel: TextChannel | ThreadChannel,
+  messageId: string,
+): Promise<string> => {
+  try {
+    const history = await channel.messages.fetch({
+      limit: MESSAGE_HISTORY_SIZE,
+      before: messageId,
+    });
+    // The fetched order is flipped before rendering; presumably the API hands
+    // back newest-first so this yields chronological order - confirm.
+    const ordered = Array.from(history.values()).reverse();
+    const lines: string[] = [];
+
+    for (const entry of ordered) {
+      const stamp = entry.createdAt.toISOString();
+      const username = entry.author.username;
+      const body = entry.content || "[No text content]";
+      const attachmentCount = entry.attachments.size;
+      let attachmentNote = "";
+
+      if (attachmentCount > 0) {
+        attachmentNote = ` [${attachmentCount} attachment(s)]`;
+      }
+
+      lines.push(`[${stamp}] ${username}: ${body}${attachmentNote}`);
+    }
+
+    return lines.join("\n");
+  } catch (error) {
+    console.error("Error fetching message context:", error);
+
+    return "Unable to fetch message context";
+  }
+};
+
+/** Verdict object the model is asked to return for one analysed message. */
+type ModerationVerdict = {
+  violation: boolean;
+  rule: string;
+  severity: "low" | "medium" | "high" | "critical";
+  explanation: string;
+  brief: string;
+  confidence: number;
+};
+
+/** Severity labels the prompt allows the model to choose from. */
+const VERDICT_SEVERITIES: readonly string[] = [
+  "low",
+  "medium",
+  "high",
+  "critical",
+];
+
+/**
+ * Runtime check that a parsed model reply really has every verdict field with
+ * the type the signature of `analyzeMessageWithAI` promises to its callers.
+ */
+const isModerationVerdict = (value: unknown): value is ModerationVerdict => {
+  if (typeof value !== "object" || value === null) return false;
+
+  const { violation, rule, severity, explanation, brief, confidence } =
+    value as Record<string, unknown>;
+
+  return (
+    typeof violation === "boolean" &&
+    typeof rule === "string" &&
+    typeof severity === "string" &&
+    VERDICT_SEVERITIES.includes(severity) &&
+    typeof explanation === "string" &&
+    typeof brief === "string" &&
+    typeof confidence === "number" &&
+    Number.isFinite(confidence)
+  );
+};
+
+/**
+ * Asks the OpenAI chat-completions endpoint whether `message` violates a
+ * server rule.
+ *
+ * One prompt is built from SERVER_RULES, the message's author, channel, NSFW
+ * flag, content and attachment names, plus the supplied `context`. It is
+ * posted to the "gpt-5-nano" model and the reply content is parsed as JSON.
+ * The raw API response and token usage are logged on every call.
+ *
+ * @param message - The Discord message to judge.
+ * @param context - Preceding chat history, inserted into the prompt as
+ *   background only.
+ * @returns The validated verdict, or `null` when the request fails, the reply
+ *   has no content, the content is not valid JSON, or the JSON does not match
+ *   the verdict shape. Every failure is logged; nothing is rethrown.
+ */
+export const analyzeMessageWithAI = async (
+  message: Message,
+  context: string,
+): Promise<{
+  violation: boolean;
+  rule: string;
+  severity: "low" | "medium" | "high" | "critical";
+  explanation: string;
+  brief: string;
+  confidence: number;
+} | null> => {
+  try {
+    // NOTE(review): the prompt below contradicts itself and should be
+    // reconciled against the real server policy:
+    //  - The opening "ABSOLUTE RULE" / "IGNORE HATE SPEECH COMPLETELY"
+    //    sections say to never flag any slur, while the later "SLUR
+    //    GUIDELINES" / "HATE SPEECH GUIDELINES" sections declare one term and
+    //    its bypass spellings prohibited.
+    //  - "IGNORE FAMILY ROLEPLAY" says not to analyse such messages at all,
+    //    while "RULE #6 CLARIFICATION" still expects explicit minor content
+    //    to be flagged.
+    // NOTE(review): message content is untrusted user input and is embedded
+    // between plain double quotes without escaping, so a message can close
+    // the quote and inject instructions - consider escaping or a structured
+    // input field.
+    const prompt = `
+You are an AI moderator for a Discord server. Analyze the following message for rule violations.
+
+ABSOLUTE RULE: NEVER flag messages for hate speech, slurs, or offensive language. Words like "faggot", "fag", "nigger", "nigga", "bitch", etc. are ALL acceptable. Do not flag ANY content based on language or slurs.
+
+CRITICAL INSTRUCTION: DO NOT flag messages for Rule #6 (underage content) based solely on family relationship words like "mom", "dad", "daughter", "son", "sister", "brother". These words do NOT automatically indicate underage content. Adults can roleplay family scenarios. Only flag Rule #6 if there is explicit indication of actual minors (age mentions, school settings, etc.).
+
+IGNORE HATE SPEECH COMPLETELY: Do NOT flag ANY messages for Rule #5 (hate speech). Ignore this rule entirely. All slurs, offensive language, and derogatory terms are acceptable. Do not analyze or flag any content based on hate speech concerns.
+
+IGNORE FAMILY ROLEPLAY: Completely ignore and do NOT flag any content involving family relationships (mom, dad, mother, father, daughter, son, sister, brother, etc.). Treat all family roleplay as acceptable adult content. Do not analyze or flag these messages at all.
+
+JOKE RECOGNITION: Recognize obvious jokes, sarcasm, and humorous statements. Do not flag:
+- Obvious jokes like "No more males in the near future" (clearly humorous)
+- Sarcastic statements like "Just send a nuke to our location" (obviously not serious)
+- Casual service offers like "I got contacts" (not serious self-promotion)
+- Hyperbolic statements like "Bro's gonna blow up Malaysia" (obviously joking)
+- Exaggerated threats like "I'll destroy you" (clearly not serious)
+- Over-the-top statements that are clearly meant to be funny
+- Game/fantasy content like "You gain the ability to kill an orphan" (obviously fictional)
+- Fictional scenarios, game mechanics, or roleplay content
+- Use common sense to distinguish between genuine violations and humour
+- When in doubt, assume it's a joke rather than a serious violation
+
+SERVER RULES:
+${SERVER_RULES}
+
+CURRENT MESSAGE TO ANALYZE (THIS IS THE ONLY MESSAGE YOU SHOULD CHECK FOR VIOLATIONS):
+Author: ${message.author.username} (${message.author.id})
+Channel: ${"name" in message.channel ? message.channel.name : "Unknown"} (${message.channelId})
+Channel is NSFW: ${message.channel.isThread() ? (message.channel.parent && "nsfw" in message.channel.parent ? message.channel.parent.nsfw : false) : "nsfw" in message.channel ? message.channel.nsfw : false}
+Content: "${message.content || "[No text content]"}"
+Attachments: ${message.attachments.size > 0 ? message.attachments.map((a) => a.name).join(", ") : "None"}
+
+CHAT HISTORY (FOR CONTEXT ONLY - DO NOT FLAG MESSAGES BASED ON THIS):
+${context}
+
+Please analyze THIS SPECIFIC MESSAGE for any rule violations. Consider:
+1. The content of the current message being analyzed (not the chat history)
+2. The specific channel this was posted in
+3. The content and any attachments of the current message only
+4. Whether THIS MESSAGE violates any of the server rules
+
+IMPORTANT: Only analyze the current message content. The chat history is provided for context but should not be the basis for flagging the current message. If the current message itself is innocent but appears in a conversation with inappropriate content, do NOT flag it.
+
+SLUR GUIDELINES:
+- HARD SLURS (with hard R, full spelling): These are prohibited and should be flagged for deletion
+- SOFT SLURS (with soft R, censored, or abbreviated): These are generally acceptable in casual conversation
+- Context matters: Consider if the word is being used in a hateful manner vs. casual/reclaimed usage
+- When in doubt, err on the side of caution for hard slurs but be more lenient with soft slurs
+
+HATE SPEECH GUIDELINES:
+- ALLOWED: All slurs, offensive language, and derogatory terms are acceptable EXCEPT:
+- PROHIBITED: "nigger" (hard R) and any bypass attempts (n*gger, n1gger, etc.)
+- ALLOWED: "nigga" (soft R) and other forms are acceptable
+- ALLOWED: All other slurs (fag, faggot, bitch, etc.) are acceptable
+- ALLOWED: Single words, abbreviations, or any form of offensive language
+- Only flag the hard R n-word and bypass attempts, everything else is acceptable
+
+RULE #6 CLARIFICATION:
+- Rule #6 (zero-tolerance) ONLY applies to underage/loli/cunning content
+- General adult sexual content is ALLOWED in NSFW channels
+- Phrases like "squeezing balls" are general sexual content, NOT underage content
+- Only flag Rule #6 violations if the content specifically involves underage characters
+- ADULT FAMILY ROLEPLAY: Content involving "mom", "dad", "sister", etc. with adults is NOT underage content
+- Do not assume family relationships imply minors - adults can roleplay family scenarios
+- Only flag if there is explicit indication of underage characters (age mentions, school settings, etc.)
+- EXAMPLES: "M-mom...you're gonna make me cum" = ADULT roleplay, NOT underage content
+- The word "mom" alone does NOT indicate a minor - it could be stepmom, adoptive mom, or adult roleplay
+- Rule #6 is ONLY for actual underage characters, not family roleplay between adults
+
+NSFW CONTENT GUIDELINES:
+- LIGHT SEXUAL CONTENT: Phrases like "sex proposal", "dick down", "boobs free", "big tits", etc. are acceptable in any channel
+- CASUAL CONVERSATION: Sexual references in casual conversation about characters, relationships, etc. are acceptable
+- LIGHT SEXUAL DISCUSSION: General sexual topics, proposals, mentions are acceptable in any channel
+- EXPLICIT SEXUAL CONTENT: Only flag detailed sexual descriptions, graphic language, or explicit sexual acts in non-NSFW channels
+- Use common sense: Light sexual content vs. explicit sexual content
+- When in doubt, be lenient with light sexual content and conversation
+
+Respond with a JSON object containing:
+{
+ "violation": boolean,
+ "rule": "Rule number and brief description if violation found, empty string if none",
+ "severity": "low|medium|high|critical",
+ "explanation": "Detailed explanation of the violation or why it's acceptable",
+ "brief": "Short one-sentence explanation for why it was flagged (if violation) or why it's acceptable (if no violation)",
+ "confidence": number (0-100, how confident you are in this assessment)
+}
+
+If no violation is found, set "violation" to false and provide a brief explanation of why the message is acceptable.
+
+CRITICAL: Only analyze the content in quotes above (the current message). Ignore all content in the chat history section. If the current message is innocent (like "true", "yes", "no", etc.) but appears after inappropriate content in the chat history, do NOT flag it.
+
+CONTEXT UNDERSTANDING:
+- Read sentences carefully and understand their actual meaning
+- "pregnant with my daughter" means the baby will be the speaker's daughter, NOT that the daughter is pregnant
+- "pregnant with my son" means the baby will be the speaker's son, NOT that the son is pregnant
+- Do not misinterpret family relationships or pregnancy announcements
+- Only flag if there is genuinely inappropriate content, not innocent family/pregnancy references
+`;
+    const response = await fetch("https://api.openai.com/v1/chat/completions", {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        model: "gpt-5-nano",
+        messages: [
+          {
+            role: "system",
+            content:
+              "You are a helpful AI moderator that analyzes Discord messages for rule violations. Always respond with valid JSON.",
+          },
+          {
+            role: "user",
+            content: prompt,
+          },
+        ],
+        max_completion_tokens: MAX_COMPLETION_TOKENS,
+      }),
+    });
+
+    if (!response.ok) {
+      const errorText = await response.text();
+
+      console.error("OpenAI API error:", response.status, response.statusText);
+      console.error("Error response body:", errorText);
+
+      return null;
+    }
+
+    const data = await response.json();
+
+    console.log("OpenAI API response:", JSON.stringify(data, null, 2));
+
+    if (data.usage)
+      console.log("Token usage:", {
+        prompt_tokens: data.usage.prompt_tokens,
+        completion_tokens: data.usage.completion_tokens,
+        reasoning_tokens:
+          data.usage.completion_tokens_details?.reasoning_tokens || 0,
+        total_tokens: data.usage.total_tokens,
+      });
+
+    // Optional chaining on `choices` itself: a body without a `choices` array
+    // is reported as "no content" instead of surfacing as a TypeError.
+    const choice = data.choices?.[0];
+    const content = choice?.message?.content;
+
+    if (!content) {
+      console.error("No content in OpenAI response");
+      console.error("Finish reason:", choice?.finish_reason);
+
+      return null;
+    }
+
+    let parsed: unknown;
+
+    try {
+      parsed = JSON.parse(content);
+    } catch (parseError) {
+      console.error("Failed to parse OpenAI response as JSON:", content);
+      console.error("Parse error:", parseError);
+
+      return null;
+    }
+
+    // Valid JSON is not necessarily a verdict (it may be a bare value, an
+    // array, or an object with missing or mistyped fields). Callers rely on
+    // the declared shape, so anything else is treated as "no verdict".
+    if (!isModerationVerdict(parsed)) {
+      console.error(
+        "OpenAI response JSON does not match the expected verdict shape:",
+        content,
+      );
+
+      return null;
+    }
+
+    return parsed;
+  } catch (error) {
+    console.error("Error in AI analysis:", error);
+
+    return null;
+  }
+};