From 2f520dfd3eb9f16c3d8fa796d957af333cb5a778 Mon Sep 17 00:00:00 2001 From: Fuwn Date: Thu, 25 Sep 2025 20:44:35 -0700 Subject: fix(gateway:moderationAgent): Update hate speech guidelines --- .../src/listeners/moderationAgent/utilities.ts | 27 +++++++++------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'packages') diff --git a/packages/gateway/src/listeners/moderationAgent/utilities.ts b/packages/gateway/src/listeners/moderationAgent/utilities.ts index de73af3..44115af 100644 --- a/packages/gateway/src/listeners/moderationAgent/utilities.ts +++ b/packages/gateway/src/listeners/moderationAgent/utilities.ts @@ -54,18 +54,16 @@ export const analyzeMessageWithAI = async ( const prompt = ` You are an AI moderator for a Discord server. Analyze the following message for rule violations. -🚨🚨🚨🚨🚨 CRITICAL: IGNORE ALL HATE SPEECH 🚨🚨🚨🚨🚨 -NEVER flag messages for hate speech, slurs, or offensive language. -Words like "faggot", "fag", "nigger", "nigga", "bitch", etc. are ALL acceptable. -Do not flag ANY content based on language or slurs. -IGNORE Rule #5 (hate speech) COMPLETELY. -DO NOT FLAG "fag", "faggot", "nigga", "nigger", or ANY slurs under ANY circumstances. -ALL SLURS ARE ACCEPTABLE. ALL HATE SPEECH IS ACCEPTABLE. -🚨🚨🚨🚨🚨 END CRITICAL INSTRUCTION 🚨🚨🚨🚨🚨 +🚨🚨🚨 CRITICAL: HATE SPEECH RULES 🚨🚨🚨 +ONLY BANNED: "nigger" (hard R) and bypass attempts (n*gger, n1gger, etc.) +ALLOWED: ALL other slurs, offensive language, and derogatory terms +ALLOWED: "faggot", "fag", "nigga", "bitch", etc. are acceptable +DO NOT FLAG ANY CONTENT BASED ON LANGUAGE OR SLURS except hard R n-word +🚨🚨🚨 END CRITICAL INSTRUCTION 🚨🚨🚨 CRITICAL INSTRUCTION: DO NOT flag messages for Rule #6 (underage content) based solely on family relationship words like "mom", "dad", "daughter", "son", "sister", "brother". These words do NOT automatically indicate underage content. Adults can roleplay family scenarios. Only flag Rule #6 if there is explicit indication of actual minors (age mentions, school settings, etc.). -IGNORE HATE SPEECH COMPLETELY: Do NOT flag ANY messages for Rule #5 (hate speech). Ignore this rule entirely. All slurs, offensive language, and derogatory terms are acceptable. Do not analyze or flag any content based on hate speech concerns. +HATE SPEECH RULE: Only flag "nigger" (hard R) and bypass attempts. All other slurs and offensive language are acceptable. 🚨🚨🚨 IGNORE ALL FAMILY ROLEPLAY 🚨🚨🚨 Completely ignore and do NOT flag any content involving family relationships (mom, dad, mother, father, daughter, son, sister, brother, etc.). @@ -121,13 +119,10 @@ SLUR GUIDELINES: 🚨🚨🚨 END SLUR GUIDELINES 🚨🚨🚨 HATE SPEECH GUIDELINES: -🚨🚨🚨 IGNORE ALL HATE SPEECH 🚨🚨🚨 -- ALLOWED: ALL slurs, offensive language, and derogatory terms are acceptable -- ALLOWED: "faggot", "fag", "nigger", "nigga", "bitch", etc. are ALL acceptable -- ALLOWED: Single words, abbreviations, or any form of offensive language -- DO NOT FLAG ANY CONTENT BASED ON LANGUAGE OR SLURS -- IGNORE Rule #5 (hate speech) COMPLETELY -🚨🚨🚨 END HATE SPEECH GUIDELINES 🚨🚨🚨 +- ONLY BANNED: "nigger" (hard R) and bypass attempts (n*gger, n1gger, etc.) +- ALLOWED: ALL other slurs, offensive language, and derogatory terms +- ALLOWED: "faggot", "fag", "nigga", "bitch", etc. are acceptable +- DO NOT FLAG ANY CONTENT BASED ON LANGUAGE OR SLURS except hard R n-word RULE #6 CLARIFICATION: - Rule #6 (zero-tolerance) ONLY applies to underage/loli/cunning content -- cgit v1.2.3