AI generated first iteration
This commit is contained in:
85
app/agent/modes/warden.py
Normal file
85
app/agent/modes/warden.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""Warden Mode - Detects and flags suspicious content."""
|
||||
|
||||
import logging
import re

from app.llm.client import LLMClient
from app.llm.prompts import PromptTemplates
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WardenMode:
    """
    Warden - The guardian against unwanted influences.

    Purpose:
    - Detects suspicious patterns (spam, scams, bot activity)
    - Flags Discord growth schemes and link spam
    - Monitors for manipulation or harmful content
    - Provides data for moderation decisions

    Policy:
    - Runs on every message (always active)
    - Never takes action directly (only flags)
    - Patterns to detect:
        * "Join our Discord"
        * "Grow your channel"
        * Multiple links
        * Repeated messages (spam)
        * Known scam keywords
    - Flags are recorded for human review

    Implementation notes:
    - Detection is case-insensitive substring matching against
      ``suspicious_patterns`` plus a count of links in the message.
    - Repeated-message detection from the policy list is not implemented
      by this class yet.
    - A flag is recorded by incrementing ``flagged_count`` and emitting a
      warning log entry; the message itself is not stored.
    """

    # Matches the *start* of a link: an explicit http(s) scheme, or a bare
    # "www." that is not already part of a longer URL/hostname. The
    # lookbehind keeps "http://www.example.com" from counting as two links.
    _LINK_START_RE = re.compile(r"https?://|(?<![\w/.])www\.", re.IGNORECASE)

    def __init__(self, llm_client: LLMClient):
        """Initialize Warden mode.

        Args:
            llm_client: LLM client handle; stored on the instance but not
                used by the pattern-based checks in this class.
        """
        self.llm_client = llm_client
        # Lower-case phrases; matched as substrings of the lower-cased message.
        self.suspicious_patterns = [
            "join our discord",
            "discord.gg",
            "grow your channel",
            "easy money",
            "click here",
            "limited offer",
            "act now",
        ]
        # Number of messages flagged as suspicious by this instance.
        self.flagged_count = 0

    async def analyze_message(self, message: str) -> dict:
        """Analyze a message for suspicious content.

        Args:
            message: Raw message text. Empty or ``None`` content is treated
                as safe.

        Returns:
            A dict with keys:
            - ``is_suspicious``: ``True`` if anything was detected.
            - ``patterns_detected``: matched phrases, in the order they
              appear in ``suspicious_patterns``, followed by
              ``"multiple_links"`` when more than one link is present.
            - ``severity``: ``"safe"``, ``"medium"`` (one detection) or
              ``"high"`` (two or more detections).

        Side effects:
            Increments ``flagged_count`` and logs a warning for every
            suspicious message.
        """
        result = {
            "is_suspicious": False,
            "patterns_detected": [],
            "severity": "safe",
        }

        # Nothing to inspect (e.g. content-less message); avoid .lower() on None.
        if not message:
            return result

        message_lower = message.lower()
        detected = [
            pattern
            for pattern in self.suspicious_patterns
            if pattern in message_lower
        ]

        # Count distinct link starts rather than raw "http"/"www" substrings,
        # so a single "http://www.site.com" is one link and case is ignored.
        if len(self._LINK_START_RE.findall(message)) > 1:
            detected.append("multiple_links")

        if not detected:
            return result

        result["is_suspicious"] = True
        result["patterns_detected"] = detected
        result["severity"] = "high" if len(detected) >= 2 else "medium"

        self.flagged_count += 1
        logger.warning("Warden flagged suspicious message: %s", detected)

        return result

    async def get_report(self) -> dict:
        """Get Warden's activity report.

        Returns:
            A dict with the mode name and the number of messages flagged so
            far by this instance.
        """
        return {
            "mode": "warden",
            "total_flagged": self.flagged_count,
        }
|
||||
Reference in New Issue
Block a user