# ws-sanctum-chronicler/app/agent/modes/warden.py

"""Warden Mode - Detects and flags suspicious content."""

import logging
import re

from app.llm.client import LLMClient
from app.llm.prompts import PromptTemplates

logger = logging.getLogger(__name__)


class WardenMode:
    """
    Warden - The guardian against unwanted influences.

    Purpose:
    - Detects suspicious patterns (spam, scams, bot activity)
    - Flags Discord growth schemes and link spam
    - Monitors for manipulation or harmful content
    - Provides data for moderation decisions

    Policy:
    - Runs on every message (always active)
    - Never takes action directly (only flags)
    - Patterns to detect:
      * "Join our Discord"
      * "Grow your channel"
      * Multiple links
      * Repeated messages (spam)
      * Known scam keywords
    - Flags are recorded for human review

    Implemented checks (see ``analyze_message``):
    - Case-insensitive substring match against ``suspicious_patterns``
    - More than one link in a single message (``"multiple_links"``)

    Not implemented in this class: repeated-message detection. The LLM
    client is stored on the instance but is not called by any method here.
    """

    # Matches the *start* of one link. The optional ``www.`` after the scheme
    # is consumed by the same match, so ``https://www.example.com`` counts as
    # one link rather than two. A bare ``www.`` only counts when it is not
    # preceded by a word character, ``.`` or ``/`` (i.e. it is not part of a
    # longer host name or URL path).
    _LINK_START = re.compile(
        r"https?://(?:www\.)?|(?<![\w./])www\.",
        re.IGNORECASE,
    )

    def __init__(self, llm_client: "LLMClient"):
        """
        Initialize Warden mode.

        Args:
            llm_client: LLM client kept on the instance as ``llm_client``.
                The annotation is quoted so it is not evaluated when the
                method is defined.
        """
        self.llm_client = llm_client
        # Lowercase phrases; they are compared against the lowercased message.
        self.suspicious_patterns = [
            "join our discord",
            "discord.gg",
            "grow your channel",
            "easy money",
            "click here",
            "limited offer",
            "act now",
        ]
        # Number of messages flagged by this instance so far.
        self.flagged_count = 0

    async def analyze_message(self, message: str) -> dict:
        """
        Analyze a message for suspicious content.

        Args:
            message: Raw message text. ``None`` or an empty string is treated
                as a safe message.

        Returns:
            A dict with three keys:
            - ``"is_suspicious"``: ``True`` if any check matched.
            - ``"patterns_detected"``: matched phrases from
              ``suspicious_patterns`` (in list order), followed by
              ``"multiple_links"`` when more than one link is present.
            - ``"severity"``: ``"safe"`` when nothing matched, ``"medium"``
              for exactly one detection, ``"high"`` for two or more.

        Side effects:
            For a suspicious message, increments ``flagged_count`` and logs a
            warning listing the detections.
        """
        if not message:
            return {
                "is_suspicious": False,
                "patterns_detected": [],
                "severity": "safe",
            }

        # Simple case-insensitive phrase matching.
        lowered = message.lower()
        detected = [
            pattern for pattern in self.suspicious_patterns if pattern in lowered
        ]

        # Count links by their start marker so each URL is counted once,
        # regardless of letter case or a "www." host prefix.
        if len(self._LINK_START.findall(message)) > 1:
            detected.append("multiple_links")

        if not detected:
            return {
                "is_suspicious": False,
                "patterns_detected": [],
                "severity": "safe",
            }

        self.flagged_count += 1
        # Lazy %-formatting: the message is only built if the record is emitted.
        logger.warning("Warden flagged suspicious message: %s", detected)

        return {
            "is_suspicious": True,
            "patterns_detected": detected,
            "severity": "high" if len(detected) >= 2 else "medium",
        }

    async def get_report(self) -> dict:
        """
        Get Warden's activity report.

        Returns:
            A dict with ``"mode"`` set to ``"warden"`` and ``"total_flagged"``
            set to the current ``flagged_count``.
        """
        return {
            "mode": "warden",
            "total_flagged": self.flagged_count,
        }