DeFi-Simulation-Lab/audit_generator.py at main · Arpit-R-Doshi/DeFi-Simulation-Lab · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
"""
audit_generator.py
==================
Generates a comprehensive Markdown audit report using Groq.
It takes the simulation definition, event logs, and flagged breaking points
to summarize agent behavior, vulnerabilities, and remediation steps.
"""
import json
import logging
from typing import Dict, Any, List
import config

logger = logging.getLogger(__name__)

# ── Token budget helpers ──────────────────────────────────────────────
# Conservative estimate: 1 token ≈ 3.5 characters (Groq counts BPE tokens)
_CHARS_PER_TOKEN = 3.5
# Leave headroom: max_tokens (2048) + prompt template (~600 chars) already eat ~1200 tokens.
# Budget the DATA sections to stay within 4600 tokens → ~16100 chars.
_DATA_CHAR_BUDGET = int(4600 * _CHARS_PER_TOKEN)   # ≈ 16,100 chars
_TOTAL_TOKEN_LIMIT = 5800                           # hard cap before sending


def _estimate_tokens(text: str) -> int:
    return int(len(text) / _CHARS_PER_TOKEN)


def _trim_json(obj: Any, max_chars: int) -> str:
    """Serialize obj to JSON; truncate to max_chars with a trailing note."""
    s = json.dumps(obj, indent=2)
    if len(s) <= max_chars:
        return s
    truncated = s[:max_chars]
    # Close at a clean line boundary if possible
    last_nl = truncated.rfind('\n')
    if last_nl > max_chars // 2:
        truncated = truncated[:last_nl]
    return truncated + '\n  ... [TRUNCATED to fit token limit]'


def _trim_events(events: List[Dict], budget_chars: int) -> str:
    """Return as many events as fit within budget_chars, most-volatile first."""
    if not events:
        return '[]'
    # Already sorted by volatility by the caller; take from start
    result = []
    used = 4  # for "[\n]"
    for ev in events:
        s = json.dumps(ev, indent=2)
        if used + len(s) + 4 > budget_chars:
            result.append({"note": f"...{len(events) - len(result)} more events truncated to fit token limit"})
            break
        result.append(ev)
        used += len(s) + 4
    return json.dumps(result, indent=2)


def _build_prompt(config_content: str, agent_configs: dict,
                  flagged_agents: list, simulation_summary: dict,
                  simulation_events: list,
                  lending_analytics: dict = None) -> str:
    """Build the prompt, shrinking the largest sections first if over budget."""

    # Allocate per-section char budgets (largest/most-verbose sections shrink first)
    section_budgets = {
        "events":  6000,   # most verbose — shrink first
        "agents":  3000,
        "config":  2000,
        "flagged": 2000,
        "summary": 1500,
    }

    def _render(ev_budget=6000, ag_budget=3000, cfg_budget=2000):
        events_str  = _trim_events(simulation_events, ev_budget)
        agents_str  = _trim_json(agent_configs, ag_budget)
        config_str  = config_content[:cfg_budget] + (' ... [truncated]' if len(config_content) > cfg_budget else '')
        flagged_str = _trim_json(flagged_agents, 2000)
        summary_str = _trim_json(simulation_summary, 1500)

        # Compact single-line lending digest (never truncated — always fits)
        if lending_analytics:
            la = lending_analytics
            lending_str = (
                f"80% MaxLTV | 85% LiqThreshold | 5% LiqBonus | "
                f"PeakDebt=${la.get('peak_debt_usdc',0):,.0f} | "
                f"MinHF={la.get('min_health_factor_seen',9999):.3f} | "
                f"BadDebtSteps={la.get('bad_debt_steps',0)} | "
                f"PeakBadDebt=${la.get('peak_bad_debt_usdc',0):,.0f} | "
                f"Liquidations={la.get('total_liquidations',0)} | "
                f"BankRuns={la.get('bank_run_count',0)} | "
                f"WhaleLeverageLoops={la.get('total_whale_leverage_loops',0)} | "
                f"PeakUtil={la.get('peak_utilization_rate',0)*100:.1f}% | "
                f"PeakCollateralETH={la.get('peak_collateral_eth',0):.2f}"
            )
        else:
            lending_str = "No lending data collected."

        return f"""
You are a top-tier DeFi Security Auditor. A stress test simulation driven by AI agents was just completed.
Review the following simulation data and provide a comprehensive audit report.

1. System Configuration (from config.py, defining the baseline rules and environment limits):
```python
{config_str}
```

2. Agent Configurations (How each agent was instructed to behave):
{agents_str}

3. Simulation Process Events (Timeline of what happened during the simulation: steps, prices, spread and agent actions):
{events_str}

4. Flagged Agents (Agents that drove the system to unsafe extremes, broken invariants, caused massive price impacts, etc.):
{flagged_str}

5. Simulation Final Summary (Aggregated actions per agent):
{summary_str}

6. Lending Module Analytics (HF = CollateralValue × 0.85 / Debt; HF < 1.0 → liquidatable):
{lending_str}

Format your response in Markdown with the following structured sections:
# 🛡️ Comprehensive Agentic Stress Test Audit

## 1. Market & Environment Review
Briefly summarize the starting conditions based on the config.

## 2. Lending Module Health
Using **section 6 numbers**, analyze:
- Did any position fall below HF 1.0? Were they liquidated on time or did bad debt accumulate?
- Did the Whale loop-leverage strategy work or did it create underwater positions?
- Did the MEV Searcher successfully liquidate & dump? What was the cascade effect on AMM price?
- Did the Institutional lender trigger a Bank Run (bad debt detected)? Quote the bad_debt_steps count.
- Was utilization rate sustainable? Quote the peak_utilization_rate.

## 3. Agent Behavior Analysis
Detail how the intelligent agents (e.g. Retail, Whales, MEV) behaved according to their configurations. Refer to specific events in the Simulation Process Events to highlight interesting scenarios.

## 4. Breaking Points & Vulnerabilities
Identify EXACTLY what broke (e.g. AMM pricing logic, smart contract invariant, liquidity pool depletion, frontend metrics mismatch).
Explicitly name the accounts/agents (from the `Flagged Agents` list) that breached the system.
Identify EXACTLY which specific component (e.g. AMM, contract, liquidity pool) was broken or manipulated and HOW.
Provide the SPECIFIC FUNCTION NAMES that were exploited or called during the vulnerability (e.g., `execute_liquidation`, `execute_borrow`, `execute_swap`, `execute_deposit_collateral`, `execute_withdraw_collateral`).
If nothing broke severely, state that the protocol withstood the stress test but highlight any near-misses.

## 5. Suggested Remediations
Provide HIGHLY SPECIFIC, actionable advice! Give exact code-level suggestions on how to fix the smart contract, adjust the AMM curves (e.g., fee tiers, slippage limits), or tune liquidity limits to prevent these exact breaches.

Stay professional, concise, and highly analytical.
"""

    prompt = _render()

    # If still over limit, progressively shrink events → agents → config
    if _estimate_tokens(prompt) > _TOTAL_TOKEN_LIMIT:
        prompt = _render(ev_budget=3000, ag_budget=2000, cfg_budget=1200)

    if _estimate_tokens(prompt) > _TOTAL_TOKEN_LIMIT:
        prompt = _render(ev_budget=1500, ag_budget=1000, cfg_budget=800)

    if _estimate_tokens(prompt) > _TOTAL_TOKEN_LIMIT:
        # Last resort: hard-truncate the whole prompt
        max_chars = int(_TOTAL_TOKEN_LIMIT * _CHARS_PER_TOKEN)
        prompt = prompt[:max_chars] + "\n\n[Prompt truncated to fit token limit]"
        logger.warning("Audit prompt hard-truncated to %d chars", max_chars)

    est = _estimate_tokens(prompt)
    logger.info("Audit prompt estimated tokens: ~%d", est)
    return prompt


def generate_audit_report(
    config_content: str,
    agent_configs: Dict[str, Any],
    flagged_agents: List[Dict[str, Any]],
    simulation_summary: Dict[str, Any],
    simulation_events: List[Dict[str, Any]],
    lending_analytics: Dict[str, Any] = None,
) -> str:
    """Generate the LLM audit report summarizing agent behavior and breaks."""
    from nlp_engine import _groq_client
    if not _groq_client:
        return "⚠️ **Audit Generation Unavailable**: Groq API key is missing or invalid. Please check your `.env` file."

    prompt = _build_prompt(config_content, agent_configs, flagged_agents,
                           simulation_summary, simulation_events,
                           lending_analytics or {})

    try:
        response = _groq_client.chat.completions.create(
            model=config.GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=2048,
            timeout=config.GROQ_TIMEOUT,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        logger.error(f"Audit generation failed: {e}")
        return f"⚠️ **Error generating audit**: {e}"