Start log ingestion and analysis

This commit is contained in:
Jake Kasper
2026-04-24 14:15:58 -04:00
parent c2537dd955
commit 9ac96cee9a
27 changed files with 1368 additions and 179 deletions

View File

@@ -1,14 +1,31 @@
"""Alertmanager client."""
"""Alert sources: Alertmanager plus log-derived alerts."""
import asyncio
import json
import httpx
from app.config import ALERTMANAGER_URL
from app.services import cluster_inventory
from app.services import cluster_inventory, log_ingest
_BASE = ALERTMANAGER_URL.rstrip("/")
async def get_alerts() -> list:
    """Return normalised list of active alerts from Alertmanager and log analysis.

    The Alertmanager HTTP poll and the (synchronous) log-analysis scan run
    concurrently; a failure in either source degrades to an empty contribution
    instead of failing the whole call.  The merged list is sorted by severity
    (critical first) and then by timestamp.
    """
    cluster = await cluster_inventory.get_cluster_inventory()
    alertmanager_task = asyncio.create_task(_get_alertmanager_alerts(cluster))
    # Log analysis is blocking; run it in a worker thread so it overlaps with
    # the Alertmanager request.
    log_task = asyncio.to_thread(_get_log_alerts, cluster)
    am_alerts, log_alerts = await asyncio.gather(alertmanager_task, log_task, return_exceptions=True)
    # With return_exceptions=True each failed source yields its exception
    # object; treat either failure as "no alerts from that source".
    if isinstance(am_alerts, Exception):
        am_alerts = []
    if isinstance(log_alerts, Exception):
        log_alerts = []
    return sorted(
        [*am_alerts, *log_alerts],
        key=lambda alert: (_severity_rank(alert.get("severity")), alert.get("timestamp", "")),
    )
async def _get_alertmanager_alerts(cluster: dict) -> list:
try:
async with httpx.AsyncClient(timeout=5) as client:
r = await client.get(f"{_BASE}/api/v2/alerts", params={"active": "true", "silenced": "false"})
@@ -17,7 +34,6 @@ async def get_alerts() -> list:
except Exception:
return []
cluster = await cluster_inventory.get_cluster_inventory()
alerts = []
for a in raw:
labels = a.get("labels", {})
@@ -33,10 +49,62 @@ async def get_alerts() -> list:
"summary": summary,
"nf": nf_name,
"nodes": nodes,
"source": "alertmanager",
"timestamp": a.get("startsAt", ""),
})
return alerts
def _get_log_alerts(cluster: dict) -> list:
    """Build normalised alert dicts from recent log-analysis matches.

    Cluster nodes are indexed by both hostname and address so the node name
    recorded with a log match can be resolved back to its inventory entry.
    """
    lookup: dict = {}
    for entry in cluster.get("nodes", []):
        for field in ("hostname", "address"):
            key = entry.get(field)
            if key:
                lookup[key] = entry
    results = []
    for ctx in log_ingest.recent_alert_context(limit=50):
        node_name = ctx.get("node", "")
        matched = lookup.get(node_name) if node_name else None
        before = _decode_context(ctx.get("before_context"))
        after = _decode_context(ctx.get("after_context"))
        results.append({
            "name": f"{ctx.get('nf') or 'System'} log anomaly",
            "severity": ctx.get("severity", "warning"),
            "instance": ctx.get("source", ""),
            "summary": ctx.get("description", "Log-derived alert"),
            "nf": ctx.get("nf", ""),
            "nodes": [matched] if matched else [],
            "source": "logs",
            "timestamp": ctx.get("event_ts", ""),
            "context_id": ctx.get("id"),
            "node": node_name,
            "match_message": ctx.get("match_message", ""),
            # Keep the preview small: last 3 lines before / first 3 after.
            "context_preview": {
                "before": before[-3:],
                "after": after[:3],
            },
        })
    return results
def _decode_context(value: str | None) -> list[dict]:
if not value:
return []
try:
data = json.loads(value)
return data if isinstance(data, list) else []
except Exception:
return []
def _severity_rank(severity: str | None) -> int:
return {"critical": 0, "warning": 1, "info": 2}.get((severity or "warning").lower(), 3)
def _infer_nf(name: str, summary: str, instance: str) -> str:
text = f"{name} {summary} {instance}".upper()
for nf_name in ["AMF", "SMF", "UPF", "UDM", "UDR", "NRF", "AUSF", "PCF", "MME", "SGWC", "DRA", "DSM"]: