"""
|
|
log_analyzer.py — Reads P5G NF container logs and active Prometheus/Alertmanager
|
|
data to produce a structured list of recommended remediation actions, grouped
|
|
by category. This is the data backend powering the /api/actions endpoint.
|
|
"""
|
|
|
|
import asyncio
|
|
import re
|
|
import time
|
|
from collections import deque
|
|
from datetime import datetime
|
|
|
|
from app.config import CONTAINER_HOST, CONTAINER_RUNTIME
|
|
|
|
# ── In-memory history (up to 96 snapshots ≈ 48 min at 30 s refresh) ────────
# deque(maxlen=96) acts as a fixed-size ring buffer: once full, appending a
# new snapshot silently evicts the oldest. Written by analyze_logs(), read
# (as a plain list) by get_history().
_history: deque = deque(maxlen=96)
|
|
|
|
# ── Category colour palette ──────────────────────────────────────────────────
# Hex colour per category name; analyze_logs() copies these into each category
# group it emits (presumably consumed by the frontend for colour-coding —
# confirm against the /api/actions consumer).
CATEGORY_COLORS: dict[str, str] = {
    "Registration": "#3b82f6",
    "Sessions": "#7c3aed",
    "Authentication": "#f59e0b",
    "Connectivity": "#06b6d4",
    "Policy": "#10b981",
    "Security": "#ef4444",
}
|
|
|
|
# All categories in canonical display order (left side, right side).
# analyze_logs() pre-seeds its grouping dict in this order, so every category
# always appears in the output (and in history snapshots) even with 0 issues.
ALL_CATEGORIES = ["Registration", "Authentication", "Security",
                  "Sessions", "Connectivity", "Policy"]
|
|
|
|
# ── Log-pattern definitions ──────────────────────────────────────────────────
# Each entry: (regex, affected_nf, severity, short_description, remediation)
# analyze_logs() runs each regex against the matching NF's log tail via
# _match_count(), which applies re.IGNORECASE | re.MULTILINE — so the
# [Ff]/[Tt]/[Ee]-style character classes below are redundant at match time
# and kept purely for readability of the intended casing.
# severity is either "critical" or "warning" and is passed through verbatim
# into the issue dicts returned by analyze_logs().
CATEGORY_PATTERNS: dict[str, list[tuple]] = {
    "Registration": [
        (r"RegistrationFailure|UeRegistrationFailed|N1.*[Rr]egistration.*[Ff]ail",
         "AMF", "critical",
         "UE registration failure",
         "Check AMF logs for NGAP errors; verify UE credentials and NRF registration."),
        (r"N2SetupFail|NgapSetupFail|N2.*[Tt]imeout|NgapProcedure.*failed",
         "AMF", "critical",
         "N2 interface setup failure",
         "Verify gNB connectivity to AMF; check SCTP transport and NGAP PLMN config."),
        (r"InitialContextSetupFail|UeContextRelease.*[Aa]bnormal",
         "AMF", "warning",
         "UE context setup failure",
         "Review AMF-SMF N11 interface; check subscriber profile in UDM/UDR."),
        (r"PagingFail|UeUnreachable|UeNotFound",
         "AMF", "warning",
         "UE paging failure",
         "Verify UE is registered; check AMF tracking area configuration."),
    ],
    "Sessions": [
        (r"PduSessionEstablishmentReject|PduSession.*[Ff]ail|CreateSessionResponse.*[Ff]ail",
         "SMF", "critical",
         "PDU session establishment failure",
         "Check SMF-UPF N4 path; verify DNN/APN config and UPF N3/N9 interfaces."),
        (r"N4Session.*[Ff]ail|PfcpSession.*[Ee]rror|N4.*[Tt]imeout|PfcpAssociation.*[Ff]ail",
         "UPF", "critical",
         "N4/PFCP session error",
         "Restart PFCP association between SMF and UPF; check N4 IP reachability."),
        (r"IpAllocationFail|AddressPoolExhausted|NoIpAvailable",
         "SMF", "critical",
         "IP address pool exhausted",
         "Expand UE IP address pool in SMF config; review active session count."),
        (r"SessionModification.*[Ff]ail|BearerModification.*[Ee]rror",
         "SMF", "warning",
         "Session modification failure",
         "Check PCF policy consistency; verify QoS parameters match UPF capabilities."),
    ],
    "Authentication": [
        (r"AuthenticationFailure|AuthReject|EapFailure|5g-aka.*[Ff]ail|EapAkaFailure",
         "AUSF", "critical",
         "UE authentication failure",
         "Verify USIM credentials match UDM subscriber data; check AUSF-UDM N12 link."),
        (r"UdmAuthReq.*[Ee]rror|SuciDeconceal.*[Ff]ail|UdmUeAuth.*[Ee]rror",
         "UDM", "critical",
         "UDM authentication error",
         "Check UDM-UDR N35 connectivity; verify Home Network Public Key configuration."),
        (r"AuthVectorFetch.*[Ff]ail|AusfUeAuth.*[Rr]eject|HssAuth.*[Ff]ail",
         "AUSF", "warning",
         "Auth vector fetch failure",
         "Review UDR data integrity for affected SUPI; check AUSF-UDM TLS certificates."),
    ],
    "Connectivity": [
        (r"NfDiscovery.*[Ff]ail|NrfRegistration.*[Ff]ail|NfDeregistration.*unexpect",
         "NRF", "warning",
         "NF service discovery failure",
         "Verify NRF is reachable from all NFs; check NRF registration TTL and heartbeat."),
        (r"ServiceUnavailable.*NF|HTTP.*503.*NF|NfProfile.*expired",
         "NRF", "warning",
         "NF service unavailable",
         "Check NF pod health and SBI listen port; review NRF subscription notifications."),
        (r"SbiRequest.*[Tt]imeout|SbiConn.*[Ff]ail|Http2.*[Ee]rror",
         "NRF", "warning",
         "SBI interface timeout",
         "Inspect inter-NF network MTU and TLS handshake; check load balancer config."),
    ],
    "Policy": [
        (r"PcfSmPolicy.*[Ee]rror|PolicyDecision.*[Ff]ail|SmPolicy.*[Rr]eject",
         "PCF", "warning",
         "Policy decision failure",
         "Review PCF policy rules and subscriber group config; check PCF-UDR N36 link."),
        (r"QosEnforce.*[Ff]ail|ChargingRule.*[Ee]rror|PccRule.*[Rr]eject",
         "PCF", "warning",
         "QoS policy enforcement failure",
         "Verify QoS profiles match UPF capabilities; check PCF-CHF N40 charging path."),
    ],
    "Security": [
        (r"SecurityMode.*[Ff]ail|IntegrityCheck.*[Ff]ail|NasIntegrity.*[Ee]rror",
         "AMF", "critical",
         "NAS security mode failure",
         "Check AMF cipher/integrity algorithm priority list matches UE capabilities."),
        (r"TlsHandshake.*[Ff]ail|Certificate.*[Ee]xpir|x509.*[Ee]rror|CertVerify.*[Ff]ail",
         "AMF", "critical",
         "TLS/certificate error",
         "Renew expired certificates; verify trust chain between NFs; check SBI TLS config."),
        (r"SuciProtection.*[Ff]ail|PrivacyProtection.*[Ee]rror|HomeNetworkKey.*[Ee]rror",
         "UDM", "warning",
         "SUCI privacy protection error",
         "Verify Home Network Public Key provisioning on UDM; check SUPI revealing config."),
    ],
}
|
|
|
|
# ── NF → possible container name fragments (tried in order) ─────────────────
# _discover_containers() substring-matches each fragment against the
# lowercased names reported by `<runtime> ps`; the first fragment with a
# match wins for that NF.
NF_CONTAINER_HINTS: dict[str, list[str]] = {
    "AMF": ["amf"],
    "SMF": ["smf"],
    "UPF": ["upf"],
    "NRF": ["nrf"],
    "UDM": ["udm"],
    "AUSF": ["ausf"],
    "PCF": ["pcf"],
}
|
|
|
|
# ── Container discovery cache ────────────────────────────────────────────────
# Last NF→container-name mapping plus the time.monotonic() stamp of when it
# was built; _discover_containers() reuses a non-empty mapping for 60 s.
_container_cache: dict[str, str] = {}
_container_cache_ts: float = 0.0
|
|
|
|
|
|
async def _discover_containers() -> dict[str, str]:
    """Map NF names (AMF, SMF, ...) to actual container names via the runtime CLI.

    A non-empty mapping is reused for 60 s to avoid shelling out on every
    refresh. An empty mapping is still stored but never considered fresh,
    so a failed discovery is retried on the next call. Runtime errors and
    timeouts are swallowed and treated as "no containers found".
    """
    global _container_cache, _container_cache_ts

    now = time.monotonic()
    if _container_cache and (now - _container_cache_ts) < 60:
        return _container_cache

    running: list[str] = []
    try:
        argv = [CONTAINER_RUNTIME]
        if CONTAINER_HOST:
            argv += ["--host", CONTAINER_HOST]
        argv += ["ps", "--format", "{{.Names}}"]
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        out, _ = await asyncio.wait_for(proc.communicate(), timeout=5)
        running = [line.strip() for line in out.decode().splitlines() if line.strip()]
    except Exception:
        running = []

    discovered: dict[str, str] = {}
    for nf, fragments in NF_CONTAINER_HINTS.items():
        for fragment in fragments:
            hit = next((name for name in running if fragment in name.lower()), None)
            if hit is not None:
                discovered[nf] = hit
                break  # first matching fragment wins for this NF

    _container_cache = discovered
    _container_cache_ts = now
    return discovered
|
|
|
|
|
|
async def _read_logs(container: str, tail: int = 400) -> str:
    """Return the last *tail* lines of a container's logs, stdout + stderr combined.

    Any failure (runtime missing, container gone, 8 s timeout) yields "" so
    callers can treat the log text as simply absent.
    """
    argv = [CONTAINER_RUNTIME]
    if CONTAINER_HOST:
        argv += ["--host", CONTAINER_HOST]
    argv += ["logs", "--tail", str(tail), container]
    try:
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        out, err = await asyncio.wait_for(proc.communicate(), timeout=8)
    except Exception:
        return ""
    # Some runtimes write log lines to stderr; concatenate both streams.
    return out.decode("utf-8", errors="replace") + err.decode("utf-8", errors="replace")
|
|
|
|
|
|
def _match_count(text: str, pattern: str) -> int:
|
|
if not text:
|
|
return 0
|
|
try:
|
|
return len(re.findall(pattern, text, re.IGNORECASE | re.MULTILINE))
|
|
except re.error:
|
|
return 0
|
|
|
|
|
|
# ── Category/NF mapping for Alertmanager alerts ──────────────────────────────
|
|
|
|
def _alert_category(alert: dict) -> str:
|
|
name = (alert.get("name", "") + " " + alert.get("summary", "")).lower()
|
|
if any(k in name for k in ["register", "attach", "ngap", "n2"]):
|
|
return "Registration"
|
|
if any(k in name for k in ["session", "pdu", "bearer", "smf_", "upf_", "n4", "pfcp"]):
|
|
return "Sessions"
|
|
if any(k in name for k in ["auth", "ausf", "udm_", "supi", "aka", "eap"]):
|
|
return "Authentication"
|
|
if any(k in name for k in ["nrf", "discovery", "unavailable", "sbi", "connect"]):
|
|
return "Connectivity"
|
|
if any(k in name for k in ["pcf", "policy", "qos", "pcc", "charge"]):
|
|
return "Policy"
|
|
if any(k in name for k in ["tls", "cert", "security", "cipher", "integ", "suci"]):
|
|
return "Security"
|
|
return "Connectivity"
|
|
|
|
|
|
def _alert_nf(alert: dict) -> str:
    """Guess which NF an alert concerns by scanning its name + instance fields.

    Returns the first NF (in ALL_NFS order) whose name appears in the text,
    or "System" when none does.
    """
    from app.config import ALL_NFS  # local import, mirroring the original scope

    blob = (alert.get("name", "") + alert.get("instance", "")).lower()
    return next((nf for nf in ALL_NFS if nf.lower() in blob), "System")
|
|
|
|
|
|
# ── Main analysis entry point ────────────────────────────────────────────────
|
|
|
|
async def analyze_logs() -> dict:
    """
    Gather log-pattern issues + Prometheus NF status + Alertmanager alerts
    and group them into remediation categories.

    Returns a dict ready for JSON serialisation:
      total       — sum of all issue counts across categories
      categories  — one entry per category: name, color, count, issues
      timestamp   — ISO-8601 generation time
      log_sources — NFs whose container logs were actually read
      cluster     — cluster inventory (or a disabled stub on failure)

    Side effect: appends a per-category count snapshot to the in-memory
    history ring buffer consumed by get_history().
    """
    # Imported here (not at module top), matching the original — presumably
    # to avoid a circular import between service modules; confirm before
    # hoisting.
    from app.services import alertmanager, prometheus, cluster_inventory

    # Kick off all I/O in parallel.
    containers_f = asyncio.create_task(_discover_containers())
    alerts_f = asyncio.create_task(alertmanager.get_alerts())
    nf_status_f = asyncio.create_task(prometheus.get_nf_status())
    cluster_f = asyncio.create_task(cluster_inventory.get_cluster_inventory())

    containers = await containers_f  # handles its own errors, returns {} at worst
    alerts, nf_statuses, cluster = await asyncio.gather(alerts_f, nf_status_f, cluster_f,
                                                        return_exceptions=True)
    # Degrade gracefully: a failed upstream source simply contributes no data.
    if isinstance(alerts, Exception):
        alerts = []
    if isinstance(nf_statuses, Exception):
        nf_statuses = []
    if isinstance(cluster, Exception):
        cluster = {"enabled": False, "nodes": []}

    # Read all discovered container logs concurrently.
    log_tasks = {nf: asyncio.create_task(_read_logs(cname))
                 for nf, cname in containers.items()}
    log_texts: dict[str, str] = {}
    if log_tasks:
        log_results = await asyncio.gather(*log_tasks.values(), return_exceptions=True)
        for nf, result in zip(log_tasks.keys(), log_results):
            log_texts[nf] = result if isinstance(result, str) else ""

    issues: list[dict] = []

    # 1. Log-pattern analysis: count regex hits per NF log tail.
    for category, patterns in CATEGORY_PATTERNS.items():
        for (pat_re, nf, severity, description, remediation) in patterns:
            count = _match_count(log_texts.get(nf, ""), pat_re)
            if count:
                issues.append({
                    # len(issues) keeps ids unique across patterns for the same NF.
                    "id": f"log-{nf}-{len(issues)}",
                    "category": category,
                    "nf": nf,
                    "severity": severity,
                    "count": count,
                    "description": description,
                    "remediation": remediation,
                    "source": "log",
                })

    # 2. NF-down events from Prometheus.
    for nf_st in nf_statuses:
        if isinstance(nf_st, dict) and nf_st.get("state") == "down":
            node_text = ", ".join(node["hostname"] for node in nf_st.get("nodes", []))
            issues.append({
                "id": f"nf-down-{nf_st['name']}",
                "category": "Connectivity",
                "nf": nf_st["name"],
                "node": node_text,
                "severity": "critical",
                "count": 1,
                "description": f"{nf_st['name']} is unreachable",
                "remediation": (f"Check {node_text or 'the hosting node'} first, then run "
                                f"`{CONTAINER_RUNTIME} ps` and inspect `{nf_st['name'].lower()}` logs."),
                "source": "prometheus",
            })

    # 3. Active Alertmanager alerts.
    for alert in alerts:
        if isinstance(alert, dict):
            node_text = ", ".join(node["hostname"] for node in alert.get("nodes", []))
            issues.append({
                "id": f"alert-{alert.get('name', '')}-{len(issues)}",
                "category": _alert_category(alert),
                "nf": _alert_nf(alert),
                "node": node_text,
                "severity": alert.get("severity", "warning"),
                "count": 1,
                "description": alert.get("summary") or alert.get("name", "Unknown alert"),
                "remediation": "Investigate the active Alertmanager alert and follow runbook.",
                "source": "alertmanager",
            })

    # Group by category, pre-seeding the six canonical categories so each
    # always appears (even with zero issues) in display order.
    cats: dict[str, dict] = {}
    for cat_name in ALL_CATEGORIES:
        cats[cat_name] = {
            "name": cat_name,
            "color": CATEGORY_COLORS[cat_name],
            "count": 0,
            "issues": [],
        }

    for issue in issues:
        cat = issue["category"]
        if cat not in cats:
            # Defensive: unknown category gets a neutral grey fallback colour.
            cats[cat] = {"name": cat, "color": "#7a8499", "count": 0, "issues": []}
        cats[cat]["count"] += issue["count"]
        cats[cat]["issues"].append(issue)

    total = sum(c["count"] for c in cats.values())
    # Fix: plain copy instead of an identity comprehension.
    categories = list(cats.values())

    # Fix: take ONE timestamp so the API payload and the history snapshot
    # agree exactly (previously two separate datetime.now() calls could
    # differ by microseconds).
    now_iso = datetime.now().isoformat()

    result = {
        "total": total,
        "categories": categories,
        "timestamp": now_iso,
        "log_sources": list(containers.keys()),
        "cluster": cluster,
    }

    # Persist to the history ring-buffer (oldest entry evicted beyond 96).
    _history.append({
        "time": now_iso,
        "total": total,
        "by_category": {name: cats[name]["count"] for name in ALL_CATEGORIES},
    })

    return result
|
|
|
|
|
|
def get_history() -> list:
    """Return the accumulated history snapshots as a plain list (oldest first)."""
    return [*_history]
|