started log ingestion and analysis

This commit is contained in:
Jake Kasper
2026-04-24 14:15:58 -04:00
parent c2537dd955
commit 9ac96cee9a
27 changed files with 1368 additions and 179 deletions

View File

@@ -1,8 +1,4 @@
"""
log_analyzer.py — Reads P5G NF container logs and active Prometheus/Alertmanager
data to produce a structured list of recommended remediation actions, grouped
by category. This is the data backend powering the /api/actions endpoint.
"""
"""Structured issue generation from ingested cross-node log events and state."""
import asyncio
import re
@@ -10,7 +6,13 @@ import time
from collections import deque
from datetime import datetime
from app.config import CONTAINER_HOST, CONTAINER_RUNTIME
from app.config import (
CONTAINER_HOST,
CONTAINER_RUNTIME,
LOG_ALERT_CONTEXT_AFTER,
LOG_ALERT_CONTEXT_BEFORE,
)
from app.services.log_rules import load_category_patterns
# ── In-memory history (up to 96 snapshots ≈ 48 min at 30 s refresh) ────────
# deque with maxlen silently evicts the oldest snapshot once full, so this
# buffer is bounded without any explicit trimming code.
_history: deque = deque(maxlen=96)
@@ -29,99 +31,6 @@ CATEGORY_COLORS: dict[str, str] = {
# Canonical ordering of issue categories used when grouping results.
ALL_CATEGORIES = [
    "Registration",
    "Authentication",
    "Security",
    "Sessions",
    "Connectivity",
    "Policy",
]
# ── Log-pattern definitions ──────────────────────────────────────────────────
# Each entry: (regex, affected_nf, severity, short_description, remediation)
# Each rule tuple: (regex, affected_nf, severity, short_description, remediation).
CATEGORY_PATTERNS: dict[str, list[tuple]] = {
    # UE attach and N2/NGAP signalling problems — all surfaced by the AMF.
    "Registration": [
        (
            r"RegistrationFailure|UeRegistrationFailed|N1.*[Rr]egistration.*[Ff]ail",
            "AMF",
            "critical",
            "UE registration failure",
            "Check AMF logs for NGAP errors; verify UE credentials and NRF registration.",
        ),
        (
            r"N2SetupFail|NgapSetupFail|N2.*[Tt]imeout|NgapProcedure.*failed",
            "AMF",
            "critical",
            "N2 interface setup failure",
            "Verify gNB connectivity to AMF; check SCTP transport and NGAP PLMN config.",
        ),
        (
            r"InitialContextSetupFail|UeContextRelease.*[Aa]bnormal",
            "AMF",
            "warning",
            "UE context setup failure",
            "Review AMF-SMF N11 interface; check subscriber profile in UDM/UDR.",
        ),
        (
            r"PagingFail|UeUnreachable|UeNotFound",
            "AMF",
            "warning",
            "UE paging failure",
            "Verify UE is registered; check AMF tracking area configuration.",
        ),
    ],
    # PDU-session lifecycle failures on the SMF/UPF side.
    "Sessions": [
        (
            r"PduSessionEstablishmentReject|PduSession.*[Ff]ail|CreateSessionResponse.*[Ff]ail",
            "SMF",
            "critical",
            "PDU session establishment failure",
            "Check SMF-UPF N4 path; verify DNN/APN config and UPF N3/N9 interfaces.",
        ),
        (
            r"N4Session.*[Ff]ail|PfcpSession.*[Ee]rror|N4.*[Tt]imeout|PfcpAssociation.*[Ff]ail",
            "UPF",
            "critical",
            "N4/PFCP session error",
            "Restart PFCP association between SMF and UPF; check N4 IP reachability.",
        ),
        (
            r"IpAllocationFail|AddressPoolExhausted|NoIpAvailable",
            "SMF",
            "critical",
            "IP address pool exhausted",
            "Expand UE IP address pool in SMF config; review active session count.",
        ),
        (
            r"SessionModification.*[Ff]ail|BearerModification.*[Ee]rror",
            "SMF",
            "warning",
            "Session modification failure",
            "Check PCF policy consistency; verify QoS parameters match UPF capabilities.",
        ),
    ],
    # 5G-AKA / EAP authentication problems across AUSF and UDM.
    "Authentication": [
        (
            r"AuthenticationFailure|AuthReject|EapFailure|5g-aka.*[Ff]ail|EapAkaFailure",
            "AUSF",
            "critical",
            "UE authentication failure",
            "Verify USIM credentials match UDM subscriber data; check AUSF-UDM N12 link.",
        ),
        (
            r"UdmAuthReq.*[Ee]rror|SuciDeconceal.*[Ff]ail|UdmUeAuth.*[Ee]rror",
            "UDM",
            "critical",
            "UDM authentication error",
            "Check UDM-UDR N35 connectivity; verify Home Network Public Key configuration.",
        ),
        (
            r"AuthVectorFetch.*[Ff]ail|AusfUeAuth.*[Rr]eject|HssAuth.*[Ff]ail",
            "AUSF",
            "warning",
            "Auth vector fetch failure",
            "Review UDR data integrity for affected SUPI; check AUSF-UDM TLS certificates.",
        ),
    ],
    # SBI/NRF discovery and inter-NF transport issues.
    "Connectivity": [
        (
            r"NfDiscovery.*[Ff]ail|NrfRegistration.*[Ff]ail|NfDeregistration.*unexpect",
            "NRF",
            "warning",
            "NF service discovery failure",
            "Verify NRF is reachable from all NFs; check NRF registration TTL and heartbeat.",
        ),
        (
            r"ServiceUnavailable.*NF|HTTP.*503.*NF|NfProfile.*expired",
            "NRF",
            "warning",
            "NF service unavailable",
            "Check NF pod health and SBI listen port; review NRF subscription notifications.",
        ),
        (
            r"SbiRequest.*[Tt]imeout|SbiConn.*[Ff]ail|Http2.*[Ee]rror",
            "NRF",
            "warning",
            "SBI interface timeout",
            "Inspect inter-NF network MTU and TLS handshake; check load balancer config.",
        ),
    ],
    # PCF policy-decision and QoS-enforcement failures.
    "Policy": [
        (
            r"PcfSmPolicy.*[Ee]rror|PolicyDecision.*[Ff]ail|SmPolicy.*[Rr]eject",
            "PCF",
            "warning",
            "Policy decision failure",
            "Review PCF policy rules and subscriber group config; check PCF-UDR N36 link.",
        ),
        (
            r"QosEnforce.*[Ff]ail|ChargingRule.*[Ee]rror|PccRule.*[Rr]eject",
            "PCF",
            "warning",
            "QoS policy enforcement failure",
            "Verify QoS profiles match UPF capabilities; check PCF-CHF N40 charging path.",
        ),
    ],
    # NAS security-mode, TLS/PKI and SUCI privacy errors.
    "Security": [
        (
            r"SecurityMode.*[Ff]ail|IntegrityCheck.*[Ff]ail|NasIntegrity.*[Ee]rror",
            "AMF",
            "critical",
            "NAS security mode failure",
            "Check AMF cipher/integrity algorithm priority list matches UE capabilities.",
        ),
        (
            r"TlsHandshake.*[Ff]ail|Certificate.*[Ee]xpir|x509.*[Ee]rror|CertVerify.*[Ff]ail",
            "AMF",
            "critical",
            "TLS/certificate error",
            "Renew expired certificates; verify trust chain between NFs; check SBI TLS config.",
        ),
        (
            r"SuciProtection.*[Ff]ail|PrivacyProtection.*[Ee]rror|HomeNetworkKey.*[Ee]rror",
            "UDM",
            "warning",
            "SUCI privacy protection error",
            "Verify Home Network Public Key provisioning on UDM; check SUPI revealing config.",
        ),
    ],
}
# ── NF → possible container name fragments (tried in order) ─────────────────
NF_CONTAINER_HINTS: dict[str, list[str]] = {
"AMF": ["amf"],
@@ -191,13 +100,13 @@ async def _read_logs(container: str, tail: int = 400) -> str:
return ""
def _match_count(text: str, pattern: str) -> int:
if not text:
return 0
def _rule_matches(message: str, pattern: str) -> bool:
if not message:
return False
try:
return len(re.findall(pattern, text, re.IGNORECASE | re.MULTILINE))
return bool(re.search(pattern, message, re.IGNORECASE | re.MULTILINE))
except re.error:
return 0
return False
# ── Category/NF mapping for Alertmanager alerts ──────────────────────────────
@@ -235,50 +144,104 @@ async def analyze_logs() -> dict:
Gather log-pattern issues + Prometheus NF status + Alertmanager alerts.
Returns a fully structured dict ready for JSON serialisation.
"""
from app.services import alertmanager, prometheus, cluster_inventory
from app.services import alertmanager, cluster_inventory, log_ingest, prometheus
# Kick off all I/O in parallel
containers_f = asyncio.create_task(_discover_containers())
alerts_f = asyncio.create_task(alertmanager.get_alerts())
nf_status_f = asyncio.create_task(prometheus.get_nf_status())
cluster_f = asyncio.create_task(cluster_inventory.get_cluster_inventory())
events_f = asyncio.to_thread(log_ingest.get_events)
containers = await containers_f
alerts, nf_statuses, cluster = await asyncio.gather(alerts_f, nf_status_f, cluster_f,
return_exceptions=True)
alerts, nf_statuses, cluster, events = await asyncio.gather(
alerts_f, nf_status_f, cluster_f, events_f, return_exceptions=True
)
if isinstance(alerts, Exception):
alerts = []
if isinstance(nf_statuses, Exception):
nf_statuses = []
if isinstance(cluster, Exception):
cluster = {"enabled": False, "nodes": []}
if isinstance(events, Exception):
events = []
# Read all container logs concurrently
log_tasks = {nf: asyncio.create_task(_read_logs(cname))
for nf, cname in containers.items()}
log_texts: dict[str, str] = {}
if log_tasks:
log_results = await asyncio.gather(*log_tasks.values(), return_exceptions=True)
for nf, result in zip(log_tasks.keys(), log_results):
log_texts[nf] = result if isinstance(result, str) else ""
events = sorted(
[event for event in events if isinstance(event, dict)],
key=lambda event: event.get("epoch", 0.0),
)
issues: list[dict] = []
grouped_log_issues: dict[tuple[str, str, str, str], dict] = {}
# 1. Log-pattern analysis
for category, patterns in CATEGORY_PATTERNS.items():
for (pat_re, nf, severity, description, remediation) in patterns:
count = _match_count(log_texts.get(nf, ""), pat_re)
if count:
issues.append({
"id": f"log-{nf}-{len(issues)}",
"category": category,
"nf": nf,
"severity": severity,
"count": count,
"description": description,
"remediation": remediation,
"source": "log",
})
# 1. Time-ordered log-pattern analysis across all nodes.
for idx, event in enumerate(events):
message = event.get("message", "")
event_nf = str(event.get("nf", "")).upper()
event_node = event.get("node", "")
for category, patterns in load_category_patterns().items():
for rule in patterns:
rule_nf = str(rule["nf"]).upper()
if event_nf != rule_nf:
continue
if not _rule_matches(message, rule["pattern"]):
continue
before_context = events[max(0, idx - LOG_ALERT_CONTEXT_BEFORE):idx]
after_context = events[idx + 1:idx + 1 + LOG_ALERT_CONTEXT_AFTER]
context_id = log_ingest.record_alert_context(
category=category,
nf=rule_nf,
node=event_node,
severity=rule["severity"],
description=rule["description"],
remediation=rule["remediation"],
source="fluentbit",
event=event,
before_context=before_context,
after_context=after_context,
)
issue_key = (category, rule_nf, event_node, rule["description"])
if issue_key not in grouped_log_issues:
grouped_log_issues[issue_key] = {
"id": f"log-{rule_nf}-{len(grouped_log_issues)}",
"category": category,
"nf": rule_nf,
"node": event_node,
"severity": rule["severity"],
"count": 0,
"description": rule["description"],
"remediation": rule["remediation"],
"source": "fluentbit",
"context_id": context_id,
}
grouped_log_issues[issue_key]["count"] += 1
issues.extend(grouped_log_issues.values())
# Fallback to local container logs until Fluent Bit has populated the buffer.
if not issues and not events:
log_tasks = {nf: asyncio.create_task(_read_logs(cname)) for nf, cname in containers.items()}
if log_tasks:
log_results = await asyncio.gather(*log_tasks.values(), return_exceptions=True)
log_texts = {
nf: result if isinstance(result, str) else ""
for nf, result in zip(log_tasks.keys(), log_results)
}
for category, patterns in load_category_patterns().items():
for rule in patterns:
nf = rule["nf"]
if _rule_matches(log_texts.get(nf, ""), rule["pattern"]):
issues.append({
"id": f"log-{nf}-{len(issues)}",
"category": category,
"nf": nf,
"severity": rule["severity"],
"count": 1,
"description": rule["description"],
"remediation": rule["remediation"],
"source": "local-log-fallback",
})
# 2. NF-down events from Prometheus
for nf_st in nf_statuses:
@@ -337,7 +300,8 @@ async def analyze_logs() -> dict:
"total": total,
"categories": categories,
"timestamp": datetime.now().isoformat(),
"log_sources": list(containers.keys()),
"log_sources": sorted({f"{event.get('node', 'unknown')}:{event.get('nf', 'SYSTEM')}" for event in events}) or list(containers.keys()),
"log_ingest": log_ingest.receiver_status(),
"cluster": cluster,
}