started log ingestion and analysis

This commit is contained in:
Jake Kasper
2026-04-24 14:15:58 -04:00
parent c2537dd955
commit 9ac96cee9a
27 changed files with 1368 additions and 179 deletions

View File

@@ -1,8 +1,4 @@
"""
log_analyzer.py — Reads P5G NF container logs and active Prometheus/Alertmanager
data to produce a structured list of recommended remediation actions, grouped
by category. This is the data backend powering the /api/actions endpoint.
"""
"""Structured issue generation from ingested cross-node log events and state."""
import asyncio
import re
@@ -10,7 +6,13 @@ import time
from collections import deque
from datetime import datetime
from app.config import CONTAINER_HOST, CONTAINER_RUNTIME
from app.config import (
CONTAINER_HOST,
CONTAINER_RUNTIME,
LOG_ALERT_CONTEXT_AFTER,
LOG_ALERT_CONTEXT_BEFORE,
)
from app.services.log_rules import load_category_patterns
# ── In-memory history (up to 96 snapshots ≈ 48 min at 30 s refresh) ────────
# deque with maxlen silently evicts the oldest snapshot once full, so this
# buffer is bounded without any explicit trimming code.
_history: deque = deque(maxlen=96)
@@ -29,99 +31,6 @@ CATEGORY_COLORS: dict[str, str] = {
# Canonical ordering of issue categories used when grouping results.
ALL_CATEGORIES = [
    "Registration",
    "Authentication",
    "Security",
    "Sessions",
    "Connectivity",
    "Policy",
]
# ── Log-pattern definitions ──────────────────────────────────────────────────
# Each entry: (regex, affected_nf, severity, short_description, remediation)
# Each rule tuple: (regex, affected_nf, severity, short_description, remediation).
CATEGORY_PATTERNS: dict[str, list[tuple]] = {
    # UE attach and N2/NGAP signalling problems — all surfaced by the AMF.
    "Registration": [
        (
            r"RegistrationFailure|UeRegistrationFailed|N1.*[Rr]egistration.*[Ff]ail",
            "AMF",
            "critical",
            "UE registration failure",
            "Check AMF logs for NGAP errors; verify UE credentials and NRF registration.",
        ),
        (
            r"N2SetupFail|NgapSetupFail|N2.*[Tt]imeout|NgapProcedure.*failed",
            "AMF",
            "critical",
            "N2 interface setup failure",
            "Verify gNB connectivity to AMF; check SCTP transport and NGAP PLMN config.",
        ),
        (
            r"InitialContextSetupFail|UeContextRelease.*[Aa]bnormal",
            "AMF",
            "warning",
            "UE context setup failure",
            "Review AMF-SMF N11 interface; check subscriber profile in UDM/UDR.",
        ),
        (
            r"PagingFail|UeUnreachable|UeNotFound",
            "AMF",
            "warning",
            "UE paging failure",
            "Verify UE is registered; check AMF tracking area configuration.",
        ),
    ],
    # PDU-session lifecycle failures on the SMF/UPF side.
    "Sessions": [
        (
            r"PduSessionEstablishmentReject|PduSession.*[Ff]ail|CreateSessionResponse.*[Ff]ail",
            "SMF",
            "critical",
            "PDU session establishment failure",
            "Check SMF-UPF N4 path; verify DNN/APN config and UPF N3/N9 interfaces.",
        ),
        (
            r"N4Session.*[Ff]ail|PfcpSession.*[Ee]rror|N4.*[Tt]imeout|PfcpAssociation.*[Ff]ail",
            "UPF",
            "critical",
            "N4/PFCP session error",
            "Restart PFCP association between SMF and UPF; check N4 IP reachability.",
        ),
        (
            r"IpAllocationFail|AddressPoolExhausted|NoIpAvailable",
            "SMF",
            "critical",
            "IP address pool exhausted",
            "Expand UE IP address pool in SMF config; review active session count.",
        ),
        (
            r"SessionModification.*[Ff]ail|BearerModification.*[Ee]rror",
            "SMF",
            "warning",
            "Session modification failure",
            "Check PCF policy consistency; verify QoS parameters match UPF capabilities.",
        ),
    ],
    # 5G-AKA / EAP authentication problems across AUSF and UDM.
    "Authentication": [
        (
            r"AuthenticationFailure|AuthReject|EapFailure|5g-aka.*[Ff]ail|EapAkaFailure",
            "AUSF",
            "critical",
            "UE authentication failure",
            "Verify USIM credentials match UDM subscriber data; check AUSF-UDM N12 link.",
        ),
        (
            r"UdmAuthReq.*[Ee]rror|SuciDeconceal.*[Ff]ail|UdmUeAuth.*[Ee]rror",
            "UDM",
            "critical",
            "UDM authentication error",
            "Check UDM-UDR N35 connectivity; verify Home Network Public Key configuration.",
        ),
        (
            r"AuthVectorFetch.*[Ff]ail|AusfUeAuth.*[Rr]eject|HssAuth.*[Ff]ail",
            "AUSF",
            "warning",
            "Auth vector fetch failure",
            "Review UDR data integrity for affected SUPI; check AUSF-UDM TLS certificates.",
        ),
    ],
    # SBI/NRF discovery and inter-NF transport issues.
    "Connectivity": [
        (
            r"NfDiscovery.*[Ff]ail|NrfRegistration.*[Ff]ail|NfDeregistration.*unexpect",
            "NRF",
            "warning",
            "NF service discovery failure",
            "Verify NRF is reachable from all NFs; check NRF registration TTL and heartbeat.",
        ),
        (
            r"ServiceUnavailable.*NF|HTTP.*503.*NF|NfProfile.*expired",
            "NRF",
            "warning",
            "NF service unavailable",
            "Check NF pod health and SBI listen port; review NRF subscription notifications.",
        ),
        (
            r"SbiRequest.*[Tt]imeout|SbiConn.*[Ff]ail|Http2.*[Ee]rror",
            "NRF",
            "warning",
            "SBI interface timeout",
            "Inspect inter-NF network MTU and TLS handshake; check load balancer config.",
        ),
    ],
    # PCF policy-decision and QoS-enforcement failures.
    "Policy": [
        (
            r"PcfSmPolicy.*[Ee]rror|PolicyDecision.*[Ff]ail|SmPolicy.*[Rr]eject",
            "PCF",
            "warning",
            "Policy decision failure",
            "Review PCF policy rules and subscriber group config; check PCF-UDR N36 link.",
        ),
        (
            r"QosEnforce.*[Ff]ail|ChargingRule.*[Ee]rror|PccRule.*[Rr]eject",
            "PCF",
            "warning",
            "QoS policy enforcement failure",
            "Verify QoS profiles match UPF capabilities; check PCF-CHF N40 charging path.",
        ),
    ],
    # NAS security-mode, TLS/PKI and SUCI privacy errors.
    "Security": [
        (
            r"SecurityMode.*[Ff]ail|IntegrityCheck.*[Ff]ail|NasIntegrity.*[Ee]rror",
            "AMF",
            "critical",
            "NAS security mode failure",
            "Check AMF cipher/integrity algorithm priority list matches UE capabilities.",
        ),
        (
            r"TlsHandshake.*[Ff]ail|Certificate.*[Ee]xpir|x509.*[Ee]rror|CertVerify.*[Ff]ail",
            "AMF",
            "critical",
            "TLS/certificate error",
            "Renew expired certificates; verify trust chain between NFs; check SBI TLS config.",
        ),
        (
            r"SuciProtection.*[Ff]ail|PrivacyProtection.*[Ee]rror|HomeNetworkKey.*[Ee]rror",
            "UDM",
            "warning",
            "SUCI privacy protection error",
            "Verify Home Network Public Key provisioning on UDM; check SUPI revealing config.",
        ),
    ],
}
# ── NF → possible container name fragments (tried in order) ─────────────────
NF_CONTAINER_HINTS: dict[str, list[str]] = {
"AMF": ["amf"],
@@ -191,13 +100,13 @@ async def _read_logs(container: str, tail: int = 400) -> str:
return ""
def _match_count(text: str, pattern: str) -> int:
if not text:
return 0
def _rule_matches(message: str, pattern: str) -> bool:
if not message:
return False
try:
return len(re.findall(pattern, text, re.IGNORECASE | re.MULTILINE))
return bool(re.search(pattern, message, re.IGNORECASE | re.MULTILINE))
except re.error:
return 0
return False
# ── Category/NF mapping for Alertmanager alerts ──────────────────────────────
@@ -235,50 +144,104 @@ async def analyze_logs() -> dict:
Gather log-pattern issues + Prometheus NF status + Alertmanager alerts.
Returns a fully structured dict ready for JSON serialisation.
"""
from app.services import alertmanager, prometheus, cluster_inventory
from app.services import alertmanager, cluster_inventory, log_ingest, prometheus
# Kick off all I/O in parallel
containers_f = asyncio.create_task(_discover_containers())
alerts_f = asyncio.create_task(alertmanager.get_alerts())
nf_status_f = asyncio.create_task(prometheus.get_nf_status())
cluster_f = asyncio.create_task(cluster_inventory.get_cluster_inventory())
events_f = asyncio.to_thread(log_ingest.get_events)
containers = await containers_f
alerts, nf_statuses, cluster = await asyncio.gather(alerts_f, nf_status_f, cluster_f,
return_exceptions=True)
alerts, nf_statuses, cluster, events = await asyncio.gather(
alerts_f, nf_status_f, cluster_f, events_f, return_exceptions=True
)
if isinstance(alerts, Exception):
alerts = []
if isinstance(nf_statuses, Exception):
nf_statuses = []
if isinstance(cluster, Exception):
cluster = {"enabled": False, "nodes": []}
if isinstance(events, Exception):
events = []
# Read all container logs concurrently
log_tasks = {nf: asyncio.create_task(_read_logs(cname))
for nf, cname in containers.items()}
log_texts: dict[str, str] = {}
if log_tasks:
log_results = await asyncio.gather(*log_tasks.values(), return_exceptions=True)
for nf, result in zip(log_tasks.keys(), log_results):
log_texts[nf] = result if isinstance(result, str) else ""
events = sorted(
[event for event in events if isinstance(event, dict)],
key=lambda event: event.get("epoch", 0.0),
)
issues: list[dict] = []
grouped_log_issues: dict[tuple[str, str, str, str], dict] = {}
# 1. Log-pattern analysis
for category, patterns in CATEGORY_PATTERNS.items():
for (pat_re, nf, severity, description, remediation) in patterns:
count = _match_count(log_texts.get(nf, ""), pat_re)
if count:
issues.append({
"id": f"log-{nf}-{len(issues)}",
"category": category,
"nf": nf,
"severity": severity,
"count": count,
"description": description,
"remediation": remediation,
"source": "log",
})
# 1. Time-ordered log-pattern analysis across all nodes.
for idx, event in enumerate(events):
message = event.get("message", "")
event_nf = str(event.get("nf", "")).upper()
event_node = event.get("node", "")
for category, patterns in load_category_patterns().items():
for rule in patterns:
rule_nf = str(rule["nf"]).upper()
if event_nf != rule_nf:
continue
if not _rule_matches(message, rule["pattern"]):
continue
before_context = events[max(0, idx - LOG_ALERT_CONTEXT_BEFORE):idx]
after_context = events[idx + 1:idx + 1 + LOG_ALERT_CONTEXT_AFTER]
context_id = log_ingest.record_alert_context(
category=category,
nf=rule_nf,
node=event_node,
severity=rule["severity"],
description=rule["description"],
remediation=rule["remediation"],
source="fluentbit",
event=event,
before_context=before_context,
after_context=after_context,
)
issue_key = (category, rule_nf, event_node, rule["description"])
if issue_key not in grouped_log_issues:
grouped_log_issues[issue_key] = {
"id": f"log-{rule_nf}-{len(grouped_log_issues)}",
"category": category,
"nf": rule_nf,
"node": event_node,
"severity": rule["severity"],
"count": 0,
"description": rule["description"],
"remediation": rule["remediation"],
"source": "fluentbit",
"context_id": context_id,
}
grouped_log_issues[issue_key]["count"] += 1
issues.extend(grouped_log_issues.values())
# Fallback to local container logs until Fluent Bit has populated the buffer.
if not issues and not events:
log_tasks = {nf: asyncio.create_task(_read_logs(cname)) for nf, cname in containers.items()}
if log_tasks:
log_results = await asyncio.gather(*log_tasks.values(), return_exceptions=True)
log_texts = {
nf: result if isinstance(result, str) else ""
for nf, result in zip(log_tasks.keys(), log_results)
}
for category, patterns in load_category_patterns().items():
for rule in patterns:
nf = rule["nf"]
if _rule_matches(log_texts.get(nf, ""), rule["pattern"]):
issues.append({
"id": f"log-{nf}-{len(issues)}",
"category": category,
"nf": nf,
"severity": rule["severity"],
"count": 1,
"description": rule["description"],
"remediation": rule["remediation"],
"source": "local-log-fallback",
})
# 2. NF-down events from Prometheus
for nf_st in nf_statuses:
@@ -337,7 +300,8 @@ async def analyze_logs() -> dict:
"total": total,
"categories": categories,
"timestamp": datetime.now().isoformat(),
"log_sources": list(containers.keys()),
"log_sources": sorted({f"{event.get('node', 'unknown')}:{event.get('nf', 'SYSTEM')}" for event in events}) or list(containers.keys()),
"log_ingest": log_ingest.receiver_status(),
"cluster": cluster,
}