started log ingestion and analysis
This commit is contained in:
@@ -1,8 +1,4 @@
|
||||
"""
|
||||
log_analyzer.py — Reads P5G NF container logs and active Prometheus/Alertmanager
|
||||
data to produce a structured list of recommended remediation actions, grouped
|
||||
by category. This is the data backend powering the /api/actions endpoint.
|
||||
"""
|
||||
"""Structured issue generation from ingested cross-node log events and state."""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
@@ -10,7 +6,13 @@ import time
|
||||
from collections import deque
|
||||
from datetime import datetime
|
||||
|
||||
from app.config import CONTAINER_HOST, CONTAINER_RUNTIME
|
||||
from app.config import (
|
||||
CONTAINER_HOST,
|
||||
CONTAINER_RUNTIME,
|
||||
LOG_ALERT_CONTEXT_AFTER,
|
||||
LOG_ALERT_CONTEXT_BEFORE,
|
||||
)
|
||||
from app.services.log_rules import load_category_patterns
|
||||
|
||||
# ── In-memory history (up to 96 snapshots ≈ 48 min at 30 s refresh) ────────
|
||||
_history: deque = deque(maxlen=96)
|
||||
@@ -29,99 +31,6 @@ CATEGORY_COLORS: dict[str, str] = {
|
||||
ALL_CATEGORIES = ["Registration", "Authentication", "Security",
|
||||
"Sessions", "Connectivity", "Policy"]
|
||||
|
||||
# ── Log-pattern definitions ──────────────────────────────────────────────────
|
||||
# Each entry: (regex, affected_nf, severity, short_description, remediation)
|
||||
CATEGORY_PATTERNS: dict[str, list[tuple]] = {
|
||||
"Registration": [
|
||||
(r"RegistrationFailure|UeRegistrationFailed|N1.*[Rr]egistration.*[Ff]ail",
|
||||
"AMF", "critical",
|
||||
"UE registration failure",
|
||||
"Check AMF logs for NGAP errors; verify UE credentials and NRF registration."),
|
||||
(r"N2SetupFail|NgapSetupFail|N2.*[Tt]imeout|NgapProcedure.*failed",
|
||||
"AMF", "critical",
|
||||
"N2 interface setup failure",
|
||||
"Verify gNB connectivity to AMF; check SCTP transport and NGAP PLMN config."),
|
||||
(r"InitialContextSetupFail|UeContextRelease.*[Aa]bnormal",
|
||||
"AMF", "warning",
|
||||
"UE context setup failure",
|
||||
"Review AMF-SMF N11 interface; check subscriber profile in UDM/UDR."),
|
||||
(r"PagingFail|UeUnreachable|UeNotFound",
|
||||
"AMF", "warning",
|
||||
"UE paging failure",
|
||||
"Verify UE is registered; check AMF tracking area configuration."),
|
||||
],
|
||||
"Sessions": [
|
||||
(r"PduSessionEstablishmentReject|PduSession.*[Ff]ail|CreateSessionResponse.*[Ff]ail",
|
||||
"SMF", "critical",
|
||||
"PDU session establishment failure",
|
||||
"Check SMF-UPF N4 path; verify DNN/APN config and UPF N3/N9 interfaces."),
|
||||
(r"N4Session.*[Ff]ail|PfcpSession.*[Ee]rror|N4.*[Tt]imeout|PfcpAssociation.*[Ff]ail",
|
||||
"UPF", "critical",
|
||||
"N4/PFCP session error",
|
||||
"Restart PFCP association between SMF and UPF; check N4 IP reachability."),
|
||||
(r"IpAllocationFail|AddressPoolExhausted|NoIpAvailable",
|
||||
"SMF", "critical",
|
||||
"IP address pool exhausted",
|
||||
"Expand UE IP address pool in SMF config; review active session count."),
|
||||
(r"SessionModification.*[Ff]ail|BearerModification.*[Ee]rror",
|
||||
"SMF", "warning",
|
||||
"Session modification failure",
|
||||
"Check PCF policy consistency; verify QoS parameters match UPF capabilities."),
|
||||
],
|
||||
"Authentication": [
|
||||
(r"AuthenticationFailure|AuthReject|EapFailure|5g-aka.*[Ff]ail|EapAkaFailure",
|
||||
"AUSF", "critical",
|
||||
"UE authentication failure",
|
||||
"Verify USIM credentials match UDM subscriber data; check AUSF-UDM N12 link."),
|
||||
(r"UdmAuthReq.*[Ee]rror|SuciDeconceal.*[Ff]ail|UdmUeAuth.*[Ee]rror",
|
||||
"UDM", "critical",
|
||||
"UDM authentication error",
|
||||
"Check UDM-UDR N35 connectivity; verify Home Network Public Key configuration."),
|
||||
(r"AuthVectorFetch.*[Ff]ail|AusfUeAuth.*[Rr]eject|HssAuth.*[Ff]ail",
|
||||
"AUSF", "warning",
|
||||
"Auth vector fetch failure",
|
||||
"Review UDR data integrity for affected SUPI; check AUSF-UDM TLS certificates."),
|
||||
],
|
||||
"Connectivity": [
|
||||
(r"NfDiscovery.*[Ff]ail|NrfRegistration.*[Ff]ail|NfDeregistration.*unexpect",
|
||||
"NRF", "warning",
|
||||
"NF service discovery failure",
|
||||
"Verify NRF is reachable from all NFs; check NRF registration TTL and heartbeat."),
|
||||
(r"ServiceUnavailable.*NF|HTTP.*503.*NF|NfProfile.*expired",
|
||||
"NRF", "warning",
|
||||
"NF service unavailable",
|
||||
"Check NF pod health and SBI listen port; review NRF subscription notifications."),
|
||||
(r"SbiRequest.*[Tt]imeout|SbiConn.*[Ff]ail|Http2.*[Ee]rror",
|
||||
"NRF", "warning",
|
||||
"SBI interface timeout",
|
||||
"Inspect inter-NF network MTU and TLS handshake; check load balancer config."),
|
||||
],
|
||||
"Policy": [
|
||||
(r"PcfSmPolicy.*[Ee]rror|PolicyDecision.*[Ff]ail|SmPolicy.*[Rr]eject",
|
||||
"PCF", "warning",
|
||||
"Policy decision failure",
|
||||
"Review PCF policy rules and subscriber group config; check PCF-UDR N36 link."),
|
||||
(r"QosEnforce.*[Ff]ail|ChargingRule.*[Ee]rror|PccRule.*[Rr]eject",
|
||||
"PCF", "warning",
|
||||
"QoS policy enforcement failure",
|
||||
"Verify QoS profiles match UPF capabilities; check PCF-CHF N40 charging path."),
|
||||
],
|
||||
"Security": [
|
||||
(r"SecurityMode.*[Ff]ail|IntegrityCheck.*[Ff]ail|NasIntegrity.*[Ee]rror",
|
||||
"AMF", "critical",
|
||||
"NAS security mode failure",
|
||||
"Check AMF cipher/integrity algorithm priority list matches UE capabilities."),
|
||||
(r"TlsHandshake.*[Ff]ail|Certificate.*[Ee]xpir|x509.*[Ee]rror|CertVerify.*[Ff]ail",
|
||||
"AMF", "critical",
|
||||
"TLS/certificate error",
|
||||
"Renew expired certificates; verify trust chain between NFs; check SBI TLS config."),
|
||||
(r"SuciProtection.*[Ff]ail|PrivacyProtection.*[Ee]rror|HomeNetworkKey.*[Ee]rror",
|
||||
"UDM", "warning",
|
||||
"SUCI privacy protection error",
|
||||
"Verify Home Network Public Key provisioning on UDM; check SUPI revealing config."),
|
||||
],
|
||||
}
|
||||
|
||||
# ── NF → possible container name fragments (tried in order) ─────────────────
|
||||
NF_CONTAINER_HINTS: dict[str, list[str]] = {
|
||||
"AMF": ["amf"],
|
||||
@@ -191,13 +100,13 @@ async def _read_logs(container: str, tail: int = 400) -> str:
|
||||
return ""
|
||||
|
||||
|
||||
def _match_count(text: str, pattern: str) -> int:
|
||||
if not text:
|
||||
return 0
|
||||
def _rule_matches(message: str, pattern: str) -> bool:
|
||||
if not message:
|
||||
return False
|
||||
try:
|
||||
return len(re.findall(pattern, text, re.IGNORECASE | re.MULTILINE))
|
||||
return bool(re.search(pattern, message, re.IGNORECASE | re.MULTILINE))
|
||||
except re.error:
|
||||
return 0
|
||||
return False
|
||||
|
||||
|
||||
# ── Category/NF mapping for Alertmanager alerts ──────────────────────────────
|
||||
@@ -235,50 +144,104 @@ async def analyze_logs() -> dict:
|
||||
Gather log-pattern issues + Prometheus NF status + Alertmanager alerts.
|
||||
Returns a fully structured dict ready for JSON serialisation.
|
||||
"""
|
||||
from app.services import alertmanager, prometheus, cluster_inventory
|
||||
from app.services import alertmanager, cluster_inventory, log_ingest, prometheus
|
||||
|
||||
# Kick off all I/O in parallel
|
||||
containers_f = asyncio.create_task(_discover_containers())
|
||||
alerts_f = asyncio.create_task(alertmanager.get_alerts())
|
||||
nf_status_f = asyncio.create_task(prometheus.get_nf_status())
|
||||
cluster_f = asyncio.create_task(cluster_inventory.get_cluster_inventory())
|
||||
events_f = asyncio.to_thread(log_ingest.get_events)
|
||||
|
||||
containers = await containers_f
|
||||
alerts, nf_statuses, cluster = await asyncio.gather(alerts_f, nf_status_f, cluster_f,
|
||||
return_exceptions=True)
|
||||
alerts, nf_statuses, cluster, events = await asyncio.gather(
|
||||
alerts_f, nf_status_f, cluster_f, events_f, return_exceptions=True
|
||||
)
|
||||
if isinstance(alerts, Exception):
|
||||
alerts = []
|
||||
if isinstance(nf_statuses, Exception):
|
||||
nf_statuses = []
|
||||
if isinstance(cluster, Exception):
|
||||
cluster = {"enabled": False, "nodes": []}
|
||||
if isinstance(events, Exception):
|
||||
events = []
|
||||
|
||||
# Read all container logs concurrently
|
||||
log_tasks = {nf: asyncio.create_task(_read_logs(cname))
|
||||
for nf, cname in containers.items()}
|
||||
log_texts: dict[str, str] = {}
|
||||
if log_tasks:
|
||||
log_results = await asyncio.gather(*log_tasks.values(), return_exceptions=True)
|
||||
for nf, result in zip(log_tasks.keys(), log_results):
|
||||
log_texts[nf] = result if isinstance(result, str) else ""
|
||||
|
||||
events = sorted(
|
||||
[event for event in events if isinstance(event, dict)],
|
||||
key=lambda event: event.get("epoch", 0.0),
|
||||
)
|
||||
issues: list[dict] = []
|
||||
grouped_log_issues: dict[tuple[str, str, str, str], dict] = {}
|
||||
|
||||
# 1. Log-pattern analysis
|
||||
for category, patterns in CATEGORY_PATTERNS.items():
|
||||
for (pat_re, nf, severity, description, remediation) in patterns:
|
||||
count = _match_count(log_texts.get(nf, ""), pat_re)
|
||||
if count:
|
||||
issues.append({
|
||||
"id": f"log-{nf}-{len(issues)}",
|
||||
"category": category,
|
||||
"nf": nf,
|
||||
"severity": severity,
|
||||
"count": count,
|
||||
"description": description,
|
||||
"remediation": remediation,
|
||||
"source": "log",
|
||||
})
|
||||
# 1. Time-ordered log-pattern analysis across all nodes.
|
||||
for idx, event in enumerate(events):
|
||||
message = event.get("message", "")
|
||||
event_nf = str(event.get("nf", "")).upper()
|
||||
event_node = event.get("node", "")
|
||||
for category, patterns in load_category_patterns().items():
|
||||
for rule in patterns:
|
||||
rule_nf = str(rule["nf"]).upper()
|
||||
if event_nf != rule_nf:
|
||||
continue
|
||||
if not _rule_matches(message, rule["pattern"]):
|
||||
continue
|
||||
|
||||
before_context = events[max(0, idx - LOG_ALERT_CONTEXT_BEFORE):idx]
|
||||
after_context = events[idx + 1:idx + 1 + LOG_ALERT_CONTEXT_AFTER]
|
||||
context_id = log_ingest.record_alert_context(
|
||||
category=category,
|
||||
nf=rule_nf,
|
||||
node=event_node,
|
||||
severity=rule["severity"],
|
||||
description=rule["description"],
|
||||
remediation=rule["remediation"],
|
||||
source="fluentbit",
|
||||
event=event,
|
||||
before_context=before_context,
|
||||
after_context=after_context,
|
||||
)
|
||||
|
||||
issue_key = (category, rule_nf, event_node, rule["description"])
|
||||
if issue_key not in grouped_log_issues:
|
||||
grouped_log_issues[issue_key] = {
|
||||
"id": f"log-{rule_nf}-{len(grouped_log_issues)}",
|
||||
"category": category,
|
||||
"nf": rule_nf,
|
||||
"node": event_node,
|
||||
"severity": rule["severity"],
|
||||
"count": 0,
|
||||
"description": rule["description"],
|
||||
"remediation": rule["remediation"],
|
||||
"source": "fluentbit",
|
||||
"context_id": context_id,
|
||||
}
|
||||
grouped_log_issues[issue_key]["count"] += 1
|
||||
|
||||
issues.extend(grouped_log_issues.values())
|
||||
|
||||
# Fall back to local container logs until Fluent Bit has populated the buffer.
|
||||
if not issues and not events:
|
||||
log_tasks = {nf: asyncio.create_task(_read_logs(cname)) for nf, cname in containers.items()}
|
||||
if log_tasks:
|
||||
log_results = await asyncio.gather(*log_tasks.values(), return_exceptions=True)
|
||||
log_texts = {
|
||||
nf: result if isinstance(result, str) else ""
|
||||
for nf, result in zip(log_tasks.keys(), log_results)
|
||||
}
|
||||
for category, patterns in load_category_patterns().items():
|
||||
for rule in patterns:
|
||||
nf = rule["nf"]
|
||||
if _rule_matches(log_texts.get(nf, ""), rule["pattern"]):
|
||||
issues.append({
|
||||
"id": f"log-{nf}-{len(issues)}",
|
||||
"category": category,
|
||||
"nf": nf,
|
||||
"severity": rule["severity"],
|
||||
"count": 1,
|
||||
"description": rule["description"],
|
||||
"remediation": rule["remediation"],
|
||||
"source": "local-log-fallback",
|
||||
})
|
||||
|
||||
# 2. NF-down events from Prometheus
|
||||
for nf_st in nf_statuses:
|
||||
@@ -337,7 +300,8 @@ async def analyze_logs() -> dict:
|
||||
"total": total,
|
||||
"categories": categories,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"log_sources": list(containers.keys()),
|
||||
"log_sources": sorted({f"{event.get('node', 'unknown')}:{event.get('nf', 'SYSTEM')}" for event in events}) or list(containers.keys()),
|
||||
"log_ingest": log_ingest.receiver_status(),
|
||||
"cluster": cluster,
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user