started log ingestion and analysis
This commit is contained in:
Binary file not shown.
Binary file not shown.
@@ -8,6 +8,24 @@ def _env_bool(name: str, default: bool) -> bool:
|
||||
return value.lower() in {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def _env_int(name: str, default: int) -> int:
|
||||
value = os.getenv(name)
|
||||
if value is None:
|
||||
return default
|
||||
try:
|
||||
return int(value)
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
def _env_list(name: str, default: list[str]) -> list[str]:
|
||||
value = os.getenv(name)
|
||||
if value is None:
|
||||
return default
|
||||
parsed = [item.strip() for item in value.split(",") if item.strip()]
|
||||
return parsed or default
|
||||
|
||||
|
||||
# Defaults assume the appliance-style deployment model where Marvis runs with
|
||||
# host networking and talks to sibling services over host loopback.
|
||||
PROMETHEUS_URL = os.getenv("MARVIS_PROMETHEUS_URL", "http://127.0.0.1:9090")
|
||||
@@ -21,6 +39,25 @@ PLS_PASSWORD = os.getenv("MARVIS_PLS_PASSWORD", "")
|
||||
PLS_AUTH_BACKEND = os.getenv("MARVIS_PLS_AUTH_BACKEND", "local")
|
||||
PLS_VERIFY_TLS = _env_bool("MARVIS_PLS_VERIFY_TLS", False)
|
||||
|
||||
# Fluent Bit ingestion and retention.
|
||||
LOG_INGEST_ENABLED = _env_bool("MARVIS_LOG_INGEST_ENABLED", True)
|
||||
LOG_AUTO_CONFIGURE = _env_bool("MARVIS_LOG_AUTO_CONFIGURE", True)
|
||||
LOG_RECEIVER_BIND_HOST = os.getenv("MARVIS_LOG_RECEIVER_BIND_HOST", "0.0.0.0")
|
||||
LOG_RECEIVER_HOST = os.getenv("MARVIS_LOG_RECEIVER_HOST", "")
|
||||
LOG_RECEIVER_PORT = _env_int("MARVIS_LOG_RECEIVER_PORT", 5514)
|
||||
LOG_RECEIVER_FORMAT = os.getenv("MARVIS_LOG_RECEIVER_FORMAT", "json_lines")
|
||||
LOG_BUFFER_LINES = _env_int("MARVIS_LOG_BUFFER_LINES", 1000)
|
||||
LOG_ALERT_CONTEXT_BEFORE = _env_int("MARVIS_LOG_ALERT_CONTEXT_BEFORE", 5)
|
||||
LOG_ALERT_CONTEXT_AFTER = _env_int("MARVIS_LOG_ALERT_CONTEXT_AFTER", 5)
|
||||
LOG_ALERT_CONTEXT_DB_PATH = os.getenv("MARVIS_LOG_ALERT_CONTEXT_DB_PATH", "/app/data/marvis-alert-context.db")
|
||||
LOG_ALERT_CONTEXT_DB_MAX_ROWS = _env_int("MARVIS_LOG_ALERT_CONTEXT_DB_MAX_ROWS", 500)
|
||||
LOG_TRACE_BUFFER_LINES = _env_int("MARVIS_LOG_TRACE_BUFFER_LINES", 5000)
|
||||
LOG_FLUENTBIT_MATCH = os.getenv("MARVIS_LOG_FLUENTBIT_MATCH", "*")
|
||||
LOG_ALLOWED_NFS = [item.upper() for item in _env_list(
|
||||
"MARVIS_LOG_ALLOWED_NFS",
|
||||
["AMF", "SMF", "UPF", "UDM", "UDR", "NRF", "AUSF", "PCF", "MME", "SGWC", "DRA", "DSM", "AAA", "BMSC", "CHF", "SMSF", "EIR"],
|
||||
)]
|
||||
|
||||
# AI backend: "rule" | "openai" | "ollama"
|
||||
AI_MODE = os.getenv("MARVIS_AI_MODE", "rule")
|
||||
OPENAI_API_KEY = os.getenv("MARVIS_OPENAI_API_KEY", "")
|
||||
|
||||
21
app/main.py
21
app/main.py
@@ -3,7 +3,15 @@ from fastapi.responses import FileResponse
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pathlib import Path
|
||||
|
||||
from app.routers import network, alerts, query as query_router, actions as actions_router, emulated_session as emulated_session_router
|
||||
from app.routers import (
|
||||
actions as actions_router,
|
||||
alerts,
|
||||
emulated_session as emulated_session_router,
|
||||
logs as logs_router,
|
||||
network,
|
||||
query as query_router,
|
||||
)
|
||||
from app.services import log_ingest
|
||||
|
||||
app = FastAPI(title="P5G Marvis", version="1.0.0", docs_url="/api/docs")
|
||||
|
||||
@@ -18,6 +26,7 @@ app.include_router(network.router, prefix="/api")
|
||||
app.include_router(alerts.router, prefix="/api")
|
||||
app.include_router(query_router.router, prefix="/api")
|
||||
app.include_router(actions_router.router, prefix="/api")
|
||||
app.include_router(logs_router.router, prefix="/api")
|
||||
app.include_router(emulated_session_router.router, prefix="/api")
|
||||
|
||||
UI = Path(__file__).parent / "ui" / "index.html"
|
||||
@@ -51,6 +60,16 @@ async def actions_page():
|
||||
return FileResponse(str(ACTIONS_UI))
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def _startup() -> None:
|
||||
await log_ingest.startup()
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def _shutdown() -> None:
|
||||
await log_ingest.shutdown()
|
||||
|
||||
|
||||
# Catch-all: serve the SPA for any unmatched path (supports deep-linking)
|
||||
@app.get("/{full_path:path}")
|
||||
async def spa(full_path: str):
|
||||
|
||||
BIN
app/routers/__pycache__/logs.cpython-314.pyc
Normal file
BIN
app/routers/__pycache__/logs.cpython-314.pyc
Normal file
Binary file not shown.
Binary file not shown.
36
app/routers/logs.py
Normal file
36
app/routers/logs.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
|
||||
from app.services import log_ingest
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/logs/status")
|
||||
async def get_log_status():
|
||||
return log_ingest.receiver_status()
|
||||
|
||||
|
||||
@router.get("/logs/events")
|
||||
async def get_log_events(
|
||||
limit: int = Query(default=200, ge=1, le=5000),
|
||||
node: str | None = None,
|
||||
nf: str | None = None,
|
||||
imsi: str | None = None,
|
||||
):
|
||||
return {
|
||||
"events": log_ingest.get_events(limit=limit, node=node, nf=nf, imsi=imsi),
|
||||
"status": log_ingest.receiver_status(),
|
||||
}
|
||||
|
||||
|
||||
@router.get("/logs/contexts")
|
||||
async def get_log_contexts(limit: int = Query(default=20, ge=1, le=200)):
|
||||
return {"contexts": log_ingest.recent_alert_context(limit=limit)}
|
||||
|
||||
|
||||
@router.post("/logs/configure")
|
||||
async def configure_log_shipping():
|
||||
try:
|
||||
return await log_ingest.configure_site_output()
|
||||
except Exception as exc:
|
||||
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
||||
@@ -1,6 +1,6 @@
|
||||
from fastapi import APIRouter
|
||||
from pydantic import BaseModel
|
||||
from app.services import cluster_inventory, alertmanager, ai
|
||||
from app.services import cluster_inventory, alertmanager, ai, log_ingest
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@@ -11,14 +11,15 @@ class QueryRequest(BaseModel):
|
||||
|
||||
@router.post("/query")
|
||||
async def query(req: QueryRequest):
|
||||
network_state, alerts = await _gather(req.query)
|
||||
response = await ai.answer(req.query, network_state, alerts)
|
||||
return {"response": response, "network_state": network_state, "alerts": alerts}
|
||||
network_state, alerts, logs = await _gather(req.query)
|
||||
response = await ai.answer(req.query, network_state, alerts, logs)
|
||||
return {"response": response, "network_state": network_state, "alerts": alerts, "logs": logs}
|
||||
|
||||
|
||||
async def _gather(query_text: str):
|
||||
import asyncio
|
||||
nfs_task = asyncio.create_task(cluster_inventory.get_network_status())
|
||||
alerts_task = asyncio.create_task(alertmanager.get_alerts())
|
||||
network_state, alerts = await asyncio.gather(nfs_task, alerts_task)
|
||||
return network_state, alerts
|
||||
logs_task = asyncio.to_thread(log_ingest.get_events, 200)
|
||||
network_state, alerts, logs = await asyncio.gather(nfs_task, alerts_task, logs_task)
|
||||
return network_state, alerts, logs
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
app/services/__pycache__/log_ingest.cpython-314.pyc
Normal file
BIN
app/services/__pycache__/log_ingest.cpython-314.pyc
Normal file
Binary file not shown.
BIN
app/services/__pycache__/log_rules.cpython-314.pyc
Normal file
BIN
app/services/__pycache__/log_rules.cpython-314.pyc
Normal file
Binary file not shown.
Binary file not shown.
@@ -6,6 +6,7 @@ Phase 2: swap MARVIS_AI_MODE=openai or MARVIS_AI_MODE=ollama to route through LL
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
import re
|
||||
from app.config import (
|
||||
AI_MODE,
|
||||
CONTAINER_RUNTIME,
|
||||
@@ -17,23 +18,25 @@ from app.config import (
|
||||
)
|
||||
|
||||
|
||||
async def answer(query: str, network_state: dict, alerts: list) -> str:
|
||||
async def answer(query: str, network_state: dict, alerts: list, logs: list[dict] | None = None) -> str:
|
||||
if AI_MODE == "openai":
|
||||
return await _call_openai(query, network_state, alerts)
|
||||
return await _call_openai(query, network_state, alerts, logs or [])
|
||||
if AI_MODE == "ollama":
|
||||
return await _call_ollama(query, network_state, alerts)
|
||||
return _rule_based(query, network_state, alerts)
|
||||
return await _call_ollama(query, network_state, alerts, logs or [])
|
||||
return _rule_based(query, network_state, alerts, logs or [])
|
||||
|
||||
|
||||
# ── Rule-based engine ──────────────────────────────────────────────────────
|
||||
|
||||
def _rule_based(query: str, network_state: dict, alerts: list) -> str:
|
||||
def _rule_based(query: str, network_state: dict, alerts: list, logs: list[dict]) -> str:
|
||||
q = query.lower()
|
||||
nfs = network_state.get("nfs", [])
|
||||
cluster = network_state.get("cluster", {})
|
||||
up = [n for n in nfs if n["state"] == "up"]
|
||||
down = [n for n in nfs if n["state"] == "down"]
|
||||
|
||||
log_hits = _find_log_hits(q, logs)
|
||||
|
||||
if any(w in q for w in ["hello", "hi ", "hey", "howdy"]):
|
||||
return ("Hello! I'm **P5G Marvis**, your AI network assistant for HPE Private 5G.\n"
|
||||
"Ask me about network health, specific functions, alerts, or performance.")
|
||||
@@ -53,22 +56,25 @@ def _rule_based(query: str, network_state: dict, alerts: list) -> str:
|
||||
from app.config import ALL_NFS
|
||||
for nf_name in ALL_NFS:
|
||||
if nf_name.lower() in q:
|
||||
return _nf_detail(nf_name, nfs, alerts)
|
||||
return _nf_detail(nf_name, nfs, alerts, log_hits)
|
||||
|
||||
if any(w in q for w in ["alert", "alarm", "warning", "critical", "incident", "problem", "issue"]):
|
||||
return _alerts_summary(alerts)
|
||||
|
||||
if any(w in q for w in ["log", "trace", "journal", "message", "error"]):
|
||||
return _log_summary(log_hits, logs)
|
||||
|
||||
if any(w in q for w in ["subscriber", "ue ", "device", "phone", "handset", "registration", "attach"]):
|
||||
return _subscriber_analysis(nfs, alerts, cluster)
|
||||
return _subscriber_analysis(nfs, alerts, cluster, log_hits)
|
||||
|
||||
if any(w in q for w in ["session", "pdu", "bearer", "user plane", "traffic", "throughput"]):
|
||||
return _session_analysis(nfs, alerts, cluster)
|
||||
return _session_analysis(nfs, alerts, cluster, log_hits)
|
||||
|
||||
# Default → health summary
|
||||
return _health_summary(up, down, alerts, cluster)
|
||||
return _health_summary(up, down, alerts, cluster, log_hits)
|
||||
|
||||
|
||||
def _health_summary(up: list, down: list, alerts: list, cluster: dict) -> str:
|
||||
def _health_summary(up: list, down: list, alerts: list, cluster: dict, log_hits: list[dict]) -> str:
|
||||
ts = datetime.now().strftime("%H:%M:%S")
|
||||
crit = [a for a in alerts if a.get("severity") == "critical"]
|
||||
warn = [a for a in alerts if a.get("severity") != "critical"]
|
||||
@@ -104,13 +110,21 @@ def _health_summary(up: list, down: list, alerts: list, cluster: dict) -> str:
|
||||
|
||||
if not down and not alerts:
|
||||
lines.append("\n🟢 All systems nominal.")
|
||||
if log_hits:
|
||||
lines.append(f"\n🧾 **Relevant log hits ({len(log_hits)})**")
|
||||
for hit in log_hits[:4]:
|
||||
lines.append(
|
||||
f" • {hit.get('timestamp','')} — {hit.get('node','unknown')} {hit.get('nf','SYSTEM')}: "
|
||||
f"{_trim_message(hit.get('message',''))}"
|
||||
)
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _nf_detail(nf_name: str, nfs: list, alerts: list) -> str:
|
||||
def _nf_detail(nf_name: str, nfs: list, alerts: list, log_hits: list[dict]) -> str:
|
||||
nf = next((n for n in nfs if n["name"] == nf_name), None)
|
||||
nf_alerts = [a for a in alerts
|
||||
if nf_name in a.get("name", "") or nf_name.lower() in a.get("instance", "").lower()]
|
||||
nf_logs = [hit for hit in log_hits if hit.get("nf") == nf_name]
|
||||
|
||||
if not nf or nf["state"] == "unknown":
|
||||
return (f"ℹ️ No Prometheus data found for **{nf_name}**.\n"
|
||||
@@ -132,6 +146,13 @@ def _nf_detail(nf_name: str, nfs: list, alerts: list) -> str:
|
||||
lines.append(f" → {a['name']}: {a.get('summary', '')}")
|
||||
else:
|
||||
lines.append("No active alerts for this function.")
|
||||
if nf_logs:
|
||||
lines.append(f"\n🧾 Recent {nf_name} log evidence:")
|
||||
for hit in nf_logs[:4]:
|
||||
lines.append(
|
||||
f" • {hit.get('timestamp','')} on {hit.get('node','unknown')}: "
|
||||
f"{_trim_message(hit.get('message',''))}"
|
||||
)
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
@@ -151,7 +172,7 @@ def _alerts_summary(alerts: list) -> str:
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _subscriber_analysis(nfs: list, alerts: list, cluster: dict) -> str:
|
||||
def _subscriber_analysis(nfs: list, alerts: list, cluster: dict, log_hits: list[dict]) -> str:
|
||||
amf = next((n for n in nfs if n["name"] == "AMF"), None)
|
||||
smf = next((n for n in nfs if n["name"] == "SMF"), None)
|
||||
lines = ["**Subscriber & Registration Analysis**\n"]
|
||||
@@ -163,11 +184,18 @@ def _subscriber_analysis(nfs: list, alerts: list, cluster: dict) -> str:
|
||||
lines.append(f"\n⚠️ {len(sub_alerts)} subscriber-related alert(s) active.")
|
||||
else:
|
||||
lines.append("\nNo subscriber-related alerts detected.")
|
||||
sub_logs = [hit for hit in log_hits if any(key in hit.get("message", "").lower() for key in ["imsi", "supi", "registration", "attach", "subscriber"])]
|
||||
if sub_logs:
|
||||
lines.append("\nRecent subscriber-related log evidence:")
|
||||
for hit in sub_logs[:4]:
|
||||
lines.append(
|
||||
f"• {hit.get('nf','SYSTEM')} on {hit.get('node','unknown')}: {_trim_message(hit.get('message',''))}"
|
||||
)
|
||||
lines.append(_cluster_scope(cluster))
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _session_analysis(nfs: list, alerts: list, cluster: dict) -> str:
|
||||
def _session_analysis(nfs: list, alerts: list, cluster: dict, log_hits: list[dict]) -> str:
|
||||
smf = next((n for n in nfs if n["name"] == "SMF"), None)
|
||||
upf = next((n for n in nfs if n["name"] == "UPF"), None)
|
||||
lines = ["**PDU Session & Data Plane Analysis**\n"]
|
||||
@@ -177,10 +205,38 @@ def _session_analysis(nfs: list, alerts: list, cluster: dict) -> str:
|
||||
lines.append("\n⚡ **Impact**: PDU sessions will fail until both SMF and UPF are operational.")
|
||||
else:
|
||||
lines.append("\nBoth SMF and UPF operational — sessions should be establishing normally.")
|
||||
session_logs = [hit for hit in log_hits if hit.get("nf") in {"SMF", "UPF"}]
|
||||
if session_logs:
|
||||
lines.append("\nRecent session/data-plane log evidence:")
|
||||
for hit in session_logs[:4]:
|
||||
lines.append(
|
||||
f"• {hit.get('nf','SYSTEM')} on {hit.get('node','unknown')}: {_trim_message(hit.get('message',''))}"
|
||||
)
|
||||
lines.append(_cluster_scope(cluster))
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _log_summary(log_hits: list[dict], logs: list[dict]) -> str:
|
||||
if not logs:
|
||||
return "ℹ️ No ingested logs are currently available."
|
||||
if not log_hits:
|
||||
latest = max(logs, key=lambda event: event.get("epoch", 0.0), default=None)
|
||||
if latest:
|
||||
return (
|
||||
"ℹ️ I do not see direct log matches for that question.\n\n"
|
||||
f"Latest ingested log: {latest.get('timestamp','')} on {latest.get('node','unknown')} "
|
||||
f"{latest.get('nf','SYSTEM')} — {_trim_message(latest.get('message',''))}"
|
||||
)
|
||||
return "ℹ️ No relevant log matches were found."
|
||||
lines = [f"🧾 **Relevant log matches ({len(log_hits)})**\n"]
|
||||
for hit in log_hits[:8]:
|
||||
lines.append(
|
||||
f"• {hit.get('timestamp','')} — {hit.get('node','unknown')} {hit.get('nf','SYSTEM')}: "
|
||||
f"{_trim_message(hit.get('message',''))}"
|
||||
)
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _nf_label(nf: dict) -> str:
|
||||
placements = nf.get("nodes", [])
|
||||
if not placements:
|
||||
@@ -207,24 +263,30 @@ def _cluster_scope(cluster: dict) -> str:
|
||||
|
||||
# ── LLM backends ──────────────────────────────────────────────────────────
|
||||
|
||||
def _build_context(network_state: dict, alerts: list) -> str:
|
||||
def _build_context(network_state: dict, alerts: list, logs: list[dict]) -> str:
|
||||
nfs = network_state.get("nfs", [])
|
||||
up = [n["name"] for n in nfs if n["state"] == "up"]
|
||||
down = [n["name"] for n in nfs if n["state"] == "down"]
|
||||
nodes = network_state.get("cluster", {}).get("nodes", [])
|
||||
node_summary = ", ".join(f"{node['hostname']} ({node.get('role', 'AP')})" for node in nodes) or "none"
|
||||
recent_logs = logs[-10:] if logs else []
|
||||
log_summary = "; ".join(
|
||||
f"{entry.get('timestamp','')} {entry.get('node','unknown')} {entry.get('nf','SYSTEM')}: {_trim_message(entry.get('message',''), 120)}"
|
||||
for entry in recent_logs
|
||||
) or "none"
|
||||
return (
|
||||
f"NFs UP: {', '.join(up) or 'none'}\n"
|
||||
f"NFs DOWN: {', '.join(down) or 'none'}\n"
|
||||
f"Cluster nodes: {node_summary}\n"
|
||||
f"Active alerts: {', '.join(a.get('name','') for a in alerts[:5]) or 'none'}"
|
||||
f"Active alerts: {', '.join(a.get('name','') for a in alerts[:5]) or 'none'}\n"
|
||||
f"Recent logs: {log_summary}"
|
||||
)
|
||||
|
||||
|
||||
async def _call_openai(query: str, network_state: dict, alerts: list) -> str:
|
||||
async def _call_openai(query: str, network_state: dict, alerts: list, logs: list[dict]) -> str:
|
||||
try:
|
||||
import httpx
|
||||
ctx = _build_context(network_state, alerts)
|
||||
ctx = _build_context(network_state, alerts, logs)
|
||||
messages = [
|
||||
{"role": "system", "content":
|
||||
f"You are P5G Marvis, an AI network assistant for HPE Private 5G.\n"
|
||||
@@ -247,13 +309,13 @@ async def _call_openai(query: str, network_state: dict, alerts: list) -> str:
|
||||
# some reasoning models put the answer in content, others in reasoning_content
|
||||
return msg.get("content") or msg.get("reasoning_content") or "(empty response)"
|
||||
except Exception as e:
|
||||
return f"LLM error: {e}\n\n" + _rule_based(query, network_state, alerts)
|
||||
return f"LLM error: {e}\n\n" + _rule_based(query, network_state, alerts, logs)
|
||||
|
||||
|
||||
async def _call_ollama(query: str, network_state: dict, alerts: list) -> str:
|
||||
async def _call_ollama(query: str, network_state: dict, alerts: list, logs: list[dict]) -> str:
|
||||
try:
|
||||
import httpx
|
||||
ctx = _build_context(network_state, alerts)
|
||||
ctx = _build_context(network_state, alerts, logs)
|
||||
prompt = (f"You are P5G Marvis, an AI network assistant.\n"
|
||||
f"Network state:\n{ctx}\n\nUser: {query}\nAssistant:")
|
||||
async with httpx.AsyncClient(timeout=60) as client:
|
||||
@@ -263,4 +325,34 @@ async def _call_ollama(query: str, network_state: dict, alerts: list) -> str:
|
||||
)
|
||||
return resp.json().get("response", "No response.")
|
||||
except Exception as e:
|
||||
return f"Ollama error: {e}\n\n" + _rule_based(query, network_state, alerts)
|
||||
return f"Ollama error: {e}\n\n" + _rule_based(query, network_state, alerts, logs)
|
||||
|
||||
|
||||
def _find_log_hits(query: str, logs: list[dict]) -> list[dict]:
|
||||
terms = [term for term in re.findall(r"[a-z0-9_-]+", query.lower()) if len(term) >= 3]
|
||||
if not logs or not terms:
|
||||
return []
|
||||
hits = []
|
||||
for event in logs:
|
||||
haystack = " ".join(
|
||||
[
|
||||
str(event.get("nf", "")).lower(),
|
||||
str(event.get("node", "")).lower(),
|
||||
str(event.get("source", "")).lower(),
|
||||
str(event.get("message", "")).lower(),
|
||||
]
|
||||
)
|
||||
score = sum(1 for term in terms if term in haystack)
|
||||
if score:
|
||||
event_copy = dict(event)
|
||||
event_copy["_score"] = score
|
||||
hits.append(event_copy)
|
||||
hits.sort(key=lambda event: (event.get("_score", 0), event.get("epoch", 0.0)), reverse=True)
|
||||
return hits
|
||||
|
||||
|
||||
def _trim_message(message: str, limit: int = 160) -> str:
|
||||
message = " ".join(str(message).split())
|
||||
if len(message) <= limit:
|
||||
return message
|
||||
return message[: limit - 3] + "..."
|
||||
|
||||
@@ -1,14 +1,31 @@
|
||||
"""Alertmanager client."""
|
||||
"""Alert sources: Alertmanager plus log-derived alerts."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import httpx
|
||||
from app.config import ALERTMANAGER_URL
|
||||
from app.services import cluster_inventory
|
||||
from app.services import cluster_inventory, log_ingest
|
||||
|
||||
_BASE = ALERTMANAGER_URL.rstrip("/")
|
||||
|
||||
|
||||
async def get_alerts() -> list:
|
||||
"""Return normalised list of active alerts from Alertmanager."""
|
||||
"""Return normalised list of active alerts from Alertmanager and log analysis."""
|
||||
cluster = await cluster_inventory.get_cluster_inventory()
|
||||
alertmanager_task = asyncio.create_task(_get_alertmanager_alerts(cluster))
|
||||
log_task = asyncio.to_thread(_get_log_alerts, cluster)
|
||||
am_alerts, log_alerts = await asyncio.gather(alertmanager_task, log_task, return_exceptions=True)
|
||||
if isinstance(am_alerts, Exception):
|
||||
am_alerts = []
|
||||
if isinstance(log_alerts, Exception):
|
||||
log_alerts = []
|
||||
return sorted(
|
||||
[*am_alerts, *log_alerts],
|
||||
key=lambda alert: (_severity_rank(alert.get("severity")), alert.get("timestamp", "")),
|
||||
)
|
||||
|
||||
|
||||
async def _get_alertmanager_alerts(cluster: dict) -> list:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=5) as client:
|
||||
r = await client.get(f"{_BASE}/api/v2/alerts", params={"active": "true", "silenced": "false"})
|
||||
@@ -17,7 +34,6 @@ async def get_alerts() -> list:
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
cluster = await cluster_inventory.get_cluster_inventory()
|
||||
alerts = []
|
||||
for a in raw:
|
||||
labels = a.get("labels", {})
|
||||
@@ -33,10 +49,62 @@ async def get_alerts() -> list:
|
||||
"summary": summary,
|
||||
"nf": nf_name,
|
||||
"nodes": nodes,
|
||||
"source": "alertmanager",
|
||||
"timestamp": a.get("startsAt", ""),
|
||||
})
|
||||
return alerts
|
||||
|
||||
|
||||
def _get_log_alerts(cluster: dict) -> list:
|
||||
node_map = {}
|
||||
for node in cluster.get("nodes", []):
|
||||
if node.get("hostname"):
|
||||
node_map[node["hostname"]] = node
|
||||
if node.get("address"):
|
||||
node_map[node["address"]] = node
|
||||
|
||||
alerts = []
|
||||
for ctx in log_ingest.recent_alert_context(limit=50):
|
||||
before = _decode_context(ctx.get("before_context"))
|
||||
after = _decode_context(ctx.get("after_context"))
|
||||
node_name = ctx.get("node", "")
|
||||
nodes = []
|
||||
if node_name and node_name in node_map:
|
||||
nodes = [node_map[node_name]]
|
||||
alerts.append({
|
||||
"name": f"{ctx.get('nf') or 'System'} log anomaly",
|
||||
"severity": ctx.get("severity", "warning"),
|
||||
"instance": ctx.get("source", ""),
|
||||
"summary": ctx.get("description", "Log-derived alert"),
|
||||
"nf": ctx.get("nf", ""),
|
||||
"nodes": nodes,
|
||||
"source": "logs",
|
||||
"timestamp": ctx.get("event_ts", ""),
|
||||
"context_id": ctx.get("id"),
|
||||
"node": node_name,
|
||||
"match_message": ctx.get("match_message", ""),
|
||||
"context_preview": {
|
||||
"before": before[-3:],
|
||||
"after": after[:3],
|
||||
},
|
||||
})
|
||||
return alerts
|
||||
|
||||
|
||||
def _decode_context(value: str | None) -> list[dict]:
|
||||
if not value:
|
||||
return []
|
||||
try:
|
||||
data = json.loads(value)
|
||||
return data if isinstance(data, list) else []
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
|
||||
def _severity_rank(severity: str | None) -> int:
|
||||
return {"critical": 0, "warning": 1, "info": 2}.get((severity or "warning").lower(), 3)
|
||||
|
||||
|
||||
def _infer_nf(name: str, summary: str, instance: str) -> str:
|
||||
text = f"{name} {summary} {instance}".upper()
|
||||
for nf_name in ["AMF", "SMF", "UPF", "UDM", "UDR", "NRF", "AUSF", "PCF", "MME", "SGWC", "DRA", "DSM"]:
|
||||
|
||||
@@ -8,6 +8,8 @@ import re
|
||||
from app.config import ALL_NFS
|
||||
from app.services import pls, prometheus
|
||||
|
||||
_last_inventory: dict | None = None
|
||||
|
||||
ROLE_NF_MAP = {
|
||||
"5GALL": {"amf", "smf", "pcf", "udr", "udm", "nrf", "eir", "ausf", "dra", "upf", "chf", "smsf", "aaa", "bmsc"},
|
||||
"CP": {"amf", "smf", "pcf", "udr", "udm", "nrf", "eir", "ausf", "dra", "chf", "smsf", "aaa", "bmsc"},
|
||||
@@ -41,9 +43,10 @@ def _infer_role(hostname: str) -> str:
|
||||
|
||||
|
||||
async def get_cluster_inventory() -> dict:
|
||||
global _last_inventory
|
||||
cluster = await pls.get_cluster_status()
|
||||
if not cluster:
|
||||
return {
|
||||
return _last_inventory or {
|
||||
"enabled": False,
|
||||
"current_node": None,
|
||||
"fully_established": False,
|
||||
@@ -78,12 +81,14 @@ async def get_cluster_inventory() -> dict:
|
||||
}
|
||||
)
|
||||
|
||||
return {
|
||||
inventory = {
|
||||
"enabled": True,
|
||||
"current_node": cluster.get("current_node"),
|
||||
"fully_established": bool(cluster.get("fully_established")),
|
||||
"nodes": nodes,
|
||||
}
|
||||
_last_inventory = inventory
|
||||
return inventory
|
||||
|
||||
|
||||
def _aggregate_nf_state(nf_name: str, nodes: list[dict], prom_states: dict[str, dict]) -> dict:
|
||||
@@ -137,8 +142,14 @@ def _attach_node_nf_status(nodes: list[dict]) -> list[dict]:
|
||||
enriched = []
|
||||
for node in nodes:
|
||||
node_copy = dict(node)
|
||||
expected_nfs = node_copy.get("expected_nfs", [])
|
||||
node_copy["nfs"] = [_node_nf_state(node_copy, nf_name.upper()) for nf_name in expected_nfs]
|
||||
expected_nfs = {nf.upper() for nf in node_copy.get("expected_nfs", [])}
|
||||
started_nf_services = {
|
||||
svc.get("name", "").upper()
|
||||
for svc in node_copy.get("services", [])
|
||||
if svc.get("type") == "nf" and svc.get("name")
|
||||
}
|
||||
visible_nfs = sorted(expected_nfs | started_nf_services)
|
||||
node_copy["nfs"] = [_node_nf_state(node_copy, nf_name.upper()) for nf_name in visible_nfs]
|
||||
enriched.append(node_copy)
|
||||
return enriched
|
||||
|
||||
|
||||
@@ -1,8 +1,4 @@
|
||||
"""
|
||||
log_analyzer.py — Reads P5G NF container logs and active Prometheus/Alertmanager
|
||||
data to produce a structured list of recommended remediation actions, grouped
|
||||
by category. This is the data backend powering the /api/actions endpoint.
|
||||
"""
|
||||
"""Structured issue generation from ingested cross-node log events and state."""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
@@ -10,7 +6,13 @@ import time
|
||||
from collections import deque
|
||||
from datetime import datetime
|
||||
|
||||
from app.config import CONTAINER_HOST, CONTAINER_RUNTIME
|
||||
from app.config import (
|
||||
CONTAINER_HOST,
|
||||
CONTAINER_RUNTIME,
|
||||
LOG_ALERT_CONTEXT_AFTER,
|
||||
LOG_ALERT_CONTEXT_BEFORE,
|
||||
)
|
||||
from app.services.log_rules import load_category_patterns
|
||||
|
||||
# ── In-memory history (up to 96 snapshots ≈ 48 min at 30 s refresh) ────────
|
||||
_history: deque = deque(maxlen=96)
|
||||
@@ -29,99 +31,6 @@ CATEGORY_COLORS: dict[str, str] = {
|
||||
ALL_CATEGORIES = ["Registration", "Authentication", "Security",
|
||||
"Sessions", "Connectivity", "Policy"]
|
||||
|
||||
# ── Log-pattern definitions ──────────────────────────────────────────────────
|
||||
# Each entry: (regex, affected_nf, severity, short_description, remediation)
|
||||
CATEGORY_PATTERNS: dict[str, list[tuple]] = {
|
||||
"Registration": [
|
||||
(r"RegistrationFailure|UeRegistrationFailed|N1.*[Rr]egistration.*[Ff]ail",
|
||||
"AMF", "critical",
|
||||
"UE registration failure",
|
||||
"Check AMF logs for NGAP errors; verify UE credentials and NRF registration."),
|
||||
(r"N2SetupFail|NgapSetupFail|N2.*[Tt]imeout|NgapProcedure.*failed",
|
||||
"AMF", "critical",
|
||||
"N2 interface setup failure",
|
||||
"Verify gNB connectivity to AMF; check SCTP transport and NGAP PLMN config."),
|
||||
(r"InitialContextSetupFail|UeContextRelease.*[Aa]bnormal",
|
||||
"AMF", "warning",
|
||||
"UE context setup failure",
|
||||
"Review AMF-SMF N11 interface; check subscriber profile in UDM/UDR."),
|
||||
(r"PagingFail|UeUnreachable|UeNotFound",
|
||||
"AMF", "warning",
|
||||
"UE paging failure",
|
||||
"Verify UE is registered; check AMF tracking area configuration."),
|
||||
],
|
||||
"Sessions": [
|
||||
(r"PduSessionEstablishmentReject|PduSession.*[Ff]ail|CreateSessionResponse.*[Ff]ail",
|
||||
"SMF", "critical",
|
||||
"PDU session establishment failure",
|
||||
"Check SMF-UPF N4 path; verify DNN/APN config and UPF N3/N9 interfaces."),
|
||||
(r"N4Session.*[Ff]ail|PfcpSession.*[Ee]rror|N4.*[Tt]imeout|PfcpAssociation.*[Ff]ail",
|
||||
"UPF", "critical",
|
||||
"N4/PFCP session error",
|
||||
"Restart PFCP association between SMF and UPF; check N4 IP reachability."),
|
||||
(r"IpAllocationFail|AddressPoolExhausted|NoIpAvailable",
|
||||
"SMF", "critical",
|
||||
"IP address pool exhausted",
|
||||
"Expand UE IP address pool in SMF config; review active session count."),
|
||||
(r"SessionModification.*[Ff]ail|BearerModification.*[Ee]rror",
|
||||
"SMF", "warning",
|
||||
"Session modification failure",
|
||||
"Check PCF policy consistency; verify QoS parameters match UPF capabilities."),
|
||||
],
|
||||
"Authentication": [
|
||||
(r"AuthenticationFailure|AuthReject|EapFailure|5g-aka.*[Ff]ail|EapAkaFailure",
|
||||
"AUSF", "critical",
|
||||
"UE authentication failure",
|
||||
"Verify USIM credentials match UDM subscriber data; check AUSF-UDM N12 link."),
|
||||
(r"UdmAuthReq.*[Ee]rror|SuciDeconceal.*[Ff]ail|UdmUeAuth.*[Ee]rror",
|
||||
"UDM", "critical",
|
||||
"UDM authentication error",
|
||||
"Check UDM-UDR N35 connectivity; verify Home Network Public Key configuration."),
|
||||
(r"AuthVectorFetch.*[Ff]ail|AusfUeAuth.*[Rr]eject|HssAuth.*[Ff]ail",
|
||||
"AUSF", "warning",
|
||||
"Auth vector fetch failure",
|
||||
"Review UDR data integrity for affected SUPI; check AUSF-UDM TLS certificates."),
|
||||
],
|
||||
"Connectivity": [
|
||||
(r"NfDiscovery.*[Ff]ail|NrfRegistration.*[Ff]ail|NfDeregistration.*unexpect",
|
||||
"NRF", "warning",
|
||||
"NF service discovery failure",
|
||||
"Verify NRF is reachable from all NFs; check NRF registration TTL and heartbeat."),
|
||||
(r"ServiceUnavailable.*NF|HTTP.*503.*NF|NfProfile.*expired",
|
||||
"NRF", "warning",
|
||||
"NF service unavailable",
|
||||
"Check NF pod health and SBI listen port; review NRF subscription notifications."),
|
||||
(r"SbiRequest.*[Tt]imeout|SbiConn.*[Ff]ail|Http2.*[Ee]rror",
|
||||
"NRF", "warning",
|
||||
"SBI interface timeout",
|
||||
"Inspect inter-NF network MTU and TLS handshake; check load balancer config."),
|
||||
],
|
||||
"Policy": [
|
||||
(r"PcfSmPolicy.*[Ee]rror|PolicyDecision.*[Ff]ail|SmPolicy.*[Rr]eject",
|
||||
"PCF", "warning",
|
||||
"Policy decision failure",
|
||||
"Review PCF policy rules and subscriber group config; check PCF-UDR N36 link."),
|
||||
(r"QosEnforce.*[Ff]ail|ChargingRule.*[Ee]rror|PccRule.*[Rr]eject",
|
||||
"PCF", "warning",
|
||||
"QoS policy enforcement failure",
|
||||
"Verify QoS profiles match UPF capabilities; check PCF-CHF N40 charging path."),
|
||||
],
|
||||
"Security": [
|
||||
(r"SecurityMode.*[Ff]ail|IntegrityCheck.*[Ff]ail|NasIntegrity.*[Ee]rror",
|
||||
"AMF", "critical",
|
||||
"NAS security mode failure",
|
||||
"Check AMF cipher/integrity algorithm priority list matches UE capabilities."),
|
||||
(r"TlsHandshake.*[Ff]ail|Certificate.*[Ee]xpir|x509.*[Ee]rror|CertVerify.*[Ff]ail",
|
||||
"AMF", "critical",
|
||||
"TLS/certificate error",
|
||||
"Renew expired certificates; verify trust chain between NFs; check SBI TLS config."),
|
||||
(r"SuciProtection.*[Ff]ail|PrivacyProtection.*[Ee]rror|HomeNetworkKey.*[Ee]rror",
|
||||
"UDM", "warning",
|
||||
"SUCI privacy protection error",
|
||||
"Verify Home Network Public Key provisioning on UDM; check SUPI revealing config."),
|
||||
],
|
||||
}
|
||||
|
||||
# ── NF → possible container name fragments (tried in order) ─────────────────
|
||||
NF_CONTAINER_HINTS: dict[str, list[str]] = {
|
||||
"AMF": ["amf"],
|
||||
@@ -191,13 +100,13 @@ async def _read_logs(container: str, tail: int = 400) -> str:
|
||||
return ""
|
||||
|
||||
|
||||
def _match_count(text: str, pattern: str) -> int:
|
||||
if not text:
|
||||
return 0
|
||||
def _rule_matches(message: str, pattern: str) -> bool:
|
||||
if not message:
|
||||
return False
|
||||
try:
|
||||
return len(re.findall(pattern, text, re.IGNORECASE | re.MULTILINE))
|
||||
return bool(re.search(pattern, message, re.IGNORECASE | re.MULTILINE))
|
||||
except re.error:
|
||||
return 0
|
||||
return False
|
||||
|
||||
|
||||
# ── Category/NF mapping for Alertmanager alerts ──────────────────────────────
|
||||
@@ -235,50 +144,104 @@ async def analyze_logs() -> dict:
|
||||
Gather log-pattern issues + Prometheus NF status + Alertmanager alerts.
|
||||
Returns a fully structured dict ready for JSON serialisation.
|
||||
"""
|
||||
from app.services import alertmanager, prometheus, cluster_inventory
|
||||
from app.services import alertmanager, cluster_inventory, log_ingest, prometheus
|
||||
|
||||
# Kick off all I/O in parallel
|
||||
containers_f = asyncio.create_task(_discover_containers())
|
||||
alerts_f = asyncio.create_task(alertmanager.get_alerts())
|
||||
nf_status_f = asyncio.create_task(prometheus.get_nf_status())
|
||||
cluster_f = asyncio.create_task(cluster_inventory.get_cluster_inventory())
|
||||
events_f = asyncio.to_thread(log_ingest.get_events)
|
||||
|
||||
containers = await containers_f
|
||||
alerts, nf_statuses, cluster = await asyncio.gather(alerts_f, nf_status_f, cluster_f,
|
||||
return_exceptions=True)
|
||||
alerts, nf_statuses, cluster, events = await asyncio.gather(
|
||||
alerts_f, nf_status_f, cluster_f, events_f, return_exceptions=True
|
||||
)
|
||||
if isinstance(alerts, Exception):
|
||||
alerts = []
|
||||
if isinstance(nf_statuses, Exception):
|
||||
nf_statuses = []
|
||||
if isinstance(cluster, Exception):
|
||||
cluster = {"enabled": False, "nodes": []}
|
||||
if isinstance(events, Exception):
|
||||
events = []
|
||||
|
||||
# Read all container logs concurrently
|
||||
log_tasks = {nf: asyncio.create_task(_read_logs(cname))
|
||||
for nf, cname in containers.items()}
|
||||
log_texts: dict[str, str] = {}
|
||||
if log_tasks:
|
||||
log_results = await asyncio.gather(*log_tasks.values(), return_exceptions=True)
|
||||
for nf, result in zip(log_tasks.keys(), log_results):
|
||||
log_texts[nf] = result if isinstance(result, str) else ""
|
||||
|
||||
events = sorted(
|
||||
[event for event in events if isinstance(event, dict)],
|
||||
key=lambda event: event.get("epoch", 0.0),
|
||||
)
|
||||
issues: list[dict] = []
|
||||
grouped_log_issues: dict[tuple[str, str, str, str], dict] = {}
|
||||
|
||||
# 1. Log-pattern analysis
|
||||
for category, patterns in CATEGORY_PATTERNS.items():
|
||||
for (pat_re, nf, severity, description, remediation) in patterns:
|
||||
count = _match_count(log_texts.get(nf, ""), pat_re)
|
||||
if count:
|
||||
issues.append({
|
||||
"id": f"log-{nf}-{len(issues)}",
|
||||
"category": category,
|
||||
"nf": nf,
|
||||
"severity": severity,
|
||||
"count": count,
|
||||
"description": description,
|
||||
"remediation": remediation,
|
||||
"source": "log",
|
||||
})
|
||||
# 1. Time-ordered log-pattern analysis across all nodes.
|
||||
for idx, event in enumerate(events):
|
||||
message = event.get("message", "")
|
||||
event_nf = str(event.get("nf", "")).upper()
|
||||
event_node = event.get("node", "")
|
||||
for category, patterns in load_category_patterns().items():
|
||||
for rule in patterns:
|
||||
rule_nf = str(rule["nf"]).upper()
|
||||
if event_nf != rule_nf:
|
||||
continue
|
||||
if not _rule_matches(message, rule["pattern"]):
|
||||
continue
|
||||
|
||||
before_context = events[max(0, idx - LOG_ALERT_CONTEXT_BEFORE):idx]
|
||||
after_context = events[idx + 1:idx + 1 + LOG_ALERT_CONTEXT_AFTER]
|
||||
context_id = log_ingest.record_alert_context(
|
||||
category=category,
|
||||
nf=rule_nf,
|
||||
node=event_node,
|
||||
severity=rule["severity"],
|
||||
description=rule["description"],
|
||||
remediation=rule["remediation"],
|
||||
source="fluentbit",
|
||||
event=event,
|
||||
before_context=before_context,
|
||||
after_context=after_context,
|
||||
)
|
||||
|
||||
issue_key = (category, rule_nf, event_node, rule["description"])
|
||||
if issue_key not in grouped_log_issues:
|
||||
grouped_log_issues[issue_key] = {
|
||||
"id": f"log-{rule_nf}-{len(grouped_log_issues)}",
|
||||
"category": category,
|
||||
"nf": rule_nf,
|
||||
"node": event_node,
|
||||
"severity": rule["severity"],
|
||||
"count": 0,
|
||||
"description": rule["description"],
|
||||
"remediation": rule["remediation"],
|
||||
"source": "fluentbit",
|
||||
"context_id": context_id,
|
||||
}
|
||||
grouped_log_issues[issue_key]["count"] += 1
|
||||
|
||||
issues.extend(grouped_log_issues.values())
|
||||
|
||||
# Fallback to local container logs until Fluent Bit has populated the buffer.
|
||||
if not issues and not events:
|
||||
log_tasks = {nf: asyncio.create_task(_read_logs(cname)) for nf, cname in containers.items()}
|
||||
if log_tasks:
|
||||
log_results = await asyncio.gather(*log_tasks.values(), return_exceptions=True)
|
||||
log_texts = {
|
||||
nf: result if isinstance(result, str) else ""
|
||||
for nf, result in zip(log_tasks.keys(), log_results)
|
||||
}
|
||||
for category, patterns in load_category_patterns().items():
|
||||
for rule in patterns:
|
||||
nf = rule["nf"]
|
||||
if _rule_matches(log_texts.get(nf, ""), rule["pattern"]):
|
||||
issues.append({
|
||||
"id": f"log-{nf}-{len(issues)}",
|
||||
"category": category,
|
||||
"nf": nf,
|
||||
"severity": rule["severity"],
|
||||
"count": 1,
|
||||
"description": rule["description"],
|
||||
"remediation": rule["remediation"],
|
||||
"source": "local-log-fallback",
|
||||
})
|
||||
|
||||
# 2. NF-down events from Prometheus
|
||||
for nf_st in nf_statuses:
|
||||
@@ -337,7 +300,8 @@ async def analyze_logs() -> dict:
|
||||
"total": total,
|
||||
"categories": categories,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"log_sources": list(containers.keys()),
|
||||
"log_sources": sorted({f"{event.get('node', 'unknown')}:{event.get('nf', 'SYSTEM')}" for event in events}) or list(containers.keys()),
|
||||
"log_ingest": log_ingest.receiver_status(),
|
||||
"cluster": cluster,
|
||||
}
|
||||
|
||||
|
||||
499
app/services/log_ingest.py
Normal file
499
app/services/log_ingest.py
Normal file
@@ -0,0 +1,499 @@
|
||||
"""Fluent Bit log ingestion, buffering, and alert-context persistence."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sqlite3
|
||||
from collections import deque
|
||||
from datetime import UTC, datetime
|
||||
from hashlib import sha1
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.config import (
|
||||
ALL_NFS,
|
||||
LOG_ALERT_CONTEXT_AFTER,
|
||||
LOG_ALERT_CONTEXT_BEFORE,
|
||||
LOG_ALLOWED_NFS,
|
||||
LOG_ALERT_CONTEXT_DB_MAX_ROWS,
|
||||
LOG_ALERT_CONTEXT_DB_PATH,
|
||||
LOG_BUFFER_LINES,
|
||||
LOG_AUTO_CONFIGURE,
|
||||
LOG_FLUENTBIT_MATCH,
|
||||
LOG_INGEST_ENABLED,
|
||||
LOG_RECEIVER_BIND_HOST,
|
||||
LOG_RECEIVER_FORMAT,
|
||||
LOG_RECEIVER_HOST,
|
||||
LOG_RECEIVER_PORT,
|
||||
LOG_TRACE_BUFFER_LINES,
|
||||
)
|
||||
from app.services import pls
|
||||
|
||||
_server: asyncio.base_events.Server | None = None
|
||||
_events: deque[dict[str, Any]] = deque(maxlen=max(LOG_BUFFER_LINES, 1))
|
||||
_trace_events: deque[dict[str, Any]] = deque(maxlen=max(LOG_TRACE_BUFFER_LINES, LOG_BUFFER_LINES, 1))
|
||||
_ingested_total = 0
|
||||
_parse_errors = 0
|
||||
_last_event_at: str | None = None
|
||||
_db_initialized = False
|
||||
_allowed_nfs = {nf.upper() for nf in LOG_ALLOWED_NFS}
|
||||
|
||||
|
||||
def _db_path() -> Path:
|
||||
return Path(LOG_ALERT_CONTEXT_DB_PATH)
|
||||
|
||||
|
||||
def _ensure_db() -> None:
|
||||
global _db_initialized
|
||||
if _db_initialized:
|
||||
return
|
||||
path = _db_path()
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
conn = sqlite3.connect(path)
|
||||
try:
|
||||
conn.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS alert_context (
|
||||
id TEXT PRIMARY KEY,
|
||||
fingerprint TEXT UNIQUE,
|
||||
created_at TEXT NOT NULL,
|
||||
event_ts TEXT NOT NULL,
|
||||
category TEXT NOT NULL,
|
||||
nf TEXT,
|
||||
node TEXT,
|
||||
severity TEXT,
|
||||
description TEXT,
|
||||
remediation TEXT,
|
||||
source TEXT,
|
||||
match_message TEXT,
|
||||
before_context TEXT,
|
||||
after_context TEXT
|
||||
)
|
||||
"""
|
||||
)
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
_db_initialized = True
|
||||
|
||||
|
||||
def _trim_db(conn: sqlite3.Connection) -> None:
|
||||
conn.execute(
|
||||
"""
|
||||
DELETE FROM alert_context
|
||||
WHERE id NOT IN (
|
||||
SELECT id
|
||||
FROM alert_context
|
||||
ORDER BY event_ts DESC, created_at DESC
|
||||
LIMIT ?
|
||||
)
|
||||
""",
|
||||
(max(LOG_ALERT_CONTEXT_DB_MAX_ROWS, 1),),
|
||||
)
|
||||
|
||||
|
||||
def _parse_timestamp(value: Any) -> tuple[float, str]:
|
||||
if value is None:
|
||||
now = datetime.now(UTC)
|
||||
return now.timestamp(), now.isoformat()
|
||||
|
||||
if isinstance(value, (int, float)):
|
||||
raw = float(value)
|
||||
if raw > 1_000_000_000_000:
|
||||
raw = raw / 1_000_000.0
|
||||
elif raw > 10_000_000_000:
|
||||
raw = raw / 1000.0
|
||||
dt = datetime.fromtimestamp(raw, UTC)
|
||||
return raw, dt.isoformat()
|
||||
|
||||
text = str(value).strip()
|
||||
if text.isdigit():
|
||||
return _parse_timestamp(int(text))
|
||||
|
||||
normalized = text.replace("Z", "+00:00")
|
||||
for candidate in (normalized, normalized.replace(" ", "T")):
|
||||
try:
|
||||
dt = datetime.fromisoformat(candidate)
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=UTC)
|
||||
else:
|
||||
dt = dt.astimezone(UTC)
|
||||
return dt.timestamp(), dt.isoformat()
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
now = datetime.now(UTC)
|
||||
return now.timestamp(), now.isoformat()
|
||||
|
||||
|
||||
def _candidate_fields(payload: dict[str, Any]) -> list[str]:
|
||||
candidates = []
|
||||
for key in (
|
||||
"message",
|
||||
"MESSAGE",
|
||||
"log",
|
||||
"msg",
|
||||
"systemd_unit",
|
||||
"_SYSTEMD_UNIT",
|
||||
"syslog_identifier",
|
||||
"SYSLOG_IDENTIFIER",
|
||||
"_COMM",
|
||||
"comm",
|
||||
"_EXE",
|
||||
"container_name",
|
||||
"tag",
|
||||
):
|
||||
value = payload.get(key)
|
||||
if value not in (None, ""):
|
||||
candidates.append(str(value))
|
||||
return candidates
|
||||
|
||||
|
||||
def _infer_nf(payload: dict[str, Any], message: str) -> str:
|
||||
haystack = " ".join(_candidate_fields(payload) + [message]).lower()
|
||||
aliases = {
|
||||
"upf": "UPF",
|
||||
"amf": "AMF",
|
||||
"smf": "SMF",
|
||||
"udm": "UDM",
|
||||
"udr": "UDR",
|
||||
"nrf": "NRF",
|
||||
"ausf": "AUSF",
|
||||
"pcf": "PCF",
|
||||
"mme": "MME",
|
||||
"sgwc": "SGWC",
|
||||
"dra": "DRA",
|
||||
"dsm": "DSM",
|
||||
"aaa": "AAA",
|
||||
"bmsc": "BMSC",
|
||||
"chf": "CHF",
|
||||
"smsf": "SMSF",
|
||||
"eir": "EIR",
|
||||
"licensed": "LICENSED",
|
||||
"prometheus": "PROMETHEUS",
|
||||
"alertmanager": "ALERTMANAGER",
|
||||
"fluent-bit": "FLUENT-BIT",
|
||||
}
|
||||
for needle, label in aliases.items():
|
||||
if needle in haystack:
|
||||
return label
|
||||
return "SYSTEM"
|
||||
|
||||
|
||||
def _normalize_event(payload: dict[str, Any], remote_host: str) -> dict[str, Any]:
|
||||
ts_value = (
|
||||
payload.get("timestamp")
|
||||
or payload.get("@timestamp")
|
||||
or payload.get("time")
|
||||
or payload.get("date")
|
||||
or payload.get("_SOURCE_REALTIME_TIMESTAMP")
|
||||
)
|
||||
epoch, ts_iso = _parse_timestamp(ts_value)
|
||||
|
||||
node = (
|
||||
payload.get("hostname")
|
||||
or payload.get("host")
|
||||
or payload.get("_HOSTNAME")
|
||||
or payload.get("syslog_hostname")
|
||||
or remote_host
|
||||
)
|
||||
source = (
|
||||
payload.get("systemd_unit")
|
||||
or payload.get("_SYSTEMD_UNIT")
|
||||
or payload.get("syslog_identifier")
|
||||
or payload.get("SYSLOG_IDENTIFIER")
|
||||
or payload.get("_COMM")
|
||||
or payload.get("tag")
|
||||
or "unknown"
|
||||
)
|
||||
message = (
|
||||
payload.get("message")
|
||||
or payload.get("MESSAGE")
|
||||
or payload.get("log")
|
||||
or payload.get("msg")
|
||||
or ""
|
||||
)
|
||||
message = str(message).strip()
|
||||
tag = str(payload.get("tag", ""))
|
||||
nf = _infer_nf(payload, message)
|
||||
fingerprint = sha1(f"{ts_iso}|{node}|{nf}|{source}|{message}".encode("utf-8")).hexdigest()
|
||||
return {
|
||||
"id": fingerprint,
|
||||
"timestamp": ts_iso,
|
||||
"epoch": epoch,
|
||||
"node": str(node),
|
||||
"nf": nf,
|
||||
"source": str(source),
|
||||
"tag": tag,
|
||||
"message": message,
|
||||
"raw": payload,
|
||||
}
|
||||
|
||||
|
||||
async def _ingest_payload(payload: dict[str, Any], remote_host: str) -> None:
|
||||
global _ingested_total, _last_event_at
|
||||
event = _normalize_event(payload, remote_host)
|
||||
if event.get("nf", "").upper() not in _allowed_nfs:
|
||||
return
|
||||
_events.append(event)
|
||||
_trace_events.append(event)
|
||||
_ingested_total += 1
|
||||
_last_event_at = event["timestamp"]
|
||||
|
||||
|
||||
async def _handle_client(reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
|
||||
global _parse_errors
|
||||
peer = writer.get_extra_info("peername")
|
||||
remote_host = peer[0] if isinstance(peer, tuple) and peer else "unknown"
|
||||
try:
|
||||
while not reader.at_eof():
|
||||
line = await reader.readline()
|
||||
if not line:
|
||||
break
|
||||
text = line.decode("utf-8", errors="replace").strip()
|
||||
if not text:
|
||||
continue
|
||||
try:
|
||||
payload = json.loads(text)
|
||||
if isinstance(payload, dict):
|
||||
await _ingest_payload(payload, remote_host)
|
||||
elif isinstance(payload, list):
|
||||
for item in payload:
|
||||
if isinstance(item, dict):
|
||||
await _ingest_payload(item, remote_host)
|
||||
except Exception:
|
||||
_parse_errors += 1
|
||||
finally:
|
||||
writer.close()
|
||||
await writer.wait_closed()
|
||||
|
||||
|
||||
async def startup() -> None:
|
||||
global _server
|
||||
_ensure_db()
|
||||
if not LOG_INGEST_ENABLED or _server is not None:
|
||||
return
|
||||
_server = await asyncio.start_server(_handle_client, LOG_RECEIVER_BIND_HOST, LOG_RECEIVER_PORT)
|
||||
if LOG_AUTO_CONFIGURE:
|
||||
try:
|
||||
await configure_site_output()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
async def shutdown() -> None:
|
||||
global _server
|
||||
if _server is None:
|
||||
return
|
||||
_server.close()
|
||||
await _server.wait_closed()
|
||||
_server = None
|
||||
|
||||
|
||||
def receiver_status() -> dict[str, Any]:
|
||||
return {
|
||||
"enabled": LOG_INGEST_ENABLED,
|
||||
"bind_host": LOG_RECEIVER_BIND_HOST,
|
||||
"receiver_host": LOG_RECEIVER_HOST,
|
||||
"port": LOG_RECEIVER_PORT,
|
||||
"format": LOG_RECEIVER_FORMAT,
|
||||
"allowed_nfs": sorted(_allowed_nfs),
|
||||
"buffer_lines": LOG_BUFFER_LINES,
|
||||
"trace_buffer_lines": LOG_TRACE_BUFFER_LINES,
|
||||
"context_before": LOG_ALERT_CONTEXT_BEFORE,
|
||||
"context_after": LOG_ALERT_CONTEXT_AFTER,
|
||||
"db_path": str(_db_path()),
|
||||
"ingested_total": _ingested_total,
|
||||
"parse_errors": _parse_errors,
|
||||
"last_event_at": _last_event_at,
|
||||
"current_buffer_size": len(_events),
|
||||
}
|
||||
|
||||
|
||||
def current_output_config(receiver_host: str) -> dict[str, Any]:
|
||||
return {
|
||||
"name": "tcp",
|
||||
"match": LOG_FLUENTBIT_MATCH,
|
||||
"host": receiver_host,
|
||||
"port": LOG_RECEIVER_PORT,
|
||||
"format": LOG_RECEIVER_FORMAT,
|
||||
}
|
||||
|
||||
|
||||
def default_input_config() -> dict[str, Any]:
|
||||
return {
|
||||
"name": "systemd",
|
||||
"path": "/var/log/journal",
|
||||
"tag": "marvis.systemd",
|
||||
"read_from_tail": "on",
|
||||
"strip_underscores": "off",
|
||||
}
|
||||
|
||||
|
||||
async def _resolve_receiver_host() -> str:
|
||||
if LOG_RECEIVER_HOST:
|
||||
return LOG_RECEIVER_HOST
|
||||
|
||||
cluster = await pls.get_cluster_status()
|
||||
if isinstance(cluster, dict):
|
||||
current_node = cluster.get("current_node")
|
||||
if isinstance(current_node, str) and current_node:
|
||||
return pls.node_host(current_node)
|
||||
|
||||
system = await pls.get_system_info()
|
||||
if isinstance(system, dict) and system.get("hostname"):
|
||||
return str(system["hostname"])
|
||||
|
||||
return "127.0.0.1"
|
||||
|
||||
|
||||
def _merged_fluentbit_config(config: dict[str, Any], receiver_host: str) -> dict[str, Any]:
|
||||
merged = dict(config or {})
|
||||
pipeline = dict(merged.get("pipeline") or {})
|
||||
inputs = list(pipeline.get("inputs") or [])
|
||||
outputs = list(pipeline.get("outputs") or [])
|
||||
desired = current_output_config(receiver_host)
|
||||
|
||||
if not inputs:
|
||||
inputs = [default_input_config()]
|
||||
|
||||
filtered = []
|
||||
for output in outputs:
|
||||
if not isinstance(output, dict):
|
||||
continue
|
||||
is_existing_marvis = (
|
||||
output.get("name") == "tcp"
|
||||
and output.get("port") == LOG_RECEIVER_PORT
|
||||
and output.get("format") == LOG_RECEIVER_FORMAT
|
||||
)
|
||||
if not is_existing_marvis:
|
||||
filtered.append(output)
|
||||
|
||||
filtered.append(desired)
|
||||
pipeline["inputs"] = inputs
|
||||
pipeline["outputs"] = filtered
|
||||
merged["pipeline"] = pipeline
|
||||
if "parsers" not in merged:
|
||||
merged["parsers"] = list(config.get("parsers") or []) if isinstance(config, dict) else []
|
||||
return merged
|
||||
|
||||
|
||||
async def configure_site_output() -> dict[str, Any]:
|
||||
current = await pls.get_fluentbit_config()
|
||||
if not isinstance(current, dict):
|
||||
raise RuntimeError("Could not read current Fluent Bit config from PLS")
|
||||
receiver_host = await _resolve_receiver_host()
|
||||
desired = _merged_fluentbit_config(current, receiver_host)
|
||||
updated = await pls.put_fluentbit_config(desired)
|
||||
if not isinstance(updated, dict):
|
||||
raise RuntimeError("PLS rejected Fluent Bit config update")
|
||||
return {
|
||||
"receiver_host": receiver_host,
|
||||
"receiver_port": LOG_RECEIVER_PORT,
|
||||
"match": LOG_FLUENTBIT_MATCH,
|
||||
"config": updated,
|
||||
}
|
||||
|
||||
|
||||
def get_events(limit: int | None = None, node: str | None = None, nf: str | None = None, imsi: str | None = None) -> list[dict[str, Any]]:
|
||||
events = list(_trace_events if imsi else _events)
|
||||
if node:
|
||||
node_l = node.lower()
|
||||
events = [event for event in events if event.get("node", "").lower() == node_l]
|
||||
if nf:
|
||||
nf_u = nf.upper()
|
||||
events = [event for event in events if event.get("nf", "").upper() == nf_u]
|
||||
if imsi:
|
||||
needle = imsi.strip()
|
||||
events = [event for event in events if needle and needle in event.get("message", "")]
|
||||
events.sort(key=lambda event: event.get("epoch", 0.0))
|
||||
if limit is not None:
|
||||
return events[-limit:]
|
||||
return events
|
||||
|
||||
|
||||
def record_alert_context(
|
||||
*,
|
||||
category: str,
|
||||
nf: str,
|
||||
node: str,
|
||||
severity: str,
|
||||
description: str,
|
||||
remediation: str,
|
||||
source: str,
|
||||
event: dict[str, Any],
|
||||
before_context: list[dict[str, Any]],
|
||||
after_context: list[dict[str, Any]],
|
||||
) -> str:
|
||||
_ensure_db()
|
||||
fingerprint = sha1(
|
||||
"|".join(
|
||||
[
|
||||
category,
|
||||
nf,
|
||||
node,
|
||||
severity,
|
||||
description,
|
||||
remediation,
|
||||
event.get("timestamp", ""),
|
||||
event.get("message", ""),
|
||||
]
|
||||
).encode("utf-8")
|
||||
).hexdigest()
|
||||
alert_id = sha1(f"{fingerprint}|{source}".encode("utf-8")).hexdigest()
|
||||
conn = sqlite3.connect(_db_path())
|
||||
try:
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT OR REPLACE INTO alert_context (
|
||||
id, fingerprint, created_at, event_ts, category, nf, node, severity,
|
||||
description, remediation, source, match_message, before_context, after_context
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""",
|
||||
(
|
||||
alert_id,
|
||||
fingerprint,
|
||||
datetime.now(UTC).isoformat(),
|
||||
event.get("timestamp", ""),
|
||||
category,
|
||||
nf,
|
||||
node,
|
||||
severity,
|
||||
description,
|
||||
remediation,
|
||||
source,
|
||||
event.get("message", ""),
|
||||
json.dumps(before_context),
|
||||
json.dumps(after_context),
|
||||
),
|
||||
)
|
||||
_trim_db(conn)
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
return alert_id
|
||||
|
||||
|
||||
def recent_alert_context(limit: int = 20) -> list[dict[str, Any]]:
|
||||
_ensure_db()
|
||||
conn = sqlite3.connect(_db_path())
|
||||
conn.row_factory = sqlite3.Row
|
||||
try:
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT id, created_at, event_ts, category, nf, node, severity, description,
|
||||
remediation, source, match_message, before_context, after_context
|
||||
FROM alert_context
|
||||
ORDER BY event_ts DESC, created_at DESC
|
||||
LIMIT ?
|
||||
""",
|
||||
(limit,),
|
||||
).fetchall()
|
||||
return [dict(row) for row in rows]
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def known_nfs() -> list[str]:
|
||||
return list(ALL_NFS)
|
||||
37
app/services/log_rules.py
Normal file
37
app/services/log_rules.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""JSON-backed log rule loading for runtime-editable log analysis."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
RULES_PATH = Path(__file__).resolve().parents[2] / "config" / "log_rules.json"
|
||||
_rules_cache: dict[str, list[dict]] | None = None
|
||||
_rules_cache_mtime: float | None = None
|
||||
|
||||
|
||||
def load_category_patterns() -> dict[str, list[dict]]:
|
||||
global _rules_cache, _rules_cache_mtime
|
||||
try:
|
||||
stat = RULES_PATH.stat()
|
||||
if _rules_cache is not None and _rules_cache_mtime == stat.st_mtime:
|
||||
return _rules_cache
|
||||
|
||||
data = json.loads(RULES_PATH.read_text())
|
||||
categories = data.get("categories", {})
|
||||
loaded: dict[str, list[dict]] = {}
|
||||
for category, rules in categories.items():
|
||||
loaded[category] = []
|
||||
for rule in rules:
|
||||
if not all(
|
||||
key in rule
|
||||
for key in ("pattern", "nf", "severity", "description", "remediation")
|
||||
):
|
||||
continue
|
||||
loaded[category].append(rule)
|
||||
|
||||
_rules_cache = loaded
|
||||
_rules_cache_mtime = stat.st_mtime
|
||||
return loaded
|
||||
except Exception:
|
||||
return {}
|
||||
@@ -1,4 +1,4 @@
|
||||
"""PLS API client for cluster and per-node discovery."""
|
||||
"""PLS API client for cluster, per-node discovery, and site-wide config."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -11,6 +11,10 @@ from app.config import PLS_AUTH_BACKEND, PLS_BASE_URL, PLS_PASSWORD, PLS_USERNAM
|
||||
_token: str | None = None
|
||||
|
||||
|
||||
class PlsRequestError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
def _base_url_for_host(host: str | None = None) -> str:
|
||||
if not host:
|
||||
return PLS_BASE_URL.rstrip("/")
|
||||
@@ -18,9 +22,9 @@ def _base_url_for_host(host: str | None = None) -> str:
|
||||
return urlunsplit((parts.scheme, host, parts.path.rstrip("/"), "", ""))
|
||||
|
||||
|
||||
async def _login() -> str | None:
|
||||
async def _login(force: bool = False) -> str | None:
|
||||
global _token
|
||||
if _token:
|
||||
if _token and not force:
|
||||
return _token
|
||||
if not PLS_USERNAME or not PLS_PASSWORD:
|
||||
return None
|
||||
@@ -48,17 +52,45 @@ async def _get(path: str, host: str | None = None) -> dict | list | None:
|
||||
if not token:
|
||||
return None
|
||||
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
url = f"{_base_url_for_host(host)}/{path.lstrip('/')}"
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=5, verify=PLS_VERIFY_TLS) as client:
|
||||
response = await client.get(url, headers=headers)
|
||||
response = await client.get(url, headers={"Authorization": f"Bearer {token}"})
|
||||
if response.status_code in {401, 403}:
|
||||
refreshed = await _login(force=True)
|
||||
if not refreshed:
|
||||
return None
|
||||
response = await client.get(url, headers={"Authorization": f"Bearer {refreshed}"})
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
async def _put(path: str, payload: dict, host: str | None = None) -> dict | list | None:
|
||||
token = await _login()
|
||||
if not token:
|
||||
raise PlsRequestError("PLS authentication is not configured or login failed")
|
||||
|
||||
url = f"{_base_url_for_host(host)}/{path.lstrip('/')}"
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=8, verify=PLS_VERIFY_TLS) as client:
|
||||
response = await client.put(url, headers={"Authorization": f"Bearer {token}"}, json=payload)
|
||||
if response.status_code in {401, 403}:
|
||||
refreshed = await _login(force=True)
|
||||
if not refreshed:
|
||||
raise PlsRequestError("PLS token expired and re-login failed")
|
||||
response = await client.put(url, headers={"Authorization": f"Bearer {refreshed}"}, json=payload)
|
||||
if response.is_error:
|
||||
detail = response.text.strip()
|
||||
raise PlsRequestError(f"HTTP {response.status_code}: {detail or 'unknown PLS validation error'}")
|
||||
return response.json()
|
||||
except PlsRequestError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise PlsRequestError(str(exc)) from exc
|
||||
|
||||
|
||||
def node_host(node_name: str) -> str:
|
||||
return node_name.split("@", 1)[1] if "@" in node_name else node_name
|
||||
|
||||
@@ -76,3 +108,13 @@ async def get_system_info(host: str | None = None) -> dict | None:
|
||||
async def get_services(host: str | None = None) -> list[dict]:
|
||||
data = await _get("services", host=host)
|
||||
return data if isinstance(data, list) else []
|
||||
|
||||
|
||||
async def get_fluentbit_config() -> dict | None:
|
||||
data = await _get("fluent-bit/config")
|
||||
return data if isinstance(data, dict) else None
|
||||
|
||||
|
||||
async def put_fluentbit_config(config: dict) -> dict | None:
|
||||
data = await _put("fluent-bit/config", config)
|
||||
return data if isinstance(data, dict) else None
|
||||
|
||||
@@ -174,14 +174,28 @@ header h1 span { color: var(--muted); font-weight: 400; }
|
||||
padding: 9px 12px; margin-bottom: 7px; border-left: 3px solid var(--yellow);
|
||||
}
|
||||
.alert-row.critical { border-left-color: var(--red); }
|
||||
.alert-row.logs { border-left-color: var(--blue); }
|
||||
.alert-row-name { font-size: 12px; font-weight: 600; }
|
||||
.alert-row-desc { font-size: 11px; color: var(--muted); margin-top: 2px; }
|
||||
.alert-row-node { font-size: 10px; color: var(--blue); margin-top: 5px; }
|
||||
.alert-row-meta { display: flex; gap: 6px; align-items: center; margin-top: 6px; flex-wrap: wrap; }
|
||||
.alert-badge {
|
||||
font-size: 9px; text-transform: uppercase; letter-spacing: .08em;
|
||||
border-radius: 999px; padding: 2px 6px; border: 1px solid var(--border); color: var(--muted);
|
||||
}
|
||||
.alert-badge.logs { color: var(--blue); border-color: rgba(59,130,246,.4); background: rgba(59,130,246,.12); }
|
||||
.alert-badge.alertmanager { color: var(--yellow); border-color: rgba(245,158,11,.4); background: rgba(245,158,11,.12); }
|
||||
.alert-context {
|
||||
margin-top: 7px; font-family: ui-monospace,SFMono-Regular,Menlo,monospace;
|
||||
font-size: 10px; line-height: 1.45; color: #c8d1e3;
|
||||
background: rgba(0,0,0,.18); border: 1px solid rgba(255,255,255,.05);
|
||||
border-radius: 6px; padding: 7px 8px; white-space: pre-wrap;
|
||||
}
|
||||
|
||||
/* ── Chat panel ─────────────────────────────────────────────────── */
|
||||
.chat { display: flex; flex-direction: column; overflow: hidden; }
|
||||
.chat { display: grid; grid-template-rows: auto auto minmax(0,1fr) auto; overflow: hidden; }
|
||||
.messages {
|
||||
flex: 1; overflow-y: auto; padding: 20px; display: flex; flex-direction: column; gap: 14px;
|
||||
min-height: 0; overflow-y: auto; padding: 20px; display: flex; flex-direction: column; gap: 14px;
|
||||
}
|
||||
.messages::-webkit-scrollbar { width: 4px; }
|
||||
.messages::-webkit-scrollbar-thumb { background: var(--border); border-radius: 4px; }
|
||||
@@ -241,9 +255,62 @@ header h1 span { color: var(--muted); font-weight: 400; }
|
||||
.send:hover { opacity: .85; }
|
||||
.send:disabled { opacity: .35; cursor: default; }
|
||||
|
||||
/* Trace panel */
|
||||
.trace-panel {
|
||||
background: linear-gradient(180deg, rgba(30,37,53,.65), rgba(15,17,23,.95));
|
||||
flex-shrink: 0;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
min-height: 220px;
|
||||
max-height: 280px;
|
||||
border-bottom: 1px solid var(--border);
|
||||
}
|
||||
.trace-header {
|
||||
padding: 12px 20px 10px;
|
||||
display: flex; align-items: center; justify-content: space-between; gap: 10px;
|
||||
border-bottom: 1px solid rgba(255,255,255,.04);
|
||||
}
|
||||
.trace-title {
|
||||
font-size: 11px; font-weight: 700; text-transform: uppercase; letter-spacing: .1em; color: var(--muted);
|
||||
}
|
||||
.trace-status { font-size: 11px; color: var(--muted); }
|
||||
.trace-controls {
|
||||
padding: 10px 20px;
|
||||
display: grid; grid-template-columns: repeat(4, minmax(0, 1fr)); gap: 8px;
|
||||
}
|
||||
.trace-controls select,
|
||||
.trace-controls input,
|
||||
.trace-controls button {
|
||||
background: var(--card); color: var(--text); border: 1px solid var(--border);
|
||||
border-radius: 8px; padding: 8px 10px; font: inherit; min-width: 0;
|
||||
}
|
||||
.trace-controls button { cursor: pointer; }
|
||||
.trace-controls button:hover { border-color: var(--purple); }
|
||||
.trace-log {
|
||||
flex: 1; overflow: auto; padding: 0 20px 16px;
|
||||
}
|
||||
.trace-log::-webkit-scrollbar { width: 4px; height: 4px; }
|
||||
.trace-log::-webkit-scrollbar-thumb { background: var(--border); border-radius: 4px; }
|
||||
.trace-empty {
|
||||
color: var(--muted); font-size: 12px; padding-top: 16px;
|
||||
}
|
||||
.trace-pre {
|
||||
font-family: ui-monospace,SFMono-Regular,Menlo,monospace;
|
||||
font-size: 11px; line-height: 1.55; color: #dbe5f5; white-space: pre-wrap;
|
||||
}
|
||||
.trace-line {
|
||||
display: block; padding: 2px 0;
|
||||
}
|
||||
.trace-line .t-ts { color: var(--muted); }
|
||||
.trace-line .t-node { color: var(--blue); }
|
||||
.trace-line .t-nf { color: var(--green); }
|
||||
.trace-line .t-src { color: var(--yellow); }
|
||||
|
||||
@media (max-width: 680px) {
|
||||
.layout { grid-template-columns: 1fr; }
|
||||
.left { max-height: 260px; }
|
||||
.trace-controls { grid-template-columns: 1fr 1fr; }
|
||||
.trace-panel { max-height: 320px; }
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
@@ -282,7 +349,25 @@ header h1 span { color: var(--muted); font-weight: 400; }
|
||||
|
||||
<!-- Chat panel -->
|
||||
<div class="chat">
|
||||
<div class="messages" id="messages"></div>
|
||||
<div class="trace-panel">
|
||||
<div class="trace-header">
|
||||
<div class="trace-title">Live Log Trace</div>
|
||||
<div class="trace-status" id="traceStatus">Waiting for log stream…</div>
|
||||
</div>
|
||||
<div class="trace-controls">
|
||||
<select id="traceNode" onchange="loadTraces()">
|
||||
<option value="">All nodes</option>
|
||||
</select>
|
||||
<select id="traceNf" onchange="loadTraces()">
|
||||
<option value="">All NFs</option>
|
||||
</select>
|
||||
<input id="traceLines" type="number" min="10" max="1000" value="80" onchange="loadTraces()" />
|
||||
<button onclick="loadTraces()">Refresh Trace</button>
|
||||
</div>
|
||||
<div class="trace-log" id="traceLog">
|
||||
<div class="trace-empty">No trace data yet.</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="chips">
|
||||
<button class="chip" onclick="ask(this)">Network health overview</button>
|
||||
<button class="chip" onclick="ask(this)">Any active alerts?</button>
|
||||
@@ -291,6 +376,7 @@ header h1 span { color: var(--muted); font-weight: 400; }
|
||||
<button class="chip" onclick="ask(this)">Subscriber issues?</button>
|
||||
<button class="chip" onclick="ask(this)">What can you do?</button>
|
||||
</div>
|
||||
<div class="messages" id="messages"></div>
|
||||
<div class="input-bar">
|
||||
<input class="msg-input" id="inp" placeholder="Ask about your P5G network…"
|
||||
onkeydown="if(event.key==='Enter'&&!event.shiftKey){event.preventDefault();send()}">
|
||||
@@ -312,6 +398,9 @@ const ROLE_LABELS = {
|
||||
'COMBOCP': 'Combo CP',
|
||||
'COMBODCP': 'Combo DCP',
|
||||
};
|
||||
// Most recent cluster payload from ./api/network/status; feeds the node filter rebuild.
let latestCluster = { nodes: [] };
// Upper-cased NF names reported by the log API (status.allowed_nfs); feeds the NF filter.
let allowedTraceNfs = [];
// Interval handle for the periodic loadTraces() polling loop.
let tracePollHandle = null;
||||
|
||||
function md(text) {
|
||||
// minimal markdown: **bold**, `code`, newlines
|
||||
@@ -342,6 +431,7 @@ function addMsg(role, html, isTyping=false) {
|
||||
async function loadNFs() {
|
||||
try {
|
||||
const d = await (await fetch('./api/network/status')).json();
|
||||
latestCluster = d.cluster || { nodes: [] };
|
||||
const grid = $('nfGrid');
|
||||
grid.innerHTML = '';
|
||||
(d.nfs||[]).forEach(nf => {
|
||||
@@ -353,6 +443,7 @@ async function loadNFs() {
|
||||
grid.appendChild(c);
|
||||
});
|
||||
renderNodes(d.cluster);
|
||||
populateTraceFilters(d.cluster);
|
||||
$('dot').className = 'dot';
|
||||
$('connLabel').textContent = 'Live';
|
||||
} catch {
|
||||
@@ -363,6 +454,25 @@ async function loadNFs() {
|
||||
}
|
||||
}
|
||||
|
||||
// Rebuild the node/NF filter <select>s from the latest cluster snapshot and
// the backend's NF allow-list, preserving the user's current selection when
// it is still valid.
function populateTraceFilters(cluster) {
  const nodes = cluster?.nodes || [];
  const nodeSel = $('traceNode');
  const nfSel = $('traceNf');
  // Remember current selections so the rebuild doesn't reset the filters.
  const currentNode = nodeSel.value;
  const currentNf = nfSel.value;

  // Escape values before interpolating into option markup/attributes so a
  // hostname or NF name containing quotes/angle brackets cannot break out of
  // the tag (matches the escapeHtml discipline used elsewhere in this page).
  const nodeOptions = ['<option value="">All nodes</option>']
    .concat(nodes.map(node => `<option value="${escapeHtml(node.hostname)}">${escapeHtml(node.hostname)}</option>`));
  nodeSel.innerHTML = nodeOptions.join('');
  // Fall back to "All nodes" when the previous selection disappeared.
  nodeSel.value = nodes.some(node => node.hostname === currentNode) ? currentNode : '';

  const nfs = new Set(allowedTraceNfs);
  nfSel.innerHTML = ['<option value="">All NFs</option>']
    .concat([...nfs].sort().map(nf => `<option value="${escapeHtml(nf)}">${escapeHtml(nf)}</option>`))
    .join('');
  // Fall back to "All NFs" when the previous selection is no longer allowed.
  nfSel.value = nfs.has(currentNf) ? currentNf : '';
}
|
||||
|
||||
// Expand/collapse the node card that contains the clicked button.
// No-op when the button is not inside a .node-card.
function toggleNodeCard(button) {
  const card = button.closest('.node-card');
  if (card) {
    card.classList.toggle('open');
  }
}
|
||||
@@ -424,10 +534,15 @@ async function loadAlerts() {
|
||||
el.innerHTML = '<div class="no-alerts"><div class="ok-icon">✓</div>No active alerts</div>';
|
||||
} else {
|
||||
el.innerHTML = d.alerts.slice(0,10).map(a =>
|
||||
`<div class="alert-row ${a.severity||'warning'}">
|
||||
`<div class="alert-row ${(a.severity||'warning')} ${a.source==='logs'?'logs':''}">
|
||||
<div class="alert-row-name">${a.name}</div>
|
||||
<div class="alert-row-desc">${a.summary||a.instance||''}</div>
|
||||
<div class="alert-row-node">${(a.nodes||[]).length ? 'Node: ' + a.nodes.map(n => n.hostname).join(', ') : 'Node: unresolved'}</div>
|
||||
<div class="alert-row-meta">
|
||||
<span class="alert-badge ${a.source==='logs'?'logs':'alertmanager'}">${a.source || 'alertmanager'}</span>
|
||||
<span class="alert-badge">${a.severity || 'warning'}</span>
|
||||
</div>
|
||||
${a.source === 'logs' && a.match_message ? `<div class="alert-context">${escapeHtml(a.match_message)}</div>` : ''}
|
||||
</div>`
|
||||
).join('');
|
||||
}
|
||||
@@ -436,7 +551,66 @@ async function loadAlerts() {
|
||||
}
|
||||
}
|
||||
|
||||
async function refresh() { await Promise.all([loadNFs(), loadAlerts()]); }
|
||||
// Fetch recent log events (optionally filtered by node/NF) from the log API
// and render them into the trace panel. Called on filter changes and from the
// periodic polling interval.
async function loadTraces() {
  try {
    // Clamp the requested line count to 10..1000; fall back to 80 when the
    // input is empty or unparsable.
    const limit = Math.max(10, Math.min(1000, parseInt($('traceLines').value || '80', 10) || 80));
    const params = new URLSearchParams({ limit: String(limit) });
    if ($('traceNode').value) params.set('node', $('traceNode').value);
    if ($('traceNf').value) params.set('nf', $('traceNf').value);
    const d = await (await fetch(`./api/logs/events?${params.toString()}`)).json();
    // Refresh the NF allow-list from the response, then rebuild both filter
    // dropdowns against the last known cluster snapshot.
    allowedTraceNfs = (d.status?.allowed_nfs || []).map(nf => String(nf).toUpperCase());
    populateTraceFilters(latestCluster);
    const events = d.events || [];
    $('traceStatus').textContent = d.status?.last_event_at
      ? `Last event ${formatFullDateTime(d.status.last_event_at)}`
      : 'Waiting for log stream…';
    if (!events.length) {
      $('traceLog').innerHTML = '<div class="trace-empty">No log events match the selected filters.</div>';
      return;
    }
    // Each event field is passed through escapeHtml before being interpolated
    // into the innerHTML template.
    $('traceLog').innerHTML = `<div class="trace-pre">${
      events.map(evt => `<span class="trace-line"><span class="t-ts">${escapeHtml(shortTs(evt.timestamp))}</span> <span class="t-node">${escapeHtml(evt.node || 'unknown')}</span> <span class="t-nf">${escapeHtml(evt.nf || 'SYSTEM')}</span> <span class="t-src">${escapeHtml(evt.source || 'unknown')}</span> ${escapeHtml(evt.message || '')}</span>`).join('')
    }</div>`;
    // Keep the view pinned to the newest entries.
    $('traceLog').scrollTop = $('traceLog').scrollHeight;
  } catch {
    $('traceStatus').textContent = 'Trace unavailable';
    $('traceLog').innerHTML = '<div class="trace-empty">Cannot reach trace API.</div>';
  }
}
|
||||
|
||||
// Format a timestamp as a short HH:MM:SS locale time. Empty/falsy input
// yields the '--:--:--' placeholder; an unparsable value is returned as-is.
function shortTs(value) {
  if (!value) return '--:--:--';
  const parsed = new Date(value);
  if (Number.isNaN(parsed.getTime())) return value;
  return parsed.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit', second: '2-digit' });
}
|
||||
|
||||
// Format a timestamp as a full locale date-time string. Empty/falsy input
// yields 'unknown'; an unparsable value is returned unchanged.
function formatFullDateTime(value) {
  if (!value) return 'unknown';
  const parsed = new Date(value);
  if (Number.isNaN(parsed.getTime())) return value;
  const options = {
    year: 'numeric',
    month: '2-digit',
    day: '2-digit',
    hour: '2-digit',
    minute: '2-digit',
    second: '2-digit',
  };
  return parsed.toLocaleString([], options);
}
|
||||
|
||||
// Escape the five HTML-significant characters so untrusted strings can be
// safely interpolated into innerHTML. null/undefined become ''.
// NOTE: as captured, the replacement strings had been entity-decoded into
// identity replacements (and `'''` is a syntax error); the proper entities
// are restored here.
function escapeHtml(value) {
  return String(value ?? '')
    .replace(/&/g, '&amp;')   // must run first so later entities aren't double-escaped
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
|
||||
|
||||
// Refresh all dashboard panels concurrently.
async function refresh() {
  const panels = [loadNFs(), loadAlerts(), loadTraces()];
  await Promise.all(panels);
}
|
||||
|
||||
// ── Chat ──────────────────────────────────────────────────────────────────
|
||||
async function send() {
|
||||
@@ -478,6 +652,7 @@ function ask(btn) { $('inp').value = btn.textContent; send(); }
|
||||
));
|
||||
await refresh();
|
||||
setInterval(refresh, 30000);
|
||||
tracePollHandle = setInterval(loadTraces, 5000);
|
||||
})();
|
||||
</script>
|
||||
</body>
|
||||
|
||||
Reference in New Issue
Block a user