added multi node functionality

This commit is contained in:
Jake Kasper
2026-04-24 12:33:52 -04:00
parent c4c081362e
commit 16e5f2ced2
30 changed files with 673 additions and 93 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
+66 -15
View File
@@ -30,6 +30,7 @@ async def answer(query: str, network_state: dict, alerts: list) -> str:
def _rule_based(query: str, network_state: dict, alerts: list) -> str:
q = query.lower()
nfs = network_state.get("nfs", [])
cluster = network_state.get("cluster", {})
up = [n for n in nfs if n["state"] == "up"]
down = [n for n in nfs if n["state"] == "down"]
@@ -58,26 +59,40 @@ def _rule_based(query: str, network_state: dict, alerts: list) -> str:
return _alerts_summary(alerts)
if any(w in q for w in ["subscriber", "ue ", "device", "phone", "handset", "registration", "attach"]):
return _subscriber_analysis(nfs, alerts)
return _subscriber_analysis(nfs, alerts, cluster)
if any(w in q for w in ["session", "pdu", "bearer", "user plane", "traffic", "throughput"]):
return _session_analysis(nfs, alerts)
return _session_analysis(nfs, alerts, cluster)
# Default → health summary
return _health_summary(up, down, alerts)
return _health_summary(up, down, alerts, cluster)
def _health_summary(up: list, down: list, alerts: list) -> str:
def _health_summary(up: list, down: list, alerts: list, cluster: dict) -> str:
ts = datetime.now().strftime("%H:%M:%S")
crit = [a for a in alerts if a.get("severity") == "critical"]
warn = [a for a in alerts if a.get("severity") != "critical"]
lines = [f"**P5G Network Health — {ts}**\n"]
nodes = cluster.get("nodes", [])
if up:
lines.append(f"✅ **{len(up)} UP**: {', '.join(n['name'] for n in up)}")
lines.append(f"✅ **{len(up)} UP**: {', '.join(_nf_label(n) for n in up)}")
if down:
lines.append(f"🔴 **{len(down)} DOWN**: {', '.join(n['name'] for n in down)}")
lines.append(f" ⚡ Action: check `{CONTAINER_RUNTIME} logs <nf>` in the runtime host")
lines.append(f"🔴 **{len(down)} DOWN**: {', '.join(_nf_label(n) for n in down)}")
lines.append(" ⚡ Action: inspect the node shown for each affected NF before pulling logs.")
if nodes:
lines.append(f"\n**Cluster nodes ({len(nodes)})**")
for node in nodes:
running = [nf["name"] for nf in node.get("nfs", []) if nf.get("state") == "up"]
down_nfs = [nf["name"] for nf in node.get("nfs", []) if nf.get("state") == "down"]
role = node.get("role", "AP")
lines.append(
f"• **{node['hostname']}** ({role}{', local' if node.get('current') else ''})"
f" — running: {', '.join(running) or 'none'}"
)
if down_nfs:
lines.append(f" down here: {', '.join(down_nfs)}")
if alerts:
lines.append(f"\n⚠️ **{len(alerts)} alert(s)** — {len(crit)} critical, {len(warn)} warning")
@@ -102,8 +117,15 @@ def _nf_detail(nf_name: str, nfs: list, alerts: list) -> str:
f"Check: `{CONTAINER_RUNTIME} ps | grep {nf_name.lower()}`")
icon = "" if nf["state"] == "up" else "🔴"
lines = [f"{icon} **{nf_name}** is **{nf['state'].upper()}**",
f"Instance: `{nf.get('instance', 'n/a')}`"]
placements = nf.get("nodes", [])
lines = [f"{icon} **{nf_name}** is **{nf['state'].upper()}**"]
if placements:
node_text = ", ".join(
f"{node['hostname']} ({'/'.join(node.get('roles', []))})"
for node in placements
)
lines.append(f"Nodes: {node_text}")
lines.append(f"Instance: `{nf.get('instance', 'n/a')}`")
if nf_alerts:
lines.append(f"\n⚠️ {len(nf_alerts)} alert(s) for {nf_name}:")
for a in nf_alerts:
@@ -129,43 +151,72 @@ def _alerts_summary(alerts: list) -> str:
return "\n".join(lines)
def _subscriber_analysis(nfs: list, alerts: list) -> str:
def _subscriber_analysis(nfs: list, alerts: list, cluster: dict) -> str:
amf = next((n for n in nfs if n["name"] == "AMF"), None)
smf = next((n for n in nfs if n["name"] == "SMF"), None)
lines = ["**Subscriber & Registration Analysis**\n"]
lines.append(f"AMF (registration/mobility): {'✅ UP' if amf and amf['state'] == 'up' else '🔴 DOWN — subscribers cannot register'}")
lines.append(f"SMF (session management): {'✅ UP' if smf and smf['state'] == 'up' else '🔴 DOWN — no new data sessions'}")
lines.append(f"AMF (registration/mobility): {_nf_sentence(amf, 'subscribers cannot register')}")
lines.append(f"SMF (session management): {_nf_sentence(smf, 'no new data sessions')}")
sub_alerts = [a for a in alerts if any(k in a.get("name", "").lower()
for k in ["ue", "subscriber", "session", "attach", "registration"])]
if sub_alerts:
lines.append(f"\n⚠️ {len(sub_alerts)} subscriber-related alert(s) active.")
else:
lines.append("\nNo subscriber-related alerts detected.")
lines.append(_cluster_scope(cluster))
return "\n".join(lines)
def _session_analysis(nfs: list, alerts: list) -> str:
def _session_analysis(nfs: list, alerts: list, cluster: dict) -> str:
smf = next((n for n in nfs if n["name"] == "SMF"), None)
upf = next((n for n in nfs if n["name"] == "UPF"), None)
lines = ["**PDU Session & Data Plane Analysis**\n"]
lines.append(f"SMF: {'✅ UP' if smf and smf['state'] == 'up' else '🔴 DOWN'}")
lines.append(f"UPF: {'✅ UP' if upf and upf['state'] == 'up' else '🔴 DOWN'}")
lines.append(f"SMF: {_nf_sentence(smf, 'session setup is blocked')}")
lines.append(f"UPF: {_nf_sentence(upf, 'user-plane forwarding is blocked')}")
if (not smf or smf["state"] != "up") or (not upf or upf["state"] != "up"):
lines.append("\n⚡ **Impact**: PDU sessions will fail until both SMF and UPF are operational.")
else:
lines.append("\nBoth SMF and UPF operational — sessions should be establishing normally.")
lines.append(_cluster_scope(cluster))
return "\n".join(lines)
def _nf_label(nf: dict) -> str:
placements = nf.get("nodes", [])
if not placements:
return nf["name"]
return f"{nf['name']} on {', '.join(node['hostname'] for node in placements)}"
def _nf_sentence(nf: dict | None, impact: str) -> str:
if not nf:
return "○ N/A"
if nf.get("state") == "up":
nodes = ", ".join(node["hostname"] for node in nf.get("nodes", [])) or nf.get("instance", "unknown host")
return f"✅ UP on {nodes}"
return f"🔴 DOWN — {impact}"
def _cluster_scope(cluster: dict) -> str:
nodes = cluster.get("nodes", [])
if not nodes:
return "\nCluster discovery is not available."
details = ", ".join(f"{node['hostname']} ({node.get('role', 'AP')})" for node in nodes)
return f"\nCluster scope checked: {details}"
# ── LLM backends ──────────────────────────────────────────────────────────
def _build_context(network_state: dict, alerts: list) -> str:
nfs = network_state.get("nfs", [])
up = [n["name"] for n in nfs if n["state"] == "up"]
down = [n["name"] for n in nfs if n["state"] == "down"]
nodes = network_state.get("cluster", {}).get("nodes", [])
node_summary = ", ".join(f"{node['hostname']} ({node.get('role', 'AP')})" for node in nodes) or "none"
return (
f"NFs UP: {', '.join(up) or 'none'}\n"
f"NFs DOWN: {', '.join(down) or 'none'}\n"
f"Cluster nodes: {node_summary}\n"
f"Active alerts: {', '.join(a.get('name','') for a in alerts[:5]) or 'none'}"
)
+18 -2
View File
@@ -2,6 +2,7 @@
import httpx
from app.config import ALERTMANAGER_URL
from app.services import cluster_inventory
_BASE = ALERTMANAGER_URL.rstrip("/")
@@ -16,14 +17,29 @@ async def get_alerts() -> list:
except Exception:
return []
cluster = await cluster_inventory.get_cluster_inventory()
alerts = []
for a in raw:
labels = a.get("labels", {})
annotations = a.get("annotations", {})
name = labels.get("alertname", "Unknown")
summary = annotations.get("summary", annotations.get("description", ""))
nf_name = _infer_nf(name, summary, labels.get("instance", ""))
nodes = cluster_inventory.find_nf_nodes(cluster, nf_name) if nf_name else []
alerts.append({
"name": labels.get("alertname", "Unknown"),
"name": name,
"severity": labels.get("severity", "warning"),
"instance": labels.get("instance", ""),
"summary": annotations.get("summary", annotations.get("description", "")),
"summary": summary,
"nf": nf_name,
"nodes": nodes,
})
return alerts
def _infer_nf(name: str, summary: str, instance: str) -> str:
text = f"{name} {summary} {instance}".upper()
for nf_name in ["AMF", "SMF", "UPF", "UDM", "UDR", "NRF", "AUSF", "PCF", "MME", "SGWC", "DRA", "DSM"]:
if nf_name in text:
return nf_name
return ""
+180
View File
@@ -0,0 +1,180 @@
"""Cluster discovery built on top of the PLS API."""
from __future__ import annotations
import asyncio
import re
from app.config import ALL_NFS
from app.services import pls, prometheus
ROLE_NF_MAP = {
"5GALL": {"amf", "smf", "pcf", "udr", "udm", "nrf", "eir", "ausf", "dra", "upf", "chf", "smsf", "aaa", "bmsc"},
"CP": {"amf", "smf", "pcf", "udr", "udm", "nrf", "eir", "ausf", "dra", "chf", "smsf", "aaa", "bmsc"},
"UP": {"upf"},
"DCP": {"amf", "smf", "pcf", "chf", "smsf", "bmsc"},
"DLF": {"udr", "udm", "nrf", "eir", "ausf", "aaa"},
"SIG": {"dra"},
"4GALL": {"mme", "sgwc", "smf", "pcf", "chf", "udr", "udm", "nrf", "eir", "ausf", "dra", "upf", "smsf", "aaa", "bmsc"},
"4GCP": {"mme", "sgwc", "smf", "pcf", "chf", "udr", "udm", "nrf", "eir", "ausf", "dra", "smsf", "aaa", "bmsc"},
"4GDCP": {"mme", "sgwc", "smf", "pcf", "chf", "smsf", "bmsc"},
"COMBOALL": {"amf", "mme", "sgwc", "smf", "pcf", "chf", "udr", "udm", "nrf", "eir", "ausf", "dra", "upf", "smsf", "aaa", "bmsc"},
"COMBOCP": {"amf", "mme", "sgwc", "smf", "pcf", "chf", "udr", "udm", "nrf", "eir", "ausf", "dra", "smsf", "aaa", "bmsc"},
"COMBODCP": {"amf", "mme", "sgwc", "smf", "pcf", "chf", "aaa"},
}
ROLE_ALIASES = {
"UPF": "UP",
}
ROLE_PRIORITY = ["COMBOALL", "COMBOCP", "COMBODCP", "5GALL", "4GALL", "4GCP", "4GDCP", "DCP", "DLF", "SIG", "CP", "UP"]
def _infer_role(hostname: str) -> str:
tokens = [t for t in re.split(r"[^A-Za-z0-9]+", hostname.upper()) if t]
normalized = [ROLE_ALIASES.get(token, token) for token in tokens]
for role in ROLE_PRIORITY:
if role in normalized:
return role
for token in normalized:
if token.endswith("UPF"):
return "UP"
return "AP"
async def get_cluster_inventory() -> dict:
cluster = await pls.get_cluster_status()
if not cluster:
return {
"enabled": False,
"current_node": None,
"fully_established": False,
"nodes": [],
}
node_names = [node.get("name", "") for node in cluster.get("nodes", [])]
info_tasks = [asyncio.create_task(pls.get_system_info(pls.node_host(name))) for name in node_names]
service_tasks = [asyncio.create_task(pls.get_services(pls.node_host(name))) for name in node_names]
infos = await asyncio.gather(*info_tasks, return_exceptions=True)
services = await asyncio.gather(*service_tasks, return_exceptions=True)
nodes: list[dict] = []
for idx, node in enumerate(cluster.get("nodes", [])):
info = infos[idx] if isinstance(infos[idx], dict) else {}
node_services = services[idx] if isinstance(services[idx], list) else []
started = {svc["name"] for svc in node_services if svc.get("state") == "started"}
hostname = info.get("hostname") or pls.node_host(node.get("name", ""))
role = _infer_role(hostname)
nodes.append(
{
"name": node.get("name", ""),
"address": pls.node_host(node.get("name", "")),
"hostname": hostname,
"current": node.get("name") == cluster.get("current_node"),
"repositories": node.get("repositories", []),
"role": role,
"roles": [role],
"expected_nfs": sorted(ROLE_NF_MAP.get(role, set())),
"services": node_services,
"started_services": sorted(started),
}
)
return {
"enabled": True,
"current_node": cluster.get("current_node"),
"fully_established": bool(cluster.get("fully_established")),
"nodes": nodes,
}
def _aggregate_nf_state(nf_name: str, nodes: list[dict], prom_states: dict[str, dict]) -> dict:
service_name = nf_name.lower()
placements = []
seen_service = False
for node in nodes:
for service in node.get("services", []):
if service.get("name") != service_name:
continue
seen_service = True
if service.get("state") == "started":
placements.append(
{
"hostname": node["hostname"],
"address": node["address"],
"roles": node["roles"],
}
)
prom_state = prom_states.get(nf_name, {"state": "unknown", "instance": ""})
if placements:
state = prom_state["state"] if prom_state["state"] in {"up", "down"} else "up"
instance = ", ".join(p["hostname"] for p in placements)
elif seen_service:
state = "down"
instance = ""
else:
state = prom_state["state"]
instance = prom_state["instance"]
return {
"name": nf_name,
"state": state,
"instance": instance,
"nodes": placements,
}
def _node_nf_state(node: dict, nf_name: str) -> dict:
service_name = nf_name.lower()
service = next((svc for svc in node.get("services", []) if svc.get("name") == service_name), None)
if not service:
return {"name": nf_name, "state": "unknown"}
if service.get("state") == "started":
return {"name": nf_name, "state": "up"}
return {"name": nf_name, "state": "down"}
def _attach_node_nf_status(nodes: list[dict]) -> list[dict]:
enriched = []
for node in nodes:
node_copy = dict(node)
expected_nfs = node_copy.get("expected_nfs", [])
node_copy["nfs"] = [_node_nf_state(node_copy, nf_name.upper()) for nf_name in expected_nfs]
enriched.append(node_copy)
return enriched
async def get_network_status() -> dict:
inventory_task = asyncio.create_task(get_cluster_inventory())
prom_task = asyncio.create_task(prometheus.get_nf_status_map())
inventory, prom_states = await asyncio.gather(inventory_task, prom_task)
nodes = _attach_node_nf_status(inventory.get("nodes", []))
inventory["nodes"] = nodes
nfs = [_aggregate_nf_state(nf_name, nodes, prom_states) for nf_name in ALL_NFS]
up = sum(1 for nf in nfs if nf["state"] == "up")
down = sum(1 for nf in nfs if nf["state"] == "down")
return {
"nfs": nfs,
"summary": {"up": up, "down": down, "total": len(nfs)},
"cluster": inventory,
}
def find_nf_nodes(cluster: dict, nf_name: str) -> list[dict]:
nodes = cluster.get("nodes", [])
matches = []
for node in nodes:
for nf in node.get("nfs", []):
if nf.get("name") == nf_name:
matches.append(
{
"hostname": node["hostname"],
"address": node["address"],
"role": node.get("role", "AP"),
"current": node.get("current", False),
"state": nf.get("state", "unknown"),
}
)
break
return matches
+12 -4
View File
@@ -235,20 +235,23 @@ async def analyze_logs() -> dict:
Gather log-pattern issues + Prometheus NF status + Alertmanager alerts.
Returns a fully structured dict ready for JSON serialisation.
"""
from app.services import alertmanager, prometheus
from app.services import alertmanager, prometheus, cluster_inventory
# Kick off all I/O in parallel
containers_f = asyncio.create_task(_discover_containers())
alerts_f = asyncio.create_task(alertmanager.get_alerts())
nf_status_f = asyncio.create_task(prometheus.get_nf_status())
cluster_f = asyncio.create_task(cluster_inventory.get_cluster_inventory())
containers = await containers_f
alerts, nf_statuses = await asyncio.gather(alerts_f, nf_status_f,
alerts, nf_statuses, cluster = await asyncio.gather(alerts_f, nf_status_f, cluster_f,
return_exceptions=True)
if isinstance(alerts, Exception):
alerts = []
if isinstance(nf_statuses, Exception):
nf_statuses = []
if isinstance(cluster, Exception):
cluster = {"enabled": False, "nodes": []}
# Read all container logs concurrently
log_tasks = {nf: asyncio.create_task(_read_logs(cname))
@@ -280,25 +283,29 @@ async def analyze_logs() -> dict:
# 2. NF-down events from Prometheus
for nf_st in nf_statuses:
if isinstance(nf_st, dict) and nf_st.get("state") == "down":
node_text = ", ".join(node["hostname"] for node in nf_st.get("nodes", []))
issues.append({
"id": f"nf-down-{nf_st['name']}",
"category": "Connectivity",
"nf": nf_st["name"],
"node": node_text,
"severity": "critical",
"count": 1,
"description": f"{nf_st['name']} is unreachable",
"remediation": (f"Run `{CONTAINER_RUNTIME} ps` and check if {nf_st['name']} "
f"container is running; inspect its logs."),
"remediation": (f"Check {node_text or 'the hosting node'} first, then run "
f"`{CONTAINER_RUNTIME} ps` and inspect `{nf_st['name'].lower()}` logs."),
"source": "prometheus",
})
# 3. Active Alertmanager alerts
for alert in alerts:
if isinstance(alert, dict):
node_text = ", ".join(node["hostname"] for node in alert.get("nodes", []))
issues.append({
"id": f"alert-{alert.get('name', '')}-{len(issues)}",
"category": _alert_category(alert),
"nf": _alert_nf(alert),
"node": node_text,
"severity": alert.get("severity", "warning"),
"count": 1,
"description": alert.get("summary") or alert.get("name", "Unknown alert"),
@@ -331,6 +338,7 @@ async def analyze_logs() -> dict:
"categories": categories,
"timestamp": datetime.now().isoformat(),
"log_sources": list(containers.keys()),
"cluster": cluster,
}
# Persist to history ring-buffer
+78
View File
@@ -0,0 +1,78 @@
"""PLS API client for cluster and per-node discovery."""
from __future__ import annotations
from urllib.parse import urlsplit, urlunsplit
import httpx
from app.config import PLS_AUTH_BACKEND, PLS_BASE_URL, PLS_PASSWORD, PLS_USERNAME, PLS_VERIFY_TLS
_token: str | None = None
def _base_url_for_host(host: str | None = None) -> str:
if not host:
return PLS_BASE_URL.rstrip("/")
parts = urlsplit(PLS_BASE_URL)
return urlunsplit((parts.scheme, host, parts.path.rstrip("/"), "", ""))
async def _login() -> str | None:
global _token
if _token:
return _token
if not PLS_USERNAME or not PLS_PASSWORD:
return None
try:
async with httpx.AsyncClient(timeout=5, verify=PLS_VERIFY_TLS) as client:
response = await client.post(
f"{_base_url_for_host()}/auth/login",
json={
"username": PLS_USERNAME,
"password": PLS_PASSWORD,
"backend": PLS_AUTH_BACKEND,
},
)
response.raise_for_status()
data = response.json()
_token = data.get("access_token")
return _token
except Exception:
return None
async def _get(path: str, host: str | None = None) -> dict | list | None:
token = await _login()
if not token:
return None
headers = {"Authorization": f"Bearer {token}"}
url = f"{_base_url_for_host(host)}/{path.lstrip('/')}"
try:
async with httpx.AsyncClient(timeout=5, verify=PLS_VERIFY_TLS) as client:
response = await client.get(url, headers=headers)
response.raise_for_status()
return response.json()
except Exception:
return None
def node_host(node_name: str) -> str:
return node_name.split("@", 1)[1] if "@" in node_name else node_name
async def get_cluster_status() -> dict | None:
data = await _get("data_layer/cluster/status")
return data if isinstance(data, dict) else None
async def get_system_info(host: str | None = None) -> dict | None:
data = await _get("system/info", host=host)
return data if isinstance(data, dict) else None
async def get_services(host: str | None = None) -> list[dict]:
data = await _get("services", host=host)
return data if isinstance(data, list) else []
+9 -4
View File
@@ -14,12 +14,12 @@ async def query(promql: str) -> list:
return r.json()["data"]["result"]
async def get_nf_status() -> list:
"""Return a list of {name, state, instance} for every known NF."""
async def get_nf_status_map() -> dict[str, dict]:
"""Return Prometheus-backed NF status keyed by display name."""
try:
results = await query("up")
except Exception:
return [{"name": n, "state": "unknown", "instance": ""} for n in ALL_NFS]
return {n: {"name": n, "state": "unknown", "instance": ""} for n in ALL_NFS}
seen: dict[str, dict] = {}
for r in results:
@@ -38,4 +38,9 @@ async def get_nf_status() -> list:
if n not in seen:
seen[n] = {"name": n, "state": "unknown", "instance": ""}
return list(seen.values())
return seen
async def get_nf_status() -> list:
"""Return a list of {name, state, instance} for every known NF."""
return list((await get_nf_status_map()).values())