added multi node functionality

This commit is contained in:
Jake Kasper
2026-04-24 12:33:52 -04:00
parent c4c081362e
commit 16e5f2ced2
30 changed files with 673 additions and 93 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -30,6 +30,7 @@ async def answer(query: str, network_state: dict, alerts: list) -> str:
def _rule_based(query: str, network_state: dict, alerts: list) -> str:
q = query.lower()
nfs = network_state.get("nfs", [])
cluster = network_state.get("cluster", {})
up = [n for n in nfs if n["state"] == "up"]
down = [n for n in nfs if n["state"] == "down"]
@@ -58,26 +59,40 @@ def _rule_based(query: str, network_state: dict, alerts: list) -> str:
return _alerts_summary(alerts)
if any(w in q for w in ["subscriber", "ue ", "device", "phone", "handset", "registration", "attach"]):
return _subscriber_analysis(nfs, alerts)
return _subscriber_analysis(nfs, alerts, cluster)
if any(w in q for w in ["session", "pdu", "bearer", "user plane", "traffic", "throughput"]):
return _session_analysis(nfs, alerts)
return _session_analysis(nfs, alerts, cluster)
# Default → health summary
return _health_summary(up, down, alerts)
return _health_summary(up, down, alerts, cluster)
def _health_summary(up: list, down: list, alerts: list) -> str:
def _health_summary(up: list, down: list, alerts: list, cluster: dict) -> str:
ts = datetime.now().strftime("%H:%M:%S")
crit = [a for a in alerts if a.get("severity") == "critical"]
warn = [a for a in alerts if a.get("severity") != "critical"]
lines = [f"**P5G Network Health — {ts}**\n"]
nodes = cluster.get("nodes", [])
if up:
lines.append(f"✅ **{len(up)} UP**: {', '.join(n['name'] for n in up)}")
lines.append(f"✅ **{len(up)} UP**: {', '.join(_nf_label(n) for n in up)}")
if down:
lines.append(f"🔴 **{len(down)} DOWN**: {', '.join(n['name'] for n in down)}")
lines.append(f" ⚡ Action: check `{CONTAINER_RUNTIME} logs <nf>` in the runtime host")
lines.append(f"🔴 **{len(down)} DOWN**: {', '.join(_nf_label(n) for n in down)}")
lines.append(" ⚡ Action: inspect the node shown for each affected NF before pulling logs.")
if nodes:
lines.append(f"\n**Cluster nodes ({len(nodes)})**")
for node in nodes:
running = [nf["name"] for nf in node.get("nfs", []) if nf.get("state") == "up"]
down_nfs = [nf["name"] for nf in node.get("nfs", []) if nf.get("state") == "down"]
role = node.get("role", "AP")
lines.append(
f"• **{node['hostname']}** ({role}{', local' if node.get('current') else ''})"
f" — running: {', '.join(running) or 'none'}"
)
if down_nfs:
lines.append(f" down here: {', '.join(down_nfs)}")
if alerts:
lines.append(f"\n⚠️ **{len(alerts)} alert(s)** — {len(crit)} critical, {len(warn)} warning")
@@ -102,8 +117,15 @@ def _nf_detail(nf_name: str, nfs: list, alerts: list) -> str:
f"Check: `{CONTAINER_RUNTIME} ps | grep {nf_name.lower()}`")
icon = "" if nf["state"] == "up" else "🔴"
lines = [f"{icon} **{nf_name}** is **{nf['state'].upper()}**",
f"Instance: `{nf.get('instance', 'n/a')}`"]
placements = nf.get("nodes", [])
lines = [f"{icon} **{nf_name}** is **{nf['state'].upper()}**"]
if placements:
node_text = ", ".join(
f"{node['hostname']} ({'/'.join(node.get('roles', []))})"
for node in placements
)
lines.append(f"Nodes: {node_text}")
lines.append(f"Instance: `{nf.get('instance', 'n/a')}`")
if nf_alerts:
lines.append(f"\n⚠️ {len(nf_alerts)} alert(s) for {nf_name}:")
for a in nf_alerts:
@@ -129,43 +151,72 @@ def _alerts_summary(alerts: list) -> str:
return "\n".join(lines)
def _subscriber_analysis(nfs: list, alerts: list) -> str:
def _subscriber_analysis(nfs: list, alerts: list, cluster: dict) -> str:
amf = next((n for n in nfs if n["name"] == "AMF"), None)
smf = next((n for n in nfs if n["name"] == "SMF"), None)
lines = ["**Subscriber & Registration Analysis**\n"]
lines.append(f"AMF (registration/mobility): {'✅ UP' if amf and amf['state'] == 'up' else '🔴 DOWN — subscribers cannot register'}")
lines.append(f"SMF (session management): {'✅ UP' if smf and smf['state'] == 'up' else '🔴 DOWN — no new data sessions'}")
lines.append(f"AMF (registration/mobility): {_nf_sentence(amf, 'subscribers cannot register')}")
lines.append(f"SMF (session management): {_nf_sentence(smf, 'no new data sessions')}")
sub_alerts = [a for a in alerts if any(k in a.get("name", "").lower()
for k in ["ue", "subscriber", "session", "attach", "registration"])]
if sub_alerts:
lines.append(f"\n⚠️ {len(sub_alerts)} subscriber-related alert(s) active.")
else:
lines.append("\nNo subscriber-related alerts detected.")
lines.append(_cluster_scope(cluster))
return "\n".join(lines)
def _session_analysis(nfs: list, alerts: list) -> str:
def _session_analysis(nfs: list, alerts: list, cluster: dict) -> str:
smf = next((n for n in nfs if n["name"] == "SMF"), None)
upf = next((n for n in nfs if n["name"] == "UPF"), None)
lines = ["**PDU Session & Data Plane Analysis**\n"]
lines.append(f"SMF: {'✅ UP' if smf and smf['state'] == 'up' else '🔴 DOWN'}")
lines.append(f"UPF: {'✅ UP' if upf and upf['state'] == 'up' else '🔴 DOWN'}")
lines.append(f"SMF: {_nf_sentence(smf, 'session setup is blocked')}")
lines.append(f"UPF: {_nf_sentence(upf, 'user-plane forwarding is blocked')}")
if (not smf or smf["state"] != "up") or (not upf or upf["state"] != "up"):
lines.append("\n⚡ **Impact**: PDU sessions will fail until both SMF and UPF are operational.")
else:
lines.append("\nBoth SMF and UPF operational — sessions should be establishing normally.")
lines.append(_cluster_scope(cluster))
return "\n".join(lines)
def _nf_label(nf: dict) -> str:
placements = nf.get("nodes", [])
if not placements:
return nf["name"]
return f"{nf['name']} on {', '.join(node['hostname'] for node in placements)}"
def _nf_sentence(nf: dict | None, impact: str) -> str:
if not nf:
return "○ N/A"
if nf.get("state") == "up":
nodes = ", ".join(node["hostname"] for node in nf.get("nodes", [])) or nf.get("instance", "unknown host")
return f"✅ UP on {nodes}"
return f"🔴 DOWN — {impact}"
def _cluster_scope(cluster: dict) -> str:
nodes = cluster.get("nodes", [])
if not nodes:
return "\nCluster discovery is not available."
details = ", ".join(f"{node['hostname']} ({node.get('role', 'AP')})" for node in nodes)
return f"\nCluster scope checked: {details}"
# ── LLM backends ──────────────────────────────────────────────────────────
def _build_context(network_state: dict, alerts: list) -> str:
nfs = network_state.get("nfs", [])
up = [n["name"] for n in nfs if n["state"] == "up"]
down = [n["name"] for n in nfs if n["state"] == "down"]
nodes = network_state.get("cluster", {}).get("nodes", [])
node_summary = ", ".join(f"{node['hostname']} ({node.get('role', 'AP')})" for node in nodes) or "none"
return (
f"NFs UP: {', '.join(up) or 'none'}\n"
f"NFs DOWN: {', '.join(down) or 'none'}\n"
f"Cluster nodes: {node_summary}\n"
f"Active alerts: {', '.join(a.get('name','') for a in alerts[:5]) or 'none'}"
)

View File

@@ -2,6 +2,7 @@
import httpx
from app.config import ALERTMANAGER_URL
from app.services import cluster_inventory
_BASE = ALERTMANAGER_URL.rstrip("/")
@@ -16,14 +17,29 @@ async def get_alerts() -> list:
except Exception:
return []
cluster = await cluster_inventory.get_cluster_inventory()
alerts = []
for a in raw:
labels = a.get("labels", {})
annotations = a.get("annotations", {})
name = labels.get("alertname", "Unknown")
summary = annotations.get("summary", annotations.get("description", ""))
nf_name = _infer_nf(name, summary, labels.get("instance", ""))
nodes = cluster_inventory.find_nf_nodes(cluster, nf_name) if nf_name else []
alerts.append({
"name": labels.get("alertname", "Unknown"),
"name": name,
"severity": labels.get("severity", "warning"),
"instance": labels.get("instance", ""),
"summary": annotations.get("summary", annotations.get("description", "")),
"summary": summary,
"nf": nf_name,
"nodes": nodes,
})
return alerts
def _infer_nf(name: str, summary: str, instance: str) -> str:
text = f"{name} {summary} {instance}".upper()
for nf_name in ["AMF", "SMF", "UPF", "UDM", "UDR", "NRF", "AUSF", "PCF", "MME", "SGWC", "DRA", "DSM"]:
if nf_name in text:
return nf_name
return ""

View File

@@ -0,0 +1,180 @@
"""Cluster discovery built on top of the PLS API."""
from __future__ import annotations
import asyncio
import re
from app.config import ALL_NFS
from app.services import pls, prometheus
ROLE_NF_MAP = {
"5GALL": {"amf", "smf", "pcf", "udr", "udm", "nrf", "eir", "ausf", "dra", "upf", "chf", "smsf", "aaa", "bmsc"},
"CP": {"amf", "smf", "pcf", "udr", "udm", "nrf", "eir", "ausf", "dra", "chf", "smsf", "aaa", "bmsc"},
"UP": {"upf"},
"DCP": {"amf", "smf", "pcf", "chf", "smsf", "bmsc"},
"DLF": {"udr", "udm", "nrf", "eir", "ausf", "aaa"},
"SIG": {"dra"},
"4GALL": {"mme", "sgwc", "smf", "pcf", "chf", "udr", "udm", "nrf", "eir", "ausf", "dra", "upf", "smsf", "aaa", "bmsc"},
"4GCP": {"mme", "sgwc", "smf", "pcf", "chf", "udr", "udm", "nrf", "eir", "ausf", "dra", "smsf", "aaa", "bmsc"},
"4GDCP": {"mme", "sgwc", "smf", "pcf", "chf", "smsf", "bmsc"},
"COMBOALL": {"amf", "mme", "sgwc", "smf", "pcf", "chf", "udr", "udm", "nrf", "eir", "ausf", "dra", "upf", "smsf", "aaa", "bmsc"},
"COMBOCP": {"amf", "mme", "sgwc", "smf", "pcf", "chf", "udr", "udm", "nrf", "eir", "ausf", "dra", "smsf", "aaa", "bmsc"},
"COMBODCP": {"amf", "mme", "sgwc", "smf", "pcf", "chf", "aaa"},
}
ROLE_ALIASES = {
"UPF": "UP",
}
ROLE_PRIORITY = ["COMBOALL", "COMBOCP", "COMBODCP", "5GALL", "4GALL", "4GCP", "4GDCP", "DCP", "DLF", "SIG", "CP", "UP"]
def _infer_role(hostname: str) -> str:
tokens = [t for t in re.split(r"[^A-Za-z0-9]+", hostname.upper()) if t]
normalized = [ROLE_ALIASES.get(token, token) for token in tokens]
for role in ROLE_PRIORITY:
if role in normalized:
return role
for token in normalized:
if token.endswith("UPF"):
return "UP"
return "AP"
async def get_cluster_inventory() -> dict:
    """Discover cluster membership plus per-node system info and services via PLS.

    Returns a dict with ``enabled``, ``current_node``, ``fully_established``
    and a ``nodes`` list. When PLS reports no cluster, discovery is marked
    disabled and the node list is empty.
    """
    cluster = await pls.get_cluster_status()
    if not cluster:
        return {
            "enabled": False,
            "current_node": None,
            "fully_established": False,
            "nodes": [],
        }
    raw_nodes = cluster.get("nodes", [])
    names = [entry.get("name", "") for entry in raw_nodes]
    # Start every per-node request up front so info and service calls for all
    # nodes run concurrently.
    info_tasks = [asyncio.create_task(pls.get_system_info(pls.node_host(name))) for name in names]
    svc_tasks = [asyncio.create_task(pls.get_services(pls.node_host(name))) for name in names]
    infos = await asyncio.gather(*info_tasks, return_exceptions=True)
    all_services = await asyncio.gather(*svc_tasks, return_exceptions=True)
    nodes: list[dict] = []
    for entry, info_result, svc_result in zip(raw_nodes, infos, all_services):
        # Failed lookups degrade to empty data rather than aborting discovery.
        info = info_result if isinstance(info_result, dict) else {}
        node_services = svc_result if isinstance(svc_result, list) else []
        started = sorted({svc["name"] for svc in node_services if svc.get("state") == "started"})
        hostname = info.get("hostname") or pls.node_host(entry.get("name", ""))
        role = _infer_role(hostname)
        nodes.append(
            {
                "name": entry.get("name", ""),
                "address": pls.node_host(entry.get("name", "")),
                "hostname": hostname,
                "current": entry.get("name") == cluster.get("current_node"),
                "repositories": entry.get("repositories", []),
                "role": role,
                "roles": [role],
                "expected_nfs": sorted(ROLE_NF_MAP.get(role, set())),
                "services": node_services,
                "started_services": started,
            }
        )
    return {
        "enabled": True,
        "current_node": cluster.get("current_node"),
        "fully_established": bool(cluster.get("fully_established")),
        "nodes": nodes,
    }
def _aggregate_nf_state(nf_name: str, nodes: list[dict], prom_states: dict[str, dict]) -> dict:
service_name = nf_name.lower()
placements = []
seen_service = False
for node in nodes:
for service in node.get("services", []):
if service.get("name") != service_name:
continue
seen_service = True
if service.get("state") == "started":
placements.append(
{
"hostname": node["hostname"],
"address": node["address"],
"roles": node["roles"],
}
)
prom_state = prom_states.get(nf_name, {"state": "unknown", "instance": ""})
if placements:
state = prom_state["state"] if prom_state["state"] in {"up", "down"} else "up"
instance = ", ".join(p["hostname"] for p in placements)
elif seen_service:
state = "down"
instance = ""
else:
state = prom_state["state"]
instance = prom_state["instance"]
return {
"name": nf_name,
"state": state,
"instance": instance,
"nodes": placements,
}
def _node_nf_state(node: dict, nf_name: str) -> dict:
service_name = nf_name.lower()
service = next((svc for svc in node.get("services", []) if svc.get("name") == service_name), None)
if not service:
return {"name": nf_name, "state": "unknown"}
if service.get("state") == "started":
return {"name": nf_name, "state": "up"}
return {"name": nf_name, "state": "down"}
def _attach_node_nf_status(nodes: list[dict]) -> list[dict]:
enriched = []
for node in nodes:
node_copy = dict(node)
expected_nfs = node_copy.get("expected_nfs", [])
node_copy["nfs"] = [_node_nf_state(node_copy, nf_name.upper()) for nf_name in expected_nfs]
enriched.append(node_copy)
return enriched
async def get_network_status() -> dict:
    """Combine cluster inventory and Prometheus into the network-state payload."""
    inventory_task = asyncio.create_task(get_cluster_inventory())
    prom_task = asyncio.create_task(prometheus.get_nf_status_map())
    inventory, prom_states = await asyncio.gather(inventory_task, prom_task)
    # Enrich each node with its per-NF status before aggregating across nodes.
    inventory["nodes"] = _attach_node_nf_status(inventory.get("nodes", []))
    nfs = [_aggregate_nf_state(name, inventory["nodes"], prom_states) for name in ALL_NFS]
    counts = {"up": 0, "down": 0}
    for nf in nfs:
        if nf["state"] in counts:
            counts[nf["state"]] += 1
    return {
        "nfs": nfs,
        "summary": {"up": counts["up"], "down": counts["down"], "total": len(nfs)},
        "cluster": inventory,
    }
def find_nf_nodes(cluster: dict, nf_name: str) -> list[dict]:
    """List every cluster node hosting *nf_name*, with that node's view of its state."""
    found = []
    for node in cluster.get("nodes", []):
        hit = next((nf for nf in node.get("nfs", []) if nf.get("name") == nf_name), None)
        if hit is None:
            continue
        found.append(
            {
                "hostname": node["hostname"],
                "address": node["address"],
                "role": node.get("role", "AP"),
                "current": node.get("current", False),
                "state": hit.get("state", "unknown"),
            }
        )
    return found

View File

@@ -235,20 +235,23 @@ async def analyze_logs() -> dict:
Gather log-pattern issues + Prometheus NF status + Alertmanager alerts.
Returns a fully structured dict ready for JSON serialisation.
"""
from app.services import alertmanager, prometheus
from app.services import alertmanager, prometheus, cluster_inventory
# Kick off all I/O in parallel
containers_f = asyncio.create_task(_discover_containers())
alerts_f = asyncio.create_task(alertmanager.get_alerts())
nf_status_f = asyncio.create_task(prometheus.get_nf_status())
cluster_f = asyncio.create_task(cluster_inventory.get_cluster_inventory())
containers = await containers_f
alerts, nf_statuses = await asyncio.gather(alerts_f, nf_status_f,
alerts, nf_statuses, cluster = await asyncio.gather(alerts_f, nf_status_f, cluster_f,
return_exceptions=True)
if isinstance(alerts, Exception):
alerts = []
if isinstance(nf_statuses, Exception):
nf_statuses = []
if isinstance(cluster, Exception):
cluster = {"enabled": False, "nodes": []}
# Read all container logs concurrently
log_tasks = {nf: asyncio.create_task(_read_logs(cname))
@@ -280,25 +283,29 @@ async def analyze_logs() -> dict:
# 2. NF-down events from Prometheus
for nf_st in nf_statuses:
if isinstance(nf_st, dict) and nf_st.get("state") == "down":
node_text = ", ".join(node["hostname"] for node in nf_st.get("nodes", []))
issues.append({
"id": f"nf-down-{nf_st['name']}",
"category": "Connectivity",
"nf": nf_st["name"],
"node": node_text,
"severity": "critical",
"count": 1,
"description": f"{nf_st['name']} is unreachable",
"remediation": (f"Run `{CONTAINER_RUNTIME} ps` and check if {nf_st['name']} "
f"container is running; inspect its logs."),
"remediation": (f"Check {node_text or 'the hosting node'} first, then run "
f"`{CONTAINER_RUNTIME} ps` and inspect `{nf_st['name'].lower()}` logs."),
"source": "prometheus",
})
# 3. Active Alertmanager alerts
for alert in alerts:
if isinstance(alert, dict):
node_text = ", ".join(node["hostname"] for node in alert.get("nodes", []))
issues.append({
"id": f"alert-{alert.get('name', '')}-{len(issues)}",
"category": _alert_category(alert),
"nf": _alert_nf(alert),
"node": node_text,
"severity": alert.get("severity", "warning"),
"count": 1,
"description": alert.get("summary") or alert.get("name", "Unknown alert"),
@@ -331,6 +338,7 @@ async def analyze_logs() -> dict:
"categories": categories,
"timestamp": datetime.now().isoformat(),
"log_sources": list(containers.keys()),
"cluster": cluster,
}
# Persist to history ring-buffer

78
app/services/pls.py Normal file
View File

@@ -0,0 +1,78 @@
"""PLS API client for cluster and per-node discovery."""
from __future__ import annotations
from urllib.parse import urlsplit, urlunsplit
import httpx
from app.config import PLS_AUTH_BACKEND, PLS_BASE_URL, PLS_PASSWORD, PLS_USERNAME, PLS_VERIFY_TLS
_token: str | None = None
def _base_url_for_host(host: str | None = None) -> str:
    """Return the configured PLS base URL, retargeted at *host* when one is given."""
    if not host:
        return PLS_BASE_URL.rstrip("/")
    # Keep the configured scheme and path; swap only the network location.
    scheme, _, path, _, _ = urlsplit(PLS_BASE_URL)
    return urlunsplit((scheme, host, path.rstrip("/"), "", ""))
async def _login() -> str | None:
    """Authenticate against PLS once and cache the bearer token module-wide.

    Returns the cached token when present; None when credentials are missing
    or the login request fails for any reason.
    """
    global _token
    if _token:
        return _token
    if not PLS_USERNAME or not PLS_PASSWORD:
        return None
    payload = {
        "username": PLS_USERNAME,
        "password": PLS_PASSWORD,
        "backend": PLS_AUTH_BACKEND,
    }
    try:
        async with httpx.AsyncClient(timeout=5, verify=PLS_VERIFY_TLS) as client:
            response = await client.post(f"{_base_url_for_host()}/auth/login", json=payload)
            response.raise_for_status()
            _token = response.json().get("access_token")
            return _token
    except Exception:
        # Best-effort: discovery degrades gracefully when auth is unavailable.
        return None
async def _get(path: str, host: str | None = None) -> dict | list | None:
    """GET a PLS API *path*, optionally on another cluster *host*.

    Returns the decoded JSON body, or None when login fails or the request
    errors out.
    """
    token = await _login()
    if not token:
        return None
    url = f"{_base_url_for_host(host)}/{path.lstrip('/')}"
    try:
        async with httpx.AsyncClient(timeout=5, verify=PLS_VERIFY_TLS) as client:
            response = await client.get(url, headers={"Authorization": f"Bearer {token}"})
            response.raise_for_status()
            return response.json()
    except Exception:
        return None
def node_host(node_name: str) -> str:
    """Extract the host part from an Erlang-style ``name@host`` node name."""
    _, sep, host = node_name.partition("@")
    return host if sep else node_name
async def get_cluster_status() -> dict | None:
    """Cluster membership/status document, or None when unavailable."""
    result = await _get("data_layer/cluster/status")
    if isinstance(result, dict):
        return result
    return None


async def get_system_info(host: str | None = None) -> dict | None:
    """Per-node system information (hostname etc.), or None when unavailable."""
    result = await _get("system/info", host=host)
    if isinstance(result, dict):
        return result
    return None


async def get_services(host: str | None = None) -> list[dict]:
    """Per-node service list; empty list when the call fails or returns non-list data."""
    result = await _get("services", host=host)
    return result if isinstance(result, list) else []

View File

@@ -14,12 +14,12 @@ async def query(promql: str) -> list:
return r.json()["data"]["result"]
async def get_nf_status() -> list:
"""Return a list of {name, state, instance} for every known NF."""
async def get_nf_status_map() -> dict[str, dict]:
"""Return Prometheus-backed NF status keyed by display name."""
try:
results = await query("up")
except Exception:
return [{"name": n, "state": "unknown", "instance": ""} for n in ALL_NFS]
return {n: {"name": n, "state": "unknown", "instance": ""} for n in ALL_NFS}
seen: dict[str, dict] = {}
for r in results:
@@ -38,4 +38,9 @@ async def get_nf_status() -> list:
if n not in seen:
seen[n] = {"name": n, "state": "unknown", "instance": ""}
return list(seen.values())
return seen
async def get_nf_status() -> list:
    """Return a list of {name, state, instance} for every known NF."""
    status_map = await get_nf_status_map()
    return list(status_map.values())