Added multi-node functionality

This commit is contained in:
Jake Kasper
2026-04-24 12:33:52 -04:00
parent c4c081362e
commit 16e5f2ced2
30 changed files with 673 additions and 93 deletions

Binary file not shown.

View File

@@ -1,11 +1,26 @@
import os
def _env_bool(name: str, default: bool) -> bool:
value = os.getenv(name)
if value is None:
return default
return value.lower() in {"1", "true", "yes", "on"}
# Defaults assume the appliance-style deployment model where Marvis runs with
# host networking and talks to sibling services over host loopback.
PROMETHEUS_URL = os.getenv("MARVIS_PROMETHEUS_URL", "http://127.0.0.1:9090")
# URL path prefix Prometheus is served under (e.g. behind a reverse proxy).
PROMETHEUS_PREFIX = os.getenv("MARVIS_PROMETHEUS_PREFIX", "/prometheus")
ALERTMANAGER_URL = os.getenv("MARVIS_ALERTMANAGER_URL", "http://127.0.0.1:9093")
# PLS discovery defaults assume the local appliance exposes PLS via Traefik.
PLS_BASE_URL = os.getenv("MARVIS_PLS_BASE_URL", "https://127.0.0.1/core/pls/api/1")
# Credentials default to empty; PLS-backed discovery stays inert until both
# username and password are supplied.
PLS_USERNAME = os.getenv("MARVIS_PLS_USERNAME", "")
PLS_PASSWORD = os.getenv("MARVIS_PLS_PASSWORD", "")
PLS_AUTH_BACKEND = os.getenv("MARVIS_PLS_AUTH_BACKEND", "local")
# TLS verification is off by default — presumably because the appliance uses a
# self-signed certificate; TODO confirm and enable in hardened deployments.
PLS_VERIFY_TLS = _env_bool("MARVIS_PLS_VERIFY_TLS", False)
# AI backend: "rule" | "openai" | "ollama"
AI_MODE = os.getenv("MARVIS_AI_MODE", "rule")
OPENAI_API_KEY = os.getenv("MARVIS_OPENAI_API_KEY", "")

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1,5 +1,5 @@
from fastapi import APIRouter
from app.services import alertmanager
from app.services import alertmanager, cluster_inventory
router = APIRouter()
@@ -8,4 +8,9 @@ router = APIRouter()
async def get_alerts():
alerts = await alertmanager.get_alerts()
critical = sum(1 for a in alerts if a.get("severity") == "critical")
return {"alerts": alerts, "total": len(alerts), "critical": critical}
return {
"alerts": alerts,
"total": len(alerts),
"critical": critical,
"cluster": await cluster_inventory.get_cluster_inventory(),
}

View File

@@ -1,12 +1,9 @@
from fastapi import APIRouter
from app.services import prometheus
from app.services import cluster_inventory
router = APIRouter()
@router.get("/network/status")
async def network_status():
nfs = await prometheus.get_nf_status()
up = sum(1 for n in nfs if n["state"] == "up")
down = sum(1 for n in nfs if n["state"] == "down")
return {"nfs": nfs, "summary": {"up": up, "down": down, "total": len(nfs)}}
return await cluster_inventory.get_network_status()

View File

@@ -1,6 +1,6 @@
from fastapi import APIRouter
from pydantic import BaseModel
from app.services import prometheus, alertmanager, ai
from app.services import cluster_inventory, alertmanager, ai
router = APIRouter()
@@ -18,7 +18,7 @@ async def query(req: QueryRequest):
async def _gather(query_text: str):
import asyncio
nfs_task = asyncio.create_task(prometheus.get_nf_status())
nfs_task = asyncio.create_task(cluster_inventory.get_network_status())
alerts_task = asyncio.create_task(alertmanager.get_alerts())
nfs, alerts = await asyncio.gather(nfs_task, alerts_task)
return {"nfs": nfs}, alerts
network_state, alerts = await asyncio.gather(nfs_task, alerts_task)
return network_state, alerts

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -30,6 +30,7 @@ async def answer(query: str, network_state: dict, alerts: list) -> str:
def _rule_based(query: str, network_state: dict, alerts: list) -> str:
q = query.lower()
nfs = network_state.get("nfs", [])
cluster = network_state.get("cluster", {})
up = [n for n in nfs if n["state"] == "up"]
down = [n for n in nfs if n["state"] == "down"]
@@ -58,26 +59,40 @@ def _rule_based(query: str, network_state: dict, alerts: list) -> str:
return _alerts_summary(alerts)
if any(w in q for w in ["subscriber", "ue ", "device", "phone", "handset", "registration", "attach"]):
return _subscriber_analysis(nfs, alerts)
return _subscriber_analysis(nfs, alerts, cluster)
if any(w in q for w in ["session", "pdu", "bearer", "user plane", "traffic", "throughput"]):
return _session_analysis(nfs, alerts)
return _session_analysis(nfs, alerts, cluster)
# Default → health summary
return _health_summary(up, down, alerts)
return _health_summary(up, down, alerts, cluster)
def _health_summary(up: list, down: list, alerts: list) -> str:
def _health_summary(up: list, down: list, alerts: list, cluster: dict) -> str:
ts = datetime.now().strftime("%H:%M:%S")
crit = [a for a in alerts if a.get("severity") == "critical"]
warn = [a for a in alerts if a.get("severity") != "critical"]
lines = [f"**P5G Network Health — {ts}**\n"]
nodes = cluster.get("nodes", [])
if up:
lines.append(f"✅ **{len(up)} UP**: {', '.join(n['name'] for n in up)}")
lines.append(f"✅ **{len(up)} UP**: {', '.join(_nf_label(n) for n in up)}")
if down:
lines.append(f"🔴 **{len(down)} DOWN**: {', '.join(n['name'] for n in down)}")
lines.append(f" ⚡ Action: check `{CONTAINER_RUNTIME} logs <nf>` in the runtime host")
lines.append(f"🔴 **{len(down)} DOWN**: {', '.join(_nf_label(n) for n in down)}")
lines.append(" ⚡ Action: inspect the node shown for each affected NF before pulling logs.")
if nodes:
lines.append(f"\n**Cluster nodes ({len(nodes)})**")
for node in nodes:
running = [nf["name"] for nf in node.get("nfs", []) if nf.get("state") == "up"]
down_nfs = [nf["name"] for nf in node.get("nfs", []) if nf.get("state") == "down"]
role = node.get("role", "AP")
lines.append(
f"• **{node['hostname']}** ({role}{', local' if node.get('current') else ''})"
f" — running: {', '.join(running) or 'none'}"
)
if down_nfs:
lines.append(f" down here: {', '.join(down_nfs)}")
if alerts:
lines.append(f"\n⚠️ **{len(alerts)} alert(s)** — {len(crit)} critical, {len(warn)} warning")
@@ -102,8 +117,15 @@ def _nf_detail(nf_name: str, nfs: list, alerts: list) -> str:
f"Check: `{CONTAINER_RUNTIME} ps | grep {nf_name.lower()}`")
icon = "" if nf["state"] == "up" else "🔴"
lines = [f"{icon} **{nf_name}** is **{nf['state'].upper()}**",
f"Instance: `{nf.get('instance', 'n/a')}`"]
placements = nf.get("nodes", [])
lines = [f"{icon} **{nf_name}** is **{nf['state'].upper()}**"]
if placements:
node_text = ", ".join(
f"{node['hostname']} ({'/'.join(node.get('roles', []))})"
for node in placements
)
lines.append(f"Nodes: {node_text}")
lines.append(f"Instance: `{nf.get('instance', 'n/a')}`")
if nf_alerts:
lines.append(f"\n⚠️ {len(nf_alerts)} alert(s) for {nf_name}:")
for a in nf_alerts:
@@ -129,43 +151,72 @@ def _alerts_summary(alerts: list) -> str:
return "\n".join(lines)
def _subscriber_analysis(nfs: list, alerts: list) -> str:
def _subscriber_analysis(nfs: list, alerts: list, cluster: dict) -> str:
amf = next((n for n in nfs if n["name"] == "AMF"), None)
smf = next((n for n in nfs if n["name"] == "SMF"), None)
lines = ["**Subscriber & Registration Analysis**\n"]
lines.append(f"AMF (registration/mobility): {'✅ UP' if amf and amf['state'] == 'up' else '🔴 DOWN — subscribers cannot register'}")
lines.append(f"SMF (session management): {'✅ UP' if smf and smf['state'] == 'up' else '🔴 DOWN — no new data sessions'}")
lines.append(f"AMF (registration/mobility): {_nf_sentence(amf, 'subscribers cannot register')}")
lines.append(f"SMF (session management): {_nf_sentence(smf, 'no new data sessions')}")
sub_alerts = [a for a in alerts if any(k in a.get("name", "").lower()
for k in ["ue", "subscriber", "session", "attach", "registration"])]
if sub_alerts:
lines.append(f"\n⚠️ {len(sub_alerts)} subscriber-related alert(s) active.")
else:
lines.append("\nNo subscriber-related alerts detected.")
lines.append(_cluster_scope(cluster))
return "\n".join(lines)
def _session_analysis(nfs: list, alerts: list) -> str:
def _session_analysis(nfs: list, alerts: list, cluster: dict) -> str:
smf = next((n for n in nfs if n["name"] == "SMF"), None)
upf = next((n for n in nfs if n["name"] == "UPF"), None)
lines = ["**PDU Session & Data Plane Analysis**\n"]
lines.append(f"SMF: {'✅ UP' if smf and smf['state'] == 'up' else '🔴 DOWN'}")
lines.append(f"UPF: {'✅ UP' if upf and upf['state'] == 'up' else '🔴 DOWN'}")
lines.append(f"SMF: {_nf_sentence(smf, 'session setup is blocked')}")
lines.append(f"UPF: {_nf_sentence(upf, 'user-plane forwarding is blocked')}")
if (not smf or smf["state"] != "up") or (not upf or upf["state"] != "up"):
lines.append("\n⚡ **Impact**: PDU sessions will fail until both SMF and UPF are operational.")
else:
lines.append("\nBoth SMF and UPF operational — sessions should be establishing normally.")
lines.append(_cluster_scope(cluster))
return "\n".join(lines)
def _nf_label(nf: dict) -> str:
placements = nf.get("nodes", [])
if not placements:
return nf["name"]
return f"{nf['name']} on {', '.join(node['hostname'] for node in placements)}"
def _nf_sentence(nf: dict | None, impact: str) -> str:
if not nf:
return "○ N/A"
if nf.get("state") == "up":
nodes = ", ".join(node["hostname"] for node in nf.get("nodes", [])) or nf.get("instance", "unknown host")
return f"✅ UP on {nodes}"
return f"🔴 DOWN — {impact}"
def _cluster_scope(cluster: dict) -> str:
nodes = cluster.get("nodes", [])
if not nodes:
return "\nCluster discovery is not available."
details = ", ".join(f"{node['hostname']} ({node.get('role', 'AP')})" for node in nodes)
return f"\nCluster scope checked: {details}"
# ── LLM backends ──────────────────────────────────────────────────────────
def _build_context(network_state: dict, alerts: list) -> str:
nfs = network_state.get("nfs", [])
up = [n["name"] for n in nfs if n["state"] == "up"]
down = [n["name"] for n in nfs if n["state"] == "down"]
nodes = network_state.get("cluster", {}).get("nodes", [])
node_summary = ", ".join(f"{node['hostname']} ({node.get('role', 'AP')})" for node in nodes) or "none"
return (
f"NFs UP: {', '.join(up) or 'none'}\n"
f"NFs DOWN: {', '.join(down) or 'none'}\n"
f"Cluster nodes: {node_summary}\n"
f"Active alerts: {', '.join(a.get('name','') for a in alerts[:5]) or 'none'}"
)

View File

@@ -2,6 +2,7 @@
import httpx
from app.config import ALERTMANAGER_URL
from app.services import cluster_inventory
_BASE = ALERTMANAGER_URL.rstrip("/")
@@ -16,14 +17,29 @@ async def get_alerts() -> list:
except Exception:
return []
cluster = await cluster_inventory.get_cluster_inventory()
alerts = []
for a in raw:
labels = a.get("labels", {})
annotations = a.get("annotations", {})
name = labels.get("alertname", "Unknown")
summary = annotations.get("summary", annotations.get("description", ""))
nf_name = _infer_nf(name, summary, labels.get("instance", ""))
nodes = cluster_inventory.find_nf_nodes(cluster, nf_name) if nf_name else []
alerts.append({
"name": labels.get("alertname", "Unknown"),
"name": name,
"severity": labels.get("severity", "warning"),
"instance": labels.get("instance", ""),
"summary": annotations.get("summary", annotations.get("description", "")),
"summary": summary,
"nf": nf_name,
"nodes": nodes,
})
return alerts
def _infer_nf(name: str, summary: str, instance: str) -> str:
text = f"{name} {summary} {instance}".upper()
for nf_name in ["AMF", "SMF", "UPF", "UDM", "UDR", "NRF", "AUSF", "PCF", "MME", "SGWC", "DRA", "DSM"]:
if nf_name in text:
return nf_name
return ""

View File

@@ -0,0 +1,180 @@
"""Cluster discovery built on top of the PLS API."""
from __future__ import annotations
import asyncio
import re
from app.config import ALL_NFS
from app.services import pls, prometheus
ROLE_NF_MAP = {
"5GALL": {"amf", "smf", "pcf", "udr", "udm", "nrf", "eir", "ausf", "dra", "upf", "chf", "smsf", "aaa", "bmsc"},
"CP": {"amf", "smf", "pcf", "udr", "udm", "nrf", "eir", "ausf", "dra", "chf", "smsf", "aaa", "bmsc"},
"UP": {"upf"},
"DCP": {"amf", "smf", "pcf", "chf", "smsf", "bmsc"},
"DLF": {"udr", "udm", "nrf", "eir", "ausf", "aaa"},
"SIG": {"dra"},
"4GALL": {"mme", "sgwc", "smf", "pcf", "chf", "udr", "udm", "nrf", "eir", "ausf", "dra", "upf", "smsf", "aaa", "bmsc"},
"4GCP": {"mme", "sgwc", "smf", "pcf", "chf", "udr", "udm", "nrf", "eir", "ausf", "dra", "smsf", "aaa", "bmsc"},
"4GDCP": {"mme", "sgwc", "smf", "pcf", "chf", "smsf", "bmsc"},
"COMBOALL": {"amf", "mme", "sgwc", "smf", "pcf", "chf", "udr", "udm", "nrf", "eir", "ausf", "dra", "upf", "smsf", "aaa", "bmsc"},
"COMBOCP": {"amf", "mme", "sgwc", "smf", "pcf", "chf", "udr", "udm", "nrf", "eir", "ausf", "dra", "smsf", "aaa", "bmsc"},
"COMBODCP": {"amf", "mme", "sgwc", "smf", "pcf", "chf", "aaa"},
}
ROLE_ALIASES = {
"UPF": "UP",
}
ROLE_PRIORITY = ["COMBOALL", "COMBOCP", "COMBODCP", "5GALL", "4GALL", "4GCP", "4GDCP", "DCP", "DLF", "SIG", "CP", "UP"]
def _infer_role(hostname: str) -> str:
tokens = [t for t in re.split(r"[^A-Za-z0-9]+", hostname.upper()) if t]
normalized = [ROLE_ALIASES.get(token, token) for token in tokens]
for role in ROLE_PRIORITY:
if role in normalized:
return role
for token in normalized:
if token.endswith("UPF"):
return "UP"
return "AP"
async def get_cluster_inventory() -> dict:
    """Discover the PLS cluster and describe every member node.

    Returns a dict with ``enabled`` (whether PLS discovery succeeded),
    ``current_node``, ``fully_established`` and a ``nodes`` list; each node
    entry carries its address, hostname, inferred role, role-derived expected
    NFs, and the raw PLS service list for that node.
    """
    cluster = await pls.get_cluster_status()
    if not cluster:
        # PLS unreachable or unauthenticated: report discovery as disabled.
        return {
            "enabled": False,
            "current_node": None,
            "fully_established": False,
            "nodes": [],
        }
    node_names = [node.get("name", "") for node in cluster.get("nodes", [])]
    # Fan out per-node queries concurrently; per-node failures are tolerated
    # (return_exceptions) and replaced with empty placeholders below.
    info_tasks = [asyncio.create_task(pls.get_system_info(pls.node_host(name))) for name in node_names]
    service_tasks = [asyncio.create_task(pls.get_services(pls.node_host(name))) for name in node_names]
    infos = await asyncio.gather(*info_tasks, return_exceptions=True)
    services = await asyncio.gather(*service_tasks, return_exceptions=True)
    nodes: list[dict] = []
    for idx, node in enumerate(cluster.get("nodes", [])):
        info = infos[idx] if isinstance(infos[idx], dict) else {}
        node_services = services[idx] if isinstance(services[idx], list) else []
        started = {svc["name"] for svc in node_services if svc.get("state") == "started"}
        # Prefer the reported hostname; fall back to the host part of the
        # PLS node name so role inference always has something to work with.
        hostname = info.get("hostname") or pls.node_host(node.get("name", ""))
        role = _infer_role(hostname)
        nodes.append(
            {
                "name": node.get("name", ""),
                "address": pls.node_host(node.get("name", "")),
                "hostname": hostname,
                "current": node.get("name") == cluster.get("current_node"),
                "repositories": node.get("repositories", []),
                "role": role,
                # Single-role list kept alongside "role" for callers that
                # expect a list-shaped field.
                "roles": [role],
                "expected_nfs": sorted(ROLE_NF_MAP.get(role, set())),
                "services": node_services,
                "started_services": sorted(started),
            }
        )
    return {
        "enabled": True,
        "current_node": cluster.get("current_node"),
        "fully_established": bool(cluster.get("fully_established")),
        "nodes": nodes,
    }
def _aggregate_nf_state(nf_name: str, nodes: list[dict], prom_states: dict[str, dict]) -> dict:
service_name = nf_name.lower()
placements = []
seen_service = False
for node in nodes:
for service in node.get("services", []):
if service.get("name") != service_name:
continue
seen_service = True
if service.get("state") == "started":
placements.append(
{
"hostname": node["hostname"],
"address": node["address"],
"roles": node["roles"],
}
)
prom_state = prom_states.get(nf_name, {"state": "unknown", "instance": ""})
if placements:
state = prom_state["state"] if prom_state["state"] in {"up", "down"} else "up"
instance = ", ".join(p["hostname"] for p in placements)
elif seen_service:
state = "down"
instance = ""
else:
state = prom_state["state"]
instance = prom_state["instance"]
return {
"name": nf_name,
"state": state,
"instance": instance,
"nodes": placements,
}
def _node_nf_state(node: dict, nf_name: str) -> dict:
service_name = nf_name.lower()
service = next((svc for svc in node.get("services", []) if svc.get("name") == service_name), None)
if not service:
return {"name": nf_name, "state": "unknown"}
if service.get("state") == "started":
return {"name": nf_name, "state": "up"}
return {"name": nf_name, "state": "down"}
def _attach_node_nf_status(nodes: list[dict]) -> list[dict]:
    """Return shallow copies of *nodes*, each with an ``nfs`` status list attached.

    The per-node list covers only that node's role-expected NFs.
    """
    enriched: list[dict] = []
    for original in nodes:
        clone = {**original}
        clone["nfs"] = [
            _node_nf_state(clone, name.upper())
            for name in clone.get("expected_nfs", [])
        ]
        enriched.append(clone)
    return enriched
async def get_network_status() -> dict:
    """Combined cluster + NF view: PLS inventory merged with Prometheus state."""
    inventory, prom_states = await asyncio.gather(
        get_cluster_inventory(),
        prometheus.get_nf_status_map(),
    )
    enriched_nodes = _attach_node_nf_status(inventory.get("nodes", []))
    inventory["nodes"] = enriched_nodes
    nfs = [_aggregate_nf_state(name, enriched_nodes, prom_states) for name in ALL_NFS]
    up_count = sum(1 for nf in nfs if nf["state"] == "up")
    down_count = sum(1 for nf in nfs if nf["state"] == "down")
    return {
        "nfs": nfs,
        "summary": {"up": up_count, "down": down_count, "total": len(nfs)},
        "cluster": inventory,
    }
def find_nf_nodes(cluster: dict, nf_name: str) -> list[dict]:
    """Locate every cluster node whose per-node NF list mentions *nf_name*.

    At most one match is taken per node (the first entry with that name).
    """
    hits: list[dict] = []
    for node in cluster.get("nodes", []):
        entry = next((nf for nf in node.get("nfs", []) if nf.get("name") == nf_name), None)
        if entry is None:
            continue
        hits.append(
            {
                "hostname": node["hostname"],
                "address": node["address"],
                "role": node.get("role", "AP"),
                "current": node.get("current", False),
                "state": entry.get("state", "unknown"),
            }
        )
    return hits

View File

@@ -235,20 +235,23 @@ async def analyze_logs() -> dict:
Gather log-pattern issues + Prometheus NF status + Alertmanager alerts.
Returns a fully structured dict ready for JSON serialisation.
"""
from app.services import alertmanager, prometheus
from app.services import alertmanager, prometheus, cluster_inventory
# Kick off all I/O in parallel
containers_f = asyncio.create_task(_discover_containers())
alerts_f = asyncio.create_task(alertmanager.get_alerts())
nf_status_f = asyncio.create_task(prometheus.get_nf_status())
cluster_f = asyncio.create_task(cluster_inventory.get_cluster_inventory())
containers = await containers_f
alerts, nf_statuses = await asyncio.gather(alerts_f, nf_status_f,
alerts, nf_statuses, cluster = await asyncio.gather(alerts_f, nf_status_f, cluster_f,
return_exceptions=True)
if isinstance(alerts, Exception):
alerts = []
if isinstance(nf_statuses, Exception):
nf_statuses = []
if isinstance(cluster, Exception):
cluster = {"enabled": False, "nodes": []}
# Read all container logs concurrently
log_tasks = {nf: asyncio.create_task(_read_logs(cname))
@@ -280,25 +283,29 @@ async def analyze_logs() -> dict:
# 2. NF-down events from Prometheus
for nf_st in nf_statuses:
if isinstance(nf_st, dict) and nf_st.get("state") == "down":
node_text = ", ".join(node["hostname"] for node in nf_st.get("nodes", []))
issues.append({
"id": f"nf-down-{nf_st['name']}",
"category": "Connectivity",
"nf": nf_st["name"],
"node": node_text,
"severity": "critical",
"count": 1,
"description": f"{nf_st['name']} is unreachable",
"remediation": (f"Run `{CONTAINER_RUNTIME} ps` and check if {nf_st['name']} "
f"container is running; inspect its logs."),
"remediation": (f"Check {node_text or 'the hosting node'} first, then run "
f"`{CONTAINER_RUNTIME} ps` and inspect `{nf_st['name'].lower()}` logs."),
"source": "prometheus",
})
# 3. Active Alertmanager alerts
for alert in alerts:
if isinstance(alert, dict):
node_text = ", ".join(node["hostname"] for node in alert.get("nodes", []))
issues.append({
"id": f"alert-{alert.get('name', '')}-{len(issues)}",
"category": _alert_category(alert),
"nf": _alert_nf(alert),
"node": node_text,
"severity": alert.get("severity", "warning"),
"count": 1,
"description": alert.get("summary") or alert.get("name", "Unknown alert"),
@@ -331,6 +338,7 @@ async def analyze_logs() -> dict:
"categories": categories,
"timestamp": datetime.now().isoformat(),
"log_sources": list(containers.keys()),
"cluster": cluster,
}
# Persist to history ring-buffer

78
app/services/pls.py Normal file
View File

@@ -0,0 +1,78 @@
"""PLS API client for cluster and per-node discovery."""
from __future__ import annotations
from urllib.parse import urlsplit, urlunsplit
import httpx
from app.config import PLS_AUTH_BACKEND, PLS_BASE_URL, PLS_PASSWORD, PLS_USERNAME, PLS_VERIFY_TLS
_token: str | None = None
def _base_url_for_host(host: str | None = None) -> str:
    """Base API URL, optionally re-pointed at a specific cluster node.

    Without *host*, the configured base URL is used as-is; with it, only the
    netloc is swapped while scheme and path are preserved.
    """
    if not host:
        return PLS_BASE_URL.rstrip("/")
    scheme, _netloc, path, _query, _fragment = urlsplit(PLS_BASE_URL)
    return urlunsplit((scheme, host, path.rstrip("/"), "", ""))
async def _login() -> str | None:
    """Authenticate against PLS and cache the bearer token.

    Returns the cached token when one is already held. Returns None (never
    raises) when credentials are not configured or the login call fails —
    discovery features then degrade gracefully.

    NOTE(review): the cached token is never invalidated, so an expired token
    would keep being reused until restart — confirm PLS token lifetime.
    """
    global _token
    if _token:
        return _token
    if not PLS_USERNAME or not PLS_PASSWORD:
        return None
    try:
        async with httpx.AsyncClient(timeout=5, verify=PLS_VERIFY_TLS) as client:
            response = await client.post(
                f"{_base_url_for_host()}/auth/login",
                json={
                    "username": PLS_USERNAME,
                    "password": PLS_PASSWORD,
                    "backend": PLS_AUTH_BACKEND,
                },
            )
            response.raise_for_status()
            data = response.json()
            _token = data.get("access_token")
            return _token
    except Exception:
        # Best-effort: any transport/HTTP/JSON error means "not logged in".
        return None
async def _get(path: str, host: str | None = None) -> dict | list | None:
    """GET a PLS API path with bearer auth, optionally against a given node.

    Returns the decoded JSON payload, or None when authentication is
    unavailable or on any transport/HTTP error; callers treat None as
    "data unavailable".
    """
    token = await _login()
    if not token:
        return None
    headers = {"Authorization": f"Bearer {token}"}
    url = f"{_base_url_for_host(host)}/{path.lstrip('/')}"
    try:
        async with httpx.AsyncClient(timeout=5, verify=PLS_VERIFY_TLS) as client:
            response = await client.get(url, headers=headers)
            response.raise_for_status()
            return response.json()
    except Exception:
        # Deliberate swallow: discovery is best-effort.
        return None
def node_host(node_name: str) -> str:
    """Host part of a ``name@host`` node identifier; bare names pass through."""
    _prefix, sep, host = node_name.partition("@")
    return host if sep else node_name
async def get_cluster_status() -> dict | None:
    """Cluster membership/status document from PLS, or None on failure."""
    payload = await _get("data_layer/cluster/status")
    if isinstance(payload, dict):
        return payload
    return None
async def get_system_info(host: str | None = None) -> dict | None:
    """System info for the given node (or the local one), or None on failure."""
    payload = await _get("system/info", host=host)
    if isinstance(payload, dict):
        return payload
    return None
async def get_services(host: str | None = None) -> list[dict]:
    """Service list for the given node (or the local one); empty on failure."""
    payload = await _get("services", host=host)
    if isinstance(payload, list):
        return payload
    return []

View File

@@ -14,12 +14,12 @@ async def query(promql: str) -> list:
return r.json()["data"]["result"]
async def get_nf_status() -> list:
"""Return a list of {name, state, instance} for every known NF."""
async def get_nf_status_map() -> dict[str, dict]:
"""Return Prometheus-backed NF status keyed by display name."""
try:
results = await query("up")
except Exception:
return [{"name": n, "state": "unknown", "instance": ""} for n in ALL_NFS]
return {n: {"name": n, "state": "unknown", "instance": ""} for n in ALL_NFS}
seen: dict[str, dict] = {}
for r in results:
@@ -38,4 +38,9 @@ async def get_nf_status() -> list:
if n not in seen:
seen[n] = {"name": n, "state": "unknown", "instance": ""}
return list(seen.values())
return seen
async def get_nf_status() -> list:
    """Return a list of {name, state, instance} for every known NF.

    Thin flat-list wrapper over get_nf_status_map().
    """
    status_map = await get_nf_status_map()
    return list(status_map.values())

View File

@@ -155,6 +155,12 @@ body {
background: rgba(255,255,255,0.07); color: var(--text);
width: fit-content; white-space: nowrap;
}
.issue-node {
font-size: 10px; font-weight: 600; letter-spacing: 0.04em;
padding: 2px 7px; border-radius: 5px; margin-top: 5px;
background: rgba(59,130,246,0.12); color: var(--blue);
width: fit-content; white-space: nowrap;
}
.issue-body {}
.issue-desc { font-size: 13px; font-weight: 500; line-height: 1.4; }
.issue-rem { font-size: 11px; color: var(--muted); margin-top: 3px; line-height: 1.4; }
@@ -469,6 +475,7 @@ function renderDetail(cat) {
<div class="issue-nf">${esc(iss.nf)}</div>
<div class="issue-body">
<div class="issue-desc">${esc(iss.description)}</div>
${iss.node ? `<div class="issue-node">${esc(iss.node)}</div>` : ''}
<div class="issue-rem">⤷ ${esc(iss.remediation||'')}</div>
<span class="issue-source">${esc(iss.source||'log')}</span>
</div>

View File

@@ -60,8 +60,10 @@ header h1 span { color: var(--muted); font-weight: 400; }
/* ── Left panel ─────────────────────────────────────────────────── */
.left {
background: var(--surface); border-right: 1px solid var(--border);
display: flex; flex-direction: column; overflow: hidden;
display: flex; flex-direction: column; overflow-y: auto; overflow-x: hidden;
}
.left::-webkit-scrollbar { width: 5px; }
.left::-webkit-scrollbar-thumb { background: var(--border); border-radius: 4px; }
.section { padding: 14px 16px; border-bottom: 1px solid var(--border); }
.section-title {
font-size: 10px; font-weight: 700; text-transform: uppercase;
@@ -88,6 +90,79 @@ header h1 span { color: var(--muted); font-weight: 400; }
.nf-card.up .nf-state { color: var(--green); }
.nf-card.down .nf-state { color: var(--red); }
/* Cluster nodes */
.node-list { display: flex; flex-direction: column; gap: 8px; }
.node-card {
background: var(--card); border: 1px solid var(--border); border-radius: 10px;
overflow: hidden;
}
.node-summary {
display: flex; align-items: center; gap: 8px; padding: 10px 12px; cursor: pointer;
}
.node-summary:hover { background: rgba(255,255,255,.02); }
.node-top { display: flex; align-items: center; gap: 8px; width: 100%; }
.node-name { font-size: 13px; font-weight: 700; }
.node-addr { font-size: 11px; color: var(--muted); margin-top: 3px; }
.node-caret {
margin-left: 8px; font-size: 11px; color: var(--muted); transition: transform .15s;
}
.node-card.open .node-caret { transform: rotate(180deg); }
.node-role {
margin-left: auto; font-size: 10px; font-weight: 700; letter-spacing: .08em;
border-radius: 999px; padding: 3px 8px; border: 1px solid var(--border);
color: var(--blue); background: rgba(59,130,246,.12);
}
.node-role.current {
color: var(--green); border-color: rgba(16,185,129,.5); background: rgba(16,185,129,.12);
}
.node-meta {
display: flex; flex-wrap: wrap; gap: 6px; margin-top: 8px;
}
.node-chip {
font-size: 10px; color: var(--muted); padding: 2px 7px;
border-radius: 999px; background: rgba(255,255,255,.04); border: 1px solid var(--border);
}
.node-services {
margin-top: 8px; font-size: 11px; color: var(--text); line-height: 1.4;
}
.node-services b,
.node-profile b {
color: var(--muted); font-weight: 600;
}
.node-profile {
margin-top: 6px; font-size: 11px; color: var(--text); line-height: 1.4;
}
.node-empty {
color: var(--muted); font-size: 12px;
}
.node-details {
display: none; padding: 0 12px 12px; border-top: 1px solid rgba(255,255,255,.04);
}
.node-card.open .node-details { display: block; }
.node-nf-grid {
display: grid; grid-template-columns: repeat(3, 1fr); gap: 6px; margin-top: 10px;
}
.node-nf {
background: rgba(255,255,255,.03);
border: 1px solid var(--border);
border-radius: 8px;
padding: 7px 5px;
border-left: 3px solid var(--border);
text-align: center;
}
.node-nf.up { border-left-color: var(--green); }
.node-nf.down { border-left-color: var(--red); }
.node-nf.unknown { border-left-color: var(--yellow); }
.node-nf-name {
font-size: 10px; font-weight: 700; color: var(--text); letter-spacing: .04em;
}
.node-nf-state {
margin-top: 3px; font-size: 9px; text-transform: uppercase; letter-spacing: .06em; color: var(--muted);
}
.node-nf.up .node-nf-state { color: var(--green); }
.node-nf.down .node-nf-state { color: var(--red); }
.node-nf.unknown .node-nf-state { color: var(--yellow); }
/* Alerts panel */
.alerts-scroll { flex: 1; overflow-y: auto; padding: 14px 16px; }
.alerts-scroll::-webkit-scrollbar { width: 4px; }
@@ -101,6 +176,7 @@ header h1 span { color: var(--muted); font-weight: 400; }
.alert-row.critical { border-left-color: var(--red); }
.alert-row-name { font-size: 12px; font-weight: 600; }
.alert-row-desc { font-size: 11px; color: var(--muted); margin-top: 2px; }
.alert-row-node { font-size: 10px; color: var(--blue); margin-top: 5px; }
/* ── Chat panel ─────────────────────────────────────────────────── */
.chat { display: flex; flex-direction: column; overflow: hidden; }
@@ -185,13 +261,19 @@ header h1 span { color: var(--muted); font-weight: 400; }
<div class="left">
<div class="section">
<div class="section-title">
Network Functions
Cluster Overview
<button class="refresh-btn" onclick="refresh()" title="Refresh"></button>
</div>
<div class="nf-grid" id="nfGrid">
<div class="nf-card"><div class="nf-name">···</div></div>
</div>
</div>
<div class="section">
<div class="section-title">Discovered Nodes</div>
<div class="node-list" id="nodeList">
<div class="node-empty">Loading cluster inventory…</div>
</div>
</div>
<div class="alerts-scroll">
<div class="section-title" style="margin-bottom:10px">Active Alerts</div>
<div id="alertsContent"><div style="color:var(--muted);font-size:12px">Loading…</div></div>
@@ -221,6 +303,15 @@ header h1 span { color: var(--muted); font-weight: 400; }
// ── Utilities ──────────────────────────────────────────────────────────────
const $ = id => document.getElementById(id);
const ts = () => new Date().toLocaleTimeString([],{hour:'2-digit',minute:'2-digit'});
const ROLE_LABELS = {
'5GALL': '5G All',
'4GALL': '4G All',
'4GCP': '4G CP',
'4GDCP': '4G DCP',
'COMBOALL': 'Combo All',
'COMBOCP': 'Combo CP',
'COMBODCP': 'Combo DCP',
};
function md(text) {
// minimal markdown: **bold**, `code`, newlines
@@ -261,15 +352,70 @@ async function loadNFs() {
<div class="nf-state">${nf.state==='up'?'● up':nf.state==='down'?'● dn':'○ n/a'}</div>`;
grid.appendChild(c);
});
renderNodes(d.cluster);
$('dot').className = 'dot';
$('connLabel').textContent = 'Live';
} catch {
$('dot').className = 'dot err';
$('connLabel').textContent = 'Unreachable';
$('nfGrid').innerHTML = '<div style="color:var(--muted);font-size:12px;grid-column:1/-1">Cannot reach API</div>';
$('nodeList').innerHTML = '<div class="node-empty">Cannot reach cluster discovery API</div>';
}
}
function toggleNodeCard(button) {
  // Expand/collapse the node card containing the clicked summary row.
  const card = button.closest('.node-card');
  if (card) {
    card.classList.toggle('open');
  }
}
function renderNodes(cluster) {
  // Paint the "Discovered Nodes" panel: one collapsible card per cluster node.
  const list = $('nodeList');
  const nodes = cluster?.nodes || [];
  if (!nodes.length) {
    list.innerHTML = '<div class="node-empty">No cluster nodes discovered</div>';
    return;
  }
  list.innerHTML = nodes.map(node => {
    const role = ROLE_LABELS[node.role] || node.role || 'AP';
    // Cap repository chips at three to keep the card compact.
    const repoChips = (node.repositories || []).slice(0, 3).map(repo =>
      `<span class="node-chip">${repo.type}:${repo.role}</span>`
    ).join('');
    // Infrastructure daemons are filtered out so only managed NFs are listed.
    const running = (node.started_services || []).filter(name => !['alertmanager','prometheus','ncm','pls','fluent-bit','grafana','openvpn','ssh','node-exporter','podman-exporter','licensed','webconsole'].includes(name));
    const serviceText = running.length ? running.join(', ') : 'No managed NFs started';
    const expected = (node.expected_nfs || []).join(', ') || 'No NF profile mapped';
    const nfTiles = (node.nfs || []).map(nf => `
      <div class="node-nf ${nf.state}">
        <div class="node-nf-name">${nf.name}</div>
        <div class="node-nf-state">${nf.state === 'up' ? '● up' : nf.state === 'down' ? '● dn' : '○ n/a'}</div>
      </div>
    `).join('');
    // Fix: guard node.nfs like every other access — the up-count previously
    // used node.nfs.filter directly and threw when the payload omitted `nfs`.
    const upCount = (node.nfs || []).filter(nf => nf.state === 'up').length;
    const downCount = (node.nfs || []).filter(nf => nf.state === 'down').length;
    const openClass = node.current ? 'open' : '';
    return `
      <div class="node-card ${openClass}">
        <div class="node-summary" onclick="toggleNodeCard(this)">
          <div class="node-top">
            <div>
              <div class="node-name">${node.hostname}</div>
              <div class="node-addr">${node.address} · ${upCount} up${downCount ? `, ${downCount} down` : ''}</div>
            </div>
            <div class="node-role ${node.current ? 'current' : ''}">${role}${node.current ? ' · local' : ''}</div>
            <div class="node-caret">▾</div>
          </div>
        </div>
        <div class="node-details">
          <div class="node-meta">
            ${repoChips || '<span class="node-chip">No repo data</span>'}
          </div>
          <div class="node-services"><b>Running:</b> ${serviceText}</div>
          <div class="node-profile"><b>Profile:</b> ${expected}</div>
          <div class="node-nf-grid">${nfTiles || '<div class="node-empty">No node-scoped NF data</div>'}</div>
        </div>
      </div>
    `;
  }).join('');
}
async function loadAlerts() {
try {
const d = await (await fetch('./api/alerts')).json();
@@ -281,6 +427,7 @@ async function loadAlerts() {
`<div class="alert-row ${a.severity||'warning'}">
<div class="alert-row-name">${a.name}</div>
<div class="alert-row-desc">${a.summary||a.instance||''}</div>
<div class="alert-row-node">${(a.nodes||[]).length ? 'Node: ' + a.nodes.map(n => n.hostname).join(', ') : 'Node: unresolved'}</div>
</div>`
).join('');
}

View File

@@ -59,6 +59,22 @@ header h1 span { color: var(--muted); font-weight: 400; }
}
.main::-webkit-scrollbar { width: 5px; }
.main::-webkit-scrollbar-thumb { background: var(--border); border-radius: 4px; }
.content-grid {
display: grid;
grid-template-columns: minmax(0, 1fr) 420px;
gap: 24px;
align-items: start;
}
.tasks-col {
display: flex;
flex-direction: column;
gap: 24px;
min-width: 0;
}
.log-col {
position: sticky;
top: 0;
}
/* ── Section headers ─────────────────────────────────────────────── */
.section-title {
@@ -171,6 +187,15 @@ header h1 span { color: var(--muted); font-weight: 400; }
}
.modal-confirm.danger { background: var(--red); }
.modal-confirm.warning { background: var(--yellow); color: #000; }
@media (max-width: 1100px) {
.content-grid {
grid-template-columns: 1fr;
}
.log-col {
position: static;
}
}
</style>
</head>
<body>
@@ -183,42 +208,46 @@ header h1 span { color: var(--muted); font-weight: 400; }
</header>
<div class="main">
<!-- Diagnostics -->
<div>
<div class="section-title">Diagnostics &amp; Health</div>
<div class="action-grid" id="diagGrid"></div>
</div>
<!-- Operations -->
<div>
<div class="section-title">Network Operations</div>
<div class="action-grid" id="opsGrid"></div>
</div>
<!-- Maintenance -->
<div>
<div class="section-title">Maintenance</div>
<div class="action-grid" id="maintGrid"></div>
</div>
<!-- Run log -->
<div class="log-panel">
<div class="log-header">
<div class="log-title">
▸ Run Log
<span class="log-badge" id="logCount">0 entries</span>
<div class="content-grid">
<div class="tasks-col">
<!-- Diagnostics -->
<div>
<div class="section-title">Diagnostics &amp; Health</div>
<div class="action-grid" id="diagGrid"></div>
</div>
<div class="log-header-actions">
<button class="expand-btn" id="expandBtn" onclick="toggleExpand()">⤢ Expand</button>
<button class="clear-btn" onclick="clearLog()">Clear</button>
<!-- Operations -->
<div>
<div class="section-title">Network Operations</div>
<div class="action-grid" id="opsGrid"></div>
</div>
<!-- Maintenance -->
<div>
<div class="section-title">Maintenance</div>
<div class="action-grid" id="maintGrid"></div>
</div>
</div>
<div class="log-body" id="logBody">
<div class="log-empty" id="logEmpty">No actions run yet.</div>
<div class="log-col">
<!-- Run log -->
<div class="log-panel" id="logPanel">
<div class="log-header">
<div class="log-title">
▸ Run Log
<span class="log-badge" id="logCount">0 entries</span>
</div>
<div class="log-header-actions">
<button class="expand-btn" id="expandBtn" onclick="toggleExpand()">⤢ Expand</button>
<button class="clear-btn" onclick="clearLog()">Clear</button>
</div>
</div>
<div class="log-body" id="logBody">
<div class="log-empty" id="logEmpty">No actions run yet.</div>
</div>
</div>
</div>
</div>
</div>
<!-- Confirm modal -->
@@ -255,6 +284,17 @@ const ACTIONS = {
],
};
function nfNodeLabel(nf) {
const nodes = nf?.nodes || [];
return nodes.length ? nodes.map(n => n.hostname).join(', ') : 'unresolved node';
}
async function fetchNetworkStatus() {
    // Fetch the cluster-wide network status document from the backend.
    // Resolves to the parsed JSON body; throws on any non-2xx response.
    const response = await fetch('/api/network/status');
    if (response.ok) {
        return await response.json();
    }
    throw new Error('HTTP ' + response.status);
}
// ── Render cards ──────────────────────────────────────────────────────────
function renderGrid(gridId, items) {
const g = document.getElementById(gridId);
@@ -289,6 +329,7 @@ function handleAction(id) {
const all = [...ACTIONS.diag, ...ACTIONS.ops, ...ACTIONS.maint];
const a = all.find(x => x.id === id);
if (!a) return;
revealLogPanel(true);
if (a.safe) { a.run(); return; }
pendingAction = a;
document.getElementById('modalTitle').textContent = a.name;
@@ -306,6 +347,7 @@ function closeModal() {
function runConfirmed() {
    // User confirmed the modal: dismiss it, make sure the run log is visible
    // and expanded, then execute the action that was awaiting confirmation.
    closeModal();
    revealLogPanel(true);
    if (!pendingAction) return;
    pendingAction.run();
    pendingAction = null;
}
@@ -326,6 +368,17 @@ function addLog(msg, type='info') {
renderLog();
}
function revealLogPanel(forceExpand = false) {
    // Bring the run-log panel into view. When forceExpand is set and the log
    // body is still collapsed, expand it and flip the toggle button label.
    const logPanel = document.getElementById('logPanel');
    const logBody = document.getElementById('logBody');
    const expandBtn = document.getElementById('expandBtn');
    const needsExpand = forceExpand && !logBody.classList.contains('expanded');
    if (needsExpand) {
        logBody.classList.add('expanded');
        expandBtn.textContent = '⤡ Collapse';
    }
    logPanel.scrollIntoView({ behavior: 'smooth', block: 'start' });
}
function renderLog() {
const el = document.getElementById('logBody');
document.getElementById('logEmpty').style.display = logLines.length ? 'none' : '';
@@ -368,17 +421,16 @@ document.addEventListener('DOMContentLoaded', () => {
// ── Action implementations ─────────────────────────────────────────────────
async function pingNFs() {
addLog('▸ Pinging all NFs via Prometheus endpoint…', 'run');
addLog('▸ Checking all discovered NFs across cluster nodes…', 'run');
try {
const r = await fetch('/api/network/nf-status');
const d = await r.json();
const nfs = d.nf_status || [];
const d = await fetchNetworkStatus();
const nfs = d.nfs || [];
const up = nfs.filter(n => n.state === 'up').length;
const down = nfs.filter(n => n.state === 'down').length;
nfs.forEach(n => addLog(` ${n.name}: ${n.state.toUpperCase()}`, n.state === 'up' ? 'ok' : 'err'));
addLog(`Ping complete — ${up} up, ${down} down`, down > 0 ? 'warn' : 'ok');
nfs.forEach(n => addLog(` ${n.name}: ${n.state.toUpperCase()} on ${nfNodeLabel(n)}`, n.state === 'up' ? 'ok' : n.state === 'down' ? 'err' : 'warn'));
addLog(`Cluster check complete — ${up} up, ${down} down`, down > 0 ? 'warn' : 'ok');
} catch(e) {
addLog('✗ Failed to reach Prometheus: ' + e.message, 'err');
addLog('✗ Failed to reach network status API: ' + e.message, 'err');
}
}
@@ -392,7 +444,7 @@ async function refreshAlerts() {
addLog('✓ No active alerts — network is healthy', 'ok');
} else {
addLog(`${alerts.length} active alert(s):`, 'warn');
alerts.forEach(a => addLog(` [${(a.labels?.severity||'info').toUpperCase()}] ${a.labels?.alertname||'Unknown'}`, 'warn'));
alerts.forEach(a => addLog(` [${(a.severity||'info').toUpperCase()}] ${a.name} on ${(a.nodes||[]).map(n => n.hostname).join(', ') || 'unresolved node'}`, 'warn'));
}
} catch(e) {
addLog('✗ Failed to reach Alertmanager: ' + e.message, 'err');
@@ -400,15 +452,18 @@ async function refreshAlerts() {
}
async function nfReport() {
addLog('▸ Generating full NF status report…', 'run');
addLog('▸ Generating cluster-wide NF status report…', 'run');
try {
const r = await fetch('/api/network/nf-status');
const d = await r.json();
const nfs = d.nf_status || [];
const d = await fetchNetworkStatus();
const nfs = d.nfs || [];
const up = nfs.filter(n => n.state === 'up').length;
addLog(`✓ Report: ${up}/${nfs.length} NFs operational`, up === nfs.length ? 'ok' : 'warn');
(d.cluster?.nodes || []).forEach(node => {
const running = (node.nfs || []).filter(nf => nf.state === 'up').map(nf => nf.name);
addLog(` ${node.hostname} (${node.role}): ${running.join(', ') || 'no active NFs'}`, 'info');
});
addLog(` Timestamp: ${new Date().toISOString()}`, 'info');
addLog(` Source: Prometheus metrics`, 'info');
addLog(` Source: PLS cluster discovery + Prometheus`, 'info');
} catch(e) {
addLog('✗ Report generation failed: ' + e.message, 'err');
}
@@ -453,16 +508,15 @@ async function emulatedSession() {
}
async function checkDevices() {
addLog('▸ Fetching connected device list…', 'run');
addLog('▸ Checking cluster nodes for subscriber-serving NFs…', 'run');
try {
const r = await fetch('/api/network/nf-status');
const d = await r.json();
const nfs = d.nf_status || [];
const d = await fetchNetworkStatus();
const nfs = d.nfs || [];
const amf = nfs.find(n => n.name === 'AMF');
addLog(` AMF state: ${amf ? amf.state.toUpperCase() : 'UNKNOWN'}`, amf?.state === 'up' ? 'ok' : 'warn');
addLog(` AMF state: ${amf ? amf.state.toUpperCase() : 'UNKNOWN'} on ${nfNodeLabel(amf)}`, amf?.state === 'up' ? 'ok' : 'warn');
const upf = nfs.find(n => n.name === 'UPF');
addLog(` UPF state: ${upf ? upf.state.toUpperCase() : 'UNKNOWN'}`, upf?.state === 'up' ? 'ok' : 'warn');
addLog('✓ Device registry checked — see Prometheus for per-device detail', 'ok');
addLog(` UPF state: ${upf ? upf.state.toUpperCase() : 'UNKNOWN'} on ${nfNodeLabel(upf)}`, upf?.state === 'up' ? 'ok' : 'warn');
addLog('✓ Cluster subscriber path checked — see Marvis AI for node-scoped health', 'ok');
} catch(e) {
addLog('✗ Could not reach network status endpoint: ' + e.message, 'err');
}
@@ -486,10 +540,12 @@ function clearSessions() {
}
function backupConfig() {
addLog('▸ Exporting configuration for all NFs…', 'run');
const nfs = ['AMF','SMF','UPF','NRF','AUSF','UDM','UDR','PCF','CHF','SMSF','AAA','MME'];
nfs.forEach((nf, i) => setTimeout(() => addLog(` ${nf}: config exported`, 'ok'), 300 + i*120));
setTimeout(() => addLog(`✓ Backup archive: p5g-config-${new Date().toISOString().slice(0,10)}.tar.gz`, 'ok'), 300 + nfs.length*120 + 200);
addLog('▸ Exporting configuration plan for all discovered nodes…', 'run');
fetchNetworkStatus().then(d => {
const nodes = d.cluster?.nodes || [];
nodes.forEach((node, i) => setTimeout(() => addLog(` ${node.hostname}: profile ${node.role}, services ${node.started_services.join(', ') || 'none'}`, 'ok'), 300 + i*160));
setTimeout(() => addLog(`✓ Backup archive plan ready: p5g-config-${new Date().toISOString().slice(0,10)}.tar.gz`, 'ok'), 300 + nodes.length*160 + 200);
}).catch(e => addLog('✗ Could not inspect cluster before backup: ' + e.message, 'err'));
}
function reloadConfig() {

View File

@@ -4,6 +4,11 @@
MARVIS_PROMETHEUS_URL=http://127.0.0.1:9090
MARVIS_PROMETHEUS_PREFIX=/prometheus
MARVIS_ALERTMANAGER_URL=http://127.0.0.1:9093
MARVIS_PLS_BASE_URL=https://127.0.0.1/core/pls/api/1
MARVIS_PLS_USERNAME=
MARVIS_PLS_PASSWORD=
MARVIS_PLS_AUTH_BACKEND=local
MARVIS_PLS_VERIFY_TLS=false
# AI backend configuration.
MARVIS_AI_MODE=rule

View File

@@ -11,6 +11,11 @@ TimeoutStartSec=0
Environment=MARVIS_PROMETHEUS_URL=http://127.0.0.1:9090
Environment=MARVIS_PROMETHEUS_PREFIX=/prometheus
Environment=MARVIS_ALERTMANAGER_URL=http://127.0.0.1:9093
Environment=MARVIS_PLS_BASE_URL=https://127.0.0.1/core/pls/api/1
Environment=MARVIS_PLS_USERNAME=
Environment=MARVIS_PLS_PASSWORD=
Environment=MARVIS_PLS_AUTH_BACKEND=local
Environment=MARVIS_PLS_VERIFY_TLS=false
Environment=MARVIS_AI_MODE=rule
Environment=MARVIS_OPENAI_API_KEY=
Environment=MARVIS_OPENAI_BASE_URL=https://api.openai.com
@@ -26,6 +31,11 @@ ExecStart=/usr/bin/docker run \
--env MARVIS_PROMETHEUS_URL \
--env MARVIS_PROMETHEUS_PREFIX \
--env MARVIS_ALERTMANAGER_URL \
--env MARVIS_PLS_BASE_URL \
--env MARVIS_PLS_USERNAME \
--env MARVIS_PLS_PASSWORD \
--env MARVIS_PLS_AUTH_BACKEND \
--env MARVIS_PLS_VERIFY_TLS \
--env MARVIS_AI_MODE \
--env MARVIS_OPENAI_API_KEY \
--env MARVIS_OPENAI_BASE_URL \