Added multi-node functionality
This commit is contained in:
@@ -235,20 +235,23 @@ async def analyze_logs() -> dict:
|
||||
Gather log-pattern issues + Prometheus NF status + Alertmanager alerts.
|
||||
Returns a fully structured dict ready for JSON serialisation.
|
||||
"""
|
||||
from app.services import alertmanager, prometheus
|
||||
from app.services import alertmanager, prometheus, cluster_inventory
|
||||
|
||||
# Kick off all I/O in parallel
|
||||
containers_f = asyncio.create_task(_discover_containers())
|
||||
alerts_f = asyncio.create_task(alertmanager.get_alerts())
|
||||
nf_status_f = asyncio.create_task(prometheus.get_nf_status())
|
||||
cluster_f = asyncio.create_task(cluster_inventory.get_cluster_inventory())
|
||||
|
||||
containers = await containers_f
|
||||
alerts, nf_statuses = await asyncio.gather(alerts_f, nf_status_f,
|
||||
alerts, nf_statuses, cluster = await asyncio.gather(alerts_f, nf_status_f, cluster_f,
|
||||
return_exceptions=True)
|
||||
if isinstance(alerts, Exception):
|
||||
alerts = []
|
||||
if isinstance(nf_statuses, Exception):
|
||||
nf_statuses = []
|
||||
if isinstance(cluster, Exception):
|
||||
cluster = {"enabled": False, "nodes": []}
|
||||
|
||||
# Read all container logs concurrently
|
||||
log_tasks = {nf: asyncio.create_task(_read_logs(cname))
|
||||
@@ -280,25 +283,29 @@ async def analyze_logs() -> dict:
|
||||
# 2. NF-down events from Prometheus
|
||||
for nf_st in nf_statuses:
|
||||
if isinstance(nf_st, dict) and nf_st.get("state") == "down":
|
||||
node_text = ", ".join(node["hostname"] for node in nf_st.get("nodes", []))
|
||||
issues.append({
|
||||
"id": f"nf-down-{nf_st['name']}",
|
||||
"category": "Connectivity",
|
||||
"nf": nf_st["name"],
|
||||
"node": node_text,
|
||||
"severity": "critical",
|
||||
"count": 1,
|
||||
"description": f"{nf_st['name']} is unreachable",
|
||||
"remediation": (f"Run `{CONTAINER_RUNTIME} ps` and check if {nf_st['name']} "
|
||||
f"container is running; inspect its logs."),
|
||||
"remediation": (f"Check {node_text or 'the hosting node'} first, then run "
|
||||
f"`{CONTAINER_RUNTIME} ps` and inspect `{nf_st['name'].lower()}` logs."),
|
||||
"source": "prometheus",
|
||||
})
|
||||
|
||||
# 3. Active Alertmanager alerts
|
||||
for alert in alerts:
|
||||
if isinstance(alert, dict):
|
||||
node_text = ", ".join(node["hostname"] for node in alert.get("nodes", []))
|
||||
issues.append({
|
||||
"id": f"alert-{alert.get('name', '')}-{len(issues)}",
|
||||
"category": _alert_category(alert),
|
||||
"nf": _alert_nf(alert),
|
||||
"node": node_text,
|
||||
"severity": alert.get("severity", "warning"),
|
||||
"count": 1,
|
||||
"description": alert.get("summary") or alert.get("name", "Unknown alert"),
|
||||
@@ -331,6 +338,7 @@ async def analyze_logs() -> dict:
|
||||
"categories": categories,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"log_sources": list(containers.keys()),
|
||||
"cluster": cluster,
|
||||
}
|
||||
|
||||
# Persist to history ring-buffer
|
||||
|
||||
Reference in New Issue
Block a user