Files
p5g-marvis/app/services/log_ingest.py
2026-04-24 14:15:58 -04:00

500 lines
14 KiB
Python

"""Fluent Bit log ingestion, buffering, and alert-context persistence."""
from __future__ import annotations
import asyncio
import json
import sqlite3
from collections import deque
from datetime import UTC, datetime
from hashlib import sha1
from pathlib import Path
from typing import Any
from app.config import (
ALL_NFS,
LOG_ALERT_CONTEXT_AFTER,
LOG_ALERT_CONTEXT_BEFORE,
LOG_ALLOWED_NFS,
LOG_ALERT_CONTEXT_DB_MAX_ROWS,
LOG_ALERT_CONTEXT_DB_PATH,
LOG_BUFFER_LINES,
LOG_AUTO_CONFIGURE,
LOG_FLUENTBIT_MATCH,
LOG_INGEST_ENABLED,
LOG_RECEIVER_BIND_HOST,
LOG_RECEIVER_FORMAT,
LOG_RECEIVER_HOST,
LOG_RECEIVER_PORT,
LOG_TRACE_BUFFER_LINES,
)
from app.services import pls
# ---- Module-level runtime state -----------------------------------------
# The asyncio TCP server accepting Fluent Bit connections (None until startup()).
_server: asyncio.base_events.Server | None = None
# Rolling buffer of the most recent normalized events, used for general queries.
_events: deque[dict[str, Any]] = deque(maxlen=max(LOG_BUFFER_LINES, 1))
# Deeper buffer searched for IMSI/trace queries; maxlen is at least _events'.
_trace_events: deque[dict[str, Any]] = deque(maxlen=max(LOG_TRACE_BUFFER_LINES, LOG_BUFFER_LINES, 1))
# Counters and status surfaced via receiver_status().
_ingested_total: int = 0
_parse_errors: int = 0
_last_event_at: str | None = None
# One-shot guard so _ensure_db() creates the schema only once per process.
_db_initialized: bool = False
# Upper-cased allow-list; events whose inferred NF is not here are dropped.
_allowed_nfs: set[str] = {nf.upper() for nf in LOG_ALLOWED_NFS}
def _db_path() -> Path:
    """Return the filesystem location of the alert-context SQLite database."""
    db_file = Path(LOG_ALERT_CONTEXT_DB_PATH)
    return db_file
def _ensure_db() -> None:
    """Create the alert-context table on first use.

    Idempotent: guarded by the module-level ``_db_initialized`` flag, which
    is only set after the schema statement commits successfully.
    """
    global _db_initialized
    if _db_initialized:
        return
    db_file = _db_path()
    # Make sure the directory holding the database exists before connecting.
    db_file.parent.mkdir(parents=True, exist_ok=True)
    connection = sqlite3.connect(db_file)
    try:
        connection.execute(
            """
            CREATE TABLE IF NOT EXISTS alert_context (
                id TEXT PRIMARY KEY,
                fingerprint TEXT UNIQUE,
                created_at TEXT NOT NULL,
                event_ts TEXT NOT NULL,
                category TEXT NOT NULL,
                nf TEXT,
                node TEXT,
                severity TEXT,
                description TEXT,
                remediation TEXT,
                source TEXT,
                match_message TEXT,
                before_context TEXT,
                after_context TEXT
            )
            """
        )
        connection.commit()
    finally:
        connection.close()
    _db_initialized = True
def _trim_db(conn: sqlite3.Connection) -> None:
    """Delete the oldest rows so the table never exceeds the configured cap.

    The newest rows (by event time, then insertion time) are kept; the
    caller is responsible for committing.
    """
    keep_rows = max(LOG_ALERT_CONTEXT_DB_MAX_ROWS, 1)
    conn.execute(
        """
        DELETE FROM alert_context
        WHERE id NOT IN (
            SELECT id
            FROM alert_context
            ORDER BY event_ts DESC, created_at DESC
            LIMIT ?
        )
        """,
        (keep_rows,),
    )
def _parse_timestamp(value: Any) -> tuple[float, str]:
if value is None:
now = datetime.now(UTC)
return now.timestamp(), now.isoformat()
if isinstance(value, (int, float)):
raw = float(value)
if raw > 1_000_000_000_000:
raw = raw / 1_000_000.0
elif raw > 10_000_000_000:
raw = raw / 1000.0
dt = datetime.fromtimestamp(raw, UTC)
return raw, dt.isoformat()
text = str(value).strip()
if text.isdigit():
return _parse_timestamp(int(text))
normalized = text.replace("Z", "+00:00")
for candidate in (normalized, normalized.replace(" ", "T")):
try:
dt = datetime.fromisoformat(candidate)
if dt.tzinfo is None:
dt = dt.replace(tzinfo=UTC)
else:
dt = dt.astimezone(UTC)
return dt.timestamp(), dt.isoformat()
except ValueError:
continue
now = datetime.now(UTC)
return now.timestamp(), now.isoformat()
def _candidate_fields(payload: dict[str, Any]) -> list[str]:
candidates = []
for key in (
"message",
"MESSAGE",
"log",
"msg",
"systemd_unit",
"_SYSTEMD_UNIT",
"syslog_identifier",
"SYSLOG_IDENTIFIER",
"_COMM",
"comm",
"_EXE",
"container_name",
"tag",
):
value = payload.get(key)
if value not in (None, ""):
candidates.append(str(value))
return candidates
def _infer_nf(payload: dict[str, Any], message: str) -> str:
    """Best-effort guess of which network function a log line belongs to.

    Searches the identifying payload fields plus the message (case-folded)
    for a known NF name; the first match in declaration order wins.
    Returns "SYSTEM" when nothing matches.
    """
    labels = (
        "UPF",
        "AMF",
        "SMF",
        "UDM",
        "UDR",
        "NRF",
        "AUSF",
        "PCF",
        "MME",
        "SGWC",
        "DRA",
        "DSM",
        "AAA",
        "BMSC",
        "CHF",
        "SMSF",
        "EIR",
        "LICENSED",
        "PROMETHEUS",
        "ALERTMANAGER",
        "FLUENT-BIT",
    )
    haystack = " ".join([*_candidate_fields(payload), message]).lower()
    for label in labels:
        if label.lower() in haystack:
            return label
    return "SYSTEM"
def _normalize_event(payload: dict[str, Any], remote_host: str) -> dict[str, Any]:
    """Flatten a raw Fluent Bit record into the buffer's canonical event shape.

    Timestamps, node, source, and message are pulled from the first truthy
    candidate field; the event id is a SHA-1 fingerprint over the
    identifying fields so identical lines collapse to the same id.
    """

    def first_truthy(keys: tuple[str, ...], default: Any = None) -> Any:
        # Mirrors an `a or b or ... or default` chain over payload lookups.
        for key in keys:
            candidate = payload.get(key)
            if candidate:
                return candidate
        return default

    # Keep the raw journald field as the final fallback (even if falsy),
    # matching the original or-chain's last-operand behavior.
    ts_value = first_truthy(
        ("timestamp", "@timestamp", "time", "date"),
        payload.get("_SOURCE_REALTIME_TIMESTAMP"),
    )
    epoch, ts_iso = _parse_timestamp(ts_value)
    node = first_truthy(
        ("hostname", "host", "_HOSTNAME", "syslog_hostname"), remote_host
    )
    source = first_truthy(
        (
            "systemd_unit",
            "_SYSTEMD_UNIT",
            "syslog_identifier",
            "SYSLOG_IDENTIFIER",
            "_COMM",
            "tag",
        ),
        "unknown",
    )
    message = str(first_truthy(("message", "MESSAGE", "log", "msg"), "")).strip()
    tag = str(payload.get("tag", ""))
    nf = _infer_nf(payload, message)
    fingerprint = sha1(f"{ts_iso}|{node}|{nf}|{source}|{message}".encode("utf-8")).hexdigest()
    return {
        "id": fingerprint,
        "timestamp": ts_iso,
        "epoch": epoch,
        "node": str(node),
        "nf": nf,
        "source": str(source),
        "tag": tag,
        "message": message,
        "raw": payload,
    }
async def _ingest_payload(payload: dict[str, Any], remote_host: str) -> None:
    """Normalize one record and, if its NF is on the allow-list, buffer it."""
    global _ingested_total, _last_event_at
    event = _normalize_event(payload, remote_host)
    nf_label = event.get("nf", "").upper()
    if nf_label not in _allowed_nfs:
        # Filtered out: not counted, not buffered.
        return
    for buffer in (_events, _trace_events):
        buffer.append(event)
    _ingested_total += 1
    _last_event_at = event["timestamp"]
async def _handle_client(reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
    """Consume newline-delimited JSON from one Fluent Bit TCP connection.

    Each non-empty line is parsed as JSON; dict payloads (or dicts inside a
    top-level array) are ingested. A malformed line increments
    ``_parse_errors`` instead of dropping the connection. The writer is
    always closed when the peer disconnects or the loop exits.
    """
    global _parse_errors
    peer = writer.get_extra_info("peername")
    # peername is normally a (host, port) tuple; fall back for None/other shapes.
    remote_host = peer[0] if isinstance(peer, tuple) and peer else "unknown"
    try:
        while not reader.at_eof():
            line = await reader.readline()
            if not line:
                break
            # Tolerate bad bytes rather than failing the whole stream.
            text = line.decode("utf-8", errors="replace").strip()
            if not text:
                continue
            try:
                payload = json.loads(text)
                if isinstance(payload, dict):
                    await _ingest_payload(payload, remote_host)
                elif isinstance(payload, list):
                    # A top-level array is treated as a batch of records.
                    for item in payload:
                        if isinstance(item, dict):
                            await _ingest_payload(item, remote_host)
            except Exception:
                # Best-effort ingestion: count the failure and keep reading.
                _parse_errors += 1
    finally:
        writer.close()
        await writer.wait_closed()
async def startup() -> None:
    """Initialize the alert-context DB and start the TCP log receiver.

    Idempotent: returns early if ingestion is disabled or the server is
    already running. The DB is initialized unconditionally so persistence
    works even when the receiver itself is disabled.
    """
    global _server
    _ensure_db()
    if not LOG_INGEST_ENABLED or _server is not None:
        return
    _server = await asyncio.start_server(_handle_client, LOG_RECEIVER_BIND_HOST, LOG_RECEIVER_PORT)
    if LOG_AUTO_CONFIGURE:
        try:
            await configure_site_output()
        except Exception:
            # Best-effort: failing to auto-configure the site's Fluent Bit
            # output must not prevent the receiver from starting.
            pass
async def shutdown() -> None:
    """Stop the TCP log receiver if it is running; no-op otherwise."""
    global _server
    server = _server
    if server is None:
        return
    server.close()
    await server.wait_closed()
    _server = None
def receiver_status() -> dict[str, Any]:
    """Snapshot of receiver configuration plus live ingestion counters."""
    return dict(
        enabled=LOG_INGEST_ENABLED,
        bind_host=LOG_RECEIVER_BIND_HOST,
        receiver_host=LOG_RECEIVER_HOST,
        port=LOG_RECEIVER_PORT,
        format=LOG_RECEIVER_FORMAT,
        allowed_nfs=sorted(_allowed_nfs),
        buffer_lines=LOG_BUFFER_LINES,
        trace_buffer_lines=LOG_TRACE_BUFFER_LINES,
        context_before=LOG_ALERT_CONTEXT_BEFORE,
        context_after=LOG_ALERT_CONTEXT_AFTER,
        db_path=str(_db_path()),
        ingested_total=_ingested_total,
        parse_errors=_parse_errors,
        last_event_at=_last_event_at,
        current_buffer_size=len(_events),
    )
def current_output_config(receiver_host: str) -> dict[str, Any]:
    """Build the Fluent Bit TCP output stanza pointing at this receiver."""
    stanza: dict[str, Any] = {"name": "tcp", "match": LOG_FLUENTBIT_MATCH}
    stanza["host"] = receiver_host
    stanza["port"] = LOG_RECEIVER_PORT
    stanza["format"] = LOG_RECEIVER_FORMAT
    return stanza
def default_input_config() -> dict[str, Any]:
    """Fallback systemd-journal input used when a site config has no inputs."""
    return dict(
        name="systemd",
        path="/var/log/journal",
        tag="marvis.systemd",
        read_from_tail="on",
        strip_underscores="off",
    )
async def _resolve_receiver_host() -> str:
    """Pick the host Fluent Bit should ship logs to.

    Preference order: explicit configuration, the cluster's current node,
    the system hostname, then loopback as a last resort.
    """
    if LOG_RECEIVER_HOST:
        return LOG_RECEIVER_HOST
    cluster = await pls.get_cluster_status()
    if isinstance(cluster, dict):
        node_name = cluster.get("current_node")
        if isinstance(node_name, str) and node_name:
            return pls.node_host(node_name)
    system = await pls.get_system_info()
    if isinstance(system, dict) and system.get("hostname"):
        return str(system["hostname"])
    return "127.0.0.1"
def _merged_fluentbit_config(config: dict[str, Any], receiver_host: str) -> dict[str, Any]:
    """Return *config* with the Marvis TCP output (re)inserted.

    Any pre-existing output that looks like ours (tcp, same port and format)
    is dropped first so repeated configuration stays idempotent; a default
    systemd input is supplied when the pipeline has none, and a "parsers"
    list is guaranteed to exist.
    """
    merged = dict(config or {})
    pipeline = dict(merged.get("pipeline") or {})
    inputs = list(pipeline.get("inputs") or []) or [default_input_config()]
    desired = current_output_config(receiver_host)

    def looks_like_marvis(candidate: dict[str, Any]) -> bool:
        return (
            candidate.get("name") == "tcp"
            and candidate.get("port") == LOG_RECEIVER_PORT
            and candidate.get("format") == LOG_RECEIVER_FORMAT
        )

    # Keep only well-formed outputs that are not ours, then append ours.
    kept_outputs = [
        output
        for output in (pipeline.get("outputs") or [])
        if isinstance(output, dict) and not looks_like_marvis(output)
    ]
    kept_outputs.append(desired)
    pipeline["inputs"] = inputs
    pipeline["outputs"] = kept_outputs
    merged["pipeline"] = pipeline
    # Equivalent to the original copy-from-config logic: when "parsers" is
    # absent from merged it is also absent from config, so the value is [].
    merged.setdefault("parsers", [])
    return merged
async def configure_site_output() -> dict[str, Any]:
    """Push a Fluent Bit config to PLS that forwards logs to this receiver.

    Reads the site's current config, merges in our TCP output (plus a
    default input if the pipeline has none), and writes it back.

    Returns:
        A summary dict with the resolved receiver host/port, the match
        pattern, and the config as accepted by PLS.

    Raises:
        RuntimeError: if the current config cannot be read from PLS, or PLS
            rejects the updated config.
    """
    current = await pls.get_fluentbit_config()
    if not isinstance(current, dict):
        raise RuntimeError("Could not read current Fluent Bit config from PLS")
    receiver_host = await _resolve_receiver_host()
    desired = _merged_fluentbit_config(current, receiver_host)
    updated = await pls.put_fluentbit_config(desired)
    if not isinstance(updated, dict):
        raise RuntimeError("PLS rejected Fluent Bit config update")
    return {
        "receiver_host": receiver_host,
        "receiver_port": LOG_RECEIVER_PORT,
        "match": LOG_FLUENTBIT_MATCH,
        "config": updated,
    }
def get_events(limit: int | None = None, node: str | None = None, nf: str | None = None, imsi: str | None = None) -> list[dict[str, Any]]:
    """Return a filtered, time-ordered view of the buffered log events.

    Args:
        limit: If given, at most the newest ``limit`` events are returned.
            BUG FIX: non-positive limits now yield an empty list —
            previously ``limit=0`` returned everything because
            ``events[-0:]`` slices the whole list.
        node: Case-insensitive exact match on the event's node name.
        nf: Case-insensitive match on the event's network function.
        imsi: Substring filter on the message; when set, the deeper trace
            buffer is searched instead of the main buffer.
    """
    events = list(_trace_events if imsi else _events)
    if node:
        node_l = node.lower()
        events = [event for event in events if event.get("node", "").lower() == node_l]
    if nf:
        nf_u = nf.upper()
        events = [event for event in events if event.get("nf", "").upper() == nf_u]
    if imsi:
        needle = imsi.strip()
        # A whitespace-only imsi matches nothing (preserved behavior).
        events = [event for event in events if needle and needle in event.get("message", "")]
    events.sort(key=lambda event: event.get("epoch", 0.0))
    if limit is None:
        return events
    if limit <= 0:
        return []
    return events[-limit:]
def record_alert_context(
    *,
    category: str,
    nf: str,
    node: str,
    severity: str,
    description: str,
    remediation: str,
    source: str,
    event: dict[str, Any],
    before_context: list[dict[str, Any]],
    after_context: list[dict[str, Any]],
) -> str:
    """Persist one alert plus its surrounding log context to SQLite.

    The row id is a deterministic hash of the alert fields plus *source*,
    so re-recording the same alert replaces the existing row
    (INSERT OR REPLACE) instead of duplicating it. Note that the table
    declares ``fingerprint`` UNIQUE, so a later write with the same content
    fingerprint — even with a different source/id — replaces the earlier row.

    Args:
        category: Alert category label.
        nf: Network function the alert belongs to.
        node: Node (host) the matched event came from.
        severity: Severity label.
        description: Human-readable alert description.
        remediation: Suggested remediation text.
        source: Origin of the alert; folded into the row id.
        event: The normalized log event that triggered the alert
            (its timestamp and message feed the fingerprint and the row).
        before_context: Events preceding the match; stored as JSON text.
        after_context: Events following the match; stored as JSON text.

    Returns:
        The id of the stored row.
    """
    _ensure_db()
    # Content fingerprint: identical alert content collapses to one value.
    fingerprint = sha1(
        "|".join(
            [
                category,
                nf,
                node,
                severity,
                description,
                remediation,
                event.get("timestamp", ""),
                event.get("message", ""),
            ]
        ).encode("utf-8")
    ).hexdigest()
    # Row id additionally folds in the source of the alert.
    alert_id = sha1(f"{fingerprint}|{source}".encode("utf-8")).hexdigest()
    conn = sqlite3.connect(_db_path())
    try:
        conn.execute(
            """
            INSERT OR REPLACE INTO alert_context (
                id, fingerprint, created_at, event_ts, category, nf, node, severity,
                description, remediation, source, match_message, before_context, after_context
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                alert_id,
                fingerprint,
                datetime.now(UTC).isoformat(),
                event.get("timestamp", ""),
                category,
                nf,
                node,
                severity,
                description,
                remediation,
                source,
                event.get("message", ""),
                json.dumps(before_context),
                json.dumps(after_context),
            ),
        )
        # Enforce the row cap before committing the insert.
        _trim_db(conn)
        conn.commit()
    finally:
        conn.close()
    return alert_id
def recent_alert_context(limit: int = 20) -> list[dict[str, Any]]:
    """Fetch the newest stored alert-context rows as plain dicts.

    Rows are ordered newest-first by event time, then insertion time.
    """
    _ensure_db()
    query = """
        SELECT id, created_at, event_ts, category, nf, node, severity, description,
               remediation, source, match_message, before_context, after_context
        FROM alert_context
        ORDER BY event_ts DESC, created_at DESC
        LIMIT ?
    """
    conn = sqlite3.connect(_db_path())
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.execute(query, (limit,))
        return [dict(row) for row in cursor.fetchall()]
    finally:
        conn.close()
def known_nfs() -> list[str]:
    """Return every network-function label the application knows about."""
    return [nf for nf in ALL_NFS]