From 30f53eb66880d7a0c404b37391ac11ca25a1bd1a Mon Sep 17 00:00:00 2001 From: beatz174-bit Date: Mon, 13 Apr 2026 06:19:57 +1000 Subject: [PATCH] Harden unknown-project Node-RED functions for missing labels --- monitoring/node-red/UPDATE_LOGGING_GRAFANA.md | 160 ++++++++++++++++ monitoring/node-red/data/flows.json | 181 +++++++++++++++++- monitoring/prometheus/docker-compose.yml | 1 + monitoring/telegraf/telegraf.conf | 36 ++++ 4 files changed, 369 insertions(+), 9 deletions(-) create mode 100644 monitoring/node-red/UPDATE_LOGGING_GRAFANA.md diff --git a/monitoring/node-red/UPDATE_LOGGING_GRAFANA.md b/monitoring/node-red/UPDATE_LOGGING_GRAFANA.md new file mode 100644 index 0000000..51c8fc5 --- /dev/null +++ b/monitoring/node-red/UPDATE_LOGGING_GRAFANA.md @@ -0,0 +1,160 @@ +# Node-RED update logging for Grafana + +This guide adds structured update-event logging to your existing Node-RED + Telegraf + Prometheus + Grafana stack without introducing Loki. + +## Goal + +Track and surface (in Grafana) the latest update attempts from Node-RED, including: + +- when an update attempt started, +- target container/project, +- success/failure, +- optional failure reason, +- elapsed duration. + +## 1) Add a reusable logger function in Node-RED + +Create a **Function** node named `Build update log event` and use: + +```javascript +const nowIso = new Date().toISOString(); +const startedAt = msg.update_started_at || Date.now(); +const durationMs = Math.max(0, Date.now() - startedAt); + +const payload = msg.payload || {}; +const labels = payload.labels || {}; + +const status = (msg.update_status || payload.status || "unknown").toString().toLowerCase(); +const success = status === "success" ? 1 : 0; +const failed = status === "failed" ? 
1 : 0; + +msg.payload = { + ts: nowIso, + flow: "docker-updates", + event: msg.update_event || "attempt", + container: msg.container || labels.container || "unknown", + project: labels.com_docker_compose_project || msg.project || "unknown", + host: msg.host || "unknown", + status, + success, + failed, + duration_ms: durationMs, + code: Number.isFinite(Number(payload.code)) ? Number(payload.code) : 0, + error: (msg.update_error || payload.error || "").toString().slice(0, 300) +}; + +// one JSON line per event for file output +msg.payload = JSON.stringify(msg.payload); +return msg; +``` + +### Wiring recommendation + +Use the same logger function in these branches: + +- before a pull/update command (`update_status=attempt`, `update_event=attempt`), +- success path (`update_status=success`, `update_event=completed`), +- failure path (`update_status=failed`, `update_event=completed`, and include `msg.update_error`). + +Then route each branch into a **File** node configured as: + +- Filename: `/data/update-events.ndjson` +- Action: append to file +- Add newline: enabled + +## 2) Make update state explicit in existing update flow + +In your current update flow (already present in `flows.json`), add/change **Change** nodes around your shell/docker nodes: + +- At update start: + - `msg.update_started_at = $millis()` + - `msg.update_status = "attempt"` + - `msg.update_event = "attempt"` +- At success: + - `msg.update_status = "success"` + - `msg.update_event = "completed"` +- At failure: + - `msg.update_status = "failed"` + - `msg.update_event = "completed"` + - `msg.update_error = msg.payload.stderr` (or equivalent error field) + +## 3) Let Telegraf ingest Node-RED event logs + +Append this to `monitoring/telegraf/telegraf.conf`: + +```toml +[[inputs.tail]] + files = ["/var/log/node-red/update-events.ndjson"] + from_beginning = false + name_override = "node_red_update_event" + data_format = "json_v2" + + [[inputs.tail.json_v2]] + measurement_name = "node_red_update_event"
+ + [[inputs.tail.json_v2.tag]] + path = "flow" + [[inputs.tail.json_v2.tag]] + path = "event" + [[inputs.tail.json_v2.tag]] + path = "container" + [[inputs.tail.json_v2.tag]] + path = "project" + [[inputs.tail.json_v2.tag]] + path = "host" + [[inputs.tail.json_v2.tag]] + path = "status" + + [[inputs.tail.json_v2.field]] + path = "success" + type = "int" + [[inputs.tail.json_v2.field]] + path = "failed" + type = "int" + [[inputs.tail.json_v2.field]] + path = "duration_ms" + type = "int" + [[inputs.tail.json_v2.field]] + path = "code" + type = "int" +``` + +And mount the Node-RED data directory into Telegraf (read-only) in `monitoring/prometheus/docker-compose.yml` under `telegraf.volumes`: + +```yaml + - ${PROJECT_ROOT}/monitoring/node-red/data:/var/log/node-red:ro +``` + +## 4) Prometheus scrape (already in place) + +No Prometheus scrape change is required as long as it already scrapes Telegraf (`telegraf:9273`). + +## 5) Grafana queries to start with + +Use your Prometheus data source and try: + +- Latest success/failure by container: + - `last_over_time(node_red_update_event_success[24h])` + - `last_over_time(node_red_update_event_failed[24h])` +- Containers with at least one failed update in the last 24h (`failed` is a 0/1 flag per event, not a monotonically increasing counter, so `increase()` does not apply): + - `max by (container, project) (max_over_time(node_red_update_event_failed[24h]))` +- Average update duration in last 24h: + - `avg by (container, project) (avg_over_time(node_red_update_event_duration_ms[24h]))` + +Recommended panels: + +- **Table**: container, project, status (last value), duration_ms (last value) +- **Time series**: failed count over time +- **Stat**: number of containers with a failed update in last 24h + +## 6) Validation checklist + +1. Trigger a known update path (including one failure if possible). +2. Check Node-RED log file: + - `tail -n 20 monitoring/node-red/data/update-events.ndjson` +3. Check Telegraf metrics endpoint for `node_red_update_event_` metrics. +4. Confirm Grafana panel values match the latest Node-RED run.
+ +## Optional next step + +If you want searchable raw log text and richer log UX, add Loki + Promtail later. Keep this structured metrics path for high-signal alerting even after adding logs. diff --git a/monitoring/node-red/data/flows.json b/monitoring/node-red/data/flows.json index 36399b7..b709991 100644 --- a/monitoring/node-red/data/flows.json +++ b/monitoring/node-red/data/flows.json @@ -460,7 +460,7 @@ "type": "function", "z": "00b02bbd01c91485", "name": "Unknown Project", - "func": "const labels = msg.payload.labels || {};\nconst container = labels.container;\nconst image = labels.compose_image || labels.running_image || labels.image;\nconst project = labels.com_docker_compose_project;\n\nnode.warn(`Unable to map project name ${project} to host.\n\n Updates for ${container} (${image}) failed`);\nreturn msg;", + "func": "const payload = (msg.payload && typeof msg.payload === \"object\") ? msg.payload : {};\nconst labels = (payload.labels && typeof payload.labels === \"object\") ? 
payload.labels : {};\n\nconst container = labels.container || \"unknown container\";\nconst image = labels.compose_image || labels.running_image || labels.image || \"unknown image\";\nconst project = labels.com_docker_compose_project || \"unknown project\";\n\nnode.warn(`Unable to map project name ${project} to host.\n\nUpdates for ${container} (${image}) failed`);\nreturn msg;", "outputs": 1, "timeout": "", "noerr": 0, @@ -936,7 +936,7 @@ "y": 200, "wires": [ [ - "0135d283b9edfb01" + "c1aa11bb22cc33dd" ] ] }, @@ -972,7 +972,7 @@ "y": 300, "wires": [ [ - "4eafade32c867e40" + "d2aa11bb22cc33dd" ] ] }, @@ -1008,7 +1008,7 @@ "y": 380, "wires": [ [ - "7d8200040f9b1e83" + "e3aa11bb22cc33dd" ] ] }, @@ -1017,7 +1017,7 @@ "type": "function", "z": "c5240b64a962ea54", "name": "Docker updates Unknown Project", - "func": "const container = msg.container || \"unknown container\";\nconst code = msg.payload.code;\nconst stderr = flow.get(\"pull_stderr\") || \"Unknown error\";\nconst project = msg.payload.labels.com_docker_compose_project\nmsg.payload = {\n title: \"Container Updates Failed\",\n message: `The ${container} container has failed.\\n\n Unknown project ${project}`,\n priority: 8\n};\n\nreturn msg;", + "func": "const payload = (msg.payload && typeof msg.payload === \"object\") ? msg.payload : {};\nconst labels = (payload.labels && typeof payload.labels === \"object\") ? 
payload.labels : {};\n\nconst container = msg.container || labels.container || \"unknown container\";\nconst project = labels.com_docker_compose_project || msg.project || \"unknown project\";\n\nmsg.payload = {\n title: \"Container Updates Failed\",\n message: `The ${container} container has failed.\n\nUnknown project ${project}`,\n priority: 8\n};\n\nreturn msg;", "outputs": 1, "timeout": 0, "noerr": 0, @@ -1044,7 +1044,7 @@ "y": 460, "wires": [ [ - "c3d07241f4a570af" + "f4aa11bb22cc33dd" ] ] }, @@ -1078,7 +1078,7 @@ "y": 520, "wires": [ [ - "d1346f7151103832" + "a5aa11bb22cc33dd" ] ] }, @@ -1095,7 +1095,7 @@ "y": 580, "wires": [ [ - "1a9798d5c081240a" + "b6aa11bb22cc33dd" ] ] }, @@ -1118,5 +1118,168 @@ "8630c7dfcdbcce50" ] ] + }, + { + "id": "a1f8e9b2c3d4e5f6", + "type": "function", + "z": "c5240b64a962ea54", + "name": "Build update log event", + "func": "const nowIso = new Date().toISOString();\nconst startedAt = msg.update_started_at || Date.now();\nconst durationMs = Math.max(0, Date.now() - startedAt);\n\nconst payload = (msg.payload && typeof msg.payload === \"object\") ? msg.payload : {};\nconst labels = payload.labels || {};\n\nconst status = (msg.update_status || payload.status || \"unknown\").toString().toLowerCase();\n\nmsg.payload = JSON.stringify({\n ts: nowIso,\n flow: \"docker-updates\",\n event: msg.update_event || \"attempt\",\n container: msg.container || labels.container || \"unknown\",\n project: labels.com_docker_compose_project || msg.project || \"unknown\",\n host: msg.host || \"unknown\",\n status,\n success: status === \"success\" ? 1 : 0,\n failed: [\"failed\", \"locked\"].includes(status) ? 1 : 0,\n duration_ms: durationMs,\n code: Number.isFinite(Number(payload.code)) ? 
Number(payload.code) : 0,\n error: (msg.update_error || payload.error || \"\").toString().slice(0, 300)\n});\n\nreturn msg;", + "outputs": 1, + "timeout": "", + "noerr": 0, + "initialize": "", + "finalize": "", + "libs": [], + "x": 690, + "y": 660, + "wires": [ + [ + "b1c2d3e4f5a69788" + ] + ] + }, + { + "id": "b1c2d3e4f5a69788", + "type": "file", + "z": "c5240b64a962ea54", + "name": "Write update event log", + "filename": "/data/update-events.ndjson", + "filenameType": "str", + "appendNewline": true, + "createDir": false, + "overwriteFile": "false", + "encoding": "none", + "x": 930, + "y": 660, + "wires": [ + [] + ] + }, + { + "id": "c1aa11bb22cc33dd", + "type": "function", + "z": "c5240b64a962ea54", + "name": "Mark Docker Pull Failed", + "func": "msg.update_status = \"failed\";\nmsg.update_event = \"completed\";\nreturn msg;", + "outputs": 1, + "timeout": "", + "noerr": 0, + "initialize": "", + "finalize": "", + "libs": [], + "x": 470, + "y": 200, + "wires": [ + [ + "0135d283b9edfb01", + "a1f8e9b2c3d4e5f6" + ] + ] + }, + { + "id": "d2aa11bb22cc33dd", + "type": "function", + "z": "c5240b64a962ea54", + "name": "Mark Docker Test Failed", + "func": "msg.update_status = \"failed\";\nmsg.update_event = \"completed\";\nreturn msg;", + "outputs": 1, + "timeout": "", + "noerr": 0, + "initialize": "", + "finalize": "", + "libs": [], + "x": 470, + "y": 300, + "wires": [ + [ + "4eafade32c867e40", + "a1f8e9b2c3d4e5f6" + ] + ] + }, + { + "id": "e3aa11bb22cc33dd", + "type": "function", + "z": "c5240b64a962ea54", + "name": "Mark Docker Update Success", + "func": "msg.update_status = \"success\";\nmsg.update_event = \"completed\";\nreturn msg;", + "outputs": 1, + "timeout": "", + "noerr": 0, + "initialize": "", + "finalize": "", + "libs": [], + "x": 470, + "y": 380, + "wires": [ + [ + "7d8200040f9b1e83", + "a1f8e9b2c3d4e5f6" + ] + ] + }, + { + "id": "f4aa11bb22cc33dd", + "type": "function", + "z": "c5240b64a962ea54", + "name": "Mark Docker Unknown Project", + "func": 
"msg.update_status = \"failed\";\nmsg.update_event = \"completed\";\nreturn msg;", + "outputs": 1, + "timeout": "", + "noerr": 0, + "initialize": "", + "finalize": "", + "libs": [], + "x": 470, + "y": 460, + "wires": [ + [ + "c3d07241f4a570af", + "a1f8e9b2c3d4e5f6" + ] + ] + }, + { + "id": "a5aa11bb22cc33dd", + "type": "function", + "z": "c5240b64a962ea54", + "name": "Mark Docker Update Attempt", + "func": "msg.update_status = \"attempt\";\nmsg.update_event = \"attempt\";\nreturn msg;", + "outputs": 1, + "timeout": "", + "noerr": 0, + "initialize": "", + "finalize": "", + "libs": [], + "x": 470, + "y": 520, + "wires": [ + [ + "d1346f7151103832", + "a1f8e9b2c3d4e5f6" + ] + ] + }, + { + "id": "b6aa11bb22cc33dd", + "type": "function", + "z": "c5240b64a962ea54", + "name": "Mark Docker Update Locked", + "func": "msg.update_status = \"locked\";\nmsg.update_event = \"completed\";\nreturn msg;", + "outputs": 1, + "timeout": "", + "noerr": 0, + "initialize": "", + "finalize": "", + "libs": [], + "x": 470, + "y": 580, + "wires": [ + [ + "1a9798d5c081240a", + "a1f8e9b2c3d4e5f6" + ] + ] } -] \ No newline at end of file +] diff --git a/monitoring/prometheus/docker-compose.yml b/monitoring/prometheus/docker-compose.yml index 3f74647..f41789a 100644 --- a/monitoring/prometheus/docker-compose.yml +++ b/monitoring/prometheus/docker-compose.yml @@ -197,6 +197,7 @@ services: - no-new-privileges:true volumes: - ${PROJECT_ROOT}/monitoring/telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro + - ${PROJECT_ROOT}/monitoring/node-red/data:/var/log/node-red:ro networks: # - edge - monitor diff --git a/monitoring/telegraf/telegraf.conf b/monitoring/telegraf/telegraf.conf index 9063a7e..2e3956e 100644 --- a/monitoring/telegraf/telegraf.conf +++ b/monitoring/telegraf/telegraf.conf @@ -7,3 +7,39 @@ [[outputs.prometheus_client]] listen = ":9273" + +# Node-RED update-event logs (structured NDJSON) -> Prometheus metrics for Grafana +[[inputs.tail]] + files = 
["/var/log/node-red/update-events.ndjson"] + from_beginning = false + name_override = "node_red_update_event" + data_format = "json_v2" + + [[inputs.tail.json_v2]] + measurement_name = "node_red_update_event" + + [[inputs.tail.json_v2.tag]] + path = "flow" + [[inputs.tail.json_v2.tag]] + path = "event" + [[inputs.tail.json_v2.tag]] + path = "container" + [[inputs.tail.json_v2.tag]] + path = "project" + [[inputs.tail.json_v2.tag]] + path = "host" + [[inputs.tail.json_v2.tag]] + path = "status" + + [[inputs.tail.json_v2.field]] + path = "success" + type = "int" + [[inputs.tail.json_v2.field]] + path = "failed" + type = "int" + [[inputs.tail.json_v2.field]] + path = "duration_ms" + type = "int" + [[inputs.tail.json_v2.field]] + path = "code" + type = "int"