فهرست منبع

Add open file descriptor monitoring (#5655)

Michael Vines 6 سال پیش
والد
کامیت
81bb208a62

+ 167 - 15
metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json

@@ -15,8 +15,8 @@
   "editable": true,
   "gnetId": null,
   "graphTooltip": 0,
-  "id": 851,
-  "iteration": 1565991401072,
+  "id": 883,
+  "iteration": 1566852798488,
   "links": [
     {
       "asDropdown": true,
@@ -2516,7 +2516,7 @@
         "x": 12,
         "y": 24
       },
-      "id": 23,
+      "id": 61,
       "interval": null,
       "links": [],
       "mappingType": 1,
@@ -2569,7 +2569,7 @@
           ],
           "orderByTime": "ASC",
           "policy": "default",
-          "query": "SELECT sum(\"one\") FROM \"$testnet\".\"autogen\".\"panic\" WHERE $timeFilter",
+          "query": "SELECT SUM(\"points_lost\") FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter\n",
           "rawQuery": true,
           "refId": "A",
           "resultFormat": "table",
@@ -2591,7 +2591,7 @@
         }
       ],
       "thresholds": "",
-      "title": "Total Panics",
+      "title": "Lost Datapoints",
       "type": "singlestat",
       "valueFontSize": "80%",
       "valueMaps": [
@@ -2840,7 +2840,7 @@
       "datasource": "$datasource",
       "fill": 1,
       "gridPos": {
-        "h": 6,
+        "h": 3,
         "w": 8,
         "x": 0,
         "y": 26
@@ -2852,7 +2852,7 @@
         "current": false,
         "max": false,
         "min": false,
-        "rightSide": false,
+        "rightSide": true,
         "show": true,
         "total": false,
         "values": false
@@ -2888,7 +2888,7 @@
           "hide": false,
           "orderByTime": "ASC",
           "policy": "default",
-          "query": "SELECT MEAN(\"points_written\") as \"Mean points written\" FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n",
+          "query": "SELECT MEAN(\"points_written\") as \"mean\" FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n",
           "rawQuery": true,
           "refId": "B",
           "resultFormat": "time_series",
@@ -2925,7 +2925,7 @@
           ],
           "orderByTime": "ASC",
           "policy": "default",
-          "query": "SELECT MAX(\"points_written\") as \"Max points written\" FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n",
+          "query": "SELECT MAX(\"points_written\") as \"max\" FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n",
           "rawQuery": true,
           "refId": "A",
           "resultFormat": "time_series",
@@ -3263,6 +3263,162 @@
         "alignLevel": null
       }
     },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 3,
+        "w": 8,
+        "x": 0,
+        "y": 29
+      },
+      "id": 62,
+      "legend": {
+        "alignAsTable": false,
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "connected",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "groupBy": [
+            {
+              "params": [
+                "$__interval"
+              ],
+              "type": "time"
+            },
+            {
+              "params": [
+                "null"
+              ],
+              "type": "fill"
+            }
+          ],
+          "hide": false,
+          "orderByTime": "ASC",
+          "policy": "default",
+          "query": "SELECT MEAN(\"count\") as \"mean\" FROM \"$testnet\".\"autogen\".\"open-files\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n",
+          "rawQuery": true,
+          "refId": "B",
+          "resultFormat": "time_series",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "field"
+              },
+              {
+                "params": [],
+                "type": "mean"
+              }
+            ]
+          ],
+          "tags": []
+        },
+        {
+          "groupBy": [
+            {
+              "params": [
+                "$__interval"
+              ],
+              "type": "time"
+            },
+            {
+              "params": [
+                "null"
+              ],
+              "type": "fill"
+            }
+          ],
+          "orderByTime": "ASC",
+          "policy": "default",
+          "query": "SELECT MAX(\"count\") as \"max\" FROM \"$testnet\".\"autogen\".\"open-files\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n",
+          "rawQuery": true,
+          "refId": "A",
+          "resultFormat": "time_series",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "field"
+              },
+              {
+                "params": [],
+                "type": "mean"
+              }
+            ]
+          ],
+          "tags": []
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Open Files per node",
+      "tooltip": {
+        "shared": true,
+        "sort": 1,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "decimals": 0,
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": "0.2",
+          "show": true
+        },
+        {
+          "decimals": null,
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
     {
       "columns": [],
       "datasource": "$datasource",
@@ -8173,10 +8329,6 @@
       },
       {
         "allValue": ".*",
-        "current": {
-          "text": "All",
-          "value": "$__all"
-        },
         "datasource": "$datasource",
         "hide": 0,
         "includeAll": true,
@@ -8228,5 +8380,5 @@
   "timezone": "",
   "title": "Testnet Monitor (edge)",
   "uid": "testnet-edge",
-  "version": 3
-}
+  "version": 1
+}

+ 1 - 1
net/net.sh

@@ -752,7 +752,7 @@ stopNode() {
       PS4=\"$PS4\"
       set -x
       ! tmux list-sessions || tmux kill-session
-      for pid in solana/{net-stats,oom-monitor}.pid; do
+      for pid in solana/{net-stats,fd-monitor,oom-monitor}.pid; do
         pgid=\$(ps opgid= \$(cat \$pid) | tr -d '[:space:]')
         if [[ -n \$pgid ]]; then
           sudo kill -- -\$pgid

+ 5 - 1
net/remote/remote-client.sh

@@ -50,9 +50,13 @@ skip)
 esac
 
 (
-  sudo scripts/oom-monitor.sh
+  sudo SOLANA_METRICS_CONFIG="$SOLANA_METRICS_CONFIG" scripts/oom-monitor.sh
 ) > oom-monitor.log 2>&1 &
+echo $! > oom-monitor.pid
+scripts/fd-monitor.sh > fd-monitor.log 2>&1 &
+echo $! > fd-monitor.pid
 scripts/net-stats.sh  > net-stats.log 2>&1 &
+echo $! > net-stats.pid
 
 ! tmux list-sessions || tmux kill-session
 

+ 2 - 0
net/remote/remote-node.sh

@@ -93,6 +93,8 @@ local|tar|skip)
     sudo SOLANA_METRICS_CONFIG="$SOLANA_METRICS_CONFIG" scripts/oom-monitor.sh
   ) > oom-monitor.log 2>&1 &
   echo $! > oom-monitor.pid
+  scripts/fd-monitor.sh > fd-monitor.log 2>&1 &
+  echo $! > fd-monitor.pid
   scripts/net-stats.sh  > net-stats.log 2>&1 &
   echo $! > net-stats.pid
 

+ 20 - 0
scripts/fd-monitor.sh

@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+#
+# Reports open file descriptors for the current user
+#
+set -e
+
+[[ $(uname) == Linux ]] || exit 0
+
+cd "$(dirname "$0")"
+
+# shellcheck source=scripts/configure-metrics.sh
+source configure-metrics.sh
+
+while true; do
+  count=$(lsof -u $UID | wc -l)
+  ./metrics-write-datapoint.sh "open-files,hostname=$HOSTNAME count=$count"
+  sleep 10
+done
+
+exit 1

+ 1 - 1
scripts/metrics-write-datapoint.sh

@@ -22,5 +22,5 @@ if [[ -n $INFLUX_HOST ]]; then
 fi
 
 echo "${host}/write?db=${INFLUX_DATABASE}&u=${INFLUX_USERNAME}&p=${INFLUX_PASSWORD}" \
-  | xargs curl --max-time 5 -XPOST --data-binary "$point"
+  | xargs curl --max-time 5 --silent --show-error -XPOST --data-binary "$point"
 exit 0