1.首先安装 NVIDIA Data Center GPU Manager (DCGM),从 https://developer.nvidia.com/dcgm 下载安装
nv-hostengine -t
yum erase -y datacenter-gpu-manager
rpm -ivh datacenter-gpu-manager*
systemctl enable --now dcgm.service
2. 安装 NVIDIA DCGM exporter for Prometheus,从 https://github.com/NVIDIA/gpu-monitoring-tools/tree/master/exporters/prometheus-dcgm 下载手工安装
wget -q -O /usr/local/bin/dcgm-exporter https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/master/exporters/prometheus-dcgm/dcgm-exporter/dcgm-exporter
chmod +x /usr/local/bin/dcgm-exporter
mkdir /run/prometheus
wget -q -O /etc/systemd/system/prometheus-dcgm.service https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/master/exporters/prometheus-dcgm/bare-metal/prometheus-dcgm.service
systemctl daemon-reload
systemctl enable --now prometheus-dcgm.service
3. 从 https://prometheus.io/download/#node_exporter 下载 node_exporter,手工安装为服务并添加 dcgm-exporter 资料
tar xf node_exporter*.tar.gz
mv node_exporter-*/node_exporter /usr/local/bin/
chown root:root /usr/local/bin/node_exporter
chmod +x /usr/local/bin/node_exporter
cat > /etc/systemd/system/node_exporter.service <<EOF
[Unit]
Description=Prometheus Node Exporter
Wants=network-online.target
After=network-online.target
[Service]
Type=simple
ExecStart=/usr/local/bin/node_exporter
[Install]
WantedBy=multi-user.target
EOF
sed -i '/ExecStart=\/usr\/local\/bin\/node_exporter/c\ExecStart=\/usr\/local\/bin\/node_exporter --collector.textfile.directory=\/run\/prometheus' /etc/systemd/system/node_exporter.service
systemctl daemon-reload
systemctl enable --now node_exporter.service
4. Grafana 添加这个Dashboard
https://grafana.com/grafana/dashboards/11752