監控 Docker 狀態
# 本機所有執行中容器的即時資源用量
docker stats
# 目前 Compose 專案內各容器的即時資源用量
docker compose stats
docker host 狀態
# Compose service for Prometheus Node Exporter (host-level metrics).
services:
  node_exporter:
    image: quay.io/prometheus/node-exporter:latest
    container_name: node_exporter
    # Restart automatically when the container exits or the Docker daemon
    # restarts.
    restart: always
    # 1. Network mode: share the host's network stack so the exporter can
    #    collect the host's network metrics and use host port 9100 directly.
    network_mode: host
    # 2. PID mode: share the host's PID namespace so process-count and
    #    system-load metrics are collected correctly.
    pid: host
    # 3. Filesystem mount: lets the exporter read key system files such as
    #    /proc and /sys.
    volumes:
      # Mount the host root directory at /host inside the container,
      # read-only.
      - '/:/host:ro,rslave'
    # 4. Startup flag: tell the exporter where the host root filesystem is
    #    mounted.
    command:
      - '--path.rootfs=/host'
container 狀態
# Compose service for cAdvisor (per-container metrics).
services:
  cadvisor:
    image: ghcr.io/google/cadvisor:0.56.2
    container_name: cadvisor
    volumes:
      # Mounts that must be configured.
      - /:/rootfs:ro
      # NOTE(review): mounted read-write in a privileged container; check
      # whether read-only is sufficient for this cAdvisor version.
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    network_mode: host
    # pid: host
    # Left disabled because this service uses host networking (see
    # network_mode above).
    # ports:
    #   # Publish port 8080, the endpoint Prometheus scrapes.
    #   - '8080:8080'
    restart: always
    privileged: true
    devices:
      - /dev/kmsg
收集數據:指標收集與儲存 (Prometheus)
# Compose service for Prometheus (metrics collection and storage).
services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    volumes:
      # Load the configuration file (read-only).
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      # Persist the collected data in a named volume.
      - prometheus_data:/prometheus
    command:
      # Keep data for 90 days.
      - '--storage.tsdb.retention.time=90d'
      - '--config.file=/etc/prometheus/prometheus.yml'
    # Left disabled because this service uses host networking (see
    # network_mode below).
    # ports:
    #   - '9090:9090'  # Publish the Prometheus UI port.
    # networks:
    #   - monitor-net
    network_mode: host
    restart: unless-stopped

# Named volume backing the /prometheus mount above.
volumes:
  prometheus_data: {}
# prometheus/prometheus.yml
# Prometheus scrape configuration: Prometheus itself, Node Exporter and
# cAdvisor.
# NOTE(review): the Compose snippet mounts ./prometheus.yml; confirm which
# path this file actually lives at.
global:
  scrape_interval: 15s  # Pull metrics every 15 seconds.
  external_labels:
    monitor: 'docker-cluster-monitor'

scrape_configs:
  # Prometheus's own metrics (used as a self health check).
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Docker host metrics (Node Exporter).
  - job_name: 'docker_nodes'
    static_configs:
      - targets:
          - '192.168.50.123:9100'
          # ... other hosts

  # Job that scrapes cAdvisor metrics.
  - job_name: 'cadvisor'
    metrics_path: /metrics
    static_configs:
      - targets:
          - '192.168.50.123:8080'
          # ... other hosts
!!! 修改 prometheus.yml 後,需要讓 Prometheus 重新載入設定(對容器送出 SIGHUP)
docker kill -s HUP <container_id>
!!! 檢查 prometheus.yml 語法
docker compose exec prometheus promtool check config /etc/prometheus/prometheus.yml
數據檢視
# Compose service for Grafana (dashboards for the collected metrics).
services:
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    volumes:
      # Persist users, settings and edited dashboards.
      - grafana_data:/var/lib/grafana
    environment:
      # Default administrator account.
      - GF_SECURITY_ADMIN_USER=admin
      # The password is no longer hard-coded in this file. Supply
      # GRAFANA_ADMIN_PASSWORD through the shell environment or a .env file
      # next to the Compose file; startup fails with the message below when
      # it is unset or empty.
      - 'GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?set GRAFANA_ADMIN_PASSWORD}'
      # Provisioning directory, intended to let Grafana connect to
      # Prometheus automatically at startup (optional; manual configuration
      # is usually recommended).
      # NOTE(review): no provisioning files are mounted in this snippet.
      - GF_PATHS_PROVISIONING=/etc/grafana/provisioning
    depends_on:
      - prometheus  # Make sure Prometheus is started first.
    # Left disabled because this service uses host networking (see
    # network_mode below).
    # ports:
    #   - '3000:3000'  # Publish the Grafana UI port.
    # networks:
    #   - monitor-net
    network_mode: host
    restart: unless-stopped

# Named volume backing the /var/lib/grafana mount above.
volumes:
  grafana_data: {}
https://grafana.com/grafana/dashboards/
!!! node-exporter
https://grafana.com/grafana/dashboards/1860-node-exporter-full/
https://grafana.com/grafana/dashboards/16098-node-exporter-dashboard-20240520-job/
!!! cadvisor
https://grafana.com/grafana/dashboards/19724-y0nei-s-cadvisor-exporter/