Feat: lab1 done

2026-03-28 02:59:29 +08:00
commit a4566f179c
9 changed files with 156 additions and 0 deletions
@@ -0,0 +1,15 @@
 # HPC Lab 1
 - https://github.com/ncabatoff/process-exporter
 - https://github.com/utkuozdemir/nvidia_gpu_exporter/blob/master/INSTALL.md
 - https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/
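 Note: the Tesla V100 is only supported by the NVIDIA 580.xx legacy driver branch. The 595.58.03 driver ignores the GPU, as the kernel log below shows, so install a 580.xx driver instead.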
 ```
 Mar 28 01:42:44 vm1774609080735-5833800-iaas kernel: NVRM: The NVIDIA Tesla V100-SXM2-32GB GPU installed in this system is
                                                     NVRM:  supported through the NVIDIA 580.xx Legacy drivers. Please
                                                     NVRM:  visit http://www.nvidia.com/object/unix.html for more
                                                     NVRM:  information.  The 595.58.03 NVIDIA driver will ignore
                                                     NVRM:  this GPU.  Continuing probe...
 ```
@@ -0,0 +1,28 @@
 ---
 # Monitoring stack: Prometheus server plus the node and process exporters.
 services:
  prometheus:
    image: docker.io/prom/prometheus:v3.9.1
    # Same restart policy as node_exporter so the whole stack returns after a
    # daemon restart or reboot.
    restart: unless-stopped
    ports:
      # Quoted so the HOST:CONTAINER pair is always parsed as a string.
      - '9090:9090'
    volumes:
      # Main config and the file_sd target list, both mounted read-only.
      - './docker/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro'
      - './docker/prometheus/clients.yml:/etc/prometheus/clients.yml:ro'
  node_exporter:
    image: quay.io/prometheus/node-exporter:v1.10.2
    # The host root filesystem is mounted at /host (see volumes below).
    command: ['--path.rootfs=/host']
    # Shares the host network and PID namespaces, so no ports are published.
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      - '/:/host:ro,rslave'
  process_exporter:
    image: docker.io/ncabatoff/process-exporter:v0.8.7
    privileged: true
    # Read the host's /proc (mounted at /host/proc) and the mounted config file.
    command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml']
    restart: unless-stopped
    ports:
      - '9256:9256'
    volumes:
      - '/proc:/host/proc:ro,rslave'
      - './docker/process_exporter:/config:ro'
@@ -0,0 +1,5 @@
 # process-exporter configuration: selects which processes are tracked.
 process_names:
  # Match processes by their comm (command name); only these are listed.
  - comm:
    - bash
    - prometheus
    - vim
@@ -0,0 +1,9 @@
 # Target group in Prometheus file_sd format: one host:port entry per exporter.
 - targets:
    # Port 9100 on every host -- presumably node_exporter's default port; confirm.
    - controller:9100
    - cpu01:9100
    - gpu01:9100
    # Port 9256: the port process_exporter publishes in the compose files.
    - controller:9256
    - cpu01:9256
    - gpu01:9256
    # Port 9835: the port nvidia_gpu_exporter publishes in its compose file.
    - gpu01:9835
   # No extra labels are attached to this target group.
  labels: {}
@@ -0,0 +1,40 @@
 # Prometheus server configuration.
 global:
  scrape_interval: 1s
  # Set explicitly: a scrape timeout may not exceed the scrape interval, so the
  # 10s default cannot apply while the interval is 1s.
  scrape_timeout: 1s
  evaluation_interval: 30s
  # Per-scrape and per-target safety limits.
  body_size_limit: 15MB
  sample_limit: 5000
  target_limit: 30
  label_limit: 30
  label_name_length_limit: 200
  label_value_length_limit: 200
  query_log_file: query.log
  scrape_failure_log_file: fail.log
 runtime:
  gogc: 42
 scrape_configs:
  # Every target listed in clients.yml is scraped under this single job.
  - job_name: node
    file_sd_configs:
      - files:
          - clients.yml
        refresh_interval: 10m
  # Prometheus scraping its own metrics endpoint.
  - job_name: prometheus
    static_configs:
      - targets: ["localhost:9090"]
        labels: {}
    honor_labels: true
    fallback_scrape_protocol: PrometheusText0.0.4
    # Job-specific failure log, overriding the global fail.log.
    scrape_failure_log_file: fail_prom.log
 storage:
  tsdb:
    out_of_order_time_window: 30m
    # Keep at most one day or 1GB of data.
    retention:
      time: 1d
      size: 1GB
@@ -0,0 +1,19 @@
 ---
 # Exporter-only stack: node and process exporters.
 services:
  node_exporter:
    image: quay.io/prometheus/node-exporter:v1.10.2
    # The host root filesystem is mounted at /host (see volumes below).
    command: ['--path.rootfs=/host']
    # Shares the host network and PID namespaces, so no ports are published.
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      - '/:/host:ro,rslave'
  process_exporter:
    image: docker.io/ncabatoff/process-exporter:v0.8.7
    privileged: true
    # Read the host's /proc (mounted at /host/proc) and the mounted config file.
    command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml']
    # Same restart policy as node_exporter so both exporters return after a
    # daemon restart or reboot.
    restart: unless-stopped
    ports:
      # Quoted so the HOST:CONTAINER pair is always parsed as a string.
      - '9256:9256'
    volumes:
      - '/proc:/host/proc:ro,rslave'
      - './docker/process_exporter:/config:ro'
@@ -0,0 +1,5 @@
 # process-exporter configuration: selects which processes are tracked.
 process_names:
  # Match processes by their comm (command name); only these are listed.
  - comm:
    - bash
    - prometheus
    - vim
@@ -0,0 +1,30 @@
 ---
 # Exporter stack with GPU metrics: node, process and NVIDIA GPU exporters.
 services:
  node_exporter:
    image: quay.io/prometheus/node-exporter:v1.10.2
    # The host root filesystem is mounted at /host (see volumes below).
    command: ['--path.rootfs=/host']
    # Shares the host network and PID namespaces, so no ports are published.
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      - '/:/host:ro,rslave'
  process_exporter:
    image: docker.io/ncabatoff/process-exporter:v0.8.7
    privileged: true
    # Read the host's /proc (mounted at /host/proc) and the mounted config file.
    command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml']
    # Same restart policy as node_exporter so every exporter returns after a
    # daemon restart or reboot.
    restart: unless-stopped
    ports:
      # Quoted so the HOST:CONTAINER pair is always parsed as a string.
      - '9256:9256'
    volumes:
      - '/proc:/host/proc:ro,rslave'
      - './docker/process_exporter:/config:ro'
  nvidia_gpu_exporter:
    image: ghcr.io/utkuozdemir/nvidia_gpu_exporter:1.4.1-amd64
    restart: unless-stopped
    ports:
      - '9835:9835'
    devices:
      # NVIDIA control device plus the first GPU only (/dev/nvidia0).
      - '/dev/nvidiactl:/dev/nvidiactl'
      - '/dev/nvidia0:/dev/nvidia0'
    volumes:
      # The host's NVML library and nvidia-smi binary are bind-mounted into the
      # container at the same paths.
      - '/usr/lib/x86_64-linux-gnu/libnvidia-ml.so:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so'
      - '/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1'
      - '/usr/bin/nvidia-smi:/usr/bin/nvidia-smi'
@@ -0,0 +1,5 @@
 # process-exporter configuration: selects which processes are tracked.
 process_names:
  # Match processes by their comm (command name); only these are listed.
  - comm:
    - bash
    - prometheus
    - vim