Feat: lab1 done

This commit is contained in:
2026-03-28 02:59:29 +08:00
commit a4566f179c
9 changed files with 156 additions and 0 deletions

15
README.md Normal file
View File

@@ -0,0 +1,15 @@
# HPC Lab 1
- https://github.com/ncabatoff/process-exporter
- https://github.com/utkuozdemir/nvidia_gpu_exporter/blob/master/INSTALL.md
- https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/
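The kernel log below explains why the GPU is not usable with the currently installed driver: the 595.58.03 driver ignores the Tesla V100, which is only supported by the 580.xx legacy driver branch, so that branch has to be installed instead.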
```
Mar 28 01:42:44 vm1774609080735-5833800-iaas kernel: NVRM: The NVIDIA Tesla V100-SXM2-32GB GPU installed in this system is
NVRM: supported through the NVIDIA 580.xx Legacy drivers. Please
NVRM: visit http://www.nvidia.com/object/unix.html for more
NVRM: information. The 595.58.03 NVIDIA driver will ignore
NVRM: this GPU. Continuing probe...
```

View File

@@ -0,0 +1,28 @@
---
# Monitoring stack with a Prometheus server plus the host-level and
# per-process exporters that run alongside it.
services:
  prometheus:
    image: docker.io/prom/prometheus:v3.9.1
    # Same restart policy as node_exporter so the whole stack comes back
    # together after a daemon or host restart.
    restart: unless-stopped
    ports:
      # Quoted so the mapping is always read as a string, never as a number.
      - '9090:9090'
    volumes:
      # Main config and the file-based service-discovery target list,
      # both mounted read-only.
      - './docker/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro'
      - './docker/prometheus/clients.yml:/etc/prometheus/clients.yml:ro'

  node_exporter:
    image: quay.io/prometheus/node-exporter:v1.10.2
    # The host root is mounted read-only at /host (see volumes) and the
    # exporter is pointed at it instead of the container filesystem.
    command: ['--path.rootfs=/host']
    # Shares the host network and PID namespaces; no port mapping is needed
    # because the exporter listens directly on the host.
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      - '/:/host:ro,rslave'

  process_exporter:
    image: docker.io/ncabatoff/process-exporter:v0.8.7
    privileged: true
    # Reads the host procfs mounted at /host/proc and the matcher
    # configuration mounted at /config.
    command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml']
    restart: unless-stopped
    ports:
      - '9256:9256'
    volumes:
      - '/proc:/host/proc:ro,rslave'
      - './docker/process_exporter:/config:ro'

View File

@@ -0,0 +1,5 @@
---
# process-exporter matcher configuration.
process_names:
  # A single matcher: track processes whose comm (short command name)
  # is any of the entries below.
  - comm:
      - bash
      - prometheus
      - vim

View File

@@ -0,0 +1,9 @@
---
# Scrape targets loaded by Prometheus through file-based service discovery.
# One target group covering every exporter endpoint on every host.
- targets:
    # Port 9100: presumably node_exporter on its default port (it runs with
    # host networking, so no port is declared in the compose files).
    - 'controller:9100'
    - 'cpu01:9100'
    - 'gpu01:9100'
    # Port 9256: process_exporter.
    - 'controller:9256'
    - 'cpu01:9256'
    - 'gpu01:9256'
    # Port 9835: nvidia_gpu_exporter (GPU host only).
    - 'gpu01:9835'
  # No extra labels are attached to this group.
  labels: {}

View File

@@ -0,0 +1,40 @@
---
# Prometheus server configuration.
global:
  # Defaults applied to every scrape job unless the job overrides them.
  scrape_interval: 1s
  evaluation_interval: 30s
  body_size_limit: 15MB
  sample_limit: 5000
  target_limit: 30
  label_limit: 30
  label_name_length_limit: 200
  label_value_length_limit: 200
  query_log_file: query.log
  scrape_failure_log_file: fail.log
  # scrape_timeout is intentionally left unset. Prometheus does not allow a
  # timeout longer than the scrape interval, so with scrape_interval: 1s the
  # effective timeout is capped at 1s rather than the usual 10s default.

runtime:
  gogc: 42

scrape_configs:
  # Exporter endpoints, discovered from the clients.yml target file
  # (re-read every 10 minutes).
  - job_name: node
    file_sd_configs:
      - files:
          - clients.yml
        refresh_interval: 10m

  # Prometheus scraping its own metrics endpoint.
  - job_name: prometheus
    static_configs:
      - targets: ["localhost:9090"]
        labels: {}
    honor_labels: true
    fallback_scrape_protocol: PrometheusText0.0.4
    # Per-job override of the global scrape failure log.
    scrape_failure_log_file: fail_prom.log

storage:
  tsdb:
    out_of_order_time_window: 30m
    retention:
      time: 1d
      size: 1GB

View File

@@ -0,0 +1,19 @@
---
# Exporter-only stack: host metrics and per-process metrics.
services:
  node_exporter:
    image: quay.io/prometheus/node-exporter:v1.10.2
    # The host root is mounted read-only at /host (see volumes) and the
    # exporter is pointed at it instead of the container filesystem.
    command: ['--path.rootfs=/host']
    # Shares the host network and PID namespaces; no port mapping is needed
    # because the exporter listens directly on the host.
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      - '/:/host:ro,rslave'

  process_exporter:
    image: docker.io/ncabatoff/process-exporter:v0.8.7
    privileged: true
    # Reads the host procfs mounted at /host/proc and the matcher
    # configuration mounted at /config.
    command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml']
    # Same restart policy as node_exporter so both exporters come back
    # together after a daemon or host restart.
    restart: unless-stopped
    ports:
      # Quoted so the mapping is always read as a string, never as a number.
      - '9256:9256'
    volumes:
      - '/proc:/host/proc:ro,rslave'
      - './docker/process_exporter:/config:ro'

View File

@@ -0,0 +1,5 @@
---
# process-exporter matcher configuration.
process_names:
  # A single matcher: track processes whose comm (short command name)
  # is any of the entries below.
  - comm:
      - bash
      - prometheus
      - vim

View File

@@ -0,0 +1,30 @@
---
# Exporter stack with GPU metrics: host metrics, per-process metrics and
# the NVIDIA GPU exporter.
services:
  node_exporter:
    image: quay.io/prometheus/node-exporter:v1.10.2
    # The host root is mounted read-only at /host (see volumes) and the
    # exporter is pointed at it instead of the container filesystem.
    command: ['--path.rootfs=/host']
    # Shares the host network and PID namespaces; no port mapping is needed
    # because the exporter listens directly on the host.
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      - '/:/host:ro,rslave'

  process_exporter:
    image: docker.io/ncabatoff/process-exporter:v0.8.7
    privileged: true
    # Reads the host procfs mounted at /host/proc and the matcher
    # configuration mounted at /config.
    command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml']
    # Same restart policy as node_exporter so all exporters come back
    # together after a daemon or host restart.
    restart: unless-stopped
    ports:
      # Quoted so the mapping is always read as a string, never as a number.
      - '9256:9256'
    volumes:
      - '/proc:/host/proc:ro,rslave'
      - './docker/process_exporter:/config:ro'

  nvidia_gpu_exporter:
    image: ghcr.io/utkuozdemir/nvidia_gpu_exporter:1.4.1-amd64
    restart: unless-stopped
    ports:
      - '9835:9835'
    # Pass the NVIDIA control device and the first GPU into the container.
    # NOTE(review): only /dev/nvidia0 is exposed; additional GPUs would
    # presumably need their own /dev/nvidiaN entries - confirm for this host.
    devices:
      - '/dev/nvidiactl:/dev/nvidiactl'
      - '/dev/nvidia0:/dev/nvidia0'
    # The host's nvidia-smi binary and NVML library are bind-mounted into
    # the container at the same paths.
    volumes:
      - '/usr/lib/x86_64-linux-gnu/libnvidia-ml.so:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so'
      - '/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1'
      - '/usr/bin/nvidia-smi:/usr/bin/nvidia-smi'

View File

@@ -0,0 +1,5 @@
---
# process-exporter matcher configuration.
process_names:
  # A single matcher: track processes whose comm (short command name)
  # is any of the entries below.
  - comm:
      - bash
      - prometheus
      - vim