Feat: lab1 done
This commit is contained in:
15
README.md
Normal file
15
README.md
Normal file
@@ -0,0 +1,15 @@
|
||||
# HPC Lab 1
|
||||
|
||||
- https://github.com/ncabatoff/process-exporter
|
||||
- https://github.com/utkuozdemir/nvidia_gpu_exporter/blob/master/INSTALL.md
|
||||
- https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/
|
||||
|
||||
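The kernel log below shows why the GPU is not usable with the installed driver: NVIDIA driver 595.58.03 ignores the Tesla V100, which is only supported by the 580.xx legacy driver branch.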
|
||||
|
||||
```
|
||||
Mar 28 01:42:44 vm1774609080735-5833800-iaas kernel: NVRM: The NVIDIA Tesla V100-SXM2-32GB GPU installed in this system is
|
||||
NVRM: supported through the NVIDIA 580.xx Legacy drivers. Please
|
||||
NVRM: visit http://www.nvidia.com/object/unix.html for more
|
||||
NVRM: information. The 595.58.03 NVIDIA driver will ignore
|
||||
NVRM: this GPU. Continuing probe...
|
||||
```
|
||||
28
controller/monitoring/docker-compose.yaml
Normal file
28
controller/monitoring/docker-compose.yaml
Normal file
@@ -0,0 +1,28 @@
|
||||
---
# Monitoring stack for the controller node: the Prometheus server plus the
# node and process exporters for this host.
services:
  prometheus:
    image: docker.io/prom/prometheus:v3.9.1
    # Same restart policy as the exporters, so the server comes back after a
    # host reboot or daemon restart.
    restart: unless-stopped
    ports:
      - '9090:9090'
    volumes:
      # Main config and the file_sd target list, both mounted read-only.
      - './docker/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro'
      - './docker/prometheus/clients.yml:/etc/prometheus/clients.yml:ro'

  node_exporter:
    image: quay.io/prometheus/node-exporter:v1.10.2
    # The host root filesystem is mounted at /host (see volumes below).
    command: ['--path.rootfs=/host']
    # Host network and PID namespaces: no `ports` mapping is needed here.
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      - '/:/host:ro,rslave'

  process_exporter:
    image: docker.io/ncabatoff/process-exporter:v0.8.7
    privileged: true
    # Read the host's procfs from /host/proc and the matcher config from the
    # read-only /config mount.
    command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml']
    restart: unless-stopped
    ports:
      - '9256:9256'
    volumes:
      - '/proc:/host/proc:ro,rslave'
      - './docker/process_exporter:/config:ro'
|
||||
5
controller/monitoring/docker/process_exporter/config.yml
Normal file
5
controller/monitoring/docker/process_exporter/config.yml
Normal file
@@ -0,0 +1,5 @@
|
||||
---
# process-exporter matcher configuration: only processes whose comm (short
# command name) appears in the list below are tracked.
process_names:
  - comm:
      - bash
      - prometheus
      - vim
|
||||
9
controller/monitoring/docker/prometheus/clients.yml
Normal file
9
controller/monitoring/docker/prometheus/clients.yml
Normal file
@@ -0,0 +1,9 @@
|
||||
---
# File-based service discovery target group, loaded by the `node` job in
# prometheus.yml. All exporters on all hosts share this single group.
- targets:
    # Port 9100 on every host (node_exporter's default listen port).
    - 'controller:9100'
    - 'cpu01:9100'
    - 'gpu01:9100'
    # Port 9256 on every host (published by the process_exporter service).
    - 'controller:9256'
    - 'cpu01:9256'
    - 'gpu01:9256'
    # Port 9835 on the GPU host (published by nvidia_gpu_exporter).
    - 'gpu01:9835'
  labels: {}
|
||||
40
controller/monitoring/docker/prometheus/prometheus.yml
Normal file
40
controller/monitoring/docker/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,40 @@
|
||||
---
# Prometheus server configuration for the lab cluster.
global:
  scrape_interval: 1s
  # The timeout may not exceed the scrape interval, so the 10s default cannot
  # apply with a 1s interval. State the effective value explicitly.
  scrape_timeout: 1s
  evaluation_interval: 30s
  # Per-scrape safety limits, applied to every job unless overridden.
  body_size_limit: 15MB
  sample_limit: 5000
  target_limit: 30
  label_limit: 30
  label_name_length_limit: 200
  label_value_length_limit: 200
  query_log_file: query.log
  scrape_failure_log_file: fail.log

runtime:
  # Go garbage-collector target percentage (GOGC).
  gogc: 42

scrape_configs:
  # All remote exporters; the targets come from clients.yml and are re-read
  # every refresh_interval.
  - job_name: node
    file_sd_configs:
      - files:
          - clients.yml
        refresh_interval: 10m

  # Prometheus scraping its own metrics endpoint.
  - job_name: prometheus
    static_configs:
      - targets: ['localhost:9090']
        labels: {}
    honor_labels: true
    fallback_scrape_protocol: PrometheusText0.0.4
    scrape_failure_log_file: fail_prom.log

storage:
  tsdb:
    out_of_order_time_window: 30m
    retention:
      time: 1d
      size: 1GB
|
||||
19
cpu01/monitoring/docker-compose.yaml
Normal file
19
cpu01/monitoring/docker-compose.yaml
Normal file
@@ -0,0 +1,19 @@
|
||||
---
# Exporters for the cpu01 node: host metrics and per-process metrics.
services:
  node_exporter:
    image: quay.io/prometheus/node-exporter:v1.10.2
    # The host root filesystem is mounted at /host (see volumes below).
    command: ['--path.rootfs=/host']
    # Host network and PID namespaces: no `ports` mapping is needed here.
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      - '/:/host:ro,rslave'

  process_exporter:
    image: docker.io/ncabatoff/process-exporter:v0.8.7
    privileged: true
    # Read the host's procfs from /host/proc and the matcher config from the
    # read-only /config mount.
    command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml']
    # Same restart policy as node_exporter, so both exporters come back after
    # a host reboot or daemon restart.
    restart: unless-stopped
    ports:
      - '9256:9256'
    volumes:
      - '/proc:/host/proc:ro,rslave'
      - './docker/process_exporter:/config:ro'
|
||||
5
cpu01/monitoring/docker/process_exporter/config.yml
Normal file
5
cpu01/monitoring/docker/process_exporter/config.yml
Normal file
@@ -0,0 +1,5 @@
|
||||
---
# process-exporter matcher configuration: only processes whose comm (short
# command name) appears in the list below are tracked.
process_names:
  - comm:
      - bash
      - prometheus
      - vim
|
||||
30
gpu01/monitoring/docker-compose.yaml
Normal file
30
gpu01/monitoring/docker-compose.yaml
Normal file
@@ -0,0 +1,30 @@
|
||||
---
# Exporters for the gpu01 node: host metrics, per-process metrics and NVIDIA
# GPU metrics.
services:
  node_exporter:
    image: quay.io/prometheus/node-exporter:v1.10.2
    # The host root filesystem is mounted at /host (see volumes below).
    command: ['--path.rootfs=/host']
    # Host network and PID namespaces: no `ports` mapping is needed here.
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      - '/:/host:ro,rslave'

  process_exporter:
    image: docker.io/ncabatoff/process-exporter:v0.8.7
    privileged: true
    # Read the host's procfs from /host/proc and the matcher config from the
    # read-only /config mount.
    command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml']
    # Same restart policy as node_exporter, so every exporter comes back
    # after a host reboot or daemon restart.
    restart: unless-stopped
    ports:
      - '9256:9256'
    volumes:
      - '/proc:/host/proc:ro,rslave'
      - './docker/process_exporter:/config:ro'

  nvidia_gpu_exporter:
    image: ghcr.io/utkuozdemir/nvidia_gpu_exporter:1.4.1-amd64
    restart: unless-stopped
    ports:
      - '9835:9835'
    # Pass the NVIDIA control device and the first GPU through to the
    # container.
    devices:
      - '/dev/nvidiactl:/dev/nvidiactl'
      - '/dev/nvidia0:/dev/nvidia0'
    # Bind-mount the host's NVML library and nvidia-smi binary into the
    # container at the same paths.
    volumes:
      - '/usr/lib/x86_64-linux-gnu/libnvidia-ml.so:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so'
      - '/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1'
      - '/usr/bin/nvidia-smi:/usr/bin/nvidia-smi'
|
||||
5
gpu01/monitoring/docker/process_exporter/config.yml
Normal file
5
gpu01/monitoring/docker/process_exporter/config.yml
Normal file
@@ -0,0 +1,5 @@
|
||||
---
# process-exporter matcher configuration: only processes whose comm (short
# command name) appears in the list below are tracked.
process_names:
  - comm:
      - bash
      - prometheus
      - vim
|
||||
Reference in New Issue
Block a user