Feat: lab1 done
This commit is contained in:
15
README.md
Normal file
15
README.md
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
# HPC Lab 1
|
||||||
|
|
||||||
|
- https://github.com/ncabatoff/process-exporter
|
||||||
|
- https://github.com/utkuozdemir/nvidia_gpu_exporter/blob/master/INSTALL.md
|
||||||
|
- https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/
|
||||||
|
|
||||||
|
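The installed 595.58.03 NVIDIA driver does not support the Tesla V100 in this machine; per the kernel log below, the GPU needs the 580.xx legacy driver branch instead: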
|
||||||
|
|
||||||
|
```
|
||||||
|
Mar 28 01:42:44 vm1774609080735-5833800-iaas kernel: NVRM: The NVIDIA Tesla V100-SXM2-32GB GPU installed in this system is
|
||||||
|
NVRM: supported through the NVIDIA 580.xx Legacy drivers. Please
|
||||||
|
NVRM: visit http://www.nvidia.com/object/unix.html for more
|
||||||
|
NVRM: information. The 595.58.03 NVIDIA driver will ignore
|
||||||
|
NVRM: this GPU. Continuing probe...
|
||||||
|
```
|
||||||
28
controller/monitoring/docker-compose.yaml
Normal file
28
controller/monitoring/docker-compose.yaml
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
---
# Monitoring stack for the controller host: Prometheus plus the local
# node and process exporters.
services:
  # Prometheus server. Its main config and the file-SD target list are
  # bind-mounted read-only from ./docker/prometheus.
  # NOTE(review): no data volume and no restart policy are declared here,
  # unlike node_exporter below - confirm that is intentional.
  prometheus:
    image: docker.io/prom/prometheus:v3.9.1
    ports:
      # Quoted so the HOST:CONTAINER pair is always read as a string.
      - '9090:9090'
    volumes:
      - './docker/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro'
      - './docker/prometheus/clients.yml:/etc/prometheus/clients.yml:ro'

  # Host metrics. Uses the host network and PID namespaces and sees the
  # host root filesystem read-only under /host.
  node_exporter:
    image: quay.io/prometheus/node-exporter:v1.10.2
    command: ['--path.rootfs=/host']
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      - '/:/host:ro,rslave'

  # Per-process metrics. Reads the host's /proc (mounted at /host/proc) and
  # the matcher config from ./docker/process_exporter/config.yml.
  process_exporter:
    image: docker.io/ncabatoff/process-exporter:v0.8.7
    privileged: true
    command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml']
    ports:
      - '9256:9256'
    volumes:
      - '/proc:/host/proc:ro,rslave'
      - './docker/process_exporter:/config:ro'
|
||||||
5
controller/monitoring/docker/process_exporter/config.yml
Normal file
5
controller/monitoring/docker/process_exporter/config.yml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# process-exporter matcher config: a single matcher that selects processes
# by command name. The three names must be nested under `comm`; otherwise
# they become separate (invalid) entries of `process_names`.
process_names:
  - comm:
      - bash
      - prometheus
      - vim
|
||||||
9
controller/monitoring/docker/prometheus/clients.yml
Normal file
9
controller/monitoring/docker/prometheus/clients.yml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# File-based service discovery list, referenced from prometheus.yml via
# file_sd_configs. One target group with no extra labels.
# Ports: 9100 and 9256 appear on every host, 9835 only on gpu01.
# NOTE(review): these host names are resolved by the Prometheus process
# itself - confirm they resolve from inside its container.
- targets:
    - controller:9100
    - cpu01:9100
    - gpu01:9100
    - controller:9256
    - cpu01:9256
    - gpu01:9256
    - gpu01:9835
  labels: {}
|
||||||
40
controller/monitoring/docker/prometheus/prometheus.yml
Normal file
40
controller/monitoring/docker/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
# Prometheus server configuration.
global:
  scrape_interval: 1s
  evaluation_interval: 30s
  body_size_limit: 15MB
  sample_limit: 5000
  target_limit: 30
  label_limit: 30
  label_name_length_limit: 200
  label_value_length_limit: 200
  query_log_file: query.log
  scrape_failure_log_file: fail.log
  # scrape_timeout is left unset. The documented default is 10s, but a
  # timeout may not exceed scrape_interval, so with the 1s interval above
  # the effective timeout is expected to be 1s, not 10s.
  # NOTE(review): confirm against the running server's /config page.

runtime:
  gogc: 42

scrape_configs:
  # Every target listed in clients.yml is scraped under this one job.
  # NOTE(review): that file also lists ports 9256 and 9835, so the job
  # name "node" covers more than the :9100 targets - confirm intended.
  - job_name: node

    file_sd_configs:
      - files:
          - clients.yml
        refresh_interval: 10m

  # Prometheus scraping its own metrics endpoint.
  - job_name: prometheus

    static_configs:
      - targets: ["localhost:9090"]
        labels: {}

    honor_labels: true
    fallback_scrape_protocol: PrometheusText0.0.4
    scrape_failure_log_file: fail_prom.log

storage:
  tsdb:
    out_of_order_time_window: 30m
    retention:
      time: 1d
      size: 1GB
|
||||||
19
cpu01/monitoring/docker-compose.yaml
Normal file
19
cpu01/monitoring/docker-compose.yaml
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
---
# Exporters running on cpu01: host metrics and per-process metrics.
services:
  # Host metrics. Uses the host network and PID namespaces and sees the
  # host root filesystem read-only under /host.
  node_exporter:
    image: quay.io/prometheus/node-exporter:v1.10.2
    command: ['--path.rootfs=/host']
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      - '/:/host:ro,rslave'
  # Per-process metrics. Reads the host's /proc (mounted at /host/proc) and
  # the matcher config from ./docker/process_exporter/config.yml.
  # NOTE(review): no restart policy here, unlike node_exporter - confirm.
  process_exporter:
    image: docker.io/ncabatoff/process-exporter:v0.8.7
    privileged: true
    command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml']
    ports:
      # Quoted so the HOST:CONTAINER pair is always read as a string.
      - '9256:9256'
    volumes:
      - '/proc:/host/proc:ro,rslave'
      - './docker/process_exporter:/config:ro'
|
||||||
5
cpu01/monitoring/docker/process_exporter/config.yml
Normal file
5
cpu01/monitoring/docker/process_exporter/config.yml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# process-exporter matcher config: a single matcher that selects processes
# by command name. The three names must be nested under `comm`; otherwise
# they become separate (invalid) entries of `process_names`.
process_names:
  - comm:
      - bash
      - prometheus
      - vim
|
||||||
30
gpu01/monitoring/docker-compose.yaml
Normal file
30
gpu01/monitoring/docker-compose.yaml
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
---
# Exporters running on gpu01: host metrics, per-process metrics and
# NVIDIA GPU metrics.
services:
  # Host metrics. Uses the host network and PID namespaces and sees the
  # host root filesystem read-only under /host.
  node_exporter:
    image: quay.io/prometheus/node-exporter:v1.10.2
    command: ['--path.rootfs=/host']
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      - '/:/host:ro,rslave'
  # Per-process metrics. Reads the host's /proc (mounted at /host/proc) and
  # the matcher config from ./docker/process_exporter/config.yml.
  process_exporter:
    image: docker.io/ncabatoff/process-exporter:v0.8.7
    privileged: true
    command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml']
    ports:
      # Quoted so the HOST:CONTAINER pair is always read as a string.
      - '9256:9256'
    volumes:
      - '/proc:/host/proc:ro,rslave'
      - './docker/process_exporter:/config:ro'
  # GPU metrics. The container is given the host's NVIDIA device nodes plus
  # the host's nvidia-smi binary and NVML library, so it depends on a
  # working host driver installation.
  # NOTE(review): only /dev/nvidia0 is passed through - add more device
  # entries if the host has more than one GPU.
  nvidia_gpu_exporter:
    image: ghcr.io/utkuozdemir/nvidia_gpu_exporter:1.4.1-amd64
    ports:
      - '9835:9835'
    devices:
      - '/dev/nvidiactl:/dev/nvidiactl'
      - '/dev/nvidia0:/dev/nvidia0'
    volumes:
      - '/usr/lib/x86_64-linux-gnu/libnvidia-ml.so:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so'
      - '/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1'
      - '/usr/bin/nvidia-smi:/usr/bin/nvidia-smi'
|
||||||
5
gpu01/monitoring/docker/process_exporter/config.yml
Normal file
5
gpu01/monitoring/docker/process_exporter/config.yml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# process-exporter matcher config: a single matcher that selects processes
# by command name. The three names must be nested under `comm`; otherwise
# they become separate (invalid) entries of `process_names`.
process_names:
  - comm:
      - bash
      - prometheus
      - vim
|
||||||
Reference in New Issue
Block a user