commit a4566f179ca2e585e9af3044efe7b599232a58cd Author: Yi-Ting Shih Date: Sat Mar 28 02:59:29 2026 +0800 Feat: lab1 done diff --git a/README.md b/README.md new file mode 100644 index 0000000..bb85d70 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +# HPC Lab 1 + +- https://github.com/ncabatoff/process-exporter +- https://github.com/utkuozdemir/nvidia_gpu_exporter/blob/master/INSTALL.md +- https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/ + +You know what this means. + +``` +Mar 28 01:42:44 vm1774609080735-5833800-iaas kernel: NVRM: The NVIDIA Tesla V100-SXM2-32GB GPU installed in this system is + NVRM: supported through the NVIDIA 580.xx Legacy drivers. Please + NVRM: visit http://www.nvidia.com/object/unix.html for more + NVRM: information. The 595.58.03 NVIDIA driver will ignore + NVRM: this GPU. Continuing probe... +``` diff --git a/controller/monitoring/docker-compose.yaml b/controller/monitoring/docker-compose.yaml new file mode 100644 index 0000000..8029ed9 --- /dev/null +++ b/controller/monitoring/docker-compose.yaml @@ -0,0 +1,28 @@ +--- +services: + prometheus: + image: docker.io/prom/prometheus:v3.9.1 + ports: + - 9090:9090 + volumes: + - './docker/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro' + - './docker/prometheus/clients.yml:/etc/prometheus/clients.yml:ro' + + node_exporter: + image: quay.io/prometheus/node-exporter:v1.10.2 + command: ['--path.rootfs=/host'] + network_mode: host + pid: host + restart: unless-stopped + volumes: + - '/:/host:ro,rslave' + + process_exporter: + image: docker.io/ncabatoff/process-exporter:v0.8.7 + privileged: true + command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml'] + ports: + - 9256:9256 + volumes: + - '/proc:/host/proc:ro,rslave' + - './docker/process_exporter:/config:ro' diff --git a/controller/monitoring/docker/process_exporter/config.yml b/controller/monitoring/docker/process_exporter/config.yml new file mode 100644 index 0000000..b7837fe --- /dev/null 
+++ b/controller/monitoring/docker/process_exporter/config.yml @@ -0,0 +1,5 @@ +process_names: + - comm: + - bash + - prometheus + - vim diff --git a/controller/monitoring/docker/prometheus/clients.yml b/controller/monitoring/docker/prometheus/clients.yml new file mode 100644 index 0000000..3638a27 --- /dev/null +++ b/controller/monitoring/docker/prometheus/clients.yml @@ -0,0 +1,9 @@ +- targets: + - controller:9100 + - cpu01:9100 + - gpu01:9100 + - controller:9256 + - cpu01:9256 + - gpu01:9256 + - gpu01:9835 + labels: {} diff --git a/controller/monitoring/docker/prometheus/prometheus.yml b/controller/monitoring/docker/prometheus/prometheus.yml new file mode 100644 index 0000000..fa645e3 --- /dev/null +++ b/controller/monitoring/docker/prometheus/prometheus.yml @@ -0,0 +1,40 @@ +global: + scrape_interval: 1s + evaluation_interval: 30s + body_size_limit: 15MB + sample_limit: 5000 + target_limit: 30 + label_limit: 30 + label_name_length_limit: 200 + label_value_length_limit: 200 + query_log_file: query.log + scrape_failure_log_file: fail.log + # scrape_timeout is not set; Prometheus caps its 10s default at scrape_interval, so the effective timeout here is 1s.
+ +runtime: + gogc: 42 + +scrape_configs: + - job_name: node + + file_sd_configs: + - files: + - clients.yml + refresh_interval: 10m + + - job_name: prometheus + + static_configs: + - targets: ["localhost:9090"] + labels: {} + + honor_labels: true + fallback_scrape_protocol: PrometheusText0.0.4 + scrape_failure_log_file: fail_prom.log + +storage: + tsdb: + out_of_order_time_window: 30m + retention: + time: 1d + size: 1GB diff --git a/cpu01/monitoring/docker-compose.yaml b/cpu01/monitoring/docker-compose.yaml new file mode 100644 index 0000000..b95573e --- /dev/null +++ b/cpu01/monitoring/docker-compose.yaml @@ -0,0 +1,19 @@ +--- +services: + node_exporter: + image: quay.io/prometheus/node-exporter:v1.10.2 + command: ['--path.rootfs=/host'] + network_mode: host + pid: host + restart: unless-stopped + volumes: + - '/:/host:ro,rslave' + process_exporter: + image: docker.io/ncabatoff/process-exporter:v0.8.7 + privileged: true + command: ['-procfs', '/host/proc', '-config.path', '/config/config.yml'] + ports: + - 9256:9256 + volumes: + - '/proc:/host/proc:ro,rslave' + - './docker/process_exporter:/config:ro' diff --git a/cpu01/monitoring/docker/process_exporter/config.yml b/cpu01/monitoring/docker/process_exporter/config.yml new file mode 100644 index 0000000..b7837fe --- /dev/null +++ b/cpu01/monitoring/docker/process_exporter/config.yml @@ -0,0 +1,5 @@ +process_names: + - comm: + - bash + - prometheus + - vim diff --git a/gpu01/monitoring/docker-compose.yaml b/gpu01/monitoring/docker-compose.yaml new file mode 100644 index 0000000..ef0227d --- /dev/null +++ b/gpu01/monitoring/docker-compose.yaml @@ -0,0 +1,30 @@ +--- +services: + node_exporter: + image: quay.io/prometheus/node-exporter:v1.10.2 + command: ['--path.rootfs=/host'] + network_mode: host + pid: host + restart: unless-stopped + volumes: + - '/:/host:ro,rslave' + process_exporter: + image: docker.io/ncabatoff/process-exporter:v0.8.7 + privileged: true + command: ['-procfs', '/host/proc', '-config.path', 
'/config/config.yml'] + ports: + - 9256:9256 + volumes: + - '/proc:/host/proc:ro,rslave' + - './docker/process_exporter:/config:ro' + nvidia_gpu_exporter: + image: ghcr.io/utkuozdemir/nvidia_gpu_exporter:1.4.1-amd64 + ports: + - 9835:9835 + devices: + - '/dev/nvidiactl:/dev/nvidiactl' + - '/dev/nvidia0:/dev/nvidia0' + volumes: + - '/usr/lib/x86_64-linux-gnu/libnvidia-ml.so:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so' + - '/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1' + - '/usr/bin/nvidia-smi:/usr/bin/nvidia-smi' diff --git a/gpu01/monitoring/docker/process_exporter/config.yml b/gpu01/monitoring/docker/process_exporter/config.yml new file mode 100644 index 0000000..b7837fe --- /dev/null +++ b/gpu01/monitoring/docker/process_exporter/config.yml @@ -0,0 +1,5 @@ +process_names: + - comm: + - bash + - prometheus + - vim