Sat Dec 06 2025
Prometheus и Grafana: мониторинг инфраструктуры
Мониторинг — критически важная часть production-инфраструктуры. Prometheus + Grafana — стандартная связка для сбора метрик и визуализации. Prometheus собирает и хранит метрики, а Grafana предоставляет мощный интерфейс для создания дашбордов и алертов. Вместе они образуют полноценную систему мониторинга.
Архитектура
Prometheus использует pull-модель — сервер сам запрашивает метрики у целевых приложений. Это отличает его от систем с push-моделью.
┌─────────────────────────────────────────────────────────────┐
│ Prometheus Server │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │
│ │ ServiceDisc │ │ Storage │ │ Alert Manager │ │
│ └─────────────┘ └─────────────┘ └─────────────────────┘ │
└───────────────────────────┬─────────────────────────────────┘
│ scrape
┌─────────────────┼─────────────────┐
┌──────────┐ ┌──────────┐ ┌──────────┐
│ App 1 │ │ App 2 │ │ Node Ex │
│ (metrics)│ │ (metrics)│ │ (node) │
└──────────┘ └──────────┘ └──────────┘
│
┌──────────────────┐
│ Grafana │
│ (dashboards) │
└──────────────────┘
Prometheus
Установка
# Docker
docker run -d \
-p 9090:9090 \
-v prometheus.yml:/etc/prometheus/prometheus.yml \
prom/prometheus
# или через docker-compose
prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
rule_files:
- "alerts.yml"
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'node-exporter'
static_configs:
- targets: ['node-exporter:9100']
- job_name: 'myservice'
metrics_path: '/metrics'
static_configs:
- targets: ['myservice:8080']
Метрики и экспортеры
Node Exporter
# Установка
wget https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz
tar xzf node_exporter-1.7.0.linux-amd64.tar.gz
./node_exporter
# Метрики: cpu, memory, disk, network, load
MySQL Exporter
# docker-compose.yml
mysqld_exporter:
image: prom/mysqld-exporter
environment:
DATA_SOURCE_NAME: "user:password@(mysql:3306)/"
ports:
- "9104:9104"
Redis Exporter
redis_exporter:
image: oliver006/redis_exporter
environment:
REDIS_ADDR: "redis://redis:6379"
ports:
- "9121:9121"
Метрики в приложении
Go
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"net/http"
)
var (
requestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "http_requests_total",
Help: "Total HTTP requests",
},
[]string{"method", "path", "status"},
)
requestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "http_request_duration_seconds",
Help: "HTTP request duration",
Buckets: []float64{.005, .01, .025, .05, .0.1, .25, .5, 1},
},
[]string{"method", "path"},
)
)
func init() {
prometheus.MustRegister(requestsTotal, requestDuration)
}
func middleware(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
start := time.Now()
rw := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK}
next.ServeHTTP(rw, r)
duration := time.Since(start).Seconds()
requestsTotal.WithLabelValues(r.Method, r.URL.Path, fmt.Sprintf("%d", rw.statusCode)).Inc()
requestDuration.WithLabelValues(r.Method, r.URL.Path).Observe(duration)
})
}
func main() {
http.Handle("/metrics", promhttp.Handler())
http.ListenAndServe(":8080", middleware(router))
}
Node.js
const promClient = require('prom-client');
const register = new promClient.Registry();
promClient.collectDefaultMetrics({ register });
const httpRequestsTotal = new promClient.Counter({
name: 'http_requests_total',
help: 'Total HTTP requests',
labelNames: ['method', 'path', 'status'],
registers: [register]
});
const httpRequestDuration = new promClient.Histogram({
name: 'http_request_duration_seconds',
help: 'HTTP request duration',
labelNames: ['method', 'path'],
buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1],
registers: [register]
});
app.use((req, res, next) => {
const start = Date.now();
res.on('finish', () => {
const duration = (Date.now() - start) / 1000;
httpRequestsTotal.inc({
method: req.method,
path: req.route?.path || req.path,
status: res.statusCode
});
httpRequestDuration.observe({
method: req.method,
path: req.route?.path || req.path
}, duration);
});
next();
});
app.get('/metrics', async (req, res) => {
res.set('Content-Type', register.contentType);
res.end(await register.metrics());
});
Python
from prometheus_client import Counter, Histogram, generate_latest
from flask import Flask, Response
app = Flask(__name__)
http_requests_total = Counter(
'http_requests_total',
'Total HTTP requests',
['method', 'path', 'status']
)
http_request_duration = Histogram(
'http_request_duration_seconds',
'HTTP request duration',
['method', 'path'],
buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0)
)
@app.before_request
def before_request():
g.start_time = time.time()
@app.after_request
def after_request(response):
duration = time.time() - g.start_time
http_requests_total.labels(
method=request.method,
path=request.path,
status=response.status_code
).inc()
http_request_duration.labels(
method=request.method,
path=request.path
).observe(duration)
return response
@app.route('/metrics')
def metrics():
return Response(generate_latest(), mimetype='text/plain')
PromQL
Базовые запросы
# Все метрики типа
http_requests_total
# С фильтром по лейблам
http_requests_total{status="200"}
# Скорость запросов в секунду
rate(http_requests_total[5m])
# Процентили
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
# CPU использование
100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Использование памяти
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100
# Свободное место на диске
100 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100)
# Количество запущенных контейнеров
count(container_last_seen{name!=""})
Агрегации
# Сумма по лейблам
sum(rate(http_requests_total[5m]))
# Среднее
avg(rate(http_requests_total[5m]))
# Максимум
max(node_memory_MemTotal_bytes)
# По всем значениям лейбла
sum by (status) (rate(http_requests_total[5m]))
# Без определённого лейбла
sum without (instance) (rate(http_requests_total[5m]))
Alerting
alerts.yml
groups:
- name: alerts
rules:
- alert: HighErrorRate
expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
for: 5m
labels:
severity: critical
annotations:
summary: "High error rate detected"
description: "Error rate is {{ $value | printf \"%.2f\" }}"
- alert: HighCPUUsage
expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 10m
labels:
severity: warning
annotations:
summary: "High CPU usage"
description: "CPU usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}"
- alert: HighMemoryUsage
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
for: 10m
labels:
severity: warning
annotations:
summary: "High memory usage"
- alert: DiskSpaceLow
expr: 100 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100) > 90
for: 5m
labels:
severity: critical
annotations:
summary: "Disk space low"
Alertmanager
# alertmanager.yml
global:
resolve_timeout: 5m
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 12h
receiver: 'email'
routes:
- match:
severity: critical
receiver: 'slack-critical'
continue: true
receivers:
- name: 'email'
email_configs:
- to: 'alerts@example.com'
send_resolved: true
- name: 'slack-critical'
slack_configs:
- channel: '#alerts'
send_resolved: true
api_url: 'https://hooks.slack.com/services/xxx'
title: '🚨 Critical Alert'
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
Grafana
Подключение к Prometheus
#provisioning/datasources/datasources.yml
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: true
Dashboard JSON
{
"dashboard": {
"title": "Application Metrics",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "sum by (path) (rate(http_requests_total[5m]))",
"legendFormat": "{{path}}"
}
]
},
{
"title": "Error Rate",
"type": "stat",
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m]))"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{"value": 0, "color": "green"},
{"value": 0.01, "color": "red"}
]
}
}
}
},
{
"title": "Request Duration (p95)",
"type": "graph",
"gridPos": {"x": 12, "y": 4, "w": 12, "h": 8},
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (le, path) (rate(http_request_duration_seconds_bucket[5m])))",
"legendFormat": "p95 - {{path}}"
}
]
}
]
}
}
Полезные переменные
{
"templating": {
"list": [
{
"name": "instance",
"type": "query",
"query": "label_values(node_cpu_seconds_total, instance)",
"refresh": 1
},
{
"name": "service",
"type": "query",
"query": "label_values(http_requests_total, service)",
"refresh": 1
}
]
}
}
Примеры панелей
# CPU Load
100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[$__rate_interval])) * 100)
# Memory Usage
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100
# Network Traffic
sum by (instance) (rate(node_network_receive_bytes_total[5m]))
# Disk I/O
rate(node_disk_reads_completed_total[5m])
rate(node_disk_writes_completed_total[5m])
# Uptime
time() - process_start_time_seconds{service="$service"}
Docker Compose полного стека
version: '3.8'
services:
prometheus:
image: prom/prometheus:latest
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
- ./alerts.yml:/etc/prometheus/alerts.yml
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
ports:
- "9090:9090"
alertmanager:
image: prom/alertmanager:latest
volumes:
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
ports:
- "9093:9093"
node-exporter:
image: prom/node-exporter:latest
ports:
- "9100:9100"
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
grafana:
image: grafana/grafana:latest
volumes:
- ./grafana/provisioning:/etc/grafana/provisioning
- grafana_data:/var/lib/grafana
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
- GF_USERS_ALLOW_SIGN_UP=false
ports:
- "3000:3000"
volumes:
prometheus_data:
grafana_data:
Best Practices
- Именование метрик — используйте snake_case, добавляйте суффиксы (_total, _seconds)
- Лейблы — не используйте слишком много кардинальных значений
- Alerts — настраивайте for, избегайте ложных срабатываний
- Retention — настраивайте период хранения данных
- Dashboards — создавайте иерархию (overview → service → instance)
- Recording rules — для часто запрашиваемых запросов
Заключение
Prometheus + Grafana — мощный стек для мониторинга. Правильная настройка метрик, алертинга и дашбордов позволяет оперативно обнаруживать и решать проблемы в production.