feat(observability): Add Prometheus metrics for monitoring #209

Closed
opened 2026-01-24 17:15:05 +00:00 by jack · 0 comments
Owner

Problem

Keine Metriken für:

  • Task Processing Zeit pro Typ
  • Worker Utilization
  • Database Query Performance
  • Queue Depth Trends
  • Error Rates

Performance-Probleme bleiben unbemerkt, bis User sich beschweren.

Lösung

1. Metrics Registry

// packages/backend/src/metrics/index.ts
import { Registry, Counter, Histogram, Gauge } from 'prom-client';

/** Dedicated registry; every metric declared below registers itself here. */
export const registry = new Registry();

// Histogram bucket upper bounds, expressed in milliseconds.
const httpDurationBucketsMs = [10, 50, 100, 200, 500, 1000, 2000, 5000];
const taskDurationBucketsMs = [100, 500, 1000, 2000, 5000, 10000, 30000];
const dbQueryDurationBucketsMs = [1, 5, 10, 25, 50, 100, 250, 500];

// ---------------------------------------------------------------------------
// API metrics
// ---------------------------------------------------------------------------

/** HTTP request duration histogram (ms), labelled by method, path and status. */
export const httpRequestDuration = new Histogram({
  registers: [registry],
  name: 'http_request_duration_ms',
  help: 'HTTP request duration in milliseconds',
  labelNames: ['method', 'path', 'status'],
  buckets: httpDurationBucketsMs,
});

/** HTTP request counter, carrying the same labels as the duration histogram. */
export const httpRequestTotal = new Counter({
  registers: [registry],
  name: 'http_request_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'path', 'status'],
});

// ---------------------------------------------------------------------------
// Task metrics
// ---------------------------------------------------------------------------

/** Task processing duration histogram (ms), labelled by task type and status. */
export const taskDuration = new Histogram({
  registers: [registry],
  name: 'task_duration_ms',
  help: 'Task processing duration in milliseconds',
  labelNames: ['type', 'status'],
  buckets: taskDurationBucketsMs,
});

/** Gauge for the number of queued tasks, one series per status. */
export const taskQueueDepth = new Gauge({
  registers: [registry],
  name: 'task_queue_depth',
  help: 'Current number of tasks in queue',
  labelNames: ['status'],
});

/** Counter of processed tasks, labelled by task type and status. */
export const taskTotal = new Counter({
  registers: [registry],
  name: 'task_total',
  help: 'Total tasks processed',
  labelNames: ['type', 'status'],
});

// ---------------------------------------------------------------------------
// Worker metrics
// ---------------------------------------------------------------------------

/** Gauge for the number of workers, one series per status. */
export const workerCount = new Gauge({
  registers: [registry],
  name: 'worker_count',
  help: 'Number of connected workers',
  labelNames: ['status'],
});

/** Unlabelled gauge for the number of workers currently processing a task. */
export const workerBusy = new Gauge({
  registers: [registry],
  name: 'worker_busy_count',
  help: 'Number of workers currently processing tasks',
});

// ---------------------------------------------------------------------------
// Database metrics
// ---------------------------------------------------------------------------

/** Database query duration histogram (ms), labelled by operation and table. */
export const dbQueryDuration = new Histogram({
  registers: [registry],
  name: 'db_query_duration_ms',
  help: 'Database query duration in milliseconds',
  labelNames: ['operation', 'table'],
  buckets: dbQueryDurationBucketsMs,
});

// ---------------------------------------------------------------------------
// Memory metrics
// ---------------------------------------------------------------------------

/** Gauge for the number of observations, one series per project. */
export const observationCount = new Gauge({
  registers: [registry],
  name: 'observation_count',
  help: 'Total number of observations',
  labelNames: ['project'],
});

/** Gauge for the number of sessions, one series per status. */
export const sessionCount = new Gauge({
  registers: [registry],
  name: 'session_count',
  help: 'Total number of sessions',
  labelNames: ['status'],
});

2. Metrics Middleware

// packages/backend/src/middleware/metrics.ts
/**
 * Express middleware that records one latency observation and one request
 * count for every response that emits `finish`.
 *
 * The elapsed time is measured with the monotonic `process.hrtime.bigint()`
 * clock rather than `Date.now()`: wall-clock time can jump (NTP sync, manual
 * changes), which would produce negative or inflated durations in the
 * histogram.
 *
 * @param req  Incoming request; its method and path become metric labels.
 * @param res  Outgoing response; its status code becomes a metric label.
 * @param next Passes control to the next handler; called synchronously.
 */
export function metricsMiddleware(req: Request, res: Response, next: NextFunction): void {
  const startNs = process.hrtime.bigint();

  res.on('finish', () => {
    // Nanoseconds → (fractional) milliseconds, matching the *_ms metric unit.
    const durationMs = Number(process.hrtime.bigint() - startNs) / 1e6;
    const labels = {
      method: req.method,
      // normalizePath is defined elsewhere; it is expected to collapse IDs so
      // label cardinality stays bounded.
      path: normalizePath(req.path),  // /api/data/sessions/123 → /api/data/sessions/:id
      status: String(res.statusCode),
    };

    httpRequestDuration.observe(labels, durationMs);
    httpRequestTotal.inc(labels);
  });

  next();
}

3. Metrics Endpoint

// packages/backend/src/routes/metrics.ts
/**
 * GET /metrics — refreshes the gauges, then responds with the registry's
 * serialized metrics using the registry's own content type.
 *
 * Any failure while refreshing or serializing is forwarded to the error
 * handling chain via `next(err)`; a rejected promise from an async handler is
 * otherwise not guaranteed to be caught by the framework, which would leave
 * the scrape request hanging.
 */
router.get('/metrics', async (_req, res, next) => {
  try {
    // Update gauges before returning
    await updateGauges();

    res.set('Content-Type', registry.contentType);
    res.send(await registry.metrics());
  } catch (err: unknown) {
    next(err);
  }
});

/**
 * Refreshes the gauges for task queue depth, worker counts and session
 * counts from their current sources.
 *
 * The two repository lookups are independent of each other, so they are
 * issued concurrently. If either rejects, the returned promise rejects and
 * no gauge is updated.
 */
async function updateGauges(): Promise<void> {
  const [queueStats, sessionStats] = await Promise.all([
    taskRepo.countByStatus(),
    sessionRepo.countByStatus(),
  ]);

  // Task queue depths.
  // NOTE(review): `?? 0` assumes countByStatus() may omit a status that has
  // no rows; a missing key would otherwise hand `undefined` to Gauge.set().
  taskQueueDepth.set({ status: 'pending' }, queueStats.pending ?? 0);
  taskQueueDepth.set({ status: 'processing' }, queueStats.processing ?? 0);
  taskQueueDepth.set({ status: 'failed' }, queueStats.failed ?? 0);

  // Worker counts
  const workers = workerHub.getWorkers();
  workerCount.set({ status: 'connected' }, workers.length);
  workerBusy.set(workers.filter((w) => w.busy).length);

  // Session counts
  sessionCount.set({ status: 'active' }, sessionStats.active ?? 0);
  sessionCount.set({ status: 'completed' }, sessionStats.completed ?? 0);
}

4. Task Metrics Integration

// TaskDispatcher
/**
 * Records completion metrics for a task: increments the task counter and,
 * when a usable assignment timestamp exists, observes the elapsed time since
 * assignment in the duration histogram (both with status `completed`).
 *
 * Metrics are skipped instead of throwing when the task lookup yields
 * nothing or the timestamp is unusable, so instrumentation cannot break
 * completion handling.
 *
 * @param taskId Identifier used to look the task up in the repository.
 * @param result Task result; not used for metrics.
 */
async handleTaskCompleted(taskId: string, result: unknown): Promise<void> {
  const task = await this.taskRepo.findById(taskId);

  if (task) {
    const labels = { type: task.type, status: 'completed' };

    // NOTE(review): the type of `assignedAt` is not visible here; going
    // through `new Date(...)` accepts epoch milliseconds, Date objects and
    // ISO strings — confirm against the task model.
    if (task.assignedAt != null) {
      const elapsedMs = Date.now() - new Date(task.assignedAt).getTime();
      if (Number.isFinite(elapsedMs) && elapsedMs >= 0) {
        taskDuration.observe(labels, elapsedMs);
      }
    }

    taskTotal.inc(labels);
  }
}

/**
 * Records failure metrics for a task: increments the task counter with
 * status `failed` and, when a usable assignment timestamp exists, observes
 * the elapsed time since assignment in the duration histogram under the same
 * labels, so the histogram's `status` label also covers failed tasks.
 *
 * Metrics are skipped instead of throwing when the task lookup yields
 * nothing or the timestamp is unusable.
 *
 * @param taskId Identifier used to look the task up in the repository.
 * @param error  Failure cause; not used for metrics.
 */
async handleTaskFailed(taskId: string, error: Error): Promise<void> {
  const task = await this.taskRepo.findById(taskId);

  if (task) {
    const labels = { type: task.type, status: 'failed' };

    // A task may fail before it was ever assigned, so the timestamp is optional.
    // NOTE(review): the type of `assignedAt` is not visible here; confirm
    // against the task model.
    if (task.assignedAt != null) {
      const elapsedMs = Date.now() - new Date(task.assignedAt).getTime();
      if (Number.isFinite(elapsedMs) && elapsedMs >= 0) {
        taskDuration.observe(labels, elapsedMs);
      }
    }

    taskTotal.inc(labels);
  }
}

Beispiel Prometheus Queries

# Request Latency P99 (one result per method/path/status combination)
histogram_quantile(0.99, rate(http_request_duration_ms_bucket[5m]))

# Task Processing Rate
rate(task_total[5m])

# Queue Depth Trend
task_queue_depth{status="pending"}

# Error Rate
# Both sides are aggregated first: without sum(), each 5xx series is matched
# one-to-one against itself and the ratio is always 1.
sum(rate(http_request_total{status=~"5.."}[5m])) / sum(rate(http_request_total[5m]))

# Worker Utilization
# worker_busy_count has no "status" label, so the label must be ignored for
# the two sides to match; otherwise the result is empty.
worker_busy_count / ignoring(status) worker_count{status="connected"}

Grafana Dashboard (Optional)

{
  "title": "Claude-Mem Overview",
  "panels": [
    { "title": "Request Rate", "expr": "rate(http_request_total[5m])" },
    { "title": "Request Latency P95", "expr": "histogram_quantile(0.95, rate(http_request_duration_ms_bucket[5m]))" },
    { "title": "Task Queue Depth", "expr": "task_queue_depth" },
    { "title": "Worker Utilization", "expr": "worker_busy_count / ignoring(status) worker_count{status=\"connected\"}" },
    { "title": "Error Rate", "expr": "rate(http_request_total{status=~\"5..\"}[5m])" }
  ]
}

Akzeptanzkriterien

  • Prometheus Client Library integriert
  • HTTP Request Metrics
  • Task Processing Metrics
  • Worker Metrics
  • /metrics Endpoint
  • Dokumentation der verfügbaren Metriken
  • Optional: Grafana Dashboard Template
## Problem Keine Metriken für: - Task Processing Zeit pro Typ - Worker Utilization - Database Query Performance - Queue Depth Trends - Error Rates Performance-Probleme bleiben unbemerkt bis User sich beschweren. ## Lösung ### 1. Metrics Registry ```typescript // packages/backend/src/metrics/index.ts import { Registry, Counter, Histogram, Gauge } from 'prom-client'; export const registry = new Registry(); // API Metrics export const httpRequestDuration = new Histogram({ name: 'http_request_duration_ms', help: 'HTTP request duration in milliseconds', labelNames: ['method', 'path', 'status'], buckets: [10, 50, 100, 200, 500, 1000, 2000, 5000], registers: [registry], }); export const httpRequestTotal = new Counter({ name: 'http_request_total', help: 'Total HTTP requests', labelNames: ['method', 'path', 'status'], registers: [registry], }); // Task Metrics export const taskDuration = new Histogram({ name: 'task_duration_ms', help: 'Task processing duration in milliseconds', labelNames: ['type', 'status'], buckets: [100, 500, 1000, 2000, 5000, 10000, 30000], registers: [registry], }); export const taskQueueDepth = new Gauge({ name: 'task_queue_depth', help: 'Current number of tasks in queue', labelNames: ['status'], registers: [registry], }); export const taskTotal = new Counter({ name: 'task_total', help: 'Total tasks processed', labelNames: ['type', 'status'], registers: [registry], }); // Worker Metrics export const workerCount = new Gauge({ name: 'worker_count', help: 'Number of connected workers', labelNames: ['status'], registers: [registry], }); export const workerBusy = new Gauge({ name: 'worker_busy_count', help: 'Number of workers currently processing tasks', registers: [registry], }); // Database Metrics export const dbQueryDuration = new Histogram({ name: 'db_query_duration_ms', help: 'Database query duration in milliseconds', labelNames: ['operation', 'table'], buckets: [1, 5, 10, 25, 50, 100, 250, 500], registers: [registry], }); // Memory Metrics export 
const observationCount = new Gauge({ name: 'observation_count', help: 'Total number of observations', labelNames: ['project'], registers: [registry], }); export const sessionCount = new Gauge({ name: 'session_count', help: 'Total number of sessions', labelNames: ['status'], registers: [registry], }); ``` ### 2. Metrics Middleware ```typescript // packages/backend/src/middleware/metrics.ts export function metricsMiddleware(req: Request, res: Response, next: NextFunction) { const start = Date.now(); res.on('finish', () => { const duration = Date.now() - start; const labels = { method: req.method, path: normalizePath(req.path), // /api/data/sessions/123 → /api/data/sessions/:id status: String(res.statusCode), }; httpRequestDuration.observe(labels, duration); httpRequestTotal.inc(labels); }); next(); } ``` ### 3. Metrics Endpoint ```typescript // packages/backend/src/routes/metrics.ts router.get('/metrics', async (req, res) => { // Update gauges before returning await updateGauges(); res.set('Content-Type', registry.contentType); res.send(await registry.metrics()); }); async function updateGauges() { // Task queue depths const queueStats = await taskRepo.countByStatus(); taskQueueDepth.set({ status: 'pending' }, queueStats.pending); taskQueueDepth.set({ status: 'processing' }, queueStats.processing); taskQueueDepth.set({ status: 'failed' }, queueStats.failed); // Worker counts const workers = workerHub.getWorkers(); workerCount.set({ status: 'connected' }, workers.length); workerBusy.set(workers.filter(w => w.busy).length); // Session counts const sessionStats = await sessionRepo.countByStatus(); sessionCount.set({ status: 'active' }, sessionStats.active); sessionCount.set({ status: 'completed' }, sessionStats.completed); } ``` ### 4. 
Task Metrics Integration ```typescript // TaskDispatcher async handleTaskCompleted(taskId: string, result: unknown) { const task = await this.taskRepo.findById(taskId); const duration = Date.now() - task.assignedAt; taskDuration.observe({ type: task.type, status: 'completed' }, duration); taskTotal.inc({ type: task.type, status: 'completed' }); } async handleTaskFailed(taskId: string, error: Error) { const task = await this.taskRepo.findById(taskId); taskTotal.inc({ type: task.type, status: 'failed' }); } ``` ## Beispiel Prometheus Queries ```promql # Request Latency P99 histogram_quantile(0.99, rate(http_request_duration_ms_bucket[5m])) # Task Processing Rate rate(task_total[5m]) # Queue Depth Trend task_queue_depth{status="pending"} # Error Rate rate(http_request_total{status=~"5.."}[5m]) / rate(http_request_total[5m]) # Worker Utilization worker_busy_count / worker_count{status="connected"} ``` ## Grafana Dashboard (Optional) ```json { "title": "Claude-Mem Overview", "panels": [ { "title": "Request Rate", "expr": "rate(http_request_total[5m])" }, { "title": "Request Latency P95", "expr": "histogram_quantile(0.95, rate(http_request_duration_ms_bucket[5m]))" }, { "title": "Task Queue Depth", "expr": "task_queue_depth" }, { "title": "Worker Utilization", "expr": "worker_busy_count / worker_count" }, { "title": "Error Rate", "expr": "rate(http_request_total{status=~\"5..\"}[5m])" } ] } ``` ## Akzeptanzkriterien - [ ] Prometheus Client Library integriert - [ ] HTTP Request Metrics - [ ] Task Processing Metrics - [ ] Worker Metrics - [ ] `/metrics` Endpoint - [ ] Dokumentation der verfügbaren Metriken - [ ] Optional: Grafana Dashboard Template
Sign in to join this conversation.
No milestone
No project
No assignees
1 participant
Notifications
Due date
The due date is invalid or out of range. Please use the format "yyyy-mm-dd".

No due date set.

Dependencies

No dependencies set.

Reference
customable/claude-mem#209
No description provided.