db-performance-watchlist

Defines database performance monitoring strategy with slow query detection, resource usage alerts, query execution thresholds, and automated alerting. Use for "database monitoring", "performance alerts", "slow queries", or "DB metrics".

Install

git clone https://github.com/patricio0312rev/skillset /tmp/skillset && cp -r /tmp/skillset/templates/db-management/db-performance-watchlist ~/.claude/skills/skillset

// tip: Run this command in your terminal to install the skill


name: db-performance-watchlist
description: Defines database performance monitoring strategy with slow query detection, resource usage alerts, query execution thresholds, and automated alerting. Use for "database monitoring", "performance alerts", "slow queries", or "DB metrics".

DB Performance Watchlist

Monitor database performance and prevent regressions.

Key Performance Metrics

// performance-metrics.ts
export interface DBMetrics {
  // Query Performance
  slowQueries: {
    threshold: number; // ms
    count: number;
    queries: SlowQuery[];
  };

  // Connection Pool
  connections: {
    active: number;
    idle: number;
    total: number;
    maxConnections: number;
    utilizationPercent: number;
  };

  // Resource Usage
  resources: {
    cpuPercent: number;
    memoryPercent: number;
    diskUsagePercent: number;
    iops: number;
  };

  // Query Statistics
  queryStats: {
    selectsPerSecond: number;
    insertsPerSecond: number;
    updatesPerSecond: number;
    deletesPerSecond: number;
  };

  // Cache Performance
  cache: {
    hitRate: number; // %
    size: number; // MB
    evictions: number;
  };

  // Index Usage
  indexes: {
    unusedIndexes: string[];
    missingIndexes: string[];
  };
}

// Shape of the rows returned by the pg_stat_statements query below
interface SlowQuery {
  query: string;
  calls: number;
  total_time_ms: number;
  avg_time_ms: number;
  max_time_ms: number;
  percent_of_total: number;
}

Slow Query Detection

// scripts/detect-slow-queries.ts
async function detectSlowQueries(thresholdMs: number = 100) {
  // Enable slow query logging (PostgreSQL). ALTER DATABASE is a utility
  // statement, so bind parameters are not allowed; the numeric threshold
  // is interpolated directly instead ("mydb" is a placeholder name).
  await prisma.$executeRawUnsafe(
    `ALTER DATABASE mydb SET log_min_duration_statement = ${thresholdMs}`
  );

  // Query pg_stat_statements for slow queries
  const slowQueries = await prisma.$queryRaw<SlowQuery[]>`
    SELECT 
      query,
      calls,
      -- pg_stat_statements reports times in milliseconds (PG13+ names)
      total_exec_time as total_time_ms,
      mean_exec_time as avg_time_ms,
      max_exec_time as max_time_ms,
      (total_exec_time / sum(total_exec_time) OVER()) * 100 as percent_of_total
    FROM pg_stat_statements
    WHERE mean_exec_time > ${thresholdMs}
    ORDER BY mean_exec_time DESC
    LIMIT 20
  `;

  console.log("🐌 Slow Queries Detected:\n");

  slowQueries.forEach((q, i) => {
    console.log(`${i + 1}. ${q.query.substring(0, 80)}...`);
    console.log(`   Calls: ${q.calls}`);
    console.log(`   Avg: ${q.avg_time_ms.toFixed(2)}ms`);
    console.log(`   Max: ${q.max_time_ms.toFixed(2)}ms`);
    console.log(`   % of total time: ${q.percent_of_total.toFixed(1)}%\n`);
  });

  return slowQueries;
}
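
This assumes the pg_stat_statements extension is already loaded; if it is not, enabling it takes one config change and one statement per database:

-- postgresql.conf (requires a server restart)
shared_preload_libraries = 'pg_stat_statements'

-- then, once in the target database:
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;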

Connection Pool Monitoring

async function monitorConnectionPool() {
  // Per-connection state lives in pg_stat_activity (one row per backend);
  // pg_stat_database does not expose a "state" column
  const stats = await prisma.$queryRaw<any[]>`
    SELECT
      count(*) as total_connections,
      count(*) FILTER (WHERE state = 'active') as active,
      count(*) FILTER (WHERE state = 'idle') as idle,
      (SELECT setting::int FROM pg_settings WHERE name = 'max_connections') as max_connections
    FROM pg_stat_activity
    WHERE datname = current_database()
  `;

  const { total_connections, active, idle, max_connections } = stats[0];
  // count(*) comes back from Prisma as BigInt, so convert before dividing
  const utilization = (Number(total_connections) / Number(max_connections)) * 100;

  console.log("🔌 Connection Pool Status:");
  console.log(
    `  Total: ${total_connections}/${max_connections} (${utilization.toFixed(
      1
    )}%)`
  );
  console.log(`  Active: ${active}`);
  console.log(`  Idle: ${idle}`);

  // Alert if > 80% utilization
  if (utilization > 80) {
    console.warn("⚠️  Connection pool >80% utilized!");
    await sendAlert({
      title: "High connection pool usage",
      message: `${utilization.toFixed(1)}% of connections in use`,
    });
  }
}
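
sendAlert is called throughout but never defined here. A minimal sketch, assuming a Slack-style incoming webhook whose URL is provided via a hypothetical ALERT_WEBHOOK_URL environment variable:

// scripts/send-alert.ts (sketch only; the payload shape and webhook are
// assumptions, adapt to your alerting backend: Slack, PagerDuty, email, ...)
interface Alert {
  title: string;
  message: string;
  level?: "info" | "warning" | "critical";
}

async function sendAlert(alert: Alert): Promise<void> {
  const url = process.env.ALERT_WEBHOOK_URL; // hypothetical env var
  if (!url) {
    console.error("ALERT_WEBHOOK_URL not set; alert dropped:", alert);
    return;
  }
  // Node 18+ global fetch
  await fetch(url, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      text: `[${alert.level ?? "warning"}] ${alert.title}: ${alert.message}`,
    }),
  });
}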

Resource Monitoring

async function monitorResources() {
  // Approximate CPU usage: total query execution time as a share of
  // wall-clock time since the stats were last reset (single-core basis)
  const cpuStats = await prisma.$queryRaw<any[]>`
    SELECT
      (sum(s.total_exec_time) /
        (extract(epoch from (now() - d.stats_reset)) * 1000)) * 100 as cpu_percent
    FROM pg_stat_statements s, pg_stat_database d
    WHERE d.datname = current_database()
  `;

  // Storage footprint (on-disk size, not memory)
  const memStats = await prisma.$queryRaw<any[]>`
    SELECT
      pg_size_pretty(pg_database_size(current_database())) as db_size,
      pg_size_pretty(sum(pg_relation_size(quote_ident(schemaname)||'.'||quote_ident(tablename)))) as tables_size
    FROM pg_tables
    WHERE schemaname = 'public'
  `;

  // Cache hit rate: the counters are bigints, so cast to float to avoid
  // integer division; nullif guards against freshly reset (all-zero) stats
  const cacheStats = await prisma.$queryRaw<any[]>`
    SELECT
      sum(heap_blks_hit)::float /
        nullif(sum(heap_blks_hit) + sum(heap_blks_read), 0) * 100 as cache_hit_rate
    FROM pg_statio_user_tables
  `;

  console.log("📊 Resource Usage:");
  console.log(`  CPU: ${cpuStats[0].cpu_percent.toFixed(1)}%`);
  console.log(`  Database Size: ${memStats[0].db_size}`);
  console.log(`  Cache Hit Rate: ${cacheStats[0].cache_hit_rate.toFixed(1)}%`);

  // Alert if cache hit rate < 90%
  if (cacheStats[0].cache_hit_rate < 90) {
    console.warn("⚠️  Cache hit rate below 90%!");
    await sendAlert({
      title: "Low cache hit rate",
      message: `Cache hit rate: ${cacheStats[0].cache_hit_rate.toFixed(1)}%`,
    });
  }
}
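
The same buffer-cache arithmetic applies to indexes; a companion query against pg_statio_user_indexes:

-- Index cache hit rate; a low value can mean hot indexes no longer
-- fit in shared_buffers
SELECT
  sum(idx_blks_hit)::float /
    nullif(sum(idx_blks_hit) + sum(idx_blks_read), 0) * 100 as index_cache_hit_rate
FROM pg_statio_user_indexes;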

Index Usage Analysis

async function analyzeIndexUsage() {
  // Find unused indexes. Note: pg_stat_user_indexes exposes relname and
  // indexrelname (not tablename/indexname), and unique or constraint-backing
  // indexes can show zero scans yet still be required for correctness.
  const unusedIndexes = await prisma.$queryRaw<any[]>`
    SELECT
      schemaname,
      relname as tablename,
      indexrelname as indexname,
      idx_scan
    FROM pg_stat_user_indexes
    WHERE idx_scan = 0
    AND indexrelname NOT LIKE '%_pkey'
    ORDER BY pg_relation_size(indexrelid) DESC
  `;

  console.log("🗂️  Unused Indexes:\n");
  unusedIndexes.forEach((idx) => {
    console.log(`  ${idx.tablename}.${idx.indexname} (0 scans)`);
  });

  // Find missing indexes (sequential scans on large tables)
  const missingIndexes = await prisma.$queryRaw<any[]>`
    SELECT
      schemaname,
      relname as tablename,
      seq_scan,
      seq_tup_read,
      idx_scan,
      n_live_tup
    FROM pg_stat_user_tables
    WHERE seq_scan > 1000
    AND n_live_tup > 10000
    ORDER BY seq_scan * n_live_tup DESC
    LIMIT 10
  `;

  console.log("\n📉 Tables with High Sequential Scans:\n");
  missingIndexes.forEach((table) => {
    console.log(`  ${table.tablename}:`);
    console.log(`    Sequential scans: ${table.seq_scan}`);
    console.log(`    Rows: ${table.n_live_tup}`);
    console.log(`    Index scans: ${table.idx_scan}`);
  });
}
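
The usual follow-up for a hot sequential-scan table is an index on its most selective filter column, then a planner check. A sketch with a hypothetical orders table and customer_id column:

-- CONCURRENTLY avoids blocking writes while the index builds
CREATE INDEX CONCURRENTLY idx_orders_customer_id ON orders (customer_id);

-- Confirm the planner actually uses it before declaring the problem solved
EXPLAIN (ANALYZE, BUFFERS)
SELECT * FROM orders WHERE customer_id = 42;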

Alert Thresholds

const ALERT_THRESHOLDS = {
  slowQuery: {
    avgDuration: 500, // ms
    maxDuration: 2000, // ms
    callsPerMinute: 100,
  },
  connections: {
    utilizationWarning: 70, // %
    utilizationCritical: 85, // %
  },
  resources: {
    cpuWarning: 70, // %
    cpuCritical: 85, // %
    memoryWarning: 80, // %
    memoryCritical: 90, // %
    diskWarning: 75, // %
    diskCritical: 85, // %
  },
  cache: {
    hitRateWarning: 90, // %
    hitRateCritical: 80, // %
  },
  queryRate: {
    maxSelectsPerSecond: 10000,
    maxWritesPerSecond: 1000,
  },
};

async function checkThresholds() {
  // gatherMetrics() is assumed to aggregate the collectors above into a
  // DBMetrics object (see the sketch after this function)
  const metrics = await gatherMetrics();

  // Check slow queries
  if (metrics.slowQueries.count > 10) {
    await sendAlert({
      level: "warning",
      title: "Slow queries detected",
      message: `${metrics.slowQueries.count} queries exceeding ${ALERT_THRESHOLDS.slowQuery.avgDuration}ms`,
    });
  }

  // Check connection pool
  if (
    metrics.connections.utilizationPercent >
    ALERT_THRESHOLDS.connections.utilizationCritical
  ) {
    await sendAlert({
      level: "critical",
      title: "Connection pool critical",
      message: `${metrics.connections.utilizationPercent.toFixed(
        1
      )}% utilization`,
    });
  }

  // Check cache hit rate
  if (metrics.cache.hitRate < ALERT_THRESHOLDS.cache.hitRateCritical) {
    await sendAlert({
      level: "critical",
      title: "Cache hit rate critical",
      message: `${metrics.cache.hitRate.toFixed(1)}% hit rate`,
    });
  }
}
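
gatherMetrics is referenced above but never defined; a partial sketch of how it could assemble a DBMetrics object from the collectors in this document. Fields the SQL cannot supply (OS-level CPU, memory, disk, IOPS) are left zeroed and belong to an external exporter:

// Partial sketch: only fields the queries above can populate are filled
async function gatherMetrics(): Promise<DBMetrics> {
  const slowQueries = await detectSlowQueries(ALERT_THRESHOLDS.slowQuery.avgDuration);

  const pool = await prisma.$queryRaw<any[]>`
    SELECT
      count(*) as total,
      count(*) FILTER (WHERE state = 'active') as active,
      count(*) FILTER (WHERE state = 'idle') as idle,
      (SELECT setting::int FROM pg_settings WHERE name = 'max_connections') as max
    FROM pg_stat_activity
  `;

  const cache = await prisma.$queryRaw<any[]>`
    SELECT sum(heap_blks_hit)::float /
      nullif(sum(heap_blks_hit) + sum(heap_blks_read), 0) * 100 as hit_rate
    FROM pg_statio_user_tables
  `;

  return {
    slowQueries: {
      threshold: ALERT_THRESHOLDS.slowQuery.avgDuration,
      count: slowQueries.length,
      queries: slowQueries,
    },
    connections: {
      active: Number(pool[0].active),
      idle: Number(pool[0].idle),
      total: Number(pool[0].total),
      maxConnections: Number(pool[0].max),
      utilizationPercent: (Number(pool[0].total) / Number(pool[0].max)) * 100,
    },
    // OS-level metrics: fill from node_exporter or your cloud provider
    resources: { cpuPercent: 0, memoryPercent: 0, diskUsagePercent: 0, iops: 0 },
    queryStats: { selectsPerSecond: 0, insertsPerSecond: 0, updatesPerSecond: 0, deletesPerSecond: 0 },
    cache: { hitRate: cache[0].hit_rate ?? 0, size: 0, evictions: 0 },
    indexes: { unusedIndexes: [], missingIndexes: [] },
  };
}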

Monitoring Dashboard

// Generate monitoring report
async function generatePerformanceReport() {
  console.log("📊 Database Performance Report\n");
  console.log("=".repeat(50) + "\n");

  // Slow queries
  const slowQueries = await detectSlowQueries(100);
  console.log(`Slow Queries (>100ms): ${slowQueries.length}\n`);

  // Connection pool
  await monitorConnectionPool();
  console.log();

  // Resources
  await monitorResources();
  console.log();

  // Index usage
  await analyzeIndexUsage();
  console.log();

  // Query rates
  const queryStats = await prisma.$queryRaw<any[]>`
    SELECT 
      sum(xact_commit + xact_rollback) as transactions,
      sum(tup_returned) as rows_read,
      sum(tup_inserted) as rows_inserted,
      sum(tup_updated) as rows_updated,
      sum(tup_deleted) as rows_deleted
    FROM pg_stat_database
    WHERE datname = current_database()
  `;

  console.log("📈 Query Statistics:");
  console.log(`  Transactions: ${queryStats[0].transactions}`);
  console.log(`  Rows read: ${queryStats[0].rows_read}`);
  console.log(`  Rows inserted: ${queryStats[0].rows_inserted}`);
  console.log(`  Rows updated: ${queryStats[0].rows_updated}`);
  console.log(`  Rows deleted: ${queryStats[0].rows_deleted}`);
}

Automated Monitoring Script

// scripts/monitor-db.ts
// (assumes checkThresholds, generatePerformanceReport and analyzeIndexUsage
// are imported from the modules above)
import cron from "node-cron";

// Run every 5 minutes
cron.schedule("*/5 * * * *", async () => {
  await checkThresholds();
});

// Generate report every hour
cron.schedule("0 * * * *", async () => {
  await generatePerformanceReport();
});

// Analyze indexes weekly
cron.schedule("0 0 * * 0", async () => {
  await analyzeIndexUsage();
});
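
A thrown error in an async cron callback otherwise surfaces as an unhandled rejection. A small local helper (not part of node-cron) keeps one failing check from crashing the scheduler:

// Local helper: log failures instead of crashing the process
function safely(job: () => Promise<void>): () => Promise<void> {
  return async () => {
    try {
      await job();
    } catch (err) {
      console.error("Monitoring job failed:", err);
    }
  };
}

// Usage: cron.schedule("*/5 * * * *", safely(checkThresholds));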

Grafana Dashboard Queries

-- Query latency over time. pg_stat_statements is cumulative and has no
-- timestamp column, so this assumes its contents are snapshotted
-- periodically into a query_stats_snapshots table (hypothetical; DDL
-- below) and that time_bucket() is available (TimescaleDB).
SELECT
  time_bucket('5 minutes', captured_at) AS bucket,
  AVG(mean_exec_time) AS avg_latency,
  MAX(max_exec_time) AS max_latency,
  SUM(calls) AS total_calls
FROM query_stats_snapshots
WHERE query NOT LIKE '%pg_stat_statements%'
GROUP BY bucket
ORDER BY bucket;

-- Connection count over time
SELECT
  now() as time,
  count(*) as total,
  count(*) FILTER (WHERE state = 'active') as active,
  count(*) FILTER (WHERE state = 'idle') as idle
FROM pg_stat_activity;

-- Cache hit rate (::float prevents integer division on the bigint counters)
SELECT
  sum(heap_blks_hit)::float /
    nullif(sum(heap_blks_hit) + sum(heap_blks_read), 0) * 100 as cache_hit_rate
FROM pg_statio_user_tables;
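
The latency panel above reads from a snapshot table that has to exist first; a sketch with assumed names, populated on the same cadence as the five-minute check:

-- Hypothetical snapshot table backing the latency-over-time panel
CREATE TABLE IF NOT EXISTS query_stats_snapshots (
  captured_at     timestamptz NOT NULL DEFAULT now(),
  queryid         bigint,
  query           text,
  calls           bigint,
  mean_exec_time  double precision,
  max_exec_time   double precision
);

-- Run every 5 minutes (pg_cron, or the monitoring script above)
INSERT INTO query_stats_snapshots (queryid, query, calls, mean_exec_time, max_exec_time)
SELECT queryid, query, calls, mean_exec_time, max_exec_time
FROM pg_stat_statements;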

Best Practices

  1. Monitor continuously: Don't wait for problems
  2. Set appropriate thresholds: Based on your SLAs
  3. Alert on trends: Not just absolute values
  4. Review regularly: Weekly performance reviews
  5. Automate everything: No manual checks
  6. Document baselines: Know what's normal
  7. Test alerts: Ensure they work

Output Checklist

  • Slow query detection configured
  • Connection pool monitoring
  • Resource usage tracking
  • Cache hit rate monitoring
  • Index usage analysis
  • Alert thresholds defined
  • Monitoring dashboard setup
  • Automated checks scheduled
  • Grafana/alerting integration
  • Performance baseline documented