observability-setup
Setting up Prometheus metrics, OpenTelemetry tracing, and health endpoints for Nais applications
Install
git clone https://github.com/navikt/copilot /tmp/copilot && cp -r /tmp/copilot/.github/skills/observability-setup ~/.claude/skills/copilot/
Tip: Run this command in your terminal to install the skill.
name: observability-setup
description: Setting up Prometheus metrics, OpenTelemetry tracing, and health endpoints for Nais applications
Observability Setup Skill
This skill provides patterns for setting up observability in Nais applications.
Required Health Endpoints
import com.zaxxer.hikari.HikariDataSource
import io.ktor.http.*
import io.ktor.server.application.*
import io.ktor.server.response.*
import io.ktor.server.routing.*
import org.apache.kafka.clients.producer.KafkaProducer

fun Application.configureHealthEndpoints(
    dataSource: HikariDataSource,
    kafkaProducer: KafkaProducer<String, String>
) {
    routing {
        get("/isalive") {
            call.respondText("Alive", ContentType.Text.Plain)
        }

        get("/isready") {
            val databaseHealthy = checkDatabase(dataSource)
            val kafkaHealthy = checkKafka(kafkaProducer)

            if (databaseHealthy && kafkaHealthy) {
                call.respondText("Ready", ContentType.Text.Plain)
            } else {
                call.respondText(
                    "Not ready",
                    ContentType.Text.Plain,
                    HttpStatusCode.ServiceUnavailable
                )
            }
        }
    }
}

fun checkDatabase(dataSource: HikariDataSource): Boolean {
    return try {
        dataSource.connection.use { it.isValid(1) }
    } catch (e: Exception) {
        false
    }
}

fun checkKafka(producer: KafkaProducer<String, String>): Boolean {
    return try {
        producer.partitionsFor("health-check-topic").isNotEmpty()
    } catch (e: Exception) {
        false
    }
}
Prometheus Metrics Setup
import io.ktor.http.*
import io.ktor.server.application.*
import io.ktor.server.metrics.micrometer.*
import io.ktor.server.response.*
import io.ktor.server.routing.*
import io.micrometer.core.instrument.Clock
import io.micrometer.core.instrument.binder.jvm.*
import io.micrometer.core.instrument.binder.system.*
import io.micrometer.prometheus.PrometheusConfig
import io.micrometer.prometheus.PrometheusMeterRegistry
import io.prometheus.client.CollectorRegistry

val meterRegistry = PrometheusMeterRegistry(
    PrometheusConfig.DEFAULT,
    CollectorRegistry.defaultRegistry,
    Clock.SYSTEM
)

fun Application.configureMetrics() {
    install(MicrometerMetrics) {
        registry = meterRegistry
        // Production pattern from navikt/ao-oppfolgingskontor
        meterBinders = listOf(
            JvmMemoryMetrics(),  // Heap, non-heap memory
            JvmGcMetrics(),      // Garbage collection
            ProcessorMetrics(),  // CPU usage
            UptimeMetrics()      // Application uptime
        )
    }

    routing {
        get("/metrics") {
            call.respondText(
                meterRegistry.scrape(),
                ContentType.parse("text/plain; version=0.0.4")
            )
        }
    }
}
Business Metrics
import io.micrometer.core.instrument.Counter
import io.micrometer.core.instrument.Timer
import io.micrometer.prometheus.PrometheusMeterRegistry

class UserService(
    private val meterRegistry: PrometheusMeterRegistry,
    private val repository: UserRepository
) {
    private val userCreatedCounter = Counter.builder("users_created_total")
        .description("Total users created")
        .register(meterRegistry)

    private val userCreationTimer = Timer.builder("user_creation_duration_seconds")
        .description("User creation duration")
        .register(meterRegistry)

    fun createUser(user: User) {
        userCreationTimer.record {
            repository.save(user)
        }
        userCreatedCounter.increment()
    }
}
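The monitoring checklist below also calls for gauges. A minimal sketch of registering one for a value that moves up and down, such as queue depth; the pendingOrders queue and metric name are illustrative, not taken from the source repos:

import io.micrometer.core.instrument.Gauge
import io.micrometer.prometheus.PrometheusMeterRegistry
import java.util.concurrent.ConcurrentLinkedQueue

// Hypothetical in-memory queue of order IDs whose depth we want to observe
val pendingOrders = ConcurrentLinkedQueue<String>()

fun registerQueueDepthGauge(meterRegistry: PrometheusMeterRegistry) {
    // A gauge samples the current value at scrape time, so pass a supplier, not a snapshot
    Gauge.builder("pending_orders") { pendingOrders.size }
        .description("Number of orders waiting to be processed")
        .register(meterRegistry)
}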
OpenTelemetry Tracing
Nais enables OpenTelemetry auto-instrumentation by default. For manual spans:
import io.opentelemetry.api.GlobalOpenTelemetry
import io.opentelemetry.api.trace.Span
import io.opentelemetry.api.trace.StatusCode

val tracer = GlobalOpenTelemetry.getTracer("my-app")

fun processPayment(paymentId: String) {
    val span = tracer.spanBuilder("processPayment")
        .setAttribute("payment.id", paymentId)
        .startSpan()

    try {
        // Business logic
        val payment = repository.findPayment(paymentId)
        span.setAttribute("payment.amount", payment.amount)
        processPaymentInternal(payment)
        span.setStatus(StatusCode.OK)
    } catch (e: Exception) {
        span.setStatus(StatusCode.ERROR, "Payment processing failed")
        span.recordException(e)
        throw e
    } finally {
        span.end()
    }
}
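To avoid repeating the try/catch/finally plumbing around every manual span, it can be wrapped in a small extension function. A sketch only; withSpan is our own helper name, not an OpenTelemetry API:

import io.opentelemetry.api.trace.StatusCode
import io.opentelemetry.api.trace.Tracer

// Hypothetical helper: runs the block inside a span and records failures on it
fun <T> Tracer.withSpan(name: String, block: () -> T): T {
    val span = spanBuilder(name).startSpan()
    return try {
        block().also { span.setStatus(StatusCode.OK) }
    } catch (e: Exception) {
        span.setStatus(StatusCode.ERROR, e.message ?: "unexpected error")
        span.recordException(e)
        throw e
    } finally {
        span.end()
    }
}

// Usage: tracer.withSpan("processPayment") { processPaymentInternal(payment) }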
Structured Logging
import mu.KotlinLogging
import net.logstash.logback.argument.StructuredArguments.kv
import java.time.LocalDateTime

private val logger = KotlinLogging.logger {}

fun processOrder(orderId: String) {
    logger.info(
        "Processing order",
        kv("order_id", orderId),
        kv("timestamp", LocalDateTime.now())
    )

    try {
        orderService.process(orderId)

        logger.info(
            "Order processed successfully",
            kv("order_id", orderId)
        )
    } catch (e: Exception) {
        logger.error(
            "Order processing failed",
            kv("order_id", orderId),
            kv("error", e.message),
            e
        )
        throw e
    }
}
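The OpenTelemetry Java agent enabled by Nais auto-instrumentation can also inject trace context into the logging MDC for you. If you need to attach trace_id and span_id manually (for example in a background job), a sketch using the current span and SLF4J's MDC:

import io.opentelemetry.api.trace.Span
import mu.KotlinLogging
import org.slf4j.MDC

private val logger = KotlinLogging.logger {}

fun logWithTraceContext(message: String) {
    val context = Span.current().spanContext
    if (context.isValid) {
        // Put the IDs in the MDC so the logstash encoder emits them as JSON fields
        MDC.put("trace_id", context.traceId)
        MDC.put("span_id", context.spanId)
    }
    try {
        logger.info(message)
    } finally {
        MDC.remove("trace_id")
        MDC.remove("span_id")
    }
}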
Nais Manifest
apiVersion: nais.io/v1alpha1
kind: Application
metadata:
  name: my-app
  namespace: myteam
  labels:
    team: myteam
spec:
  image: ghcr.io/navikt/my-app:latest
  port: 8080

  # Health checks
  liveness:
    path: /isalive
    initialDelay: 10
    timeout: 1
    periodSeconds: 10
    failureThreshold: 3
  readiness:
    path: /isready
    initialDelay: 10
    timeout: 1
    periodSeconds: 10
    failureThreshold: 3

  # Prometheus scraping
  prometheus:
    enabled: true
    path: /metrics

  # OpenTelemetry auto-instrumentation
  observability:
    autoInstrumentation:
      enabled: true
      runtime: java  # Instruments Ktor, JDBC, Kafka automatically
    logging:
      destinations:
        - id: loki       # Automatic Loki shipping
        - id: team-logs  # Optional: private team logs

  # Resources (for metrics alerting)
  resources:
    limits:
      memory: 512Mi
    requests:
      cpu: 50m
      memory: 256Mi
Alert Configuration
Create .nais/alert.yml:
apiVersion: nais.io/v1
kind: Alert
metadata:
  name: my-app-alerts
  namespace: myteam
  labels:
    team: myteam
spec:
  receivers:
    slack:
      channel: "#team-alerts"
      prependText: "@here "
  alerts:
    - alert: HighErrorRate
      expr: |
        (sum(rate(http_requests_total{app="my-app",status=~"5.."}[5m]))
        / sum(rate(http_requests_total{app="my-app"}[5m]))) > 0.05
      for: 5m
      description: "Error rate is {{ $value | humanizePercentage }}"
      action: "Check logs in Grafana Loki"
      documentation: https://teamdocs/runbooks/high-error-rate
      sla: "Respond within 15 minutes"
      severity: critical
    - alert: HighResponseTime
      expr: |
        histogram_quantile(0.95,
          rate(http_request_duration_seconds_bucket{app="my-app"}[5m])
        ) > 1
      for: 10m
      description: "95th percentile response time is {{ $value }}s"
      action: "Check Tempo traces for slow requests"
      severity: warning
    - alert: PodCrashLooping
      expr: |
        rate(kube_pod_container_status_restarts_total{
          pod=~"my-app-.*"
        }[15m]) > 0
      for: 5m
      description: "Pod {{ $labels.pod }} is crash looping"
      action: "Check logs: kubectl logs {{ $labels.pod }}"
      severity: critical
    - alert: HighMemoryUsage
      expr: |
        (container_memory_working_set_bytes{app="my-app"}
        / container_spec_memory_limit_bytes{app="my-app"}) > 0.9
      for: 10m
      description: "Memory usage is {{ $value | humanizePercentage }}"
      action: "Check for memory leaks, increase limits if needed"
      severity: warning
Complete Example
import com.zaxxer.hikari.HikariDataSource
import io.ktor.server.application.*
import io.ktor.server.engine.*
import io.ktor.server.netty.*
import io.ktor.server.response.*
import io.ktor.server.routing.*
import io.micrometer.core.instrument.Timer
import io.micrometer.prometheus.PrometheusMeterRegistry
import io.opentelemetry.api.GlobalOpenTelemetry
import io.opentelemetry.api.trace.StatusCode
import net.logstash.logback.argument.StructuredArguments.kv

fun main() {
    val env = Environment.from(System.getenv())
    val dataSource = createDataSource(env.databaseUrl)

    // Run database migrations
    runMigrations(dataSource)

    // Setup metrics
    val meterRegistry = setupMetrics()

    embeddedServer(Netty, port = 8080) {
        configureHealthEndpoints(dataSource)
        configureMetrics(meterRegistry)
        configureRouting(dataSource, meterRegistry)
    }.start(wait = true)
}

fun Application.configureRouting(
    dataSource: HikariDataSource,
    meterRegistry: PrometheusMeterRegistry
) {
    val tracer = GlobalOpenTelemetry.getTracer("my-app")

    routing {
        get("/api/users") {
            val requestTimer = Timer.start(meterRegistry)
            val requestCounter = meterRegistry.counter(
                "http_requests_total",
                "method", "GET",
                "endpoint", "/api/users"
            )

            val span = tracer.spanBuilder("getUsersRequest")
                .setAttribute("http.method", "GET")
                .setAttribute("http.route", "/api/users")
                .startSpan()

            try {
                val users = userRepository.findAll()
                span.setAttribute("user.count", users.size.toLong())
                span.setStatus(StatusCode.OK)

                requestCounter.increment()
                requestTimer.stop(meterRegistry.timer(
                    "http_request_duration_seconds",
                    "method", "GET",
                    "endpoint", "/api/users",
                    "status", "200"
                ))

                call.respond(users)
            } catch (e: Exception) {
                span.setStatus(StatusCode.ERROR, "Failed to get users")
                span.recordException(e)

                meterRegistry.counter(
                    "http_requests_total",
                    "method", "GET",
                    "endpoint", "/api/users",
                    "status", "500"
                ).increment()

                logger.error(
                    "Failed to get users",
                    kv("trace_id", span.spanContext.traceId),
                    kv("span_id", span.spanContext.spanId),
                    e
                )

                throw e
            } finally {
                span.end()
            }
        }
    }
}
Grafana Dashboard Example
Create a dashboard in Grafana with these panels:
Panel 1: Request Rate
sum(rate(http_requests_total{app="my-app"}[5m])) by (endpoint)
Panel 2: Error Rate
sum(rate(http_requests_total{app="my-app",status=~"5.."}[5m]))
/ sum(rate(http_requests_total{app="my-app"}[5m])) * 100
Panel 3: Response Time (p50, p95, p99)
histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{app="my-app"}[5m]))
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{app="my-app"}[5m]))
histogram_quantile(0.99, rate(http_request_duration_seconds_bucket{app="my-app"}[5m]))
Panel 4: Memory Usage
container_memory_working_set_bytes{app="my-app"}
/ container_spec_memory_limit_bytes{app="my-app"} * 100
Panel 5: Database Connections
hikaricp_connections_active{app="my-app"}
hikaricp_connections_max{app="my-app"}
Panel 6: Kafka Consumer Lag
kafka_consumer_lag{app="my-app"}
Loki Query Examples
View logs in Grafana Loki Explorer:
# All logs from your app
{app="my-app", namespace="myteam"}
# Only errors
{app="my-app"} |= "ERROR"
# JSON logs with specific field
{app="my-app"} | json | event_type="payment_processed"
# Logs correlated with trace
{app="my-app"} | json | trace_id="abc123def456"
# Count errors per minute
sum(rate({app="my-app"} |= "ERROR" [1m])) by (pod)
Tempo Trace Search
View traces in Grafana Tempo:
- Open Grafana → Explore
- Select the Tempo data source
- Query by:
  - Service name: my-app
  - Operation: getUsersRequest
  - Duration: > 1s
  - Status: error

Or link from logs by clicking the trace_id in Loki.
Monitoring Checklist
- /isalive endpoint implemented
- /isready endpoint with dependency checks (database, Kafka)
- /metrics endpoint exposing Prometheus metrics
- Health checks configured in Nais manifest
- Business metrics instrumented (counters, timers, gauges)
- Structured logging with correlation IDs (trace_id, span_id)
- OpenTelemetry auto-instrumentation enabled in Nais manifest
- Alert rules created in .nais/alert.yml
- Slack channel configured for alerts
- Grafana dashboard created
- No sensitive data in logs or metrics (verify in Grafana)
- High-cardinality labels avoided (no user_ids, transaction_ids); see the sketch below
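On the last point: every distinct label value becomes its own Prometheus time series, so per-user or per-transaction IDs belong in logs or span attributes, not in metric tags. A small illustration; the names are ours:

import io.micrometer.core.instrument.Counter
import io.micrometer.prometheus.PrometheusMeterRegistry

fun recordPayment(meterRegistry: PrometheusMeterRegistry, status: String) {
    // Good: "status" has a handful of possible values ("ok", "failed", ...)
    Counter.builder("payments_total")
        .tag("status", status)
        .register(meterRegistry)
        .increment()

    // Bad: tagging with user_id or transaction_id creates one time series per value
    // Counter.builder("payments_total").tag("user_id", userId).register(meterRegistry).increment()
}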
Production Patterns from navikt
Based on 177+ repositories using observability setup:
JVM Metrics Binders (navikt/ao-oppfolgingskontor)
import io.micrometer.core.instrument.binder.jvm.*
import io.micrometer.core.instrument.binder.system.*

install(MicrometerMetrics) {
    registry = meterRegistry
    meterBinders = listOf(
        JvmMemoryMetrics(),  // Heap, non-heap, buffer pool metrics
        JvmGcMetrics(),      // GC pause time, count
        ProcessorMetrics(),  // CPU usage
        UptimeMetrics()      // Application uptime
    )
}
Common Counter Patterns
// From dp-rapportering: Track business events
val eventsProcessed = Counter.builder("events_processed_total")
    .description("Total events processed")
    .tag("event_type", "rapportering_innsendt")
    .tag("status", "ok")
    .register(meterRegistry)

// From dp-rapportering: Track API errors
val apiErrors = Counter.builder("api_errors_total")
    .description("Total API errors")
    .tag("endpoint", "/api/rapporteringsperioder")
    .tag("error_type", "validation_error")
    .register(meterRegistry)
Timer Patterns
import io.micrometer.core.instrument.Timer
import java.util.concurrent.TimeUnit.MILLISECONDS
import kotlin.time.measureTimedValue

// From dp-rapportering: Measure HTTP call duration
suspend fun <T> timedAction(navn: String, block: suspend () -> T): T {
    val (result, duration) = measureTimedValue {
        block()
    }
    Timer.builder("http_timer")
        .tag("navn", navn)
        .description("HTTP call duration")
        .register(meterRegistry)
        .record(duration.inWholeMilliseconds, MILLISECONDS)
    return result
}
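Usage then wraps the call being measured; the Ktor HttpClient and URL below are placeholders:

import io.ktor.client.HttpClient
import io.ktor.client.request.get
import io.ktor.client.statement.HttpResponse

// Times the outbound call and tags the timer with navn="hent-rapporteringsperioder"
suspend fun hentPerioder(httpClient: HttpClient): HttpResponse =
    timedAction("hent-rapporteringsperioder") {
        httpClient.get("https://dp-rapportering/api/rapporteringsperioder")
    }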
DORA Metrics Examples
Track DORA metrics for your team:
// Deployment frequency
val deployments = Counter.builder("deployments_total")
    .description("Total deployments")
    .tag("team", "myteam")
    .tag("environment", "production")
    .register(meterRegistry)

// Lead time for changes (commit to deploy)
val leadTime = Timer.builder("deployment_lead_time_seconds")
    .description("Time from commit to deployment")
    .tag("team", "myteam")
    .register(meterRegistry)

// Change failure rate
val failedDeployments = Counter.builder("deployments_failed_total")
    .description("Total failed deployments")
    .tag("team", "myteam")
    .register(meterRegistry)

// Time to restore service
val incidentResolutionTime = Timer.builder("incident_resolution_duration_seconds")
    .description("Time to resolve incidents")
    .tag("team", "myteam")
    .tag("severity", "critical")
    .register(meterRegistry)
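Recording these is straightforward once your deploy pipeline reports the commit timestamp and outcome. A sketch; onDeploymentFinished and commitTime are assumptions about your pipeline integration:

import java.time.Duration
import java.time.Instant

fun onDeploymentFinished(commitTime: Instant, succeeded: Boolean) {
    deployments.increment()
    // Lead time for changes: wall-clock time from commit to this deployment
    leadTime.record(Duration.between(commitTime, Instant.now()))
    if (!succeeded) {
        failedDeployments.increment()
    }
}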
Alert on DORA metrics:
- alert: LowDeploymentFrequency
  expr: |
    sum(increase(deployments_total{team="myteam",environment="production"}[7d]))
    < 5
  description: "Only {{ $value }} deployments in last 7 days (target: >1/day)"
  severity: info
- alert: HighChangeFailureRate
  expr: |
    sum(rate(deployments_failed_total{team="myteam"}[7d]))
    / sum(rate(deployments_total{team="myteam"}[7d]))
    > 0.15
  description: "Change failure rate is {{ $value | humanizePercentage }} (target: <15%)"
  severity: warning
See https://dora.dev for benchmarks and best practices.
Repository
https://github.com/navikt/copilot