From b5ec91f9d9987bb3acad55c014371a261256c29a Mon Sep 17 00:00:00 2001
From: loitragg
Date: Fri, 26 Sep 2025 13:17:08 +0700
Subject: [PATCH] feat: setup otel monitoring infra

---
 iac/ecs.tf               |  53 +++++++++
 iac/grafana-ecs.tf       | 246 +++++++++++++++++++++++++++++++++++++++
 iac/outputs.tf           |  89 +++++++++++++++
 iac/service_discovery.tf |  62 +++++++++--
 4 files changed, 438 insertions(+), 12 deletions(-)
 create mode 100644 iac/grafana-ecs.tf

diff --git a/iac/ecs.tf b/iac/ecs.tf
index 22fee29c89..c939bc53f1 100644
--- a/iac/ecs.tf
+++ b/iac/ecs.tf
@@ -207,6 +207,59 @@ resource "aws_ecs_task_definition" "webui_scaled" {
         {
           name  = "FASTEMBED_CACHE_PATH"
           value = "/app/backend/data/cache/fastembed"
+        },
+        # OpenTelemetry Configuration
+        {
+          name  = "ENABLE_OTEL"
+          value = "true"
+        },
+        {
+          name  = "ENABLE_OTEL_METRICS"
+          value = "true"
+        },
+        {
+          name  = "ENABLE_OTEL_TRACES"
+          value = "true"
+        },
+        {
+          name  = "ENABLE_OTEL_LOGS"
+          value = "true"
+        },
+        {
+          name  = "OTEL_EXPORTER_OTLP_ENDPOINT"
+          value = "http://otel-monitor.ggai:4317"
+        },
+        {
+          name  = "OTEL_EXPORTER_OTLP_INSECURE"
+          value = "true"
+        },
+        {
+          name  = "OTEL_OTLP_SPAN_EXPORTER"
+          value = "grpc"
+        },
+        {
+          name  = "OTEL_METRICS_OTLP_SPAN_EXPORTER"
+          value = "grpc"
+        },
+        {
+          name  = "OTEL_LOGS_OTLP_SPAN_EXPORTER"
+          value = "grpc"
+        },
+        {
+          name  = "OTEL_SERVICE_NAME"
+          value = "open-webui-production"
+        },
+        {
+          name  = "OTEL_RESOURCE_ATTRIBUTES"
+          value = "service.version=${var.container_image},deployment.environment=production,service.instance.id=ecs-fargate"
+        },
+        {
+          name  = "OTEL_TRACES_SAMPLER"
+          value = "traceidratio"
+        },
+        {
+          name  = "OTEL_TRACES_SAMPLER_ARG"
+          value = "0.1"
         }
       ]
 
diff --git a/iac/grafana-ecs.tf b/iac/grafana-ecs.tf
new file mode 100644
index 0000000000..f186009d4e
--- /dev/null
+++ b/iac/grafana-ecs.tf
@@ -0,0 +1,246 @@
+# Grafana admin password. Supply it at plan/apply time (for example via
+# TF_VAR_grafana_admin_password) so the credential is never committed to the repository.
+variable "grafana_admin_password" {
+  description = "Admin password for the Grafana UI of the otel-lgtm monitoring stack"
+  type        = string
+  sensitive   = true
+}
+
+# CloudWatch log group for Grafana OTEL
+resource "aws_cloudwatch_log_group" "grafana_logs" {
+  name              = "/ecs/grafana-otel-lgtm"
+  retention_in_days = 7
+
+  tags = {
+    Name = "Grafana OTEL Logs"
+  }
+}
+
+# Security Group for direct Grafana ECS access
+resource "aws_security_group" "grafana_ecs_direct_sg" {
+  name_prefix = "grafana-ecs-direct-"
+  vpc_id      = var.vpc_id
+  description = "Security group for direct Grafana OTEL ECS access"
+
+  # Allow Grafana UI access from VPN
+  ingress {
+    description = "Grafana UI from VPN"
+    from_port   = 3000
+    to_port     = 3000
+    protocol    = "tcp"
+    cidr_blocks = [var.gg_vpn_cidr]
+  }
+
+  # Allow OTLP gRPC from OpenWebUI ECS tasks
+  ingress {
+    description     = "OTLP gRPC from OpenWebUI ECS"
+    from_port       = 4317
+    to_port         = 4317
+    protocol        = "tcp"
+    security_groups = [aws_security_group.ecs_scaled_sg.id]
+  }
+
+  # Allow OTLP HTTP from OpenWebUI ECS tasks
+  ingress {
+    description     = "OTLP HTTP from OpenWebUI ECS"
+    from_port       = 4318
+    to_port         = 4318
+    protocol        = "tcp"
+    security_groups = [aws_security_group.ecs_scaled_sg.id]
+  }
+
+  egress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  tags = {
+    Name = "Grafana ECS Direct Access Security Group"
+  }
+}
+
+# IAM role for Grafana ECS tasks (reuse existing execution role)
+# The existing openwebui_execution_role should have sufficient permissions
+
+# ECS Task Definition for Grafana OTEL LGTM
+resource "aws_ecs_task_definition" "grafana_otel" {
+  family                   = "grafana-otel-lgtm"
+  network_mode             = "awsvpc"
+  requires_compatibilities = ["FARGATE"]
+  cpu                      = 1024
+  memory                   = 2048
+  execution_role_arn       = aws_iam_role.openwebui_execution_role.arn
+  task_role_arn            = aws_iam_role.openwebui_execution_role.arn
+
+  container_definitions = jsonencode([
+    {
+      name      = "grafana-otel-lgtm"
+      image     = "grafana/otel-lgtm:latest"
+      cpu       = 0
+      essential = true
+
+      portMappings = [
+        {
+          containerPort = 3000
+          hostPort      = 3000
+          protocol      = "tcp"
+          name          = "grafana-ui"
+          appProtocol   = "http"
+        },
+        {
+          containerPort = 4317
+          hostPort      = 4317
+          protocol      = "tcp"
+          name          = "otlp-grpc"
+        },
+        {
+          containerPort = 4318
+          hostPort      = 4318
+          protocol      = "tcp"
+          name          = "otlp-http"
+          appProtocol   = "http"
+        }
+      ]
+
+      environment = [
+        {
+          name  = "GF_SECURITY_ADMIN_PASSWORD"
+          value = var.grafana_admin_password
+        },
+        {
+          name  = "GF_SECURITY_ADMIN_USER"
+          value = "admin"
+        },
+        {
+          name  = "GF_INSTALL_PLUGINS"
+          value = ""
+        },
+        {
+          name  = "GF_FEATURE_TOGGLES_ENABLE"
+          value = "traceqlEditor"
+        }
+      ]
+
+      logConfiguration = {
+        logDriver = "awslogs"
+        options = {
+          "awslogs-group"         = aws_cloudwatch_log_group.grafana_logs.name
+          "mode"                  = "non-blocking"
+          "awslogs-create-group"  = "true"
+          "max-buffer-size"       = "25m"
+          "awslogs-region"        = var.aws_region
+          "awslogs-stream-prefix" = "grafana"
+        }
+      }
+
+      healthCheck = {
+        command = [
+          "CMD-SHELL",
+          "curl --silent --fail http://localhost:3000/api/health || exit 1"
+        ]
+        interval    = 30
+        timeout     = 10
+        retries     = 3
+        startPeriod = 60
+      }
+
+      systemControls = []
+    }
+  ])
+
+  tags = {
+    Name = "Grafana OTEL LGTM Task Definition"
+  }
+}
+
+# ECS Service for Grafana OTEL
+resource "aws_ecs_service" "grafana_otel" {
+  name            = "grafana-otel-lgtm"
+  cluster         = var.cluster_name
+  task_definition = aws_ecs_task_definition.grafana_otel.arn
+  desired_count   = 1
+
+  # `triggers` is only honored when force_new_deployment is enabled
+  force_new_deployment = true
+  triggers = {
+    redeployment = sha1(jsonencode(aws_ecs_task_definition.grafana_otel.container_definitions))
+  }
+
+  capacity_provider_strategy {
+    capacity_provider = "FARGATE"
+    weight            = 1
+    base              = 0
+  }
+
+  platform_version = "LATEST"
+
+  deployment_maximum_percent         = 200
+  deployment_minimum_healthy_percent = 100
+
+  deployment_circuit_breaker {
+    enable   = true
+    rollback = false
+  }
+
+  network_configuration {
+    subnets          = var.private_subnet_ids
+    security_groups  = [aws_security_group.grafana_ecs_direct_sg.id]
+    assign_public_ip = false
+  }
+
+  # No load balancer - using direct service discovery
+  service_registries {
+    registry_arn = aws_service_discovery_service.otel_monitor.arn
+  }
+
+  # Use the standard ECS rolling-update deployment controller
+  deployment_controller {
+    type = "ECS"
+  }
+
+  # Enable execute command for debugging
+  # NOTE(review): ECS Exec needs ssmmessages permissions on the task role - confirm the reused execution role grants them
+  enable_execute_command = true
+
+  tags = {
+    Name = "Grafana OTEL LGTM Service"
+  }
+
+  lifecycle {
+    ignore_changes = [desired_count]
+  }
+}
+
+# Auto Scaling Target for Grafana. Capped at a single task: the all-in-one otel-lgtm
+# image keeps its data inside the task, so a second task would split telemetry.
+resource "aws_appautoscaling_target" "grafana_target" {
+  max_capacity       = 1
+  min_capacity       = 1
+  resource_id        = "service/${var.cluster_name}/grafana-otel-lgtm"
+  scalable_dimension = "ecs:service:DesiredCount"
+  service_namespace  = "ecs"
+
+  depends_on = [aws_ecs_service.grafana_otel]
+}
+
+# Auto Scaling Policy - Scale Up based on CPU (conservative for monitoring)
+resource "aws_appautoscaling_policy" "grafana_scale_up" {
+  name               = "grafana-scale-up"
+  policy_type        = "TargetTrackingScaling"
+  resource_id        = aws_appautoscaling_target.grafana_target.resource_id
+  scalable_dimension = aws_appautoscaling_target.grafana_target.scalable_dimension
+  service_namespace  = aws_appautoscaling_target.grafana_target.service_namespace
+
+  target_tracking_scaling_policy_configuration {
+    target_value = 80.0
+
+    predefined_metric_specification {
+      predefined_metric_type = "ECSServiceAverageCPUUtilization"
+    }
+
+    scale_out_cooldown = 600 # 10 minutes
+    scale_in_cooldown  = 300 # 5 minutes
+  }
+}
diff --git a/iac/outputs.tf b/iac/outputs.tf
index bdf86e39c7..a4cc4662f5 100644
--- a/iac/outputs.tf
+++ b/iac/outputs.tf
@@ -93,4 +93,93 @@ output "migration_instructions" {
 output "service_discovery_migration" {
   description = "Service discovery migration instructions (since Entra App Proxy cannot be modified)"
   value       = local.migration_commands
+}
+
+# Grafana OTEL Monitoring Stack Outputs
+
+output "grafana_dashboard_url" {
+  description = "Grafana dashboard URL (accessible via VPN)"
+  value       = "http://otel-monitor.ggai:3000"
+}
+
+output "grafana_admin_credentials" {
+  description = "Grafana admin login credentials"
+  value = {
+    username = "admin"
+    password = var.grafana_admin_password
+  }
+  sensitive = true
+}
+
+output "otlp_endpoints" {
+  description = "OpenTelemetry OTLP endpoints for telemetry data"
+  value = {
+    grpc = "http://otel-monitor.ggai:4317"
+    http = "http://otel-monitor.ggai:4318"
+  }
+}
+
+output "grafana_service_discovery" {
+  description = "Grafana service discovery details"
+  value = {
+    namespace    = "ggai"
+    service_name = "otel-monitor"
+    endpoints = {
+      grafana_ui = "http://otel-monitor.ggai:3000"
+      otlp_grpc  = "http://otel-monitor.ggai:4317"
+      otlp_http  = "http://otel-monitor.ggai:4318"
+    }
+  }
+}
+
+output "monitoring_setup_instructions" {
+  description = "Instructions for accessing and configuring monitoring"
+  value       = <<-EOT
+
+    === GRAFANA OTEL MONITORING SETUP ===
+
+    1. VERIFICATION COMMANDS (run from within VPC):
+    nslookup otel-monitor.ggai
+    curl http://otel-monitor.ggai:3000
+
+    2. GRAFANA ACCESS (via VPN):
+    URL: http://otel-monitor.ggai:3000
+    Username: admin
+    Password: run `terraform output -json grafana_admin_credentials`
+
+    3. OPENWEBUI TELEMETRY CONFIGURATION:
+    - OpenTelemetry is automatically enabled in the scaled service
+    - Traces: Enabled with 10% sampling rate for performance
+    - Metrics: HTTP requests, duration, active users, database queries
+    - Logs: Integrated with existing CloudWatch logs
+    - Endpoint: http://otel-monitor.ggai:4317 (gRPC)
+
+    4. MONITORING DATA SOURCES:
+    - Prometheus: Pre-configured for metrics
+    - Tempo: Pre-configured for distributed traces
+    - Loki: Pre-configured for logs aggregation
+
+    5. TROUBLESHOOTING:
+    - Check ECS service status: aws ecs describe-services --cluster ${var.cluster_name} --services grafana-otel-lgtm
+    - View Grafana logs: aws logs tail /ecs/grafana-otel-lgtm --follow
+    - Test OTLP connectivity from OpenWebUI tasks
+
+    6. PERFORMANCE INVESTIGATION WORKFLOW:
+    a) Access Grafana via VPN: http://otel-monitor.ggai:3000
+    b) Navigate to "Explore" tab
+    c) Select "Tempo" for distributed tracing
+    d) Query traces with slow response times: {duration > 5s}
+    e) Analyze database queries, HTTP calls, and bottlenecks
+    f) Use "Prometheus" for metrics correlation
+    g) Check "Loki" for error logs during slow periods
+
+    === INTEGRATION VERIFICATION ===
+
+    After deployment, verify telemetry flow:
+    1. Make requests to OpenWebUI: http://ai-scaled.ggai:8080
+    2. Check Grafana traces appear within 30 seconds
+    3. Verify metrics are updating in Grafana dashboards
+    4. Confirm no OTLP errors in OpenWebUI logs
+
+  EOT
 }
\ No newline at end of file
diff --git a/iac/service_discovery.tf b/iac/service_discovery.tf
index ed35869a54..ff81beaed7 100644
--- a/iac/service_discovery.tf
+++ b/iac/service_discovery.tf
@@ -48,31 +48,69 @@ resource "aws_service_discovery_instance" "alb_instance" {
   }
 }
 
+# Create service discovery service for Grafana OTEL monitoring
+resource "aws_service_discovery_service" "otel_monitor" {
+  name = "otel-monitor"
+
+  dns_config {
+    namespace_id = data.aws_service_discovery_dns_namespace.ggai.id
+
+    dns_records {
+      ttl  = 60
+      type = "A"
+    }
+
+    routing_policy = "MULTIVALUE"
+  }
+
+  health_check_custom_config {
+    failure_threshold = 1
+  }
+
+  description = "Grafana OTEL LGTM monitoring stack - direct ECS access"
+
+  tags = {
+    Name = "OTEL Monitor Service Discovery"
+  }
+}
+
 # Output for manual migration
 locals {
   migration_commands = <<-EOT
-    
+
     === SERVICE DISCOVERY MIGRATION ===
-    
+
     CURRENT SETUP:
     - Entra App Proxy → ai.ggai:8080 → Single task
-    
+
     NEW SETUP OPTIONS:
-    
+
     Option 1: Create new endpoint (RECOMMENDED)
     - Entra App Proxy → ai-scaled.ggai:8080 → ALB → Multiple tasks
     - Test endpoint: ai-scaled.ggai:8080
     - Safer migration with rollback capability
-    
+
     Option 2: Update existing endpoint (RISKIER)
     - Requires manual AWS CLI commands to update existing service discovery
     - Direct replacement of ai.ggai:8080 records
-    
-    TESTING COMMAND (from within VPC):
-    nslookup ai-scaled.ggai
-    
-    MIGRATION VERIFICATION:
-    curl -H "Host: ai-glondon.msappproxy.net" http://ai-scaled.ggai:8080/health
-    
+
+    === GRAFANA OTEL MONITORING ===
+
+    NEW MONITORING ENDPOINTS:
+    - Grafana Dashboard: http://otel-monitor.ggai:3000
+    - OTLP gRPC Endpoint: http://otel-monitor.ggai:4317
+    - OTLP HTTP Endpoint: http://otel-monitor.ggai:4318
+
+    TESTING COMMANDS (from within VPC):
+    nslookup ai-scaled.ggai
+    nslookup otel-monitor.ggai
+
+    MIGRATION VERIFICATION:
+    curl -H "Host: ai-glondon.msappproxy.net" http://ai-scaled.ggai:8080/health
+
+    GRAFANA ACCESS (via VPN):
+    curl http://otel-monitor.ggai:3000
+    # Login: admin / password from the sensitive grafana_admin_credentials output
+
   EOT
 }