# CloudWatch log group that receives the Grafana OTEL container logs
# (referenced by the task definition's awslogs configuration below).
resource "aws_cloudwatch_log_group" "grafana_logs" {
  name              = "/ecs/grafana-otel-lgtm"
  retention_in_days = 7

  tags = {
    Name = "Grafana OTEL Logs"
  }
}

# Security group for direct (no load balancer) access to the Grafana ECS tasks.
resource "aws_security_group" "grafana_ecs_direct_sg" {
  name_prefix = "grafana-ecs-direct-"
  vpc_id      = var.vpc_id
  description = "Security group for direct Grafana OTEL ECS access"

  # Grafana UI, reachable only from the VPN CIDR.
  ingress {
    description = "Grafana UI from VPN"
    from_port   = 3000
    to_port     = 3000
    protocol    = "tcp"
    cidr_blocks = [var.gg_vpn_cidr]
  }

  # OTLP gRPC from the OpenWebUI ECS tasks.
  ingress {
    description     = "OTLP gRPC from OpenWebUI ECS"
    from_port       = 4317
    to_port         = 4317
    protocol        = "tcp"
    security_groups = [aws_security_group.ecs_scaled_sg.id]
  }

  # OTLP HTTP from the OpenWebUI ECS tasks.
  ingress {
    description     = "OTLP HTTP from OpenWebUI ECS"
    from_port       = 4318
    to_port         = 4318
    protocol        = "tcp"
    security_groups = [aws_security_group.ecs_scaled_sg.id]
  }

  # Unrestricted outbound traffic (all protocols, all destinations).
  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    Name = "Grafana ECS Direct Access Security Group"
  }

  # The group is named via name_prefix, so a replacement can be created
  # alongside the old one. Create-before-destroy lets the ECS service move to
  # the new group before Terraform tries to delete the one still in use.
  lifecycle {
    create_before_destroy = true
  }
}

# IAM: the Grafana tasks reuse the existing OpenWebUI execution role for both
# the execution role and the task role.
# NOTE(review): enable_execute_command on the service needs SSM messaging
# permissions on the *task* role - confirm the shared role grants them, or
# split out a dedicated least-privilege task role.

# ECS task definition for the Grafana OTEL LGTM container (Fargate, awsvpc).
resource "aws_ecs_task_definition" "grafana_otel" {
  family                   = "grafana-otel-lgtm"
  network_mode             = "awsvpc"
  requires_compatibilities = ["FARGATE"]
  cpu                      = 1024
  memory                   = 2048
  execution_role_arn       = aws_iam_role.openwebui_execution_role.arn
  task_role_arn            = aws_iam_role.openwebui_execution_role.arn

  container_definitions = jsonencode([
    {
      name = "grafana-otel-lgtm"
      # NOTE(review): ":latest" is a floating tag; consider pinning a specific
      # version or digest so deployments are reproducible.
      image     = "grafana/otel-lgtm:latest"
      cpu       = 0
      essential = true

      portMappings = [
        {
          containerPort = 3000
          hostPort      = 3000
          protocol      = "tcp"
          name          = "grafana-ui"
          appProtocol   = "http"
        },
        {
          containerPort = 4317
          hostPort      = 4317
          protocol      = "tcp"
          name          = "otlp-grpc"
        },
        {
          containerPort = 4318
          hostPort      = 4318
          protocol      = "tcp"
          name          = "otlp-http"
          appProtocol   = "http"
        }
      ]

      environment = [
        {
          # NOTE(review): the admin password is committed in plaintext and is
          # visible in the task definition. Move it to Secrets Manager / SSM
          # and inject it through the container "secrets" list, then rotate it.
          name  = "GF_SECURITY_ADMIN_PASSWORD"
          value = "openwebui_monitoring_2024"
        },
        {
          name  = "GF_SECURITY_ADMIN_USER"
          value = "admin"
        },
        {
          name  = "GF_INSTALL_PLUGINS"
          value = ""
        },
        {
          name  = "GF_FEATURE_TOGGLES_ENABLE"
          value = "traceqlEditor"
        }
      ]

      logConfiguration = {
        logDriver = "awslogs"
        options = {
          "awslogs-group"         = aws_cloudwatch_log_group.grafana_logs.name
          "mode"                  = "non-blocking"
          "awslogs-create-group"  = "true"
          "max-buffer-size"       = "25m"
          "awslogs-region"        = var.aws_region
          "awslogs-stream-prefix" = "grafana"
        }
      }

      # Container-level health check against the Grafana health endpoint.
      # NOTE(review): assumes curl is available inside the image - confirm.
      healthCheck = {
        command = [
          "CMD-SHELL",
          "curl --silent --fail http://localhost:3000/api/health || exit 1"
        ]
        interval    = 30
        timeout     = 10
        retries     = 3
        startPeriod = 60
      }

      systemControls = []
    }
  ])

  tags = {
    Name = "Grafana OTEL LGTM Task Definition"
  }
}

# ECS service running the Grafana OTEL task on Fargate in the private subnets.
resource "aws_ecs_service" "grafana_otel" {
  name            = "grafana-otel-lgtm"
  cluster         = var.cluster_name
  task_definition = aws_ecs_task_definition.grafana_otel.arn
  desired_count   = 1

  # Hash of the container definitions; its value changes whenever they change.
  triggers = {
    redeployment = sha1(jsonencode(aws_ecs_task_definition.grafana_otel.container_definitions))
  }

  capacity_provider_strategy {
    capacity_provider = "FARGATE"
    weight            = 1
    base              = 0
  }

  platform_version = "LATEST"

  deployment_maximum_percent         = 200
  deployment_minimum_healthy_percent = 100

  # Deployment circuit breaker is enabled; automatic rollback is turned off.
  deployment_circuit_breaker {
    enable   = true
    rollback = false
  }

  network_configuration {
    subnets          = var.private_subnet_ids
    security_groups  = [aws_security_group.grafana_ecs_direct_sg.id]
    assign_public_ip = false
  }

  # No load balancer - tasks are registered with service discovery instead.
  service_registries {
    registry_arn = aws_service_discovery_service.otel_monitor.arn
  }

  # Use the ECS-managed deployment controller.
  deployment_controller {
    type = "ECS"
  }

  # Enable execute command for debugging.
  enable_execute_command = true

  tags = {
    Name = "Grafana OTEL LGTM Service"
  }

  # desired_count is adjusted by Application Auto Scaling (see the scaling
  # target below), so Terraform must not reset it on every apply.
  lifecycle {
    ignore_changes = [desired_count]
  }
}

# Application Auto Scaling target for the Grafana service (1 to 2 tasks).
# NOTE(review): an earlier comment said "keep at 1 for now", but max_capacity
# is 2 - confirm that running two independent Grafana/LGTM tasks is intended.
resource "aws_appautoscaling_target" "grafana_target" {
  max_capacity = 2
  min_capacity = 1
  # Referencing the service name (instead of repeating the literal) keeps the
  # id in sync with the service and makes Terraform order creation correctly.
  resource_id        = "service/${var.cluster_name}/${aws_ecs_service.grafana_otel.name}"
  scalable_dimension = "ecs:service:DesiredCount"
  service_namespace  = "ecs"
}

# Target-tracking policy on average service CPU. Despite the "scale_up" name,
# a target-tracking policy adjusts the task count in both directions.
resource "aws_appautoscaling_policy" "grafana_scale_up" {
  name               = "grafana-scale-up"
  policy_type        = "TargetTrackingScaling"
  resource_id        = aws_appautoscaling_target.grafana_target.resource_id
  scalable_dimension = aws_appautoscaling_target.grafana_target.scalable_dimension
  service_namespace  = aws_appautoscaling_target.grafana_target.service_namespace

  target_tracking_scaling_policy_configuration {
    target_value = 80.0

    predefined_metric_specification {
      predefined_metric_type = "ECSServiceAverageCPUUtilization"
    }

    scale_out_cooldown = 600 # seconds (10 minutes)
    scale_in_cooldown  = 300 # seconds (5 minutes)
  }
}