mirror of
https://github.com/open-webui/open-webui.git
synced 2025-12-12 04:15:25 +00:00
feat: setup otel monitoring infra
This commit is contained in:
parent
9c7969466b
commit
b5ec91f9d9
4 changed files with 426 additions and 12 deletions
53
iac/ecs.tf
53
iac/ecs.tf
|
|
@ -207,6 +207,59 @@ resource "aws_ecs_task_definition" "webui_scaled" {
|
|||
{
|
||||
name = "FASTEMBED_CACHE_PATH"
|
||||
value = "/app/backend/data/cache/fastembed"
|
||||
},
|
||||
# OpenTelemetry Configuration
|
||||
{
|
||||
name = "ENABLE_OTEL"
|
||||
value = "true"
|
||||
},
|
||||
{
|
||||
name = "ENABLE_OTEL_METRICS"
|
||||
value = "true"
|
||||
},
|
||||
{
|
||||
name = "ENABLE_OTEL_TRACES"
|
||||
value = "true"
|
||||
},
|
||||
{
|
||||
name = "ENABLE_OTEL_LOGS"
|
||||
value = "true"
|
||||
},
|
||||
{
|
||||
name = "OTEL_EXPORTER_OTLP_ENDPOINT"
|
||||
value = "http://otel-monitor.ggai:4317"
|
||||
},
|
||||
{
|
||||
name = "OTEL_EXPORTER_OTLP_INSECURE"
|
||||
value = "true"
|
||||
},
|
||||
{
|
||||
name = "OTEL_OTLP_SPAN_EXPORTER"
|
||||
value = "grpc"
|
||||
},
|
||||
{
|
||||
name = "OTEL_METRICS_OTLP_SPAN_EXPORTER"
|
||||
value = "grpc"
|
||||
},
|
||||
{
|
||||
name = "OTEL_LOGS_OTLP_SPAN_EXPORTER"
|
||||
value = "grpc"
|
||||
},
|
||||
{
|
||||
name = "OTEL_SERVICE_NAME"
|
||||
value = "open-webui-production"
|
||||
},
|
||||
{
|
||||
name = "OTEL_RESOURCE_ATTRIBUTES"
|
||||
value = "service.version=${var.container_image},deployment.environment=production,service.instance.id=ecs-fargate"
|
||||
},
|
||||
{
|
||||
name = "OTEL_TRACES_SAMPLER"
|
||||
value = "traceidratio"
|
||||
},
|
||||
{
|
||||
name = "OTEL_TRACES_SAMPLER_ARG"
|
||||
value = "0.1"
|
||||
}
|
||||
]
|
||||
|
||||
|
|
|
|||
234
iac/grafana-ecs.tf
Normal file
234
iac/grafana-ecs.tf
Normal file
|
|
@ -0,0 +1,234 @@
|
|||
# CloudWatch log group for Grafana OTEL
|
||||
# Log group used by the Grafana OTEL LGTM container's awslogs driver.
resource "aws_cloudwatch_log_group" "grafana_logs" {
  # Container logs are kept for one week only.
  retention_in_days = 7
  name              = "/ecs/grafana-otel-lgtm"

  tags = { Name = "Grafana OTEL Logs" }
}
|
||||
|
||||
# Security Group for direct Grafana ECS access
|
||||
# Security group attached to the Grafana OTEL LGTM ECS tasks.
#
# Inbound:
#   - 3000/tcp (Grafana UI) from the VPN CIDR
#   - 4317/tcp (OTLP gRPC) from the OpenWebUI ECS security group
#   - 4318/tcp (OTLP HTTP) from the OpenWebUI ECS security group
# Outbound: unrestricted.
resource "aws_security_group" "grafana_ecs_direct_sg" {
  name_prefix = "grafana-ecs-direct-"
  vpc_id      = var.vpc_id
  description = "Security group for direct Grafana OTEL ECS access"

  # Allow Grafana UI access from VPN
  ingress {
    description = "Grafana UI from VPN"
    from_port   = 3000
    to_port     = 3000
    protocol    = "tcp"
    cidr_blocks = [var.gg_vpn_cidr]
  }

  # Allow OTLP gRPC from OpenWebUI ECS tasks
  ingress {
    description     = "OTLP gRPC from OpenWebUI ECS"
    from_port       = 4317
    to_port         = 4317
    protocol        = "tcp"
    security_groups = [aws_security_group.ecs_scaled_sg.id]
  }

  # Allow OTLP HTTP from OpenWebUI ECS tasks
  ingress {
    description     = "OTLP HTTP from OpenWebUI ECS"
    from_port       = 4318
    to_port         = 4318
    protocol        = "tcp"
    security_groups = [aws_security_group.ecs_scaled_sg.id]
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    Name = "Grafana ECS Direct Access Security Group"
  }

  # name_prefix yields a fresh unique name on replacement, so the new group can
  # be created and attached before the old one is removed. Without this, a
  # forced replacement tries to delete a group that is still referenced by the
  # running ECS service.
  lifecycle {
    create_before_destroy = true
  }
}
|
||||
|
||||
# IAM role for Grafana ECS tasks (reuse existing execution role)
|
||||
# The existing openwebui_execution_role should have sufficient permissions
|
||||
|
||||
# ECS Task Definition for Grafana OTEL LGTM
|
||||
# Fargate task definition for the single-container grafana/otel-lgtm stack.
# The container publishes the Grafana UI on 3000 and OTLP receivers on
# 4317 (gRPC) and 4318 (HTTP).
resource "aws_ecs_task_definition" "grafana_otel" {
  family                   = "grafana-otel-lgtm"
  requires_compatibilities = ["FARGATE"]
  network_mode             = "awsvpc"

  # Task-level sizing: 1024 CPU units, 2048 MiB memory.
  cpu    = 1024
  memory = 2048

  # The OpenWebUI execution role is reused as both execution role and task role.
  execution_role_arn = aws_iam_role.openwebui_execution_role.arn
  task_role_arn      = aws_iam_role.openwebui_execution_role.arn

  container_definitions = jsonencode([
    {
      name      = "grafana-otel-lgtm"
      essential = true
      cpu       = 0

      # NOTE(review): "latest" is a floating tag; consider pinning a release.
      image = "grafana/otel-lgtm:latest"

      systemControls = []

      portMappings = [
        {
          name          = "grafana-ui"
          appProtocol   = "http"
          protocol      = "tcp"
          containerPort = 3000
          hostPort      = 3000
        },
        {
          name          = "otlp-grpc"
          protocol      = "tcp"
          containerPort = 4317
          hostPort      = 4317
        },
        {
          name          = "otlp-http"
          appProtocol   = "http"
          protocol      = "tcp"
          containerPort = 4318
          hostPort      = 4318
        }
      ]

      # NOTE(review): the admin password is a plaintext literal in source and
      # in the rendered task definition; consider moving it to a secret store.
      environment = [
        { name = "GF_SECURITY_ADMIN_PASSWORD", value = "openwebui_monitoring_2024" },
        { name = "GF_SECURITY_ADMIN_USER", value = "admin" },
        { name = "GF_INSTALL_PLUGINS", value = "" },
        { name = "GF_FEATURE_TOGGLES_ENABLE", value = "traceqlEditor" }
      ]

      # Ship container output to the Grafana CloudWatch log group without
      # blocking the application when the log buffer fills.
      logConfiguration = {
        logDriver = "awslogs"
        options = {
          "awslogs-group"         = aws_cloudwatch_log_group.grafana_logs.name
          "awslogs-region"        = var.aws_region
          "awslogs-stream-prefix" = "grafana"
          "awslogs-create-group"  = "true"
          "mode"                  = "non-blocking"
          "max-buffer-size"       = "25m"
        }
      }

      # Container is healthy once Grafana's /api/health answers successfully.
      healthCheck = {
        command     = ["CMD-SHELL", "curl --silent --fail http://localhost:3000/api/health || exit 1"]
        startPeriod = 60
        interval    = 30
        timeout     = 10
        retries     = 3
      }
    }
  ])

  tags = { Name = "Grafana OTEL LGTM Task Definition" }
}
|
||||
|
||||
# ECS Service for Grafana OTEL
|
||||
# ECS service that keeps the Grafana OTEL LGTM task running on Fargate in the
# private subnets and registers it with the otel-monitor service discovery
# entry (no load balancer).
resource "aws_ecs_service" "grafana_otel" {
  name            = "grafana-otel-lgtm"
  cluster         = var.cluster_name
  task_definition = aws_ecs_task_definition.grafana_otel.arn
  desired_count   = 1

  # `triggers` is paired with force_new_deployment, following the AWS
  # provider's documented redeploy example, so that a change to the container
  # definitions forces a redeployment.
  force_new_deployment = true

  triggers = {
    redeployment = sha1(jsonencode(aws_ecs_task_definition.grafana_otel.container_definitions))
  }

  capacity_provider_strategy {
    capacity_provider = "FARGATE"
    weight            = 1
    base              = 0
  }

  platform_version = "LATEST"

  # Rolling update: start the replacement task before stopping the old one.
  deployment_maximum_percent         = 200
  deployment_minimum_healthy_percent = 100

  # Stop a failing deployment, but do not roll back automatically.
  deployment_circuit_breaker {
    enable   = true
    rollback = false
  }

  network_configuration {
    subnets          = var.private_subnet_ids
    security_groups  = [aws_security_group.grafana_ecs_direct_sg.id]
    assign_public_ip = false
  }

  # No load balancer - using direct service discovery
  service_registries {
    registry_arn = aws_service_discovery_service.otel_monitor.arn
  }

  # Use the ECS rolling-update deployment controller
  deployment_controller {
    type = "ECS"
  }

  # Enable execute command for debugging
  enable_execute_command = true

  tags = {
    Name = "Grafana OTEL LGTM Service"
  }

  # desired_count is adjusted by Application Auto Scaling after creation.
  lifecycle {
    ignore_changes = [desired_count]
  }
}
|
||||
|
||||
# Auto Scaling Target for Grafana (optional, keep at 1 for now)
|
||||
# Registers the Grafana ECS service's desired count as a scalable target
# bounded to 1-2 tasks.
#
# NOTE(review): the task runs an all-in-one image; confirm that a second task
# behind the same service discovery name is actually wanted before relying on
# max_capacity = 2.
resource "aws_appautoscaling_target" "grafana_target" {
  max_capacity = 2
  min_capacity = 1

  # Derive the service name from the service resource so the two cannot drift
  # apart; the reference also gives Terraform the ordering that the explicit
  # depends_on used to provide.
  resource_id        = "service/${var.cluster_name}/${aws_ecs_service.grafana_otel.name}"
  scalable_dimension = "ecs:service:DesiredCount"
  service_namespace  = "ecs"
}
|
||||
|
||||
# Auto Scaling Policy - Scale Up based on CPU (conservative for monitoring)
|
||||
# Target-tracking policy for the Grafana service: holds average service CPU
# utilisation around 80%.
resource "aws_appautoscaling_policy" "grafana_scale_up" {
  name        = "grafana-scale-up"
  policy_type = "TargetTrackingScaling"

  # Bind the policy to the scalable target registered for the Grafana service.
  service_namespace  = aws_appautoscaling_target.grafana_target.service_namespace
  scalable_dimension = aws_appautoscaling_target.grafana_target.scalable_dimension
  resource_id        = aws_appautoscaling_target.grafana_target.resource_id

  target_tracking_scaling_policy_configuration {
    predefined_metric_specification {
      predefined_metric_type = "ECSServiceAverageCPUUtilization"
    }

    target_value = 80.0

    # Cooldowns in seconds: 5 minutes before scaling in, 10 before scaling out.
    scale_in_cooldown  = 300
    scale_out_cooldown = 600
  }
}
|
||||
|
|
@ -93,4 +93,93 @@ output "migration_instructions" {
|
|||
# Surfaces the migration runbook text held in local.migration_commands.
output "service_discovery_migration" {
  value       = local.migration_commands
  description = "Service discovery migration instructions (since Entra App Proxy cannot be modified)"
}
|
||||
|
||||
# Grafana OTEL Monitoring Stack Outputs
|
||||
|
||||
# Fixed URL of the Grafana UI (port 3000) under the otel-monitor.ggai name.
output "grafana_dashboard_url" {
  value       = "http://otel-monitor.ggai:3000"
  description = "Grafana dashboard URL (accessible via VPN)"
}
|
||||
|
||||
# Grafana admin login, marked sensitive so Terraform redacts it in plan/apply
# output.
# NOTE(review): the password is a literal in source; it has to be kept in step
# by hand with whatever the Grafana container is configured with.
output "grafana_admin_credentials" {
  sensitive   = true
  description = "Grafana admin login credentials"

  value = {
    username = "admin"
    password = "openwebui_monitoring_2024"
  }
}
|
||||
|
||||
# OTLP ingest URLs on the otel-monitor.ggai name: 4317 for gRPC, 4318 for HTTP.
output "otlp_endpoints" {
  value = {
    grpc = "http://otel-monitor.ggai:4317"
    http = "http://otel-monitor.ggai:4318"
  }

  description = "OpenTelemetry OTLP endpoints for telemetry data"
}
|
||||
|
||||
# Namespace, service name and per-port URLs of the monitoring stack, as fixed
# strings.
output "grafana_service_discovery" {
  value = {
    namespace    = "ggai"
    service_name = "otel-monitor"

    endpoints = {
      grafana_ui = "http://otel-monitor.ggai:3000"
      otlp_grpc  = "http://otel-monitor.ggai:4317"
      otlp_http  = "http://otel-monitor.ggai:4318"
    }
  }

  description = "Grafana service discovery details"
}
|
||||
|
||||
# Human-readable runbook for reaching Grafana and verifying telemetry flow.
#
# This output is not marked sensitive, so it must not contain secrets: the
# admin password is deliberately not printed here. Operators are pointed at
# the sensitive grafana_admin_credentials output instead.
output "monitoring_setup_instructions" {
  description = "Instructions for accessing and configuring monitoring"
  value       = <<-EOT

    === GRAFANA OTEL MONITORING SETUP ===

    1. VERIFICATION COMMANDS (run from within VPC):
    nslookup otel-monitor.ggai
    curl http://otel-monitor.ggai:3000

    2. GRAFANA ACCESS (via VPN):
    URL: http://otel-monitor.ggai:3000
    Username: admin
    Password: run `terraform output -json grafana_admin_credentials`

    3. OPENWEBUI TELEMETRY CONFIGURATION:
    - OpenTelemetry is automatically enabled in the scaled service
    - Traces: Enabled with 10% sampling rate for performance
    - Metrics: HTTP requests, duration, active users, database queries
    - Logs: Integrated with existing CloudWatch logs
    - Endpoint: http://otel-monitor.ggai:4317 (gRPC)

    4. MONITORING DATA SOURCES:
    - Prometheus: Pre-configured for metrics
    - Tempo: Pre-configured for distributed traces
    - Loki: Pre-configured for logs aggregation

    5. TROUBLESHOOTING:
    - Check ECS service status: aws ecs describe-services --cluster ${var.cluster_name} --services grafana-otel-lgtm
    - View Grafana logs: aws logs tail /ecs/grafana-otel-lgtm --follow
    - Test OTLP connectivity from OpenWebUI tasks

    6. PERFORMANCE INVESTIGATION WORKFLOW:
    a) Access Grafana via VPN: http://otel-monitor.ggai:3000
    b) Navigate to "Explore" tab
    c) Select "Tempo" for distributed tracing
    d) Query traces with slow response times: {duration > 5s}
    e) Analyze database queries, HTTP calls, and bottlenecks
    f) Use "Prometheus" for metrics correlation
    g) Check "Loki" for error logs during slow periods

    === INTEGRATION VERIFICATION ===

    After deployment, verify telemetry flow:
    1. Make requests to OpenWebUI: http://ai-scaled.ggai:8080
    2. Check Grafana traces appear within 30 seconds
    3. Verify metrics are updating in Grafana dashboards
    4. Confirm no OTLP errors in OpenWebUI logs

  EOT
}
|
||||
|
|
@ -48,31 +48,69 @@ resource "aws_service_discovery_instance" "alb_instance" {
|
|||
}
|
||||
}
|
||||
|
||||
# Create service discovery service for Grafana OTEL monitoring
|
||||
# Service discovery entry named "otel-monitor" in the ggai DNS namespace.
# The Grafana ECS service registers its tasks here.
resource "aws_service_discovery_service" "otel_monitor" {
  name        = "otel-monitor"
  description = "Grafana OTEL LGTM monitoring stack - direct ECS access"

  dns_config {
    namespace_id   = data.aws_service_discovery_dns_namespace.ggai.id
    routing_policy = "MULTIVALUE"

    # A records with a 60-second TTL.
    dns_records {
      type = "A"
      ttl  = 60
    }
  }

  health_check_custom_config {
    failure_threshold = 1
  }

  tags = { Name = "OTEL Monitor Service Discovery" }
}
|
||||
|
||||
# Output for manual migration
|
||||
# Runbook text exposed through the service_discovery_migration output.
#
# That output is not marked sensitive, so this text must not contain secrets:
# the Grafana admin password is referenced via the sensitive
# grafana_admin_credentials output rather than spelled out. The testing and
# verification commands appear once, in the monitoring section, which lists
# every command the shorter earlier copy had.
locals {
  migration_commands = <<-EOT

    === SERVICE DISCOVERY MIGRATION ===

    CURRENT SETUP:
    - Entra App Proxy → ai.ggai:8080 → Single task

    NEW SETUP OPTIONS:

    Option 1: Create new endpoint (RECOMMENDED)
    - Entra App Proxy → ai-scaled.ggai:8080 → ALB → Multiple tasks
    - Test endpoint: ai-scaled.ggai:8080
    - Safer migration with rollback capability

    Option 2: Update existing endpoint (RISKIER)
    - Requires manual AWS CLI commands to update existing service discovery
    - Direct replacement of ai.ggai:8080 records

    === GRAFANA OTEL MONITORING ===

    NEW MONITORING ENDPOINTS:
    - Grafana Dashboard: http://otel-monitor.ggai:3000
    - OTLP gRPC Endpoint: http://otel-monitor.ggai:4317
    - OTLP HTTP Endpoint: http://otel-monitor.ggai:4318

    TESTING COMMANDS (from within VPC):
    nslookup ai-scaled.ggai
    nslookup otel-monitor.ggai

    MIGRATION VERIFICATION:
    curl -H "Host: ai-glondon.msappproxy.net" http://ai-scaled.ggai:8080/health

    GRAFANA ACCESS (via VPN):
    curl http://otel-monitor.ggai:3000
    # Admin login: run `terraform output -json grafana_admin_credentials`

  EOT
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue