mirror of
https://github.com/open-webui/open-webui.git
synced 2025-12-14 05:15:18 +00:00
feat(infra): add common module Grafana
This commit is contained in:
parent
b5ec91f9d9
commit
e917f7eb67
5 changed files with 858 additions and 0 deletions
231
iac/modules/grafana-otel/README.md
Normal file
231
iac/modules/grafana-otel/README.md
Normal file
|
|
@ -0,0 +1,231 @@
|
|||
# Grafana OTEL Module
|
||||
|
||||
This Terraform module deploys a standalone Grafana OTEL LGTM (Loki, Grafana, Tempo, Mimir) stack on AWS ECS Fargate for OpenTelemetry monitoring and observability.
|
||||
|
||||
## Features
|
||||
|
||||
- **Complete OTEL Stack**: Grafana + Prometheus + Tempo + Loki in a single container
|
||||
- **ECS Fargate Deployment**: Serverless, scalable container deployment
|
||||
- **Service Discovery**: Automatic DNS registration for easy service connectivity
|
||||
- **Security**: Configurable security groups and network access controls
|
||||
- **Auto Scaling**: Optional ECS autoscaling based on CPU utilization
|
||||
- **CloudWatch Integration**: Structured logging with configurable retention
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐
|
||||
│ Applications │───▶│ OTLP Endpoints │───▶│ Grafana UI │
|
||||
│ │ │ (4317/4318) │ │ (3000) │
|
||||
└─────────────────┘ └──────────────────┘ └─────────────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────┐
|
||||
│ ECS Fargate │
|
||||
│ - Grafana │
|
||||
│ - Prometheus │
|
||||
│ - Tempo │
|
||||
│ - Loki │
|
||||
└──────────────────┘
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```hcl
|
||||
module "grafana_monitoring" {
|
||||
source = "./modules/grafana-otel"
|
||||
|
||||
# Core Infrastructure
|
||||
vpc_id = "vpc-12345678"
|
||||
private_subnet_ids = ["subnet-12345678", "subnet-87654321"]
|
||||
cluster_name = "my-ecs-cluster"
|
||||
|
||||
# Network Access
|
||||
allowed_cidr_blocks = ["10.0.0.0/8", "192.168.0.0/16"]
|
||||
|
||||
# Optional: OpenTelemetry Sources
|
||||
otlp_sources_security_group_ids = ["sg-app1", "sg-app2"]
|
||||
|
||||
tags = {
|
||||
Environment = "production"
|
||||
Project = "monitoring"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Advanced Usage with Existing Service Discovery
|
||||
|
||||
```hcl
|
||||
module "grafana_monitoring" {
|
||||
source = "./modules/grafana-otel"
|
||||
|
||||
# Core Infrastructure
|
||||
vpc_id = "vpc-12345678"
|
||||
private_subnet_ids = ["subnet-12345678", "subnet-87654321"]
|
||||
cluster_name = "my-ecs-cluster"
|
||||
|
||||
# Use existing service discovery namespace
|
||||
service_discovery_namespace_id = "ns-12345678"
|
||||
service_name = "monitoring"
|
||||
|
||||
# Custom configuration
|
||||
environment = "staging"
|
||||
cpu = 2048
|
||||
memory = 4096
|
||||
desired_count = 2
|
||||
enable_autoscaling = true
|
||||
max_capacity = 3
|
||||
|
||||
# Custom Grafana credentials
|
||||
grafana_admin_user = "monitoring-admin"
|
||||
grafana_admin_password = "secure-password-123"
|
||||
|
||||
tags = {
|
||||
Environment = "staging"
|
||||
Project = "monitoring"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
| Name | Version |
|
||||
|------|---------|
|
||||
| terraform | >= 1.0 |
|
||||
| aws | >= 5.0 |
|
||||
|
||||
## Providers
|
||||
|
||||
| Name | Version |
|
||||
|------|---------|
|
||||
| aws | >= 5.0 |
|
||||
|
||||
## Resources Created
|
||||
|
||||
- **ECS Service & Task Definition**: Fargate-based Grafana OTEL LGTM container
|
||||
- **Service Discovery**: DNS service registration for easy connectivity
|
||||
- **Security Groups**: Network access controls for Grafana UI and OTLP endpoints
|
||||
- **IAM Roles**: Execution role with necessary permissions
|
||||
- **CloudWatch Log Group**: Centralized logging with configurable retention
|
||||
- **Auto Scaling** (optional): CPU-based scaling for high availability
|
||||
|
||||
## Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|------|---------|:--------:|
|
||||
| vpc_id | VPC ID where Grafana will be deployed | `string` | n/a | yes |
|
||||
| private_subnet_ids | Private subnet IDs for Grafana ECS tasks | `list(string)` | n/a | yes |
|
||||
| cluster_name | ECS cluster name where Grafana will be deployed | `string` | n/a | yes |
|
||||
| aws_region | AWS region for deployment | `string` | `"us-east-1"` | no |
|
||||
| allowed_cidr_blocks | CIDR blocks allowed to access Grafana UI | `list(string)` | `["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"]` | no |
|
||||
| otlp_sources_security_group_ids | Security group IDs that should be allowed to send OTLP data | `list(string)` | `[]` | no |
|
||||
| grafana_admin_user | Grafana admin username | `string` | `"admin"` | no |
|
||||
| grafana_admin_password | Grafana admin password | `string` | `"openwebui_monitoring_2024"` | no |
|
||||
| cpu | CPU units for Grafana task | `number` | `1024` | no |
|
||||
| memory | Memory (MB) for Grafana task | `number` | `2048` | no |
|
||||
| enable_autoscaling | Enable ECS autoscaling for Grafana | `bool` | `true` | no |
|
||||
|
||||
See [variables.tf](./variables.tf) for complete list of inputs.
|
||||
|
||||
## Outputs
|
||||
|
||||
| Name | Description |
|
||||
|------|-------------|
|
||||
| grafana_dashboard_url | Grafana dashboard URL |
|
||||
| grafana_admin_credentials | Grafana admin login credentials (sensitive) |
|
||||
| otlp_endpoints | OpenTelemetry OTLP endpoints (gRPC and HTTP) |
|
||||
| security_group_id | Security group ID for Grafana tasks |
|
||||
| setup_instructions | Complete setup and integration instructions |
|
||||
|
||||
See [outputs.tf](./outputs.tf) for complete list of outputs.
|
||||
|
||||
## Integration with Applications
|
||||
|
||||
To send telemetry data from your applications to this Grafana instance:
|
||||
|
||||
### 1. Add Application Security Groups
|
||||
|
||||
```hcl
|
||||
module "grafana_monitoring" {
|
||||
source = "./modules/grafana-otel"
|
||||
# ... other configuration
|
||||
|
||||
otlp_sources_security_group_ids = [
|
||||
aws_security_group.my_app.id,
|
||||
aws_security_group.another_app.id
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Configure Application Environment Variables
|
||||
|
||||
```bash
|
||||
# In your application environment
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-monitor.my-namespace:4317
|
||||
OTEL_EXPORTER_OTLP_INSECURE=true
|
||||
OTEL_SERVICE_NAME=my-application
|
||||
```
|
||||
|
||||
### 3. Verify Integration
|
||||
|
||||
```bash
|
||||
# Check service discovery
|
||||
nslookup otel-monitor.my-namespace
|
||||
|
||||
# Test OTLP endpoint
|
||||
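curl -v http://otel-monitor.my-namespace:4318/v1/traces  # OTLP/HTTP port; 4317 speaks gRPC and cannot be probed with plain curl. Any HTTP response proves reachability.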
|
||||
|
||||
# Access Grafana UI
|
||||
curl http://otel-monitor.my-namespace:3000
|
||||
```
|
||||
|
||||
## Monitoring and Troubleshooting
|
||||
|
||||
### Access Grafana Dashboard
|
||||
|
||||
1. Connect to your VPC (via VPN or bastion host)
|
||||
2. Navigate to the Grafana URL from module outputs
|
||||
3. Login with the admin credentials
|
||||
4. Explore pre-configured data sources:
|
||||
- **Prometheus**: Metrics and monitoring
|
||||
- **Tempo**: Distributed tracing
|
||||
- **Loki**: Log aggregation
|
||||
|
||||
### Common Issues
|
||||
|
||||
- **Connection refused**: Check security group rules and CIDR blocks
|
||||
- **Service not starting**: Check CloudWatch logs and ECS service events
|
||||
- **No telemetry data**: Verify OTLP source security groups and endpoints
|
||||
|
||||
### Useful Commands
|
||||
|
||||
```bash
|
||||
# Check ECS service status
|
||||
aws ecs describe-services --cluster my-cluster --services grafana-otel
|
||||
|
||||
# View logs
|
||||
aws logs tail /ecs/grafana-otel --follow
|
||||
|
||||
# Check service discovery
|
||||
aws servicediscovery list-services --filters Name=NAMESPACE_ID,Values=ns-12345678
|
||||
```
|
||||
|
||||
## Security Considerations
|
||||
|
||||
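- Grafana admin password is configurable, but it is stored in Terraform state and passed to the container as a plain-text environment variable (visible in the ECS task definition)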
|
||||
- Consider using AWS Secrets Manager for production passwords
|
||||
- Network access is controlled via security groups and CIDR blocks
|
||||
- ECS tasks run with least privilege IAM permissions
|
||||
|
||||
## Cost Optimization
|
||||
|
||||
- Default configuration uses 1 vCPU and 2GB RAM (estimated $35-50/month)
|
||||
- Enable autoscaling to handle traffic spikes efficiently
|
||||
- Adjust log retention period to control CloudWatch costs
|
||||
- Consider using Fargate Spot for non-production environments (the service currently uses the `FARGATE` capacity provider, so this requires a module change)
|
||||
|
||||
## License
|
||||
|
||||
This module is part of the OpenWebUI infrastructure project.
|
||||
347
iac/modules/grafana-otel/main.tf
Normal file
347
iac/modules/grafana-otel/main.tf
Normal file
|
|
@ -0,0 +1,347 @@
|
|||
# Shared naming and tagging values used by every resource in this module.
locals {
  # Module-level tags first, caller-supplied tags second, so that entries in
  # var.tags take precedence when the same key appears in both maps.
  common_tags = merge({
    ManagedBy   = "terraform"
    Module      = "grafana-otel"
    Environment = var.environment
  }, var.tags)

  # Prefix applied to the names of the resources created by this module.
  name_prefix = var.name_prefix
}
|
||||
|
||||
# Private DNS namespace for service discovery.
# Only created when the caller leaves service_discovery_namespace_id empty;
# otherwise the existing namespace is used and this resource has count 0.
resource "aws_service_discovery_private_dns_namespace" "grafana" {
  count = var.service_discovery_namespace_id != "" ? 0 : 1

  vpc         = var.vpc_id
  name        = var.service_discovery_namespace_name
  description = "Service discovery namespace for Grafana OTEL monitoring"

  tags = merge(local.common_tags, { Name = "${local.name_prefix}-namespace" })
}
|
||||
|
||||
|
||||
# Service discovery entry that publishes the Grafana tasks as DNS A records.
resource "aws_service_discovery_service" "grafana" {
  name        = var.service_name
  description = "Grafana OTEL LGTM monitoring stack service discovery"

  dns_config {
    # Register in the namespace created by this module unless the caller
    # supplied the ID of an existing one.
    namespace_id   = var.service_discovery_namespace_id == "" ? aws_service_discovery_private_dns_namespace.grafana[0].id : var.service_discovery_namespace_id
    routing_policy = "MULTIVALUE"

    dns_records {
      type = "A"
      ttl  = 60
    }
  }

  health_check_custom_config {
    failure_threshold = 1
  }

  tags = merge(local.common_tags, { Name = "${local.name_prefix}-service-discovery" })
}
|
||||
|
||||
# Log group that receives the container's awslogs output.
# Named /ecs/<name_prefix>; retention is controlled by var.log_retention_days.
resource "aws_cloudwatch_log_group" "grafana" {
  retention_in_days = var.log_retention_days
  name              = "/ecs/${local.name_prefix}"

  tags = merge(local.common_tags, { Name = "${local.name_prefix}-logs" })
}
|
||||
|
||||
# Security Group for Grafana ECS Tasks
#
# Ingress:
#   - 3000/tcp (Grafana UI) from var.allowed_cidr_blocks
#   - 4317/tcp (OTLP gRPC) and 4318/tcp (OTLP HTTP) from the security groups
#     listed in var.otlp_sources_security_group_ids
# Egress: unrestricted.
resource "aws_security_group" "grafana" {
  name_prefix = "${local.name_prefix}-"
  vpc_id      = var.vpc_id
  description = "Security group for Grafana OTEL ECS tasks"

  # Allow Grafana UI access from specified CIDR blocks.
  # Emitted only when at least one CIDR is configured, so an empty list does
  # not produce an ingress rule without any source.
  dynamic "ingress" {
    for_each = length(var.allowed_cidr_blocks) > 0 ? [1] : []
    content {
      description = "Grafana UI access"
      from_port   = 3000
      to_port     = 3000
      protocol    = "tcp"
      cidr_blocks = var.allowed_cidr_blocks
    }
  }

  # Allow OTLP gRPC from specified security groups
  dynamic "ingress" {
    for_each = length(var.otlp_sources_security_group_ids) > 0 ? [1] : []
    content {
      description     = "OTLP gRPC from sources"
      from_port       = 4317
      to_port         = 4317
      protocol        = "tcp"
      security_groups = var.otlp_sources_security_group_ids
    }
  }

  # Allow OTLP HTTP from specified security groups
  dynamic "ingress" {
    for_each = length(var.otlp_sources_security_group_ids) > 0 ? [1] : []
    content {
      description     = "OTLP HTTP from sources"
      from_port       = 4318
      to_port         = 4318
      protocol        = "tcp"
      security_groups = var.otlp_sources_security_group_ids
    }
  }

  # Allow all outbound traffic
  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = merge(local.common_tags, {
    Name = "${local.name_prefix}-security-group"
  })

  # The group is named via name_prefix, so a replacement can be created before
  # the old group is destroyed. Without this, replacing the group fails while
  # it is still attached to the running tasks' network interfaces.
  lifecycle {
    create_before_destroy = true
  }
}
|
||||
|
||||
# IAM role assumed by ECS tasks (trust principal: ecs-tasks.amazonaws.com).
# The task definition in this module references it as both the execution role
# and the task role.
resource "aws_iam_role" "grafana_execution" {
  name = "${local.name_prefix}-execution-role"

  # Trust policy: only the ECS tasks service may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect    = "Allow"
        Principal = { Service = "ecs-tasks.amazonaws.com" }
        Action    = "sts:AssumeRole"
      }
    ]
  })

  tags = merge(local.common_tags, { Name = "${local.name_prefix}-execution-role" })
}
|
||||
|
||||
# Grants the role the AWS-managed AmazonECSTaskExecutionRolePolicy.
resource "aws_iam_role_policy_attachment" "grafana_execution_policy" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
  role       = aws_iam_role.grafana_execution.name
}
|
||||
|
||||
# Additional inline policy for the Grafana task role.
#
# Statement 1: CloudWatch Logs write access, scoped to this module's log group.
# Statement 2 (only when var.enable_execute_command is true): the SSM Session
#   Manager channel permissions that ECS Exec requires on the *task* role.
#   The task definition uses aws_iam_role.grafana_execution as its task role,
#   so without these actions `aws ecs execute-command` cannot open a session
#   even though enable_execute_command is set on the service.
resource "aws_iam_role_policy" "grafana_logs_policy" {
  name = "${local.name_prefix}-logs-policy"
  role = aws_iam_role.grafana_execution.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = concat(
      [
        {
          Effect = "Allow"
          Action = [
            "logs:CreateLogGroup",
            "logs:CreateLogStream",
            "logs:PutLogEvents"
          ]
          Resource = "${aws_cloudwatch_log_group.grafana.arn}:*"
        }
      ],
      # Filtered for-expression yields zero or one statement without relying
      # on type unification between the branches of a conditional.
      [
        for stmt in [
          {
            Effect = "Allow"
            Action = [
              "ssmmessages:CreateControlChannel",
              "ssmmessages:CreateDataChannel",
              "ssmmessages:OpenControlChannel",
              "ssmmessages:OpenDataChannel"
            ]
            Resource = "*"
          }
        ] : stmt if var.enable_execute_command
      ]
    )
  })
}
|
||||
|
||||
# Fargate task definition running the single grafana-otel-lgtm container.
# Exposes Grafana (3000), OTLP gRPC (4317) and OTLP HTTP (4318).
resource "aws_ecs_task_definition" "grafana" {
  family                   = local.name_prefix
  requires_compatibilities = ["FARGATE"]
  network_mode             = "awsvpc"
  memory                   = var.memory
  cpu                      = var.cpu

  # One IAM role serves as both the execution role and the task role.
  task_role_arn      = aws_iam_role.grafana_execution.arn
  execution_role_arn = aws_iam_role.grafana_execution.arn

  container_definitions = jsonencode([
    {
      name      = "grafana-otel-lgtm"
      image     = var.container_image
      essential = true
      cpu       = 0

      systemControls = []

      # Container health is judged by Grafana's /api/health endpoint.
      healthCheck = {
        startPeriod = 60
        interval    = 30
        timeout     = 10
        retries     = 3
        command = [
          "CMD-SHELL",
          "curl --silent --fail http://localhost:3000/api/health || exit 1"
        ]
      }

      # Container stdout/stderr goes to the module's CloudWatch log group.
      logConfiguration = {
        logDriver = "awslogs"
        options = {
          "awslogs-region"        = var.aws_region
          "awslogs-group"         = aws_cloudwatch_log_group.grafana.name
          "awslogs-stream-prefix" = "grafana"
          "awslogs-create-group"  = "true"
          "mode"                  = "non-blocking"
          "max-buffer-size"       = "25m"
        }
      }

      # NOTE: the admin password is passed as a plain environment variable,
      # so it is readable in the registered task definition.
      environment = [
        {
          name  = "GF_SECURITY_ADMIN_PASSWORD"
          value = var.grafana_admin_password
        },
        {
          name  = "GF_SECURITY_ADMIN_USER"
          value = var.grafana_admin_user
        },
        {
          name  = "GF_INSTALL_PLUGINS"
          value = ""
        },
        {
          name  = "GF_FEATURE_TOGGLES_ENABLE"
          value = "traceqlEditor"
        }
      ]

      portMappings = [
        {
          name          = "grafana-ui"
          appProtocol   = "http"
          protocol      = "tcp"
          containerPort = 3000
          hostPort      = 3000
        },
        {
          name          = "otlp-grpc"
          protocol      = "tcp"
          containerPort = 4317
          hostPort      = 4317
        },
        {
          name          = "otlp-http"
          appProtocol   = "http"
          protocol      = "tcp"
          containerPort = 4318
          hostPort      = 4318
        }
      ]
    }
  ])

  tags = merge(local.common_tags, { Name = "${local.name_prefix}-task-definition" })
}
|
||||
|
||||
# ECS Service for Grafana
#
# Runs the Grafana OTEL LGTM task on Fargate in the caller's private subnets
# and registers each task with the module's service discovery service.
resource "aws_ecs_service" "grafana" {
  name            = local.name_prefix
  cluster         = var.cluster_name
  task_definition = aws_ecs_task_definition.grafana.arn
  desired_count   = var.desired_count

  # The AWS provider documents `triggers` together with force_new_deployment;
  # without force_new_deployment the redeployment trigger below has no effect.
  force_new_deployment = true

  # Redeploy whenever the rendered container definitions change.
  triggers = {
    redeployment = sha1(jsonencode(aws_ecs_task_definition.grafana.container_definitions))
  }

  capacity_provider_strategy {
    capacity_provider = "FARGATE"
    weight            = 1
    base              = 0
  }

  platform_version = "LATEST"

  # Rolling update: start replacements before stopping healthy tasks.
  deployment_maximum_percent         = 200
  deployment_minimum_healthy_percent = 100

  # A failing deployment is stopped by the circuit breaker but not rolled back.
  deployment_circuit_breaker {
    enable   = true
    rollback = false
  }

  network_configuration {
    subnets = var.private_subnet_ids
    security_groups = concat(
      [aws_security_group.grafana.id],
      var.additional_security_group_ids
    )
    assign_public_ip = false
  }

  service_registries {
    registry_arn = aws_service_discovery_service.grafana.arn
  }

  deployment_controller {
    type = "ECS"
  }

  enable_execute_command = var.enable_execute_command

  tags = merge(local.common_tags, {
    Name = "${local.name_prefix}-service"
  })

  # desired_count is ignored after creation so that autoscaling adjustments
  # are not reverted on the next apply. This also means later changes to
  # var.desired_count are not applied to an existing service.
  lifecycle {
    ignore_changes = [desired_count]
  }
}
|
||||
|
||||
# Registers the ECS service's DesiredCount as a scalable target.
# Created only when var.enable_autoscaling is true.
resource "aws_appautoscaling_target" "grafana" {
  count = var.enable_autoscaling ? 1 : 0

  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"
  resource_id        = "service/${var.cluster_name}/${aws_ecs_service.grafana.name}"

  # Task-count bounds for scaling.
  min_capacity = var.min_capacity
  max_capacity = var.max_capacity

  tags = local.common_tags

  depends_on = [aws_ecs_service.grafana]
}
|
||||
|
||||
# Target-tracking policy on the service's average CPU utilization.
# Created only when var.enable_autoscaling is true; all identifying attributes
# are taken from the scalable target above.
resource "aws_appautoscaling_policy" "grafana_scale_up" {
  count = var.enable_autoscaling ? 1 : 0

  name        = "${local.name_prefix}-scale-up"
  policy_type = "TargetTrackingScaling"

  service_namespace  = aws_appautoscaling_target.grafana[0].service_namespace
  scalable_dimension = aws_appautoscaling_target.grafana[0].scalable_dimension
  resource_id        = aws_appautoscaling_target.grafana[0].resource_id

  target_tracking_scaling_policy_configuration {
    predefined_metric_specification {
      predefined_metric_type = "ECSServiceAverageCPUUtilization"
    }

    target_value = var.cpu_target_value

    # Cooldowns in seconds.
    scale_in_cooldown  = 300 # 5 minutes
    scale_out_cooldown = 600 # 10 minutes
  }
}
|
||||
114
iac/modules/grafana-otel/outputs.tf
Normal file
114
iac/modules/grafana-otel/outputs.tf
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
# Service Information

# Name of the ECS service created by this module.
output "service_name" {
  value       = aws_ecs_service.grafana.name
  description = "Name of the Grafana ECS service"
}

# Exposes the `id` attribute of the ECS service resource.
output "service_arn" {
  value       = aws_ecs_service.grafana.id
  description = "ARN of the Grafana ECS service"
}

# ARN of the registered task definition.
output "task_definition_arn" {
  value       = aws_ecs_task_definition.grafana.arn
  description = "ARN of the Grafana task definition"
}
|
||||
|
||||
# Access Information
#
# Every URL below is built as <service_name>.<namespace_name>:<port>. The
# namespace name comes from the namespace created by this module, or from
# var.service_discovery_namespace_name when an existing namespace ID was given.

output "grafana_dashboard_url" {
  description = "Grafana dashboard URL (accessible from allowed CIDR blocks)"
  value = format(
    "http://%s.%s:3000",
    var.service_name,
    var.service_discovery_namespace_id == "" ? aws_service_discovery_private_dns_namespace.grafana[0].name : var.service_discovery_namespace_name
  )
}

# Marked sensitive because it carries the admin password.
output "grafana_admin_credentials" {
  description = "Grafana admin login credentials"
  sensitive   = true
  value = {
    password = var.grafana_admin_password
    username = var.grafana_admin_user
  }
}

output "otlp_endpoints" {
  description = "OpenTelemetry OTLP endpoints for telemetry data"
  value = {
    # OTLP over HTTP
    http = format(
      "http://%s.%s:4318",
      var.service_name,
      var.service_discovery_namespace_id == "" ? aws_service_discovery_private_dns_namespace.grafana[0].name : var.service_discovery_namespace_name
    )
    # OTLP over gRPC
    grpc = format(
      "http://%s.%s:4317",
      var.service_name,
      var.service_discovery_namespace_id == "" ? aws_service_discovery_private_dns_namespace.grafana[0].name : var.service_discovery_namespace_name
    )
  }
}
|
||||
|
||||
# Service Discovery Information

# ID of the namespace in use: the one created here, or the caller's own.
output "service_discovery_namespace_id" {
  value       = var.service_discovery_namespace_id == "" ? aws_service_discovery_private_dns_namespace.grafana[0].id : var.service_discovery_namespace_id
  description = "Service discovery namespace ID"
}

# Name of the namespace in use. For an existing namespace this simply echoes
# var.service_discovery_namespace_name.
output "service_discovery_namespace_name" {
  value       = var.service_discovery_namespace_id == "" ? aws_service_discovery_private_dns_namespace.grafana[0].name : var.service_discovery_namespace_name
  description = "Service discovery namespace name"
}

output "service_discovery_service_arn" {
  value       = aws_service_discovery_service.grafana.arn
  description = "Service discovery service ARN"
}
|
||||
|
||||
# Security Information

# ID of the security group attached to the Grafana tasks.
output "security_group_id" {
  value       = aws_security_group.grafana.id
  description = "Security group ID for Grafana tasks"
}

# ARN of the IAM role the task definition uses.
output "execution_role_arn" {
  value       = aws_iam_role.grafana_execution.arn
  description = "IAM execution role ARN for Grafana tasks"
}
|
||||
|
||||
# Monitoring Information

# Name of the log group the container logs are written to.
output "cloudwatch_log_group_name" {
  value       = aws_cloudwatch_log_group.grafana.name
  description = "CloudWatch log group name for Grafana logs"
}

# ARN of the same log group.
output "cloudwatch_log_group_arn" {
  value       = aws_cloudwatch_log_group.grafana.arn
  description = "CloudWatch log group ARN for Grafana logs"
}
|
||||
|
||||
# Setup Instructions

locals {
  # DNS name of the Grafana service: <service_name>.<namespace_name>. The
  # namespace name is taken from the namespace created by this module, or from
  # var.service_discovery_namespace_name when an existing namespace ID is used.
  grafana_setup_host = format(
    "%s.%s",
    var.service_name,
    var.service_discovery_namespace_id != "" ? var.service_discovery_namespace_name : aws_service_discovery_private_dns_namespace.grafana[0].name
  )
}

# Human-readable post-deployment guide.
#
# The admin password is deliberately NOT interpolated here:
# var.grafana_admin_password is marked sensitive, and Terraform rejects a
# non-sensitive output that refers to a sensitive value ("Output refers to
# sensitive values"). The password remains available through the sensitive
# grafana_admin_credentials output.
output "setup_instructions" {
  description = "Instructions for accessing and configuring Grafana monitoring"
  value       = <<-EOT

    === GRAFANA OTEL MONITORING SETUP ===

    1. VERIFICATION COMMANDS (run from within VPC):
       nslookup ${local.grafana_setup_host}
       curl http://${local.grafana_setup_host}:3000

    2. GRAFANA ACCESS:
       URL: http://${local.grafana_setup_host}:3000
       Username: ${var.grafana_admin_user}
       Password: (hidden) read the sensitive "grafana_admin_credentials" output

    3. OPENTELEMETRY ENDPOINTS:
       - OTLP gRPC: http://${local.grafana_setup_host}:4317
       - OTLP HTTP: http://${local.grafana_setup_host}:4318

    4. MONITORING DATA SOURCES:
       - Prometheus: Pre-configured for metrics
       - Tempo: Pre-configured for distributed traces
       - Loki: Pre-configured for logs aggregation

    5. TROUBLESHOOTING:
       - Check ECS service status: aws ecs describe-services --cluster ${var.cluster_name} --services ${local.name_prefix}
       - View Grafana logs: aws logs tail ${aws_cloudwatch_log_group.grafana.name} --follow
       - Test connectivity from application security groups

    6. INTEGRATION WITH APPLICATIONS:
       To send telemetry data to this Grafana instance, configure your applications with:
       - OTEL_EXPORTER_OTLP_ENDPOINT: http://${local.grafana_setup_host}:4317
       - Ensure your application security groups are added to otlp_sources_security_group_ids

  EOT
}
|
||||
156
iac/modules/grafana-otel/variables.tf
Normal file
156
iac/modules/grafana-otel/variables.tf
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
# Core Infrastructure Variables

# Used as the awslogs-region option in the container's log configuration.
variable "aws_region" {
  type        = string
  default     = "us-east-1"
  description = "AWS region for deployment"
}

# Written to the Environment tag on every resource.
variable "environment" {
  type        = string
  default     = "production"
  description = "Environment name (e.g., production, staging, dev)"
}

# Base for resource names, the task family and the log group path.
variable "name_prefix" {
  type        = string
  default     = "grafana-otel"
  description = "Prefix for all resource names"
}
|
||||
|
||||
# Network Configuration

# Required: VPC for the security group and the private DNS namespace.
variable "vpc_id" {
  type        = string
  description = "VPC ID where Grafana will be deployed"
}

# Required: subnets the ECS service places its tasks in.
variable "private_subnet_ids" {
  type        = list(string)
  description = "Private subnet IDs for Grafana ECS tasks"
}

# Defaults to the three RFC 1918 private address ranges.
variable "allowed_cidr_blocks" {
  type        = list(string)
  default     = ["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"]
  description = "CIDR blocks allowed to access Grafana UI (port 3000)"
}
|
||||
|
||||
# ECS Configuration

# Required: also used to build the autoscaling resource ID
# ("service/<cluster_name>/<service>").
variable "cluster_name" {
  type        = string
  description = "ECS cluster name where Grafana will be deployed"
}

# NOTE: the default uses the mutable "latest" tag; pin a version for
# reproducible deployments.
variable "container_image" {
  type        = string
  default     = "grafana/otel-lgtm:latest"
  description = "Grafana OTEL LGTM container image"
}

# Task-level CPU units.
variable "cpu" {
  type        = number
  default     = 1024
  description = "CPU units for Grafana task"
}

# Task-level memory.
variable "memory" {
  type        = number
  default     = 2048
  description = "Memory (MB) for Grafana task"
}

# Initial task count; the service ignores later changes to desired_count.
variable "desired_count" {
  type        = number
  default     = 1
  description = "Desired number of Grafana tasks"
}
|
||||
|
||||
# Grafana Configuration

# Passed to the container as GF_SECURITY_ADMIN_USER.
variable "grafana_admin_user" {
  type        = string
  default     = "admin"
  description = "Grafana admin username"
}

# Passed to the container as GF_SECURITY_ADMIN_PASSWORD.
# SECURITY: the default below is committed to source control; always override
# it for any real deployment.
variable "grafana_admin_password" {
  type        = string
  sensitive   = true
  default     = "openwebui_monitoring_2024"
  description = "Grafana admin password"
}
|
||||
|
||||
# Service Discovery Configuration

# Leave empty to have the module create its own private DNS namespace.
variable "service_discovery_namespace_id" {
  type        = string
  default     = ""
  description = "Service discovery namespace ID (if using existing namespace)"
}

# Name for the created namespace. When an existing namespace ID is supplied,
# this value is still used to build the output URLs, so it should match that
# namespace's name.
variable "service_discovery_namespace_name" {
  type        = string
  default     = "grafana-monitoring"
  description = "Service discovery namespace name (creates new if namespace_id not provided)"
}

# Host part of the DNS name: <service_name>.<namespace_name>.
variable "service_name" {
  type        = string
  default     = "otel-monitor"
  description = "Service discovery service name"
}
|
||||
|
||||
# Monitoring Configuration

# Retention applied to the module's CloudWatch log group.
variable "log_retention_days" {
  type        = number
  default     = 7
  description = "CloudWatch log retention in days"
}

# Gates creation of the autoscaling target and policy.
variable "enable_autoscaling" {
  type        = bool
  default     = true
  description = "Enable ECS autoscaling for Grafana"
}

# Upper bound of the scalable target.
variable "max_capacity" {
  type        = number
  default     = 2
  description = "Maximum number of tasks for autoscaling"
}

# Lower bound of the scalable target.
variable "min_capacity" {
  type        = number
  default     = 1
  description = "Minimum number of tasks for autoscaling"
}

# Target value for the ECSServiceAverageCPUUtilization metric.
variable "cpu_target_value" {
  type        = number
  default     = 80.0
  description = "Target CPU utilization for autoscaling"
}
|
||||
|
||||
# Security Configuration

# Appended after the module's own security group on the service.
variable "additional_security_group_ids" {
  type        = list(string)
  default     = []
  description = "Additional security group IDs to attach to Grafana tasks"
}

# Forwarded to enable_execute_command on the ECS service. Enabled by default.
variable "enable_execute_command" {
  type        = bool
  default     = true
  description = "Enable ECS execute command for debugging"
}
|
||||
|
||||
# OpenTelemetry Configuration

# Source security groups for the 4317/4318 ingress rules. With an empty list
# no OTLP ingress rule is created.
variable "otlp_sources_security_group_ids" {
  type        = list(string)
  default     = []
  description = "Security group IDs that should be allowed to send OTLP data to Grafana"
}

# Tags

# Merged over the module's built-in Environment/Module/ManagedBy tags.
variable "tags" {
  type        = map(string)
  default     = {}
  description = "Additional tags for all resources"
}
|
||||
10
iac/modules/grafana-otel/versions.tf
Normal file
10
iac/modules/grafana-otel/versions.tf
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
# Toolchain constraints for the grafana-otel module: minimum Terraform CLI
# version and minimum AWS provider version.
terraform {
  required_providers {
    aws = {
      version = ">= 5.0"
      source  = "hashicorp/aws"
    }
  }

  required_version = ">= 1.0"
}
|
||||
Loading…
Reference in a new issue