mirror of
https://github.com/open-webui/open-webui.git
synced 2025-12-14 13:25:20 +00:00
feat(infra): add common module Grafana
This commit is contained in:
parent
b5ec91f9d9
commit
e917f7eb67
5 changed files with 858 additions and 0 deletions
231
iac/modules/grafana-otel/README.md
Normal file
231
iac/modules/grafana-otel/README.md
Normal file
|
|
@ -0,0 +1,231 @@
|
||||||
|
# Grafana OTEL Module
|
||||||
|
|
||||||
|
This Terraform module deploys a standalone Grafana OTEL LGTM (Loki, Grafana, Tempo, Mimir) stack on AWS ECS Fargate for OpenTelemetry monitoring and observability.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **Complete OTEL Stack**: Grafana + Prometheus + Tempo + Loki in a single container
|
||||||
|
- **ECS Fargate Deployment**: Serverless, scalable container deployment
|
||||||
|
- **Service Discovery**: Automatic DNS registration for easy service connectivity
|
||||||
|
- **Security**: Configurable security groups and network access controls
|
||||||
|
- **Auto Scaling**: Optional ECS autoscaling based on CPU utilization
|
||||||
|
- **CloudWatch Integration**: Structured logging with configurable retention
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐
|
||||||
|
│ Applications │───▶│ OTLP Endpoints │───▶│ Grafana UI │
|
||||||
|
│ │ │ (4317/4318) │ │ (3000) │
|
||||||
|
└─────────────────┘ └──────────────────┘ └─────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌──────────────────┐
|
||||||
|
│ ECS Fargate │
|
||||||
|
│ - Grafana │
|
||||||
|
│ - Prometheus │
|
||||||
|
│ - Tempo │
|
||||||
|
│ - Loki │
|
||||||
|
└──────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Basic Usage
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
module "grafana_monitoring" {
|
||||||
|
source = "./modules/grafana-otel"
|
||||||
|
|
||||||
|
# Core Infrastructure
|
||||||
|
vpc_id = "vpc-12345678"
|
||||||
|
private_subnet_ids = ["subnet-12345678", "subnet-87654321"]
|
||||||
|
cluster_name = "my-ecs-cluster"
|
||||||
|
|
||||||
|
# Network Access
|
||||||
|
allowed_cidr_blocks = ["10.0.0.0/8", "192.168.0.0/16"]
|
||||||
|
|
||||||
|
# Optional: OpenTelemetry Sources
|
||||||
|
otlp_sources_security_group_ids = ["sg-app1", "sg-app2"]
|
||||||
|
|
||||||
|
tags = {
|
||||||
|
Environment = "production"
|
||||||
|
Project = "monitoring"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Advanced Usage with Existing Service Discovery
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
module "grafana_monitoring" {
|
||||||
|
source = "./modules/grafana-otel"
|
||||||
|
|
||||||
|
# Core Infrastructure
|
||||||
|
vpc_id = "vpc-12345678"
|
||||||
|
private_subnet_ids = ["subnet-12345678", "subnet-87654321"]
|
||||||
|
cluster_name = "my-ecs-cluster"
|
||||||
|
|
||||||
|
# Use existing service discovery namespace
|
||||||
|
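service_discovery_namespace_id   = "ns-12345678"
service_discovery_namespace_name = "my-namespace" # must be the name of that existing namespace; the module uses it to build the output URLs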
|
||||||
|
service_name = "monitoring"
|
||||||
|
|
||||||
|
# Custom configuration
|
||||||
|
environment = "staging"
|
||||||
|
cpu = 2048
|
||||||
|
memory = 4096
|
||||||
|
desired_count = 2
|
||||||
|
enable_autoscaling = true
|
||||||
|
max_capacity = 3
|
||||||
|
|
||||||
|
# Custom Grafana credentials
|
||||||
|
grafana_admin_user = "monitoring-admin"
|
||||||
|
grafana_admin_password = var.grafana_admin_password # supply via TF_VAR_grafana_admin_password or a secrets store; do not hard-code
|
||||||
|
|
||||||
|
tags = {
|
||||||
|
Environment = "staging"
|
||||||
|
Project = "monitoring"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
| Name | Version |
|
||||||
|
|------|---------|
|
||||||
|
| terraform | >= 1.0 |
|
||||||
|
| aws | >= 5.0 |
|
||||||
|
|
||||||
|
## Providers
|
||||||
|
|
||||||
|
| Name | Version |
|
||||||
|
|------|---------|
|
||||||
|
| aws | >= 5.0 |
|
||||||
|
|
||||||
|
## Resources Created
|
||||||
|
|
||||||
|
- **ECS Service & Task Definition**: Fargate-based Grafana OTEL LGTM container
|
||||||
|
- **Service Discovery**: DNS service registration for easy connectivity
|
||||||
|
- **Security Groups**: Network access controls for Grafana UI and OTLP endpoints
|
||||||
|
- **IAM Roles**: Execution role with necessary permissions
|
||||||
|
- **CloudWatch Log Group**: Centralized logging with configurable retention
|
||||||
|
- **Auto Scaling** (optional): CPU-based scaling for high availability
|
||||||
|
|
||||||
|
## Inputs
|
||||||
|
|
||||||
|
| Name | Description | Type | Default | Required |
|
||||||
|
|------|-------------|------|---------|:--------:|
|
||||||
|
| vpc_id | VPC ID where Grafana will be deployed | `string` | n/a | yes |
|
||||||
|
| private_subnet_ids | Private subnet IDs for Grafana ECS tasks | `list(string)` | n/a | yes |
|
||||||
|
| cluster_name | ECS cluster name where Grafana will be deployed | `string` | n/a | yes |
|
||||||
|
| aws_region | AWS region for deployment | `string` | `"us-east-1"` | no |
|
||||||
|
| allowed_cidr_blocks | CIDR blocks allowed to access Grafana UI | `list(string)` | `["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"]` | no |
|
||||||
|
| otlp_sources_security_group_ids | Security group IDs that should be allowed to send OTLP data | `list(string)` | `[]` | no |
|
||||||
|
| grafana_admin_user | Grafana admin username | `string` | `"admin"` | no |
|
||||||
|
| grafana_admin_password | Grafana admin password | `string` | `"openwebui_monitoring_2024"` | no |
|
||||||
|
| cpu | CPU units for Grafana task | `number` | `1024` | no |
|
||||||
|
| memory | Memory (MB) for Grafana task | `number` | `2048` | no |
|
||||||
|
| enable_autoscaling | Enable ECS autoscaling for Grafana | `bool` | `true` | no |
|
||||||
|
|
||||||
|
See [variables.tf](./variables.tf) for complete list of inputs.
|
||||||
|
|
||||||
|
## Outputs
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| grafana_dashboard_url | Grafana dashboard URL |
|
||||||
|
| grafana_admin_credentials | Grafana admin login credentials (sensitive) |
|
||||||
|
| otlp_endpoints | OpenTelemetry OTLP endpoints (gRPC and HTTP) |
|
||||||
|
| security_group_id | Security group ID for Grafana tasks |
|
||||||
|
| setup_instructions | Complete setup and integration instructions |
|
||||||
|
|
||||||
|
See [outputs.tf](./outputs.tf) for complete list of outputs.
|
||||||
|
|
||||||
|
## Integration with Applications
|
||||||
|
|
||||||
|
To send telemetry data from your applications to this Grafana instance:
|
||||||
|
|
||||||
|
### 1. Add Application Security Groups
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
module "grafana_monitoring" {
|
||||||
|
source = "./modules/grafana-otel"
|
||||||
|
# ... other configuration
|
||||||
|
|
||||||
|
otlp_sources_security_group_ids = [
|
||||||
|
aws_security_group.my_app.id,
|
||||||
|
aws_security_group.another_app.id
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Configure Application Environment Variables
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# In your application environment
|
||||||
|
OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-monitor.my-namespace:4317
|
||||||
|
OTEL_EXPORTER_OTLP_INSECURE=true
|
||||||
|
OTEL_SERVICE_NAME=my-application
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Verify Integration
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check service discovery
|
||||||
|
nslookup otel-monitor.my-namespace
|
||||||
|
|
||||||
|
# Test OTLP endpoint
|
||||||
|
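curl -i -X POST -H 'Content-Type: application/json' -d '{}' http://otel-monitor.my-namespace:4318/v1/traces  # OTLP/HTTP; port 4317 speaks gRPC and cannot be probed with plain curl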
|
||||||
|
|
||||||
|
# Access Grafana UI
|
||||||
|
curl http://otel-monitor.my-namespace:3000
|
||||||
|
```
|
||||||
|
|
||||||
|
## Monitoring and Troubleshooting
|
||||||
|
|
||||||
|
### Access Grafana Dashboard
|
||||||
|
|
||||||
|
1. Connect to your VPC (via VPN or bastion host)
|
||||||
|
2. Navigate to the Grafana URL from module outputs
|
||||||
|
3. Login with the admin credentials
|
||||||
|
4. Explore pre-configured data sources:
|
||||||
|
- **Prometheus**: Metrics and monitoring
|
||||||
|
- **Tempo**: Distributed tracing
|
||||||
|
- **Loki**: Log aggregation
|
||||||
|
|
||||||
|
### Common Issues
|
||||||
|
|
||||||
|
- **Connection refused**: Check security group rules and CIDR blocks
|
||||||
|
- **Service not starting**: Check CloudWatch logs and ECS service events
|
||||||
|
- **No telemetry data**: Verify OTLP source security groups and endpoints
|
||||||
|
|
||||||
|
### Useful Commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check ECS service status
|
||||||
|
aws ecs describe-services --cluster my-cluster --services grafana-otel
|
||||||
|
|
||||||
|
# View logs
|
||||||
|
aws logs tail /ecs/grafana-otel --follow
|
||||||
|
|
||||||
|
# Check service discovery
|
||||||
|
aws servicediscovery list-services --filters Name=NAMESPACE_ID,Values=ns-12345678
|
||||||
|
```
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
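- Grafana admin password is configurable, but it is stored in Terraform state and passed to the container as a plaintext environment variable (`GF_SECURITY_ADMIN_PASSWORD`)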
|
||||||
|
- Consider using AWS Secrets Manager for production passwords
|
||||||
|
- Network access is controlled via security groups and CIDR blocks
|
||||||
|
- ECS tasks run with least privilege IAM permissions
|
||||||
|
|
||||||
|
## Cost Optimization
|
||||||
|
|
||||||
|
- Default configuration uses 1 vCPU and 2GB RAM (estimated $35-50/month)
|
||||||
|
- Enable autoscaling to handle traffic spikes efficiently
|
||||||
|
- Adjust log retention period to control CloudWatch costs
|
||||||
|
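- Consider using Fargate Spot for non-production environments (EC2 Spot instances do not apply to Fargate tasks; this requires changing the service's capacity provider strategy)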
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
This module is part of the OpenWebUI infrastructure project.
|
||||||
347
iac/modules/grafana-otel/main.tf
Normal file
347
iac/modules/grafana-otel/main.tf
Normal file
|
|
@ -0,0 +1,347 @@
|
||||||
|
# Local values for consistent naming
|
||||||
|
locals {
  # Prefix used when naming every resource in this module.
  name_prefix = var.name_prefix

  # Module defaults first, caller-supplied tags second, so that var.tags wins
  # when both define the same key.
  common_tags = merge(
    {
      ManagedBy   = "terraform"
      Module      = "grafana-otel"
      Environment = var.environment
    },
    var.tags,
  )
}
|
||||||
|
|
||||||
|
# Service Discovery Namespace (create if not provided)
|
||||||
|
resource "aws_service_discovery_private_dns_namespace" "grafana" {
  # Skip creation when the caller supplies the ID of an existing namespace.
  count = var.service_discovery_namespace_id != "" ? 0 : 1

  name        = var.service_discovery_namespace_name
  description = "Service discovery namespace for Grafana OTEL monitoring"
  vpc         = var.vpc_id

  tags = merge(
    local.common_tags,
    { Name = "${local.name_prefix}-namespace" },
  )
}
|
||||||
|
|
||||||
|
|
||||||
|
# Service Discovery Service for Grafana
|
||||||
|
resource "aws_service_discovery_service" "grafana" {
  name        = var.service_name
  description = "Grafana OTEL LGTM monitoring stack service discovery"

  dns_config {
    # Register in the namespace created by this module unless the caller
    # passed the ID of an existing one.
    namespace_id = (
      var.service_discovery_namespace_id == ""
      ? aws_service_discovery_private_dns_namespace.grafana[0].id
      : var.service_discovery_namespace_id
    )

    routing_policy = "MULTIVALUE"

    dns_records {
      type = "A"
      ttl  = 60
    }
  }

  health_check_custom_config {
    failure_threshold = 1
  }

  tags = merge(
    local.common_tags,
    { Name = "${local.name_prefix}-service-discovery" },
  )
}
|
||||||
|
|
||||||
|
# CloudWatch Log Group for Grafana
|
||||||
|
resource "aws_cloudwatch_log_group" "grafana" {
  # Log group the task definition's awslogs driver writes to.
  name              = "/ecs/${local.name_prefix}"
  retention_in_days = var.log_retention_days

  tags = merge(
    local.common_tags,
    { Name = "${local.name_prefix}-logs" },
  )
}
|
||||||
|
|
||||||
|
# Security Group for Grafana ECS Tasks
|
||||||
|
resource "aws_security_group" "grafana" {
  name_prefix = "${local.name_prefix}-"
  description = "Security group for Grafana OTEL ECS tasks"
  vpc_id      = var.vpc_id

  # Grafana UI (port 3000) is reachable from the configured CIDR blocks.
  ingress {
    description = "Grafana UI access"
    from_port   = 3000
    to_port     = 3000
    protocol    = "tcp"
    cidr_blocks = var.allowed_cidr_blocks
  }

  # OTLP gRPC (4317) and OTLP HTTP (4318) are opened only to the listed
  # source security groups; no rule is created when the list is empty.
  dynamic "ingress" {
    for_each = {
      for proto, port in { gRPC = 4317, HTTP = 4318 } :
      proto => port
      if length(var.otlp_sources_security_group_ids) > 0
    }

    content {
      description     = "OTLP ${ingress.key} from sources"
      from_port       = ingress.value
      to_port         = ingress.value
      protocol        = "tcp"
      security_groups = var.otlp_sources_security_group_ids
    }
  }

  # Allow all outbound traffic.
  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = merge(
    local.common_tags,
    { Name = "${local.name_prefix}-security-group" },
  )

  # name_prefix yields a unique name per instance, so a replacement group can
  # be created and attached before the old one is destroyed. Without this, a
  # replacement would try to delete a group that is still in use by the
  # service's network interfaces.
  lifecycle {
    create_before_destroy = true
  }
}
|
||||||
|
|
||||||
|
# IAM Role for ECS Task Execution
|
||||||
|
resource "aws_iam_role" "grafana_execution" {
  name = "${local.name_prefix}-execution-role"

  # Trust policy: only the ECS tasks service principal may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect    = "Allow"
        Action    = "sts:AssumeRole"
        Principal = { Service = "ecs-tasks.amazonaws.com" }
      },
    ]
  })

  tags = merge(
    local.common_tags,
    { Name = "${local.name_prefix}-execution-role" },
  )
}
|
||||||
|
|
||||||
|
# Attach AWS managed ECS execution policy
|
||||||
|
resource "aws_iam_role_policy_attachment" "grafana_execution_policy" {
  # AWS-managed policy for ECS task execution roles.
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
  role       = aws_iam_role.grafana_execution.name
}
|
||||||
|
|
||||||
|
# Additional policy for CloudWatch logs
|
||||||
|
resource "aws_iam_role_policy" "grafana_logs_policy" {
  name = "${local.name_prefix}-logs-policy"
  role = aws_iam_role.grafana_execution.id

  # Grants log-stream creation and event delivery, scoped to the module's own
  # log group.
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents"
        ]
        Resource = "${aws_cloudwatch_log_group.grafana.arn}:*"
      }
    ]
  })
}

# ECS Exec permissions. The task definition uses this same role as its task
# role, and ECS Exec needs the task role to be able to open SSM message
# channels; without these actions `aws ecs execute-command` cannot connect
# even though enable_execute_command is true on the service.
resource "aws_iam_role_policy" "grafana_exec_policy" {
  count = var.enable_execute_command ? 1 : 0

  name = "${local.name_prefix}-exec-policy"
  role = aws_iam_role.grafana_execution.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = [
          "ssmmessages:CreateControlChannel",
          "ssmmessages:CreateDataChannel",
          "ssmmessages:OpenControlChannel",
          "ssmmessages:OpenDataChannel"
        ]
        Resource = "*"
      }
    ]
  })
}
|
||||||
|
|
||||||
|
# ECS Task Definition for Grafana OTEL LGTM
|
||||||
|
resource "aws_ecs_task_definition" "grafana" {
  family                   = local.name_prefix
  requires_compatibilities = ["FARGATE"]
  network_mode             = "awsvpc"
  cpu                      = var.cpu
  memory                   = var.memory

  # NOTE(review): one IAM role is used as both the execution role and the
  # task role, so the container holds the same permissions as the ECS agent.
  execution_role_arn = aws_iam_role.grafana_execution.arn
  task_role_arn      = aws_iam_role.grafana_execution.arn

  container_definitions = jsonencode([
    {
      name           = "grafana-otel-lgtm"
      image          = var.container_image
      essential      = true
      cpu            = 0
      systemControls = []

      # Grafana UI plus the two OTLP receivers (gRPC and HTTP).
      portMappings = [
        {
          name          = "grafana-ui"
          containerPort = 3000
          hostPort      = 3000
          protocol      = "tcp"
          appProtocol   = "http"
        },
        {
          name          = "otlp-grpc"
          containerPort = 4317
          hostPort      = 4317
          protocol      = "tcp"
        },
        {
          name          = "otlp-http"
          containerPort = 4318
          hostPort      = 4318
          protocol      = "tcp"
          appProtocol   = "http"
        }
      ]

      # NOTE(review): the admin password is injected as a plain environment
      # variable, so it is readable in the task definition and in state.
      environment = [
        {
          name  = "GF_SECURITY_ADMIN_PASSWORD"
          value = var.grafana_admin_password
        },
        {
          name  = "GF_SECURITY_ADMIN_USER"
          value = var.grafana_admin_user
        },
        {
          name  = "GF_INSTALL_PLUGINS"
          value = ""
        },
        {
          name  = "GF_FEATURE_TOGGLES_ENABLE"
          value = "traceqlEditor"
        }
      ]

      logConfiguration = {
        logDriver = "awslogs"
        options = {
          "awslogs-group"         = aws_cloudwatch_log_group.grafana.name
          "awslogs-region"        = var.aws_region
          "awslogs-stream-prefix" = "grafana"
          "awslogs-create-group"  = "true"
          "mode"                  = "non-blocking"
          "max-buffer-size"       = "25m"
        }
      }

      # Container is considered healthy once Grafana's health API answers.
      healthCheck = {
        command = [
          "CMD-SHELL",
          "curl --silent --fail http://localhost:3000/api/health || exit 1"
        ]
        startPeriod = 60
        interval    = 30
        timeout     = 10
        retries     = 3
      }
    }
  ])

  tags = merge(
    local.common_tags,
    { Name = "${local.name_prefix}-task-definition" },
  )
}
|
||||||
|
|
||||||
|
# ECS Service for Grafana
|
||||||
|
resource "aws_ecs_service" "grafana" {
  name            = local.name_prefix
  cluster         = var.cluster_name
  task_definition = aws_ecs_task_definition.grafana.arn
  desired_count   = var.desired_count

  # The provider only acts on `triggers` when force_new_deployment is set;
  # without it the redeployment hash below has no effect.
  force_new_deployment = true

  # Redeploy whenever the rendered container definitions change.
  triggers = {
    redeployment = sha1(jsonencode(aws_ecs_task_definition.grafana.container_definitions))
  }

  capacity_provider_strategy {
    capacity_provider = "FARGATE"
    weight            = 1
    base              = 0
  }

  platform_version = "LATEST"

  deployment_maximum_percent         = 200
  deployment_minimum_healthy_percent = 100

  # Failed deployments are stopped by the circuit breaker but not rolled back.
  deployment_circuit_breaker {
    enable   = true
    rollback = false
  }

  network_configuration {
    subnets = var.private_subnet_ids
    security_groups = concat(
      [aws_security_group.grafana.id],
      var.additional_security_group_ids
    )
    assign_public_ip = false
  }

  service_registries {
    registry_arn = aws_service_discovery_service.grafana.arn
  }

  deployment_controller {
    type = "ECS"
  }

  enable_execute_command = var.enable_execute_command

  tags = merge(local.common_tags, {
    Name = "${local.name_prefix}-service"
  })

  # desired_count is only an initial value; later changes are ignored so that
  # Terraform does not undo adjustments made outside of it (the module can
  # attach an autoscaling target to this service).
  lifecycle {
    ignore_changes = [desired_count]
  }
}
|
||||||
|
|
||||||
|
# Auto Scaling Target (if enabled)
|
||||||
|
resource "aws_appautoscaling_target" "grafana" {
  # Only created when autoscaling is switched on.
  count = var.enable_autoscaling ? 1 : 0

  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"
  resource_id        = "service/${var.cluster_name}/${aws_ecs_service.grafana.name}"

  min_capacity = var.min_capacity
  max_capacity = var.max_capacity

  tags = local.common_tags

  depends_on = [aws_ecs_service.grafana]
}
|
||||||
|
|
||||||
|
# Auto Scaling Policy (if enabled)
|
||||||
|
resource "aws_appautoscaling_policy" "grafana_scale_up" {
  # Only created when autoscaling is switched on.
  count = var.enable_autoscaling ? 1 : 0

  name        = "${local.name_prefix}-scale-up"
  policy_type = "TargetTrackingScaling"

  # Bind to the scalable target declared by this module.
  service_namespace  = aws_appautoscaling_target.grafana[0].service_namespace
  scalable_dimension = aws_appautoscaling_target.grafana[0].scalable_dimension
  resource_id        = aws_appautoscaling_target.grafana[0].resource_id

  target_tracking_scaling_policy_configuration {
    # Track the service's average CPU utilization against the target value.
    predefined_metric_specification {
      predefined_metric_type = "ECSServiceAverageCPUUtilization"
    }

    target_value       = var.cpu_target_value
    scale_in_cooldown  = 300 # 5 minutes
    scale_out_cooldown = 600 # 10 minutes
  }
}
|
||||||
114
iac/modules/grafana-otel/outputs.tf
Normal file
114
iac/modules/grafana-otel/outputs.tf
Normal file
|
|
@ -0,0 +1,114 @@
|
||||||
|
# Service Information
|
||||||
|
# Name of the ECS service running the Grafana OTEL stack.
output "service_name" {
  value       = aws_ecs_service.grafana.name
  description = "Name of the Grafana ECS service"
}
|
||||||
|
|
||||||
|
# Exposes the ECS service's `id` attribute under the name service_arn.
output "service_arn" {
  value       = aws_ecs_service.grafana.id
  description = "ARN of the Grafana ECS service"
}
|
||||||
|
|
||||||
|
# ARN of the task definition registered by this module.
output "task_definition_arn" {
  value       = aws_ecs_task_definition.grafana.arn
  description = "ARN of the Grafana task definition"
}
|
||||||
|
|
||||||
|
# Access Information
|
||||||
|
# Grafana UI URL built from the service-discovery DNS name.
# NOTE(review): when an existing namespace ID is supplied, the host is built
# from var.service_discovery_namespace_name, so the caller must pass the
# matching name for this URL to be correct.
output "grafana_dashboard_url" {
  description = "Grafana dashboard URL (accessible from allowed CIDR blocks)"
  value = format(
    "http://%s.%s:3000",
    var.service_name,
    var.service_discovery_namespace_id == "" ? aws_service_discovery_private_dns_namespace.grafana[0].name : var.service_discovery_namespace_name,
  )
}
|
||||||
|
|
||||||
|
# Admin login pair; marked sensitive so the values are redacted in CLI output.
output "grafana_admin_credentials" {
  description = "Grafana admin login credentials"
  sensitive   = true

  value = {
    username = var.grafana_admin_user
    password = var.grafana_admin_password
  }
}
|
||||||
|
|
||||||
|
# OTLP receiver endpoints: gRPC on port 4317 and HTTP on port 4318, both on
# the service-discovery DNS name.
output "otlp_endpoints" {
  description = "OpenTelemetry OTLP endpoints for telemetry data"

  value = {
    grpc = format(
      "http://%s.%s:4317",
      var.service_name,
      var.service_discovery_namespace_id == "" ? aws_service_discovery_private_dns_namespace.grafana[0].name : var.service_discovery_namespace_name,
    )
    http = format(
      "http://%s.%s:4318",
      var.service_name,
      var.service_discovery_namespace_id == "" ? aws_service_discovery_private_dns_namespace.grafana[0].name : var.service_discovery_namespace_name,
    )
  }
}
|
||||||
|
|
||||||
|
# Service Discovery Information
|
||||||
|
# ID of the namespace in use: the one created here, or the caller's.
output "service_discovery_namespace_id" {
  description = "Service discovery namespace ID"
  value = (
    var.service_discovery_namespace_id == ""
    ? aws_service_discovery_private_dns_namespace.grafana[0].id
    : var.service_discovery_namespace_id
  )
}
|
||||||
|
|
||||||
|
# Name of the namespace in use; for a caller-supplied namespace this simply
# echoes var.service_discovery_namespace_name.
output "service_discovery_namespace_name" {
  description = "Service discovery namespace name"
  value = (
    var.service_discovery_namespace_id == ""
    ? aws_service_discovery_private_dns_namespace.grafana[0].name
    : var.service_discovery_namespace_name
  )
}
|
||||||
|
|
||||||
|
# ARN of the service-discovery service the ECS service registers with.
output "service_discovery_service_arn" {
  value       = aws_service_discovery_service.grafana.arn
  description = "Service discovery service ARN"
}
|
||||||
|
|
||||||
|
# Security Information
|
||||||
|
# ID of the security group attached to the Grafana tasks.
output "security_group_id" {
  value       = aws_security_group.grafana.id
  description = "Security group ID for Grafana tasks"
}
|
||||||
|
|
||||||
|
# ARN of the IAM role used by the task definition.
output "execution_role_arn" {
  value       = aws_iam_role.grafana_execution.arn
  description = "IAM execution role ARN for Grafana tasks"
}
|
||||||
|
|
||||||
|
# Monitoring Information
|
||||||
|
# Name of the log group receiving the container logs.
output "cloudwatch_log_group_name" {
  value       = aws_cloudwatch_log_group.grafana.name
  description = "CloudWatch log group name for Grafana logs"
}
|
||||||
|
|
||||||
|
# ARN of the log group receiving the container logs.
output "cloudwatch_log_group_arn" {
  value       = aws_cloudwatch_log_group.grafana.arn
  description = "CloudWatch log group ARN for Grafana logs"
}
|
||||||
|
|
||||||
|
# Setup Instructions
|
||||||
|
locals {
  # DNS name the service is registered under: "<service>.<namespace>". The
  # namespace name comes from the namespace created by this module, or from
  # var.service_discovery_namespace_name when an existing namespace ID is used.
  setup_grafana_host = "${var.service_name}.${var.service_discovery_namespace_id != "" ? var.service_discovery_namespace_name : aws_service_discovery_private_dns_namespace.grafana[0].name}"
}

# Human-readable setup guide.
# The admin password is deliberately NOT interpolated here: the variable is
# marked sensitive, and Terraform rejects a non-sensitive output that contains
# a sensitive value. The password stays available through the sensitive
# grafana_admin_credentials output.
output "setup_instructions" {
  description = "Instructions for accessing and configuring Grafana monitoring"
  value       = <<-EOT

    === GRAFANA OTEL MONITORING SETUP ===

    1. VERIFICATION COMMANDS (run from within VPC):
       nslookup ${local.setup_grafana_host}
       curl http://${local.setup_grafana_host}:3000

    2. GRAFANA ACCESS:
       URL: http://${local.setup_grafana_host}:3000
       Username: ${var.grafana_admin_user}
       Password: see the sensitive "grafana_admin_credentials" output

    3. OPENTELEMETRY ENDPOINTS:
       - OTLP gRPC: http://${local.setup_grafana_host}:4317
       - OTLP HTTP: http://${local.setup_grafana_host}:4318

    4. MONITORING DATA SOURCES:
       - Prometheus: Pre-configured for metrics
       - Tempo: Pre-configured for distributed traces
       - Loki: Pre-configured for logs aggregation

    5. TROUBLESHOOTING:
       - Check ECS service status: aws ecs describe-services --cluster ${var.cluster_name} --services ${local.name_prefix}
       - View Grafana logs: aws logs tail ${aws_cloudwatch_log_group.grafana.name} --follow
       - Test connectivity from application security groups

    6. INTEGRATION WITH APPLICATIONS:
       To send telemetry data to this Grafana instance, configure your applications with:
       - OTEL_EXPORTER_OTLP_ENDPOINT: http://${local.setup_grafana_host}:4317
       - Ensure your application security groups are added to otlp_sources_security_group_ids

  EOT
}
|
||||||
156
iac/modules/grafana-otel/variables.tf
Normal file
156
iac/modules/grafana-otel/variables.tf
Normal file
|
|
@ -0,0 +1,156 @@
|
||||||
|
# Core Infrastructure Variables
|
||||||
|
# Region passed to the container's awslogs log driver.
variable "aws_region" {
  type        = string
  default     = "us-east-1"
  description = "AWS region for deployment"
}
|
||||||
|
|
||||||
|
# Value of the Environment tag applied through local.common_tags.
variable "environment" {
  type        = string
  default     = "production"
  description = "Environment name (e.g., production, staging, dev)"
}
|
||||||
|
|
||||||
|
# Surfaced as local.name_prefix and used in resource names and Name tags.
variable "name_prefix" {
  type        = string
  default     = "grafana-otel"
  description = "Prefix for all resource names"
}
|
||||||
|
|
||||||
|
# Network Configuration
|
||||||
|
# Required. Used for the security group and for a newly created namespace.
variable "vpc_id" {
  type        = string
  description = "VPC ID where Grafana will be deployed"
}
|
||||||
|
|
||||||
|
# Required. Subnets passed to the ECS service's network configuration.
variable "private_subnet_ids" {
  description = "Private subnet IDs for Grafana ECS tasks"
  type        = list(string)

  # Catch an empty list at plan time instead of failing later when the ECS
  # service is created with no subnets to place tasks in.
  validation {
    condition     = length(var.private_subnet_ids) > 0
    error_message = "At least one private subnet ID must be provided."
  }
}
|
||||||
|
|
||||||
|
# Sources allowed to reach the Grafana UI; defaults to the three RFC 1918
# private ranges.
variable "allowed_cidr_blocks" {
  type        = list(string)
  default     = ["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"]
  description = "CIDR blocks allowed to access Grafana UI (port 3000)"
}
|
||||||
|
|
||||||
|
# ECS Configuration
|
||||||
|
# Required. Used as the ECS service's cluster and in the autoscaling
# resource ID.
variable "cluster_name" {
  type        = string
  description = "ECS cluster name where Grafana will be deployed"
}
|
||||||
|
|
||||||
|
# Image run by the task. The default tracks the mutable `latest` tag, so
# pass a pinned tag for reproducible deployments.
variable "container_image" {
  type        = string
  default     = "grafana/otel-lgtm:latest"
  description = "Grafana OTEL LGTM container image"
}
|
||||||
|
|
||||||
|
# Task-level CPU setting of the task definition.
variable "cpu" {
  type        = number
  default     = 1024
  description = "CPU units for Grafana task"
}
|
||||||
|
|
||||||
|
# Task-level memory setting of the task definition.
variable "memory" {
  type        = number
  default     = 2048
  description = "Memory (MB) for Grafana task"
}
|
||||||
|
|
||||||
|
# Initial task count; the ECS service ignores later changes to desired_count.
variable "desired_count" {
  type        = number
  default     = 1
  description = "Desired number of Grafana tasks"
}
|
||||||
|
|
||||||
|
# Grafana Configuration
|
||||||
|
# Passed to the container as GF_SECURITY_ADMIN_USER.
variable "grafana_admin_user" {
  type        = string
  default     = "admin"
  description = "Grafana admin username"
}
|
||||||
|
|
||||||
|
# Passed to the container as GF_SECURITY_ADMIN_PASSWORD.
# WARNING: the default is a fixed value committed to the repository; always
# override it outside of throwaway environments.
variable "grafana_admin_password" {
  type        = string
  sensitive   = true
  default     = "openwebui_monitoring_2024"
  description = "Grafana admin password"
}
|
||||||
|
|
||||||
|
# Service Discovery Configuration
|
||||||
|
# Leave empty (the default) to have the module create its own private DNS
# namespace.
variable "service_discovery_namespace_id" {
  type        = string
  default     = ""
  description = "Service discovery namespace ID (if using existing namespace)"
}
|
||||||
|
|
||||||
|
# Name of the namespace to create. When an existing namespace ID is passed,
# this value is still used to build the output URLs, so set it to that
# namespace's name.
variable "service_discovery_namespace_name" {
  type        = string
  default     = "grafana-monitoring"
  description = "Service discovery namespace name (creates new if namespace_id not provided)"
}
|
||||||
|
|
||||||
|
# Name of the service-discovery service; first label of the DNS name used in
# the output URLs.
variable "service_name" {
  type        = string
  default     = "otel-monitor"
  description = "Service discovery service name"
}
|
||||||
|
|
||||||
|
# Monitoring Configuration
|
||||||
|
# Retention applied to the module's CloudWatch log group.
variable "log_retention_days" {
  type        = number
  default     = 7
  description = "CloudWatch log retention in days"
}
|
||||||
|
|
||||||
|
# Toggles creation of the autoscaling target and its scaling policy.
variable "enable_autoscaling" {
  type        = bool
  default     = true
  description = "Enable ECS autoscaling for Grafana"
}
|
||||||
|
|
||||||
|
# Upper bound of the autoscaling target.
variable "max_capacity" {
  type        = number
  default     = 2
  description = "Maximum number of tasks for autoscaling"
}
|
||||||
|
|
||||||
|
# Lower bound of the autoscaling target.
variable "min_capacity" {
  type        = number
  default     = 1
  description = "Minimum number of tasks for autoscaling"
}
|
||||||
|
|
||||||
|
# Target value of the CPU-utilization target-tracking policy.
variable "cpu_target_value" {
  type        = number
  default     = 80.0
  description = "Target CPU utilization for autoscaling"
}
|
||||||
|
|
||||||
|
# Security Configuration
|
||||||
|
# Appended to the module's own security group on the ECS service.
variable "additional_security_group_ids" {
  type        = list(string)
  default     = []
  description = "Additional security group IDs to attach to Grafana tasks"
}
|
||||||
|
|
||||||
|
# Forwarded to the ECS service's enable_execute_command; on by default.
variable "enable_execute_command" {
  type        = bool
  default     = true
  description = "Enable ECS execute command for debugging"
}
|
||||||
|
|
||||||
|
# OpenTelemetry Configuration
|
||||||
|
# Source groups for the OTLP ingress rules (ports 4317 and 4318). With the
# default empty list no OTLP ingress rule is created.
variable "otlp_sources_security_group_ids" {
  type        = list(string)
  default     = []
  description = "Security group IDs that should be allowed to send OTLP data to Grafana"
}
|
||||||
|
|
||||||
|
# Tags
|
||||||
|
# Merged over the module's default tags in local.common_tags.
variable "tags" {
  type        = map(string)
  default     = {}
  description = "Additional tags for all resources"
}
|
||||||
10
iac/modules/grafana-otel/versions.tf
Normal file
10
iac/modules/grafana-otel/versions.tf
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
# Minimum Terraform and AWS provider versions for this module.
terraform {
  required_providers {
    aws = {
      version = ">= 5.0"
      source  = "hashicorp/aws"
    }
  }

  required_version = ">= 1.0"
}
|
||||||
Loading…
Reference in a new issue