feat(infra): set standalone Grafana service

This commit is contained in:
loitragg 2025-09-27 16:05:04 +07:00
parent e917f7eb67
commit 03c0d47b7d
No known key found for this signature in database
GPG key ID: 96292BAF3E28CFF5
6 changed files with 699 additions and 0 deletions

View file

@ -0,0 +1,25 @@
# This file is maintained automatically by "terraform init".
# Manual edits may be lost in future updates.
provider "registry.terraform.io/hashicorp/aws" {
version = "5.100.0"
constraints = ">= 5.0.0, ~> 5.0"
hashes = [
"h1:Ijt7pOlB7Tr7maGQIqtsLFbl7pSMIj06TVdkoSBcYOw=",
"zh:054b8dd49f0549c9a7cc27d159e45327b7b65cf404da5e5a20da154b90b8a644",
"zh:0b97bf8d5e03d15d83cc40b0530a1f84b459354939ba6f135a0086c20ebbe6b2",
"zh:1589a2266af699cbd5d80737a0fe02e54ec9cf2ca54e7e00ac51c7359056f274",
"zh:6330766f1d85f01ae6ea90d1b214b8b74cc8c1badc4696b165b36ddd4cc15f7b",
"zh:7c8c2e30d8e55291b86fcb64bdf6c25489d538688545eb48fd74ad622e5d3862",
"zh:99b1003bd9bd32ee323544da897148f46a527f622dc3971af63ea3e251596342",
"zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425",
"zh:9f8b909d3ec50ade83c8062290378b1ec553edef6a447c56dadc01a99f4eaa93",
"zh:aaef921ff9aabaf8b1869a86d692ebd24fbd4e12c21205034bb679b9caf883a2",
"zh:ac882313207aba00dd5a76dbd572a0ddc818bb9cbf5c9d61b28fe30efaec951e",
"zh:bb64e8aff37becab373a1a0cc1080990785304141af42ed6aa3dd4913b000421",
"zh:dfe495f6621df5540d9c92ad40b8067376350b005c637ea6efac5dc15028add4",
"zh:f0ddf0eaf052766cfe09dea8200a946519f653c384ab4336e2a4a64fdd6310e9",
"zh:f1b7e684f4c7ae1eed272b6de7d2049bb87a0275cb04dbb7cda6636f600699c9",
"zh:ff461571e3f233699bf690db319dfe46aec75e58726636a0d97dd9ac6e32fb70",
]
}

View file

@ -0,0 +1,335 @@
# Grafana OTEL Standalone Deployment
This directory contains a complete example for deploying the Grafana OTEL monitoring stack as a standalone service, independent from the main OpenWebUI infrastructure.
**Location**: This deployment example is located in `iac/grafana-standalone/` and uses the module from `iac/modules/grafana-otel/`.
## Quick Start
### 1. Prerequisites
- AWS CLI configured with appropriate permissions
- Terraform >= 1.0 installed
- Existing ECS cluster
- VPC with private subnets
- Access to S3 bucket `gg-ai-terraform-states` for state storage
### 2. Configuration
1. Copy the example variables file:
```bash
cp terraform.tfvars.example terraform.tfvars
```
2. Edit `terraform.tfvars` with your environment values:
```hcl
# Required: Update these values for your environment
vpc_id = "vpc-your-vpc-id"
private_subnet_ids = ["subnet-12345", "subnet-67890"]
cluster_name = "your-ecs-cluster"
# Optional: Customize as needed
grafana_admin_password = "your-secure-password"
allowed_cidr_blocks = ["your-vpn-cidr/24"]
```
### 3. Deploy
```bash
# Initialize Terraform with remote backend
terraform init
# Review the plan
terraform plan
# Deploy the infrastructure
terraform apply
```
**Note**: If you encounter AWS credential errors during `terraform init`, ensure your AWS CLI session is active:
```bash
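# Verify that the session is valid; if it has expired, re-authenticate first (for example `aws sso login --profile <profile>` when the profile uses IAM Identity Center)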
aws sts get-caller-identity --profile 908027381725_AdministratorAccess
```
## Remote State Backend
This deployment uses an S3 remote backend for state management with the following configuration:
```hcl
backend "s3" {
bucket = "gg-ai-terraform-states"
key = "production/grafana-monitoring/terraform.tfstate"
region = "us-east-1"
profile = "908027381725_AdministratorAccess"
dynamodb_table = "terraform-state-locks"
encrypt = true
}
```
### Key Benefits:
- **Team Collaboration**: Multiple team members can work with the same state
- **State Locking**: DynamoDB table prevents concurrent modifications
- **Encryption**: State file is encrypted at rest
- **Separate State**: Independent from main OpenWebUI infrastructure state
- **Versioning**: S3 bucket versioning enables state history and recovery
### State Path Structure:
- **Main Infrastructure**: `production/gravity-ai-chat/terraform.tfstate`
- **Grafana Monitoring**: `production/grafana-monitoring/terraform.tfstate`
This separation allows independent deployment and management of the monitoring stack.
### 4. Access Grafana
After deployment, Terraform will output the access information:
```bash
# Get the Grafana URL and credentials
terraform output grafana_dashboard_url
terraform output -json grafana_admin_credentials
# Get setup instructions
terraform output -raw setup_instructions
```
## Configuration Options
### Basic Configuration
For a simple deployment with default settings:
```hcl
# terraform.tfvars
vpc_id = "vpc-12345678"
private_subnet_ids = ["subnet-12345", "subnet-67890"]
cluster_name = "my-cluster"
```
### Production Configuration
For a production deployment with custom settings:
```hcl
# terraform.tfvars
environment = "production"
name_prefix = "prod-grafana"
# Increased resources
cpu = 2048
memory = 4096
desired_count = 2
# Autoscaling enabled
enable_autoscaling = true
max_capacity = 3
min_capacity = 2
# Longer log retention
log_retention_days = 30
# Custom Grafana credentials
grafana_admin_user = "monitoring-admin"
grafana_admin_password = "very-secure-password-123"
# Network access from specific CIDRs
allowed_cidr_blocks = [
"192.168.1.0/24", # Office network
"10.100.0.0/16", # VPN network
]
# Applications that will send telemetry
otlp_sources_security_group_ids = [
"sg-app1-security-group",
"sg-app2-security-group",
]
```
### Integration with Existing Service Discovery
If you have an existing service discovery namespace:
```hcl
# Use existing namespace
service_discovery_namespace_id = "ns-existing-12345"
service_name = "monitoring"
```
## Integration with Applications
After deploying Grafana, configure your applications to send telemetry data:
### 1. Add Application Security Groups
Update your `terraform.tfvars`:
```hcl
otlp_sources_security_group_ids = [
"sg-your-app-security-group",
]
```
Then run `terraform apply` to update the security group rules.
### 2. Configure Application Environment Variables
In your application deployment (ECS task definition, Kubernetes deployment, etc.):
```bash
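# OpenTelemetry configuration
# The hostname below is built from the default `service_name` (otel-monitor) and the default
# `service_discovery_namespace_name` (grafana-monitoring); adjust it if you override either variable.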
OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-monitor.grafana-monitoring:4317
OTEL_EXPORTER_OTLP_INSECURE=true
OTEL_SERVICE_NAME=my-application
OTEL_RESOURCE_ATTRIBUTES=service.version=1.0.0,deployment.environment=production
```
### 3. Verify Integration
```bash
# Check service discovery
nslookup otel-monitor.grafana-monitoring
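# Test OTLP endpoint connectivity (port 4317 speaks OTLP over gRPC, so a plain HTTP request
# will not return a useful response; check that the TCP port is reachable instead)
nc -vz otel-monitor.grafana-monitoring 4317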
# Access Grafana dashboard
curl http://otel-monitor.grafana-monitoring:3000
```
## Monitoring and Maintenance
### Viewing Logs
```bash
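# View Grafana container logs (the log group name is taken from the Terraform output,
# so it stays correct if `name_prefix` is changed)
aws logs tail "$(terraform output -raw cloudwatch_log_group_name)" --follow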
# Check ECS service events
aws ecs describe-services --cluster your-cluster --services grafana-otel
```
### Scaling
```bash
# Manual scaling (if autoscaling is disabled)
aws ecs update-service --cluster your-cluster --service grafana-otel --desired-count 2
# Update autoscaling settings via Terraform
# Edit terraform.tfvars and run terraform apply
```
### Updates
```bash
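# Update to latest Grafana OTEL image
# Note: this root configuration does not declare a `container_image` variable, and Terraform
# rejects `-var` values for undeclared variables. Declare the variable in variables.tf and pass
# it to the module in main.tf before running the command below.
terraform apply -var="container_image=grafana/otel-lgtm:latest"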
# Update configuration
# Edit terraform.tfvars and run terraform apply
```
## Troubleshooting
### Common Issues
1. **Service not starting**
- Check CloudWatch logs for container errors
- Verify ECS cluster has capacity
- Check security group rules
2. **Cannot access Grafana UI**
- Verify allowed_cidr_blocks includes your IP
- Check VPC connectivity (VPN, bastion host)
- Confirm service discovery is working
3. **No telemetry data**
- Verify otlp_sources_security_group_ids
- Check application OTLP endpoint configuration
- Confirm network connectivity between services
### Useful Commands
```bash
# Check service status
terraform show | grep -A 10 "aws_ecs_service"
# Verify service discovery
aws servicediscovery list-services
# Check security groups
aws ec2 describe-security-groups --group-ids $(terraform output -raw security_group_id)
# View all outputs
terraform output
```
## Cleanup
To remove all resources:
```bash
terraform destroy
```
## State Management Commands
### Working with Remote State
```bash
# Initialize with remote backend (first time setup)
terraform init
# Migrate from local to remote state (if you have existing local state)
terraform init -migrate-state
# View remote state
terraform show
# List resources in state
terraform state list
# Pull remote state to local (for inspection)
terraform state pull > current-state.json
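# Confirm that the DynamoDB lock table exists and is active (this shows the table itself, not whether a lock is currently held)
aws dynamodb describe-table --table-name terraform-state-locks --profile 908027381725_AdministratorAccess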
```
### State Recovery and Backup
```bash
# Download current state from S3
aws s3 cp s3://gg-ai-terraform-states/production/grafana-monitoring/terraform.tfstate ./backup-state.tfstate --profile 908027381725_AdministratorAccess
# List state versions (if bucket versioning is enabled)
aws s3api list-object-versions --bucket gg-ai-terraform-states --prefix production/grafana-monitoring/terraform.tfstate --profile 908027381725_AdministratorAccess
# Force unlock state (if locked and lock is stale)
terraform force-unlock LOCK_ID
```
## Security Considerations
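- Keep sensitive variables (passwords) out of version control: supply them through environment variables (for example `TF_VAR_grafana_admin_password`) or AWS Secrets Manager instead of committing them in `terraform.tfvars`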
- Restrict `allowed_cidr_blocks` to minimum required networks
- Use strong passwords for Grafana admin account
- Regularly update the Grafana OTEL container image
- Monitor CloudWatch logs for security events
## Cost Estimation
Default configuration (1 task, 1 vCPU, 2GB RAM):
- ECS Fargate: ~$35-50/month
- CloudWatch Logs: ~$1-5/month (depending on log volume)
- Service Discovery: ~$0.50/month
Total estimated cost: ~$40-60/month
## Support
For issues or questions:
1. Check the module documentation: `../modules/grafana-otel/README.md`
2. Review Terraform and AWS documentation
3. Check CloudWatch logs for detailed error messages

View file

@ -0,0 +1,74 @@
# Terraform settings: minimum CLI version, provider requirements, and remote state backend
terraform {
  required_version = ">= 1.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
  }

  # Remote state backend configuration for Grafana standalone.
  # Backend blocks cannot reference input variables, which is why the bucket,
  # region and profile are literals here rather than var.aws_region / var.aws_profile.
  backend "s3" {
    bucket         = "gg-ai-terraform-states"
    key            = "production/grafana-monitoring/terraform.tfstate"
    region         = "us-east-1"
    profile        = "908027381725_AdministratorAccess"
    dynamodb_table = "terraform-state-locks"
    encrypt        = true
  }
}
# AWS provider: region and CLI profile are both supplied through input variables
provider "aws" {
  profile = var.aws_profile
  region  = var.aws_region
}
# Deploy Grafana OTEL monitoring stack.
# Every argument is a straight pass-through of the root input variable of the same name.
module "grafana_otel" {
  source = "../modules/grafana-otel"

  # Core Infrastructure (required)
  vpc_id             = var.vpc_id
  private_subnet_ids = var.private_subnet_ids
  cluster_name       = var.cluster_name

  # Environment Configuration
  environment = var.environment
  aws_region  = var.aws_region
  name_prefix = var.name_prefix

  # Network Access Configuration
  allowed_cidr_blocks = var.allowed_cidr_blocks

  # Grafana Configuration
  grafana_admin_user     = var.grafana_admin_user
  grafana_admin_password = var.grafana_admin_password

  # Service Discovery Configuration
  service_discovery_namespace_id   = var.service_discovery_namespace_id
  service_discovery_namespace_name = var.service_discovery_namespace_name
  service_name                     = var.service_name

  # Resource Configuration
  cpu           = var.cpu
  memory        = var.memory
  desired_count = var.desired_count

  # Scaling Configuration
  enable_autoscaling = var.enable_autoscaling
  max_capacity       = var.max_capacity
  min_capacity       = var.min_capacity
  cpu_target_value   = var.cpu_target_value

  # Monitoring Configuration
  log_retention_days = var.log_retention_days

  # Security Configuration
  otlp_sources_security_group_ids = var.otlp_sources_security_group_ids
  additional_security_group_ids   = var.additional_security_group_ids

  # Tags
  tags = var.tags
}

View file

@ -0,0 +1,46 @@
# Pass through module outputs
# Re-exports the grafana_otel module's dashboard URL.
output "grafana_dashboard_url" {
  description = "Grafana dashboard URL (accessible from allowed CIDR blocks)"
  value       = module.grafana_otel.grafana_dashboard_url
}
# Marked sensitive so the value is redacted in plan/apply output;
# read it explicitly with `terraform output -json grafana_admin_credentials`.
output "grafana_admin_credentials" {
  description = "Grafana admin login credentials"
  sensitive   = true
  value       = module.grafana_otel.grafana_admin_credentials
}
# Re-exports the grafana_otel module's OTLP endpoint information.
output "otlp_endpoints" {
  description = "OpenTelemetry OTLP endpoints for telemetry data"
  value       = module.grafana_otel.otlp_endpoints
}
# Bundles three service-discovery outputs of the grafana_otel module into one object.
output "service_discovery_info" {
  description = "Service discovery information"

  value = {
    namespace_id   = module.grafana_otel.service_discovery_namespace_id
    namespace_name = module.grafana_otel.service_discovery_namespace_name
    service_arn    = module.grafana_otel.service_discovery_service_arn
  }
}
# Re-exports the grafana_otel module's security group ID.
output "security_group_id" {
  description = "Security group ID for Grafana tasks"
  value       = module.grafana_otel.security_group_id
}
# Re-exports the grafana_otel module's IAM execution role ARN.
output "execution_role_arn" {
  description = "IAM execution role ARN for Grafana tasks"
  value       = module.grafana_otel.execution_role_arn
}
# Re-exports the grafana_otel module's CloudWatch log group name.
output "cloudwatch_log_group_name" {
  description = "CloudWatch log group name for Grafana logs"
  value       = module.grafana_otel.cloudwatch_log_group_name
}
# Marked sensitive, so it is redacted unless requested explicitly
# (e.g. `terraform output -raw setup_instructions`).
output "setup_instructions" {
  description = "Instructions for accessing and configuring Grafana monitoring"
  sensitive   = true
  value       = module.grafana_otel.setup_instructions
}

View file

@ -0,0 +1,65 @@
# AWS Configuration
aws_region  = "us-east-1"
aws_profile = "908027381725_AdministratorAccess"

# Environment
environment = "production"
name_prefix = "grafana-otel"

# Core Infrastructure (Required - Update these values)
vpc_id = "vpc-01bc2784063a567d3"
private_subnet_ids = [
  "subnet-01296c54f7bff84bc",
  "subnet-00da3547f2178dd85"
]
cluster_name = "webUIcluster2"

# Network Access Configuration
# Allow access from VPN and internal networks
allowed_cidr_blocks = [
  "192.168.158.0/24", # GG VPN
  "192.168.144.0/23", # Internal VPC
  "10.0.0.0/8"        # Private networks
]

# Grafana Configuration
# NOTE(review): the password below is a placeholder; replace it before applying,
# or remove the line and supply the value via TF_VAR_grafana_admin_password.
grafana_admin_user     = "admin"
grafana_admin_password = "your-secure-password-here"

# Service Discovery Configuration
# Leave empty to create new namespace, or specify existing namespace ID
service_discovery_namespace_id   = "ns-m5evzfyyw2zelrfu"
service_discovery_namespace_name = "ggai"
service_name                     = "grafana-monitoring"

# Resource Configuration
cpu           = 1024 # 1 vCPU
memory        = 2048 # 2 GB
desired_count = 1

# Scaling Configuration
enable_autoscaling = true
max_capacity       = 2
min_capacity       = 1
cpu_target_value   = 80.0

# Monitoring Configuration
log_retention_days = 7

# Security Configuration
# Add security group IDs of applications that will send OTLP data
otlp_sources_security_group_ids = [
  # "sg-05e12bd2e202e19f6", # Example: OpenWebUI security group
  # "sg-87654321", # Example: Other application security group
]

# Additional security groups to attach to Grafana tasks
additional_security_group_ids = []

# Tags
tags = {
  Project     = "grafana-monitoring"
  Environment = "production"
  Owner       = "platform-team"
  ManagedBy   = "terraform"
}

View file

@ -0,0 +1,154 @@
# AWS Configuration
variable "aws_region" {
  type        = string
  description = "AWS region for deployment"
  default     = "us-east-1"
}

# NOTE(review): the default is an account-specific administrator profile name;
# anyone without that profile configured locally must override this value.
variable "aws_profile" {
  type        = string
  description = "AWS CLI profile to use"
  default     = "908027381725_AdministratorAccess"
}

variable "environment" {
  type        = string
  description = "Environment name (e.g., production, staging, dev)"
  default     = "production"
}

variable "name_prefix" {
  type        = string
  description = "Prefix for all resource names"
  default     = "grafana-otel"
}
# Core Infrastructure (Required)
# These three inputs have no default, so a value must be supplied for each.
variable "vpc_id" {
  type        = string
  description = "VPC ID where Grafana will be deployed"
}

variable "private_subnet_ids" {
  type        = list(string)
  description = "Private subnet IDs for Grafana ECS tasks"
}

variable "cluster_name" {
  type        = string
  description = "ECS cluster name where Grafana will be deployed"
}
# Network Access Configuration
# The default covers the three RFC 1918 private address ranges.
variable "allowed_cidr_blocks" {
  type        = list(string)
  description = "CIDR blocks allowed to access Grafana UI (port 3000)"
  default = [
    "10.0.0.0/8",
    "172.16.0.0/12",
    "192.168.0.0/16",
  ]
}
# Grafana Configuration
variable "grafana_admin_user" {
  description = "Grafana admin username"
  type        = string
  default     = "admin"
}

# Admin password handed to the grafana_otel module.
# NOTE(review): the default below is a credential stored in source control. It is kept
# so existing configurations that rely on it continue to plan, but deployments should
# override it (terraform.tfvars kept out of version control, or TF_VAR_grafana_admin_password).
variable "grafana_admin_password" {
  description = "Grafana admin password"
  type        = string
  default     = "openwebui_monitoring_2024"
  sensitive   = true

  # Fail at plan time instead of deploying with a blank admin password.
  validation {
    condition     = length(trimspace(var.grafana_admin_password)) > 0
    error_message = "The grafana_admin_password value must not be empty or whitespace-only."
  }
}
# Service Discovery Configuration
# An empty namespace ID is the default; per the descriptions below, a new namespace
# named after service_discovery_namespace_name is created in that case.
variable "service_discovery_namespace_id" {
  type        = string
  description = "Service discovery namespace ID (if using existing namespace)"
  default     = ""
}

variable "service_discovery_namespace_name" {
  type        = string
  description = "Service discovery namespace name (creates new if namespace_id not provided)"
  default     = "grafana-monitoring"
}

variable "service_name" {
  type        = string
  description = "Service discovery service name"
  default     = "otel-monitor"
}
# Resource Configuration
# Task sizing and the number of tasks to run.
variable "cpu" {
  type        = number
  description = "CPU units for Grafana task"
  default     = 1024
}

variable "memory" {
  type        = number
  description = "Memory (MB) for Grafana task"
  default     = 2048
}

variable "desired_count" {
  type        = number
  description = "Desired number of Grafana tasks"
  default     = 1
}
# Scaling Configuration
# Autoscaling is on by default, bounded by min_capacity and max_capacity.
variable "enable_autoscaling" {
  type        = bool
  description = "Enable ECS autoscaling for Grafana"
  default     = true
}

variable "max_capacity" {
  type        = number
  description = "Maximum number of tasks for autoscaling"
  default     = 2
}

variable "min_capacity" {
  type        = number
  description = "Minimum number of tasks for autoscaling"
  default     = 1
}

variable "cpu_target_value" {
  type        = number
  description = "Target CPU utilization for autoscaling"
  default     = 80.0
}
# Monitoring Configuration
# NOTE(review): CloudWatch is understood to accept only specific retention periods
# (e.g. 1, 3, 5, 7, 14, 30 days) — confirm before setting an arbitrary value.
variable "log_retention_days" {
  type        = number
  description = "CloudWatch log retention in days"
  default     = 7
}
# Security Configuration
# Both lists are empty by default.
variable "otlp_sources_security_group_ids" {
  type        = list(string)
  description = "Security group IDs that should be allowed to send OTLP data to Grafana"
  default     = []
}

variable "additional_security_group_ids" {
  type        = list(string)
  description = "Additional security group IDs to attach to Grafana tasks"
  default     = []
}
# Tags
# Supplying this variable replaces the whole default map; it is not merged with it.
variable "tags" {
  type        = map(string)
  description = "Additional tags for all resources"

  default = {
    Project   = "grafana-monitoring"
    ManagedBy = "terraform"
  }
}