diff --git a/iac/grafana-standalone/.terraform.lock.hcl b/iac/grafana-standalone/.terraform.lock.hcl new file mode 100644 index 0000000000..9e61c1a03b --- /dev/null +++ b/iac/grafana-standalone/.terraform.lock.hcl @@ -0,0 +1,25 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. + +provider "registry.terraform.io/hashicorp/aws" { + version = "5.100.0" + constraints = ">= 5.0.0, ~> 5.0" + hashes = [ + "h1:Ijt7pOlB7Tr7maGQIqtsLFbl7pSMIj06TVdkoSBcYOw=", + "zh:054b8dd49f0549c9a7cc27d159e45327b7b65cf404da5e5a20da154b90b8a644", + "zh:0b97bf8d5e03d15d83cc40b0530a1f84b459354939ba6f135a0086c20ebbe6b2", + "zh:1589a2266af699cbd5d80737a0fe02e54ec9cf2ca54e7e00ac51c7359056f274", + "zh:6330766f1d85f01ae6ea90d1b214b8b74cc8c1badc4696b165b36ddd4cc15f7b", + "zh:7c8c2e30d8e55291b86fcb64bdf6c25489d538688545eb48fd74ad622e5d3862", + "zh:99b1003bd9bd32ee323544da897148f46a527f622dc3971af63ea3e251596342", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:9f8b909d3ec50ade83c8062290378b1ec553edef6a447c56dadc01a99f4eaa93", + "zh:aaef921ff9aabaf8b1869a86d692ebd24fbd4e12c21205034bb679b9caf883a2", + "zh:ac882313207aba00dd5a76dbd572a0ddc818bb9cbf5c9d61b28fe30efaec951e", + "zh:bb64e8aff37becab373a1a0cc1080990785304141af42ed6aa3dd4913b000421", + "zh:dfe495f6621df5540d9c92ad40b8067376350b005c637ea6efac5dc15028add4", + "zh:f0ddf0eaf052766cfe09dea8200a946519f653c384ab4336e2a4a64fdd6310e9", + "zh:f1b7e684f4c7ae1eed272b6de7d2049bb87a0275cb04dbb7cda6636f600699c9", + "zh:ff461571e3f233699bf690db319dfe46aec75e58726636a0d97dd9ac6e32fb70", + ] +} diff --git a/iac/grafana-standalone/README.md b/iac/grafana-standalone/README.md new file mode 100644 index 0000000000..e0983df072 --- /dev/null +++ b/iac/grafana-standalone/README.md @@ -0,0 +1,335 @@ +# Grafana OTEL Standalone Deployment + +This directory contains a complete example for deploying the Grafana OTEL monitoring stack as a standalone service, independent from the 
main OpenWebUI infrastructure. + +**Location**: This deployment example is located in `iac/grafana-standalone/` and uses the module from `iac/modules/grafana-otel/`. + +## Quick Start + +### 1. Prerequisites + +- AWS CLI configured with appropriate permissions +- Terraform >= 1.0 installed +- Existing ECS cluster +- VPC with private subnets +- Access to S3 bucket `gg-ai-terraform-states` for state storage + +### 2. Configuration + +1. Copy the example variables file: + ```bash + cp terraform.tfvars.example terraform.tfvars + ``` + +2. Edit `terraform.tfvars` with your environment values: + ```hcl + # Required: Update these values for your environment + vpc_id = "vpc-your-vpc-id" + private_subnet_ids = ["subnet-12345", "subnet-67890"] + cluster_name = "your-ecs-cluster" + + # Optional: Customize as needed + grafana_admin_password = "your-secure-password" + allowed_cidr_blocks = ["your-vpn-cidr/24"] + ``` + +### 3. Deploy + +```bash +# Initialize Terraform with remote backend +terraform init + +# Review the plan +terraform plan + +# Deploy the infrastructure +terraform apply +``` + +**Note**: If you encounter AWS credential errors during `terraform init`, ensure your AWS CLI session is active: +```bash +# Verify that the AWS credentials for this profile are still valid (this does not refresh them; re-authenticate if it fails) +aws sts get-caller-identity --profile 908027381725_AdministratorAccess +``` + +## Remote State Backend + +This deployment uses an S3 remote backend for state management with the following configuration: + +```hcl +backend "s3" { + bucket = "gg-ai-terraform-states" + key = "production/grafana-monitoring/terraform.tfstate" + region = "us-east-1" + profile = "908027381725_AdministratorAccess" + dynamodb_table = "terraform-state-locks" + encrypt = true +} +``` + +### Key Benefits: + +- **Team Collaboration**: Multiple team members can work with the same state +- **State Locking**: DynamoDB table prevents concurrent modifications +- **Encryption**: State file is encrypted at rest +- **Separate State**: Independent from main OpenWebUI 
infrastructure state +- **Versioning**: S3 bucket versioning enables state history and recovery + +### State Path Structure: + +- **Main Infrastructure**: `production/gravity-ai-chat/terraform.tfstate` +- **Grafana Monitoring**: `production/grafana-monitoring/terraform.tfstate` + +This separation allows independent deployment and management of the monitoring stack. + +### 4. Access Grafana + +After deployment, Terraform will output the access information: + +```bash +# Get the Grafana URL and credentials +terraform output grafana_dashboard_url +terraform output -json grafana_admin_credentials + +# Get setup instructions +terraform output -raw setup_instructions +``` + +## Configuration Options + +### Basic Configuration + +For a simple deployment with default settings: + +```hcl +# terraform.tfvars +vpc_id = "vpc-12345678" +private_subnet_ids = ["subnet-12345", "subnet-67890"] +cluster_name = "my-cluster" +``` + +### Production Configuration + +For a production deployment with custom settings: + +```hcl +# terraform.tfvars +environment = "production" +name_prefix = "prod-grafana" + +# Increased resources +cpu = 2048 +memory = 4096 +desired_count = 2 + +# Autoscaling enabled +enable_autoscaling = true +max_capacity = 3 +min_capacity = 2 + +# Longer log retention +log_retention_days = 30 + +# Custom Grafana credentials +grafana_admin_user = "monitoring-admin" +grafana_admin_password = "very-secure-password-123" + +# Network access from specific CIDRs +allowed_cidr_blocks = [ + "192.168.1.0/24", # Office network + "10.100.0.0/16", # VPN network +] + +# Applications that will send telemetry +otlp_sources_security_group_ids = [ + "sg-app1-security-group", + "sg-app2-security-group", +] +``` + +### Integration with Existing Service Discovery + +If you have an existing service discovery namespace: + +```hcl +# Use existing namespace +service_discovery_namespace_id = "ns-existing-12345" +service_name = "monitoring" +``` + +## Integration with Applications + +After 
deploying Grafana, configure your applications to send telemetry data: + +### 1. Add Application Security Groups + +Update your `terraform.tfvars`: + +```hcl +otlp_sources_security_group_ids = [ + "sg-your-app-security-group", +] +``` + +Then run `terraform apply` to update the security group rules. + +### 2. Configure Application Environment Variables + +In your application deployment (ECS task definition, Kubernetes deployment, etc.): + +```bash +# OpenTelemetry configuration +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-monitor.grafana-monitoring:4317 +OTEL_EXPORTER_OTLP_INSECURE=true +OTEL_SERVICE_NAME=my-application +OTEL_RESOURCE_ATTRIBUTES=service.version=1.0.0,deployment.environment=production +``` + +### 3. Verify Integration + +```bash +# Check service discovery +nslookup otel-monitor.grafana-monitoring + +# Test OTLP endpoint connectivity (4317 is the OTLP/gRPC port, so test the TCP port instead of sending plain HTTP) +nc -zv otel-monitor.grafana-monitoring 4317 + +# Access Grafana dashboard +curl http://otel-monitor.grafana-monitoring:3000 +``` + +## Monitoring and Maintenance + +### Viewing Logs + +```bash +# View Grafana container logs +aws logs tail /ecs/grafana-otel --follow + +# Check ECS service events +aws ecs describe-services --cluster your-cluster --services grafana-otel +``` + +### Scaling + +```bash +# Manual scaling (if autoscaling is disabled) +aws ecs update-service --cluster your-cluster --service grafana-otel --desired-count 2 + +# Update autoscaling settings via Terraform +# Edit terraform.tfvars and run terraform apply +``` + +### Updates + +```bash +# Update to latest Grafana OTEL image. NOTE: `container_image` is not declared in this directory's variables.tf or passed to the module in main.tf; add both first, otherwise Terraform rejects the -var as undeclared +terraform apply -var="container_image=grafana/otel-lgtm:latest" + +# Update configuration +# Edit terraform.tfvars and run terraform apply +``` + +## Troubleshooting + +### Common Issues + +1. **Service not starting** + - Check CloudWatch logs for container errors + - Verify ECS cluster has capacity + - Check security group rules + +2. 
**Cannot access Grafana UI** + - Verify allowed_cidr_blocks includes your IP + - Check VPC connectivity (VPN, bastion host) + - Confirm service discovery is working + +3. **No telemetry data** + - Verify otlp_sources_security_group_ids + - Check application OTLP endpoint configuration + - Confirm network connectivity between services + +### Useful Commands + +```bash +# Check service status +terraform show | grep -A 10 "aws_ecs_service" + +# Verify service discovery +aws servicediscovery list-services + +# Check security groups +aws ec2 describe-security-groups --group-ids $(terraform output -raw security_group_id) + +# View all outputs +terraform output +``` + +## Cleanup + +To remove all resources: + +```bash +terraform destroy +``` + +## State Management Commands + +### Working with Remote State + +```bash +# Initialize with remote backend (first time setup) +terraform init + +# Migrate from local to remote state (if you have existing local state) +terraform init -migrate-state + +# View remote state +terraform show + +# List resources in state +terraform state list + +# Pull remote state to local (for inspection) +terraform state pull > current-state.json + +# Check state lock status (a held lock is an item whose LockID is the state path; `-md5` items are checksums, not locks) +aws dynamodb scan --table-name terraform-state-locks --profile 908027381725_AdministratorAccess +``` + +### State Recovery and Backup + +```bash +# Download current state from S3 +aws s3 cp s3://gg-ai-terraform-states/production/grafana-monitoring/terraform.tfstate ./backup-state.tfstate --profile 908027381725_AdministratorAccess + +# List state versions (if bucket versioning is enabled) +aws s3api list-object-versions --bucket gg-ai-terraform-states --prefix production/grafana-monitoring/terraform.tfstate --profile 908027381725_AdministratorAccess + +# Force unlock state (if locked and lock is stale) +terraform force-unlock LOCK_ID +``` + +## Security Considerations + +- Store sensitive variables (passwords) in environment variables or use AWS Secrets Manager +- 
Restrict `allowed_cidr_blocks` to minimum required networks +- Use strong passwords for Grafana admin account +- Regularly update the Grafana OTEL container image +- Monitor CloudWatch logs for security events + +## Cost Estimation + +Default configuration (1 task, 1 vCPU, 2GB RAM): +- ECS Fargate: ~$35-50/month +- CloudWatch Logs: ~$1-5/month (depending on log volume) +- Service Discovery: ~$0.50/month + +Total estimated cost: ~$40-60/month + +## Support + +For issues or questions: +1. Check the module documentation: `../modules/grafana-otel/README.md` +2. Review Terraform and AWS documentation +3. Check CloudWatch logs for detailed error messages diff --git a/iac/grafana-standalone/main.tf b/iac/grafana-standalone/main.tf new file mode 100644 index 0000000000..25d7464b06 --- /dev/null +++ b/iac/grafana-standalone/main.tf @@ -0,0 +1,74 @@ +# Configure the AWS Provider +terraform { + required_version = ">= 1.0" + + # Remote state backend configuration for Grafana standalone + backend "s3" { + bucket = "gg-ai-terraform-states" + key = "production/grafana-monitoring/terraform.tfstate" + region = "us-east-1" + profile = "908027381725_AdministratorAccess" + dynamodb_table = "terraform-state-locks" + encrypt = true + } + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} + +provider "aws" { + region = var.aws_region + profile = var.aws_profile +} + +# Deploy Grafana OTEL monitoring stack +module "grafana_otel" { + source = "../modules/grafana-otel" + + # Core Infrastructure (required) + vpc_id = var.vpc_id + private_subnet_ids = var.private_subnet_ids + cluster_name = var.cluster_name + + # Environment Configuration + environment = var.environment + aws_region = var.aws_region + name_prefix = var.name_prefix + + # Network Access Configuration + allowed_cidr_blocks = var.allowed_cidr_blocks + + # Grafana Configuration + grafana_admin_user = var.grafana_admin_user + grafana_admin_password = var.grafana_admin_password + + # Service 
Discovery Configuration + service_discovery_namespace_id = var.service_discovery_namespace_id + service_discovery_namespace_name = var.service_discovery_namespace_name + service_name = var.service_name + + # Resource Configuration + cpu = var.cpu + memory = var.memory + desired_count = var.desired_count + + # Scaling Configuration + enable_autoscaling = var.enable_autoscaling + max_capacity = var.max_capacity + min_capacity = var.min_capacity + cpu_target_value = var.cpu_target_value + + # Monitoring Configuration + log_retention_days = var.log_retention_days + + # Security Configuration + otlp_sources_security_group_ids = var.otlp_sources_security_group_ids + additional_security_group_ids = var.additional_security_group_ids + + # Tags + tags = var.tags +} diff --git a/iac/grafana-standalone/outputs.tf b/iac/grafana-standalone/outputs.tf new file mode 100644 index 0000000000..58a19b59b8 --- /dev/null +++ b/iac/grafana-standalone/outputs.tf @@ -0,0 +1,46 @@ +# Pass through module outputs +output "grafana_dashboard_url" { + description = "Grafana dashboard URL (accessible from allowed CIDR blocks)" + value = module.grafana_otel.grafana_dashboard_url +} + +output "grafana_admin_credentials" { + description = "Grafana admin login credentials" + value = module.grafana_otel.grafana_admin_credentials + sensitive = true +} + +output "otlp_endpoints" { + description = "OpenTelemetry OTLP endpoints for telemetry data" + value = module.grafana_otel.otlp_endpoints +} + +output "service_discovery_info" { + description = "Service discovery information" + value = { + namespace_id = module.grafana_otel.service_discovery_namespace_id + namespace_name = module.grafana_otel.service_discovery_namespace_name + service_arn = module.grafana_otel.service_discovery_service_arn + } +} + +output "security_group_id" { + description = "Security group ID for Grafana tasks" + value = module.grafana_otel.security_group_id +} + +output "execution_role_arn" { + description = "IAM execution role ARN 
for Grafana tasks" + value = module.grafana_otel.execution_role_arn +} + +output "cloudwatch_log_group_name" { + description = "CloudWatch log group name for Grafana logs" + value = module.grafana_otel.cloudwatch_log_group_name +} + +output "setup_instructions" { + description = "Instructions for accessing and configuring Grafana monitoring" + value = module.grafana_otel.setup_instructions + sensitive = true +} diff --git a/iac/grafana-standalone/terraform.tfvars.example b/iac/grafana-standalone/terraform.tfvars.example new file mode 100644 index 0000000000..5902dcd74e --- /dev/null +++ b/iac/grafana-standalone/terraform.tfvars.example @@ -0,0 +1,65 @@ +# AWS Configuration +aws_region = "us-east-1" +aws_profile = "908027381725_AdministratorAccess" + +# Environment +environment = "production" +name_prefix = "grafana-otel" + +# Core Infrastructure (Required - Update these values) +vpc_id = "vpc-01bc2784063a567d3" +private_subnet_ids = [ + "subnet-01296c54f7bff84bc", + "subnet-00da3547f2178dd85" +] +cluster_name = "webUIcluster2" + +# Network Access Configuration +# Allow access from VPN and internal networks +allowed_cidr_blocks = [ + "192.168.158.0/24", # GG VPN + "192.168.144.0/23", # Internal VPC + "10.0.0.0/8" # Private networks +] + +# Grafana Configuration +grafana_admin_user = "admin" +grafana_admin_password = "your-secure-password-here" + +# Service Discovery Configuration +# Leave empty to create new namespace, or specify existing namespace ID +service_discovery_namespace_id = "ns-m5evzfyyw2zelrfu" +service_discovery_namespace_name = "ggai" +service_name = "grafana-monitoring" + +# Resource Configuration +cpu = 1024 # 1 vCPU +memory = 2048 # 2 GB +desired_count = 1 + +# Scaling Configuration +enable_autoscaling = true +max_capacity = 2 +min_capacity = 1 +cpu_target_value = 80.0 + +# Monitoring Configuration +log_retention_days = 7 + +# Security Configuration +# Add security group IDs of applications that will send OTLP data +otlp_sources_security_group_ids = 
[ + # "sg-05e12bd2e202e19f6", # Example: OpenWebUI security group + # "sg-87654321", # Example: Other application security group +] + +# Additional security groups to attach to Grafana tasks +additional_security_group_ids = [] + +# Tags +tags = { + Project = "grafana-monitoring" + Environment = "production" + Owner = "platform-team" + ManagedBy = "terraform" +} diff --git a/iac/grafana-standalone/variables.tf b/iac/grafana-standalone/variables.tf new file mode 100644 index 0000000000..b5c308aaeb --- /dev/null +++ b/iac/grafana-standalone/variables.tf @@ -0,0 +1,154 @@ +# AWS Configuration +variable "aws_region" { + description = "AWS region for deployment" + type = string + default = "us-east-1" +} + +variable "aws_profile" { + description = "AWS CLI profile to use" + type = string + default = "908027381725_AdministratorAccess" +} + +variable "environment" { + description = "Environment name (e.g., production, staging, dev)" + type = string + default = "production" +} + +variable "name_prefix" { + description = "Prefix for all resource names" + type = string + default = "grafana-otel" +} + +# Core Infrastructure (Required) +variable "vpc_id" { + description = "VPC ID where Grafana will be deployed" + type = string +} + +variable "private_subnet_ids" { + description = "Private subnet IDs for Grafana ECS tasks" + type = list(string) +} + +variable "cluster_name" { + description = "ECS cluster name where Grafana will be deployed" + type = string +} + +# Network Access Configuration +variable "allowed_cidr_blocks" { + description = "CIDR blocks allowed to access Grafana UI (port 3000)" + type = list(string) + default = ["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"] +} + +# Grafana Configuration +variable "grafana_admin_user" { + description = "Grafana admin username" + type = string + default = "admin" +} + +variable "grafana_admin_password" { + description = "Grafana admin password (the built-in default is committed to source control; override it via terraform.tfvars or TF_VAR_grafana_admin_password)" + type = string + default = "openwebui_monitoring_2024" + sensitive = true +} + +# 
Service Discovery Configuration +variable "service_discovery_namespace_id" { + description = "Service discovery namespace ID (if using existing namespace)" + type = string + default = "" +} + +variable "service_discovery_namespace_name" { + description = "Service discovery namespace name (creates new if namespace_id not provided)" + type = string + default = "grafana-monitoring" +} + +variable "service_name" { + description = "Service discovery service name" + type = string + default = "otel-monitor" +} + +# Resource Configuration +variable "cpu" { + description = "CPU units for Grafana task" + type = number + default = 1024 +} + +variable "memory" { + description = "Memory (MB) for Grafana task" + type = number + default = 2048 +} + +variable "desired_count" { + description = "Desired number of Grafana tasks" + type = number + default = 1 +} + +# Scaling Configuration +variable "enable_autoscaling" { + description = "Enable ECS autoscaling for Grafana" + type = bool + default = true +} + +variable "max_capacity" { + description = "Maximum number of tasks for autoscaling" + type = number + default = 2 +} + +variable "min_capacity" { + description = "Minimum number of tasks for autoscaling" + type = number + default = 1 +} + +variable "cpu_target_value" { + description = "Target CPU utilization for autoscaling" + type = number + default = 80.0 +} + +# Monitoring Configuration +variable "log_retention_days" { + description = "CloudWatch log retention in days" + type = number + default = 7 +} + +# Security Configuration +variable "otlp_sources_security_group_ids" { + description = "Security group IDs that should be allowed to send OTLP data to Grafana" + type = list(string) + default = [] +} + +variable "additional_security_group_ids" { + description = "Additional security group IDs to attach to Grafana tasks" + type = list(string) + default = [] +} + +# Tags +variable "tags" { + description = "Additional tags for all resources" + type = map(string) + default = { + 
Project = "grafana-monitoring" + ManagedBy = "terraform" + } +}