Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,629 @@
# Ruvector Burst Scaling Infrastructure
#
# This Terraform configuration manages:
# - Cloud Run services with auto-scaling
# - Load balancers
# - Cloud SQL and Redis with scaling policies
# - Monitoring and alerting
# - Budget alerts
# Terraform core settings: minimum CLI version, remote state, provider pins.
terraform {
  required_version = ">= 1.0"

  # Remote state lives in a GCS bucket under the "burst-scaling" prefix.
  backend "gcs" {
    bucket = "ruvector-terraform-state"
    prefix = "burst-scaling"
  }

  # Both Google providers are constrained to the 5.x release series.
  required_providers {
    google-beta = {
      source  = "hashicorp/google-beta"
      version = "~> 5.0"
    }

    google = {
      source  = "hashicorp/google"
      version = "~> 5.0"
    }
  }
}
# Default (GA) provider: resources without an explicit project/region inherit
# var.project_id and var.primary_region.
provider "google" {
  region  = var.primary_region
  project = var.project_id
}

# Beta provider, configured identically to the GA provider.
provider "google-beta" {
  region  = var.primary_region
  project = var.project_id
}
# ===== Cloud Run Services =====
# One Cloud Run service per entry in var.regions, named "ruvector-<region>".
#
# NOTE(review): no vpc_access block is configured, while the Cloud SQL and
# Redis instances declared in this file are attached to the private network
# only -- confirm how the service is expected to reach them.
# NOTE(review): both secrets are read at version "latest"; no secret versions
# are created in this file, so presumably they are populated out of band.
resource "google_cloud_run_v2_service" "ruvector" {
  for_each = toset(var.regions)

  location = each.key
  name     = "ruvector-${each.key}"

  template {
    service_account = google_service_account.ruvector.email
    timeout         = "300s"

    # Aggressive auto-scaling configuration: per-instance request concurrency
    # plus the instance-count bounds below.
    max_instance_request_concurrency = var.max_concurrency

    scaling {
      max_instance_count = var.max_instances
      min_instance_count = var.min_instances
    }

    containers {
      image = var.container_image

      ports {
        name           = "http1"
        container_port = 8080
      }

      resources {
        startup_cpu_boost = true
        cpu_idle          = true

        limits = {
          memory = var.memory_limit
          cpu    = var.cpu_limit
        }
      }

      # Plain environment variables.
      env {
        name  = "REGION"
        value = each.key
      }

      env {
        name  = "MAX_CONNECTIONS"
        value = tostring(var.max_connections_per_instance)
      }

      # Connection strings are injected from Secret Manager.
      env {
        name = "DATABASE_URL"

        value_source {
          secret_key_ref {
            secret  = google_secret_manager_secret.database_url.id
            version = "latest"
          }
        }
      }

      env {
        name = "REDIS_URL"

        value_source {
          secret_key_ref {
            secret  = google_secret_manager_secret.redis_url.id
            version = "latest"
          }
        }
      }
    }
  }

  # All traffic goes to the latest revision.
  traffic {
    percent = 100
    type    = "TRAFFIC_TARGET_ALLOCATION_TYPE_LATEST"
  }

  # The Run API and the secret-accessor grants must exist before deployment.
  depends_on = [
    google_project_service.cloud_run,
    google_secret_manager_secret_iam_member.cloud_run_database,
    google_secret_manager_secret_iam_member.cloud_run_redis,
  ]
}
# High-CPU alert for each regional Cloud Run service.
#
# This resource only notifies operators (email channel); it does not itself
# change instance counts. Instance bounds are set in the Cloud Run service's
# `scaling` block.
resource "google_monitoring_alert_policy" "high_cpu" {
  for_each = toset(var.regions)

  display_name = "High CPU - ${each.key}"
  combiner     = "OR"

  conditions {
    display_name = "CPU utilization above ${var.cpu_scale_out_threshold * 100}%"

    condition_threshold {
      filter          = "resource.type = \"cloud_run_revision\" AND resource.labels.service_name = \"ruvector-${each.key}\" AND metric.type = \"run.googleapis.com/container/cpu/utilizations\""
      comparison      = "COMPARISON_GT"
      threshold_value = var.cpu_scale_out_threshold
      duration        = "60s"

      aggregations {
        alignment_period = "60s"

        # container/cpu/utilizations is a distribution-valued metric.
        # ALIGN_MEAN is only defined for numeric values, so the distribution
        # is reduced to a scalar with a percentile aligner instead. The median
        # is used as the closest analogue of the previous "mean" intent; pick
        # a higher percentile (e.g. ALIGN_PERCENTILE_95) for a more sensitive
        # alert.
        per_series_aligner = "ALIGN_PERCENTILE_50"
      }
    }
  }

  notification_channels = [google_monitoring_notification_channel.email.id]

  # Incidents close automatically after 30 minutes.
  alert_strategy {
    auto_close = "1800s"
  }
}
# ===== Global Load Balancer =====
# Global address used as the load balancer's frontend IP (see the forwarding
# rule and the "load_balancer_ip" output).
resource "google_compute_global_address" "ruvector" {
  name = "ruvector-lb-ip"
}
# Frontend: accepts traffic on port 443 at the reserved global IP and hands it
# to the HTTPS proxy.
resource "google_compute_global_forwarding_rule" "ruvector" {
  name                  = "ruvector-lb-forwarding-rule"
  load_balancing_scheme = "EXTERNAL_MANAGED"
  ip_address            = google_compute_global_address.ruvector.address
  port_range            = "443"
  target                = google_compute_target_https_proxy.ruvector.id
}
# HTTPS proxy: presents the managed certificate and routes via the URL map.
resource "google_compute_target_https_proxy" "ruvector" {
  name             = "ruvector-https-proxy"
  ssl_certificates = [google_compute_managed_ssl_certificate.ruvector.id]
  url_map          = google_compute_url_map.ruvector.id
}
# Google-managed certificate covering the single domain in var.domain.
resource "google_compute_managed_ssl_certificate" "ruvector" {
  name = "ruvector-ssl-cert"

  managed {
    domains = [var.domain]
  }
}
# URL map with no path rules: every request goes to the one backend service.
resource "google_compute_url_map" "ruvector" {
  default_service = google_compute_backend_service.ruvector.id
  name            = "ruvector-url-map"
}
# Global backend service fronting the per-region Cloud Run services through
# serverless network endpoint groups (NEGs).
#
# Every backend attached here is a serverless NEG, which imposes two
# constraints that this block respects:
#   * no `health_checks` -- backend services with serverless NEG backends
#     cannot reference a health check;
#   * no utilization/connection targets on the backend -- `max_utilization`
#     and `max_connections_per_instance` apply to instance-group/zonal-NEG
#     backends (and connection targets are not valid for an HTTP backend
#     service at all).
#
# NOTE(review): Cloud CDN and IAP are both configured on this one backend
# service; confirm they may be enabled together before setting
# var.enable_iap = true.
resource "google_compute_backend_service" "ruvector" {
  name                  = "ruvector-backend"
  protocol              = "HTTP"
  port_name             = "http"
  timeout_sec           = 30
  load_balancing_scheme = "EXTERNAL_MANAGED"

  # CDN configuration
  enable_cdn = true

  cdn_policy {
    cache_mode        = "CACHE_ALL_STATIC"
    default_ttl       = 3600
    client_ttl        = 3600
    max_ttl           = 86400
    negative_caching  = true
    serve_while_stale = 86400
  }

  # IAP for admin endpoints
  iap {
    enabled              = var.enable_iap
    oauth2_client_id     = var.iap_client_id
    oauth2_client_secret = var.iap_client_secret
  }

  # One serverless-NEG backend per region. Only `group` and the capacity
  # scaler are set; see the header comment for why no balancing targets are
  # specified.
  dynamic "backend" {
    for_each = toset(var.regions)

    content {
      group           = google_compute_region_network_endpoint_group.ruvector[backend.key].id
      capacity_scaler = 1.0
    }
  }

  # Circuit breaker
  circuit_breakers {
    max_connections = var.circuit_breaker_max_connections
  }

  # Outlier detection: evaluated every 10s; a backend is ejected after 5
  # consecutive errors for a base period of 30s, with at most 50% of backends
  # ejected at once.
  outlier_detection {
    consecutive_errors = 5

    interval {
      seconds = 10
    }

    base_ejection_time {
      seconds = 30
    }

    max_ejection_percent         = 50
    enforcing_consecutive_errors = 100
  }

  # Log configuration
  log_config {
    enable      = true
    sample_rate = var.log_sample_rate
  }
}
# One serverless NEG per region, each pointing at that region's Cloud Run
# service so the global backend service can route to it.
resource "google_compute_region_network_endpoint_group" "ruvector" {
  for_each = toset(var.regions)

  region                = each.key
  name                  = "ruvector-neg-${each.key}"
  network_endpoint_type = "SERVERLESS"

  cloud_run {
    service = google_cloud_run_v2_service.ruvector[each.key].name
  }
}
# HTTP health check: GET /health on port 8080 every 10s (5s timeout);
# 2 successes mark a backend healthy, 3 failures mark it unhealthy.
#
# NOTE(review): the only backends declared in this file are serverless NEGs;
# confirm whether a health check can actually be used with them.
resource "google_compute_health_check" "ruvector" {
  name = "ruvector-health-check"

  timeout_sec         = 5
  check_interval_sec  = 10
  unhealthy_threshold = 3
  healthy_threshold   = 2

  http_health_check {
    request_path = "/health"
    port         = 8080
    proxy_header = "NONE"
  }
}
# ===== Cloud SQL (PostgreSQL) =====
# One PostgreSQL 15 primary per region, reachable only over the private VPC.
#
# NOTE(review): shared_buffers / effective_cache_size are hard-coded while the
# machine tier is configurable (var.database_tier). Cloud SQL is understood to
# restrict these flags relative to instance memory -- confirm the fixed values
# are accepted for the chosen tier.
resource "google_sql_database_instance" "ruvector" {
  for_each = toset(var.regions)

  name                = "ruvector-db-${each.key}"
  region              = each.key
  database_version    = "POSTGRES_15"
  deletion_protection = var.enable_deletion_protection

  settings {
    tier              = var.database_tier
    availability_type = "REGIONAL"

    # Storage: SSD that starts at var.database_disk_size and may auto-grow.
    disk_type       = "PD_SSD"
    disk_size       = var.database_disk_size
    disk_autoresize = true

    # Private IP only; SSL is required for connections.
    ip_configuration {
      private_network = google_compute_network.ruvector.id
      ipv4_enabled    = false
      require_ssl     = true
    }

    # Backups start at 03:00 with point-in-time recovery; 30 backups and
    # 7 days of transaction logs are retained.
    backup_configuration {
      enabled                        = true
      start_time                     = "03:00"
      point_in_time_recovery_enabled = true
      transaction_log_retention_days = 7

      backup_retention_settings {
        retained_backups = 30
      }
    }

    # Query Insights
    insights_config {
      query_insights_enabled  = true
      query_string_length     = 1024
      record_client_address   = true
      record_application_tags = true
    }

    database_flags {
      name  = "max_connections"
      value = var.database_max_connections
    }

    database_flags {
      name  = "shared_buffers"
      value = "262144" # 2GB
    }

    database_flags {
      name  = "effective_cache_size"
      value = "786432" # 6GB
    }
  }

  # The SQL Admin API and the private services peering must exist first.
  depends_on = [
    google_project_service.sql_admin,
    google_service_networking_connection.private_vpc_connection,
  ]
}
# Read replicas for scaling reads
# Optional zonal read replica per region, created only when
# var.enable_read_replicas is true. Each replica follows the primary of the
# same region and is never a failover target.
resource "google_sql_database_instance" "ruvector_replica" {
  for_each = var.enable_read_replicas ? toset(var.regions) : toset([])

  name                 = "ruvector-db-${each.key}-replica"
  region               = each.key
  database_version     = "POSTGRES_15"
  master_instance_name = google_sql_database_instance.ruvector[each.key].name
  deletion_protection  = var.enable_deletion_protection

  replica_configuration {
    failover_target = false
  }

  settings {
    tier              = var.database_replica_tier
    availability_type = "ZONAL"
    disk_type         = "PD_SSD"
    disk_autoresize   = true

    # Private IP only, on the same VPC as the primary.
    ip_configuration {
      private_network = google_compute_network.ruvector.id
      ipv4_enabled    = false
    }
  }
}
# ===== Redis (Memorystore) =====
# One STANDARD_HA Redis 7.0 instance per region, attached to the VPC through
# private service access.
resource "google_redis_instance" "ruvector" {
  for_each = toset(var.regions)

  name         = "ruvector-redis-${each.key}"
  display_name = "Ruvector Redis - ${each.key}"
  region       = each.key

  redis_version  = "REDIS_7_0"
  tier           = "STANDARD_HA"
  memory_size_gb = var.redis_memory_size

  connect_mode       = "PRIVATE_SERVICE_ACCESS"
  authorized_network = google_compute_network.ruvector.id

  # LRU eviction across all keys; keyspace notifications for expired keys.
  redis_configs = {
    "notify-keyspace-events" = "Ex"
    "maxmemory-policy"       = "allkeys-lru"
  }

  # Weekly maintenance window: Sundays starting at 03:00.
  maintenance_policy {
    weekly_maintenance_window {
      day = "SUNDAY"

      start_time {
        minutes = 0
        hours   = 3
      }
    }
  }

  # The Redis API and the private services peering must exist first.
  depends_on = [
    google_project_service.redis,
    google_service_networking_connection.private_vpc_connection,
  ]
}
# ===== Networking =====
# Custom-mode VPC: subnets are declared explicitly rather than auto-created.
resource "google_compute_network" "ruvector" {
  auto_create_subnetworks = false
  name                    = "ruvector-network"
}
# One subnet per region. Each range is carved out of var.vpc_cidr by adding
# 8 prefix bits, using the region's position in var.regions as the index.
#
# NOTE(review): because the index comes from list position, reordering or
# removing entries in var.regions would presumably shift subnet ranges --
# confirm before editing that list on a live deployment.
resource "google_compute_subnetwork" "ruvector" {
  for_each = toset(var.regions)

  name                     = "ruvector-subnet-${each.key}"
  region                   = each.key
  network                  = google_compute_network.ruvector.id
  ip_cidr_range            = cidrsubnet(var.vpc_cidr, 8, index(var.regions, each.key))
  private_ip_google_access = true
}
# Internal /16 range reserved for VPC peering; it is handed to the service
# networking connection as its reserved peering range.
resource "google_compute_global_address" "private_ip_address" {
  name          = "ruvector-private-ip"
  network       = google_compute_network.ruvector.id
  address_type  = "INTERNAL"
  purpose       = "VPC_PEERING"
  prefix_length = 16
}
# The Service Networking API must be enabled before the peering below can be
# created; no other google_project_service in this configuration enables it.
resource "google_project_service" "servicenetworking" {
  service            = "servicenetworking.googleapis.com"
  disable_on_destroy = false
}

# Private services access: peers the VPC with Google-managed services using
# the reserved internal range. The Cloud SQL and Redis instances depend on
# this connection.
resource "google_service_networking_connection" "private_vpc_connection" {
  network                 = google_compute_network.ruvector.id
  service                 = "servicenetworking.googleapis.com"
  reserved_peering_ranges = [google_compute_global_address.private_ip_address.name]

  depends_on = [google_project_service.servicenetworking]
}
# ===== IAM & Service Accounts =====
# Identity the Cloud Run services run as; the role and secret-access grants
# in this file are bound to it.
resource "google_service_account" "ruvector" {
  display_name = "Ruvector Service Account"
  account_id   = "ruvector-service"
}
# Project-level roles for the service account: metric writer, log writer,
# and Cloud Trace agent.
resource "google_project_iam_member" "ruvector_monitoring" {
  member  = "serviceAccount:${google_service_account.ruvector.email}"
  role    = "roles/monitoring.metricWriter"
  project = var.project_id
}

resource "google_project_iam_member" "ruvector_logging" {
  member  = "serviceAccount:${google_service_account.ruvector.email}"
  role    = "roles/logging.logWriter"
  project = var.project_id
}

resource "google_project_iam_member" "ruvector_trace" {
  member  = "serviceAccount:${google_service_account.ruvector.email}"
  role    = "roles/cloudtrace.agent"
  project = var.project_id
}
# ===== Secrets Manager =====
# Secret containers for the database and Redis connection URLs, with
# automatic replication. Only the containers are declared here; no secret
# versions (values) are created in this file.
resource "google_secret_manager_secret" "database_url" {
  secret_id = "ruvector-database-url"

  replication {
    auto {}
  }
}

resource "google_secret_manager_secret" "redis_url" {
  secret_id = "ruvector-redis-url"

  replication {
    auto {}
  }
}
# Lets the service account read each connection-URL secret. The Cloud Run
# service lists both grants in its depends_on.
resource "google_secret_manager_secret_iam_member" "cloud_run_database" {
  member    = "serviceAccount:${google_service_account.ruvector.email}"
  role      = "roles/secretmanager.secretAccessor"
  secret_id = google_secret_manager_secret.database_url.id
}

resource "google_secret_manager_secret_iam_member" "cloud_run_redis" {
  member    = "serviceAccount:${google_service_account.ruvector.email}"
  role      = "roles/secretmanager.secretAccessor"
  secret_id = google_secret_manager_secret.redis_url.id
}
# ===== Monitoring & Alerts =====
# Email channel used by the CPU alert policy and the billing budget.
resource "google_monitoring_notification_channel" "email" {
  type         = "email"
  display_name = "Email Notifications"

  labels = {
    email_address = var.alert_email
  }
}
# Optional PagerDuty channel, created only when an integration key is given.
# NOTE(review): no alert policy or budget visible in this file references
# this channel -- confirm where it is meant to be attached.
resource "google_monitoring_notification_channel" "pagerduty" {
  count = var.pagerduty_integration_key != "" ? 1 : 0

  type         = "pagerduty"
  display_name = "PagerDuty"

  sensitive_labels {
    service_key = var.pagerduty_integration_key
  }
}
# Budget alerts
# Looks up the project so the budget filter can use its numeric identifier.
data "google_project" "ruvector_budget" {
  project_id = var.project_id
}

# Monthly budget for the project with notifications at 50%, 80%, 90% and 100%
# of actual spend, plus 120% of forecasted spend.
resource "google_billing_budget" "ruvector" {
  billing_account = var.billing_account
  display_name    = "Ruvector Budget"

  budget_filter {
    # Budget filters identify projects as "projects/{project_number}".
    # Passing the project ID here (as before) does not match that format, so
    # the number is resolved from the data source above.
    projects = ["projects/${data.google_project.ruvector_budget.number}"]
  }

  amount {
    specified_amount {
      currency_code = "USD"
      units         = tostring(var.monthly_budget)
    }
  }

  # Thresholds on actual spend.
  threshold_rules {
    threshold_percent = 0.5
  }

  threshold_rules {
    threshold_percent = 0.8
  }

  threshold_rules {
    threshold_percent = 0.9
  }

  threshold_rules {
    threshold_percent = 1.0
  }

  # Early warning: forecasted spend reaching 120% of the budget.
  threshold_rules {
    threshold_percent = 1.2
    spend_basis       = "FORECASTED_SPEND"
  }

  # Notify the email channel in addition to the default IAM recipients.
  all_updates_rule {
    monitoring_notification_channels = [
      google_monitoring_notification_channel.email.id,
    ]
    disable_default_iam_recipients = false
  }
}
# ===== Enable Required APIs =====
# Google APIs enabled for the project. disable_on_destroy = false leaves each
# API enabled when this configuration is destroyed.
resource "google_project_service" "cloud_run" {
  disable_on_destroy = false
  service            = "run.googleapis.com"
}

resource "google_project_service" "compute" {
  disable_on_destroy = false
  service            = "compute.googleapis.com"
}

resource "google_project_service" "sql_admin" {
  disable_on_destroy = false
  service            = "sqladmin.googleapis.com"
}

resource "google_project_service" "redis" {
  disable_on_destroy = false
  service            = "redis.googleapis.com"
}

resource "google_project_service" "monitoring" {
  disable_on_destroy = false
  service            = "monitoring.googleapis.com"
}

resource "google_project_service" "logging" {
  disable_on_destroy = false
  service            = "logging.googleapis.com"
}

resource "google_project_service" "secretmanager" {
  disable_on_destroy = false
  service            = "secretmanager.googleapis.com"
}
# ===== Outputs =====
# Address of the global load balancer frontend.
output "load_balancer_ip" {
  value       = google_compute_global_address.ruvector.address
  description = "Global load balancer IP address"
}

# Map of region => Cloud Run service URI.
output "cloud_run_services" {
  description = "Cloud Run service URLs by region"

  value = {
    for region_name, run_service in google_cloud_run_v2_service.ruvector :
    region_name => run_service.uri
  }
}

# Map of region => Cloud SQL connection name (primaries only).
output "database_instances" {
  description = "Cloud SQL instance connection names"

  value = {
    for region_name, sql_instance in google_sql_database_instance.ruvector :
    region_name => sql_instance.connection_name
  }
}

# Map of region => Redis host; marked sensitive so it is redacted in CLI output.
output "redis_instances" {
  description = "Redis instance hosts"
  sensitive   = true

  value = {
    for region_name, redis_instance in google_redis_instance.ruvector :
    region_name => redis_instance.host
  }
}

View File

@@ -0,0 +1,417 @@
# Terraform Variables for Ruvector Burst Scaling
# ===== Project Configuration =====
# Required: no default.
variable "project_id" {
  type        = string
  description = "GCP Project ID"
}

# Required: no default.
variable "billing_account" {
  type        = string
  description = "GCP Billing Account ID"
}

variable "primary_region" {
  type        = string
  description = "Primary GCP region"
  default     = "us-central1"
}

variable "regions" {
  type        = list(string)
  description = "List of regions to deploy to"
  default = [
    "us-central1",
    "europe-west1",
    "asia-east1",
  ]
}

# Required: no default.
variable "domain" {
  type        = string
  description = "Domain name for the application"
}
# ===== Cloud Run Configuration =====
variable "container_image" {
  type        = string
  description = "Container image for Cloud Run"
  default     = "gcr.io/ruvector/app:latest"
}

variable "min_instances" {
  type        = number
  description = "Minimum number of Cloud Run instances per region"
  default     = 10
}

variable "max_instances" {
  type        = number
  description = "Maximum number of Cloud Run instances per region"
  default     = 1000
}

variable "cpu_limit" {
  type        = string
  description = "CPU limit for Cloud Run containers"
  default     = "4000m" # 4 vCPUs
}

variable "memory_limit" {
  type        = string
  description = "Memory limit for Cloud Run containers"
  default     = "8Gi" # 8GB
}

variable "max_concurrency" {
  type        = number
  description = "Maximum concurrent requests per Cloud Run instance"
  default     = 1000
}

variable "max_connections_per_instance" {
  type        = number
  description = "Maximum connections per Cloud Run instance"
  default     = 500000
}
# ===== Scaling Thresholds =====
# Utilization thresholds are fractions (0-1), not percentages.
variable "cpu_scale_out_threshold" {
  type        = number
  description = "CPU utilization threshold for scaling out (0-1)"
  default     = 0.70
}

variable "cpu_scale_in_threshold" {
  type        = number
  description = "CPU utilization threshold for scaling in (0-1)"
  default     = 0.30
}

variable "memory_scale_out_threshold" {
  type        = number
  description = "Memory utilization threshold for scaling out (0-1)"
  default     = 0.75
}

variable "memory_scale_in_threshold" {
  type        = number
  description = "Memory utilization threshold for scaling in (0-1)"
  default     = 0.35
}

variable "latency_threshold_ms" {
  type        = number
  description = "P99 latency threshold in milliseconds"
  default     = 50
}
# ===== Load Balancer Configuration =====
variable "backend_max_utilization" {
  type        = number
  description = "Maximum backend utilization before load balancer scales (0-1)"
  default     = 0.80
}

variable "circuit_breaker_max_connections" {
  type        = number
  description = "Maximum connections before circuit breaker trips"
  default     = 10000
}

variable "log_sample_rate" {
  type        = number
  description = "Sampling rate for load balancer logs (0-1)"
  default     = 0.1
}

variable "enable_iap" {
  type        = bool
  description = "Enable Identity-Aware Proxy for admin endpoints"
  default     = false
}

# IAP OAuth credentials are marked sensitive; both default to empty.
variable "iap_client_id" {
  type        = string
  description = "IAP OAuth2 Client ID"
  sensitive   = true
  default     = ""
}

variable "iap_client_secret" {
  type        = string
  description = "IAP OAuth2 Client Secret"
  sensitive   = true
  default     = ""
}
# ===== Database Configuration =====
variable "database_tier" {
  type        = string
  description = "Cloud SQL instance tier"
  default     = "db-custom-16-65536" # 16 vCPUs, 64GB RAM
}

variable "database_replica_tier" {
  type        = string
  description = "Cloud SQL read replica instance tier"
  default     = "db-custom-8-32768" # 8 vCPUs, 32GB RAM
}

variable "database_disk_size" {
  type        = number
  description = "Cloud SQL disk size in GB"
  default     = 500
}

# Declared as a string because it is passed straight into a database flag
# value.
variable "database_max_connections" {
  type        = string
  description = "Maximum database connections"
  default     = "5000"
}

variable "enable_read_replicas" {
  type        = bool
  description = "Enable Cloud SQL read replicas"
  default     = true
}
# ===== Redis Configuration =====
# Applied to every regional Redis instance.
variable "redis_memory_size" {
  type        = number
  description = "Redis memory size in GB"
  default     = 64
}
# ===== Network Configuration =====
# Parent range from which the per-region subnets are derived.
variable "vpc_cidr" {
  type        = string
  description = "VPC CIDR block"
  default     = "10.0.0.0/16"
}
# ===== Budget Configuration =====
variable "hourly_budget" {
  type        = number
  description = "Hourly budget limit in USD"
  default     = 10000
}

variable "daily_budget" {
  type        = number
  description = "Daily budget limit in USD"
  default     = 200000
}

variable "monthly_budget" {
  type        = number
  description = "Monthly budget limit in USD"
  default     = 5000000
}

variable "budget_warning_threshold" {
  type        = number
  description = "Budget warning threshold (0-1)"
  default     = 0.80
}

variable "hard_budget_limit" {
  type        = bool
  description = "Enforce hard budget limit (stop scaling when reached)"
  default     = false
}
# ===== Alerting Configuration =====
# Required: no default.
variable "alert_email" {
  type        = string
  description = "Email address for alerts"
}

# An empty key (the default) means no PagerDuty channel is created.
variable "pagerduty_integration_key" {
  type        = string
  description = "PagerDuty integration key for critical alerts"
  sensitive   = true
  default     = ""
}
# ===== Burst Event Configuration =====
variable "burst_multiplier_max" {
  type        = number
  description = "Maximum burst multiplier (e.g., 50 for 50x normal load)"
  default     = 50
}

variable "pre_warm_time_seconds" {
  type        = number
  description = "Time in seconds to start pre-warming before predicted burst"
  default     = 900 # 15 minutes
}

variable "scale_out_step" {
  type        = number
  description = "Number of instances to add during scale-out"
  default     = 10
}

variable "scale_in_step" {
  type        = number
  description = "Number of instances to remove during scale-in"
  default     = 2
}

variable "scale_out_cooldown_seconds" {
  type        = number
  description = "Cooldown period after scale-out in seconds"
  default     = 60
}

variable "scale_in_cooldown_seconds" {
  type        = number
  description = "Cooldown period after scale-in in seconds"
  default     = 300
}
# ===== Cost Optimization =====
# Applied to both the primary and replica Cloud SQL instances.
variable "enable_deletion_protection" {
  type        = bool
  description = "Enable deletion protection for databases"
  default     = true
}

variable "enable_preemptible_instances" {
  type        = bool
  description = "Use preemptible instances for non-critical workloads"
  default     = false
}
# ===== Regional Cost Configuration =====
# Per-region lookup tables keyed by GCP region name.
#
# Keys must be exact GCP region identifiers so that lookups by region (for
# example with entries of var.regions) succeed. The South America entry was
# previously spelled "south-america-east1", which is not a GCP region name;
# the region is "southamerica-east1".
variable "region_costs" {
  description = "Hourly cost per instance by region (USD)"
  type        = map(number)
  default = {
    "us-central1"        = 0.50
    "us-east1"           = 0.52
    "us-west1"           = 0.54
    "europe-west1"       = 0.55
    "europe-west4"       = 0.58
    "asia-east1"         = 0.60
    "asia-southeast1"    = 0.62
    "southamerica-east1" = 0.65
  }
}

variable "region_priorities" {
  description = "Priority ranking for regions (1-10, higher = more important)"
  type        = map(number)
  default = {
    "us-central1"        = 10
    "us-east1"           = 9
    "europe-west1"       = 9
    "asia-east1"         = 8
    "us-west1"           = 7
    "asia-southeast1"    = 6
    "europe-west4"       = 6
    "southamerica-east1" = 5
  }
}
# ===== Monitoring Configuration =====
variable "metrics_retention_days" {
  type        = number
  description = "Number of days to retain monitoring metrics"
  default     = 90
}

variable "enable_cloud_trace" {
  type        = bool
  description = "Enable Cloud Trace for distributed tracing"
  default     = true
}

variable "trace_sample_rate" {
  type        = number
  description = "Sampling rate for Cloud Trace (0-1)"
  default     = 0.1
}

variable "enable_cloud_profiler" {
  type        = bool
  description = "Enable Cloud Profiler for performance profiling"
  default     = true
}
# ===== Environment =====
variable "environment" {
  type        = string
  description = "Environment name (dev, staging, prod)"
  default     = "prod"
}

variable "tags" {
  type        = map(string)
  description = "Additional tags for resources"
  default = {
    "component"  = "burst-scaling"
    "project"    = "ruvector"
    "managed-by" = "terraform"
  }
}
# ===== Feature Flags =====
# Boolean feature toggles; all default to enabled.
variable "enable_adaptive_scaling" {
  type        = bool
  description = "Enable adaptive scaling with ML predictions"
  default     = true
}

variable "enable_traffic_shedding" {
  type        = bool
  description = "Enable traffic shedding during extreme load"
  default     = true
}

variable "enable_graceful_degradation" {
  type        = bool
  description = "Enable graceful degradation features"
  default     = true
}
# ===== Example terraform.tfvars =====
# Copy this to terraform.tfvars and customize:
#
# project_id = "ruvector-prod"
# billing_account = "0123AB-CDEF45-67890"
# domain = "api.ruvector.io"
# alert_email = "ops@ruvector.io"
#
# regions = [
# "us-central1",
# "europe-west1",
# "asia-east1"
# ]
#
# # Burst scaling
# min_instances = 10
# max_instances = 1000
# burst_multiplier_max = 50
#
# # Budget
# hourly_budget = 10000
# daily_budget = 200000
# monthly_budget = 5000000
#
# # Thresholds
# cpu_scale_out_threshold = 0.70
# latency_threshold_ms = 50