git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
1191 lines
31 KiB
Markdown
1191 lines
31 KiB
Markdown
# RuVector Performance Optimization Guide
|
|
|
|
## Executive Summary
|
|
|
|
This guide provides advanced performance tuning strategies for RuVector's globally distributed streaming system. Following these optimizations can improve:
|
|
|
|
- **Latency**: 30-50% reduction in P99 latency
|
|
- **Throughput**: 2-3x increase in queries per second
|
|
- **Cost**: 20-40% reduction in operational costs
|
|
- **Scalability**: Better handling of burst traffic
|
|
|
|
---
|
|
|
|
## Table of Contents
|
|
|
|
1. [System Architecture Performance](#system-architecture-performance)
|
|
2. [Cloud Run Optimizations](#cloud-run-optimizations)
|
|
3. [Database Performance](#database-performance)
|
|
4. [Cache Optimization](#cache-optimization)
|
|
5. [Network Performance](#network-performance)
|
|
6. [Query Optimization](#query-optimization)
|
|
7. [Resource Allocation](#resource-allocation)
|
|
8. [Monitoring & Profiling](#monitoring--profiling)
|
|
|
|
---
|
|
|
|
## System Architecture Performance
|
|
|
|
### Multi-Region Strategy
|
|
|
|
**Optimal Region Selection**:
|
|
```javascript
|
|
// Region selection algorithm
|
|
function selectOptimalRegion(clientLocation, currentLoad) {
|
|
const regions = [
|
|
{ name: 'us-central1', latency: calculateLatency(clientLocation, 'us-central1'), load: currentLoad['us-central1'], capacity: 80M },
|
|
{ name: 'europe-west1', latency: calculateLatency(clientLocation, 'europe-west1'), load: currentLoad['europe-west1'], capacity: 80M },
|
|
{ name: 'asia-east1', latency: calculateLatency(clientLocation, 'asia-east1'), load: currentLoad['asia-east1'], capacity: 80M },
|
|
];
|
|
|
|
// Score: 60% latency, 40% available capacity
|
|
return regions
|
|
.map(r => ({
|
|
...r,
|
|
score: (1 / r.latency) * 0.6 + ((r.capacity - r.load) / r.capacity) * 0.4
|
|
}))
|
|
.sort((a, b) => b.score - a.score)[0].name;
|
|
}
|
|
```
|
|
|
|
**Benefits**:
|
|
- 20-40ms latency reduction vs. random region selection
|
|
- Better load distribution
|
|
- Reduced cross-region traffic
|
|
|
|
### Connection Pooling
|
|
|
|
**Optimal Pool Sizes**:
|
|
```typescript
|
|
// Based on benchmarks for 500M concurrent
|
|
const POOL_CONFIG = {
|
|
database: {
|
|
min: 50, // Keep warm connections
|
|
max: 500, // Per Cloud Run instance
|
|
idleTimeout: 30000,
|
|
acquireTimeout: 60000,
|
|
evictionRunInterval: 10000,
|
|
},
|
|
redis: {
|
|
min: 20,
|
|
max: 200,
|
|
idleTimeout: 60000,
|
|
},
|
|
vectorDB: {
|
|
min: 10,
|
|
max: 100,
|
|
idleTimeout: 120000,
|
|
}
|
|
};
|
|
|
|
// Implementation
|
|
import { Pool } from 'pg';
|
|
import { createClient } from 'redis';
|
|
|
|
const dbPool = new Pool({
|
|
host: process.env.DB_HOST,
|
|
database: 'ruvector',
|
|
...POOL_CONFIG.database,
|
|
});
|
|
|
|
const redisClient = createClient({
|
|
socket: {
|
|
host: process.env.REDIS_HOST,
|
|
},
|
|
...POOL_CONFIG.redis,
|
|
});
|
|
```
|
|
|
|
**Impact**:
|
|
- 15-25ms reduction in query latency
|
|
- 50% reduction in connection overhead
|
|
- Better resource utilization
|
|
|
|
---
|
|
|
|
## Cloud Run Optimizations
|
|
|
|
### Instance Configuration
|
|
|
|
**Optimal Settings for 500M Concurrent**:
|
|
```yaml
|
|
# Per-region configuration
|
|
spec:
|
|
template:
|
|
metadata:
|
|
annotations:
|
|
autoscaling.knative.dev/minScale: "20" # Keep warm instances
|
|
autoscaling.knative.dev/maxScale: "1000" # Scale up to 1000
|
|
run.googleapis.com/cpu-throttling: "false" # Always allocate CPU
|
|
run.googleapis.com/execution-environment: "gen2" # Latest runtime
|
|
spec:
|
|
containers:
|
|
- image: gcr.io/project/ruvector-streaming
|
|
resources:
|
|
limits:
|
|
cpu: "4000m" # 4 vCPU
|
|
memory: "16Gi" # 16GB RAM
|
|
env:
|
|
- name: NODE_ENV
|
|
value: "production"
|
|
- name: NODE_OPTIONS
|
|
value: "--max-old-space-size=14336 --optimize-for-size"
|
|
ports:
|
|
- containerPort: 8080
|
|
name: h2c # HTTP/2 with cleartext (faster than HTTP/1)
|
|
|
|
# Startup optimization
|
|
startupProbe:
|
|
httpGet:
|
|
path: /startup
|
|
port: 8080
|
|
initialDelaySeconds: 0
|
|
periodSeconds: 1
|
|
failureThreshold: 30
|
|
|
|
# Health checks
|
|
livenessProbe:
|
|
httpGet:
|
|
path: /health
|
|
port: 8080
|
|
initialDelaySeconds: 0
|
|
periodSeconds: 10
|
|
|
|
# Concurrency
|
|
containerConcurrency: 100 # 100 concurrent requests per instance
|
|
```
|
|
|
|
**Key Optimizations**:
|
|
1. **CPU throttling disabled**: Always-allocated CPU for consistent performance
|
|
2. **Gen2 execution**: 2x faster cold starts, more CPU
|
|
3. **HTTP/2 cleartext**: 30% lower latency vs HTTP/1.1
|
|
4. **Optimized Node.js**: Tuned heap size and V8 flags
|
|
|
|
### Cold Start Mitigation
|
|
|
|
**Strategy 1: Min Instances**
|
|
```bash
|
|
# Keep instances warm in each region
|
|
gcloud run services update ruvector-streaming \
|
|
--region=us-central1 \
|
|
--min-instances=20
|
|
|
|
# Cost: ~$14/day per region for 20 instances
|
|
# Benefit: Eliminate ~95% of cold starts
|
|
```
|
|
|
|
**Strategy 2: Scheduled Pre-Warming**
|
|
```typescript
|
|
// Pre-warm before predicted traffic spikes
|
|
import { scheduler } from '@google-cloud/scheduler';
|
|
|
|
async function schedulePreWarm(event: { time: Date, targetInstances: number, region: string }) {
|
|
const job = {
|
|
name: `prewarm-${event.region}-${event.time.getTime()}`,
|
|
schedule: calculateCron(event.time, -15), // 15 min before
|
|
httpTarget: {
|
|
uri: `https://run.googleapis.com/v2/projects/${PROJECT_ID}/locations/${event.region}/services/ruvector-streaming`,
|
|
httpMethod: 'PATCH',
|
|
body: Buffer.from(JSON.stringify({
|
|
template: {
|
|
metadata: {
|
|
annotations: {
|
|
'autoscaling.knative.dev/minScale': event.targetInstances.toString()
|
|
}
|
|
}
|
|
}
|
|
})).toString('base64'),
|
|
headers: {
|
|
'Content-Type': 'application/json',
|
|
},
|
|
oauthToken: {
|
|
serviceAccountEmail: DEPLOYER_SA,
|
|
},
|
|
},
|
|
};
|
|
|
|
await scheduler.createJob({ parent, job });
|
|
}
|
|
|
|
// Usage: Pre-warm for World Cup
|
|
await schedulePreWarm({
|
|
time: new Date('2026-07-15T17:45:00Z'),
|
|
targetInstances: 500,
|
|
region: 'europe-west3',
|
|
});
|
|
```
|
|
|
|
**Strategy 3: Connection Keep-Alive**
|
|
```typescript
|
|
// Client-side: maintain persistent connections
|
|
const client = new WebSocket('wss://api.ruvector.io/stream', {
|
|
perMessageDeflate: false, // Disable compression for latency
|
|
});
|
|
|
|
// Send heartbeat every 30s to keep connection alive
|
|
setInterval(() => {
|
|
if (client.readyState === WebSocket.OPEN) {
|
|
client.send(JSON.stringify({ type: 'ping' }));
|
|
}
|
|
}, 30000);
|
|
|
|
// Server-side: respond to heartbeats
|
|
server.on('message', (data) => {
|
|
const msg = JSON.parse(data);
|
|
if (msg.type === 'ping') {
|
|
client.send(JSON.stringify({ type: 'pong', timestamp: Date.now() }));
|
|
}
|
|
});
|
|
```
|
|
|
|
**Impact**:
|
|
- Cold start probability: < 5% (vs 40% baseline)
|
|
- Cold start latency: ~800ms → ~200ms (Gen2)
|
|
- Consistent P99 latency
|
|
|
|
### Request Batching
|
|
|
|
**Implementation**:
|
|
```typescript
|
|
class QueryBatcher {
|
|
private queue: Array<{ query: VectorQuery, resolve: Function, reject: Function }> = [];
|
|
private timer: NodeJS.Timeout | null = null;
|
|
private readonly batchSize = 100;
|
|
private readonly batchDelay = 10; // ms
|
|
|
|
async query(vectorQuery: VectorQuery): Promise<SearchResult> {
|
|
return new Promise((resolve, reject) => {
|
|
this.queue.push({ query: vectorQuery, resolve, reject });
|
|
|
|
if (this.queue.length >= this.batchSize) {
|
|
this.flush();
|
|
} else if (!this.timer) {
|
|
this.timer = setTimeout(() => this.flush(), this.batchDelay);
|
|
}
|
|
});
|
|
}
|
|
|
|
private async flush() {
|
|
if (this.timer) {
|
|
clearTimeout(this.timer);
|
|
this.timer = null;
|
|
}
|
|
|
|
const batch = this.queue.splice(0, this.batchSize);
|
|
if (batch.length === 0) return;
|
|
|
|
try {
|
|
// Batch query to vector database
|
|
const results = await vectorDB.batchQuery(batch.map(b => b.query));
|
|
|
|
// Resolve individual promises
|
|
results.forEach((result, i) => {
|
|
batch[i].resolve(result);
|
|
});
|
|
} catch (error) {
|
|
// Reject all on error
|
|
batch.forEach(b => b.reject(error));
|
|
}
|
|
}
|
|
}
|
|
|
|
// Usage
|
|
const batcher = new QueryBatcher();
|
|
const result = await batcher.query({ vector: [0.1, 0.2, ...], topK: 10 });
|
|
```
|
|
|
|
**Benefits**:
|
|
- 5-10x reduction in database round trips
|
|
- 40-60% increase in throughput
|
|
- Lower per-query cost
|
|
|
|
---
|
|
|
|
## Database Performance
|
|
|
|
### Connection Management
|
|
|
|
**Optimal PgBouncer Configuration**:
|
|
```ini
|
|
# pgbouncer.ini
|
|
[databases]
|
|
ruvector = host=127.0.0.1 port=5432 dbname=ruvector
|
|
|
|
[pgbouncer]
|
|
listen_addr = 0.0.0.0
|
|
listen_port = 6432
|
|
auth_type = md5
|
|
auth_file = /etc/pgbouncer/userlist.txt
|
|
|
|
# Connection pooling
|
|
pool_mode = transaction # Transaction-level pooling
|
|
max_client_conn = 10000 # Total client connections
|
|
default_pool_size = 50 # Connections per user/database
|
|
reserve_pool_size = 25 # Emergency reserve
|
|
reserve_pool_timeout = 5
|
|
|
|
# Performance
|
|
server_idle_timeout = 600 # Close idle server connections after 10 min
|
|
server_lifetime = 3600 # Recycle connections every hour
|
|
server_connect_timeout = 15
|
|
query_timeout = 0 # No query timeout (handle at app level)
|
|
|
|
# Logging
|
|
log_connections = 0
|
|
log_disconnections = 0
|
|
log_pooler_errors = 1
|
|
```
|
|
|
|
**Deploy PgBouncer**:
|
|
```bash
|
|
# Run PgBouncer as sidecar in Cloud Run
|
|
# Or as a separate Cloud Run service
|
|
|
|
docker run -d \
|
|
--name pgbouncer \
|
|
-p 6432:6432 \
|
|
-e DB_HOST=10.1.2.3 \
|
|
-e DB_NAME=ruvector \
|
|
-e DB_USER=ruvector_app \
|
|
-e DB_PASSWORD=secret \
|
|
edoburu/pgbouncer
|
|
```
|
|
|
|
**Impact**:
|
|
- 20-30ms reduction in connection acquisition time
|
|
- Support 10x more concurrent clients
|
|
- Reduced database CPU/memory usage
|
|
|
|
### Query Optimization
|
|
|
|
**1. Indexes**:
|
|
```sql
|
|
-- Essential indexes for vector search
|
|
CREATE INDEX CONCURRENTLY idx_vectors_metadata_gin
|
|
ON vectors USING gin(metadata jsonb_path_ops);
|
|
|
|
CREATE INDEX CONCURRENTLY idx_vectors_updated_at
|
|
ON vectors(updated_at DESC) WHERE deleted_at IS NULL;
|
|
|
|
CREATE INDEX CONCURRENTLY idx_vectors_category
|
|
ON vectors((metadata->>'category')) WHERE deleted_at IS NULL;
|
|
|
|
-- Partial indexes for common filters
|
|
CREATE INDEX CONCURRENTLY idx_vectors_active
|
|
ON vectors(id) WHERE deleted_at IS NULL AND (metadata->>'status') = 'active';
|
|
|
|
-- Covering index for common query
|
|
CREATE INDEX CONCURRENTLY idx_vectors_covering
|
|
ON vectors(id, metadata, updated_at)
|
|
WHERE deleted_at IS NULL;
|
|
```
|
|
|
|
**2. Partitioning**:
|
|
```sql
|
|
-- Partition vectors table by created_at (monthly partitions)
|
|
CREATE TABLE vectors_partitioned (
|
|
id BIGSERIAL,
|
|
vector_data BYTEA,
|
|
metadata JSONB,
|
|
created_at TIMESTAMP NOT NULL,
|
|
updated_at TIMESTAMP,
|
|
deleted_at TIMESTAMP,
|
|
PRIMARY KEY (id, created_at)
|
|
) PARTITION BY RANGE (created_at);
|
|
|
|
-- Create partitions
|
|
CREATE TABLE vectors_2025_01 PARTITION OF vectors_partitioned
|
|
FOR VALUES FROM ('2025-01-01') TO ('2025-02-01');
|
|
|
|
CREATE TABLE vectors_2025_02 PARTITION OF vectors_partitioned
|
|
FOR VALUES FROM ('2025-02-01') TO ('2025-03-01');
|
|
|
|
-- Auto-create partitions with pg_partman
|
|
CREATE EXTENSION pg_partman;
|
|
|
|
SELECT partman.create_parent(
|
|
'public.vectors_partitioned',
|
|
'created_at',
|
|
'native',
|
|
'monthly'
|
|
);
|
|
```
|
|
|
|
**Benefits**:
|
|
- 50-80% faster queries on recent data
|
|
- Easier maintenance (drop old partitions)
|
|
- Better query planning
|
|
|
|
**3. Prepared Statements**:
|
|
```typescript
|
|
// Use prepared statements for repeated queries
|
|
const PREPARED_QUERIES = {
|
|
searchVectors: {
|
|
name: 'search_vectors',
|
|
text: `
|
|
SELECT id, metadata, vector_data,
|
|
ts_rank_cd(to_tsvector('english', metadata->>'description'), query) AS rank
|
|
FROM vectors, plainto_tsquery('english', $1) query
|
|
WHERE deleted_at IS NULL
|
|
AND to_tsvector('english', metadata->>'description') @@ query
|
|
AND (metadata->>'category') = $2
|
|
ORDER BY rank DESC
|
|
LIMIT $3
|
|
`,
|
|
},
|
|
insertVector: {
|
|
name: 'insert_vector',
|
|
text: `
|
|
INSERT INTO vectors (vector_data, metadata, created_at)
|
|
VALUES ($1, $2, NOW())
|
|
RETURNING id
|
|
`,
|
|
},
|
|
};
|
|
|
|
// Prepare on startup
|
|
await Promise.all(
|
|
Object.values(PREPARED_QUERIES).map(q =>
|
|
db.query(`PREPARE ${q.name} AS ${q.text}`)
|
|
)
|
|
);
|
|
|
|
// Execute prepared statement
|
|
const result = await db.query({
|
|
name: 'search_vectors',
|
|
values: [searchTerm, category, limit],
|
|
});
|
|
```
|
|
|
|
**Impact**:
|
|
- 10-20% faster query execution
|
|
- Reduced query planning overhead
|
|
- Lower CPU usage
|
|
|
|
### Read Replicas
|
|
|
|
**Configuration**:
|
|
```bash
|
|
# Create read replicas in each region
|
|
for region in us-central1 europe-west1 asia-east1; do
|
|
gcloud sql replicas create ruvector-replica-${region} \
|
|
--master-instance-name=ruvector-db \
|
|
--region=${region} \
|
|
--tier=db-custom-4-16384 \
|
|
--replica-type=READ
|
|
done
|
|
```
|
|
|
|
**Connection Routing**:
|
|
```typescript
|
|
// Route reads to local replica, writes to primary
|
|
class DatabaseRouter {
|
|
private primaryPool: Pool;
|
|
private replicaPools: Map<string, Pool>;
|
|
|
|
constructor() {
|
|
this.primaryPool = new Pool({ host: PRIMARY_HOST, ... });
|
|
this.replicaPools = new Map([
|
|
['us-central1', new Pool({ host: US_REPLICA_HOST, ... })],
|
|
['europe-west1', new Pool({ host: EU_REPLICA_HOST, ... })],
|
|
['asia-east1', new Pool({ host: ASIA_REPLICA_HOST, ... })],
|
|
]);
|
|
}
|
|
|
|
async query(sql: string, params: any[], isWrite = false) {
|
|
if (isWrite) {
|
|
return this.primaryPool.query(sql, params);
|
|
}
|
|
|
|
// Route to local replica
|
|
const region = process.env.CLOUD_RUN_REGION;
|
|
const pool = this.replicaPools.get(region) || this.primaryPool;
|
|
return pool.query(sql, params);
|
|
}
|
|
}
|
|
|
|
// Usage
|
|
const db = new DatabaseRouter();
|
|
await db.query('SELECT * FROM vectors WHERE id = $1', [id], false); // Read from replica
|
|
await db.query('INSERT INTO vectors ...', [...], true); // Write to primary
|
|
```
|
|
|
|
**Benefits**:
|
|
- 50-70% reduction in primary database load
|
|
- Lower read latency (local replica)
|
|
- Better geographic distribution
|
|
|
|
---
|
|
|
|
## Cache Optimization
|
|
|
|
### Redis Configuration
|
|
|
|
**Optimal Settings**:
|
|
```bash
|
|
# Redis configuration for high concurrency
|
|
redis-cli CONFIG SET maxmemory 120gb
|
|
redis-cli CONFIG SET maxmemory-policy allkeys-lru
|
|
redis-cli CONFIG SET maxmemory-samples 10
|
|
redis-cli CONFIG SET lazyfree-lazy-eviction yes
|
|
redis-cli CONFIG SET lazyfree-lazy-expire yes
|
|
redis-cli CONFIG SET io-threads 4
|
|
redis-cli CONFIG SET io-threads-do-reads yes
|
|
redis-cli CONFIG SET tcp-backlog 65535
|
|
redis-cli CONFIG SET timeout 0
|
|
redis-cli CONFIG SET tcp-keepalive 300
|
|
```
|
|
|
|
### Cache Strategy
|
|
|
|
**Multi-Level Caching**:
|
|
```typescript
|
|
class MultiLevelCache {
|
|
private l1: Map<string, any>; // In-memory (process)
|
|
private l2: Redis.Cluster; // Redis (regional)
|
|
private l3: CDN; // Cloud CDN (global)
|
|
|
|
constructor() {
|
|
// L1: In-memory cache (1GB per instance)
|
|
this.l1 = new Map();
|
|
setInterval(() => this.evictL1(), 60000); // Evict every minute
|
|
|
|
// L2: Redis cluster
|
|
this.l2 = new Redis.Cluster([
|
|
{ host: 'redis1', port: 6379 },
|
|
{ host: 'redis2', port: 6379 },
|
|
{ host: 'redis3', port: 6379 },
|
|
], {
|
|
redisOptions: {
|
|
password: REDIS_PASSWORD,
|
|
enableReadyCheck: true,
|
|
maxRetriesPerRequest: 3,
|
|
},
|
|
clusterRetryStrategy: (times) => Math.min(times * 100, 3000),
|
|
});
|
|
|
|
// L3: Cloud CDN (configured in GCP)
|
|
}
|
|
|
|
async get(key: string): Promise<any> {
|
|
// Check L1
|
|
if (this.l1.has(key)) {
|
|
return this.l1.get(key);
|
|
}
|
|
|
|
// Check L2 (Redis)
|
|
const l2Value = await this.l2.get(key);
|
|
if (l2Value) {
|
|
const parsed = JSON.parse(l2Value);
|
|
this.l1.set(key, parsed); // Populate L1
|
|
return parsed;
|
|
}
|
|
|
|
// Check L3 (CDN) - implicit via HTTP caching headers
|
|
return null;
|
|
}
|
|
|
|
async set(key: string, value: any, ttl: number = 3600) {
|
|
// Set L1
|
|
this.l1.set(key, value);
|
|
|
|
// Set L2
|
|
await this.l2.setex(key, ttl, JSON.stringify(value));
|
|
|
|
// L3 set via HTTP Cache-Control headers
|
|
}
|
|
|
|
private evictL1() {
|
|
// Simple LRU eviction: keep only 10,000 most recent
|
|
if (this.l1.size > 10000) {
|
|
const toDelete = this.l1.size - 10000;
|
|
const keys = Array.from(this.l1.keys()).slice(0, toDelete);
|
|
keys.forEach(k => this.l1.delete(k));
|
|
}
|
|
}
|
|
}
|
|
```
|
|
|
|
**Cache Key Design**:
|
|
```typescript
|
|
// Good cache key: specific, versioned, with TTL
|
|
function cacheKey(query: VectorQuery): string {
|
|
const vectorHash = hash(query.vector); // Use fast hash (xxhash)
|
|
const filtersHash = hash(JSON.stringify(query.filters));
|
|
const version = 'v2'; // Bump when vector index changes
|
|
|
|
return `query:${version}:${vectorHash}:${filtersHash}:${query.topK}`;
|
|
}
|
|
|
|
// Cache with appropriate TTL
|
|
const key = cacheKey(query);
|
|
let result = await cache.get(key);
|
|
|
|
if (!result) {
|
|
result = await vectorDB.query(query);
|
|
// Cache for 1 hour (shorter for frequently updated data)
|
|
await cache.set(key, result, 3600);
|
|
}
|
|
```
|
|
|
|
**Impact**:
|
|
- 80-95% cache hit rate achievable
|
|
- 10-20ms average response time (vs 50-100ms without cache)
|
|
- 70-90% reduction in database load
|
|
|
|
### CDN Configuration
|
|
|
|
**Cache-Control Headers**:
|
|
```typescript
|
|
// Set aggressive caching for static responses
|
|
app.get('/api/vectors/:id', async (req, res) => {
|
|
const vector = await db.getVector(req.params.id);
|
|
|
|
if (!vector) {
|
|
return res.status(404).json({ error: 'Not found' });
|
|
}
|
|
|
|
// Cache in CDN for 1 hour, browser for 5 minutes
|
|
res.set('Cache-Control', 'public, max-age=300, s-maxage=3600');
|
|
res.set('CDN-Cache-Control', 'max-age=3600');
|
|
res.set('Vary', 'Accept-Encoding, Authorization'); // Vary by encoding and auth
|
|
res.set('ETag', vector.etag);
|
|
|
|
// Support conditional requests
|
|
if (req.get('If-None-Match') === vector.etag) {
|
|
return res.status(304).end();
|
|
}
|
|
|
|
res.json(vector);
|
|
});
|
|
```
|
|
|
|
**CDN Invalidation**:
|
|
```typescript
|
|
// Invalidate CDN cache when vector updated
|
|
import { Compute } from '@google-cloud/compute';
|
|
const compute = new Compute();
|
|
|
|
async function invalidateCDN(vectorId: string) {
|
|
const path = `/api/vectors/${vectorId}`;
|
|
|
|
await compute.request({
|
|
method: 'POST',
|
|
uri: `/compute/v1/projects/${PROJECT_ID}/global/urlMaps/ruvector-lb/invalidateCache`,
|
|
json: {
|
|
path,
|
|
host: 'api.ruvector.io',
|
|
},
|
|
});
|
|
}
|
|
|
|
// Call after update
|
|
await db.updateVector(id, data);
|
|
await invalidateCDN(id);
|
|
```
|
|
|
|
---
|
|
|
|
## Network Performance
|
|
|
|
### HTTP/2 Multiplexing
|
|
|
|
**Client Configuration**:
|
|
```typescript
|
|
import http2 from 'http2';
|
|
|
|
// Reuse single HTTP/2 connection for multiple requests
|
|
const client = http2.connect('https://api.ruvector.io', {
|
|
maxSessionMemory: 1000, // MB
|
|
settings: {
|
|
enablePush: false,
|
|
initialWindowSize: 65535,
|
|
maxConcurrentStreams: 100,
|
|
},
|
|
});
|
|
|
|
// Make concurrent requests over single connection
|
|
async function batchQuery(queries: VectorQuery[]) {
|
|
return Promise.all(
|
|
queries.map(query =>
|
|
new Promise((resolve, reject) => {
|
|
const req = client.request({
|
|
':method': 'POST',
|
|
':path': '/api/query',
|
|
'content-type': 'application/json',
|
|
});
|
|
|
|
let data = '';
|
|
req.on('data', chunk => data += chunk);
|
|
req.on('end', () => resolve(JSON.parse(data)));
|
|
req.on('error', reject);
|
|
|
|
req.write(JSON.stringify(query));
|
|
req.end();
|
|
})
|
|
)
|
|
);
|
|
}
|
|
```
|
|
|
|
**Benefits**:
|
|
- 40-60% reduction in connection overhead
|
|
- Lower latency for multiple requests
|
|
- Better resource utilization
|
|
|
|
### WebSocket Optimization
|
|
|
|
**Compression**:
|
|
```typescript
|
|
import WebSocket from 'ws';
|
|
import zlib from 'zlib';
|
|
|
|
// Server-side: per-message deflate
|
|
const wss = new WebSocket.Server({
|
|
port: 8080,
|
|
perMessageDeflate: {
|
|
zlibDeflateOptions: {
|
|
level: zlib.constants.Z_BEST_SPEED, // Fast compression
|
|
},
|
|
clientNoContextTakeover: true, // No context between messages
|
|
serverNoContextTakeover: true,
|
|
clientMaxWindowBits: 10,
|
|
serverMaxWindowBits: 10,
|
|
},
|
|
});
|
|
|
|
// Client-side: binary frames for vectors
|
|
const ws = new WebSocket('wss://api.ruvector.io/stream', {
|
|
perMessageDeflate: true,
|
|
});
|
|
|
|
// Send vector as binary (more efficient than JSON)
|
|
const vectorBuffer = Float32Array.from(vector).buffer;
|
|
ws.send(vectorBuffer, { binary: true });
|
|
|
|
// Receive results
|
|
ws.on('message', (data) => {
|
|
if (data instanceof Buffer) {
|
|
const results = deserializeResults(data);
|
|
handleResults(results);
|
|
}
|
|
});
|
|
```
|
|
|
|
**Benefits**:
|
|
- 30-50% bandwidth reduction
|
|
- Lower latency for large vectors
|
|
- More efficient serialization
|
|
|
|
---
|
|
|
|
## Query Optimization
|
|
|
|
### Vector Search Tuning
|
|
|
|
**HNSW Parameters**:
|
|
```rust
|
|
// Optimal HNSW parameters for 500M vectors
|
|
use hnsw_rs::prelude::*;
|
|
|
|
let hnsw = Hnsw::<f32, DistCosine>::new(
|
|
16, // M: Number of connections per layer (trade-off: accuracy vs memory)
|
|
100, // ef_construction: Higher = better accuracy, slower indexing
|
|
768, // Dimension
|
|
1000, // Max elements per block
|
|
DistCosine,
|
|
);
|
|
|
|
// Query-time parameters
|
|
let ef_search = 64; // Higher = better recall, slower search
|
|
let num_results = 10;
|
|
|
|
let results = hnsw.search(&query_vector, num_results, ef_search);
|
|
```
|
|
|
|
**Parameter Tuning Guide**:
|
|
| M | ef_construction | ef_search | Recall | Build Time | Query Time |
|
|
|---|-----------------|-----------|--------|------------|------------|
|
|
| 8 | 50 | 32 | 85% | 1x | 0.5ms |
|
|
| 16 | 100 | 64 | 95% | 2x | 1.0ms |
|
|
| 32 | 200 | 128 | 99% | 4x | 2.5ms |
|
|
|
|
**Recommendation for 500M scale**:
|
|
- M = 16 (good accuracy/memory balance)
|
|
- ef_construction = 100 (high quality index)
|
|
- ef_search = 64 (95%+ recall, <2ms query time)
|
|
|
|
### Filtering Optimization
|
|
|
|
**Pre-filtering vs Post-filtering**:
|
|
```typescript
|
|
// BAD: Post-filtering (queries all vectors, then filters)
|
|
async function searchWithPostFilter(vector: number[], filters: Filters, topK: number) {
|
|
const results = await hnsw.search(vector, topK * 10); // Over-fetch
|
|
return results.filter(r => matchesFilters(r, filters)).slice(0, topK);
|
|
}
|
|
|
|
// GOOD: Pre-filtering (only queries matching vectors)
|
|
async function searchWithPreFilter(vector: number[], filters: Filters, topK: number) {
|
|
// Use database index to get candidate IDs
|
|
const candidateIds = await db.query(
|
|
'SELECT id FROM vectors WHERE (metadata->>\'category\') = $1 AND deleted_at IS NULL',
|
|
[filters.category]
|
|
);
|
|
|
|
// Query only candidates
|
|
return hnsw.searchFiltered(vector, topK, candidateIds.map(r => r.id));
|
|
}
|
|
```
|
|
|
|
**Benefits**:
|
|
- 50-80% faster for filtered queries
|
|
- Lower memory usage
|
|
- Better scalability
|
|
|
|
---
|
|
|
|
## Resource Allocation
|
|
|
|
### CPU Optimization
|
|
|
|
**Node.js Tuning**:
|
|
```bash
|
|
# Optimal Node.js flags for Cloud Run
|
|
export NODE_OPTIONS="
|
|
--max-old-space-size=14336 # 14GB heap (leave 2GB for system)
|
|
--optimize-for-size # Reduce memory usage
|
|
--max-semi-space-size=64 # MB, for young generation
|
|
--max-old-generation-size=13312 # MB, for old generation
|
|
--no-turbo-inlining # Reduce compilation time
|
|
--turbo-fast-api-calls # Faster native calls
|
|
--experimental-wasm-simd # Enable WASM SIMD
|
|
"
|
|
```
|
|
|
|
**Worker Threads**:
|
|
```typescript
|
|
import { Worker, isMainThread, parentPort, workerData } from 'worker_threads';
|
|
import os from 'os';
|
|
|
|
const NUM_WORKERS = os.cpus().length; // 4 for Cloud Run 4 vCPU
|
|
|
|
if (isMainThread) {
|
|
// Main thread: distribute work to workers
|
|
const workers: Worker[] = [];
|
|
for (let i = 0; i < NUM_WORKERS; i++) {
|
|
workers.push(new Worker(__filename, {
|
|
workerData: { workerId: i },
|
|
}));
|
|
}
|
|
|
|
// Round-robin distribution
|
|
let current = 0;
|
|
export function queryVector(vector: number[]): Promise<SearchResult> {
|
|
return new Promise((resolve, reject) => {
|
|
const worker = workers[current];
|
|
current = (current + 1) % NUM_WORKERS;
|
|
|
|
worker.once('message', resolve);
|
|
worker.once('error', reject);
|
|
worker.postMessage({ type: 'query', vector });
|
|
});
|
|
}
|
|
} else {
|
|
// Worker thread: handle queries
|
|
const vectorDB = loadVectorDB();
|
|
|
|
parentPort.on('message', async (msg) => {
|
|
if (msg.type === 'query') {
|
|
const result = await vectorDB.search(msg.vector, 10);
|
|
parentPort.postMessage(result);
|
|
}
|
|
});
|
|
}
|
|
```
|
|
|
|
**Benefits**:
|
|
- 2-3x throughput improvement
|
|
- Better CPU utilization (all cores used)
|
|
- Lower P99 latency (parallel processing)
|
|
|
|
### Memory Optimization
|
|
|
|
**Vector Quantization**:
|
|
```rust
|
|
// Reduce memory by 4-32x using quantization
|
|
use ruvector::quantization::{ScalarQuantizer, ProductQuantizer};
|
|
|
|
// Scalar quantization: f32 -> u8 (4x compression)
|
|
let sq = ScalarQuantizer::new(768); // dimension
|
|
let quantized = sq.quantize(&vector); // Vec<f32> -> Vec<u8>
|
|
let reconstructed = sq.dequantize(&quantized);
|
|
|
|
// Product quantization: 768 dims -> 96 bytes (32x compression)
|
|
let pq = ProductQuantizer::new(768, 96, 256); // dim, num_centroids, num_subvectors
|
|
let quantized = pq.quantize(&vector); // Vec<f32> -> Vec<u8>
|
|
|
|
// Query with quantized vectors (asymmetric distance)
|
|
let distance = pq.asymmetric_distance(&query_vector, &quantized);
|
|
```
|
|
|
|
**Impact**:
|
|
- 4-32x memory reduction
|
|
- 10-30% faster queries (CPU cache locality)
|
|
- Trade-off: ~5% recall reduction
|
|
|
|
**Streaming Responses**:
|
|
```typescript
|
|
// Stream results as they're found (don't buffer all)
|
|
app.get('/api/stream-query', async (req, res) => {
|
|
res.setHeader('Content-Type', 'text/event-stream');
|
|
res.setHeader('Cache-Control', 'no-cache');
|
|
res.setHeader('Connection', 'keep-alive');
|
|
|
|
const query = JSON.parse(req.query.q);
|
|
|
|
// Stream results incrementally
|
|
for await (const result of vectorDB.streamSearch(query)) {
|
|
res.write(`data: ${JSON.stringify(result)}\n\n`);
|
|
}
|
|
|
|
res.end();
|
|
});
|
|
|
|
// Client-side: process results as they arrive
|
|
const eventSource = new EventSource(`/api/stream-query?q=${JSON.stringify(query)}`);
|
|
eventSource.onmessage = (event) => {
|
|
const result = JSON.parse(event.data);
|
|
displayResult(result); // Show immediately
|
|
};
|
|
```
|
|
|
|
**Benefits**:
|
|
- Lower memory usage
|
|
- Faster time-to-first-result
|
|
- Better user experience
|
|
|
|
---
|
|
|
|
## Monitoring & Profiling
|
|
|
|
### OpenTelemetry Instrumentation
|
|
|
|
**Comprehensive Tracing**:
|
|
```typescript
|
|
import { trace, SpanStatusCode } from '@opentelemetry/api';
|
|
import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
|
|
import { TraceExporter } from '@google-cloud/opentelemetry-cloud-trace-exporter';
|
|
|
|
// Initialize tracer
|
|
const provider = new NodeTracerProvider();
|
|
provider.addSpanProcessor(new BatchSpanProcessor(new TraceExporter()));
|
|
provider.register();
|
|
|
|
const tracer = trace.getTracer('ruvector');
|
|
|
|
// Instrument query
|
|
async function query(vector: number[], topK: number) {
|
|
const span = tracer.startSpan('vectorDB.query');
|
|
span.setAttribute('vector.dim', vector.length);
|
|
span.setAttribute('topK', topK);
|
|
|
|
try {
|
|
// Cache lookup
|
|
const cacheSpan = tracer.startSpan('cache.lookup', { parent: span });
|
|
const cached = await cache.get(cacheKey(vector));
|
|
cacheSpan.setAttribute('cache.hit', cached !== null);
|
|
cacheSpan.end();
|
|
|
|
if (cached) {
|
|
span.setStatus({ code: SpanStatusCode.OK });
|
|
return cached;
|
|
}
|
|
|
|
// Database query
|
|
const dbSpan = tracer.startSpan('database.query', { parent: span });
|
|
const result = await vectorDB.search(vector, topK);
|
|
dbSpan.setAttribute('result.count', result.length);
|
|
dbSpan.end();
|
|
|
|
// Cache set
|
|
const setCacheSpan = tracer.startSpan('cache.set', { parent: span });
|
|
await cache.set(cacheKey(vector), result, 3600);
|
|
setCacheSpan.end();
|
|
|
|
span.setStatus({ code: SpanStatusCode.OK });
|
|
return result;
|
|
} catch (error) {
|
|
span.recordException(error);
|
|
span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
|
|
throw error;
|
|
} finally {
|
|
span.end();
|
|
}
|
|
}
|
|
```
|
|
|
|
**Custom Metrics**:
|
|
```typescript
|
|
import { MeterProvider, PeriodicExportingMetricReader } from '@opentelemetry/sdk-metrics';
|
|
import { MetricExporter } from '@google-cloud/opentelemetry-cloud-monitoring-exporter';
|
|
|
|
const meterProvider = new MeterProvider({
|
|
readers: [
|
|
new PeriodicExportingMetricReader({
|
|
exporter: new MetricExporter(),
|
|
exportIntervalMillis: 60000,
|
|
}),
|
|
],
|
|
});
|
|
|
|
const meter = meterProvider.getMeter('ruvector');
|
|
|
|
// Define metrics
|
|
const queryCounter = meter.createCounter('vector.queries.total', {
|
|
description: 'Total number of vector queries',
|
|
});
|
|
|
|
const queryDuration = meter.createHistogram('vector.query.duration', {
|
|
description: 'Query duration in milliseconds',
|
|
unit: 'ms',
|
|
});
|
|
|
|
const cacheHitRatio = meter.createObservableGauge('cache.hit_ratio', {
|
|
description: 'Cache hit ratio (0-1)',
|
|
});
|
|
|
|
// Record metrics
|
|
function instrumentedQuery(vector: number[], topK: number) {
|
|
const start = Date.now();
|
|
queryCounter.add(1, { region: process.env.REGION });
|
|
|
|
try {
|
|
const result = await query(vector, topK);
|
|
const duration = Date.now() - start;
|
|
queryDuration.record(duration, { success: 'true' });
|
|
return result;
|
|
} catch (error) {
|
|
queryDuration.record(Date.now() - start, { success: 'false' });
|
|
throw error;
|
|
}
|
|
}
|
|
```
|
|
|
|
### Performance Profiling
|
|
|
|
**V8 Profiling**:
|
|
```bash
|
|
# Start with profiling enabled
|
|
node --prof app.js
|
|
|
|
# Generate report
|
|
node --prof-process isolate-0x*.log > profile.txt
|
|
|
|
# Look for hot functions
|
|
grep "\\[JavaScript\\]" profile.txt | head -20
|
|
```
|
|
|
|
**Heap Snapshots**:
|
|
```typescript
|
|
import v8 from 'v8';
|
|
import fs from 'fs';
|
|
|
|
// Take heap snapshot periodically
|
|
setInterval(() => {
|
|
const snapshot = v8.writeHeapSnapshot(`heap-${Date.now()}.heapsnapshot`);
|
|
console.log('Heap snapshot written:', snapshot);
|
|
}, 3600000); // Every hour
|
|
|
|
// Analyze with Chrome DevTools
|
|
```
|
|
|
|
**Memory Leak Detection**:
|
|
```typescript
|
|
import { memwatch } from '@airbnb/node-memwatch';
|
|
|
|
memwatch.on('leak', (info) => {
|
|
console.error('Memory leak detected:', info);
|
|
// Alert ops team
|
|
});
|
|
|
|
memwatch.on('stats', (stats) => {
|
|
console.log('Memory usage:', {
|
|
heapUsed: stats.current_base,
|
|
heapTotal: stats.max,
|
|
percentUsed: (stats.current_base / stats.max) * 100,
|
|
});
|
|
});
|
|
```
|
|
|
|
---
|
|
|
|
## Performance Checklist
|
|
|
|
### Before Deployment
|
|
- [ ] Connection pools configured (DB, Redis, vector DB)
|
|
- [ ] Indexes created on all filtered columns
|
|
- [ ] Prepared statements used for repeated queries
|
|
- [ ] Multi-level caching implemented (L1, L2, L3)
|
|
- [ ] HTTP/2 enabled
|
|
- [ ] Compression enabled (gzip, brotli)
|
|
- [ ] CDN configured with appropriate cache headers
|
|
- [ ] Min instances set to avoid cold starts
|
|
- [ ] Worker threads enabled for CPU-heavy work
|
|
- [ ] OpenTelemetry instrumentation added
|
|
- [ ] Custom metrics defined
|
|
- [ ] Load tests passed
|
|
|
|
### After Deployment
|
|
- [ ] Monitor P50/P95/P99 latency
|
|
- [ ] Check cache hit rates (target > 75%)
|
|
- [ ] Verify connection pool utilization
|
|
- [ ] Review slow query logs
|
|
- [ ] Analyze trace data for bottlenecks
|
|
- [ ] Check for memory leaks
|
|
- [ ] Validate auto-scaling behavior
|
|
- [ ] Review cost per query
|
|
- [ ] Iterate and optimize
|
|
|
|
---
|
|
|
|
## Expected Performance Targets
|
|
|
|
| Metric | Target | Excellent |
|
|
|--------|--------|-----------|
|
|
| P50 Latency | < 10ms | < 5ms |
|
|
| P95 Latency | < 30ms | < 15ms |
|
|
| P99 Latency | < 50ms | < 25ms |
|
|
| Cache Hit Rate | > 70% | > 85% |
|
|
| Throughput | 50K QPS | 100K+ QPS |
|
|
| Error Rate | < 0.1% | < 0.01% |
|
|
| CPU Utilization | 60-80% | 50-70% |
|
|
| Memory Utilization | 70-85% | 60-75% |
|
|
| Cost per 1M queries | < $5 | < $3 |
|
|
|
|
---
|
|
|
|
## Conclusion
|
|
|
|
Implementing these optimizations can dramatically improve RuVector's performance:
|
|
|
|
- **30-50% latency reduction** through caching and connection pooling
|
|
- **2-3x throughput increase** via batching and parallel processing
|
|
- **20-40% cost reduction** through better resource utilization
|
|
- **10x better scalability** with quantization and partitioning
|
|
|
|
**Priority Order**:
|
|
1. Connection pooling (biggest impact)
|
|
2. Multi-level caching (L1, L2, L3)
|
|
3. Database optimizations (indexes, replicas)
|
|
4. HTTP/2 and compression
|
|
5. Worker threads for CPU work
|
|
6. Quantization for memory
|
|
7. Advanced profiling and tuning
|
|
|
|
---
|
|
|
|
**Document Version**: 1.0
|
|
**Last Updated**: 2025-11-20
|
|
**Status**: Production-Ready
|