Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,666 @@
# Optimization Strategy
## Overview
Comprehensive optimization strategies for ruvector-postgres covering SIMD acceleration, memory management, query optimization, and PostgreSQL-specific tuning.
## SIMD Optimization
### Architecture Detection & Dispatch
```rust
// src/simd/dispatch.rs
#[derive(Debug, Clone, Copy)]
pub enum SimdCapability {
AVX512,
AVX2,
NEON,
Scalar,
}
lazy_static! {
static ref SIMD_CAPABILITY: SimdCapability = detect_simd();
}
fn detect_simd() -> SimdCapability {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
return SimdCapability::AVX512;
}
if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
return SimdCapability::AVX2;
}
}
#[cfg(target_arch = "aarch64")]
{
return SimdCapability::NEON;
}
SimdCapability::Scalar
}
/// Dispatch to optimal implementation
#[inline]
pub fn distance_dispatch(a: &[f32], b: &[f32], metric: DistanceMetric) -> f32 {
match *SIMD_CAPABILITY {
SimdCapability::AVX512 => distance_avx512(a, b, metric),
SimdCapability::AVX2 => distance_avx2(a, b, metric),
SimdCapability::NEON => distance_neon(a, b, metric),
SimdCapability::Scalar => distance_scalar(a, b, metric),
}
}
```
### Vectorized Operations
```rust
// AVX-512 optimized distance
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f", enable = "avx512vl")]
unsafe fn euclidean_avx512(a: &[f32], b: &[f32]) -> f32 {
use std::arch::x86_64::*;
let mut sum = _mm512_setzero_ps();
let chunks = a.len() / 16;
for i in 0..chunks {
let va = _mm512_loadu_ps(a.as_ptr().add(i * 16));
let vb = _mm512_loadu_ps(b.as_ptr().add(i * 16));
let diff = _mm512_sub_ps(va, vb);
sum = _mm512_fmadd_ps(diff, diff, sum);
}
// Handle remainder
let mut result = _mm512_reduce_add_ps(sum);
for i in (chunks * 16)..a.len() {
let diff = a[i] - b[i];
result += diff * diff;
}
result.sqrt()
}
// ARM NEON optimized distance
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn euclidean_neon(a: &[f32], b: &[f32]) -> f32 {
use std::arch::aarch64::*;
let mut sum = vdupq_n_f32(0.0);
let chunks = a.len() / 4;
for i in 0..chunks {
let va = vld1q_f32(a.as_ptr().add(i * 4));
let vb = vld1q_f32(b.as_ptr().add(i * 4));
let diff = vsubq_f32(va, vb);
sum = vfmaq_f32(sum, diff, diff);
}
let sum_array: [f32; 4] = std::mem::transmute(sum);
let mut result: f32 = sum_array.iter().sum();
for i in (chunks * 4)..a.len() {
let diff = a[i] - b[i];
result += diff * diff;
}
result.sqrt()
}
```
### Batch Processing
```rust
/// Process multiple vectors in parallel batches
pub fn batch_distances(
query: &[f32],
candidates: &[&[f32]],
metric: DistanceMetric,
) -> Vec<f32> {
const BATCH_SIZE: usize = 256;
candidates
.par_chunks(BATCH_SIZE)
.flat_map(|batch| {
batch.iter()
.map(|c| distance_dispatch(query, c, metric))
.collect::<Vec<_>>()
})
.collect()
}
/// Prefetch-optimized batch processing
pub fn batch_distances_prefetch(
query: &[f32],
candidates: &[Vec<f32>],
metric: DistanceMetric,
) -> Vec<f32> {
let mut results = Vec::with_capacity(candidates.len());
for i in 0..candidates.len() {
// Prefetch next vectors
if i + 4 < candidates.len() {
prefetch_read(&candidates[i + 4]);
}
results.push(distance_dispatch(query, &candidates[i], metric));
}
results
}
#[inline]
fn prefetch_read<T>(data: &T) {
#[cfg(target_arch = "x86_64")]
unsafe {
std::arch::x86_64::_mm_prefetch(
data as *const T as *const i8,
std::arch::x86_64::_MM_HINT_T0,
);
}
}
```
## Memory Optimization
### Zero-Copy Operations
```rust
/// Memory-mapped vector storage
pub struct MappedVectors {
mmap: memmap2::Mmap,
dim: usize,
count: usize,
}
impl MappedVectors {
pub fn open(path: &Path, dim: usize) -> io::Result<Self> {
let file = File::open(path)?;
let mmap = unsafe { memmap2::Mmap::map(&file)? };
let count = mmap.len() / (dim * std::mem::size_of::<f32>());
Ok(Self { mmap, dim, count })
}
/// Zero-copy access to vector
#[inline]
pub fn get(&self, index: usize) -> &[f32] {
let offset = index * self.dim;
let bytes = &self.mmap[offset * 4..(offset + self.dim) * 4];
unsafe { std::slice::from_raw_parts(bytes.as_ptr() as *const f32, self.dim) }
}
}
/// PostgreSQL shared memory integration
pub struct SharedVectorCache {
shmem: pg_sys::dsm_segment,
vectors: *mut f32,
capacity: usize,
dim: usize,
}
impl SharedVectorCache {
pub fn create(capacity: usize, dim: usize) -> Self {
let size = capacity * dim * std::mem::size_of::<f32>();
let shmem = unsafe { pg_sys::dsm_create(size, 0) };
let vectors = unsafe { pg_sys::dsm_segment_address(shmem) as *mut f32 };
Self { shmem, vectors, capacity, dim }
}
#[inline]
pub fn get(&self, index: usize) -> &[f32] {
unsafe {
std::slice::from_raw_parts(
self.vectors.add(index * self.dim),
self.dim
)
}
}
}
```
### Memory Pool
```rust
/// Thread-local memory pool for temporary allocations
thread_local! {
static VECTOR_POOL: RefCell<VectorPool> = RefCell::new(VectorPool::new());
}
pub struct VectorPool {
pools: HashMap<usize, Vec<Vec<f32>>>,
max_cached: usize,
}
impl VectorPool {
pub fn new() -> Self {
Self {
pools: HashMap::new(),
max_cached: 1024,
}
}
pub fn acquire(&mut self, dim: usize) -> Vec<f32> {
self.pools
.get_mut(&dim)
.and_then(|pool| pool.pop())
.unwrap_or_else(|| vec![0.0; dim])
}
pub fn release(&mut self, mut vec: Vec<f32>) {
let dim = vec.len();
let pool = self.pools.entry(dim).or_insert_with(Vec::new);
if pool.len() < self.max_cached {
vec.iter_mut().for_each(|x| *x = 0.0);
pool.push(vec);
}
}
}
/// RAII guard for pooled vectors
pub struct PooledVec(Vec<f32>);
impl Drop for PooledVec {
fn drop(&mut self) {
VECTOR_POOL.with(|pool| {
pool.borrow_mut().release(std::mem::take(&mut self.0));
});
}
}
```
### Quantization for Memory Reduction
```rust
/// 8-bit scalar quantization (4x memory reduction)
pub struct ScalarQuantized {
data: Vec<u8>,
scale: f32,
offset: f32,
dim: usize,
}
impl ScalarQuantized {
pub fn from_f32(vectors: &[Vec<f32>]) -> Self {
let (min, max) = find_minmax(vectors);
let scale = (max - min) / 255.0;
let offset = min;
let data: Vec<u8> = vectors.iter()
.flat_map(|v| {
v.iter().map(|&x| ((x - offset) / scale) as u8)
})
.collect();
Self { data, scale, offset, dim: vectors[0].len() }
}
#[inline]
pub fn distance(&self, query: &[f32], index: usize) -> f32 {
let start = index * self.dim;
let quantized = &self.data[start..start + self.dim];
let mut sum = 0.0f32;
for (i, &q) in quantized.iter().enumerate() {
let reconstructed = q as f32 * self.scale + self.offset;
let diff = query[i] - reconstructed;
sum += diff * diff;
}
sum.sqrt()
}
}
/// Binary quantization (32x memory reduction)
pub struct BinaryQuantized {
data: BitVec,
dim: usize,
}
impl BinaryQuantized {
pub fn from_f32(vectors: &[Vec<f32>]) -> Self {
let dim = vectors[0].len();
let mut data = BitVec::with_capacity(vectors.len() * dim);
for vec in vectors {
for &x in vec {
data.push(x > 0.0);
}
}
Self { data, dim }
}
/// Hamming distance (extremely fast)
#[inline]
pub fn hamming_distance(&self, query_bits: &BitVec, index: usize) -> u32 {
let start = index * self.dim;
let doc_bits = &self.data[start..start + self.dim];
// XOR and popcount
doc_bits.iter()
.zip(query_bits.iter())
.filter(|(a, b)| a != b)
.count() as u32
}
}
```
## Query Optimization
### Query Plan Caching
```rust
/// Cache compiled query plans
pub struct QueryPlanCache {
cache: DashMap<u64, Arc<QueryPlan>>,
max_size: usize,
hit_count: AtomicU64,
miss_count: AtomicU64,
}
impl QueryPlanCache {
pub fn get_or_compile<F>(&self, query_hash: u64, compile: F) -> Arc<QueryPlan>
where
F: FnOnce() -> QueryPlan,
{
if let Some(plan) = self.cache.get(&query_hash) {
self.hit_count.fetch_add(1, Ordering::Relaxed);
return plan.clone();
}
self.miss_count.fetch_add(1, Ordering::Relaxed);
let plan = Arc::new(compile());
// LRU eviction if needed
if self.cache.len() >= self.max_size {
self.evict_lru();
}
self.cache.insert(query_hash, plan.clone());
plan
}
}
```
### Adaptive Index Selection
```rust
/// Choose optimal index based on query characteristics
pub fn select_index(
query: &SearchQuery,
available_indexes: &[IndexInfo],
table_stats: &TableStats,
) -> &IndexInfo {
let selectivity = estimate_selectivity(query, table_stats);
let expected_results = (table_stats.row_count as f64 * selectivity) as usize;
// Decision tree for index selection
if expected_results < 100 {
// Sequential scan may be faster for very small result sets
return &available_indexes.iter()
.find(|i| i.index_type == IndexType::BTree)
.unwrap_or(&available_indexes[0]);
}
if query.has_vector_similarity() {
// Prefer HNSW for similarity search
if let Some(hnsw) = available_indexes.iter()
.find(|i| i.index_type == IndexType::Hnsw)
{
return hnsw;
}
}
// Default to IVFFlat for range queries
available_indexes.iter()
.find(|i| i.index_type == IndexType::IvfFlat)
.unwrap_or(&available_indexes[0])
}
/// Adaptive ef_search based on query complexity
pub fn adaptive_ef_search(
query: &[f32],
index: &HnswIndex,
target_recall: f64,
) -> usize {
// Start with learned baseline
let baseline = index.learned_ef_for_query(query);
// Adjust based on query density
let query_norm = query.iter().map(|x| x * x).sum::<f32>().sqrt();
let density_factor = if query_norm < 1.0 { 1.2 } else { 1.0 };
// Adjust based on target recall
let recall_factor = match target_recall {
r if r >= 0.99 => 2.0,
r if r >= 0.95 => 1.5,
r if r >= 0.90 => 1.2,
_ => 1.0,
};
((baseline as f64 * density_factor * recall_factor) as usize).max(10)
}
```
### Parallel Query Execution
```rust
/// Parallel index scan
pub fn parallel_search(
query: &[f32],
index: &HnswIndex,
k: usize,
num_threads: usize,
) -> Vec<(u64, f32)> {
// Divide search into regions
let entry_points = index.get_diverse_entry_points(num_threads);
let results: Vec<_> = entry_points
.into_par_iter()
.map(|entry| index.search_from(query, entry, k * 2))
.collect();
// Merge results
let mut merged: Vec<_> = results.into_iter().flatten().collect();
merged.sort_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap());
merged.dedup_by_key(|(id, _)| *id);
merged.truncate(k);
merged
}
/// Intra-query parallelism for complex queries
pub fn parallel_filter_search(
query: &[f32],
filters: &[Filter],
index: &HnswIndex,
k: usize,
) -> Vec<(u64, f32)> {
// Stage 1: Parallel filter evaluation
let filter_results: Vec<HashSet<u64>> = filters
.par_iter()
.map(|f| evaluate_filter(f))
.collect();
// Stage 2: Intersect filter results
let valid_ids = filter_results
.into_iter()
.reduce(|a, b| a.intersection(&b).copied().collect())
.unwrap_or_default();
// Stage 3: Vector search with filter
index.search_with_filter(query, k, |id| valid_ids.contains(&id))
}
```
## PostgreSQL-Specific Optimizations
### Buffer Management
```rust
/// Custom buffer pool for vector data
pub struct VectorBufferPool {
buffers: Vec<Buffer>,
free_list: Mutex<Vec<usize>>,
usage_count: Vec<AtomicU32>,
}
impl VectorBufferPool {
/// Pin buffer with usage tracking
pub fn pin(&self, index: usize) -> PinnedBuffer {
self.usage_count[index].fetch_add(1, Ordering::Relaxed);
PinnedBuffer { pool: self, index }
}
/// Clock sweep eviction
pub fn evict_if_needed(&self) -> Option<usize> {
let mut hand = 0;
loop {
let count = self.usage_count[hand].load(Ordering::Relaxed);
if count == 0 {
return Some(hand);
}
self.usage_count[hand].store(count - 1, Ordering::Relaxed);
hand = (hand + 1) % self.buffers.len();
}
}
}
```
### WAL Optimization
```rust
/// Batch WAL writes for bulk operations
pub fn bulk_insert_optimized(
vectors: &[Vec<f32>],
ids: &[u64],
batch_size: usize,
) {
// Group into batches
for batch in vectors.chunks(batch_size).zip(ids.chunks(batch_size)) {
// Single WAL record for batch
let wal_record = create_batch_wal_record(batch.0, batch.1);
unsafe {
// Write single WAL entry
pg_sys::XLogInsert(RUVECTOR_RMGR_ID, XLOG_RUVECTOR_BATCH_INSERT);
}
// Apply batch
apply_batch(batch.0, batch.1);
}
}
```
### Statistics Collection
```rust
/// Collect statistics for query planner
pub fn analyze_vector_column(
table_oid: pg_sys::Oid,
column_num: i16,
sample_rows: &[pg_sys::HeapTuple],
) -> VectorStats {
let mut vectors: Vec<Vec<f32>> = Vec::new();
// Extract sample vectors
for tuple in sample_rows {
if let Some(vec) = extract_vector(tuple, column_num) {
vectors.push(vec);
}
}
// Compute statistics
let dim = vectors[0].len();
let centroid = compute_centroid(&vectors);
let avg_norm = vectors.iter()
.map(|v| v.iter().map(|x| x * x).sum::<f32>().sqrt())
.sum::<f32>() / vectors.len() as f32;
// Compute distribution statistics
let distances: Vec<f32> = vectors.iter()
.map(|v| euclidean_distance(v, &centroid))
.collect();
VectorStats {
dim,
avg_norm,
centroid,
distance_histogram: compute_histogram(&distances, 100),
null_fraction: 0.0, // TODO: compute from sample
}
}
```
## Configuration Recommendations
### GUC Parameters
```sql
-- Memory settings
SET ruvector.shared_cache_size = '256MB';
SET ruvector.work_mem = '64MB';
-- Parallelism
SET ruvector.max_parallel_workers = 4;
SET ruvector.parallel_search_threshold = 10000;
-- Index tuning
SET ruvector.ef_search = 64; -- HNSW search quality
SET ruvector.probes = 10; -- IVFFlat probe count
SET ruvector.quantization = 'sq8'; -- Default quantization
-- Learning
SET ruvector.learning_enabled = on;
SET ruvector.learning_rate = 0.01;
-- Maintenance
SET ruvector.maintenance_work_mem = '512MB';
SET ruvector.autovacuum_enabled = on;
```
### Hardware-Specific Tuning
```yaml
# Intel Xeon (AVX-512)
ruvector.simd_mode: 'avx512'
ruvector.vector_batch_size: 256
ruvector.prefetch_distance: 4
# AMD EPYC (AVX2)
ruvector.simd_mode: 'avx2'
ruvector.vector_batch_size: 128
ruvector.prefetch_distance: 8
# Apple M1/M2 (NEON)
ruvector.simd_mode: 'neon'
ruvector.vector_batch_size: 64
ruvector.prefetch_distance: 4
# Memory-constrained
ruvector.quantization: 'binary'
ruvector.shared_cache_size: '64MB'
ruvector.enable_mmap: on
```
## Performance Monitoring
```sql
-- View SIMD statistics
SELECT * FROM ruvector_simd_stats();
-- Memory usage
SELECT * FROM ruvector_memory_stats();
-- Cache hit rates
SELECT * FROM ruvector_cache_stats();
-- Query performance
SELECT * FROM ruvector_query_stats()
ORDER BY total_time DESC
LIMIT 10;
```