git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
667 lines
16 KiB
Markdown
667 lines
16 KiB
Markdown
# Optimization Strategy
|
|
|
|
## Overview
|
|
|
|
Comprehensive optimization strategies for ruvector-postgres covering SIMD acceleration, memory management, query optimization, and PostgreSQL-specific tuning.
|
|
|
|
## SIMD Optimization
|
|
|
|
### Architecture Detection & Dispatch
|
|
|
|
```rust
|
|
// src/simd/dispatch.rs

/// SIMD instruction-set tier that a distance kernel can be dispatched to.
#[derive(Debug, Clone, Copy)]
pub enum SimdCapability {
    /// x86_64 AVX-512 kernels.
    AVX512,
    /// x86_64 AVX2 kernels.
    AVX2,
    /// AArch64 NEON kernels.
    NEON,
    /// Portable scalar fallback.
    Scalar,
}
|
|
|
|
lazy_static! {
|
|
static ref SIMD_CAPABILITY: SimdCapability = detect_simd();
|
|
}
|
|
|
|
fn detect_simd() -> SimdCapability {
|
|
#[cfg(target_arch = "x86_64")]
|
|
{
|
|
if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
|
|
return SimdCapability::AVX512;
|
|
}
|
|
if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
|
|
return SimdCapability::AVX2;
|
|
}
|
|
}
|
|
|
|
#[cfg(target_arch = "aarch64")]
|
|
{
|
|
return SimdCapability::NEON;
|
|
}
|
|
|
|
SimdCapability::Scalar
|
|
}
|
|
|
|
/// Dispatch to optimal implementation
|
|
#[inline]
|
|
pub fn distance_dispatch(a: &[f32], b: &[f32], metric: DistanceMetric) -> f32 {
|
|
match *SIMD_CAPABILITY {
|
|
SimdCapability::AVX512 => distance_avx512(a, b, metric),
|
|
SimdCapability::AVX2 => distance_avx2(a, b, metric),
|
|
SimdCapability::NEON => distance_neon(a, b, metric),
|
|
SimdCapability::Scalar => distance_scalar(a, b, metric),
|
|
}
|
|
}
|
|
```
|
|
|
|
### Vectorized Operations
|
|
|
|
```rust
|
|
// AVX-512 optimized distance
|
|
#[cfg(target_arch = "x86_64")]
|
|
#[target_feature(enable = "avx512f", enable = "avx512vl")]
|
|
unsafe fn euclidean_avx512(a: &[f32], b: &[f32]) -> f32 {
|
|
use std::arch::x86_64::*;
|
|
|
|
let mut sum = _mm512_setzero_ps();
|
|
let chunks = a.len() / 16;
|
|
|
|
for i in 0..chunks {
|
|
let va = _mm512_loadu_ps(a.as_ptr().add(i * 16));
|
|
let vb = _mm512_loadu_ps(b.as_ptr().add(i * 16));
|
|
let diff = _mm512_sub_ps(va, vb);
|
|
sum = _mm512_fmadd_ps(diff, diff, sum);
|
|
}
|
|
|
|
// Handle remainder
|
|
let mut result = _mm512_reduce_add_ps(sum);
|
|
for i in (chunks * 16)..a.len() {
|
|
let diff = a[i] - b[i];
|
|
result += diff * diff;
|
|
}
|
|
|
|
result.sqrt()
|
|
}
|
|
|
|
/// Euclidean (L2) distance between `a` and `b` using ARM NEON.
///
/// Four lanes are processed per iteration with a fused multiply-add; the tail
/// (fewer than 4 elements) is folded in with scalar arithmetic.
///
/// # Panics
///
/// Panics if `b` is shorter than `a`. If `b` is longer, only its first
/// `a.len()` elements are used.
///
/// # Safety
///
/// The caller must ensure the CPU supports the `neon` target feature.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn euclidean_neon(a: &[f32], b: &[f32]) -> f32 {
    use std::arch::aarch64::*;

    // Bound `b` before touching raw pointers: without this, a shorter `b`
    // would be read out of bounds by the unchecked 4-lane loads below.
    let b = &b[..a.len()];

    let mut acc = vdupq_n_f32(0.0);
    let chunks = a.len() / 4;

    for i in 0..chunks {
        // SAFETY: `i * 4 + 4 <= a.len() == b.len()`, so both 4-lane loads
        // stay inside their slices.
        let (va, vb) = unsafe {
            (
                vld1q_f32(a.as_ptr().add(i * 4)),
                vld1q_f32(b.as_ptr().add(i * 4)),
            )
        };
        let diff = vsubq_f32(va, vb);
        acc = vfmaq_f32(acc, diff, diff);
    }

    // SAFETY: `float32x4_t` and `[f32; 4]` have the same size, and every bit
    // pattern is a valid `f32`.
    let lanes: [f32; 4] = unsafe { std::mem::transmute(acc) };
    let lane_sum: f32 = lanes.iter().sum();

    // Scalar tail, accumulated on top of the horizontal sum of the lanes.
    let tail_start = chunks * 4;
    let sum_of_squares = a[tail_start..]
        .iter()
        .zip(&b[tail_start..])
        .fold(lane_sum, |sum, (x, y)| {
            let diff = x - y;
            sum + diff * diff
        });

    sum_of_squares.sqrt()
}
|
|
```
|
|
|
|
### Batch Processing
|
|
|
|
```rust
|
|
/// Process multiple vectors in parallel batches
|
|
pub fn batch_distances(
|
|
query: &[f32],
|
|
candidates: &[&[f32]],
|
|
metric: DistanceMetric,
|
|
) -> Vec<f32> {
|
|
const BATCH_SIZE: usize = 256;
|
|
|
|
candidates
|
|
.par_chunks(BATCH_SIZE)
|
|
.flat_map(|batch| {
|
|
batch.iter()
|
|
.map(|c| distance_dispatch(query, c, metric))
|
|
.collect::<Vec<_>>()
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
/// Prefetch-optimized batch processing
|
|
pub fn batch_distances_prefetch(
|
|
query: &[f32],
|
|
candidates: &[Vec<f32>],
|
|
metric: DistanceMetric,
|
|
) -> Vec<f32> {
|
|
let mut results = Vec::with_capacity(candidates.len());
|
|
|
|
for i in 0..candidates.len() {
|
|
// Prefetch next vectors
|
|
if i + 4 < candidates.len() {
|
|
prefetch_read(&candidates[i + 4]);
|
|
}
|
|
|
|
results.push(distance_dispatch(query, &candidates[i], metric));
|
|
}
|
|
|
|
results
|
|
}
|
|
|
|
/// Issues a read-prefetch hint (`_MM_HINT_T0`) for the address of `data`.
///
/// Only x86_64 builds emit an instruction; on every other architecture the
/// function body is empty.
#[inline]
fn prefetch_read<T>(data: &T) {
    #[cfg(target_arch = "x86_64")]
    {
        use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};

        let address = (data as *const T).cast::<i8>();
        // SAFETY: the address comes from a live reference, and a prefetch is
        // only a hint to the CPU.
        unsafe { _mm_prefetch::<_MM_HINT_T0>(address) };
    }
}
|
|
```
|
|
|
|
## Memory Optimization
|
|
|
|
### Zero-Copy Operations
|
|
|
|
```rust
|
|
/// Memory-mapped vector storage
|
|
pub struct MappedVectors {
|
|
mmap: memmap2::Mmap,
|
|
dim: usize,
|
|
count: usize,
|
|
}
|
|
|
|
impl MappedVectors {
|
|
pub fn open(path: &Path, dim: usize) -> io::Result<Self> {
|
|
let file = File::open(path)?;
|
|
let mmap = unsafe { memmap2::Mmap::map(&file)? };
|
|
let count = mmap.len() / (dim * std::mem::size_of::<f32>());
|
|
|
|
Ok(Self { mmap, dim, count })
|
|
}
|
|
|
|
/// Zero-copy access to vector
|
|
#[inline]
|
|
pub fn get(&self, index: usize) -> &[f32] {
|
|
let offset = index * self.dim;
|
|
let bytes = &self.mmap[offset * 4..(offset + self.dim) * 4];
|
|
unsafe { std::slice::from_raw_parts(bytes.as_ptr() as *const f32, self.dim) }
|
|
}
|
|
}
|
|
|
|
/// PostgreSQL shared memory integration
|
|
pub struct SharedVectorCache {
|
|
shmem: pg_sys::dsm_segment,
|
|
vectors: *mut f32,
|
|
capacity: usize,
|
|
dim: usize,
|
|
}
|
|
|
|
impl SharedVectorCache {
|
|
pub fn create(capacity: usize, dim: usize) -> Self {
|
|
let size = capacity * dim * std::mem::size_of::<f32>();
|
|
let shmem = unsafe { pg_sys::dsm_create(size, 0) };
|
|
let vectors = unsafe { pg_sys::dsm_segment_address(shmem) as *mut f32 };
|
|
|
|
Self { shmem, vectors, capacity, dim }
|
|
}
|
|
|
|
#[inline]
|
|
pub fn get(&self, index: usize) -> &[f32] {
|
|
unsafe {
|
|
std::slice::from_raw_parts(
|
|
self.vectors.add(index * self.dim),
|
|
self.dim
|
|
)
|
|
}
|
|
}
|
|
}
|
|
```
|
|
|
|
### Memory Pool
|
|
|
|
```rust
|
|
/// Thread-local memory pool for temporary allocations
|
|
thread_local! {
|
|
static VECTOR_POOL: RefCell<VectorPool> = RefCell::new(VectorPool::new());
|
|
}
|
|
|
|
/// Recycles `Vec<f32>` buffers, bucketed by their length.
pub struct VectorPool {
    /// Spare buffers, keyed by vector length.
    pools: HashMap<usize, Vec<Vec<f32>>>,
    /// Upper bound on the number of spare buffers kept per length.
    max_cached: usize,
}

impl VectorPool {
    /// Creates an empty pool that keeps at most 1024 spare buffers per length.
    pub fn new() -> Self {
        let pools = HashMap::new();
        Self { pools, max_cached: 1024 }
    }

    /// Hands out a zero-filled vector of length `dim`, reusing a spare buffer
    /// when one of that length is available.
    pub fn acquire(&mut self, dim: usize) -> Vec<f32> {
        match self.pools.get_mut(&dim).and_then(Vec::pop) {
            Some(recycled) => recycled,
            None => vec![0.0; dim],
        }
    }

    /// Returns `vec` to the pool, zeroing it first. The buffer is dropped
    /// instead when the bucket for its length is already full.
    pub fn release(&mut self, mut vec: Vec<f32>) {
        let bucket = self.pools.entry(vec.len()).or_default();
        if bucket.len() >= self.max_cached {
            return;
        }

        vec.fill(0.0);
        bucket.push(vec);
    }
}
|
|
|
|
/// RAII guard for pooled vectors
|
|
pub struct PooledVec(Vec<f32>);
|
|
|
|
impl Drop for PooledVec {
|
|
fn drop(&mut self) {
|
|
VECTOR_POOL.with(|pool| {
|
|
pool.borrow_mut().release(std::mem::take(&mut self.0));
|
|
});
|
|
}
|
|
}
|
|
```
|
|
|
|
### Quantization for Memory Reduction
|
|
|
|
```rust
|
|
/// 8-bit scalar quantization (4x memory reduction)
|
|
pub struct ScalarQuantized {
|
|
data: Vec<u8>,
|
|
scale: f32,
|
|
offset: f32,
|
|
dim: usize,
|
|
}
|
|
|
|
impl ScalarQuantized {
|
|
pub fn from_f32(vectors: &[Vec<f32>]) -> Self {
|
|
let (min, max) = find_minmax(vectors);
|
|
let scale = (max - min) / 255.0;
|
|
let offset = min;
|
|
|
|
let data: Vec<u8> = vectors.iter()
|
|
.flat_map(|v| {
|
|
v.iter().map(|&x| ((x - offset) / scale) as u8)
|
|
})
|
|
.collect();
|
|
|
|
Self { data, scale, offset, dim: vectors[0].len() }
|
|
}
|
|
|
|
#[inline]
|
|
pub fn distance(&self, query: &[f32], index: usize) -> f32 {
|
|
let start = index * self.dim;
|
|
let quantized = &self.data[start..start + self.dim];
|
|
|
|
let mut sum = 0.0f32;
|
|
for (i, &q) in quantized.iter().enumerate() {
|
|
let reconstructed = q as f32 * self.scale + self.offset;
|
|
let diff = query[i] - reconstructed;
|
|
sum += diff * diff;
|
|
}
|
|
sum.sqrt()
|
|
}
|
|
}
|
|
|
|
/// Binary quantization (32x memory reduction)
|
|
pub struct BinaryQuantized {
|
|
data: BitVec,
|
|
dim: usize,
|
|
}
|
|
|
|
impl BinaryQuantized {
|
|
pub fn from_f32(vectors: &[Vec<f32>]) -> Self {
|
|
let dim = vectors[0].len();
|
|
let mut data = BitVec::with_capacity(vectors.len() * dim);
|
|
|
|
for vec in vectors {
|
|
for &x in vec {
|
|
data.push(x > 0.0);
|
|
}
|
|
}
|
|
|
|
Self { data, dim }
|
|
}
|
|
|
|
/// Hamming distance (extremely fast)
|
|
#[inline]
|
|
pub fn hamming_distance(&self, query_bits: &BitVec, index: usize) -> u32 {
|
|
let start = index * self.dim;
|
|
let doc_bits = &self.data[start..start + self.dim];
|
|
|
|
// XOR and popcount
|
|
doc_bits.iter()
|
|
.zip(query_bits.iter())
|
|
.filter(|(a, b)| a != b)
|
|
.count() as u32
|
|
}
|
|
}
|
|
```
|
|
|
|
## Query Optimization
|
|
|
|
### Query Plan Caching
|
|
|
|
```rust
|
|
/// Cache compiled query plans
|
|
pub struct QueryPlanCache {
|
|
cache: DashMap<u64, Arc<QueryPlan>>,
|
|
max_size: usize,
|
|
hit_count: AtomicU64,
|
|
miss_count: AtomicU64,
|
|
}
|
|
|
|
impl QueryPlanCache {
|
|
pub fn get_or_compile<F>(&self, query_hash: u64, compile: F) -> Arc<QueryPlan>
|
|
where
|
|
F: FnOnce() -> QueryPlan,
|
|
{
|
|
if let Some(plan) = self.cache.get(&query_hash) {
|
|
self.hit_count.fetch_add(1, Ordering::Relaxed);
|
|
return plan.clone();
|
|
}
|
|
|
|
self.miss_count.fetch_add(1, Ordering::Relaxed);
|
|
let plan = Arc::new(compile());
|
|
|
|
// LRU eviction if needed
|
|
if self.cache.len() >= self.max_size {
|
|
self.evict_lru();
|
|
}
|
|
|
|
self.cache.insert(query_hash, plan.clone());
|
|
plan
|
|
}
|
|
}
|
|
```
|
|
|
|
### Adaptive Index Selection
|
|
|
|
```rust
|
|
/// Choose optimal index based on query characteristics
|
|
pub fn select_index(
|
|
query: &SearchQuery,
|
|
available_indexes: &[IndexInfo],
|
|
table_stats: &TableStats,
|
|
) -> &IndexInfo {
|
|
let selectivity = estimate_selectivity(query, table_stats);
|
|
let expected_results = (table_stats.row_count as f64 * selectivity) as usize;
|
|
|
|
// Decision tree for index selection
|
|
if expected_results < 100 {
|
|
// Sequential scan may be faster for very small result sets
|
|
return &available_indexes.iter()
|
|
.find(|i| i.index_type == IndexType::BTree)
|
|
.unwrap_or(&available_indexes[0]);
|
|
}
|
|
|
|
if query.has_vector_similarity() {
|
|
// Prefer HNSW for similarity search
|
|
if let Some(hnsw) = available_indexes.iter()
|
|
.find(|i| i.index_type == IndexType::Hnsw)
|
|
{
|
|
return hnsw;
|
|
}
|
|
}
|
|
|
|
// Default to IVFFlat for range queries
|
|
available_indexes.iter()
|
|
.find(|i| i.index_type == IndexType::IvfFlat)
|
|
.unwrap_or(&available_indexes[0])
|
|
}
|
|
|
|
/// Adaptive ef_search based on query complexity
|
|
pub fn adaptive_ef_search(
|
|
query: &[f32],
|
|
index: &HnswIndex,
|
|
target_recall: f64,
|
|
) -> usize {
|
|
// Start with learned baseline
|
|
let baseline = index.learned_ef_for_query(query);
|
|
|
|
// Adjust based on query density
|
|
let query_norm = query.iter().map(|x| x * x).sum::<f32>().sqrt();
|
|
let density_factor = if query_norm < 1.0 { 1.2 } else { 1.0 };
|
|
|
|
// Adjust based on target recall
|
|
let recall_factor = match target_recall {
|
|
r if r >= 0.99 => 2.0,
|
|
r if r >= 0.95 => 1.5,
|
|
r if r >= 0.90 => 1.2,
|
|
_ => 1.0,
|
|
};
|
|
|
|
((baseline as f64 * density_factor * recall_factor) as usize).max(10)
|
|
}
|
|
```
|
|
|
|
### Parallel Query Execution
|
|
|
|
```rust
|
|
/// Parallel index scan
|
|
pub fn parallel_search(
|
|
query: &[f32],
|
|
index: &HnswIndex,
|
|
k: usize,
|
|
num_threads: usize,
|
|
) -> Vec<(u64, f32)> {
|
|
// Divide search into regions
|
|
let entry_points = index.get_diverse_entry_points(num_threads);
|
|
|
|
let results: Vec<_> = entry_points
|
|
.into_par_iter()
|
|
.map(|entry| index.search_from(query, entry, k * 2))
|
|
.collect();
|
|
|
|
// Merge results
|
|
let mut merged: Vec<_> = results.into_iter().flatten().collect();
|
|
merged.sort_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap());
|
|
merged.dedup_by_key(|(id, _)| *id);
|
|
merged.truncate(k);
|
|
merged
|
|
}
|
|
|
|
/// Intra-query parallelism for complex queries
|
|
pub fn parallel_filter_search(
|
|
query: &[f32],
|
|
filters: &[Filter],
|
|
index: &HnswIndex,
|
|
k: usize,
|
|
) -> Vec<(u64, f32)> {
|
|
// Stage 1: Parallel filter evaluation
|
|
let filter_results: Vec<HashSet<u64>> = filters
|
|
.par_iter()
|
|
.map(|f| evaluate_filter(f))
|
|
.collect();
|
|
|
|
// Stage 2: Intersect filter results
|
|
let valid_ids = filter_results
|
|
.into_iter()
|
|
.reduce(|a, b| a.intersection(&b).copied().collect())
|
|
.unwrap_or_default();
|
|
|
|
// Stage 3: Vector search with filter
|
|
index.search_with_filter(query, k, |id| valid_ids.contains(&id))
|
|
}
|
|
```
|
|
|
|
## PostgreSQL-Specific Optimizations
|
|
|
|
### Buffer Management
|
|
|
|
```rust
|
|
/// Custom buffer pool for vector data
|
|
pub struct VectorBufferPool {
|
|
buffers: Vec<Buffer>,
|
|
free_list: Mutex<Vec<usize>>,
|
|
usage_count: Vec<AtomicU32>,
|
|
}
|
|
|
|
impl VectorBufferPool {
|
|
/// Pin buffer with usage tracking
|
|
pub fn pin(&self, index: usize) -> PinnedBuffer {
|
|
self.usage_count[index].fetch_add(1, Ordering::Relaxed);
|
|
PinnedBuffer { pool: self, index }
|
|
}
|
|
|
|
/// Clock sweep eviction
|
|
pub fn evict_if_needed(&self) -> Option<usize> {
|
|
let mut hand = 0;
|
|
loop {
|
|
let count = self.usage_count[hand].load(Ordering::Relaxed);
|
|
if count == 0 {
|
|
return Some(hand);
|
|
}
|
|
self.usage_count[hand].store(count - 1, Ordering::Relaxed);
|
|
hand = (hand + 1) % self.buffers.len();
|
|
}
|
|
}
|
|
}
|
|
```
|
|
|
|
### WAL Optimization
|
|
|
|
```rust
|
|
/// Batch WAL writes for bulk operations
|
|
pub fn bulk_insert_optimized(
|
|
vectors: &[Vec<f32>],
|
|
ids: &[u64],
|
|
batch_size: usize,
|
|
) {
|
|
// Group into batches
|
|
for batch in vectors.chunks(batch_size).zip(ids.chunks(batch_size)) {
|
|
// Single WAL record for batch
|
|
let wal_record = create_batch_wal_record(batch.0, batch.1);
|
|
|
|
unsafe {
|
|
// Write single WAL entry
|
|
pg_sys::XLogInsert(RUVECTOR_RMGR_ID, XLOG_RUVECTOR_BATCH_INSERT);
|
|
}
|
|
|
|
// Apply batch
|
|
apply_batch(batch.0, batch.1);
|
|
}
|
|
}
|
|
```
|
|
|
|
### Statistics Collection
|
|
|
|
```rust
|
|
/// Collect statistics for query planner
|
|
pub fn analyze_vector_column(
|
|
table_oid: pg_sys::Oid,
|
|
column_num: i16,
|
|
sample_rows: &[pg_sys::HeapTuple],
|
|
) -> VectorStats {
|
|
let mut vectors: Vec<Vec<f32>> = Vec::new();
|
|
|
|
// Extract sample vectors
|
|
for tuple in sample_rows {
|
|
if let Some(vec) = extract_vector(tuple, column_num) {
|
|
vectors.push(vec);
|
|
}
|
|
}
|
|
|
|
// Compute statistics
|
|
let dim = vectors[0].len();
|
|
let centroid = compute_centroid(&vectors);
|
|
let avg_norm = vectors.iter()
|
|
.map(|v| v.iter().map(|x| x * x).sum::<f32>().sqrt())
|
|
.sum::<f32>() / vectors.len() as f32;
|
|
|
|
// Compute distribution statistics
|
|
let distances: Vec<f32> = vectors.iter()
|
|
.map(|v| euclidean_distance(v, ¢roid))
|
|
.collect();
|
|
|
|
VectorStats {
|
|
dim,
|
|
avg_norm,
|
|
centroid,
|
|
distance_histogram: compute_histogram(&distances, 100),
|
|
null_fraction: 0.0, // TODO: compute from sample
|
|
}
|
|
}
|
|
```
|
|
|
|
## Configuration Recommendations
|
|
|
|
### GUC Parameters
|
|
|
|
```sql
|
|
-- Memory settings
|
|
SET ruvector.shared_cache_size = '256MB';
|
|
SET ruvector.work_mem = '64MB';
|
|
|
|
-- Parallelism
|
|
SET ruvector.max_parallel_workers = 4;
|
|
SET ruvector.parallel_search_threshold = 10000;
|
|
|
|
-- Index tuning
|
|
SET ruvector.ef_search = 64; -- HNSW search quality
|
|
SET ruvector.probes = 10; -- IVFFlat probe count
|
|
SET ruvector.quantization = 'sq8'; -- Default quantization
|
|
|
|
-- Learning
|
|
SET ruvector.learning_enabled = on;
|
|
SET ruvector.learning_rate = 0.01;
|
|
|
|
-- Maintenance
|
|
SET ruvector.maintenance_work_mem = '512MB';
|
|
SET ruvector.autovacuum_enabled = on;
|
|
```
|
|
|
|
### Hardware-Specific Tuning
|
|
|
|
```yaml
|
|
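# Apply only ONE of the profiles below; the keys repeat across profiles, so they are alternatives, not a single configuration.
# Intel Xeon (AVX-512)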
|
|
ruvector.simd_mode: 'avx512'
|
|
ruvector.vector_batch_size: 256
|
|
ruvector.prefetch_distance: 4
|
|
|
|
# AMD EPYC (AVX2)
|
|
ruvector.simd_mode: 'avx2'
|
|
ruvector.vector_batch_size: 128
|
|
ruvector.prefetch_distance: 8
|
|
|
|
# Apple M1/M2 (NEON)
|
|
ruvector.simd_mode: 'neon'
|
|
ruvector.vector_batch_size: 64
|
|
ruvector.prefetch_distance: 4
|
|
|
|
# Memory-constrained
|
|
ruvector.quantization: 'binary'
|
|
ruvector.shared_cache_size: '64MB'
|
|
ruvector.enable_mmap: on
|
|
```
|
|
|
|
## Performance Monitoring
|
|
|
|
```sql
|
|
-- View SIMD statistics
|
|
SELECT * FROM ruvector_simd_stats();
|
|
|
|
-- Memory usage
|
|
SELECT * FROM ruvector_memory_stats();
|
|
|
|
-- Cache hit rates
|
|
SELECT * FROM ruvector_cache_stats();
|
|
|
|
-- Query performance
|
|
SELECT * FROM ruvector_query_stats()
|
|
ORDER BY total_time DESC
|
|
LIMIT 10;
|
|
```
|