Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

2026-02-28 14:39:40 -05:00
parent 7885bf6278 d803bfe2b1
commit cd5943df23
7854 changed files with 3522914 additions and 0 deletions
--- a/vendor/ruvector/docs/guides/AGENTICDB_EMBEDDINGS_WARNING.md
+++ b/vendor/ruvector/docs/guides/AGENTICDB_EMBEDDINGS_WARNING.md
@@ -0,0 +1,474 @@
+# AgenticDB Embedding Limitation - MUST READ
+
+## ⚠️⚠️⚠️ CRITICAL WARNING ⚠️⚠️⚠️
+
+**AgenticDB currently uses PLACEHOLDER HASH-BASED EMBEDDINGS, not real semantic embeddings.**
+
+## What This Means
+
+The current `generate_text_embedding()` function creates embeddings using a simple hash that does NOT understand semantic meaning:
+
+### ❌ What DOESN'T Work
+- Semantic similarity: "dog" and "cat" are NOT similar
+- Synonyms: "happy" and "joyful" are NOT similar
+- Related concepts: "car" and "automobile" are NOT similar
+- Paraphrasing: "I like pizza" and "Pizza is my favorite" are NOT similar
+
+### ✅ What "Works" (But Shouldn't)
+- Character similarity: "dog" and "god" ARE similar (same letters)
+- Typos: "teh" and "the" ARE similar (close characters)
+- This is NOT semantic search - it's character overlap!
+
+## Why This Exists
+
+The placeholder embedding allows:
+1. Testing the AgenticDB API structure
+2. Demonstrating the API usage patterns
+3. Running benchmarks on vector operations
+4. Developing without external dependencies
+
+**But it should NEVER be used for production semantic search.**
+
+## Production Integration - Choose ONE
+
+### Option 1: ONNX Runtime (Recommended ⭐)
+
+**Best for**: Production deployments, cross-platform compatibility
+
+```rust
+use ort::{Session, Environment, Value, TensorRTExecutionProvider};
+use tokenizers::Tokenizer;
+
+pub struct OnnxEmbedder {
+    session: Session,
+    tokenizer: Tokenizer,
+}
+
+impl OnnxEmbedder {
+    pub fn new(model_path: &str, tokenizer_path: &str) -> Result<Self> {
+        let environment = Environment::builder()
+            .with_name("embeddings")
+            .with_execution_providers([TensorRTExecutionProvider::default().build()])
+            .build()?;
+
+        let session = Session::builder()?
+            .with_optimization_level(ort::GraphOptimizationLevel::Level3)?
+            .with_intra_threads(4)?
+            .with_model_from_file(model_path)?;
+
+        let tokenizer = Tokenizer::from_file(tokenizer_path)?;
+
+        Ok(Self { session, tokenizer })
+    }
+
+    pub fn generate_text_embedding(&self, text: &str) -> Result<Vec<f32>> {
+        let encoding = self.tokenizer.encode(text, true)?;
+        let input_ids = encoding.get_ids();
+        let attention_mask = encoding.get_attention_mask();
+
+        let input_ids_array = ndarray::Array2::from_shape_vec(
+            (1, input_ids.len()),
+            input_ids.iter().map(|&x| x as i64).collect()
+        )?;
+
+        let attention_mask_array = ndarray::Array2::from_shape_vec(
+            (1, attention_mask.len()),
+            attention_mask.iter().map(|&x| x as i64).collect()
+        )?;
+
+        let outputs = self.session.run(ort::inputs![
+            "input_ids" => Value::from_array(self.session.allocator(), &input_ids_array)?,
+            "attention_mask" => Value::from_array(self.session.allocator(), &attention_mask_array)?
+        ])?;
+
+        let embeddings: ort::OrtOwnedTensor<f32, _> = outputs["last_hidden_state"].try_extract()?;
+        let embeddings = embeddings.view();
+
+        // Mean pooling
+        let embedding_vec = embeddings
+            .mean_axis(ndarray::Axis(1))
+            .unwrap()
+            .to_vec();
+
+        Ok(embedding_vec)
+    }
+}
+
+// Replace AgenticDB's generate_text_embedding:
+// 1. Add OnnxEmbedder field to AgenticDB struct
+// 2. Initialize in new()
+// 3. Call embedder.generate_text_embedding(text) instead of hash
+```
+
+**Models to use**:
+- `all-MiniLM-L6-v2` (384 dims, fast, good quality)
+- `all-mpnet-base-v2` (768 dims, higher quality)
+- `gte-small` (384 dims, multilingual)
+
+**Get ONNX models**:
+```bash
+python -m pip install optimum[onnxruntime]
+optimum-cli export onnx --model sentence-transformers/all-MiniLM-L6-v2 all-MiniLM-L6-v2-onnx/
+```
+
+---
+
+### Option 2: Candle (Pure Rust)
+
+**Best for**: Native Rust deployments, no Python dependencies
+
+```rust
+use candle_core::{Device, Tensor};
+use candle_nn::VarBuilder;
+use candle_transformers::models::bert::{BertModel, Config as BertConfig};
+
+pub struct CandleEmbedder {
+    model: BertModel,
+    tokenizer: tokenizers::Tokenizer,
+    device: Device,
+}
+
+impl CandleEmbedder {
+    pub fn new(model_path: &str, tokenizer_path: &str) -> Result<Self> {
+        let device = Device::cuda_if_available(0)?;
+
+        let config = BertConfig::default();
+        let vb = VarBuilder::from_pth(model_path, candle_core::DType::F32, &device)?;
+        let model = BertModel::load(vb, &config)?;
+
+        let tokenizer = tokenizers::Tokenizer::from_file(tokenizer_path)?;
+
+        Ok(Self { model, tokenizer, device })
+    }
+
+    pub fn generate_text_embedding(&self, text: &str) -> Result<Vec<f32>> {
+        let encoding = self.tokenizer.encode(text, true)?;
+
+        let input_ids = Tensor::new(
+            encoding.get_ids(),
+            &self.device
+        )?.unsqueeze(0)?;
+
+        let token_type_ids = Tensor::zeros(
+            (1, encoding.get_ids().len()),
+            candle_core::DType::U32,
+            &self.device
+        )?;
+
+        let embeddings = self.model.forward(&input_ids, &token_type_ids)?;
+
+        // Mean pooling
+        let embedding_vec = embeddings
+            .mean(1)?
+            .to_vec1::<f32>()?;
+
+        Ok(embedding_vec)
+    }
+}
+```
+
+**Dependencies**:
+```toml
+[dependencies]
+candle-core = "0.3"
+candle-nn = "0.3"
+candle-transformers = "0.3"
+```
+
+---
+
+### Option 3: API-based (OpenAI, Cohere, Anthropic)
+
+**Best for**: Quick prototyping, cloud deployments
+
+#### OpenAI
+
+```rust
+use reqwest;
+use serde_json::json;
+
+pub struct OpenAIEmbedder {
+    client: reqwest::Client,
+    api_key: String,
+}
+
+impl OpenAIEmbedder {
+    pub fn new(api_key: String) -> Self {
+        Self {
+            client: reqwest::Client::new(),
+            api_key,
+        }
+    }
+
+    pub async fn generate_text_embedding(&self, text: &str) -> Result<Vec<f32>> {
+        let response = self.client
+            .post("https://api.openai.com/v1/embeddings")
+            .header("Authorization", format!("Bearer {}", self.api_key))
+            .json(&json!({
+                "model": "text-embedding-3-small",
+                "input": text,
+            }))
+            .send()
+            .await?;
+
+        let json: serde_json::Value = response.json().await?;
+        let embeddings = json["data"][0]["embedding"]
+            .as_array()
+            .unwrap()
+            .iter()
+            .map(|v| v.as_f64().unwrap() as f32)
+            .collect();
+
+        Ok(embeddings)
+    }
+}
+```
+
+**Costs** (as of 2024):
+- `text-embedding-3-small`: $0.02 / 1M tokens
+- `text-embedding-3-large`: $0.13 / 1M tokens
+
+#### Cohere
+
+```rust
+pub async fn generate_text_embedding(&self, text: &str) -> Result<Vec<f32>> {
+    let response = self.client
+        .post("https://api.cohere.ai/v1/embed")
+        .header("Authorization", format!("Bearer {}", self.api_key))
+        .json(&json!({
+            "model": "embed-english-v3.0",
+            "texts": [text],
+            "input_type": "search_query",
+        }))
+        .send()
+        .await?;
+
+    let json: serde_json::Value = response.json().await?;
+    let embeddings = json["embeddings"][0]
+        .as_array()
+        .unwrap()
+        .iter()
+        .map(|v| v.as_f64().unwrap() as f32)
+        .collect();
+
+    Ok(embeddings)
+}
+```
+
+**Costs**: $0.10 / 1M tokens
+
+---
+
+### Option 4: Python Bindings (sentence-transformers)
+
+**Best for**: Leveraging existing Python ML ecosystem
+
+```rust
+use pyo3::prelude::*;
+use pyo3::types::PyModule;
+use numpy::PyArray1;
+
+pub struct PythonEmbedder {
+    model: Py<PyAny>,
+}
+
+impl PythonEmbedder {
+    pub fn new(model_name: &str) -> Result<Self> {
+        Python::with_gil(|py| {
+            let sentence_transformers = PyModule::import(py, "sentence_transformers")?;
+            let model = sentence_transformers
+                .getattr("SentenceTransformer")?
+                .call1((model_name,))?;
+
+            Ok(Self {
+                model: model.into(),
+            })
+        })
+    }
+
+    pub fn generate_text_embedding(&self, text: &str) -> Result<Vec<f32>> {
+        Python::with_gil(|py| {
+            let embeddings = self.model
+                .call_method1(py, "encode", (text,))?
+                .extract::<&PyArray1<f32>>(py)?;
+
+            Ok(embeddings.to_vec()?)
+        })
+    }
+}
+```
+
+**Dependencies**:
+```toml
+[dependencies]
+pyo3 = { version = "0.20", features = ["extension-module"] }
+numpy = "0.20"
+```
+
+**Python setup**:
+```bash
+pip install sentence-transformers
+```
+
+---
+
+## Integration Steps
+
+### 1. Choose Your Approach
+
+Pick one of the 4 options above based on your requirements:
+- **ONNX**: Best balance of performance and compatibility ⭐
+- **Candle**: Pure Rust, no external runtime
+- **API**: Fastest to prototype, pay per use
+- **Python**: Maximum flexibility with ML libraries
+
+### 2. Update AgenticDB Struct
+
+```rust
+pub struct AgenticDB {
+    vector_db: Arc<VectorDB>,
+    db: Arc<Database>,
+    dimensions: usize,
+    embedder: Arc<dyn Embedder>, // Add this
+}
+```
+
+### 3. Create Embedder Trait
+
+```rust
+pub trait Embedder: Send + Sync {
+    fn generate_text_embedding(&self, text: &str) -> Result<Vec<f32>>;
+}
+
+// Implement for each option:
+impl Embedder for OnnxEmbedder { /* ... */ }
+impl Embedder for CandleEmbedder { /* ... */ }
+impl Embedder for OpenAIEmbedder { /* ... */ }
+impl Embedder for PythonEmbedder { /* ... */ }
+```
+
+### 4. Update Constructor
+
+```rust
+impl AgenticDB {
+    pub fn new(options: DbOptions, embedder: Arc<dyn Embedder>) -> Result<Self> {
+        // ... existing code ...
+        Ok(Self {
+            vector_db,
+            db,
+            dimensions: options.dimensions,
+            embedder, // Use provided embedder
+        })
+    }
+}
+```
+
+### 5. Replace Hash Implementation
+
+```rust
+fn generate_text_embedding(&self, text: &str) -> Result<Vec<f32>> {
+    self.embedder.generate_text_embedding(text)
+}
+```
+
+### 6. Update Tests
+
+```rust
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    struct MockEmbedder;
+    impl Embedder for MockEmbedder {
+        fn generate_text_embedding(&self, text: &str) -> Result<Vec<f32>> {
+            // Use hash for tests only
+            // ... hash implementation ...
+        }
+    }
+
+    fn create_test_db() -> Result<AgenticDB> {
+        let embedder = Arc::new(MockEmbedder);
+        AgenticDB::new(options, embedder)
+    }
+}
+```
+
+---
+
+## Verification
+
+After integration, verify semantic search works:
+
+```rust
+#[test]
+fn test_semantic_similarity() {
+    let db = create_db_with_real_embeddings()?;
+
+    // These should be similar with real embeddings
+    let skill1 = db.create_skill(
+        "Dog Handler".to_string(),
+        "Take care of dogs".to_string(),
+        HashMap::new(),
+        vec![],
+    )?;
+
+    let skill2 = db.create_skill(
+        "Cat Handler".to_string(),
+        "Take care of cats".to_string(),
+        HashMap::new(),
+        vec![],
+    )?;
+
+    // Search with semantic query
+    let results = db.search_skills("pet care", 5)?;
+
+    // Both should be found because "pet care" is semantically similar
+    // to both "take care of dogs" and "take care of cats"
+    assert!(results.len() >= 2);
+
+    // With hash embeddings, this would likely fail!
+}
+```
+
+---
+
+## Performance Considerations
+
+| Method | Latency | Cost | Offline | Quality |
+|--------|---------|------|---------|---------|
+| **ONNX** | ~5-20ms | Free | ✅ | ⭐⭐⭐⭐ |
+| **Candle** | ~10-30ms | Free | ✅ | ⭐⭐⭐⭐ |
+| **OpenAI API** | ~100-300ms | $0.02/1M tokens | ❌ | ⭐⭐⭐⭐⭐ |
+| **Cohere API** | ~100-300ms | $0.10/1M tokens | ❌ | ⭐⭐⭐⭐ |
+| **Python** | ~5-20ms | Free | ✅ | ⭐⭐⭐⭐ |
+| **Hash (current)** | ~0.1ms | Free | ✅ | ❌ |
+
+---
+
+## Feature Flag (Future)
+
+We plan to add a compile-time check:
+
+```rust
+#[cfg(not(feature = "real-embeddings"))]
+compile_error!(
+    "AgenticDB requires 'real-embeddings' feature for production use. \
+     Current placeholder embeddings do NOT provide semantic search. \
+     Enable with: cargo build --features real-embeddings"
+);
+```
+
+---
+
+## Conclusion
+
+**DO NOT use the current AgenticDB implementation for semantic search in production.**
+
+The placeholder embeddings are ONLY suitable for:
+- API structure testing
+- Performance benchmarking (vector operations)
+- Development without external dependencies
+
+For any real semantic search use case, integrate one of the four real embedding options above.
+
+**See `/examples/onnx-embeddings` for a complete ONNX integration example.**