Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,32 @@
//! Basic embedding example demonstrating single text embedding
use anyhow::Result;
use ruvector_onnx_embeddings::{Embedder, EmbedderConfig, PretrainedModel};
#[tokio::main]
async fn main() -> Result<()> {
    // Build an embedder backed by the all-MiniLM-L6-v2 pretrained model.
    // `Embedder::new` is async because it may download/load the ONNX model.
    let config = EmbedderConfig::pretrained(PretrainedModel::AllMiniLmL6V2);
    let mut embedder = Embedder::new(config).await?;

    // Embed a single piece of text into a dense vector.
    let text = "Hello, RuVector!";
    let embedding = embedder.embed_one(text)?;

    println!("Text: {}", text);
    println!("Embedding dimension: {}", embedding.len());
    // Fix: the original `&embedding[..10]` panics for any model whose
    // embedding dimension is below 10; clamp the preview length instead.
    let preview = embedding.len().min(10);
    println!("First 10 values: {:?}", &embedding[..preview]);

    // Compare the source text against a paraphrase and an unrelated
    // sentence; a good model scores the paraphrase higher.
    let similar_text = "Greetings, RuVector!";
    let different_text = "The weather is sunny.";
    let sim1 = embedder.similarity(text, similar_text)?;
    let sim2 = embedder.similarity(text, different_text)?;

    println!("\nSimilarity scores:");
    println!("  '{}' <-> '{}': {:.4}", text, similar_text, sim1);
    println!("  '{}' <-> '{}': {:.4}", text, different_text, sim2);
    Ok(())
}

View File

@@ -0,0 +1,53 @@
//! Batch embedding example with parallel processing
use anyhow::Result;
use ruvector_onnx_embeddings::{
EmbedderBuilder, PretrainedModel, PoolingStrategy,
};
use std::time::Instant;
#[tokio::main]
async fn main() -> Result<()> {
    // Build an embedder with explicit pooling, normalization, batch-size and
    // sequence-length settings via the builder API.
    let mut embedder = EmbedderBuilder::new()
        .pretrained(PretrainedModel::AllMiniLmL6V2)
        .pooling(PoolingStrategy::Mean)
        .normalize(true)
        .batch_size(32)
        .max_length(256)
        .build()
        .await?;

    // Generate a synthetic corpus of 100 sentences to benchmark against.
    let texts: Vec<String> = (0..100)
        .map(|i| format!("This is test sentence number {} for batch embedding.", i))
        .collect();
    println!("Embedding {} texts...", texts.len());

    // Sequential embedding: one timed pass through `embed`.
    let start = Instant::now();
    let output = embedder.embed(&texts)?;
    let seq_time = start.elapsed();
    println!("Sequential: {:?} ({:.2} texts/sec)",
        seq_time,
        texts.len() as f64 / seq_time.as_secs_f64()
    );

    // Parallel embedding: same corpus through `embed_parallel`.
    let start = Instant::now();
    let output_parallel = embedder.embed_parallel(&texts)?;
    let par_time = start.elapsed();
    println!("Parallel: {:?} ({:.2} texts/sec)",
        par_time,
        texts.len() as f64 / par_time.as_secs_f64()
    );

    // Fix: `output_parallel` was previously unused (dead binding + compiler
    // warning). Use it to sanity-check that both code paths embedded the
    // same number of texts before reporting the speedup.
    assert_eq!(
        output.len(),
        output_parallel.len(),
        "sequential and parallel runs should produce the same number of embeddings"
    );

    println!("\nSpeedup: {:.2}x", seq_time.as_secs_f64() / par_time.as_secs_f64());
    println!("Total embeddings: {}", output.len());
    println!("Dimension: {}", output.dimension);
    Ok(())
}

View File

@@ -0,0 +1,87 @@
//! Semantic search example using RuVector integration
use anyhow::Result;
use ruvector_onnx_embeddings::{
Embedder, RuVectorEmbeddings, IndexConfig, Distance,
};
// End-to-end semantic search demo: embed a small document corpus into a
// RuVector index, run natural-language queries against it, then find the
// documents nearest to an existing document.
#[tokio::main]
async fn main() -> Result<()> {
println!("=== Semantic Search with RuVector ONNX Embeddings ===\n");
// Load the crate's default pretrained model (async: may fetch/load weights).
let embedder = Embedder::default_model().await?;
println!("Loaded model with dimension: {}", embedder.dimension());
// Index configuration: cosine distance, capacity 100k vectors.
// NOTE(review): `ef_search` looks like an HNSW search-beam width — confirm
// against `IndexConfig` docs; its effect is not visible from this file.
let config = IndexConfig {
distance: Distance::Cosine,
max_elements: 100_000,
ef_search: 100,
};
// Index named "semantic_docs"; takes ownership of the embedder so every
// insert/search below embeds its text transparently.
let index = RuVectorEmbeddings::new("semantic_docs", embedder, config)?;
// Sample document corpus: (id, text) pairs covering distinct tech topics.
let documents = vec![
("doc1", "Rust provides memory safety without garbage collection through its ownership system."),
("doc2", "Python's simplicity makes it ideal for beginners learning programming."),
("doc3", "JavaScript dominates web development with frameworks like React and Vue."),
("doc4", "Machine learning models can be trained using TensorFlow or PyTorch."),
("doc5", "Docker containers provide consistent deployment environments."),
("doc6", "Kubernetes orchestrates containerized applications at scale."),
("doc7", "GraphQL offers a more efficient alternative to REST APIs."),
("doc8", "PostgreSQL is a powerful open-source relational database."),
("doc9", "Redis provides in-memory data storage for caching."),
("doc10", "Elasticsearch enables full-text search across large datasets."),
];
// Insert each document with a JSON metadata payload carrying its id so
// search hits can be traced back to the source document.
println!("Indexing {} documents...", documents.len());
for (id, content) in &documents {
let metadata = serde_json::json!({ "doc_id": id });
index.insert(content, Some(metadata))?;
}
println!("Index contains {} vectors\n", index.len());
// Run four natural-language queries, printing the top-3 hits for each.
let queries = vec![
"How can I ensure memory safety in my code?",
"What's the best language for web applications?",
"How do I deploy applications in containers?",
"I need a fast database for caching",
];
for query in queries {
println!("🔍 Query: \"{}\"\n", query);
let results = index.search(query, 3)?;
// Assumes results come back ordered best-first — TODO confirm in
// `RuVectorEmbeddings::search` docs.
for (rank, result) in results.iter().enumerate() {
println!("  {}. [Score: {:.4}]", rank + 1, result.score);
println!("     {}", result.text);
// Metadata is optional; echo the doc id when present.
if let Some(meta) = &result.metadata {
if let Some(doc_id) = meta.get("doc_id") {
println!("       ({})", doc_id);
}
}
println!();
}
println!("{}\n", "-".repeat(70));
}
// Document-to-document similarity: query the index with the text of an
// already-indexed document and list its nearest neighbours.
println!("=== Finding Similar Documents ===\n");
let query_doc = documents[0].1; // Rust document
println!("Finding documents similar to:\n\"{}\"\n", query_doc);
// Ask for 4 results because the document itself is expected to come back
// as the top hit and is skipped below.
let similar = index.search(query_doc, 4)?;
for (i, result) in similar.iter().skip(1).enumerate() {
// Skip first (self)
println!("  {}. [Score: {:.4}] {}", i + 1, result.score, result.text);
}
Ok(())
}