Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,19 @@
[package]
name = "ruvector-filter"
version.workspace = true
edition.workspace = true
license.workspace = true
authors.workspace = true
repository.workspace = true
readme = "README.md"
description = "Advanced metadata filtering for Ruvector vector search"
[dependencies]
ruvector-core = { version = "2.0.2", path = "../ruvector-core" }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
dashmap = { workspace = true }
uuid = { workspace = true }
chrono = { workspace = true }
ordered-float = "4.5"

View File

@@ -0,0 +1,226 @@
# Ruvector Filter
[![Crates.io](https://img.shields.io/crates/v/ruvector-filter.svg)](https://crates.io/crates/ruvector-filter)
[![Documentation](https://docs.rs/ruvector-filter/badge.svg)](https://docs.rs/ruvector-filter)
[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Rust](https://img.shields.io/badge/rust-1.77%2B-orange.svg)](https://www.rust-lang.org)
**Advanced metadata filtering for Ruvector vector search.**
`ruvector-filter` provides a powerful filter expression language for combining vector similarity search with metadata constraints. Supports complex boolean expressions, range queries, and efficient filter evaluation. Part of the [Ruvector](https://github.com/ruvnet/ruvector) ecosystem.
## Why Ruvector Filter?
- **Rich Expressions**: Complex boolean filter expressions
- **Type-Safe**: Strongly typed filter operations
- **Optimized**: Filter pushdown for efficient evaluation
- **Extensible**: Custom filter operators
- **JSON Compatible**: Easy integration with JSON metadata
## Features
### Core Capabilities
- **Comparison Operators**: `=`, `!=`, `<`, `>`, `<=`, `>=`
- **Boolean Logic**: `AND`, `OR`, `NOT`
- **Range Queries**: `BETWEEN`, `IN`
- **String Matching**: `CONTAINS`, `STARTS_WITH`, `ENDS_WITH`
- **Null Handling**: `IS NULL`, `IS NOT NULL`
### Advanced Features
- **Nested Fields**: Filter on nested JSON properties
- **Array Operations**: `ANY`, `ALL`, `NONE` on arrays
- **Regex Matching**: Pattern-based string filtering
- **Geo Filters**: Radius (haversine distance) and bounding-box filters, evaluated against geo payload indices
- **Custom Functions**: Extensible filter functions
## Installation
Add `ruvector-filter` to your `Cargo.toml`:
```toml
[dependencies]
ruvector-filter = "0.1.1"
```
## Quick Start
### Basic Filtering
```rust
use ruvector_filter::{Filter, FilterBuilder};
// Build filter expression
let filter = FilterBuilder::new()
.field("category").eq("electronics")
.and()
.field("price").lt(1000.0)
.build()?;
// Apply to search
let results = db.search(SearchQuery {
vector: query_vec,
k: 10,
filter: Some(filter),
..Default::default()
})?;
```
### Complex Expressions
```rust
use ruvector_filter::{Filter, FilterExpr, op};
// Complex boolean expression
let filter = op::and(vec![
op::eq("status", "active"),
op::or(vec![
op::gt("priority", 5),
op::in_("tags", vec!["urgent", "important"]),
]),
op::not(op::eq("archived", true)),
]);
// Range query
let filter = op::and(vec![
op::between("price", 100.0, 500.0),
op::between("created_at", "2024-01-01", "2024-12-31"),
]);
```
### String Matching
```rust
use ruvector_filter::op;
// String operations
let filter = op::and(vec![
op::contains("description", "machine learning"),
op::starts_with("name", "Project"),
op::regex("email", r".*@company\.com"),
]);
```
### Nested Field Access
```rust
use ruvector_filter::op;
// Access nested JSON fields
let filter = op::and(vec![
op::eq("user.role", "admin"),
op::gt("metadata.views", 1000),
op::in_("settings.theme", vec!["dark", "light"]),
]);
```
## API Overview
### Core Types
```rust
// Filter expression
pub enum FilterExpr {
// Comparison
Eq(String, Value),
Ne(String, Value),
Lt(String, Value),
Gt(String, Value),
Le(String, Value),
Ge(String, Value),
// Boolean
And(Vec<FilterExpr>),
Or(Vec<FilterExpr>),
Not(Box<FilterExpr>),
// Range
Between(String, Value, Value),
In(String, Vec<Value>),
// String
Contains(String, String),
StartsWith(String, String),
EndsWith(String, String),
Regex(String, String),
// Null
IsNull(String),
IsNotNull(String),
}
// Filter builder
pub struct FilterBuilder { /* ... */ }
```
### Filter Operations
```rust
// Convenience functions in `op` module
pub mod op {
pub fn eq(field: &str, value: impl Into<Value>) -> FilterExpr;
pub fn ne(field: &str, value: impl Into<Value>) -> FilterExpr;
pub fn lt(field: &str, value: impl Into<Value>) -> FilterExpr;
pub fn gt(field: &str, value: impl Into<Value>) -> FilterExpr;
pub fn le(field: &str, value: impl Into<Value>) -> FilterExpr;
pub fn ge(field: &str, value: impl Into<Value>) -> FilterExpr;
pub fn and(exprs: Vec<FilterExpr>) -> FilterExpr;
pub fn or(exprs: Vec<FilterExpr>) -> FilterExpr;
pub fn not(expr: FilterExpr) -> FilterExpr;
pub fn between(field: &str, min: impl Into<Value>, max: impl Into<Value>) -> FilterExpr;
pub fn in_(field: &str, values: Vec<impl Into<Value>>) -> FilterExpr;
pub fn contains(field: &str, substring: &str) -> FilterExpr;
pub fn starts_with(field: &str, prefix: &str) -> FilterExpr;
pub fn ends_with(field: &str, suffix: &str) -> FilterExpr;
pub fn regex(field: &str, pattern: &str) -> FilterExpr;
}
```
### Filter Evaluation
```rust
impl FilterExpr {
pub fn evaluate(&self, metadata: &serde_json::Value) -> bool;
pub fn optimize(&self) -> FilterExpr;
pub fn to_json(&self) -> serde_json::Value;
pub fn from_json(json: &serde_json::Value) -> Result<Self>;
}
```
## Performance Tips
1. **Put most selective filters first** in AND expressions
2. **Use IN instead of multiple OR** for equality checks
3. **Avoid regex when possible** - use prefix/suffix matching
4. **Index frequently filtered fields** in your metadata
## Related Crates
- **[ruvector-core](../ruvector-core/)** - Core vector database engine
- **[ruvector-collections](../ruvector-collections/)** - Collection management
## Documentation
- **[Main README](../../README.md)** - Complete project overview
- **[API Documentation](https://docs.rs/ruvector-filter)** - Full API reference
- **[GitHub Repository](https://github.com/ruvnet/ruvector)** - Source code
## License
**MIT License** - see [LICENSE](../../LICENSE) for details.
---
<div align="center">
**Part of [Ruvector](https://github.com/ruvnet/ruvector) - Built by [rUv](https://ruv.io)**
[![Star on GitHub](https://img.shields.io/github/stars/ruvnet/ruvector?style=social)](https://github.com/ruvnet/ruvector)
[Documentation](https://docs.rs/ruvector-filter) | [Crates.io](https://crates.io/crates/ruvector-filter) | [GitHub](https://github.com/ruvnet/ruvector)
</div>

View File

@@ -0,0 +1,37 @@
use thiserror::Error;
/// Errors that can occur during filter operations.
///
/// The `Display` text of every variant comes from its `#[error(...)]`
/// attribute (generated by `thiserror`).
#[derive(Error, Debug)]
pub enum FilterError {
/// No payload index is registered for the named field.
#[error("Index not found for field: {0}")]
IndexNotFound(String),
/// An index exists for the field, but its kind cannot serve the requested
/// operation (for example an ordering comparison on a keyword index).
#[error("Invalid index type for field: {0}")]
InvalidIndexType(String),
/// A value had a different type than the expression expected.
#[error("Type mismatch in filter expression: expected {expected}, got {actual}")]
TypeMismatch { expected: String, actual: String },
/// The filter expression is malformed. Also returned by the index manager
/// when an index is created twice for the same field.
#[error("Invalid filter expression: {0}")]
InvalidExpression(String),
/// The payload does not contain the referenced field.
#[error("Field not found in payload: {0}")]
FieldNotFound(String),
/// A value is not acceptable for the operation it was used with.
#[error("Invalid value for operation: {0}")]
InvalidValue(String),
/// A geo operation failed.
#[error("Geo operation error: {0}")]
GeoError(String),
/// JSON (de)serialization failure; converted automatically via `From`.
#[error("JSON error: {0}")]
JsonError(#[from] serde_json::Error),
/// I/O failure; converted automatically via `From`.
#[error("IO error: {0}")]
IoError(#[from] std::io::Error),
/// Input could not be parsed.
#[error("Parse error: {0}")]
ParseError(String),
}
/// Crate-wide result alias whose error type is [`FilterError`].
pub type Result<T> = std::result::Result<T, FilterError>;

View File

@@ -0,0 +1,593 @@
use crate::error::{FilterError, Result};
use crate::expression::FilterExpression;
use crate::index::{PayloadIndex, PayloadIndexManager};
use ordered_float::OrderedFloat;
use serde_json::Value;
use std::collections::HashSet;
/// Evaluates filter expressions against payload indices.
///
/// The evaluator holds a shared borrow of a [`PayloadIndexManager`] for the
/// lifetime `'a` and never mutates it.
pub struct FilterEvaluator<'a> {
// Indices consulted by the index-based `evaluate*` methods.
indices: &'a PayloadIndexManager,
}
impl<'a> FilterEvaluator<'a> {
/// Create a new filter evaluator that reads from `indices`.
///
/// The manager is only borrowed; nothing is copied.
pub fn new(indices: &'a PayloadIndexManager) -> Self {
Self { indices }
}
/// Resolve a filter expression to the set of matching vector IDs by
/// consulting the payload indices.
///
/// Each variant is handed to its dedicated `evaluate_*` helper; the logical
/// combinators recurse back into this method through those helpers.
///
/// # Errors
/// Propagates the helper's error, e.g. [`FilterError::IndexNotFound`] when a
/// referenced field has no index, or [`FilterError::InvalidIndexType`] when
/// the index kind cannot serve the operation.
pub fn evaluate(&self, filter: &FilterExpression) -> Result<HashSet<String>> {
    match filter {
        // Logical combinators.
        FilterExpression::And(children) => self.evaluate_and(children),
        FilterExpression::Or(children) => self.evaluate_or(children),
        FilterExpression::Not(inner) => self.evaluate_not(inner),

        // Presence checks.
        FilterExpression::Exists { field } => self.evaluate_exists(field),
        FilterExpression::IsNull { field } => self.evaluate_is_null(field),

        // Equality and ordering comparisons.
        FilterExpression::Eq { field, value } => self.evaluate_eq(field, value),
        FilterExpression::Ne { field, value } => self.evaluate_ne(field, value),
        FilterExpression::Lt { field, value } => self.evaluate_lt(field, value),
        FilterExpression::Lte { field, value } => self.evaluate_lte(field, value),
        FilterExpression::Gt { field, value } => self.evaluate_gt(field, value),
        FilterExpression::Gte { field, value } => self.evaluate_gte(field, value),

        // Range, membership and text.
        FilterExpression::Range { field, gte, lte } => {
            let lower = gte.as_ref();
            let upper = lte.as_ref();
            self.evaluate_range(field, lower, upper)
        }
        FilterExpression::In { field, values } => self.evaluate_in(field, values),
        FilterExpression::Match { field, text } => self.evaluate_match(field, text),

        // Geo.
        FilterExpression::GeoBoundingBox {
            field,
            top_left,
            bottom_right,
        } => self.evaluate_geo_bbox(field, *top_left, *bottom_right),
        FilterExpression::GeoRadius {
            field,
            lat,
            lon,
            radius_m,
        } => self.evaluate_geo_radius(field, *lat, *lon, *radius_m),
    }
}
/// Check if a payload matches a filter expression
pub fn matches(&self, payload: &Value, filter: &FilterExpression) -> bool {
match filter {
FilterExpression::Eq { field, value } => {
Self::get_field_value(payload, field).map_or(false, |v| v == value)
}
FilterExpression::Ne { field, value } => {
Self::get_field_value(payload, field).map_or(true, |v| v != value)
}
FilterExpression::Gt { field, value } => Self::get_field_value(payload, field)
.map_or(false, |v| {
Self::compare_values(v, value) == Some(std::cmp::Ordering::Greater)
}),
FilterExpression::Gte { field, value } => {
Self::get_field_value(payload, field).map_or(false, |v| {
matches!(
Self::compare_values(v, value),
Some(std::cmp::Ordering::Greater | std::cmp::Ordering::Equal)
)
})
}
FilterExpression::Lt { field, value } => Self::get_field_value(payload, field)
.map_or(false, |v| {
Self::compare_values(v, value) == Some(std::cmp::Ordering::Less)
}),
FilterExpression::Lte { field, value } => {
Self::get_field_value(payload, field).map_or(false, |v| {
matches!(
Self::compare_values(v, value),
Some(std::cmp::Ordering::Less | std::cmp::Ordering::Equal)
)
})
}
FilterExpression::Range { field, gte, lte } => {
if let Some(v) = Self::get_field_value(payload, field) {
let gte_match = gte.as_ref().map_or(true, |gte_val| {
matches!(
Self::compare_values(v, gte_val),
Some(std::cmp::Ordering::Greater | std::cmp::Ordering::Equal)
)
});
let lte_match = lte.as_ref().map_or(true, |lte_val| {
matches!(
Self::compare_values(v, lte_val),
Some(std::cmp::Ordering::Less | std::cmp::Ordering::Equal)
)
});
gte_match && lte_match
} else {
false
}
}
FilterExpression::In { field, values } => {
Self::get_field_value(payload, field).map_or(false, |v| values.contains(v))
}
FilterExpression::Match { field, text } => Self::get_field_value(payload, field)
.and_then(|v| v.as_str())
.map_or(false, |s| s.to_lowercase().contains(&text.to_lowercase())),
FilterExpression::And(filters) => filters.iter().all(|f| self.matches(payload, f)),
FilterExpression::Or(filters) => filters.iter().any(|f| self.matches(payload, f)),
FilterExpression::Not(filter) => !self.matches(payload, filter),
FilterExpression::Exists { field } => Self::get_field_value(payload, field).is_some(),
FilterExpression::IsNull { field } => {
Self::get_field_value(payload, field).map_or(true, |v| v.is_null())
}
_ => false, // Geo operations not supported in direct matching
}
}
/// Look up the IDs whose indexed value equals `value`.
///
/// Supported on integer, float, keyword and bool indices. When `value` is not
/// of the index's type, or no entry exists for it, the result is empty.
///
/// # Errors
/// [`FilterError::IndexNotFound`] if `field` has no index;
/// [`FilterError::InvalidIndexType`] for any other index kind.
fn evaluate_eq(&self, field: &str, value: &Value) -> Result<HashSet<String>> {
    let hits = match self.indices.get_index(field) {
        None => return Err(FilterError::IndexNotFound(field.to_string())),
        Some(PayloadIndex::Integer(by_value)) => {
            value.as_i64().and_then(|key| by_value.get(&key))
        }
        Some(PayloadIndex::Float(by_value)) => value
            .as_f64()
            .and_then(|key| by_value.get(&OrderedFloat(key))),
        Some(PayloadIndex::Keyword(by_value)) => {
            value.as_str().and_then(|key| by_value.get(key))
        }
        Some(PayloadIndex::Bool(by_value)) => {
            value.as_bool().and_then(|key| by_value.get(&key))
        }
        Some(_) => return Err(FilterError::InvalidIndexType(field.to_string())),
    };
    // Either the value had the wrong type or nothing is stored under it.
    Ok(hits.cloned().unwrap_or_default())
}
/// IDs indexed under `field` whose value is not equal to `value`.
///
/// Computed as "every ID present in the field's index" minus the equality
/// matches, so vectors that were never indexed for `field` are not returned.
fn evaluate_ne(&self, field: &str, value: &Value) -> Result<HashSet<String>> {
    let equal = self.evaluate_eq(field, value)?;
    let everything = self.get_all_ids_for_field(field)?;
    let remaining = everything
        .into_iter()
        .filter(|id| !equal.contains(id))
        .collect();
    Ok(remaining)
}
/// IDs whose indexed value is strictly greater than `value`.
///
/// Supported on integer and float indices. A `value` that is not of the
/// index's numeric type yields an empty set.
///
/// The lower bound is expressed as an exclusive range bound rather than
/// `num + 1`, which would overflow when `value` is `i64::MAX`.
///
/// # Errors
/// [`FilterError::IndexNotFound`] if `field` has no index;
/// [`FilterError::InvalidIndexType`] for non-numeric indices.
fn evaluate_gt(&self, field: &str, value: &Value) -> Result<HashSet<String>> {
    use std::ops::Bound::{Excluded, Unbounded};

    let index = self
        .indices
        .get_index(field)
        .ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
    match index {
        PayloadIndex::Integer(map) => {
            if let Some(num) = value.as_i64() {
                Ok(map
                    .range((Excluded(num), Unbounded))
                    .flat_map(|(_, ids)| ids)
                    .cloned()
                    .collect())
            } else {
                Ok(HashSet::new())
            }
        }
        PayloadIndex::Float(map) => {
            if let Some(num) = value.as_f64() {
                Ok(map
                    .range((Excluded(OrderedFloat(num)), Unbounded))
                    .flat_map(|(_, ids)| ids)
                    .cloned()
                    .collect())
            } else {
                Ok(HashSet::new())
            }
        }
        _ => Err(FilterError::InvalidIndexType(field.to_string())),
    }
}
/// IDs whose indexed value is greater than or equal to `value`.
///
/// Supported on integer and float indices; a `value` of the wrong type gives
/// an empty set.
///
/// # Errors
/// [`FilterError::IndexNotFound`] if `field` has no index;
/// [`FilterError::InvalidIndexType`] for non-numeric indices.
fn evaluate_gte(&self, field: &str, value: &Value) -> Result<HashSet<String>> {
    let index = self
        .indices
        .get_index(field)
        .ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
    let matched: HashSet<String> = match index {
        PayloadIndex::Integer(tree) => match value.as_i64() {
            Some(lower) => tree
                .range(lower..)
                .flat_map(|(_, ids)| ids.iter().cloned())
                .collect(),
            None => HashSet::new(),
        },
        PayloadIndex::Float(tree) => match value.as_f64() {
            Some(lower) => tree
                .range(OrderedFloat(lower)..)
                .flat_map(|(_, ids)| ids.iter().cloned())
                .collect(),
            None => HashSet::new(),
        },
        _ => return Err(FilterError::InvalidIndexType(field.to_string())),
    };
    Ok(matched)
}
/// IDs whose indexed value is strictly less than `value`.
///
/// Supported on integer and float indices; a `value` of the wrong type gives
/// an empty set.
///
/// # Errors
/// [`FilterError::IndexNotFound`] if `field` has no index;
/// [`FilterError::InvalidIndexType`] for non-numeric indices.
fn evaluate_lt(&self, field: &str, value: &Value) -> Result<HashSet<String>> {
    let index = self
        .indices
        .get_index(field)
        .ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
    match index {
        PayloadIndex::Integer(tree) => Ok(value.as_i64().map_or_else(HashSet::new, |upper| {
            tree.range(..upper)
                .flat_map(|(_, ids)| ids)
                .cloned()
                .collect()
        })),
        PayloadIndex::Float(tree) => Ok(value.as_f64().map_or_else(HashSet::new, |upper| {
            tree.range(..OrderedFloat(upper))
                .flat_map(|(_, ids)| ids)
                .cloned()
                .collect()
        })),
        _ => Err(FilterError::InvalidIndexType(field.to_string())),
    }
}
/// IDs whose indexed value is less than or equal to `value`.
///
/// Supported on integer and float indices; a `value` of the wrong type gives
/// an empty set.
///
/// # Errors
/// [`FilterError::IndexNotFound`] if `field` has no index;
/// [`FilterError::InvalidIndexType`] for non-numeric indices.
fn evaluate_lte(&self, field: &str, value: &Value) -> Result<HashSet<String>> {
    let index = match self.indices.get_index(field) {
        Some(index) => index,
        None => return Err(FilterError::IndexNotFound(field.to_string())),
    };

    let mut matched: HashSet<String> = HashSet::new();
    match index {
        PayloadIndex::Integer(tree) => {
            if let Some(upper) = value.as_i64() {
                for (_, ids) in tree.range(..=upper) {
                    matched.extend(ids.iter().cloned());
                }
            }
        }
        PayloadIndex::Float(tree) => {
            if let Some(upper) = value.as_f64() {
                for (_, ids) in tree.range(..=OrderedFloat(upper)) {
                    matched.extend(ids.iter().cloned());
                }
            }
        }
        _ => return Err(FilterError::InvalidIndexType(field.to_string())),
    }
    Ok(matched)
}
/// IDs whose indexed value lies within the inclusive bounds `gte..=lte`.
///
/// Either bound may be absent, in which case that side is unconstrained;
/// with both absent, every ID indexed under `field` is returned.
///
/// # Errors
/// [`FilterError::IndexNotFound`] if `field` has no index; errors from the
/// per-bound evaluation are propagated.
fn evaluate_range(
    &self,
    field: &str,
    gte: Option<&Value>,
    lte: Option<&Value>,
) -> Result<HashSet<String>> {
    // Start from everything indexed for the field, then narrow per bound.
    let mut candidates = self.get_all_ids_for_field(field)?;

    if let Some(lower) = gte {
        let at_or_above = self.evaluate_gte(field, lower)?;
        candidates.retain(|id| at_or_above.contains(id));
    }

    if let Some(upper) = lte {
        let at_or_below = self.evaluate_lte(field, upper)?;
        candidates.retain(|id| at_or_below.contains(id));
    }

    Ok(candidates)
}
/// IDs whose indexed value equals any of `values`.
///
/// Implemented as the union of one equality lookup per candidate value; the
/// first failing lookup aborts with its error.
fn evaluate_in(&self, field: &str, values: &[Value]) -> Result<HashSet<String>> {
    let mut union = HashSet::new();
    let lookups = values
        .iter()
        .map(|candidate| self.evaluate_eq(field, candidate));
    for hits in lookups {
        union.extend(hits?);
    }
    Ok(union)
}
/// IDs whose text index contains at least one word of `text`.
///
/// `text` is split on whitespace and lowercased; the result is the union of
/// the entries stored under each resulting word.
///
/// # Errors
/// [`FilterError::IndexNotFound`] if `field` has no index;
/// [`FilterError::InvalidIndexType`] if the index is not a text index.
fn evaluate_match(&self, field: &str, text: &str) -> Result<HashSet<String>> {
    let index = self
        .indices
        .get_index(field)
        .ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
    match index {
        PayloadIndex::Text(postings) => Ok(text
            .split_whitespace()
            .filter_map(|word| postings.get(&word.to_lowercase()))
            .flatten()
            .cloned()
            .collect()),
        _ => Err(FilterError::InvalidIndexType(field.to_string())),
    }
}
/// IDs of indexed points within `radius_m` metres of (`lat`, `lon`).
///
/// Scans every point of the geo index and keeps those whose haversine
/// distance to the centre is at most `radius_m`.
///
/// # Errors
/// [`FilterError::IndexNotFound`] if `field` has no index;
/// [`FilterError::InvalidIndexType`] if the index is not a geo index.
fn evaluate_geo_radius(
    &self,
    field: &str,
    lat: f64,
    lon: f64,
    radius_m: f64,
) -> Result<HashSet<String>> {
    let index = self
        .indices
        .get_index(field)
        .ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
    match index {
        PayloadIndex::Geo(points) => Ok(points
            .iter()
            .filter(|point| haversine_distance(lat, lon, point.1, point.2) <= radius_m)
            .map(|point| point.0.clone())
            .collect()),
        _ => Err(FilterError::InvalidIndexType(field.to_string())),
    }
}
/// IDs of indexed points inside the given bounding box (edges inclusive).
///
/// `top_left` is read as `(north latitude, west longitude)` and
/// `bottom_right` as `(south latitude, east longitude)`. A box whose west
/// edge is greater than its east edge matches nothing.
///
/// # Errors
/// [`FilterError::IndexNotFound`] if `field` has no index;
/// [`FilterError::InvalidIndexType`] if the index is not a geo index.
fn evaluate_geo_bbox(
    &self,
    field: &str,
    top_left: (f64, f64),
    bottom_right: (f64, f64),
) -> Result<HashSet<String>> {
    let index = self
        .indices
        .get_index(field)
        .ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
    match index {
        PayloadIndex::Geo(points) => {
            let (north, west) = top_left;
            let (south, east) = bottom_right;
            let lat_span = south..=north;
            let lon_span = west..=east;
            Ok(points
                .iter()
                .filter(|point| lat_span.contains(&point.1) && lon_span.contains(&point.2))
                .map(|point| point.0.clone())
                .collect())
        }
        _ => Err(FilterError::InvalidIndexType(field.to_string())),
    }
}
/// Intersection of the results of all `filters`.
///
/// An empty filter list yields an empty set. Evaluation stops as soon as the
/// running intersection becomes empty after combining with a later filter.
fn evaluate_and(&self, filters: &[FilterExpression]) -> Result<HashSet<String>> {
    let (first, rest) = match filters.split_first() {
        Some(parts) => parts,
        None => return Ok(HashSet::new()),
    };

    let mut common = self.evaluate(first)?;
    for filter in rest {
        let next = self.evaluate(filter)?;
        common.retain(|id| next.contains(id));
        if common.is_empty() {
            // Nothing can survive further intersections.
            break;
        }
    }
    Ok(common)
}
/// Union of the results of all `filters`; empty when `filters` is empty.
///
/// The first sub-filter that fails aborts the evaluation with its error.
fn evaluate_or(&self, filters: &[FilterExpression]) -> Result<HashSet<String>> {
    let mut union = HashSet::new();
    for branch in filters {
        union.extend(self.evaluate(branch)?);
    }
    Ok(union)
}
/// Complement of `filter` relative to the IDs known to its fields.
///
/// The "universe" is the union of all IDs stored in the indices of every
/// field the inner filter references; IDs that appear in none of those
/// indices are never returned.
fn evaluate_not(&self, filter: &FilterExpression) -> Result<HashSet<String>> {
    let excluded = self.evaluate(filter)?;

    let mut complement = HashSet::new();
    for name in filter.get_fields() {
        let known = self.get_all_ids_for_field(&name)?;
        complement.extend(known.into_iter().filter(|id| !excluded.contains(id)));
    }
    Ok(complement)
}
/// IDs that have any entry in the index of `field`.
///
/// Returns an error when `field` has no index (see `get_all_ids_for_field`).
fn evaluate_exists(&self, field: &str) -> Result<HashSet<String>> {
self.get_all_ids_for_field(field)
}
/// Index-based `IS NULL` evaluation — currently a stub.
///
/// Always returns an empty set and never consults the index, so it does not
/// fail even when `_field` has no index.
fn evaluate_is_null(&self, _field: &str) -> Result<HashSet<String>> {
// Answering this from the indices would require tracking null values
// separately, which the indices do not do.
// For now, return empty set
Ok(HashSet::new())
}
/// Every vector ID that has at least one entry in the index of `field`.
///
/// # Errors
/// [`FilterError::IndexNotFound`] if `field` has no index.
fn get_all_ids_for_field(&self, field: &str) -> Result<HashSet<String>> {
    /// Merge all ID sets of a map-backed index into one set.
    fn union_of<'s>(buckets: impl Iterator<Item = &'s HashSet<String>>) -> HashSet<String> {
        buckets.flatten().cloned().collect()
    }

    match self.indices.get_index(field) {
        None => Err(FilterError::IndexNotFound(field.to_string())),
        Some(PayloadIndex::Geo(points)) => {
            Ok(points.iter().map(|point| point.0.clone()).collect())
        }
        Some(PayloadIndex::Integer(map)) => Ok(union_of(map.values())),
        Some(PayloadIndex::Float(map)) => Ok(union_of(map.values())),
        Some(PayloadIndex::Keyword(map)) => Ok(union_of(map.values())),
        Some(PayloadIndex::Bool(map)) => Ok(union_of(map.values())),
        Some(PayloadIndex::Text(map)) => Ok(union_of(map.values())),
    }
}
fn get_field_value<'b>(payload: &'b Value, field: &str) -> Option<&'b Value> {
payload.as_object()?.get(field)
}
/// Order two JSON values, if they are comparable.
///
/// - Number vs. number: compared exactly as integers when both sides fit in
///   `i64` (or both in `u64`); otherwise both are converted to `f64`. Going
///   through `f64` unconditionally would make distinct integers above 2^53
///   compare as equal.
/// - String vs. string: byte-wise lexicographic order.
/// - Anything else (including mixed types): `None`.
fn compare_values(a: &Value, b: &Value) -> Option<std::cmp::Ordering> {
    match (a, b) {
        (Value::Number(a), Value::Number(b)) => {
            if let (Some(x), Some(y)) = (a.as_i64(), b.as_i64()) {
                return Some(x.cmp(&y));
            }
            if let (Some(x), Some(y)) = (a.as_u64(), b.as_u64()) {
                return Some(x.cmp(&y));
            }
            // At least one side is a float, or the signs/ranges differ so
            // widely that f64 rounding cannot change the ordering.
            let a = a.as_f64()?;
            let b = b.as_f64()?;
            a.partial_cmp(&b)
        }
        (Value::String(a), Value::String(b)) => Some(a.cmp(b)),
        _ => None,
    }
}
}
/// Calculate the haversine (great-circle) distance between two points, in
/// metres.
///
/// Coordinates are in degrees. The Earth is modelled as a sphere of radius
/// 6,371,000 m.
///
/// The intermediate haversine term is clamped to `[0, 1]`: for (nearly)
/// antipodal points floating-point rounding can push it marginally above 1,
/// which would otherwise turn `sqrt(1 - a)` — and the result — into NaN.
fn haversine_distance(lat1: f64, lon1: f64, lat2: f64, lon2: f64) -> f64 {
    const EARTH_RADIUS_M: f64 = 6_371_000.0; // Earth's radius in meters

    let lat1_rad = lat1.to_radians();
    let lat2_rad = lat2.to_radians();
    let delta_lat = (lat2 - lat1).to_radians();
    let delta_lon = (lon2 - lon1).to_radians();

    let a = ((delta_lat / 2.0).sin().powi(2)
        + lat1_rad.cos() * lat2_rad.cos() * (delta_lon / 2.0).sin().powi(2))
    .clamp(0.0, 1.0);
    let c = 2.0 * a.sqrt().atan2((1.0 - a).sqrt());

    EARTH_RADIUS_M * c
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::index::IndexType;
    use serde_json::json;

    /// Build the expected result set from ID literals.
    fn ids(expected: &[&str]) -> HashSet<String> {
        expected.iter().map(|id| id.to_string()).collect()
    }

    /// Equality on a keyword index returns exactly the matching IDs.
    #[test]
    fn test_eq_filter() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("status", IndexType::Keyword).unwrap();
        for (id, status) in [("v1", "active"), ("v2", "active"), ("v3", "inactive")] {
            manager
                .index_payload(id, &json!({ "status": status }))
                .unwrap();
        }

        let evaluator = FilterEvaluator::new(&manager);
        let results = evaluator
            .evaluate(&FilterExpression::eq("status", json!("active")))
            .unwrap();

        assert_eq!(results, ids(&["v1", "v2"]));
    }

    /// Range bounds on an integer index are inclusive on both ends.
    #[test]
    fn test_range_filter() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("age", IndexType::Integer).unwrap();
        for (id, age) in [("v1", 25), ("v2", 30), ("v3", 35)] {
            manager.index_payload(id, &json!({ "age": age })).unwrap();
        }

        let evaluator = FilterEvaluator::new(&manager);
        let results = evaluator
            .evaluate(&FilterExpression::range(
                "age",
                Some(json!(25)),
                Some(json!(30)),
            ))
            .unwrap();

        assert_eq!(results, ids(&["v1", "v2"]));
    }

    /// AND intersects the results of its sub-filters across different indices.
    #[test]
    fn test_and_filter() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("age", IndexType::Integer).unwrap();
        manager.create_index("status", IndexType::Keyword).unwrap();
        for (id, age, status) in [
            ("v1", 25, "active"),
            ("v2", 30, "active"),
            ("v3", 25, "inactive"),
        ] {
            manager
                .index_payload(id, &json!({ "age": age, "status": status }))
                .unwrap();
        }

        let evaluator = FilterEvaluator::new(&manager);
        let both = FilterExpression::and(vec![
            FilterExpression::eq("age", json!(25)),
            FilterExpression::eq("status", json!("active")),
        ]);
        let results = evaluator.evaluate(&both).unwrap();

        assert_eq!(results, ids(&["v1"]));
    }

    /// Direct payload matching works without any index being defined.
    #[test]
    fn test_matches_payload() {
        let manager = PayloadIndexManager::new();
        let evaluator = FilterEvaluator::new(&manager);
        let payload = json!({
            "age": 25,
            "status": "active",
            "name": "Alice"
        });

        let age_is_25 = FilterExpression::eq("age", json!(25));
        let is_active = FilterExpression::eq("status", json!("active"));
        let age_is_30 = FilterExpression::eq("age", json!(30));

        assert!(evaluator.matches(&payload, &age_is_25));
        assert!(evaluator.matches(&payload, &is_active));
        assert!(!evaluator.matches(&payload, &age_is_30));
    }

    /// Sanity check against a well-known city pair.
    #[test]
    fn test_haversine_distance() {
        // New York to Los Angeles (approx 3935 km)
        let new_york = (40.7128, -74.0060);
        let los_angeles = (34.0522, -118.2437);
        let distance = haversine_distance(new_york.0, new_york.1, los_angeles.0, los_angeles.1);
        let error = (distance - 3_935_000.0).abs();
        assert!(error < 50_000.0); // Within 50km tolerance
    }
}

View File

@@ -0,0 +1,282 @@
use serde::{Deserialize, Serialize};
use serde_json::Value;
/// Filter expression for querying vectors by payload.
///
/// Serialized with serde's internally tagged representation: each value
/// carries a `"type"` member holding the variant name in `snake_case`
/// (e.g. `"eq"`, `"geo_radius"`).
///
/// NOTE(review): serde documents that internally tagged enums do not support
/// newtype variants wrapping a sequence, so serializing `And` / `Or` (which
/// wrap a `Vec`) is expected to fail at runtime, and `Not` wraps a value that
/// itself emits a `"type"` member. Only the struct-like variants are covered
/// by the existing round-trip test — confirm before relying on (de)serializing
/// logical operators.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum FilterExpression {
// Comparison operators
/// `field == value`.
Eq {
field: String,
value: Value,
},
/// `field != value`.
Ne {
field: String,
value: Value,
},
/// `field > value`.
Gt {
field: String,
value: Value,
},
/// `field >= value`.
Gte {
field: String,
value: Value,
},
/// `field < value`.
Lt {
field: String,
value: Value,
},
/// `field <= value`.
Lte {
field: String,
value: Value,
},
// Range
/// Inclusive range check; an absent bound leaves that side unconstrained.
Range {
field: String,
gte: Option<Value>,
lte: Option<Value>,
},
// Array operations
/// `field` equals any one of `values`.
In {
field: String,
values: Vec<Value>,
},
// Text matching
/// Text match of `text` against a string field.
Match {
field: String,
text: String,
},
// Geo operations (basic)
/// Points within `radius_m` metres of the centre (`lat`, `lon`).
GeoRadius {
field: String,
lat: f64,
lon: f64,
radius_m: f64,
},
/// Points inside a box; the evaluator reads `top_left` as
/// `(north latitude, west longitude)` and `bottom_right` as
/// `(south latitude, east longitude)`.
GeoBoundingBox {
field: String,
top_left: (f64, f64),
bottom_right: (f64, f64),
},
// Logical operators
/// All sub-filters must match.
And(Vec<FilterExpression>),
/// At least one sub-filter must match.
Or(Vec<FilterExpression>),
/// The wrapped filter must not match.
Not(Box<FilterExpression>),
// Existence check
/// The field is present.
Exists {
field: String,
},
/// The field is null.
IsNull {
field: String,
},
}
impl FilterExpression {
/// Create an equality filter
pub fn eq(field: impl Into<String>, value: Value) -> Self {
Self::Eq {
field: field.into(),
value,
}
}
/// Create a not-equal filter
pub fn ne(field: impl Into<String>, value: Value) -> Self {
Self::Ne {
field: field.into(),
value,
}
}
/// Create a greater-than filter
pub fn gt(field: impl Into<String>, value: Value) -> Self {
Self::Gt {
field: field.into(),
value,
}
}
/// Create a greater-than-or-equal filter
pub fn gte(field: impl Into<String>, value: Value) -> Self {
Self::Gte {
field: field.into(),
value,
}
}
/// Create a less-than filter
pub fn lt(field: impl Into<String>, value: Value) -> Self {
Self::Lt {
field: field.into(),
value,
}
}
/// Create a less-than-or-equal filter
pub fn lte(field: impl Into<String>, value: Value) -> Self {
Self::Lte {
field: field.into(),
value,
}
}
/// Create a range filter
pub fn range(field: impl Into<String>, gte: Option<Value>, lte: Option<Value>) -> Self {
Self::Range {
field: field.into(),
gte,
lte,
}
}
/// Create an IN filter
pub fn in_values(field: impl Into<String>, values: Vec<Value>) -> Self {
Self::In {
field: field.into(),
values,
}
}
/// Create a text match filter
pub fn match_text(field: impl Into<String>, text: impl Into<String>) -> Self {
Self::Match {
field: field.into(),
text: text.into(),
}
}
/// Create a geo radius filter
pub fn geo_radius(field: impl Into<String>, lat: f64, lon: f64, radius_m: f64) -> Self {
Self::GeoRadius {
field: field.into(),
lat,
lon,
radius_m,
}
}
/// Create a geo bounding box filter
pub fn geo_bounding_box(
field: impl Into<String>,
top_left: (f64, f64),
bottom_right: (f64, f64),
) -> Self {
Self::GeoBoundingBox {
field: field.into(),
top_left,
bottom_right,
}
}
/// Create an AND filter
pub fn and(filters: Vec<FilterExpression>) -> Self {
Self::And(filters)
}
/// Create an OR filter
pub fn or(filters: Vec<FilterExpression>) -> Self {
Self::Or(filters)
}
/// Create a NOT filter
pub fn not(filter: FilterExpression) -> Self {
Self::Not(Box::new(filter))
}
/// Create an EXISTS filter
pub fn exists(field: impl Into<String>) -> Self {
Self::Exists {
field: field.into(),
}
}
/// Create an IS NULL filter
pub fn is_null(field: impl Into<String>) -> Self {
Self::IsNull {
field: field.into(),
}
}
/// Get all field names referenced in this expression
pub fn get_fields(&self) -> Vec<String> {
let mut fields = Vec::new();
self.collect_fields(&mut fields);
fields.sort();
fields.dedup();
fields
}
fn collect_fields(&self, fields: &mut Vec<String>) {
match self {
Self::Eq { field, .. }
| Self::Ne { field, .. }
| Self::Gt { field, .. }
| Self::Gte { field, .. }
| Self::Lt { field, .. }
| Self::Lte { field, .. }
| Self::Range { field, .. }
| Self::In { field, .. }
| Self::Match { field, .. }
| Self::GeoRadius { field, .. }
| Self::GeoBoundingBox { field, .. }
| Self::Exists { field }
| Self::IsNull { field } => {
fields.push(field.clone());
}
Self::And(exprs) | Self::Or(exprs) => {
for expr in exprs {
expr.collect_fields(fields);
}
}
Self::Not(expr) => {
expr.collect_fields(fields);
}
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Constructors produce the variant they are named after.
    #[test]
    fn test_filter_builders() {
        let equality = FilterExpression::eq("status", json!("active"));
        assert!(matches!(equality, FilterExpression::Eq { .. }));

        let conjunction = FilterExpression::and(vec![
            FilterExpression::eq("status", json!("active")),
            FilterExpression::gte("age", json!(18)),
        ]);
        assert!(matches!(conjunction, FilterExpression::And(_)));
    }

    /// Field names are gathered from nested expressions, sorted and deduplicated.
    #[test]
    fn test_get_fields() {
        let age_or_score = FilterExpression::or(vec![
            FilterExpression::gte("age", json!(18)),
            FilterExpression::lt("score", json!(100)),
        ]);
        let filter = FilterExpression::and(vec![
            FilterExpression::eq("status", json!("active")),
            age_or_score,
        ]);

        assert_eq!(filter.get_fields(), ["age", "score", "status"]);
    }

    /// A struct-like variant survives a JSON round trip.
    #[test]
    fn test_serialization() {
        let original = FilterExpression::eq("status", json!("active"));
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: FilterExpression = serde_json::from_str(&encoded).unwrap();
        assert!(matches!(decoded, FilterExpression::Eq { .. }));
    }
}

View File

@@ -0,0 +1,380 @@
use crate::error::{FilterError, Result};
use ordered_float::OrderedFloat;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::collections::{BTreeMap, HashMap, HashSet};
/// Type of payload index.
///
/// Serialized as the lowercase variant name (`"integer"`, `"float"`, ...).
/// Each variant corresponds to the [`PayloadIndex`] variant of the same name.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum IndexType {
/// Ordered index over `i64` values.
Integer,
/// Ordered index over `f64` values.
Float,
/// Exact-match index over strings.
Keyword,
/// Index over boolean values.
Bool,
/// List of `(lat, lon)` points.
Geo,
/// Word-level index over text.
Text,
}
/// Payload index for efficient filtering.
///
/// Every map-backed variant maps an indexed value to the set of vector IDs
/// that carry it.
#[derive(Debug, Clone)]
pub enum PayloadIndex {
/// Sorted by integer value, which enables range scans.
Integer(BTreeMap<i64, HashSet<String>>),
/// Sorted by float value (wrapped in `OrderedFloat` so it can be a key).
Float(BTreeMap<OrderedFloat<f64>, HashSet<String>>),
/// Exact string value -> vector IDs.
Keyword(HashMap<String, HashSet<String>>),
/// Boolean value -> vector IDs.
Bool(HashMap<bool, HashSet<String>>),
Geo(Vec<(String, f64, f64)>), // vector_id, lat, lon
Text(HashMap<String, HashSet<String>>), // Simple text index (word -> vector_ids)
}
impl PayloadIndex {
/// Create a new index of the given type
pub fn new(index_type: IndexType) -> Self {
match index_type {
IndexType::Integer => Self::Integer(BTreeMap::new()),
IndexType::Float => Self::Float(BTreeMap::new()),
IndexType::Keyword => Self::Keyword(HashMap::new()),
IndexType::Bool => Self::Bool(HashMap::new()),
IndexType::Geo => Self::Geo(Vec::new()),
IndexType::Text => Self::Text(HashMap::new()),
}
}
/// Get the index type
pub fn index_type(&self) -> IndexType {
match self {
Self::Integer(_) => IndexType::Integer,
Self::Float(_) => IndexType::Float,
Self::Keyword(_) => IndexType::Keyword,
Self::Bool(_) => IndexType::Bool,
Self::Geo(_) => IndexType::Geo,
Self::Text(_) => IndexType::Text,
}
}
/// Record `value` for `vector_id` in this index.
///
/// A value whose JSON type does not fit the index kind is silently ignored;
/// the call still returns `Ok(())`.
///
/// - Geo indices expect an object with numeric `lat` and `lon` members and
///   append one point per call.
/// - Text indices split the string on whitespace and index each lowercased
///   word.
pub fn add(&mut self, vector_id: &str, value: &Value) -> Result<()> {
    match self {
        Self::Integer(index) => {
            if let Some(num) = value.as_i64() {
                index.entry(num).or_default().insert(vector_id.to_string());
            }
        }
        Self::Float(index) => {
            if let Some(num) = value.as_f64() {
                index
                    .entry(OrderedFloat(num))
                    .or_default()
                    .insert(vector_id.to_string());
            }
        }
        Self::Keyword(index) => {
            if let Some(keyword) = value.as_str() {
                index
                    .entry(keyword.to_string())
                    .or_default()
                    .insert(vector_id.to_string());
            }
        }
        Self::Bool(index) => {
            if let Some(flag) = value.as_bool() {
                index.entry(flag).or_default().insert(vector_id.to_string());
            }
        }
        Self::Geo(index) => {
            // Both coordinates must be present and numeric.
            let coordinates = value.as_object().and_then(|obj| {
                let lat = obj.get("lat").and_then(|v| v.as_f64());
                let lon = obj.get("lon").and_then(|v| v.as_f64());
                lat.zip(lon)
            });
            if let Some((lat, lon)) = coordinates {
                index.push((vector_id.to_string(), lat, lon));
            }
        }
        Self::Text(index) => {
            if let Some(text) = value.as_str() {
                // Simple word tokenization
                for word in text.split_whitespace().map(str::to_lowercase) {
                    index.entry(word).or_default().insert(vector_id.to_string());
                }
            }
        }
    }
    Ok(())
}
/// Remove the association between `vector_id` and `value`.
///
/// For map-backed indices only the entry stored under `value` is touched, and
/// an entry that becomes empty is dropped. A `value` of the wrong JSON type is
/// ignored. Geo indices ignore `value` and remove every point of `vector_id`.
pub fn remove(&mut self, vector_id: &str, value: &Value) -> Result<()> {
    match self {
        Self::Integer(index) => {
            if let Some(num) = value.as_i64() {
                let emptied = index.get_mut(&num).map_or(false, |ids| {
                    ids.remove(vector_id);
                    ids.is_empty()
                });
                if emptied {
                    index.remove(&num);
                }
            }
        }
        Self::Float(index) => {
            if let Some(num) = value.as_f64() {
                let key = OrderedFloat(num);
                let emptied = index.get_mut(&key).map_or(false, |ids| {
                    ids.remove(vector_id);
                    ids.is_empty()
                });
                if emptied {
                    index.remove(&key);
                }
            }
        }
        Self::Keyword(index) => {
            if let Some(keyword) = value.as_str() {
                let emptied = index.get_mut(keyword).map_or(false, |ids| {
                    ids.remove(vector_id);
                    ids.is_empty()
                });
                if emptied {
                    index.remove(keyword);
                }
            }
        }
        Self::Bool(index) => {
            if let Some(flag) = value.as_bool() {
                let emptied = index.get_mut(&flag).map_or(false, |ids| {
                    ids.remove(vector_id);
                    ids.is_empty()
                });
                if emptied {
                    index.remove(&flag);
                }
            }
        }
        Self::Geo(index) => {
            index.retain(|point| point.0 != vector_id);
        }
        Self::Text(index) => {
            if let Some(text) = value.as_str() {
                for word in text.split_whitespace().map(str::to_lowercase) {
                    let emptied = index.get_mut(&word).map_or(false, |ids| {
                        ids.remove(vector_id);
                        ids.is_empty()
                    });
                    if emptied {
                        index.remove(&word);
                    }
                }
            }
        }
    }
    Ok(())
}
/// Remove `vector_id` from every bucket of this index.
///
/// Unlike `remove`, no value is needed: each bucket is visited, the ID is
/// taken out of it, and buckets left empty are discarded. For `Geo`, every
/// point stored under `vector_id` is dropped.
pub fn clear(&mut self, vector_id: &str) {
    match self {
        Self::Integer(buckets) => buckets.retain(|_, ids| {
            ids.remove(vector_id);
            !ids.is_empty()
        }),
        Self::Float(buckets) => buckets.retain(|_, ids| {
            ids.remove(vector_id);
            !ids.is_empty()
        }),
        Self::Keyword(buckets) => buckets.retain(|_, ids| {
            ids.remove(vector_id);
            !ids.is_empty()
        }),
        Self::Bool(buckets) => buckets.retain(|_, ids| {
            ids.remove(vector_id);
            !ids.is_empty()
        }),
        Self::Geo(points) => points.retain(|(id, _, _)| id != vector_id),
        Self::Text(postings) => postings.retain(|_, ids| {
            ids.remove(vector_id);
            !ids.is_empty()
        }),
    }
}
}
/// Manager for payload indices
///
/// Owns at most one [`PayloadIndex`] per payload field and routes payload
/// objects to the indices of the fields they contain. Payload fields that
/// have no index are skipped when indexing or removing.
#[derive(Debug, Default)]
pub struct PayloadIndexManager {
/// Indices keyed by the name of the payload field they cover.
indices: HashMap<String, PayloadIndex>,
}
impl PayloadIndexManager {
/// Create a new payload index manager
pub fn new() -> Self {
Self {
indices: HashMap::new(),
}
}
/// Create an index on a field
pub fn create_index(&mut self, field: &str, index_type: IndexType) -> Result<()> {
if self.indices.contains_key(field) {
return Err(FilterError::InvalidExpression(format!(
"Index already exists for field: {}",
field
)));
}
self.indices
.insert(field.to_string(), PayloadIndex::new(index_type));
Ok(())
}
/// Drop an index
pub fn drop_index(&mut self, field: &str) -> Result<()> {
if self.indices.remove(field).is_none() {
return Err(FilterError::IndexNotFound(field.to_string()));
}
Ok(())
}
/// Check if an index exists for a field
pub fn has_index(&self, field: &str) -> bool {
self.indices.contains_key(field)
}
/// Get an index by field name
pub fn get_index(&self, field: &str) -> Option<&PayloadIndex> {
self.indices.get(field)
}
/// Get a mutable index by field name
pub fn get_index_mut(&mut self, field: &str) -> Option<&mut PayloadIndex> {
self.indices.get_mut(field)
}
/// Index a payload for a vector
pub fn index_payload(&mut self, vector_id: &str, payload: &Value) -> Result<()> {
if let Some(obj) = payload.as_object() {
for (field, value) in obj {
if let Some(index) = self.indices.get_mut(field) {
index.add(vector_id, value)?;
}
}
}
Ok(())
}
/// Remove a payload from all indices
pub fn remove_payload(&mut self, vector_id: &str, payload: &Value) -> Result<()> {
if let Some(obj) = payload.as_object() {
for (field, value) in obj {
if let Some(index) = self.indices.get_mut(field) {
index.remove(vector_id, value)?;
}
}
}
Ok(())
}
/// Clear all entries for a vector ID from all indices
pub fn clear_vector(&mut self, vector_id: &str) {
for index in self.indices.values_mut() {
index.clear(vector_id);
}
}
/// Get all indexed fields
pub fn indexed_fields(&self) -> Vec<String> {
self.indices.keys().cloned().collect()
}
/// Get the number of indices
pub fn index_count(&self) -> usize {
self.indices.len()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Vectors that share an integer value are grouped under the same key.
    #[test]
    fn test_integer_index() {
        let mut index = PayloadIndex::new(IndexType::Integer);
        index.add("v1", &json!(42)).unwrap();
        index.add("v2", &json!(42)).unwrap();
        index.add("v3", &json!(100)).unwrap();
        if let PayloadIndex::Integer(map) = index {
            assert_eq!(map.get(&42).unwrap().len(), 2);
            assert_eq!(map.get(&100).unwrap().len(), 1);
        } else {
            panic!("Wrong index type");
        }
    }

    /// Vectors that share a keyword are grouped under the same key.
    #[test]
    fn test_keyword_index() {
        let mut index = PayloadIndex::new(IndexType::Keyword);
        index.add("v1", &json!("active")).unwrap();
        index.add("v2", &json!("active")).unwrap();
        index.add("v3", &json!("inactive")).unwrap();
        if let PayloadIndex::Keyword(map) = index {
            assert_eq!(map.get("active").unwrap().len(), 2);
            assert_eq!(map.get("inactive").unwrap().len(), 1);
        } else {
            panic!("Wrong index type");
        }
    }

    /// The manager routes payload fields to their indices and skips unindexed fields.
    #[test]
    fn test_index_manager() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("age", IndexType::Integer).unwrap();
        manager.create_index("status", IndexType::Keyword).unwrap();
        let payload = json!({
            "age": 25,
            "status": "active",
            "name": "Alice"
        });
        manager.index_payload("v1", &payload).unwrap();
        assert!(manager.has_index("age"));
        assert!(manager.has_index("status"));
        assert!(!manager.has_index("name"));
        assert_eq!(manager.index_count(), 2);

        // Verify the payload was actually recorded, not merely that the indices exist.
        match manager.get_index("age") {
            Some(PayloadIndex::Integer(map)) => {
                assert!(map.get(&25).unwrap().contains("v1"));
            }
            _ => panic!("Wrong index type"),
        }
        match manager.get_index("status") {
            Some(PayloadIndex::Keyword(map)) => {
                assert!(map.get("active").unwrap().contains("v1"));
            }
            _ => panic!("Wrong index type"),
        }
        assert!(manager.get_index("name").is_none());
    }

    /// Creating an index twice and dropping a missing index both report errors.
    #[test]
    fn test_index_manager_errors() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("age", IndexType::Integer).unwrap();
        assert!(manager.create_index("age", IndexType::Integer).is_err());
        assert!(manager.drop_index("missing").is_err());
        manager.drop_index("age").unwrap();
        assert!(!manager.has_index("age"));
    }

    /// Each geo payload with `lat` and `lon` contributes one point.
    #[test]
    fn test_geo_index() {
        let mut index = PayloadIndex::new(IndexType::Geo);
        index
            .add("v1", &json!({"lat": 40.7128, "lon": -74.0060}))
            .unwrap();
        index
            .add("v2", &json!({"lat": 34.0522, "lon": -118.2437}))
            .unwrap();
        if let PayloadIndex::Geo(points) = index {
            assert_eq!(points.len(), 2);
        } else {
            panic!("Wrong index type");
        }
    }

    /// Removing the last vector under a key drops the key itself.
    #[test]
    fn test_remove_prunes_empty_buckets() {
        let mut index = PayloadIndex::new(IndexType::Integer);
        index.add("v1", &json!(42)).unwrap();
        index.add("v2", &json!(42)).unwrap();

        index.remove("v1", &json!(42)).unwrap();
        if let PayloadIndex::Integer(map) = &index {
            assert_eq!(map.get(&42).unwrap().len(), 1);
        } else {
            panic!("Wrong index type");
        }

        index.remove("v2", &json!(42)).unwrap();
        if let PayloadIndex::Integer(map) = &index {
            assert!(map.is_empty());
        } else {
            panic!("Wrong index type");
        }
    }

    /// `clear_vector` purges a vector without needing its original payload.
    #[test]
    fn test_clear_vector() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("age", IndexType::Integer).unwrap();
        manager.index_payload("v1", &json!({"age": 25})).unwrap();
        manager.index_payload("v2", &json!({"age": 30})).unwrap();

        manager.clear_vector("v1");

        match manager.get_index("age") {
            Some(PayloadIndex::Integer(map)) => {
                assert!(map.get(&25).is_none());
                assert!(map.get(&30).unwrap().contains("v2"));
            }
            _ => panic!("Wrong index type"),
        }
    }
}

View File

@@ -0,0 +1,215 @@
#![recursion_limit = "2048"]
//! # rUvector Filter
//!
//! Advanced payload indexing and filtering for rUvector.
//!
//! This crate provides:
//! - Flexible filter expressions (equality, range, geo, text, logical operators)
//! - Efficient payload indexing (integer, float, keyword, boolean, geo, text)
//! - Fast filter evaluation using indices
//! - Support for complex queries with AND/OR/NOT
//!
//! ## Examples
//!
//! ### Creating and Using Filters
//!
//! ```rust
//! use ruvector_filter::{FilterExpression, PayloadIndexManager, FilterEvaluator, IndexType};
//! use serde_json::json;
//!
//! // Create index manager
//! let mut manager = PayloadIndexManager::new();
//! manager.create_index("status", IndexType::Keyword).unwrap();
//! manager.create_index("age", IndexType::Integer).unwrap();
//!
//! // Index some payloads
//! manager.index_payload("v1", &json!({"status": "active", "age": 25})).unwrap();
//! manager.index_payload("v2", &json!({"status": "active", "age": 30})).unwrap();
//! manager.index_payload("v3", &json!({"status": "inactive", "age": 25})).unwrap();
//!
//! // Create filter
//! let filter = FilterExpression::and(vec![
//! FilterExpression::eq("status", json!("active")),
//! FilterExpression::gte("age", json!(25)),
//! ]);
//!
//! // Evaluate filter
//! let evaluator = FilterEvaluator::new(&manager);
//! let results = evaluator.evaluate(&filter).unwrap();
//! assert_eq!(results.len(), 2);
//! ```
//!
//! ### Geo Filtering
//!
//! ```rust
//! use ruvector_filter::{FilterExpression, PayloadIndexManager, FilterEvaluator, IndexType};
//! use serde_json::json;
//!
//! let mut manager = PayloadIndexManager::new();
//! manager.create_index("location", IndexType::Geo).unwrap();
//!
//! manager.index_payload("v1", &json!({
//! "location": {"lat": 40.7128, "lon": -74.0060}
//! })).unwrap();
//!
//! // Find all points within 1000m of a location
//! let filter = FilterExpression::geo_radius("location", 40.7128, -74.0060, 1000.0);
//! let evaluator = FilterEvaluator::new(&manager);
//! let results = evaluator.evaluate(&filter).unwrap();
//! ```
pub mod error;
pub mod evaluator;
pub mod expression;
pub mod index;
// Re-export main types
pub use error::{FilterError, Result};
pub use evaluator::FilterEvaluator;
pub use expression::FilterExpression;
pub use index::{IndexType, PayloadIndex, PayloadIndexManager};
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// End-to-end: three typed indices queried with a nested AND/OR filter.
    #[test]
    fn test_full_workflow() {
        let mut manager = PayloadIndexManager::new();
        for (field, index_type) in [
            ("status", IndexType::Keyword),
            ("age", IndexType::Integer),
            ("score", IndexType::Float),
        ] {
            manager.create_index(field, index_type).unwrap();
        }

        let documents = [
            ("v1", json!({"status": "active", "age": 25, "score": 0.9})),
            ("v2", json!({"status": "active", "age": 30, "score": 0.85})),
            ("v3", json!({"status": "inactive", "age": 25, "score": 0.7})),
        ];
        for (id, payload) in documents {
            manager.index_payload(id, &payload).unwrap();
        }

        // status == "active" AND (age >= 30 OR score >= 0.9)
        let age_or_score = FilterExpression::or(vec![
            FilterExpression::gte("age", json!(30)),
            FilterExpression::gte("score", json!(0.9)),
        ]);
        let filter = FilterExpression::and(vec![
            FilterExpression::eq("status", json!("active")),
            age_or_score,
        ]);

        let evaluator = FilterEvaluator::new(&manager);
        let results = evaluator.evaluate(&filter).unwrap();

        // v1 qualifies through its score (0.9), v2 through its age (30).
        assert_eq!(results.len(), 2);
        for id in ["v1", "v2"] {
            assert!(results.contains(id));
        }
    }

    /// A text index matches on a single word of the indexed description.
    #[test]
    fn test_text_matching() {
        let mut manager = PayloadIndexManager::new();
        manager
            .create_index("description", IndexType::Text)
            .unwrap();
        for (id, description) in [("v1", "The quick brown fox"), ("v2", "The lazy dog")] {
            manager
                .index_payload(id, &json!({ "description": description }))
                .unwrap();
        }

        let evaluator = FilterEvaluator::new(&manager);
        let filter = FilterExpression::match_text("description", "quick");
        let results = evaluator.evaluate(&filter).unwrap();

        assert_eq!(results.len(), 1);
        assert!(results.contains("v1"));
    }

    /// NOT over an equality filter yields the vectors with the other value.
    #[test]
    fn test_not_filter() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("status", IndexType::Keyword).unwrap();
        for (id, status) in [("v1", "active"), ("v2", "inactive")] {
            manager
                .index_payload(id, &json!({ "status": status }))
                .unwrap();
        }

        let evaluator = FilterEvaluator::new(&manager);
        let is_active = FilterExpression::eq("status", json!("active"));
        let filter = FilterExpression::not(is_active);
        let results = evaluator.evaluate(&filter).unwrap();

        assert_eq!(results.len(), 1);
        assert!(results.contains("v2"));
    }

    /// IN matches every vector whose value is one of the listed candidates.
    #[test]
    fn test_in_filter() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("status", IndexType::Keyword).unwrap();
        for (id, status) in [("v1", "active"), ("v2", "pending"), ("v3", "inactive")] {
            manager
                .index_payload(id, &json!({ "status": status }))
                .unwrap();
        }

        let evaluator = FilterEvaluator::new(&manager);
        let accepted = vec![json!("active"), json!("pending")];
        let filter = FilterExpression::in_values("status", accepted);
        let results = evaluator.evaluate(&filter).unwrap();

        assert_eq!(results.len(), 2);
        for id in ["v1", "v2"] {
            assert!(results.contains(id));
        }
    }
}