Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
19
crates/ruvector-filter/Cargo.toml
Normal file
19
crates/ruvector-filter/Cargo.toml
Normal file
@@ -0,0 +1,19 @@
|
||||
[package]
|
||||
name = "ruvector-filter"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
authors.workspace = true
|
||||
repository.workspace = true
|
||||
readme = "README.md"
|
||||
description = "Advanced metadata filtering for Ruvector vector search"
|
||||
|
||||
[dependencies]
|
||||
ruvector-core = { version = "2.0.2", path = "../ruvector-core" }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
dashmap = { workspace = true }
|
||||
uuid = { workspace = true }
|
||||
chrono = { workspace = true }
|
||||
ordered-float = "4.5"
|
||||
226
crates/ruvector-filter/README.md
Normal file
226
crates/ruvector-filter/README.md
Normal file
@@ -0,0 +1,226 @@
|
||||
# Ruvector Filter
|
||||
|
||||
[Crates.io](https://crates.io/crates/ruvector-filter)
|
||||
[Documentation](https://docs.rs/ruvector-filter)
|
||||
[License: MIT](https://opensource.org/licenses/MIT)
|
||||
[Rust](https://www.rust-lang.org)
|
||||
|
||||
**Advanced metadata filtering for Ruvector vector search.**
|
||||
|
||||
`ruvector-filter` provides a powerful filter expression language for combining vector similarity search with metadata constraints. Supports complex boolean expressions, range queries, and efficient filter evaluation. Part of the [Ruvector](https://github.com/ruvnet/ruvector) ecosystem.
|
||||
|
||||
## Why Ruvector Filter?
|
||||
|
||||
- **Rich Expressions**: Complex boolean filter expressions
|
||||
- **Type-Safe**: Strongly typed filter operations
|
||||
- **Optimized**: Filter pushdown for efficient evaluation
|
||||
- **Extensible**: Custom filter operators
|
||||
- **JSON Compatible**: Easy integration with JSON metadata
|
||||
|
||||
## Features
|
||||
|
||||
### Core Capabilities
|
||||
|
||||
- **Comparison Operators**: `=`, `!=`, `<`, `>`, `<=`, `>=`
|
||||
- **Boolean Logic**: `AND`, `OR`, `NOT`
|
||||
- **Range Queries**: `BETWEEN`, `IN`
|
||||
- **String Matching**: `CONTAINS`, `STARTS_WITH`, `ENDS_WITH`
|
||||
- **Null Handling**: `IS NULL`, `IS NOT NULL`
|
||||
|
||||
### Advanced Features
|
||||
|
||||
- **Nested Fields**: Filter on nested JSON properties
|
||||
- **Array Operations**: `ANY`, `ALL`, `NONE` on arrays
|
||||
- **Regex Matching**: Pattern-based string filtering
|
||||
- **Geo Filters**: Distance and bounding box (planned)
|
||||
- **Custom Functions**: Extensible filter functions
|
||||
|
||||
## Installation
|
||||
|
||||
Add `ruvector-filter` to your `Cargo.toml`:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
ruvector-filter = "0.1.1"
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Basic Filtering
|
||||
|
||||
```rust
|
||||
use ruvector_filter::{Filter, FilterBuilder};
|
||||
|
||||
// Build filter expression
|
||||
let filter = FilterBuilder::new()
|
||||
.field("category").eq("electronics")
|
||||
.and()
|
||||
.field("price").lt(1000.0)
|
||||
.build()?;
|
||||
|
||||
// Apply to search
|
||||
let results = db.search(SearchQuery {
|
||||
vector: query_vec,
|
||||
k: 10,
|
||||
filter: Some(filter),
|
||||
..Default::default()
|
||||
})?;
|
||||
```
|
||||
|
||||
### Complex Expressions
|
||||
|
||||
```rust
|
||||
use ruvector_filter::{Filter, FilterExpr, op};
|
||||
|
||||
// Complex boolean expression
|
||||
let filter = op::and(vec![
|
||||
op::eq("status", "active"),
|
||||
op::or(vec![
|
||||
op::gt("priority", 5),
|
||||
op::in_("tags", vec!["urgent", "important"]),
|
||||
]),
|
||||
op::not(op::eq("archived", true)),
|
||||
]);
|
||||
|
||||
// Range query
|
||||
let filter = op::and(vec![
|
||||
op::between("price", 100.0, 500.0),
|
||||
op::between("created_at", "2024-01-01", "2024-12-31"),
|
||||
]);
|
||||
```
|
||||
|
||||
### String Matching
|
||||
|
||||
```rust
|
||||
use ruvector_filter::op;
|
||||
|
||||
// String operations
|
||||
let filter = op::and(vec![
|
||||
op::contains("description", "machine learning"),
|
||||
op::starts_with("name", "Project"),
|
||||
op::regex("email", r".*@company\.com"),
|
||||
]);
|
||||
```
|
||||
|
||||
### Nested Field Access
|
||||
|
||||
```rust
|
||||
use ruvector_filter::op;
|
||||
|
||||
// Access nested JSON fields
|
||||
let filter = op::and(vec![
|
||||
op::eq("user.role", "admin"),
|
||||
op::gt("metadata.views", 1000),
|
||||
op::in_("settings.theme", vec!["dark", "light"]),
|
||||
]);
|
||||
```
|
||||
|
||||
## API Overview
|
||||
|
||||
### Core Types
|
||||
|
||||
```rust
|
||||
// Filter expression
|
||||
pub enum FilterExpr {
|
||||
// Comparison
|
||||
Eq(String, Value),
|
||||
Ne(String, Value),
|
||||
Lt(String, Value),
|
||||
Gt(String, Value),
|
||||
Le(String, Value),
|
||||
Ge(String, Value),
|
||||
|
||||
// Boolean
|
||||
And(Vec<FilterExpr>),
|
||||
Or(Vec<FilterExpr>),
|
||||
Not(Box<FilterExpr>),
|
||||
|
||||
// Range
|
||||
Between(String, Value, Value),
|
||||
In(String, Vec<Value>),
|
||||
|
||||
// String
|
||||
Contains(String, String),
|
||||
StartsWith(String, String),
|
||||
EndsWith(String, String),
|
||||
Regex(String, String),
|
||||
|
||||
// Null
|
||||
IsNull(String),
|
||||
IsNotNull(String),
|
||||
}
|
||||
|
||||
// Filter builder
|
||||
pub struct FilterBuilder { /* ... */ }
|
||||
```
|
||||
|
||||
### Filter Operations
|
||||
|
||||
```rust
|
||||
// Convenience functions in `op` module
|
||||
pub mod op {
|
||||
pub fn eq(field: &str, value: impl Into<Value>) -> FilterExpr;
|
||||
pub fn ne(field: &str, value: impl Into<Value>) -> FilterExpr;
|
||||
pub fn lt(field: &str, value: impl Into<Value>) -> FilterExpr;
|
||||
pub fn gt(field: &str, value: impl Into<Value>) -> FilterExpr;
|
||||
pub fn le(field: &str, value: impl Into<Value>) -> FilterExpr;
|
||||
pub fn ge(field: &str, value: impl Into<Value>) -> FilterExpr;
|
||||
|
||||
pub fn and(exprs: Vec<FilterExpr>) -> FilterExpr;
|
||||
pub fn or(exprs: Vec<FilterExpr>) -> FilterExpr;
|
||||
pub fn not(expr: FilterExpr) -> FilterExpr;
|
||||
|
||||
pub fn between(field: &str, min: impl Into<Value>, max: impl Into<Value>) -> FilterExpr;
|
||||
pub fn in_(field: &str, values: Vec<impl Into<Value>>) -> FilterExpr;
|
||||
|
||||
pub fn contains(field: &str, substring: &str) -> FilterExpr;
|
||||
pub fn starts_with(field: &str, prefix: &str) -> FilterExpr;
|
||||
pub fn ends_with(field: &str, suffix: &str) -> FilterExpr;
|
||||
pub fn regex(field: &str, pattern: &str) -> FilterExpr;
|
||||
}
|
||||
```
|
||||
|
||||
### Filter Evaluation
|
||||
|
||||
```rust
|
||||
impl FilterExpr {
|
||||
pub fn evaluate(&self, metadata: &serde_json::Value) -> bool;
|
||||
pub fn optimize(&self) -> FilterExpr;
|
||||
pub fn to_json(&self) -> serde_json::Value;
|
||||
pub fn from_json(json: &serde_json::Value) -> Result<Self>;
|
||||
}
|
||||
```
|
||||
|
||||
## Performance Tips
|
||||
|
||||
1. **Put most selective filters first** in AND expressions
|
||||
2. **Use IN instead of multiple OR** for equality checks
|
||||
3. **Avoid regex when possible** - use prefix/suffix matching
|
||||
4. **Index frequently filtered fields** in your metadata
|
||||
|
||||
## Related Crates
|
||||
|
||||
- **[ruvector-core](../ruvector-core/)** - Core vector database engine
|
||||
- **[ruvector-collections](../ruvector-collections/)** - Collection management
|
||||
|
||||
## Documentation
|
||||
|
||||
- **[Main README](../../README.md)** - Complete project overview
|
||||
- **[API Documentation](https://docs.rs/ruvector-filter)** - Full API reference
|
||||
- **[GitHub Repository](https://github.com/ruvnet/ruvector)** - Source code
|
||||
|
||||
## License
|
||||
|
||||
**MIT License** - see [LICENSE](../../LICENSE) for details.
|
||||
|
||||
---
|
||||
|
||||
<div align="center">
|
||||
|
||||
**Part of [Ruvector](https://github.com/ruvnet/ruvector) - Built by [rUv](https://ruv.io)**
|
||||
|
||||
[Ruvector on GitHub](https://github.com/ruvnet/ruvector)
|
||||
|
||||
[Documentation](https://docs.rs/ruvector-filter) | [Crates.io](https://crates.io/crates/ruvector-filter) | [GitHub](https://github.com/ruvnet/ruvector)
|
||||
|
||||
</div>
|
||||
37
crates/ruvector-filter/src/error.rs
Normal file
37
crates/ruvector-filter/src/error.rs
Normal file
@@ -0,0 +1,37 @@
|
||||
use thiserror::Error;
|
||||
|
||||
/// Errors that can occur during filter operations
|
||||
#[derive(Error, Debug)]
|
||||
pub enum FilterError {
|
||||
#[error("Index not found for field: {0}")]
|
||||
IndexNotFound(String),
|
||||
|
||||
#[error("Invalid index type for field: {0}")]
|
||||
InvalidIndexType(String),
|
||||
|
||||
#[error("Type mismatch in filter expression: expected {expected}, got {actual}")]
|
||||
TypeMismatch { expected: String, actual: String },
|
||||
|
||||
#[error("Invalid filter expression: {0}")]
|
||||
InvalidExpression(String),
|
||||
|
||||
#[error("Field not found in payload: {0}")]
|
||||
FieldNotFound(String),
|
||||
|
||||
#[error("Invalid value for operation: {0}")]
|
||||
InvalidValue(String),
|
||||
|
||||
#[error("Geo operation error: {0}")]
|
||||
GeoError(String),
|
||||
|
||||
#[error("JSON error: {0}")]
|
||||
JsonError(#[from] serde_json::Error),
|
||||
|
||||
#[error("IO error: {0}")]
|
||||
IoError(#[from] std::io::Error),
|
||||
|
||||
#[error("Parse error: {0}")]
|
||||
ParseError(String),
|
||||
}
|
||||
|
||||
/// Convenience alias for results whose error type is [`FilterError`].
pub type Result<T> = std::result::Result<T, FilterError>;
|
||||
593
crates/ruvector-filter/src/evaluator.rs
Normal file
593
crates/ruvector-filter/src/evaluator.rs
Normal file
@@ -0,0 +1,593 @@
|
||||
use crate::error::{FilterError, Result};
|
||||
use crate::expression::FilterExpression;
|
||||
use crate::index::{PayloadIndex, PayloadIndexManager};
|
||||
use ordered_float::OrderedFloat;
|
||||
use serde_json::Value;
|
||||
use std::collections::HashSet;
|
||||
|
||||
/// Evaluates filter expressions against payload indices
|
||||
pub struct FilterEvaluator<'a> {
|
||||
indices: &'a PayloadIndexManager,
|
||||
}
|
||||
|
||||
impl<'a> FilterEvaluator<'a> {
|
||||
/// Create a new filter evaluator
|
||||
pub fn new(indices: &'a PayloadIndexManager) -> Self {
|
||||
Self { indices }
|
||||
}
|
||||
|
||||
/// Evaluate a filter expression and return matching vector IDs
|
||||
pub fn evaluate(&self, filter: &FilterExpression) -> Result<HashSet<String>> {
|
||||
match filter {
|
||||
FilterExpression::Eq { field, value } => self.evaluate_eq(field, value),
|
||||
FilterExpression::Ne { field, value } => self.evaluate_ne(field, value),
|
||||
FilterExpression::Gt { field, value } => self.evaluate_gt(field, value),
|
||||
FilterExpression::Gte { field, value } => self.evaluate_gte(field, value),
|
||||
FilterExpression::Lt { field, value } => self.evaluate_lt(field, value),
|
||||
FilterExpression::Lte { field, value } => self.evaluate_lte(field, value),
|
||||
FilterExpression::Range { field, gte, lte } => {
|
||||
self.evaluate_range(field, gte.as_ref(), lte.as_ref())
|
||||
}
|
||||
FilterExpression::In { field, values } => self.evaluate_in(field, values),
|
||||
FilterExpression::Match { field, text } => self.evaluate_match(field, text),
|
||||
FilterExpression::GeoRadius {
|
||||
field,
|
||||
lat,
|
||||
lon,
|
||||
radius_m,
|
||||
} => self.evaluate_geo_radius(field, *lat, *lon, *radius_m),
|
||||
FilterExpression::GeoBoundingBox {
|
||||
field,
|
||||
top_left,
|
||||
bottom_right,
|
||||
} => self.evaluate_geo_bbox(field, *top_left, *bottom_right),
|
||||
FilterExpression::And(filters) => self.evaluate_and(filters),
|
||||
FilterExpression::Or(filters) => self.evaluate_or(filters),
|
||||
FilterExpression::Not(filter) => self.evaluate_not(filter),
|
||||
FilterExpression::Exists { field } => self.evaluate_exists(field),
|
||||
FilterExpression::IsNull { field } => self.evaluate_is_null(field),
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if a payload matches a filter expression
|
||||
pub fn matches(&self, payload: &Value, filter: &FilterExpression) -> bool {
|
||||
match filter {
|
||||
FilterExpression::Eq { field, value } => {
|
||||
Self::get_field_value(payload, field).map_or(false, |v| v == value)
|
||||
}
|
||||
FilterExpression::Ne { field, value } => {
|
||||
Self::get_field_value(payload, field).map_or(true, |v| v != value)
|
||||
}
|
||||
FilterExpression::Gt { field, value } => Self::get_field_value(payload, field)
|
||||
.map_or(false, |v| {
|
||||
Self::compare_values(v, value) == Some(std::cmp::Ordering::Greater)
|
||||
}),
|
||||
FilterExpression::Gte { field, value } => {
|
||||
Self::get_field_value(payload, field).map_or(false, |v| {
|
||||
matches!(
|
||||
Self::compare_values(v, value),
|
||||
Some(std::cmp::Ordering::Greater | std::cmp::Ordering::Equal)
|
||||
)
|
||||
})
|
||||
}
|
||||
FilterExpression::Lt { field, value } => Self::get_field_value(payload, field)
|
||||
.map_or(false, |v| {
|
||||
Self::compare_values(v, value) == Some(std::cmp::Ordering::Less)
|
||||
}),
|
||||
FilterExpression::Lte { field, value } => {
|
||||
Self::get_field_value(payload, field).map_or(false, |v| {
|
||||
matches!(
|
||||
Self::compare_values(v, value),
|
||||
Some(std::cmp::Ordering::Less | std::cmp::Ordering::Equal)
|
||||
)
|
||||
})
|
||||
}
|
||||
FilterExpression::Range { field, gte, lte } => {
|
||||
if let Some(v) = Self::get_field_value(payload, field) {
|
||||
let gte_match = gte.as_ref().map_or(true, |gte_val| {
|
||||
matches!(
|
||||
Self::compare_values(v, gte_val),
|
||||
Some(std::cmp::Ordering::Greater | std::cmp::Ordering::Equal)
|
||||
)
|
||||
});
|
||||
let lte_match = lte.as_ref().map_or(true, |lte_val| {
|
||||
matches!(
|
||||
Self::compare_values(v, lte_val),
|
||||
Some(std::cmp::Ordering::Less | std::cmp::Ordering::Equal)
|
||||
)
|
||||
});
|
||||
gte_match && lte_match
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
FilterExpression::In { field, values } => {
|
||||
Self::get_field_value(payload, field).map_or(false, |v| values.contains(v))
|
||||
}
|
||||
FilterExpression::Match { field, text } => Self::get_field_value(payload, field)
|
||||
.and_then(|v| v.as_str())
|
||||
.map_or(false, |s| s.to_lowercase().contains(&text.to_lowercase())),
|
||||
FilterExpression::And(filters) => filters.iter().all(|f| self.matches(payload, f)),
|
||||
FilterExpression::Or(filters) => filters.iter().any(|f| self.matches(payload, f)),
|
||||
FilterExpression::Not(filter) => !self.matches(payload, filter),
|
||||
FilterExpression::Exists { field } => Self::get_field_value(payload, field).is_some(),
|
||||
FilterExpression::IsNull { field } => {
|
||||
Self::get_field_value(payload, field).map_or(true, |v| v.is_null())
|
||||
}
|
||||
_ => false, // Geo operations not supported in direct matching
|
||||
}
|
||||
}
|
||||
|
||||
fn evaluate_eq(&self, field: &str, value: &Value) -> Result<HashSet<String>> {
|
||||
let index = self
|
||||
.indices
|
||||
.get_index(field)
|
||||
.ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
|
||||
|
||||
match index {
|
||||
PayloadIndex::Integer(map) => {
|
||||
if let Some(num) = value.as_i64() {
|
||||
Ok(map.get(&num).cloned().unwrap_or_default())
|
||||
} else {
|
||||
Ok(HashSet::new())
|
||||
}
|
||||
}
|
||||
PayloadIndex::Float(map) => {
|
||||
if let Some(num) = value.as_f64() {
|
||||
Ok(map.get(&OrderedFloat(num)).cloned().unwrap_or_default())
|
||||
} else {
|
||||
Ok(HashSet::new())
|
||||
}
|
||||
}
|
||||
PayloadIndex::Keyword(map) => {
|
||||
if let Some(s) = value.as_str() {
|
||||
Ok(map.get(s).cloned().unwrap_or_default())
|
||||
} else {
|
||||
Ok(HashSet::new())
|
||||
}
|
||||
}
|
||||
PayloadIndex::Bool(map) => {
|
||||
if let Some(b) = value.as_bool() {
|
||||
Ok(map.get(&b).cloned().unwrap_or_default())
|
||||
} else {
|
||||
Ok(HashSet::new())
|
||||
}
|
||||
}
|
||||
_ => Err(FilterError::InvalidIndexType(field.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect the IDs indexed under `field` whose value differs from `value`.
///
/// Computed as "every ID in the field's index" minus the equality matches, so
/// IDs that are not indexed under `field` are never returned.
fn evaluate_ne(&self, field: &str, value: &Value) -> Result<HashSet<String>> {
    let equal = self.evaluate_eq(field, value)?;
    let mut remaining = self.get_all_ids_for_field(field)?;
    remaining.retain(|id| !equal.contains(id));
    Ok(remaining)
}
|
||||
|
||||
fn evaluate_gt(&self, field: &str, value: &Value) -> Result<HashSet<String>> {
|
||||
let index = self
|
||||
.indices
|
||||
.get_index(field)
|
||||
.ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
|
||||
|
||||
match index {
|
||||
PayloadIndex::Integer(map) => {
|
||||
if let Some(num) = value.as_i64() {
|
||||
Ok(map
|
||||
.range((num + 1)..)
|
||||
.flat_map(|(_, ids)| ids)
|
||||
.cloned()
|
||||
.collect())
|
||||
} else {
|
||||
Ok(HashSet::new())
|
||||
}
|
||||
}
|
||||
PayloadIndex::Float(map) => {
|
||||
if let Some(num) = value.as_f64() {
|
||||
let threshold = OrderedFloat(num);
|
||||
Ok(map
|
||||
.range(threshold..)
|
||||
.filter(|(k, _)| **k > threshold)
|
||||
.flat_map(|(_, ids)| ids)
|
||||
.cloned()
|
||||
.collect())
|
||||
} else {
|
||||
Ok(HashSet::new())
|
||||
}
|
||||
}
|
||||
_ => Err(FilterError::InvalidIndexType(field.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
fn evaluate_gte(&self, field: &str, value: &Value) -> Result<HashSet<String>> {
|
||||
let index = self
|
||||
.indices
|
||||
.get_index(field)
|
||||
.ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
|
||||
|
||||
match index {
|
||||
PayloadIndex::Integer(map) => {
|
||||
if let Some(num) = value.as_i64() {
|
||||
Ok(map.range(num..).flat_map(|(_, ids)| ids).cloned().collect())
|
||||
} else {
|
||||
Ok(HashSet::new())
|
||||
}
|
||||
}
|
||||
PayloadIndex::Float(map) => {
|
||||
if let Some(num) = value.as_f64() {
|
||||
Ok(map
|
||||
.range(OrderedFloat(num)..)
|
||||
.flat_map(|(_, ids)| ids)
|
||||
.cloned()
|
||||
.collect())
|
||||
} else {
|
||||
Ok(HashSet::new())
|
||||
}
|
||||
}
|
||||
_ => Err(FilterError::InvalidIndexType(field.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
fn evaluate_lt(&self, field: &str, value: &Value) -> Result<HashSet<String>> {
|
||||
let index = self
|
||||
.indices
|
||||
.get_index(field)
|
||||
.ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
|
||||
|
||||
match index {
|
||||
PayloadIndex::Integer(map) => {
|
||||
if let Some(num) = value.as_i64() {
|
||||
Ok(map.range(..num).flat_map(|(_, ids)| ids).cloned().collect())
|
||||
} else {
|
||||
Ok(HashSet::new())
|
||||
}
|
||||
}
|
||||
PayloadIndex::Float(map) => {
|
||||
if let Some(num) = value.as_f64() {
|
||||
Ok(map
|
||||
.range(..OrderedFloat(num))
|
||||
.flat_map(|(_, ids)| ids)
|
||||
.cloned()
|
||||
.collect())
|
||||
} else {
|
||||
Ok(HashSet::new())
|
||||
}
|
||||
}
|
||||
_ => Err(FilterError::InvalidIndexType(field.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
fn evaluate_lte(&self, field: &str, value: &Value) -> Result<HashSet<String>> {
|
||||
let index = self
|
||||
.indices
|
||||
.get_index(field)
|
||||
.ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
|
||||
|
||||
match index {
|
||||
PayloadIndex::Integer(map) => {
|
||||
if let Some(num) = value.as_i64() {
|
||||
Ok(map
|
||||
.range(..=num)
|
||||
.flat_map(|(_, ids)| ids)
|
||||
.cloned()
|
||||
.collect())
|
||||
} else {
|
||||
Ok(HashSet::new())
|
||||
}
|
||||
}
|
||||
PayloadIndex::Float(map) => {
|
||||
if let Some(num) = value.as_f64() {
|
||||
Ok(map
|
||||
.range(..=OrderedFloat(num))
|
||||
.flat_map(|(_, ids)| ids)
|
||||
.cloned()
|
||||
.collect())
|
||||
} else {
|
||||
Ok(HashSet::new())
|
||||
}
|
||||
}
|
||||
_ => Err(FilterError::InvalidIndexType(field.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
fn evaluate_range(
|
||||
&self,
|
||||
field: &str,
|
||||
gte: Option<&Value>,
|
||||
lte: Option<&Value>,
|
||||
) -> Result<HashSet<String>> {
|
||||
let mut result = self.get_all_ids_for_field(field)?;
|
||||
|
||||
if let Some(gte_val) = gte {
|
||||
let gte_results = self.evaluate_gte(field, gte_val)?;
|
||||
result = result.intersection(>e_results).cloned().collect();
|
||||
}
|
||||
|
||||
if let Some(lte_val) = lte {
|
||||
let lte_results = self.evaluate_lte(field, lte_val)?;
|
||||
result = result.intersection(<e_results).cloned().collect();
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Collect the IDs whose indexed value for `field` equals any of `values`.
///
/// Runs one equality lookup per candidate and unions the results. The first
/// failing lookup aborts the evaluation with its error.
fn evaluate_in(&self, field: &str, values: &[Value]) -> Result<HashSet<String>> {
    let per_value: Result<Vec<HashSet<String>>> = values
        .iter()
        .map(|candidate| self.evaluate_eq(field, candidate))
        .collect();

    Ok(per_value?.into_iter().flatten().collect())
}
|
||||
|
||||
fn evaluate_match(&self, field: &str, text: &str) -> Result<HashSet<String>> {
|
||||
let index = self
|
||||
.indices
|
||||
.get_index(field)
|
||||
.ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
|
||||
|
||||
match index {
|
||||
PayloadIndex::Text(map) => {
|
||||
let words: Vec<_> = text.split_whitespace().map(|w| w.to_lowercase()).collect();
|
||||
let mut result = HashSet::new();
|
||||
for word in words {
|
||||
if let Some(ids) = map.get(&word) {
|
||||
result.extend(ids.iter().cloned());
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
_ => Err(FilterError::InvalidIndexType(field.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
fn evaluate_geo_radius(
|
||||
&self,
|
||||
field: &str,
|
||||
lat: f64,
|
||||
lon: f64,
|
||||
radius_m: f64,
|
||||
) -> Result<HashSet<String>> {
|
||||
let index = self
|
||||
.indices
|
||||
.get_index(field)
|
||||
.ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
|
||||
|
||||
match index {
|
||||
PayloadIndex::Geo(points) => {
|
||||
let mut result = HashSet::new();
|
||||
for (id, point_lat, point_lon) in points {
|
||||
let distance = haversine_distance(lat, lon, *point_lat, *point_lon);
|
||||
if distance <= radius_m {
|
||||
result.insert(id.clone());
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
_ => Err(FilterError::InvalidIndexType(field.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
fn evaluate_geo_bbox(
|
||||
&self,
|
||||
field: &str,
|
||||
top_left: (f64, f64),
|
||||
bottom_right: (f64, f64),
|
||||
) -> Result<HashSet<String>> {
|
||||
let index = self
|
||||
.indices
|
||||
.get_index(field)
|
||||
.ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
|
||||
|
||||
match index {
|
||||
PayloadIndex::Geo(points) => {
|
||||
let mut result = HashSet::new();
|
||||
let (north, west) = top_left;
|
||||
let (south, east) = bottom_right;
|
||||
|
||||
for (id, lat, lon) in points {
|
||||
if *lat <= north && *lat >= south && *lon >= west && *lon <= east {
|
||||
result.insert(id.clone());
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
_ => Err(FilterError::InvalidIndexType(field.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
fn evaluate_and(&self, filters: &[FilterExpression]) -> Result<HashSet<String>> {
|
||||
if filters.is_empty() {
|
||||
return Ok(HashSet::new());
|
||||
}
|
||||
|
||||
let mut result = self.evaluate(&filters[0])?;
|
||||
for filter in &filters[1..] {
|
||||
let next = self.evaluate(filter)?;
|
||||
result = result.intersection(&next).cloned().collect();
|
||||
if result.is_empty() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Union the results of all `filters`.
///
/// An empty filter list yields an empty set. The first failing sub-filter
/// aborts the evaluation with its error.
fn evaluate_or(&self, filters: &[FilterExpression]) -> Result<HashSet<String>> {
    filters
        .iter()
        .map(|filter| self.evaluate(filter))
        .collect::<Result<Vec<_>>>()
        .map(|sets| sets.into_iter().flatten().collect())
}
|
||||
|
||||
fn evaluate_not(&self, filter: &FilterExpression) -> Result<HashSet<String>> {
|
||||
let filter_results = self.evaluate(filter)?;
|
||||
let fields = filter.get_fields();
|
||||
let mut all_ids = HashSet::new();
|
||||
|
||||
for field in fields {
|
||||
all_ids.extend(self.get_all_ids_for_field(&field)?);
|
||||
}
|
||||
|
||||
Ok(all_ids.difference(&filter_results).cloned().collect())
|
||||
}
|
||||
|
||||
/// Collect every ID that has an entry in the index of `field`.
fn evaluate_exists(&self, field: &str) -> Result<HashSet<String>> {
    let indexed = self.get_all_ids_for_field(field)?;
    Ok(indexed)
}
|
||||
|
||||
fn evaluate_is_null(&self, _field: &str) -> Result<HashSet<String>> {
|
||||
// This would require tracking null values separately
|
||||
// For now, return empty set
|
||||
Ok(HashSet::new())
|
||||
}
|
||||
|
||||
fn get_all_ids_for_field(&self, field: &str) -> Result<HashSet<String>> {
|
||||
let index = self
|
||||
.indices
|
||||
.get_index(field)
|
||||
.ok_or_else(|| FilterError::IndexNotFound(field.to_string()))?;
|
||||
|
||||
let ids = match index {
|
||||
PayloadIndex::Integer(map) => map.values().flatten().cloned().collect(),
|
||||
PayloadIndex::Float(map) => map.values().flatten().cloned().collect(),
|
||||
PayloadIndex::Keyword(map) => map.values().flatten().cloned().collect(),
|
||||
PayloadIndex::Bool(map) => map.values().flatten().cloned().collect(),
|
||||
PayloadIndex::Geo(points) => points.iter().map(|(id, _, _)| id.clone()).collect(),
|
||||
PayloadIndex::Text(map) => map.values().flatten().cloned().collect(),
|
||||
};
|
||||
|
||||
Ok(ids)
|
||||
}
|
||||
|
||||
fn get_field_value<'b>(payload: &'b Value, field: &str) -> Option<&'b Value> {
|
||||
payload.as_object()?.get(field)
|
||||
}
|
||||
|
||||
fn compare_values(a: &Value, b: &Value) -> Option<std::cmp::Ordering> {
|
||||
match (a, b) {
|
||||
(Value::Number(a), Value::Number(b)) => {
|
||||
let a = a.as_f64()?;
|
||||
let b = b.as_f64()?;
|
||||
a.partial_cmp(&b)
|
||||
}
|
||||
(Value::String(a), Value::String(b)) => Some(a.cmp(b)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate haversine distance between two points in meters.
///
/// All four arguments are in degrees. The Earth is modeled as a sphere of
/// radius 6,371,000 m. NaN inputs propagate to a NaN result.
fn haversine_distance(lat1: f64, lon1: f64, lat2: f64, lon2: f64) -> f64 {
    const EARTH_RADIUS_M: f64 = 6_371_000.0; // Earth's radius in meters

    let lat1_rad = lat1.to_radians();
    let lat2_rad = lat2.to_radians();
    let delta_lat = (lat2 - lat1).to_radians();
    let delta_lon = (lon2 - lon1).to_radians();

    let a = (delta_lat / 2.0).sin().powi(2)
        + lat1_rad.cos() * lat2_rad.cos() * (delta_lon / 2.0).sin().powi(2);
    // Rounding can push `a` marginally above 1.0 for (near-)antipodal points,
    // which would make `(1.0 - a).sqrt()` NaN. `clamp` leaves a NaN `a` as NaN.
    let a = a.clamp(0.0, 1.0);
    let c = 2.0 * a.sqrt().atan2((1.0 - a).sqrt());

    EARTH_RADIUS_M * c
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::index::IndexType;
    use serde_json::json;

    /// Build the expected ID set from string literals.
    fn id_set(ids: &[&str]) -> HashSet<String> {
        ids.iter().map(|id| id.to_string()).collect()
    }

    /// Keyword equality returns every ID indexed under the requested value.
    #[test]
    fn test_eq_filter() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("status", IndexType::Keyword).unwrap();

        manager
            .index_payload("v1", &json!({"status": "active"}))
            .unwrap();
        manager
            .index_payload("v2", &json!({"status": "active"}))
            .unwrap();
        manager
            .index_payload("v3", &json!({"status": "inactive"}))
            .unwrap();

        let evaluator = FilterEvaluator::new(&manager);
        let results = evaluator
            .evaluate(&FilterExpression::eq("status", json!("active")))
            .unwrap();

        assert_eq!(results, id_set(&["v1", "v2"]));
    }

    /// An inclusive integer range keeps both boundary values.
    #[test]
    fn test_range_filter() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("age", IndexType::Integer).unwrap();

        manager.index_payload("v1", &json!({"age": 25})).unwrap();
        manager.index_payload("v2", &json!({"age": 30})).unwrap();
        manager.index_payload("v3", &json!({"age": 35})).unwrap();

        let evaluator = FilterEvaluator::new(&manager);
        let results = evaluator
            .evaluate(&FilterExpression::range(
                "age",
                Some(json!(25)),
                Some(json!(30)),
            ))
            .unwrap();

        assert_eq!(results, id_set(&["v1", "v2"]));
    }

    /// AND intersects the matches of its children across different indices.
    #[test]
    fn test_and_filter() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("age", IndexType::Integer).unwrap();
        manager.create_index("status", IndexType::Keyword).unwrap();

        manager
            .index_payload("v1", &json!({"age": 25, "status": "active"}))
            .unwrap();
        manager
            .index_payload("v2", &json!({"age": 30, "status": "active"}))
            .unwrap();
        manager
            .index_payload("v3", &json!({"age": 25, "status": "inactive"}))
            .unwrap();

        let evaluator = FilterEvaluator::new(&manager);
        let both = FilterExpression::and(vec![
            FilterExpression::eq("age", json!(25)),
            FilterExpression::eq("status", json!("active")),
        ]);
        let results = evaluator.evaluate(&both).unwrap();

        assert_eq!(results, id_set(&["v1"]));
    }

    /// Direct payload matching works without any index being registered.
    #[test]
    fn test_matches_payload() {
        let manager = PayloadIndexManager::new();
        let evaluator = FilterEvaluator::new(&manager);

        let payload = json!({
            "age": 25,
            "status": "active",
            "name": "Alice"
        });

        let age_25 = FilterExpression::eq("age", json!(25));
        let status_active = FilterExpression::eq("status", json!("active"));
        let age_30 = FilterExpression::eq("age", json!(30));

        assert!(evaluator.matches(&payload, &age_25));
        assert!(evaluator.matches(&payload, &status_active));
        assert!(!evaluator.matches(&payload, &age_30));
    }

    /// Sanity-check the haversine helper against a well-known distance.
    #[test]
    fn test_haversine_distance() {
        // New York to Los Angeles (approx 3935 km)
        let expected_m = 3_935_000.0;
        let distance = haversine_distance(40.7128, -74.0060, 34.0522, -118.2437);
        assert!((distance - expected_m).abs() < 50_000.0); // Within 50km tolerance
    }
}
|
||||
282
crates/ruvector-filter/src/expression.rs
Normal file
282
crates/ruvector-filter/src/expression.rs
Normal file
@@ -0,0 +1,282 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
/// Filter expression for querying vectors by payload
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(tag = "type", rename_all = "snake_case")]
|
||||
pub enum FilterExpression {
|
||||
// Comparison operators
|
||||
Eq {
|
||||
field: String,
|
||||
value: Value,
|
||||
},
|
||||
Ne {
|
||||
field: String,
|
||||
value: Value,
|
||||
},
|
||||
Gt {
|
||||
field: String,
|
||||
value: Value,
|
||||
},
|
||||
Gte {
|
||||
field: String,
|
||||
value: Value,
|
||||
},
|
||||
Lt {
|
||||
field: String,
|
||||
value: Value,
|
||||
},
|
||||
Lte {
|
||||
field: String,
|
||||
value: Value,
|
||||
},
|
||||
|
||||
// Range
|
||||
Range {
|
||||
field: String,
|
||||
gte: Option<Value>,
|
||||
lte: Option<Value>,
|
||||
},
|
||||
|
||||
// Array operations
|
||||
In {
|
||||
field: String,
|
||||
values: Vec<Value>,
|
||||
},
|
||||
|
||||
// Text matching
|
||||
Match {
|
||||
field: String,
|
||||
text: String,
|
||||
},
|
||||
|
||||
// Geo operations (basic)
|
||||
GeoRadius {
|
||||
field: String,
|
||||
lat: f64,
|
||||
lon: f64,
|
||||
radius_m: f64,
|
||||
},
|
||||
GeoBoundingBox {
|
||||
field: String,
|
||||
top_left: (f64, f64),
|
||||
bottom_right: (f64, f64),
|
||||
},
|
||||
|
||||
// Logical operators
|
||||
And(Vec<FilterExpression>),
|
||||
Or(Vec<FilterExpression>),
|
||||
Not(Box<FilterExpression>),
|
||||
|
||||
// Existence check
|
||||
Exists {
|
||||
field: String,
|
||||
},
|
||||
IsNull {
|
||||
field: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl FilterExpression {
|
||||
/// Create an equality filter
|
||||
pub fn eq(field: impl Into<String>, value: Value) -> Self {
|
||||
Self::Eq {
|
||||
field: field.into(),
|
||||
value,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a not-equal filter
|
||||
pub fn ne(field: impl Into<String>, value: Value) -> Self {
|
||||
Self::Ne {
|
||||
field: field.into(),
|
||||
value,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a greater-than filter
|
||||
pub fn gt(field: impl Into<String>, value: Value) -> Self {
|
||||
Self::Gt {
|
||||
field: field.into(),
|
||||
value,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a greater-than-or-equal filter
|
||||
pub fn gte(field: impl Into<String>, value: Value) -> Self {
|
||||
Self::Gte {
|
||||
field: field.into(),
|
||||
value,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a less-than filter
|
||||
pub fn lt(field: impl Into<String>, value: Value) -> Self {
|
||||
Self::Lt {
|
||||
field: field.into(),
|
||||
value,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a less-than-or-equal filter
|
||||
pub fn lte(field: impl Into<String>, value: Value) -> Self {
|
||||
Self::Lte {
|
||||
field: field.into(),
|
||||
value,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a range filter
|
||||
pub fn range(field: impl Into<String>, gte: Option<Value>, lte: Option<Value>) -> Self {
|
||||
Self::Range {
|
||||
field: field.into(),
|
||||
gte,
|
||||
lte,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create an IN filter
|
||||
pub fn in_values(field: impl Into<String>, values: Vec<Value>) -> Self {
|
||||
Self::In {
|
||||
field: field.into(),
|
||||
values,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a text match filter
|
||||
pub fn match_text(field: impl Into<String>, text: impl Into<String>) -> Self {
|
||||
Self::Match {
|
||||
field: field.into(),
|
||||
text: text.into(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a geo radius filter
|
||||
pub fn geo_radius(field: impl Into<String>, lat: f64, lon: f64, radius_m: f64) -> Self {
|
||||
Self::GeoRadius {
|
||||
field: field.into(),
|
||||
lat,
|
||||
lon,
|
||||
radius_m,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a geo bounding box filter
|
||||
pub fn geo_bounding_box(
|
||||
field: impl Into<String>,
|
||||
top_left: (f64, f64),
|
||||
bottom_right: (f64, f64),
|
||||
) -> Self {
|
||||
Self::GeoBoundingBox {
|
||||
field: field.into(),
|
||||
top_left,
|
||||
bottom_right,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create an AND filter
|
||||
pub fn and(filters: Vec<FilterExpression>) -> Self {
|
||||
Self::And(filters)
|
||||
}
|
||||
|
||||
/// Create an OR filter
|
||||
pub fn or(filters: Vec<FilterExpression>) -> Self {
|
||||
Self::Or(filters)
|
||||
}
|
||||
|
||||
/// Create a NOT filter
|
||||
pub fn not(filter: FilterExpression) -> Self {
|
||||
Self::Not(Box::new(filter))
|
||||
}
|
||||
|
||||
/// Create an EXISTS filter
|
||||
pub fn exists(field: impl Into<String>) -> Self {
|
||||
Self::Exists {
|
||||
field: field.into(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create an IS NULL filter
|
||||
pub fn is_null(field: impl Into<String>) -> Self {
|
||||
Self::IsNull {
|
||||
field: field.into(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get all field names referenced in this expression
|
||||
pub fn get_fields(&self) -> Vec<String> {
|
||||
let mut fields = Vec::new();
|
||||
self.collect_fields(&mut fields);
|
||||
fields.sort();
|
||||
fields.dedup();
|
||||
fields
|
||||
}
|
||||
|
||||
fn collect_fields(&self, fields: &mut Vec<String>) {
|
||||
match self {
|
||||
Self::Eq { field, .. }
|
||||
| Self::Ne { field, .. }
|
||||
| Self::Gt { field, .. }
|
||||
| Self::Gte { field, .. }
|
||||
| Self::Lt { field, .. }
|
||||
| Self::Lte { field, .. }
|
||||
| Self::Range { field, .. }
|
||||
| Self::In { field, .. }
|
||||
| Self::Match { field, .. }
|
||||
| Self::GeoRadius { field, .. }
|
||||
| Self::GeoBoundingBox { field, .. }
|
||||
| Self::Exists { field }
|
||||
| Self::IsNull { field } => {
|
||||
fields.push(field.clone());
|
||||
}
|
||||
Self::And(exprs) | Self::Or(exprs) => {
|
||||
for expr in exprs {
|
||||
expr.collect_fields(fields);
|
||||
}
|
||||
}
|
||||
Self::Not(expr) => {
|
||||
expr.collect_fields(fields);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Builder helpers produce the variant they are named after.
    #[test]
    fn test_filter_builders() {
        let single = FilterExpression::eq("status", json!("active"));
        assert!(matches!(single, FilterExpression::Eq { .. }));

        let clauses = vec![
            FilterExpression::eq("status", json!("active")),
            FilterExpression::gte("age", json!(18)),
        ];
        let combined = FilterExpression::and(clauses);
        assert!(matches!(combined, FilterExpression::And(_)));
    }

    /// `get_fields` walks nested expressions and returns sorted, unique names.
    #[test]
    fn test_get_fields() {
        let nested = FilterExpression::or(vec![
            FilterExpression::gte("age", json!(18)),
            FilterExpression::lt("score", json!(100)),
        ]);
        let filter = FilterExpression::and(vec![
            FilterExpression::eq("status", json!("active")),
            nested,
        ]);

        let fields = filter.get_fields();
        assert_eq!(fields, vec!["age", "score", "status"]);
    }

    /// A leaf filter survives a JSON round trip.
    #[test]
    fn test_serialization() {
        let original = FilterExpression::eq("status", json!("active"));
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: FilterExpression = serde_json::from_str(&encoded).unwrap();
        assert!(matches!(decoded, FilterExpression::Eq { .. }));
    }
}
|
||||
380
crates/ruvector-filter/src/index.rs
Normal file
380
crates/ruvector-filter/src/index.rs
Normal file
@@ -0,0 +1,380 @@
|
||||
use crate::error::{FilterError, Result};
|
||||
use ordered_float::OrderedFloat;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
|
||||
/// Type of payload index
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum IndexType {
|
||||
Integer,
|
||||
Float,
|
||||
Keyword,
|
||||
Bool,
|
||||
Geo,
|
||||
Text,
|
||||
}
|
||||
|
||||
/// Payload index for efficient filtering
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum PayloadIndex {
|
||||
Integer(BTreeMap<i64, HashSet<String>>),
|
||||
Float(BTreeMap<OrderedFloat<f64>, HashSet<String>>),
|
||||
Keyword(HashMap<String, HashSet<String>>),
|
||||
Bool(HashMap<bool, HashSet<String>>),
|
||||
Geo(Vec<(String, f64, f64)>), // vector_id, lat, lon
|
||||
Text(HashMap<String, HashSet<String>>), // Simple text index (word -> vector_ids)
|
||||
}
|
||||
|
||||
impl PayloadIndex {
|
||||
/// Create a new index of the given type
|
||||
pub fn new(index_type: IndexType) -> Self {
|
||||
match index_type {
|
||||
IndexType::Integer => Self::Integer(BTreeMap::new()),
|
||||
IndexType::Float => Self::Float(BTreeMap::new()),
|
||||
IndexType::Keyword => Self::Keyword(HashMap::new()),
|
||||
IndexType::Bool => Self::Bool(HashMap::new()),
|
||||
IndexType::Geo => Self::Geo(Vec::new()),
|
||||
IndexType::Text => Self::Text(HashMap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the index type
|
||||
pub fn index_type(&self) -> IndexType {
|
||||
match self {
|
||||
Self::Integer(_) => IndexType::Integer,
|
||||
Self::Float(_) => IndexType::Float,
|
||||
Self::Keyword(_) => IndexType::Keyword,
|
||||
Self::Bool(_) => IndexType::Bool,
|
||||
Self::Geo(_) => IndexType::Geo,
|
||||
Self::Text(_) => IndexType::Text,
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a value to the index
|
||||
pub fn add(&mut self, vector_id: &str, value: &Value) -> Result<()> {
|
||||
match self {
|
||||
Self::Integer(index) => {
|
||||
if let Some(num) = value.as_i64() {
|
||||
index
|
||||
.entry(num)
|
||||
.or_insert_with(HashSet::new)
|
||||
.insert(vector_id.to_string());
|
||||
}
|
||||
}
|
||||
Self::Float(index) => {
|
||||
if let Some(num) = value.as_f64() {
|
||||
index
|
||||
.entry(OrderedFloat(num))
|
||||
.or_insert_with(HashSet::new)
|
||||
.insert(vector_id.to_string());
|
||||
}
|
||||
}
|
||||
Self::Keyword(index) => {
|
||||
if let Some(s) = value.as_str() {
|
||||
index
|
||||
.entry(s.to_string())
|
||||
.or_insert_with(HashSet::new)
|
||||
.insert(vector_id.to_string());
|
||||
}
|
||||
}
|
||||
Self::Bool(index) => {
|
||||
if let Some(b) = value.as_bool() {
|
||||
index
|
||||
.entry(b)
|
||||
.or_insert_with(HashSet::new)
|
||||
.insert(vector_id.to_string());
|
||||
}
|
||||
}
|
||||
Self::Geo(index) => {
|
||||
if let Some(obj) = value.as_object() {
|
||||
if let (Some(lat), Some(lon)) = (
|
||||
obj.get("lat").and_then(|v| v.as_f64()),
|
||||
obj.get("lon").and_then(|v| v.as_f64()),
|
||||
) {
|
||||
index.push((vector_id.to_string(), lat, lon));
|
||||
}
|
||||
}
|
||||
}
|
||||
Self::Text(index) => {
|
||||
if let Some(text) = value.as_str() {
|
||||
// Simple word tokenization
|
||||
for word in text.split_whitespace() {
|
||||
let word = word.to_lowercase();
|
||||
index
|
||||
.entry(word)
|
||||
.or_insert_with(HashSet::new)
|
||||
.insert(vector_id.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remove a vector from the index
|
||||
pub fn remove(&mut self, vector_id: &str, value: &Value) -> Result<()> {
|
||||
match self {
|
||||
Self::Integer(index) => {
|
||||
if let Some(num) = value.as_i64() {
|
||||
if let Some(set) = index.get_mut(&num) {
|
||||
set.remove(vector_id);
|
||||
if set.is_empty() {
|
||||
index.remove(&num);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Self::Float(index) => {
|
||||
if let Some(num) = value.as_f64() {
|
||||
if let Some(set) = index.get_mut(&OrderedFloat(num)) {
|
||||
set.remove(vector_id);
|
||||
if set.is_empty() {
|
||||
index.remove(&OrderedFloat(num));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Self::Keyword(index) => {
|
||||
if let Some(s) = value.as_str() {
|
||||
if let Some(set) = index.get_mut(s) {
|
||||
set.remove(vector_id);
|
||||
if set.is_empty() {
|
||||
index.remove(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Self::Bool(index) => {
|
||||
if let Some(b) = value.as_bool() {
|
||||
if let Some(set) = index.get_mut(&b) {
|
||||
set.remove(vector_id);
|
||||
if set.is_empty() {
|
||||
index.remove(&b);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Self::Geo(index) => {
|
||||
index.retain(|(id, _, _)| id != vector_id);
|
||||
}
|
||||
Self::Text(index) => {
|
||||
if let Some(text) = value.as_str() {
|
||||
for word in text.split_whitespace() {
|
||||
let word = word.to_lowercase();
|
||||
if let Some(set) = index.get_mut(&word) {
|
||||
set.remove(vector_id);
|
||||
if set.is_empty() {
|
||||
index.remove(&word);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clear all entries for a vector ID
|
||||
pub fn clear(&mut self, vector_id: &str) {
|
||||
match self {
|
||||
Self::Integer(index) => {
|
||||
for set in index.values_mut() {
|
||||
set.remove(vector_id);
|
||||
}
|
||||
index.retain(|_, set| !set.is_empty());
|
||||
}
|
||||
Self::Float(index) => {
|
||||
for set in index.values_mut() {
|
||||
set.remove(vector_id);
|
||||
}
|
||||
index.retain(|_, set| !set.is_empty());
|
||||
}
|
||||
Self::Keyword(index) => {
|
||||
for set in index.values_mut() {
|
||||
set.remove(vector_id);
|
||||
}
|
||||
index.retain(|_, set| !set.is_empty());
|
||||
}
|
||||
Self::Bool(index) => {
|
||||
for set in index.values_mut() {
|
||||
set.remove(vector_id);
|
||||
}
|
||||
index.retain(|_, set| !set.is_empty());
|
||||
}
|
||||
Self::Geo(index) => {
|
||||
index.retain(|(id, _, _)| id != vector_id);
|
||||
}
|
||||
Self::Text(index) => {
|
||||
for set in index.values_mut() {
|
||||
set.remove(vector_id);
|
||||
}
|
||||
index.retain(|_, set| !set.is_empty());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Manager for payload indices
|
||||
#[derive(Debug, Default)]
|
||||
pub struct PayloadIndexManager {
|
||||
indices: HashMap<String, PayloadIndex>,
|
||||
}
|
||||
|
||||
impl PayloadIndexManager {
|
||||
/// Create a new payload index manager
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
indices: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create an index on a field
|
||||
pub fn create_index(&mut self, field: &str, index_type: IndexType) -> Result<()> {
|
||||
if self.indices.contains_key(field) {
|
||||
return Err(FilterError::InvalidExpression(format!(
|
||||
"Index already exists for field: {}",
|
||||
field
|
||||
)));
|
||||
}
|
||||
self.indices
|
||||
.insert(field.to_string(), PayloadIndex::new(index_type));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Drop an index
|
||||
pub fn drop_index(&mut self, field: &str) -> Result<()> {
|
||||
if self.indices.remove(field).is_none() {
|
||||
return Err(FilterError::IndexNotFound(field.to_string()));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check if an index exists for a field
|
||||
pub fn has_index(&self, field: &str) -> bool {
|
||||
self.indices.contains_key(field)
|
||||
}
|
||||
|
||||
/// Get an index by field name
|
||||
pub fn get_index(&self, field: &str) -> Option<&PayloadIndex> {
|
||||
self.indices.get(field)
|
||||
}
|
||||
|
||||
/// Get a mutable index by field name
|
||||
pub fn get_index_mut(&mut self, field: &str) -> Option<&mut PayloadIndex> {
|
||||
self.indices.get_mut(field)
|
||||
}
|
||||
|
||||
/// Index a payload for a vector
|
||||
pub fn index_payload(&mut self, vector_id: &str, payload: &Value) -> Result<()> {
|
||||
if let Some(obj) = payload.as_object() {
|
||||
for (field, value) in obj {
|
||||
if let Some(index) = self.indices.get_mut(field) {
|
||||
index.add(vector_id, value)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remove a payload from all indices
|
||||
pub fn remove_payload(&mut self, vector_id: &str, payload: &Value) -> Result<()> {
|
||||
if let Some(obj) = payload.as_object() {
|
||||
for (field, value) in obj {
|
||||
if let Some(index) = self.indices.get_mut(field) {
|
||||
index.remove(vector_id, value)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clear all entries for a vector ID from all indices
|
||||
pub fn clear_vector(&mut self, vector_id: &str) {
|
||||
for index in self.indices.values_mut() {
|
||||
index.clear(vector_id);
|
||||
}
|
||||
}
|
||||
|
||||
/// Get all indexed fields
|
||||
pub fn indexed_fields(&self) -> Vec<String> {
|
||||
self.indices.keys().cloned().collect()
|
||||
}
|
||||
|
||||
/// Get the number of indices
|
||||
pub fn index_count(&self) -> usize {
|
||||
self.indices.len()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Ids sharing an integer value land in the same bucket.
    #[test]
    fn test_integer_index() {
        let mut index = PayloadIndex::new(IndexType::Integer);
        for (id, number) in vec![("v1", 42), ("v2", 42), ("v3", 100)] {
            index.add(id, &json!(number)).unwrap();
        }

        match index {
            PayloadIndex::Integer(map) => {
                assert_eq!(map.get(&42).unwrap().len(), 2);
                assert_eq!(map.get(&100).unwrap().len(), 1);
            }
            _ => panic!("Wrong index type"),
        }
    }

    /// Ids sharing a keyword land in the same bucket.
    #[test]
    fn test_keyword_index() {
        let mut index = PayloadIndex::new(IndexType::Keyword);
        for (id, status) in vec![("v1", "active"), ("v2", "active"), ("v3", "inactive")] {
            index.add(id, &json!(status)).unwrap();
        }

        match index {
            PayloadIndex::Keyword(map) => {
                assert_eq!(map.get("active").unwrap().len(), 2);
                assert_eq!(map.get("inactive").unwrap().len(), 1);
            }
            _ => panic!("Wrong index type"),
        }
    }

    /// Indexing a payload does not create indices for unregistered fields.
    #[test]
    fn test_index_manager() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("age", IndexType::Integer).unwrap();
        manager.create_index("status", IndexType::Keyword).unwrap();

        let payload = json!({
            "age": 25,
            "status": "active",
            "name": "Alice"
        });
        manager.index_payload("v1", &payload).unwrap();

        assert!(manager.has_index("age"));
        assert!(manager.has_index("status"));
        assert!(!manager.has_index("name"));
    }

    /// Each well-formed geo payload adds one point.
    #[test]
    fn test_geo_index() {
        let mut index = PayloadIndex::new(IndexType::Geo);
        let new_york = json!({"lat": 40.7128, "lon": -74.0060});
        let los_angeles = json!({"lat": 34.0522, "lon": -118.2437});
        index.add("v1", &new_york).unwrap();
        index.add("v2", &los_angeles).unwrap();

        match index {
            PayloadIndex::Geo(points) => assert_eq!(points.len(), 2),
            _ => panic!("Wrong index type"),
        }
    }
}
|
||||
215
crates/ruvector-filter/src/lib.rs
Normal file
215
crates/ruvector-filter/src/lib.rs
Normal file
@@ -0,0 +1,215 @@
|
||||
#![recursion_limit = "2048"]
|
||||
|
||||
//! # rUvector Filter
//!
//! Advanced payload indexing and filtering for rUvector.
//!
//! This crate provides:
//! - Flexible filter expressions (equality, range, geo, text, logical operators)
//! - Efficient payload indexing (integer, float, keyword, boolean, geo, text)
//! - Fast filter evaluation using indices
//! - Support for complex queries with AND/OR/NOT
//!
//! ## Examples
//!
//! ### Creating and Using Filters
//!
//! ```rust
//! use ruvector_filter::{FilterExpression, PayloadIndexManager, FilterEvaluator, IndexType};
//! use serde_json::json;
//!
//! // Create index manager
//! let mut manager = PayloadIndexManager::new();
//! manager.create_index("status", IndexType::Keyword).unwrap();
//! manager.create_index("age", IndexType::Integer).unwrap();
//!
//! // Index some payloads
//! manager.index_payload("v1", &json!({"status": "active", "age": 25})).unwrap();
//! manager.index_payload("v2", &json!({"status": "active", "age": 30})).unwrap();
//! manager.index_payload("v3", &json!({"status": "inactive", "age": 25})).unwrap();
//!
//! // Create filter
//! let filter = FilterExpression::and(vec![
//!     FilterExpression::eq("status", json!("active")),
//!     FilterExpression::gte("age", json!(25)),
//! ]);
//!
//! // Evaluate filter
//! let evaluator = FilterEvaluator::new(&manager);
//! let results = evaluator.evaluate(&filter).unwrap();
//! assert_eq!(results.len(), 2);
//! ```
//!
//! ### Geo Filtering
//!
//! ```rust
//! use ruvector_filter::{FilterExpression, PayloadIndexManager, FilterEvaluator, IndexType};
//! use serde_json::json;
//!
//! let mut manager = PayloadIndexManager::new();
//! manager.create_index("location", IndexType::Geo).unwrap();
//!
//! manager.index_payload("v1", &json!({
//!     "location": {"lat": 40.7128, "lon": -74.0060}
//! })).unwrap();
//!
//! // Find all points within 1000m of a location
//! let filter = FilterExpression::geo_radius("location", 40.7128, -74.0060, 1000.0);
//! let evaluator = FilterEvaluator::new(&manager);
//! let results = evaluator.evaluate(&filter).unwrap();
//! ```
|
||||
|
||||
pub mod error;
|
||||
pub mod evaluator;
|
||||
pub mod expression;
|
||||
pub mod index;
|
||||
|
||||
// Re-export main types
|
||||
pub use error::{FilterError, Result};
|
||||
pub use evaluator::FilterEvaluator;
|
||||
pub use expression::FilterExpression;
|
||||
pub use index::{IndexType, PayloadIndex, PayloadIndexManager};
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// End-to-end: build indices, index payloads, evaluate a nested AND/OR filter.
    #[test]
    fn test_full_workflow() {
        // Create index manager
        let mut manager = PayloadIndexManager::new();
        manager.create_index("status", IndexType::Keyword).unwrap();
        manager.create_index("age", IndexType::Integer).unwrap();
        manager.create_index("score", IndexType::Float).unwrap();

        // Index payloads (same order as the ids are listed)
        let records = vec![
            ("v1", json!({"status": "active", "age": 25, "score": 0.9})),
            ("v2", json!({"status": "active", "age": 30, "score": 0.85})),
            ("v3", json!({"status": "inactive", "age": 25, "score": 0.7})),
        ];
        for (id, payload) in records {
            manager.index_payload(id, &payload).unwrap();
        }

        // Create complex filter: active AND (age >= 30 OR score >= 0.9)
        let age_or_score = FilterExpression::or(vec![
            FilterExpression::gte("age", json!(30)),
            FilterExpression::gte("score", json!(0.9)),
        ]);
        let filter = FilterExpression::and(vec![
            FilterExpression::eq("status", json!("active")),
            age_or_score,
        ]);

        // Evaluate
        let evaluator = FilterEvaluator::new(&manager);
        let results = evaluator.evaluate(&filter).unwrap();

        // Should match v1 (age=25, score=0.9) and v2 (age=30, score=0.85)
        assert_eq!(results.len(), 2);
        assert!(results.contains("v1"));
        assert!(results.contains("v2"));
    }

    /// A text filter matches only the payload containing the word.
    #[test]
    fn test_text_matching() {
        let mut manager = PayloadIndexManager::new();
        manager
            .create_index("description", IndexType::Text)
            .unwrap();

        let records = vec![
            ("v1", json!({"description": "The quick brown fox"})),
            ("v2", json!({"description": "The lazy dog"})),
        ];
        for (id, payload) in records {
            manager.index_payload(id, &payload).unwrap();
        }

        let evaluator = FilterEvaluator::new(&manager);
        let filter = FilterExpression::match_text("description", "quick");
        let results = evaluator.evaluate(&filter).unwrap();

        assert_eq!(results.len(), 1);
        assert!(results.contains("v1"));
    }

    /// NOT inverts an equality filter over the indexed vectors.
    #[test]
    fn test_not_filter() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("status", IndexType::Keyword).unwrap();

        for (id, status) in vec![("v1", "active"), ("v2", "inactive")] {
            manager
                .index_payload(id, &json!({ "status": status }))
                .unwrap();
        }

        let evaluator = FilterEvaluator::new(&manager);
        let is_active = FilterExpression::eq("status", json!("active"));
        let filter = FilterExpression::not(is_active);
        let results = evaluator.evaluate(&filter).unwrap();

        assert_eq!(results.len(), 1);
        assert!(results.contains("v2"));
    }

    /// IN matches any vector whose value is among the candidates.
    #[test]
    fn test_in_filter() {
        let mut manager = PayloadIndexManager::new();
        manager.create_index("status", IndexType::Keyword).unwrap();

        for (id, status) in vec![("v1", "active"), ("v2", "pending"), ("v3", "inactive")] {
            manager
                .index_payload(id, &json!({ "status": status }))
                .unwrap();
        }

        let evaluator = FilterEvaluator::new(&manager);
        let candidates = vec![json!("active"), json!("pending")];
        let filter = FilterExpression::in_values("status", candidates);
        let results = evaluator.evaluate(&filter).unwrap();

        assert_eq!(results.len(), 2);
        assert!(results.contains("v1"));
        assert!(results.contains("v2"));
    }
}
|
||||
Reference in New Issue
Block a user