Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
518
vendor/ruvector/examples/data/openalex/src/frontier.rs
vendored
Normal file
518
vendor/ruvector/examples/data/openalex/src/frontier.rs
vendored
Normal file
@@ -0,0 +1,518 @@
|
||||
//! Research frontier detection using coherence signals
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{TopicEdge, TopicGraph, TopicNode, Work};
|
||||
|
||||
/// An emerging research frontier: a topic whose work count, local
/// coherence, and citation rate are shifting enough to suggest a new field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmergingFrontier {
    /// Frontier identifier (e.g. "frontier_0", assigned in detection order).
    pub id: String,

    /// Primary topic name.
    pub name: String,

    /// Topics sharing an edge with the primary topic (ids/names as stored
    /// on the graph edges).
    pub related_topics: Vec<String>,

    /// Growth rate, taken from the topic node's own `growth_rate` field.
    pub growth_rate: f64,

    /// Change in local coherence between consecutive snapshots
    /// (positive = more coherent in the newer snapshot).
    pub coherence_delta: f64,

    /// Citation momentum: change in average citations between snapshots.
    pub citation_momentum: f64,

    /// Topics at the frontier edge (neighbours of the primary topic).
    pub boundary_topics: Vec<String>,

    /// Timestamp of the snapshot in which the frontier was detected.
    pub detected_at: DateTime<Utc>,

    /// Confidence score in [0, 1]; detections below 0.3 are discarded.
    pub confidence: f64,

    /// Evidence records supporting this frontier.
    pub evidence: Vec<FrontierEvidence>,
}
|
||||
|
||||
/// A single piece of evidence for a frontier detection.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FrontierEvidence {
    /// Evidence kind: "growth_rate", "coherence_delta", or
    /// "citation_momentum".
    pub evidence_type: String,

    /// Raw metric value behind the evidence.
    pub value: f64,

    /// Human-readable explanation of the metric.
    pub explanation: String,
}
|
||||
|
||||
/// A cross-domain bridge connecting two research areas.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossDomainBridge {
    /// Bridge identifier (e.g. "bridge_0", assigned in detection order).
    pub id: String,

    /// Source domain/topic (first word of the topic name — see `get_domain`).
    pub source_domain: String,

    /// Target domain/topic.
    pub target_domain: String,

    /// Bridge topics (distinct endpoint topics of the cross-domain edges).
    pub bridge_topics: Vec<String>,

    /// Citation flow (source → target): summed edge weights.
    pub citation_flow: f64,

    /// Reverse flow (target → source).
    /// NOTE(review): `detect_bridges` currently always sets this to 0.0.
    pub reverse_flow: f64,

    /// Bridge strength: total flow divided by total citation count.
    pub strength: f64,

    /// True when flow grew >50% versus the previous snapshot (or when only
    /// one snapshot exists).
    pub is_emerging: bool,

    /// Timestamp of the snapshot in which the bridge was observed.
    pub first_observed: DateTime<Utc>,

    /// Key papers establishing the bridge.
    /// NOTE(review): `detect_bridges` currently always leaves this empty.
    pub key_works: Vec<String>,
}
|
||||
|
||||
/// Research frontier radar for detecting emerging fields.
///
/// Accumulates time-ordered topic-graph snapshots, then compares them to
/// surface emerging frontiers and cross-domain bridges.
pub struct FrontierRadar {
    /// Topic graph snapshots, kept sorted by timestamp (see `add_snapshot`).
    snapshots: Vec<(DateTime<Utc>, TopicGraph)>,

    /// Minimum relative work-count growth for a topic to be considered.
    min_growth_rate: f64,

    /// Minimum absolute coherence change required to flag a shift.
    min_coherence_shift: f64,

    /// Frontiers cached by the last `detect_frontiers` call.
    frontiers: Vec<EmergingFrontier>,

    /// Bridges cached by the last `detect_bridges` call.
    bridges: Vec<CrossDomainBridge>,
}
|
||||
|
||||
impl FrontierRadar {
|
||||
/// Create a new frontier radar
|
||||
pub fn new(min_growth_rate: f64, min_coherence_shift: f64) -> Self {
|
||||
Self {
|
||||
snapshots: Vec::new(),
|
||||
min_growth_rate,
|
||||
min_coherence_shift,
|
||||
frontiers: Vec::new(),
|
||||
bridges: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a topic graph snapshot
|
||||
pub fn add_snapshot(&mut self, timestamp: DateTime<Utc>, graph: TopicGraph) {
|
||||
self.snapshots.push((timestamp, graph));
|
||||
self.snapshots.sort_by_key(|(ts, _)| *ts);
|
||||
}
|
||||
|
||||
/// Build snapshots from works partitioned by time
|
||||
pub fn build_from_works(&mut self, works: &[Work], window_days: i64) {
|
||||
if works.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Find time range
|
||||
let mut min_date = Utc::now();
|
||||
let mut max_date = DateTime::<Utc>::MIN_UTC;
|
||||
|
||||
for work in works {
|
||||
if let Some(date) = work.publication_date {
|
||||
if date < min_date {
|
||||
min_date = date;
|
||||
}
|
||||
if date > max_date {
|
||||
max_date = date;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Partition works into time windows
|
||||
let window_duration = chrono::Duration::days(window_days);
|
||||
let mut current_start = min_date;
|
||||
|
||||
while current_start < max_date {
|
||||
let current_end = current_start + window_duration;
|
||||
|
||||
let window_works: Vec<_> = works
|
||||
.iter()
|
||||
.filter(|w| {
|
||||
w.publication_date
|
||||
.map(|d| d >= current_start && d < current_end)
|
||||
.unwrap_or(false)
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
if !window_works.is_empty() {
|
||||
let graph = TopicGraph::from_works(&window_works);
|
||||
self.add_snapshot(current_start, graph);
|
||||
}
|
||||
|
||||
current_start = current_end;
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect emerging frontiers from snapshots
|
||||
pub fn detect_frontiers(&mut self) -> Vec<EmergingFrontier> {
|
||||
if self.snapshots.len() < 2 {
|
||||
return vec![];
|
||||
}
|
||||
|
||||
let mut frontiers = Vec::new();
|
||||
let mut frontier_counter = 0;
|
||||
|
||||
// Compare consecutive snapshots
|
||||
for i in 1..self.snapshots.len() {
|
||||
let (prev_ts, prev_graph) = &self.snapshots[i - 1];
|
||||
let (curr_ts, curr_graph) = &self.snapshots[i];
|
||||
|
||||
// Find topics with significant growth
|
||||
for (topic_id, curr_node) in &curr_graph.topics {
|
||||
let prev_node = prev_graph.topics.get(topic_id);
|
||||
|
||||
let growth = if let Some(prev) = prev_node {
|
||||
if prev.work_count > 0 {
|
||||
(curr_node.work_count as f64 - prev.work_count as f64)
|
||||
/ prev.work_count as f64
|
||||
} else {
|
||||
f64::INFINITY
|
||||
}
|
||||
} else {
|
||||
// New topic
|
||||
f64::INFINITY
|
||||
};
|
||||
|
||||
if growth > self.min_growth_rate {
|
||||
// Calculate coherence shift
|
||||
let coherence_delta = self.compute_topic_coherence_delta(
|
||||
topic_id,
|
||||
prev_graph,
|
||||
curr_graph,
|
||||
);
|
||||
|
||||
if coherence_delta.abs() > self.min_coherence_shift {
|
||||
// Calculate citation momentum
|
||||
let citation_momentum = curr_node.avg_citations
|
||||
- prev_node.map(|n| n.avg_citations).unwrap_or(0.0);
|
||||
|
||||
// Find boundary topics
|
||||
let boundary_topics = self.find_boundary_topics(topic_id, curr_graph);
|
||||
|
||||
// Build evidence
|
||||
let mut evidence = vec![
|
||||
FrontierEvidence {
|
||||
evidence_type: "growth_rate".to_string(),
|
||||
value: growth,
|
||||
explanation: format!(
|
||||
"{:.0}% increase in works",
|
||||
growth * 100.0
|
||||
),
|
||||
},
|
||||
FrontierEvidence {
|
||||
evidence_type: "coherence_delta".to_string(),
|
||||
value: coherence_delta,
|
||||
explanation: format!(
|
||||
"Coherence {} by {:.2}",
|
||||
if coherence_delta > 0.0 {
|
||||
"increased"
|
||||
} else {
|
||||
"decreased"
|
||||
},
|
||||
coherence_delta.abs()
|
||||
),
|
||||
},
|
||||
];
|
||||
|
||||
if citation_momentum > 0.0 {
|
||||
evidence.push(FrontierEvidence {
|
||||
evidence_type: "citation_momentum".to_string(),
|
||||
value: citation_momentum,
|
||||
explanation: format!(
|
||||
"+{:.1} avg citations",
|
||||
citation_momentum
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
// Calculate confidence based on evidence strength
|
||||
let confidence = self.calculate_confidence(growth, coherence_delta, citation_momentum);
|
||||
|
||||
if confidence >= 0.3 {
|
||||
frontiers.push(EmergingFrontier {
|
||||
id: format!("frontier_{}", frontier_counter),
|
||||
name: curr_node.name.clone(),
|
||||
related_topics: self.find_related_topics(topic_id, curr_graph),
|
||||
growth_rate: curr_node.growth_rate,
|
||||
coherence_delta,
|
||||
citation_momentum,
|
||||
boundary_topics,
|
||||
detected_at: *curr_ts,
|
||||
confidence,
|
||||
evidence,
|
||||
});
|
||||
frontier_counter += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by confidence
|
||||
frontiers.sort_by(|a, b| {
|
||||
b.confidence
|
||||
.partial_cmp(&a.confidence)
|
||||
.unwrap_or(std::cmp::Ordering::Equal)
|
||||
});
|
||||
|
||||
self.frontiers = frontiers.clone();
|
||||
frontiers
|
||||
}
|
||||
|
||||
/// Detect cross-domain bridges
|
||||
pub fn detect_bridges(&mut self) -> Vec<CrossDomainBridge> {
|
||||
if self.snapshots.is_empty() {
|
||||
return vec![];
|
||||
}
|
||||
|
||||
let mut bridges = Vec::new();
|
||||
let mut bridge_counter = 0;
|
||||
|
||||
let (curr_ts, curr_graph) = self.snapshots.last().unwrap();
|
||||
|
||||
// Build domain → topics mapping (simplified: use top-level grouping)
|
||||
let mut domain_topics: HashMap<String, Vec<String>> = HashMap::new();
|
||||
for (topic_id, node) in &curr_graph.topics {
|
||||
// Use first word as domain (simplified)
|
||||
let domain = node
|
||||
.name
|
||||
.split_whitespace()
|
||||
.next()
|
||||
.unwrap_or("Unknown")
|
||||
.to_string();
|
||||
domain_topics
|
||||
.entry(domain.clone())
|
||||
.or_default()
|
||||
.push(topic_id.clone());
|
||||
}
|
||||
|
||||
// Find cross-domain edges
|
||||
let mut domain_flows: HashMap<(String, String), Vec<&TopicEdge>> = HashMap::new();
|
||||
|
||||
for edge in &curr_graph.edges {
|
||||
let src_domain = self.get_domain(&edge.source, curr_graph);
|
||||
let tgt_domain = self.get_domain(&edge.target, curr_graph);
|
||||
|
||||
if src_domain != tgt_domain {
|
||||
domain_flows
|
||||
.entry((src_domain.clone(), tgt_domain.clone()))
|
||||
.or_default()
|
||||
.push(edge);
|
||||
}
|
||||
}
|
||||
|
||||
// Create bridge records
|
||||
for ((src_domain, tgt_domain), edges) in domain_flows {
|
||||
let total_flow: f64 = edges.iter().map(|e| e.weight).sum();
|
||||
let citation_count: usize = edges.iter().map(|e| e.citation_count).sum();
|
||||
|
||||
if citation_count >= 5 {
|
||||
// Minimum threshold
|
||||
let bridge_topics: Vec<String> = edges
|
||||
.iter()
|
||||
.flat_map(|e| vec![e.source.clone(), e.target.clone()])
|
||||
.collect::<std::collections::HashSet<_>>()
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
// Check if this is emerging (compare with previous snapshot)
|
||||
let is_emerging = if self.snapshots.len() >= 2 {
|
||||
let (_, prev_graph) = &self.snapshots[self.snapshots.len() - 2];
|
||||
let prev_flow: f64 = prev_graph
|
||||
.edges
|
||||
.iter()
|
||||
.filter(|e| {
|
||||
self.get_domain(&e.source, prev_graph) == src_domain
|
||||
&& self.get_domain(&e.target, prev_graph) == tgt_domain
|
||||
})
|
||||
.map(|e| e.weight)
|
||||
.sum();
|
||||
total_flow > prev_flow * 1.5 // 50% growth
|
||||
} else {
|
||||
true
|
||||
};
|
||||
|
||||
bridges.push(CrossDomainBridge {
|
||||
id: format!("bridge_{}", bridge_counter),
|
||||
source_domain: src_domain.clone(),
|
||||
target_domain: tgt_domain.clone(),
|
||||
bridge_topics,
|
||||
citation_flow: total_flow,
|
||||
reverse_flow: 0.0, // Would need to compute reverse direction
|
||||
strength: total_flow / citation_count as f64,
|
||||
is_emerging,
|
||||
first_observed: *curr_ts,
|
||||
key_works: vec![], // Would need work-level data
|
||||
});
|
||||
bridge_counter += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by strength
|
||||
bridges.sort_by(|a, b| {
|
||||
b.strength
|
||||
.partial_cmp(&a.strength)
|
||||
.unwrap_or(std::cmp::Ordering::Equal)
|
||||
});
|
||||
|
||||
self.bridges = bridges.clone();
|
||||
bridges
|
||||
}
|
||||
|
||||
/// Compute coherence delta for a topic between snapshots
|
||||
fn compute_topic_coherence_delta(
|
||||
&self,
|
||||
topic_id: &str,
|
||||
prev_graph: &TopicGraph,
|
||||
curr_graph: &TopicGraph,
|
||||
) -> f64 {
|
||||
// Compute local coherence as ratio of intra-topic to inter-topic edges
|
||||
let prev_coherence = self.compute_local_coherence(topic_id, prev_graph);
|
||||
let curr_coherence = self.compute_local_coherence(topic_id, curr_graph);
|
||||
|
||||
curr_coherence - prev_coherence
|
||||
}
|
||||
|
||||
/// Compute local coherence for a topic
|
||||
fn compute_local_coherence(&self, topic_id: &str, graph: &TopicGraph) -> f64 {
|
||||
// Find edges involving this topic
|
||||
let edges: Vec<_> = graph
|
||||
.edges
|
||||
.iter()
|
||||
.filter(|e| e.source == topic_id || e.target == topic_id)
|
||||
.collect();
|
||||
|
||||
if edges.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
// Coherence = sum of weights
|
||||
edges.iter().map(|e| e.weight).sum::<f64>() / edges.len() as f64
|
||||
}
|
||||
|
||||
/// Find topics at the boundary (connected to other clusters)
|
||||
fn find_boundary_topics(&self, topic_id: &str, graph: &TopicGraph) -> Vec<String> {
|
||||
// Find topics connected to this topic that have high connectivity elsewhere
|
||||
graph
|
||||
.edges
|
||||
.iter()
|
||||
.filter(|e| e.source == topic_id)
|
||||
.map(|e| e.target.clone())
|
||||
.take(5)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Find related topics
|
||||
fn find_related_topics(&self, topic_id: &str, graph: &TopicGraph) -> Vec<String> {
|
||||
graph
|
||||
.edges
|
||||
.iter()
|
||||
.filter(|e| e.source == topic_id || e.target == topic_id)
|
||||
.flat_map(|e| {
|
||||
if e.source == topic_id {
|
||||
vec![e.target.clone()]
|
||||
} else {
|
||||
vec![e.source.clone()]
|
||||
}
|
||||
})
|
||||
.take(10)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Get domain for a topic (simplified)
|
||||
fn get_domain(&self, topic_id: &str, graph: &TopicGraph) -> String {
|
||||
graph
|
||||
.topics
|
||||
.get(topic_id)
|
||||
.map(|n| {
|
||||
n.name
|
||||
.split_whitespace()
|
||||
.next()
|
||||
.unwrap_or("Unknown")
|
||||
.to_string()
|
||||
})
|
||||
.unwrap_or_else(|| "Unknown".to_string())
|
||||
}
|
||||
|
||||
/// Calculate confidence score
|
||||
fn calculate_confidence(
|
||||
&self,
|
||||
growth: f64,
|
||||
coherence_delta: f64,
|
||||
citation_momentum: f64,
|
||||
) -> f64 {
|
||||
let growth_score = (growth.min(5.0) / 5.0).max(0.0);
|
||||
let coherence_score = (coherence_delta.abs().min(1.0)).max(0.0);
|
||||
let citation_score = (citation_momentum / 10.0).min(1.0).max(0.0);
|
||||
|
||||
(growth_score * 0.4 + coherence_score * 0.4 + citation_score * 0.2).min(1.0)
|
||||
}
|
||||
|
||||
/// Get detected frontiers
|
||||
pub fn frontiers(&self) -> &[EmergingFrontier] {
|
||||
&self.frontiers
|
||||
}
|
||||
|
||||
/// Get detected bridges
|
||||
pub fn bridges(&self) -> &[CrossDomainBridge] {
|
||||
&self.bridges
|
||||
}
|
||||
|
||||
/// Get highest confidence frontiers
|
||||
pub fn top_frontiers(&self, n: usize) -> Vec<&EmergingFrontier> {
|
||||
self.frontiers.iter().take(n).collect()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_frontier_radar_creation() {
        // A freshly constructed radar has detected nothing yet.
        let radar = FrontierRadar::new(0.1, 0.2);
        assert_eq!(radar.frontiers().len(), 0);
        assert_eq!(radar.bridges().len(), 0);
    }

    #[test]
    fn test_confidence_calculation() {
        let radar = FrontierRadar::new(0.1, 0.2);

        // Strong growth + coherence shift + citations → high confidence.
        let strong = radar.calculate_confidence(2.0, 0.5, 5.0);
        assert!(strong > 0.5);

        // Weak signals across the board → below the 0.3 detection floor.
        let weak = radar.calculate_confidence(0.05, 0.01, 0.1);
        assert!(weak < 0.3);
    }
}
|
||||
Reference in New Issue
Block a user