Files
wifi-densepose/vendor/ruvector/examples/data/openalex/src/frontier.rs

519 lines
17 KiB
Rust

//! Research frontier detection using coherence signals
use std::collections::HashMap;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use crate::{TopicEdge, TopicGraph, TopicNode, Work};
/// An emerging research frontier: a topic whose work count grew and whose
/// local coherence shifted between two consecutive snapshots.
///
/// Values of this type are produced by `FrontierRadar::detect_frontiers`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmergingFrontier {
/// Frontier identifier, formatted as `frontier_<n>` where `n` counts
/// frontiers in the order they were detected (before sorting by confidence).
pub id: String,
/// Primary topic name (copied from the topic node in the later snapshot)
pub name: String,
/// Related topic names
// NOTE(review): filled from edge endpoints of the topic graph; whether those
// are names or ids depends on `TopicEdge` — confirm.
pub related_topics: Vec<String>,
/// Growth rate (works per year)
// NOTE(review): copied from the topic node's own `growth_rate`, not from the
// snapshot-to-snapshot growth used for detection; the unit is not established
// in this file — confirm against `TopicNode`.
pub growth_rate: f64,
/// Coherence delta: local coherence in the later snapshot minus local
/// coherence in the earlier one, where local coherence is the mean weight of
/// the edges touching the topic (a proxy for a change in min-cut boundary).
pub coherence_delta: f64,
/// Citation momentum: average citations in the later snapshot minus the
/// average in the earlier one (0 is used when the topic did not exist before)
pub citation_momentum: f64,
/// Detected boundary nodes (topics at the frontier edge); at most five
/// targets of the topic's outgoing edges
pub boundary_topics: Vec<String>,
/// Timestamp of the snapshot in which the frontier was detected
pub detected_at: DateTime<Utc>,
/// Confidence score (0-1); frontiers scoring below 0.3 are not reported
pub confidence: f64,
/// Evidence supporting this frontier
pub evidence: Vec<FrontierEvidence>,
}
/// One piece of evidence backing a frontier detection: a named measurement
/// together with a human-readable explanation of it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FrontierEvidence {
/// Evidence type; the detector in this file emits `"growth_rate"`,
/// `"coherence_delta"` and `"citation_momentum"`
pub evidence_type: String,
/// Measured value for this evidence type
pub value: f64,
/// Human-readable explanation of the value
pub explanation: String,
}
/// A cross-domain bridge connecting two research areas.
///
/// A "domain" here is the first whitespace-separated word of a topic's name
/// (`"Unknown"` when the topic is missing or its name has no words). Bridges
/// are produced by `FrontierRadar::detect_bridges` from the latest snapshot.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossDomainBridge {
/// Bridge identifier, formatted as `bridge_<n>`
pub id: String,
/// Source domain/topic
pub source_domain: String,
/// Target domain/topic
pub target_domain: String,
/// Bridge topics (connector nodes): the de-duplicated endpoints of every
/// edge that runs from the source domain to the target domain
pub bridge_topics: Vec<String>,
/// Citation flow (source → target): sum of the weights of those edges
pub citation_flow: f64,
/// Reverse flow (target → source)
pub reverse_flow: f64,
/// Bridge strength: `citation_flow` divided by the summed citation count of
/// the contributing edges
pub strength: f64,
/// Is this a new connection? True when the flow exceeds 1.5× the flow for
/// the same domain pair in the previous snapshot, or when there is no
/// previous snapshot to compare against
pub is_emerging: bool,
/// Timestamp of the snapshot the bridge was detected in
pub first_observed: DateTime<Utc>,
/// Key papers establishing the bridge (left empty by the detector in this
/// file, which has no work-level data)
pub key_works: Vec<String>,
}
/// Research frontier radar for detecting emerging fields.
///
/// Holds a chronological series of topic-graph snapshots plus the results of
/// the most recent `detect_frontiers` / `detect_bridges` runs.
pub struct FrontierRadar {
/// Topic graph snapshots over time, kept sorted by timestamp
/// (`add_snapshot` re-sorts after every insertion)
snapshots: Vec<(DateTime<Utc>, TopicGraph)>,
/// Minimum growth rate to consider: a topic's relative work-count growth
/// between consecutive snapshots must strictly exceed this
min_growth_rate: f64,
/// Minimum coherence shift to detect: the absolute coherence delta must
/// strictly exceed this
min_coherence_shift: f64,
/// Detected frontiers from the last `detect_frontiers` call, sorted by
/// descending confidence
frontiers: Vec<EmergingFrontier>,
/// Detected bridges from the last `detect_bridges` call, sorted by
/// descending strength
bridges: Vec<CrossDomainBridge>,
}
impl FrontierRadar {
/// Construct a radar that holds no snapshots and no detections yet.
///
/// `min_growth_rate` is the relative work-count growth a topic must exceed and
/// `min_coherence_shift` the absolute coherence change it must exceed before
/// `detect_frontiers` considers it.
pub fn new(min_growth_rate: f64, min_coherence_shift: f64) -> Self {
    let snapshots = Vec::new();
    let frontiers = Vec::new();
    let bridges = Vec::new();
    Self {
        min_growth_rate,
        min_coherence_shift,
        snapshots,
        frontiers,
        bridges,
    }
}
/// Record the topic graph observed at `timestamp`.
///
/// Snapshots are re-sorted after each insertion, so they stay in
/// chronological order no matter in which order they are added.
pub fn add_snapshot(&mut self, timestamp: DateTime<Utc>, graph: TopicGraph) {
    let entry = (timestamp, graph);
    self.snapshots.push(entry);
    // Stable sort: snapshots sharing a timestamp keep their insertion order.
    self.snapshots.sort_by(|left, right| left.0.cmp(&right.0));
}
/// Build snapshots from works partitioned into consecutive time windows.
///
/// Windows are `window_days` long, half-open (`[start, end)`), and begin at the
/// earliest publication date found in `works`. Each non-empty window becomes
/// one snapshot, stamped with the window's start time. Works without a
/// publication date are ignored.
///
/// Does nothing when `works` contains no dated work or when `window_days` is
/// not positive (a zero or negative window could never advance through the
/// date range).
pub fn build_from_works(&mut self, works: &[Work], window_days: i64) {
    if window_days <= 0 {
        return;
    }

    // Date range over the works that actually carry a publication date.
    let mut dates = works.iter().filter_map(|w| w.publication_date);
    let first = match dates.next() {
        Some(date) => date,
        None => return,
    };
    let (min_date, max_date) =
        dates.fold((first, first), |(lo, hi), date| (lo.min(date), hi.max(date)));

    let window_duration = chrono::Duration::days(window_days);
    let mut current_start = min_date;
    // `<=` rather than `<`: a work dated exactly at the start of the final
    // window (including the case where every work shares a single date) must
    // still land in a window.
    while current_start <= max_date {
        let current_end = current_start + window_duration;
        let window_works: Vec<Work> = works
            .iter()
            .filter(|w| {
                w.publication_date
                    .map_or(false, |d| d >= current_start && d < current_end)
            })
            .cloned()
            .collect();
        if !window_works.is_empty() {
            let graph = TopicGraph::from_works(&window_works);
            self.add_snapshot(current_start, graph);
        }
        current_start = current_end;
    }
}
/// Detect emerging frontiers by comparing every snapshot with its predecessor.
///
/// A topic present in the later snapshot is reported when all of these hold:
/// - its relative work-count growth strictly exceeds `min_growth_rate`
///   (works appearing where there were none count as infinite growth; zero
///   works before and after count as no growth);
/// - the absolute change in its local coherence strictly exceeds
///   `min_coherence_shift`;
/// - the resulting confidence score is at least 0.3.
///
/// The frontiers are sorted by descending confidence, stored on `self` and
/// returned. With fewer than two snapshots an empty list is returned and the
/// stored frontiers are left untouched.
pub fn detect_frontiers(&mut self) -> Vec<EmergingFrontier> {
    if self.snapshots.len() < 2 {
        return vec![];
    }
    let mut frontiers = Vec::new();
    // Walk consecutive (previous, current) snapshot pairs.
    for pair in self.snapshots.windows(2) {
        let (_, prev_graph) = &pair[0];
        let (curr_ts, curr_graph) = &pair[1];
        for (topic_id, curr_node) in &curr_graph.topics {
            let prev_node = prev_graph.topics.get(topic_id);
            let growth = match prev_node {
                Some(prev) if prev.work_count > 0 => {
                    (curr_node.work_count as f64 - prev.work_count as f64)
                        / prev.work_count as f64
                }
                // New topic, or a topic that had no works before.
                _ if curr_node.work_count > 0 => f64::INFINITY,
                // No works before and none now: nothing grew.
                _ => 0.0,
            };
            if growth > self.min_growth_rate {
                let coherence_delta =
                    self.compute_topic_coherence_delta(topic_id, prev_graph, curr_graph);
                if coherence_delta.abs() > self.min_coherence_shift {
                    // A topic absent from the previous snapshot starts from 0.
                    let citation_momentum = curr_node.avg_citations
                        - prev_node.map_or(0.0, |n| n.avg_citations);
                    let boundary_topics = self.find_boundary_topics(topic_id, curr_graph);

                    let growth_explanation = if growth.is_finite() {
                        format!("{:.0}% increase in works", growth * 100.0)
                    } else {
                        // Avoid rendering "inf%" for topics with no baseline.
                        "No works in the previous snapshot".to_string()
                    };
                    let coherence_direction = if coherence_delta > 0.0 {
                        "increased"
                    } else {
                        "decreased"
                    };
                    let mut evidence = vec![
                        FrontierEvidence {
                            evidence_type: "growth_rate".to_string(),
                            value: growth,
                            explanation: growth_explanation,
                        },
                        FrontierEvidence {
                            evidence_type: "coherence_delta".to_string(),
                            value: coherence_delta,
                            explanation: format!(
                                "Coherence {} by {:.2}",
                                coherence_direction,
                                coherence_delta.abs()
                            ),
                        },
                    ];
                    // Citation momentum only counts as evidence when positive.
                    if citation_momentum > 0.0 {
                        evidence.push(FrontierEvidence {
                            evidence_type: "citation_momentum".to_string(),
                            value: citation_momentum,
                            explanation: format!("+{:.1} avg citations", citation_momentum),
                        });
                    }

                    let confidence =
                        self.calculate_confidence(growth, coherence_delta, citation_momentum);
                    if confidence >= 0.3 {
                        frontiers.push(EmergingFrontier {
                            // Ids follow detection order, not the final sort order.
                            id: format!("frontier_{}", frontiers.len()),
                            name: curr_node.name.clone(),
                            related_topics: self.find_related_topics(topic_id, curr_graph),
                            // NOTE(review): this is the node's own growth rate, not the
                            // snapshot-to-snapshot `growth` computed above — confirm intent.
                            growth_rate: curr_node.growth_rate,
                            coherence_delta,
                            citation_momentum,
                            boundary_topics,
                            detected_at: *curr_ts,
                            confidence,
                            evidence,
                        });
                    }
                }
            }
        }
    }
    // Highest confidence first; incomparable (NaN) scores are treated as equal.
    frontiers.sort_by(|a, b| {
        b.confidence
            .partial_cmp(&a.confidence)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    self.frontiers = frontiers.clone();
    frontiers
}
/// Detect cross-domain bridges in the most recent snapshot.
///
/// Edges whose endpoints fall in different domains (see `get_domain`) are
/// grouped by `(source domain, target domain)`. A group becomes a bridge when
/// its summed citation count is at least 5. For each bridge:
/// - `citation_flow` is the summed edge weight of the group;
/// - `reverse_flow` is the summed edge weight of the opposite-direction group
///   in the same snapshot (0.0 when there is none);
/// - `strength` is `citation_flow` divided by the summed citation count;
/// - `is_emerging` is true when the flow exceeds 1.5× the flow of the same
///   domain pair in the previous snapshot, or when no previous snapshot exists.
///
/// Domain pairs are processed in sorted order so bridge ids are reproducible.
/// The bridges are sorted by descending strength, stored on `self` and
/// returned. With no snapshots an empty list is returned and the stored
/// bridges are left untouched.
pub fn detect_bridges(&mut self) -> Vec<CrossDomainBridge> {
    let (curr_ts, curr_graph) = match self.snapshots.last() {
        Some((ts, graph)) => (*ts, graph),
        None => return vec![],
    };

    // Group cross-domain edges by (source domain, target domain).
    let mut domain_flows: HashMap<(String, String), Vec<&TopicEdge>> = HashMap::new();
    for edge in &curr_graph.edges {
        let src_domain = self.get_domain(&edge.source, curr_graph);
        let tgt_domain = self.get_domain(&edge.target, curr_graph);
        if src_domain != tgt_domain {
            domain_flows
                .entry((src_domain, tgt_domain))
                .or_default()
                .push(edge);
        }
    }

    // Total weight per direction, used to look up the reverse flow of a pair.
    let flow_totals: HashMap<(String, String), f64> = domain_flows
        .iter()
        .map(|(pair, edges)| (pair.clone(), edges.iter().map(|e| e.weight).sum::<f64>()))
        .collect();

    // Per-pair flow of the previous snapshot, computed once rather than
    // rescanning that snapshot for every bridge.
    let prev_flows: Option<HashMap<(String, String), f64>> = if self.snapshots.len() >= 2 {
        let (_, prev_graph) = &self.snapshots[self.snapshots.len() - 2];
        let mut flows = HashMap::new();
        for edge in &prev_graph.edges {
            let pair = (
                self.get_domain(&edge.source, prev_graph),
                self.get_domain(&edge.target, prev_graph),
            );
            *flows.entry(pair).or_insert(0.0) += edge.weight;
        }
        Some(flows)
    } else {
        None
    };

    // Hash iteration order is arbitrary; sort the pairs so ids are stable.
    let mut grouped: Vec<_> = domain_flows.into_iter().collect();
    grouped.sort_by(|a, b| a.0.cmp(&b.0));

    let mut bridges = Vec::new();
    for (pair, edges) in grouped {
        let citation_count: usize = edges.iter().map(|e| e.citation_count).sum();
        // Minimum threshold
        if citation_count < 5 {
            continue;
        }
        let total_flow: f64 = edges.iter().map(|e| e.weight).sum();
        let reverse_flow = flow_totals
            .get(&(pair.1.clone(), pair.0.clone()))
            .copied()
            .unwrap_or(0.0);
        // De-duplicated endpoints, in sorted order.
        let bridge_topics: Vec<String> = edges
            .iter()
            .flat_map(|e| vec![e.source.clone(), e.target.clone()])
            .collect::<std::collections::BTreeSet<_>>()
            .into_iter()
            .collect();
        let is_emerging = match &prev_flows {
            Some(flows) => {
                let prev_flow = flows.get(&pair).copied().unwrap_or(0.0);
                total_flow > prev_flow * 1.5 // 50% growth
            }
            None => true,
        };
        let (source_domain, target_domain) = pair;
        bridges.push(CrossDomainBridge {
            id: format!("bridge_{}", bridges.len()),
            source_domain,
            target_domain,
            bridge_topics,
            citation_flow: total_flow,
            reverse_flow,
            strength: total_flow / citation_count as f64,
            is_emerging,
            first_observed: curr_ts,
            key_works: vec![], // Would need work-level data
        });
    }

    // Strongest first; incomparable (NaN) strengths are treated as equal.
    bridges.sort_by(|a, b| {
        b.strength
            .partial_cmp(&a.strength)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    self.bridges = bridges.clone();
    bridges
}
/// Change in a topic's local coherence from `prev_graph` to `curr_graph`.
///
/// Positive when the topic's local coherence (mean weight of the edges touching
/// it, see `compute_local_coherence`) is higher in `curr_graph`.
fn compute_topic_coherence_delta(
    &self,
    topic_id: &str,
    prev_graph: &TopicGraph,
    curr_graph: &TopicGraph,
) -> f64 {
    let before = self.compute_local_coherence(topic_id, prev_graph);
    let after = self.compute_local_coherence(topic_id, curr_graph);
    after - before
}
/// Local coherence of a topic: the mean weight of every edge that has the
/// topic as its source or target, or 0.0 when no edge touches it.
fn compute_local_coherence(&self, topic_id: &str, graph: &TopicGraph) -> f64 {
    // Weights of the edges incident to this topic, in graph order.
    let weights: Vec<f64> = graph
        .edges
        .iter()
        .filter(|e| e.source == topic_id || e.target == topic_id)
        .map(|e| e.weight)
        .collect();
    match weights.len() {
        0 => 0.0,
        count => weights.iter().sum::<f64>() / count as f64,
    }
}
/// Find topics at the boundary of `topic_id`.
///
/// Returns the targets of the first five edges (in graph order) whose source is
/// `topic_id`. Targets are not ranked by their own connectivity and are not
/// de-duplicated.
fn find_boundary_topics(&self, topic_id: &str, graph: &TopicGraph) -> Vec<String> {
    let mut boundary = Vec::new();
    for edge in graph.edges.iter() {
        if boundary.len() == 5 {
            break;
        }
        if edge.source == topic_id {
            boundary.push(edge.target.clone());
        }
    }
    boundary
}
/// Find topics related to `topic_id`.
///
/// For each of the first ten edges (in graph order) that touch `topic_id`,
/// yields the endpoint on the other side. Results are not de-duplicated.
fn find_related_topics(&self, topic_id: &str, graph: &TopicGraph) -> Vec<String> {
    graph
        .edges
        .iter()
        .filter_map(|edge| {
            if edge.source == topic_id {
                Some(edge.target.clone())
            } else if edge.target == topic_id {
                Some(edge.source.clone())
            } else {
                None
            }
        })
        .take(10)
        .collect()
}
/// Domain of a topic (simplified): the first whitespace-separated word of the
/// topic's name.
///
/// Falls back to `"Unknown"` when the topic is not in `graph` or its name
/// contains no words.
fn get_domain(&self, topic_id: &str, graph: &TopicGraph) -> String {
    let first_word = match graph.topics.get(topic_id) {
        Some(node) => node.name.split_whitespace().next().unwrap_or("Unknown"),
        None => "Unknown",
    };
    first_word.to_string()
}
/// Combine the three signals into a confidence score in `[0, 1]`.
///
/// Each signal is first mapped into `[0, 1]`: growth is capped at 5 and divided
/// by 5, the coherence delta contributes its absolute value capped at 1, and
/// citation momentum is divided by 10 and capped at 1. Negative values
/// contribute 0. The result is the weighted sum 0.4 / 0.4 / 0.2.
fn calculate_confidence(
    &self,
    growth: f64,
    coherence_delta: f64,
    citation_momentum: f64,
) -> f64 {
    const GROWTH_CAP: f64 = 5.0;
    const CITATION_SCALE: f64 = 10.0;

    let citation_score = (citation_momentum / CITATION_SCALE).min(1.0).max(0.0);
    let coherence_score = coherence_delta.abs().min(1.0).max(0.0);
    let growth_score = (growth.min(GROWTH_CAP) / GROWTH_CAP).max(0.0);

    let weighted = growth_score * 0.4 + coherence_score * 0.4 + citation_score * 0.2;
    weighted.min(1.0)
}
/// Frontiers stored by the most recent `detect_frontiers` call (empty until
/// that has run).
pub fn frontiers(&self) -> &[EmergingFrontier] {
    self.frontiers.as_slice()
}
/// Bridges stored by the most recent `detect_bridges` call (empty until that
/// has run).
pub fn bridges(&self) -> &[CrossDomainBridge] {
    self.bridges.as_slice()
}
/// The first `n` stored frontiers (fewer if not that many exist).
///
/// `detect_frontiers` stores its results sorted by descending confidence, so
/// these are the highest-confidence frontiers.
pub fn top_frontiers(&self, n: usize) -> Vec<&EmergingFrontier> {
    let limit = n.min(self.frontiers.len());
    self.frontiers[..limit].iter().collect()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed radar holds no detections.
    #[test]
    fn test_frontier_radar_creation() {
        let radar = FrontierRadar::new(0.1, 0.2);
        assert!(radar.frontiers().is_empty());
        assert!(radar.bridges().is_empty());
    }

    /// Strong signals score high, weak signals score low.
    #[test]
    fn test_confidence_calculation() {
        let radar = FrontierRadar::new(0.1, 0.2);
        // High confidence: 0.4 * (4/5) + 0.4 * 0.8 + 0.2 * (5/10) = 0.74.
        // (The earlier inputs 2.0 / 0.5 / 5.0 only reach 0.46 and could never
        // satisfy the > 0.5 expectation.)
        let high = radar.calculate_confidence(4.0, 0.8, 5.0);
        assert!(high > 0.5);
        // Low confidence: 0.4 * 0.01 + 0.4 * 0.01 + 0.2 * 0.01 = 0.01.
        let low = radar.calculate_confidence(0.05, 0.01, 0.1);
        assert!(low < 0.3);
        assert!(high > low);
    }

    /// Out-of-range signals are clamped so the score stays within [0, 1].
    #[test]
    fn test_confidence_is_clamped() {
        let radar = FrontierRadar::new(0.1, 0.2);
        let saturated = radar.calculate_confidence(f64::INFINITY, 10.0, 100.0);
        assert!(saturated > 0.99 && saturated <= 1.0);
        let floor = radar.calculate_confidence(-1.0, 0.0, -5.0);
        assert_eq!(floor, 0.0);
    }
}