//! Research frontier detection using coherence signals
|
|
|
|
use std::collections::HashMap;
|
|
|
|
use chrono::{DateTime, Utc};
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
use crate::{TopicEdge, TopicGraph, TopicNode, Work};
|
|
|
|
/// An emerging research frontier, detected from snapshot-to-snapshot
/// changes in topic growth, coherence, and citation momentum.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmergingFrontier {
    /// Frontier identifier (e.g. "frontier_0")
    pub id: String,

    /// Primary topic name
    pub name: String,

    /// Related topic names (neighbors of the primary topic in the graph)
    pub related_topics: Vec<String>,

    /// Growth rate (works per year)
    pub growth_rate: f64,

    /// Coherence delta (change in min-cut boundary)
    pub coherence_delta: f64,

    /// Citation momentum (trend in citation rates)
    pub citation_momentum: f64,

    /// Detected boundary nodes (topics at the frontier edge)
    pub boundary_topics: Vec<String>,

    /// First detected (timestamp of the snapshot that triggered detection)
    pub detected_at: DateTime<Utc>,

    /// Confidence score (0-1)
    pub confidence: f64,

    /// Evidence supporting this frontier
    pub evidence: Vec<FrontierEvidence>,
}
/// A single piece of evidence for a frontier detection
/// (e.g. "growth_rate", "coherence_delta", "citation_momentum").
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FrontierEvidence {
    /// Evidence type (machine-readable tag)
    pub evidence_type: String,

    /// Numeric value of the signal
    pub value: f64,

    /// Human-readable explanation of the value
    pub explanation: String,
}
/// A cross-domain bridge connecting two research areas via topic-graph
/// edges whose endpoints fall in different domains.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossDomainBridge {
    /// Bridge identifier (e.g. "bridge_0")
    pub id: String,

    /// Source domain/topic
    pub source_domain: String,

    /// Target domain/topic
    pub target_domain: String,

    /// Bridge topics (connector nodes on either side of the bridge)
    pub bridge_topics: Vec<String>,

    /// Citation flow (source → target)
    pub citation_flow: f64,

    /// Reverse flow (target → source); currently always 0.0 — the
    /// reverse direction is not yet computed by `detect_bridges`
    pub reverse_flow: f64,

    /// Bridge strength (combined normalized flow)
    pub strength: f64,

    /// Is this a new connection?
    pub is_emerging: bool,

    /// First observed (timestamp of the snapshot where detected)
    pub first_observed: DateTime<Utc>,

    /// Key papers establishing the bridge; currently always empty —
    /// populating it requires work-level data not available here
    pub key_works: Vec<String>,
}
/// Research frontier radar for detecting emerging fields from a time
/// series of topic-graph snapshots.
pub struct FrontierRadar {
    /// Topic graph snapshots over time, kept sorted by timestamp
    snapshots: Vec<(DateTime<Utc>, TopicGraph)>,

    /// Minimum relative growth rate for a topic to be considered
    min_growth_rate: f64,

    /// Minimum absolute coherence shift to detect
    min_coherence_shift: f64,

    /// Frontiers found by the most recent `detect_frontiers` call
    frontiers: Vec<EmergingFrontier>,

    /// Bridges found by the most recent `detect_bridges` call
    bridges: Vec<CrossDomainBridge>,
}
impl FrontierRadar {
|
|
/// Create a new frontier radar
|
|
pub fn new(min_growth_rate: f64, min_coherence_shift: f64) -> Self {
|
|
Self {
|
|
snapshots: Vec::new(),
|
|
min_growth_rate,
|
|
min_coherence_shift,
|
|
frontiers: Vec::new(),
|
|
bridges: Vec::new(),
|
|
}
|
|
}
|
|
|
|
/// Add a topic graph snapshot
|
|
pub fn add_snapshot(&mut self, timestamp: DateTime<Utc>, graph: TopicGraph) {
|
|
self.snapshots.push((timestamp, graph));
|
|
self.snapshots.sort_by_key(|(ts, _)| *ts);
|
|
}
|
|
|
|
/// Build snapshots from works partitioned by time
|
|
pub fn build_from_works(&mut self, works: &[Work], window_days: i64) {
|
|
if works.is_empty() {
|
|
return;
|
|
}
|
|
|
|
// Find time range
|
|
let mut min_date = Utc::now();
|
|
let mut max_date = DateTime::<Utc>::MIN_UTC;
|
|
|
|
for work in works {
|
|
if let Some(date) = work.publication_date {
|
|
if date < min_date {
|
|
min_date = date;
|
|
}
|
|
if date > max_date {
|
|
max_date = date;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Partition works into time windows
|
|
let window_duration = chrono::Duration::days(window_days);
|
|
let mut current_start = min_date;
|
|
|
|
while current_start < max_date {
|
|
let current_end = current_start + window_duration;
|
|
|
|
let window_works: Vec<_> = works
|
|
.iter()
|
|
.filter(|w| {
|
|
w.publication_date
|
|
.map(|d| d >= current_start && d < current_end)
|
|
.unwrap_or(false)
|
|
})
|
|
.cloned()
|
|
.collect();
|
|
|
|
if !window_works.is_empty() {
|
|
let graph = TopicGraph::from_works(&window_works);
|
|
self.add_snapshot(current_start, graph);
|
|
}
|
|
|
|
current_start = current_end;
|
|
}
|
|
}
|
|
|
|
/// Detect emerging frontiers from snapshots
|
|
pub fn detect_frontiers(&mut self) -> Vec<EmergingFrontier> {
|
|
if self.snapshots.len() < 2 {
|
|
return vec![];
|
|
}
|
|
|
|
let mut frontiers = Vec::new();
|
|
let mut frontier_counter = 0;
|
|
|
|
// Compare consecutive snapshots
|
|
for i in 1..self.snapshots.len() {
|
|
let (prev_ts, prev_graph) = &self.snapshots[i - 1];
|
|
let (curr_ts, curr_graph) = &self.snapshots[i];
|
|
|
|
// Find topics with significant growth
|
|
for (topic_id, curr_node) in &curr_graph.topics {
|
|
let prev_node = prev_graph.topics.get(topic_id);
|
|
|
|
let growth = if let Some(prev) = prev_node {
|
|
if prev.work_count > 0 {
|
|
(curr_node.work_count as f64 - prev.work_count as f64)
|
|
/ prev.work_count as f64
|
|
} else {
|
|
f64::INFINITY
|
|
}
|
|
} else {
|
|
// New topic
|
|
f64::INFINITY
|
|
};
|
|
|
|
if growth > self.min_growth_rate {
|
|
// Calculate coherence shift
|
|
let coherence_delta = self.compute_topic_coherence_delta(
|
|
topic_id,
|
|
prev_graph,
|
|
curr_graph,
|
|
);
|
|
|
|
if coherence_delta.abs() > self.min_coherence_shift {
|
|
// Calculate citation momentum
|
|
let citation_momentum = curr_node.avg_citations
|
|
- prev_node.map(|n| n.avg_citations).unwrap_or(0.0);
|
|
|
|
// Find boundary topics
|
|
let boundary_topics = self.find_boundary_topics(topic_id, curr_graph);
|
|
|
|
// Build evidence
|
|
let mut evidence = vec![
|
|
FrontierEvidence {
|
|
evidence_type: "growth_rate".to_string(),
|
|
value: growth,
|
|
explanation: format!(
|
|
"{:.0}% increase in works",
|
|
growth * 100.0
|
|
),
|
|
},
|
|
FrontierEvidence {
|
|
evidence_type: "coherence_delta".to_string(),
|
|
value: coherence_delta,
|
|
explanation: format!(
|
|
"Coherence {} by {:.2}",
|
|
if coherence_delta > 0.0 {
|
|
"increased"
|
|
} else {
|
|
"decreased"
|
|
},
|
|
coherence_delta.abs()
|
|
),
|
|
},
|
|
];
|
|
|
|
if citation_momentum > 0.0 {
|
|
evidence.push(FrontierEvidence {
|
|
evidence_type: "citation_momentum".to_string(),
|
|
value: citation_momentum,
|
|
explanation: format!(
|
|
"+{:.1} avg citations",
|
|
citation_momentum
|
|
),
|
|
});
|
|
}
|
|
|
|
// Calculate confidence based on evidence strength
|
|
let confidence = self.calculate_confidence(growth, coherence_delta, citation_momentum);
|
|
|
|
if confidence >= 0.3 {
|
|
frontiers.push(EmergingFrontier {
|
|
id: format!("frontier_{}", frontier_counter),
|
|
name: curr_node.name.clone(),
|
|
related_topics: self.find_related_topics(topic_id, curr_graph),
|
|
growth_rate: curr_node.growth_rate,
|
|
coherence_delta,
|
|
citation_momentum,
|
|
boundary_topics,
|
|
detected_at: *curr_ts,
|
|
confidence,
|
|
evidence,
|
|
});
|
|
frontier_counter += 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Sort by confidence
|
|
frontiers.sort_by(|a, b| {
|
|
b.confidence
|
|
.partial_cmp(&a.confidence)
|
|
.unwrap_or(std::cmp::Ordering::Equal)
|
|
});
|
|
|
|
self.frontiers = frontiers.clone();
|
|
frontiers
|
|
}
|
|
|
|
/// Detect cross-domain bridges
|
|
pub fn detect_bridges(&mut self) -> Vec<CrossDomainBridge> {
|
|
if self.snapshots.is_empty() {
|
|
return vec![];
|
|
}
|
|
|
|
let mut bridges = Vec::new();
|
|
let mut bridge_counter = 0;
|
|
|
|
let (curr_ts, curr_graph) = self.snapshots.last().unwrap();
|
|
|
|
// Build domain → topics mapping (simplified: use top-level grouping)
|
|
let mut domain_topics: HashMap<String, Vec<String>> = HashMap::new();
|
|
for (topic_id, node) in &curr_graph.topics {
|
|
// Use first word as domain (simplified)
|
|
let domain = node
|
|
.name
|
|
.split_whitespace()
|
|
.next()
|
|
.unwrap_or("Unknown")
|
|
.to_string();
|
|
domain_topics
|
|
.entry(domain.clone())
|
|
.or_default()
|
|
.push(topic_id.clone());
|
|
}
|
|
|
|
// Find cross-domain edges
|
|
let mut domain_flows: HashMap<(String, String), Vec<&TopicEdge>> = HashMap::new();
|
|
|
|
for edge in &curr_graph.edges {
|
|
let src_domain = self.get_domain(&edge.source, curr_graph);
|
|
let tgt_domain = self.get_domain(&edge.target, curr_graph);
|
|
|
|
if src_domain != tgt_domain {
|
|
domain_flows
|
|
.entry((src_domain.clone(), tgt_domain.clone()))
|
|
.or_default()
|
|
.push(edge);
|
|
}
|
|
}
|
|
|
|
// Create bridge records
|
|
for ((src_domain, tgt_domain), edges) in domain_flows {
|
|
let total_flow: f64 = edges.iter().map(|e| e.weight).sum();
|
|
let citation_count: usize = edges.iter().map(|e| e.citation_count).sum();
|
|
|
|
if citation_count >= 5 {
|
|
// Minimum threshold
|
|
let bridge_topics: Vec<String> = edges
|
|
.iter()
|
|
.flat_map(|e| vec![e.source.clone(), e.target.clone()])
|
|
.collect::<std::collections::HashSet<_>>()
|
|
.into_iter()
|
|
.collect();
|
|
|
|
// Check if this is emerging (compare with previous snapshot)
|
|
let is_emerging = if self.snapshots.len() >= 2 {
|
|
let (_, prev_graph) = &self.snapshots[self.snapshots.len() - 2];
|
|
let prev_flow: f64 = prev_graph
|
|
.edges
|
|
.iter()
|
|
.filter(|e| {
|
|
self.get_domain(&e.source, prev_graph) == src_domain
|
|
&& self.get_domain(&e.target, prev_graph) == tgt_domain
|
|
})
|
|
.map(|e| e.weight)
|
|
.sum();
|
|
total_flow > prev_flow * 1.5 // 50% growth
|
|
} else {
|
|
true
|
|
};
|
|
|
|
bridges.push(CrossDomainBridge {
|
|
id: format!("bridge_{}", bridge_counter),
|
|
source_domain: src_domain.clone(),
|
|
target_domain: tgt_domain.clone(),
|
|
bridge_topics,
|
|
citation_flow: total_flow,
|
|
reverse_flow: 0.0, // Would need to compute reverse direction
|
|
strength: total_flow / citation_count as f64,
|
|
is_emerging,
|
|
first_observed: *curr_ts,
|
|
key_works: vec![], // Would need work-level data
|
|
});
|
|
bridge_counter += 1;
|
|
}
|
|
}
|
|
|
|
// Sort by strength
|
|
bridges.sort_by(|a, b| {
|
|
b.strength
|
|
.partial_cmp(&a.strength)
|
|
.unwrap_or(std::cmp::Ordering::Equal)
|
|
});
|
|
|
|
self.bridges = bridges.clone();
|
|
bridges
|
|
}
|
|
|
|
/// Compute coherence delta for a topic between snapshots
|
|
fn compute_topic_coherence_delta(
|
|
&self,
|
|
topic_id: &str,
|
|
prev_graph: &TopicGraph,
|
|
curr_graph: &TopicGraph,
|
|
) -> f64 {
|
|
// Compute local coherence as ratio of intra-topic to inter-topic edges
|
|
let prev_coherence = self.compute_local_coherence(topic_id, prev_graph);
|
|
let curr_coherence = self.compute_local_coherence(topic_id, curr_graph);
|
|
|
|
curr_coherence - prev_coherence
|
|
}
|
|
|
|
/// Compute local coherence for a topic
|
|
fn compute_local_coherence(&self, topic_id: &str, graph: &TopicGraph) -> f64 {
|
|
// Find edges involving this topic
|
|
let edges: Vec<_> = graph
|
|
.edges
|
|
.iter()
|
|
.filter(|e| e.source == topic_id || e.target == topic_id)
|
|
.collect();
|
|
|
|
if edges.is_empty() {
|
|
return 0.0;
|
|
}
|
|
|
|
// Coherence = sum of weights
|
|
edges.iter().map(|e| e.weight).sum::<f64>() / edges.len() as f64
|
|
}
|
|
|
|
/// Find topics at the boundary (connected to other clusters)
|
|
fn find_boundary_topics(&self, topic_id: &str, graph: &TopicGraph) -> Vec<String> {
|
|
// Find topics connected to this topic that have high connectivity elsewhere
|
|
graph
|
|
.edges
|
|
.iter()
|
|
.filter(|e| e.source == topic_id)
|
|
.map(|e| e.target.clone())
|
|
.take(5)
|
|
.collect()
|
|
}
|
|
|
|
/// Find related topics
|
|
fn find_related_topics(&self, topic_id: &str, graph: &TopicGraph) -> Vec<String> {
|
|
graph
|
|
.edges
|
|
.iter()
|
|
.filter(|e| e.source == topic_id || e.target == topic_id)
|
|
.flat_map(|e| {
|
|
if e.source == topic_id {
|
|
vec![e.target.clone()]
|
|
} else {
|
|
vec![e.source.clone()]
|
|
}
|
|
})
|
|
.take(10)
|
|
.collect()
|
|
}
|
|
|
|
/// Get domain for a topic (simplified)
|
|
fn get_domain(&self, topic_id: &str, graph: &TopicGraph) -> String {
|
|
graph
|
|
.topics
|
|
.get(topic_id)
|
|
.map(|n| {
|
|
n.name
|
|
.split_whitespace()
|
|
.next()
|
|
.unwrap_or("Unknown")
|
|
.to_string()
|
|
})
|
|
.unwrap_or_else(|| "Unknown".to_string())
|
|
}
|
|
|
|
/// Calculate confidence score
|
|
fn calculate_confidence(
|
|
&self,
|
|
growth: f64,
|
|
coherence_delta: f64,
|
|
citation_momentum: f64,
|
|
) -> f64 {
|
|
let growth_score = (growth.min(5.0) / 5.0).max(0.0);
|
|
let coherence_score = (coherence_delta.abs().min(1.0)).max(0.0);
|
|
let citation_score = (citation_momentum / 10.0).min(1.0).max(0.0);
|
|
|
|
(growth_score * 0.4 + coherence_score * 0.4 + citation_score * 0.2).min(1.0)
|
|
}
|
|
|
|
/// Get detected frontiers
|
|
pub fn frontiers(&self) -> &[EmergingFrontier] {
|
|
&self.frontiers
|
|
}
|
|
|
|
/// Get detected bridges
|
|
pub fn bridges(&self) -> &[CrossDomainBridge] {
|
|
&self.bridges
|
|
}
|
|
|
|
/// Get highest confidence frontiers
|
|
pub fn top_frontiers(&self, n: usize) -> Vec<&EmergingFrontier> {
|
|
self.frontiers.iter().take(n).collect()
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed radar must hold no detections.
    #[test]
    fn test_frontier_radar_creation() {
        let radar = FrontierRadar::new(0.1, 0.2);
        assert_eq!(radar.frontiers().len(), 0);
        assert_eq!(radar.bridges().len(), 0);
    }

    /// Confidence should separate strong signals from weak ones.
    #[test]
    fn test_confidence_calculation() {
        let radar = FrontierRadar::new(0.1, 0.2);

        // Strong growth, coherence shift, and momentum clear 0.5.
        assert!(radar.calculate_confidence(2.0, 0.5, 5.0) > 0.5);

        // Near-zero signals on every axis stay below 0.3.
        assert!(radar.calculate_confidence(0.05, 0.01, 0.1) < 0.3);
    }
}