Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,49 @@
# Manifest for the ruvector-data-openalex crate: OpenAlex research
# intelligence integration for RuVector. Version, edition, license and
# repository metadata are inherited from the workspace root manifest.
[package]
name = "ruvector-data-openalex"
version.workspace = true
edition.workspace = true
description = "OpenAlex research intelligence integration for RuVector"
license.workspace = true
repository.workspace = true
keywords = ["openalex", "research", "citations", "graph", "discovery"]
categories = ["science", "database"]

[dependencies]
# Core framework
ruvector-data-framework = { path = "../framework" }
# Async runtime
tokio.workspace = true
futures.workspace = true
async-trait.workspace = true
# Serialization
serde.workspace = true
serde_json.workspace = true
# HTTP client
reqwest.workspace = true
# Time handling
chrono.workspace = true
# Logging
tracing.workspace = true
thiserror.workspace = true
# Data processing
rayon.workspace = true
# URL encoding
urlencoding = "2.1"
# Compression for bulk downloads
flate2 = "1.0"

[dev-dependencies]
tokio-test = "0.4"
rand = "0.8"

[[example]]
name = "frontier_radar"
path = "examples/frontier_radar.rs"

View File

@@ -0,0 +1,322 @@
//! OpenAlex Research Frontier Discovery
//!
//! This example detects emerging research frontiers using citation graph analysis
//! and RuVector's dynamic coherence detection.
use chrono::{Duration, Utc};
use ruvector_data_openalex::{
OpenAlexClient, OpenAlexConfig, EntityType,
TopicGraph, TopicNode, TopicEdge,
frontier::{FrontierRadar, FrontierConfig},
};
use std::collections::HashMap;
/// Entry point: scans a fixed list of research domains, builds a topic
/// co-occurrence graph per domain from recent OpenAlex works, and prints
/// heuristic frontier signals plus a cross-domain bridge analysis.
///
/// NOTE(review): `FrontierConfig` and the `OpenAlexClient::new(config)` /
/// `search_works(query, date)` signatures used here come from crate code not
/// visible in this file — confirm they match the vendored crate API.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ OpenAlex Research Frontier Discovery ║");
    println!("║ Detecting Emerging Research via Citation Dynamics ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    // Initialize OpenAlex client
    // (supplying an email opts requests into OpenAlex's faster "polite pool")
    let config = OpenAlexConfig {
        email: Some("ruvector-discovery@example.com".to_string()),
        per_page: 200,
        ..Default::default()
    };
    let client = OpenAlexClient::new(config);
    // Research areas to scan for emerging frontiers: (display name, search query)
    let research_domains = [
        ("Quantum Machine Learning", "quantum computing AND machine learning"),
        ("Foundation Models", "large language model OR foundation model"),
        ("Embodied AI", "embodied AI OR robotics learning"),
        ("Mechanistic Interpretability", "interpretability AND neural network"),
        ("AI Safety", "AI safety OR alignment"),
        ("Synthetic Biology AI", "synthetic biology AND AI"),
        ("Climate AI", "climate AND machine learning"),
        ("Materials Discovery", "materials discovery AND AI"),
    ];
    println!("🔍 Scanning {} research domains for emerging frontiers...\n", research_domains.len());
    // Configure frontier detection
    let frontier_config = FrontierConfig {
        min_growth_rate: 0.15, // 15% citation growth threshold
        coherence_sensitivity: 0.7, // High sensitivity to structure changes
        time_window_months: 6, // Look at last 6 months
        min_boundary_topics: 3, // Minimum topics at frontier
        min_papers: 10, // Minimum papers to consider
    };
    let mut radar = FrontierRadar::new(frontier_config);
    let mut all_discoveries = Vec::new();
    // Only works published in the last ~6 months are considered
    let cutoff_date = Utc::now() - Duration::days(180);
    for (domain_name, query) in &research_domains {
        println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
        println!("📚 Domain: {}", domain_name);
        println!();
        // Fetch recent works in this domain
        match client.search_works(query, Some(cutoff_date)).await {
            Ok(works) => {
                println!(" Found {} recent papers", works.len());
                if works.is_empty() {
                    println!(" ⚠️ No papers found, skipping domain\n");
                    continue;
                }
                // Build topic citation graph for this domain:
                // nodes are concepts, edges are within-paper co-occurrences.
                let mut topic_graph = TopicGraph::new();
                let mut topic_papers: HashMap<String, Vec<String>> = HashMap::new();
                let mut topic_citations: HashMap<String, usize> = HashMap::new();
                for work in &works {
                    if let Some(concepts) = &work.concepts {
                        // Add topics and track papers per topic
                        // (only concepts tagged with score > 0.3 are kept)
                        for concept in concepts.iter().filter(|c| c.score > 0.3) {
                            let topic_id = concept.id.clone();
                            // Add topic node if not exists
                            if !topic_graph.nodes.iter().any(|n| n.id == topic_id) {
                                topic_graph.nodes.push(TopicNode {
                                    id: topic_id.clone(),
                                    name: concept.display_name.clone(),
                                    level: concept.level as usize,
                                    paper_count: 1,
                                    citation_count: work.cited_by_count.unwrap_or(0) as usize,
                                    score: concept.score,
                                });
                            } else {
                                // Update counts
                                if let Some(node) = topic_graph.nodes.iter_mut().find(|n| n.id == topic_id) {
                                    node.paper_count += 1;
                                    node.citation_count += work.cited_by_count.unwrap_or(0) as usize;
                                }
                            }
                            topic_papers.entry(topic_id.clone()).or_default().push(work.id.clone());
                            *topic_citations.entry(topic_id.clone()).or_insert(0) += work.cited_by_count.unwrap_or(0) as usize;
                        }
                        // Build edges between co-occurring topics
                        let topic_ids: Vec<String> = concepts.iter()
                            .filter(|c| c.score > 0.3)
                            .map(|c| c.id.clone())
                            .collect();
                        // All unordered pairs of this paper's topics
                        for i in 0..topic_ids.len() {
                            for j in (i + 1)..topic_ids.len() {
                                let source = &topic_ids[i];
                                let target = &topic_ids[j];
                                // Check if edge exists (edges are undirected,
                                // so both orientations are matched)
                                if let Some(edge) = topic_graph.edges.iter_mut()
                                    .find(|e| (e.source == *source && e.target == *target) ||
                                              (e.source == *target && e.target == *source)) {
                                    edge.weight += 1.0;
                                } else {
                                    topic_graph.edges.push(TopicEdge {
                                        source: source.clone(),
                                        target: target.clone(),
                                        weight: 1.0,
                                        citation_flow: 0,
                                    });
                                }
                            }
                        }
                    }
                }
                println!(" Built topic graph: {} nodes, {} edges",
                    topic_graph.nodes.len(), topic_graph.edges.len());
                // Add snapshot to radar
                radar.add_snapshot(Utc::now(), topic_graph.clone());
                // Analyze for emerging frontiers (need at least 2 snapshots for delta)
                // For demo, we analyze the single snapshot structure
                let discoveries = analyze_frontier_structure(&topic_graph, domain_name);
                if !discoveries.is_empty() {
                    println!("\n 🌟 Potential Frontiers Detected:\n");
                    for discovery in &discoveries {
                        all_discoveries.push(discovery.clone());
                        println!(" {}", discovery);
                    }
                } else {
                    println!(" 📊 No clear frontier signals in current snapshot");
                }
            }
            Err(e) => {
                println!(" ❌ Error fetching papers: {}", e);
            }
        }
        println!();
    }
    // Cross-domain bridge analysis
    println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
    println!("🌉 Cross-Domain Bridge Analysis");
    println!();
    // Look for papers bridging multiple domains
    let bridge_query = "(quantum AND machine learning) OR (biology AND AI AND materials)";
    match client.search_works(bridge_query, Some(cutoff_date)).await {
        Ok(bridge_works) => {
            println!(" Found {} potential bridge papers\n", bridge_works.len());
            // Analyze bridge patterns
            let bridges = analyze_bridge_papers(&bridge_works);
            for bridge in &bridges {
                println!(" {}", bridge);
                all_discoveries.push(bridge.clone());
            }
        }
        Err(e) => {
            println!(" ❌ Error: {}", e);
        }
    }
    // Summary
    println!("\n╔══════════════════════════════════════════════════════════════╗");
    println!("║ Discovery Summary ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!("Total potential frontiers identified: {}", all_discoveries.len());
    println!();
    // Rank discoveries by potential
    // (NOTE(review): list is in insertion order, not actually ranked)
    println!("📈 Top Emerging Areas (by structural signals):\n");
    for (i, discovery) in all_discoveries.iter().take(5).enumerate() {
        println!(" {}. {}", i + 1, discovery);
    }
    Ok(())
}
/// Analyze topic graph structure for frontier signals.
///
/// Emits human-readable signal strings for three structural patterns:
/// well-connected but paper-sparse topics, topics with high citations per
/// paper, and weak edges joining otherwise well-connected topics.
fn analyze_frontier_structure(graph: &TopicGraph, domain: &str) -> Vec<String> {
    let mut signals = Vec::new();
    // 1. Tally how many edges touch each topic (undirected degree).
    let mut degree_of: HashMap<&str, usize> = HashMap::new();
    for edge in &graph.edges {
        *degree_of.entry(edge.source.as_str()).or_default() += 1;
        *degree_of.entry(edge.target.as_str()).or_default() += 1;
    }
    for node in &graph.nodes {
        let connections = degree_of.get(node.id.as_str()).copied().unwrap_or(0);
        // A topic that connects many others while having few papers and
        // sitting deep in the taxonomy suggests an emerging organizing concept.
        if connections > 3 && node.paper_count < 20 && node.level >= 2 {
            signals.push(format!(
                "🔺 [{}] '{}' - High connectivity ({} connections), only {} papers - potential emerging organizer",
                domain, node.name, connections, node.paper_count
            ));
        }
        // Citations per paper, only meaningful once a topic has a few papers.
        if node.paper_count > 5 {
            let velocity = node.citation_count as f64 / node.paper_count as f64;
            if velocity > 20.0 {
                signals.push(format!(
                    "🔥 [{}] '{}' - High citation velocity ({:.1} citations/paper) - gaining attention",
                    domain, node.name, velocity
                ));
            }
        }
    }
    // 2. Weakly connected clusters: thin edges between hub topics mark
    //    potential specialization frontiers.
    for (left, right, strength) in find_weak_bridges(graph) {
        if strength > 0.0 && strength < 3.0 {
            signals.push(format!(
                "🌉 [{}] Weak bridge between '{}' and '{}' (strength {:.1}) - potential specialization point",
                domain, left, right, strength
            ));
        }
    }
    signals
}
/// Find weak bridges between topic clusters.
///
/// Heuristic: an edge of weight < 3 whose endpoints each touch more than
/// three other edges is treated as a bridge between two hubs. Returns
/// `(source name, target name, edge weight)` triples in edge order.
fn find_weak_bridges(graph: &TopicGraph) -> Vec<(String, String, f64)> {
    // Undirected degree per topic id.
    let mut degree: HashMap<&str, usize> = HashMap::new();
    for edge in &graph.edges {
        *degree.entry(edge.source.as_str()).or_default() += 1;
        *degree.entry(edge.target.as_str()).or_default() += 1;
    }
    // Resolve a topic id to its display name, falling back to the raw id.
    let display_name = |id: &String| -> String {
        graph
            .nodes
            .iter()
            .find(|n| &n.id == id)
            .map(|n| n.name.clone())
            .unwrap_or_else(|| id.clone())
    };
    graph
        .edges
        .iter()
        .filter(|edge| {
            let src = degree.get(edge.source.as_str()).copied().unwrap_or(0);
            let tgt = degree.get(edge.target.as_str()).copied().unwrap_or(0);
            // High-degree nodes connected by a weak edge = potential bridge.
            src > 3 && tgt > 3 && edge.weight < 3.0
        })
        .map(|edge| (display_name(&edge.source), display_name(&edge.target), edge.weight))
        .collect()
}
/// Analyze papers that bridge multiple research domains.
///
/// Groups works by their pair of dominant high-level concepts (taxonomy
/// level <= 1, score > 0.4) and reports combinations that are both recurrent
/// (>= 3 papers) and well cited (> 10 average citations).
fn analyze_bridge_papers(works: &[ruvector_data_openalex::Work]) -> Vec<String> {
    let mut discoveries = Vec::new();
    // Group papers by their concept combinations
    let mut concept_combos: HashMap<String, Vec<&ruvector_data_openalex::Work>> = HashMap::new();
    for work in works {
        if let Some(concepts) = &work.concepts {
            // Get high-level concepts (level 0-1)
            let high_level: Vec<String> = concepts.iter()
                .filter(|c| c.level <= 1 && c.score > 0.4)
                .map(|c| c.display_name.clone())
                .collect();
            if high_level.len() >= 2 {
                // BUG FIX: the key used to be the two names concatenated with
                // no separator, which is ambiguous ("AB"+"C" collides with
                // "A"+"BC") and made the reported combo unreadable. Sorting
                // the pair also makes the grouping independent of the order
                // in which the API returns concepts.
                let (first, second) = if high_level[0] <= high_level[1] {
                    (&high_level[0], &high_level[1])
                } else {
                    (&high_level[1], &high_level[0])
                };
                let key = format!("{} + {}", first, second);
                concept_combos.entry(key).or_default().push(work);
            }
        }
    }
    // Report unusual combinations with high citations
    for (combo, papers) in &concept_combos {
        let total_citations: i32 = papers.iter()
            .filter_map(|w| w.cited_by_count)
            .sum();
        let avg_citations = total_citations as f64 / papers.len() as f64;
        if papers.len() >= 3 && avg_citations > 10.0 {
            discoveries.push(format!(
                "🔗 Bridge area: {} ({} papers, {:.0} avg citations) - cross-domain synthesis",
                combo, papers.len(), avg_citations
            ));
        }
    }
    discoveries
}

View File

@@ -0,0 +1,267 @@
//! OpenAlex API client
use std::time::Duration;
use reqwest::{Client, StatusCode};
use serde::Deserialize;
use crate::{OpenAlexError, Work};
/// OpenAlex API client
///
/// Thin wrapper around a pooled `reqwest::Client` configured with a 30s
/// timeout, gzip decompression, and a RuVector user agent.
pub struct OpenAlexClient {
    // Underlying HTTP client (connection pooling, timeout, gzip).
    client: Client,
    // API root, e.g. "https://api.openalex.org"; overridable for tests
    // via `with_base_url`.
    base_url: String,
    // Contact email; when set, appended as `mailto=` to request URLs,
    // which places requests in OpenAlex's "polite pool".
    email: Option<String>,
}
/// API response wrapper
///
/// Generic envelope OpenAlex returns for list endpoints: a `meta` block
/// plus the page of `results`.
#[derive(Debug, Deserialize)]
pub struct ApiResponse<T> {
    /// Metadata (total count, paging, next cursor)
    pub meta: ApiMeta,
    /// Results for the current page
    pub results: Vec<T>,
}
/// API metadata
///
/// Paging information accompanying every list response. `page`/`per_page`
/// apply to offset paging; `next_cursor` to cursor-based paging.
#[derive(Debug, Deserialize)]
pub struct ApiMeta {
    /// Total count of matching records
    pub count: u64,
    /// Current page (offset pagination)
    pub page: Option<u32>,
    /// Results per page
    pub per_page: Option<u32>,
    /// Next cursor (for cursor-based pagination); `None` when exhausted
    pub next_cursor: Option<String>,
}
impl OpenAlexClient {
    /// Create a new OpenAlex client
    ///
    /// Providing an email enables the "polite pool" with higher rate limits.
    pub fn new(email: Option<String>) -> Self {
        let client = Client::builder()
            .timeout(Duration::from_secs(30))
            .user_agent("RuVector/0.1.0")
            .gzip(true)
            .build()
            // Builder only fails on invalid TLS/config; treated as a bug.
            .expect("Failed to build HTTP client");
        Self {
            client,
            base_url: "https://api.openalex.org".to_string(),
            email,
        }
    }

    /// Set custom base URL (for testing)
    pub fn with_base_url(mut self, url: &str) -> Self {
        self.base_url = url.to_string();
        self
    }

    /// Build URL with email parameter
    ///
    /// `params` is a pre-encoded query string and may be empty; when an
    /// email is configured, `mailto=<email>` is appended (with `&` only if
    /// other params exist).
    fn build_url(&self, endpoint: &str, params: &str) -> String {
        let mut url = format!("{}/{}?{}", self.base_url, endpoint, params);
        if let Some(ref email) = self.email {
            if !params.is_empty() {
                url.push('&');
            }
            url.push_str(&format!("mailto={}", email));
        }
        url
    }

    /// Health check - verify API is accessible
    ///
    /// Issues a minimal one-result query; returns `Ok(true)` on any 2xx.
    pub async fn health_check(&self) -> Result<bool, OpenAlexError> {
        let url = format!("{}/works?per_page=1", self.base_url);
        let response = self.client.get(&url).send().await?;
        Ok(response.status().is_success())
    }

    /// Fetch a page of works with pagination
    ///
    /// `filter` is a pre-encoded query fragment (e.g. `filter=cites:W123`)
    /// or empty. Passing `cursor: None` starts cursor pagination
    /// (`cursor=*`); feed the returned cursor back in to get the next page.
    /// Returns the page of works and the next cursor (`None` when done).
    pub async fn fetch_works_page(
        &self,
        filter: &str,
        cursor: Option<String>,
        per_page: usize,
    ) -> Result<(Vec<Work>, Option<String>), OpenAlexError> {
        let mut params = format!("per_page={}", per_page);
        if !filter.is_empty() {
            params.push_str(&format!("&{}", filter));
        }
        if let Some(c) = cursor {
            params.push_str(&format!("&cursor={}", c));
        } else {
            // Use cursor-based pagination for bulk
            params.push_str("&cursor=*");
        }
        let url = self.build_url("works", &params);
        let response = self.client.get(&url).send().await?;
        match response.status() {
            StatusCode::OK => {
                let api_response: ApiResponse<Work> = response.json().await?;
                Ok((api_response.results, api_response.meta.next_cursor))
            }
            StatusCode::TOO_MANY_REQUESTS => {
                // Honor the server's retry-after header, defaulting to 60s
                // when missing or unparseable.
                let retry_after = response
                    .headers()
                    .get("retry-after")
                    .and_then(|v| v.to_str().ok())
                    .and_then(|s| s.parse().ok())
                    .unwrap_or(60);
                Err(OpenAlexError::RateLimited(retry_after))
            }
            status => Err(OpenAlexError::Api(format!(
                "Unexpected status: {}",
                status
            ))),
        }
    }

    /// Fetch a single work by ID
    ///
    /// Accepts either a full `https://...` URL or a bare `W...` id; any
    /// other shape is rejected as `InvalidId`.
    pub async fn get_work(&self, id: &str) -> Result<Work, OpenAlexError> {
        // Normalize ID format
        let normalized_id = if id.starts_with("https://") {
            id.to_string()
        } else if id.starts_with("W") {
            format!("https://openalex.org/{}", id)
        } else {
            return Err(OpenAlexError::InvalidId(id.to_string()));
        };
        let url = self.build_url(&format!("works/{}", normalized_id), "");
        let response = self.client.get(&url).send().await?;
        match response.status() {
            StatusCode::OK => Ok(response.json().await?),
            // 404 is reported as an invalid id rather than a generic API error.
            StatusCode::NOT_FOUND => Err(OpenAlexError::InvalidId(id.to_string())),
            status => Err(OpenAlexError::Api(format!(
                "Unexpected status: {}",
                status
            ))),
        }
    }

    /// Search works by query
    ///
    /// Full-text search; `query` is percent-encoded before being sent.
    pub async fn search_works(
        &self,
        query: &str,
        per_page: usize,
    ) -> Result<Vec<Work>, OpenAlexError> {
        let params = format!("search={}&per_page={}", urlencoding::encode(query), per_page);
        let url = self.build_url("works", &params);
        let response = self.client.get(&url).send().await?;
        match response.status() {
            StatusCode::OK => {
                let api_response: ApiResponse<Work> = response.json().await?;
                Ok(api_response.results)
            }
            status => Err(OpenAlexError::Api(format!(
                "Unexpected status: {}",
                status
            ))),
        }
    }

    /// Fetch works by topic (first page only; use `fetch_works_page` to paginate)
    pub async fn works_by_topic(
        &self,
        topic_id: &str,
        per_page: usize,
    ) -> Result<Vec<Work>, OpenAlexError> {
        let filter = format!("filter=primary_topic.id:{}", topic_id);
        let (works, _) = self.fetch_works_page(&filter, None, per_page).await?;
        Ok(works)
    }

    /// Fetch works by author (first page only)
    pub async fn works_by_author(
        &self,
        author_id: &str,
        per_page: usize,
    ) -> Result<Vec<Work>, OpenAlexError> {
        let filter = format!("filter=authorships.author.id:{}", author_id);
        let (works, _) = self.fetch_works_page(&filter, None, per_page).await?;
        Ok(works)
    }

    /// Fetch works by institution (first page only)
    pub async fn works_by_institution(
        &self,
        institution_id: &str,
        per_page: usize,
    ) -> Result<Vec<Work>, OpenAlexError> {
        let filter = format!(
            "filter=authorships.institutions.id:{}",
            institution_id
        );
        let (works, _) = self.fetch_works_page(&filter, None, per_page).await?;
        Ok(works)
    }

    /// Fetch works citing a specific work (first page only)
    pub async fn citing_works(
        &self,
        work_id: &str,
        per_page: usize,
    ) -> Result<Vec<Work>, OpenAlexError> {
        let filter = format!("filter=cites:{}", work_id);
        let (works, _) = self.fetch_works_page(&filter, None, per_page).await?;
        Ok(works)
    }

    /// Fetch works cited by a specific work
    ///
    /// Resolves each referenced work with an individual request (capped at
    /// 100 to bound request volume); failed lookups are silently skipped,
    /// so the result may be shorter than the reference list.
    pub async fn cited_by_work(&self, work_id: &str) -> Result<Vec<Work>, OpenAlexError> {
        let work = self.get_work(work_id).await?;
        // Fetch referenced works
        let mut cited_works = Vec::new();
        for ref_id in work.referenced_works.iter().take(100) {
            // Limit to avoid too many requests
            if let Ok(cited) = self.get_work(ref_id).await {
                cited_works.push(cited);
            }
        }
        Ok(cited_works)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A default client must target the public OpenAlex API root.
    #[test]
    fn test_client_creation() {
        let client = OpenAlexClient::new(None);
        assert_eq!(client.base_url, "https://api.openalex.org");
    }

    /// When an email is configured it appears as a `mailto` parameter.
    #[test]
    fn test_client_with_email() {
        let client = OpenAlexClient::new(Some(String::from("test@example.com")));
        let built = client.build_url("works", "per_page=10");
        assert!(built.contains("mailto=test@example.com"));
    }

    /// Built URLs are rooted at the base URL and carry the given params.
    #[test]
    fn test_url_building() {
        let built = OpenAlexClient::new(None).build_url("works", "filter=publication_year:2023");
        assert!(built.starts_with("https://api.openalex.org/works"));
        assert!(built.contains("filter=publication_year:2023"));
    }
}

View File

@@ -0,0 +1,518 @@
//! Research frontier detection using coherence signals
use std::collections::HashMap;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use crate::{TopicEdge, TopicGraph, TopicNode, Work};
/// An emerging research frontier
///
/// Produced by `FrontierRadar::detect_frontiers` when a topic shows both
/// rapid work-count growth and a shift in its local coherence between
/// consecutive snapshots.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmergingFrontier {
    /// Frontier identifier (sequential, e.g. "frontier_0")
    pub id: String,
    /// Primary topic name
    pub name: String,
    /// Related topic names (neighbors in the topic graph)
    pub related_topics: Vec<String>,
    /// Growth rate (works per year)
    pub growth_rate: f64,
    /// Coherence delta (change in min-cut boundary)
    pub coherence_delta: f64,
    /// Citation momentum (trend in citation rates)
    pub citation_momentum: f64,
    /// Detected boundary nodes (topics at the frontier edge)
    pub boundary_topics: Vec<String>,
    /// First detected (timestamp of the snapshot that triggered detection)
    pub detected_at: DateTime<Utc>,
    /// Confidence score (0-1)
    pub confidence: f64,
    /// Evidence supporting this frontier
    pub evidence: Vec<FrontierEvidence>,
}
/// Evidence for a frontier detection
///
/// One quantitative signal backing an [`EmergingFrontier`], e.g.
/// "growth_rate", "coherence_delta", or "citation_momentum".
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FrontierEvidence {
    /// Evidence type (machine-readable tag)
    pub evidence_type: String,
    /// Raw value of the signal
    pub value: f64,
    /// Human-readable explanation
    pub explanation: String,
}
/// A cross-domain bridge connecting two research areas
///
/// Produced by `FrontierRadar::detect_bridges` from edges whose endpoints
/// fall in different (heuristically derived) domains.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossDomainBridge {
    /// Bridge identifier (sequential, e.g. "bridge_0")
    pub id: String,
    /// Source domain/topic
    pub source_domain: String,
    /// Target domain/topic
    pub target_domain: String,
    /// Bridge topics (connector nodes)
    pub bridge_topics: Vec<String>,
    /// Citation flow (source → target)
    pub citation_flow: f64,
    /// Reverse flow (target → source)
    pub reverse_flow: f64,
    /// Bridge strength (combined normalized flow)
    pub strength: f64,
    /// Is this a new connection?
    pub is_emerging: bool,
    /// First observed
    pub first_observed: DateTime<Utc>,
    /// Key papers establishing the bridge
    pub key_works: Vec<String>,
}
/// Research frontier radar for detecting emerging fields
///
/// Accumulates time-ordered topic-graph snapshots and compares consecutive
/// pairs to surface fast-growing, structurally shifting topics.
pub struct FrontierRadar {
    /// Topic graph snapshots over time (kept sorted by timestamp)
    snapshots: Vec<(DateTime<Utc>, TopicGraph)>,
    /// Minimum relative growth rate to consider a topic
    min_growth_rate: f64,
    /// Minimum absolute coherence shift to detect
    min_coherence_shift: f64,
    /// Detected frontiers (cached by `detect_frontiers`)
    frontiers: Vec<EmergingFrontier>,
    /// Detected bridges (cached by `detect_bridges`)
    bridges: Vec<CrossDomainBridge>,
}
impl FrontierRadar {
    /// Create a new frontier radar
    ///
    /// `min_growth_rate`: minimum relative work-count growth between two
    /// consecutive snapshots. `min_coherence_shift`: minimum absolute change
    /// in a topic's local coherence to flag it.
    pub fn new(min_growth_rate: f64, min_coherence_shift: f64) -> Self {
        Self {
            snapshots: Vec::new(),
            min_growth_rate,
            min_coherence_shift,
            frontiers: Vec::new(),
            bridges: Vec::new(),
        }
    }

    /// Add a topic graph snapshot
    ///
    /// Snapshots are re-sorted by timestamp after every insert so that
    /// consecutive pairs can be compared chronologically.
    pub fn add_snapshot(&mut self, timestamp: DateTime<Utc>, graph: TopicGraph) {
        self.snapshots.push((timestamp, graph));
        self.snapshots.sort_by_key(|(ts, _)| *ts);
    }

    /// Build snapshots from works partitioned by time
    ///
    /// Splits the publication-date range into consecutive windows of
    /// `window_days`; each non-empty window becomes one snapshot. Works
    /// without a publication date are ignored.
    pub fn build_from_works(&mut self, works: &[Work], window_days: i64) {
        if works.is_empty() {
            return;
        }
        // Find time range
        let mut min_date = Utc::now();
        let mut max_date = DateTime::<Utc>::MIN_UTC;
        for work in works {
            if let Some(date) = work.publication_date {
                if date < min_date {
                    min_date = date;
                }
                if date > max_date {
                    max_date = date;
                }
            }
        }
        // Partition works into time windows
        let window_duration = chrono::Duration::days(window_days);
        let mut current_start = min_date;
        while current_start < max_date {
            let current_end = current_start + window_duration;
            // Works whose date falls in [current_start, current_end)
            let window_works: Vec<_> = works
                .iter()
                .filter(|w| {
                    w.publication_date
                        .map(|d| d >= current_start && d < current_end)
                        .unwrap_or(false)
                })
                .cloned()
                .collect();
            if !window_works.is_empty() {
                let graph = TopicGraph::from_works(&window_works);
                self.add_snapshot(current_start, graph);
            }
            current_start = current_end;
        }
    }

    /// Detect emerging frontiers from snapshots
    ///
    /// Compares each consecutive snapshot pair. A topic is flagged when its
    /// work count grows faster than `min_growth_rate` AND its local
    /// coherence shifts by more than `min_coherence_shift`; detections with
    /// confidence below 0.3 are dropped. Results are sorted by descending
    /// confidence and also cached on `self` (see [`Self::frontiers`]).
    /// Returns an empty vec when fewer than two snapshots are loaded.
    pub fn detect_frontiers(&mut self) -> Vec<EmergingFrontier> {
        if self.snapshots.len() < 2 {
            return vec![];
        }
        let mut frontiers = Vec::new();
        let mut frontier_counter = 0;
        // Compare consecutive snapshots
        for i in 1..self.snapshots.len() {
            let (prev_ts, prev_graph) = &self.snapshots[i - 1];
            let (curr_ts, curr_graph) = &self.snapshots[i];
            // Find topics with significant growth
            for (topic_id, curr_node) in &curr_graph.topics {
                let prev_node = prev_graph.topics.get(topic_id);
                // Relative growth; topics new in this snapshot (or previously
                // empty) count as infinite growth and always pass the gate.
                let growth = if let Some(prev) = prev_node {
                    if prev.work_count > 0 {
                        (curr_node.work_count as f64 - prev.work_count as f64)
                            / prev.work_count as f64
                    } else {
                        f64::INFINITY
                    }
                } else {
                    // New topic
                    f64::INFINITY
                };
                if growth > self.min_growth_rate {
                    // Calculate coherence shift
                    let coherence_delta = self.compute_topic_coherence_delta(
                        topic_id,
                        prev_graph,
                        curr_graph,
                    );
                    if coherence_delta.abs() > self.min_coherence_shift {
                        // Calculate citation momentum (change in avg citations;
                        // 0.0 assumed when the topic is new)
                        let citation_momentum = curr_node.avg_citations
                            - prev_node.map(|n| n.avg_citations).unwrap_or(0.0);
                        // Find boundary topics
                        let boundary_topics = self.find_boundary_topics(topic_id, curr_graph);
                        // Build evidence
                        let mut evidence = vec![
                            FrontierEvidence {
                                evidence_type: "growth_rate".to_string(),
                                value: growth,
                                explanation: format!(
                                    "{:.0}% increase in works",
                                    growth * 100.0
                                ),
                            },
                            FrontierEvidence {
                                evidence_type: "coherence_delta".to_string(),
                                value: coherence_delta,
                                explanation: format!(
                                    "Coherence {} by {:.2}",
                                    if coherence_delta > 0.0 {
                                        "increased"
                                    } else {
                                        "decreased"
                                    },
                                    coherence_delta.abs()
                                ),
                            },
                        ];
                        // Citation momentum is only cited as evidence when positive
                        if citation_momentum > 0.0 {
                            evidence.push(FrontierEvidence {
                                evidence_type: "citation_momentum".to_string(),
                                value: citation_momentum,
                                explanation: format!(
                                    "+{:.1} avg citations",
                                    citation_momentum
                                ),
                            });
                        }
                        // Calculate confidence based on evidence strength
                        let confidence = self.calculate_confidence(growth, coherence_delta, citation_momentum);
                        if confidence >= 0.3 {
                            frontiers.push(EmergingFrontier {
                                id: format!("frontier_{}", frontier_counter),
                                name: curr_node.name.clone(),
                                related_topics: self.find_related_topics(topic_id, curr_graph),
                                growth_rate: curr_node.growth_rate,
                                coherence_delta,
                                citation_momentum,
                                boundary_topics,
                                detected_at: *curr_ts,
                                confidence,
                                evidence,
                            });
                            frontier_counter += 1;
                        }
                    }
                }
            }
        }
        // Sort by confidence (descending; NaN-safe via Ordering::Equal)
        frontiers.sort_by(|a, b| {
            b.confidence
                .partial_cmp(&a.confidence)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        self.frontiers = frontiers.clone();
        frontiers
    }

    /// Detect cross-domain bridges
    ///
    /// Operates on the latest snapshot only. Domains are derived from the
    /// first word of each topic name (see [`Self::get_domain`]); edges whose
    /// endpoints map to different domains are aggregated per domain pair.
    /// Results are sorted by descending strength and cached on `self`.
    pub fn detect_bridges(&mut self) -> Vec<CrossDomainBridge> {
        if self.snapshots.is_empty() {
            return vec![];
        }
        let mut bridges = Vec::new();
        let mut bridge_counter = 0;
        let (curr_ts, curr_graph) = self.snapshots.last().unwrap();
        // Build domain → topics mapping (simplified: use top-level grouping)
        // NOTE(review): `domain_topics` is populated but never read below —
        // candidate for removal.
        let mut domain_topics: HashMap<String, Vec<String>> = HashMap::new();
        for (topic_id, node) in &curr_graph.topics {
            // Use first word as domain (simplified)
            let domain = node
                .name
                .split_whitespace()
                .next()
                .unwrap_or("Unknown")
                .to_string();
            domain_topics
                .entry(domain.clone())
                .or_default()
                .push(topic_id.clone());
        }
        // Find cross-domain edges, grouped by (source domain, target domain)
        let mut domain_flows: HashMap<(String, String), Vec<&TopicEdge>> = HashMap::new();
        for edge in &curr_graph.edges {
            let src_domain = self.get_domain(&edge.source, curr_graph);
            let tgt_domain = self.get_domain(&edge.target, curr_graph);
            if src_domain != tgt_domain {
                domain_flows
                    .entry((src_domain.clone(), tgt_domain.clone()))
                    .or_default()
                    .push(edge);
            }
        }
        // Create bridge records
        for ((src_domain, tgt_domain), edges) in domain_flows {
            let total_flow: f64 = edges.iter().map(|e| e.weight).sum();
            let citation_count: usize = edges.iter().map(|e| e.citation_count).sum();
            if citation_count >= 5 {
                // Minimum threshold
                // Unique endpoint topics of the bridging edges
                let bridge_topics: Vec<String> = edges
                    .iter()
                    .flat_map(|e| vec![e.source.clone(), e.target.clone()])
                    .collect::<std::collections::HashSet<_>>()
                    .into_iter()
                    .collect();
                // Check if this is emerging (compare with previous snapshot)
                let is_emerging = if self.snapshots.len() >= 2 {
                    let (_, prev_graph) = &self.snapshots[self.snapshots.len() - 2];
                    let prev_flow: f64 = prev_graph
                        .edges
                        .iter()
                        .filter(|e| {
                            self.get_domain(&e.source, prev_graph) == src_domain
                                && self.get_domain(&e.target, prev_graph) == tgt_domain
                        })
                        .map(|e| e.weight)
                        .sum();
                    total_flow > prev_flow * 1.5 // 50% growth
                } else {
                    // With a single snapshot every bridge counts as emerging
                    true
                };
                bridges.push(CrossDomainBridge {
                    id: format!("bridge_{}", bridge_counter),
                    source_domain: src_domain.clone(),
                    target_domain: tgt_domain.clone(),
                    bridge_topics,
                    citation_flow: total_flow,
                    reverse_flow: 0.0, // Would need to compute reverse direction
                    strength: total_flow / citation_count as f64,
                    is_emerging,
                    first_observed: *curr_ts,
                    key_works: vec![], // Would need work-level data
                });
                bridge_counter += 1;
            }
        }
        // Sort by strength (descending; NaN-safe via Ordering::Equal)
        bridges.sort_by(|a, b| {
            b.strength
                .partial_cmp(&a.strength)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        self.bridges = bridges.clone();
        bridges
    }

    /// Compute coherence delta for a topic between snapshots
    fn compute_topic_coherence_delta(
        &self,
        topic_id: &str,
        prev_graph: &TopicGraph,
        curr_graph: &TopicGraph,
    ) -> f64 {
        // Compute local coherence as ratio of intra-topic to inter-topic edges
        let prev_coherence = self.compute_local_coherence(topic_id, prev_graph);
        let curr_coherence = self.compute_local_coherence(topic_id, curr_graph);
        curr_coherence - prev_coherence
    }

    /// Compute local coherence for a topic
    ///
    /// Defined here as the mean weight of the topic's incident edges;
    /// 0.0 for isolated topics.
    fn compute_local_coherence(&self, topic_id: &str, graph: &TopicGraph) -> f64 {
        // Find edges involving this topic
        let edges: Vec<_> = graph
            .edges
            .iter()
            .filter(|e| e.source == topic_id || e.target == topic_id)
            .collect();
        if edges.is_empty() {
            return 0.0;
        }
        // Coherence = sum of weights
        edges.iter().map(|e| e.weight).sum::<f64>() / edges.len() as f64
    }

    /// Find topics at the boundary (connected to other clusters)
    ///
    /// NOTE(review): only outgoing edges (`source == topic_id`) are
    /// considered here, unlike `find_related_topics` which checks both
    /// directions — confirm this asymmetry is intentional.
    fn find_boundary_topics(&self, topic_id: &str, graph: &TopicGraph) -> Vec<String> {
        // Find topics connected to this topic that have high connectivity elsewhere
        graph
            .edges
            .iter()
            .filter(|e| e.source == topic_id)
            .map(|e| e.target.clone())
            .take(5)
            .collect()
    }

    /// Find related topics
    ///
    /// Returns up to 10 neighbors of `topic_id` (either edge direction),
    /// in edge order.
    fn find_related_topics(&self, topic_id: &str, graph: &TopicGraph) -> Vec<String> {
        graph
            .edges
            .iter()
            .filter(|e| e.source == topic_id || e.target == topic_id)
            .flat_map(|e| {
                if e.source == topic_id {
                    vec![e.target.clone()]
                } else {
                    vec![e.source.clone()]
                }
            })
            .take(10)
            .collect()
    }

    /// Get domain for a topic (simplified)
    ///
    /// Uses the first whitespace-separated word of the topic name;
    /// "Unknown" for missing topics or empty names.
    fn get_domain(&self, topic_id: &str, graph: &TopicGraph) -> String {
        graph
            .topics
            .get(topic_id)
            .map(|n| {
                n.name
                    .split_whitespace()
                    .next()
                    .unwrap_or("Unknown")
                    .to_string()
            })
            .unwrap_or_else(|| "Unknown".to_string())
    }

    /// Calculate confidence score
    ///
    /// Weighted blend of the three clamped signals: 40% growth (capped at
    /// 5x), 40% |coherence delta| (capped at 1), 20% citation momentum
    /// (scaled by 1/10, capped at 1); result is clamped to [0, 1].
    fn calculate_confidence(
        &self,
        growth: f64,
        coherence_delta: f64,
        citation_momentum: f64,
    ) -> f64 {
        let growth_score = (growth.min(5.0) / 5.0).max(0.0);
        let coherence_score = (coherence_delta.abs().min(1.0)).max(0.0);
        let citation_score = (citation_momentum / 10.0).min(1.0).max(0.0);
        (growth_score * 0.4 + coherence_score * 0.4 + citation_score * 0.2).min(1.0)
    }

    /// Get detected frontiers (as cached by the last `detect_frontiers` call)
    pub fn frontiers(&self) -> &[EmergingFrontier] {
        &self.frontiers
    }

    /// Get detected bridges (as cached by the last `detect_bridges` call)
    pub fn bridges(&self) -> &[CrossDomainBridge] {
        &self.bridges
    }

    /// Get highest confidence frontiers
    ///
    /// Relies on the cached list already being sorted by confidence.
    pub fn top_frontiers(&self, n: usize) -> Vec<&EmergingFrontier> {
        self.frontiers.iter().take(n).collect()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed radar has no cached detections.
    #[test]
    fn test_frontier_radar_creation() {
        let radar = FrontierRadar::new(0.1, 0.2);
        assert!(radar.frontiers().is_empty());
        assert!(radar.bridges().is_empty());
    }

    /// Confidence blends growth, coherence shift and citation momentum.
    #[test]
    fn test_confidence_calculation() {
        let radar = FrontierRadar::new(0.1, 0.2);
        // Strong signals across the board → confident detection.
        assert!(radar.calculate_confidence(2.0, 0.5, 5.0) > 0.5);
        // Weak signals everywhere → below the 0.3 acceptance floor.
        assert!(radar.calculate_confidence(0.05, 0.01, 0.1) < 0.3);
    }
}

View File

@@ -0,0 +1,476 @@
//! # RuVector OpenAlex Integration
//!
//! Integration with OpenAlex, the open catalog of scholarly works, authors,
//! institutions, and topics. Enables novel discovery through:
//!
//! - **Emerging Field Detection**: Find topic splits/merges as cut boundaries shift
//! - **Cross-Domain Bridges**: Identify connector subgraphs between disciplines
//! - **Funding-to-Output Causality**: Map funder → lab → venue → citation chains
//!
//! ## OpenAlex Data Model
//!
//! OpenAlex provides a rich graph structure:
//! - **Works**: 250M+ scholarly publications
//! - **Authors**: 90M+ researchers with affiliations
//! - **Institutions**: 100K+ universities, labs, companies
//! - **Topics**: Hierarchical concept taxonomy
//! - **Funders**: Research funding organizations
//! - **Sources**: Journals, conferences, repositories
//!
//! ## Quick Start
//!
//! ```rust,ignore
//! use ruvector_data_openalex::{OpenAlexClient, FrontierRadar, TopicGraph};
//!
//! // Initialize client
//! let client = OpenAlexClient::new(Some("your-email@example.com".to_string()));
//!
//! // Fetch recent works for a topic and window them into snapshots
//! let works = client.works_by_topic("T10017", 200).await?;
//!
//! // Detect emerging research frontiers
//! let mut radar = FrontierRadar::new(0.15, 0.2);
//! radar.build_from_works(&works, 180);
//! let frontiers = radar.detect_frontiers();
//!
//! for frontier in frontiers {
//!     println!("Emerging: {} (coherence shift: {:.2})",
//!         frontier.name, frontier.coherence_delta);
//! }
//! ```
#![warn(missing_docs)]
#![warn(clippy::all)]
pub mod client;
pub mod frontier;
pub mod schema;
use std::collections::HashMap;
use async_trait::async_trait;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use thiserror::Error;
pub use client::OpenAlexClient;
pub use frontier::{CrossDomainBridge, EmergingFrontier, FrontierRadar};
pub use schema::{
Author, AuthorPosition, Authorship, Concept, Funder, Institution, Source, Topic, Work,
};
use ruvector_data_framework::{DataRecord, DataSource, FrameworkError, Relationship, Result};
/// OpenAlex-specific error types
///
/// Converted into `FrameworkError::Ingestion` at the framework boundary.
#[derive(Error, Debug)]
pub enum OpenAlexError {
    /// API request failed (unexpected HTTP status or malformed response)
    #[error("API error: {0}")]
    Api(String),
    /// Rate limit exceeded; payload is the suggested retry delay in seconds
    #[error("Rate limit exceeded, retry after {0}s")]
    RateLimited(u64),
    /// Invalid entity ID (bad format or 404 from the API)
    #[error("Invalid OpenAlex ID: {0}")]
    InvalidId(String),
    /// Parsing failed
    #[error("Parse error: {0}")]
    Parse(String),
    /// Network error (propagated from reqwest)
    #[error("Network error: {0}")]
    Network(#[from] reqwest::Error),
}
impl From<OpenAlexError> for FrameworkError {
    /// Map any OpenAlex failure onto the framework's ingestion error,
    /// preserving the original message via its `Display` rendering.
    fn from(err: OpenAlexError) -> Self {
        FrameworkError::Ingestion(format!("{}", err))
    }
}
/// Configuration for OpenAlex data source
///
/// See [`OpenAlexConfig::default`] for the standard settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OpenAlexConfig {
    /// API base URL
    pub base_url: String,
    /// Email for polite pool (faster rate limits)
    pub email: Option<String>,
    /// Maximum results per page
    pub per_page: usize,
    /// Enable cursor-based pagination for bulk
    pub use_cursor: bool,
    /// Filter to specific entity types
    pub entity_types: Vec<EntityType>,
}
impl Default for OpenAlexConfig {
fn default() -> Self {
Self {
base_url: "https://api.openalex.org".to_string(),
email: None,
per_page: 200,
use_cursor: true,
entity_types: vec![EntityType::Work],
}
}
}
/// OpenAlex entity types
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum EntityType {
    /// Scholarly works (papers, books, datasets, ...)
    Work,
    /// Authors
    Author,
    /// Institutions
    Institution,
    /// Topics/concepts
    Topic,
    /// Funding sources
    Funder,
    /// Publication venues (journals, conferences, repositories)
    Source,
}

impl EntityType {
    /// Get the REST API endpoint path segment for this entity type.
    ///
    /// Returns `&'static str` because every endpoint name is a compile-time
    /// string literal; this widens the previous `&str` (lifetime tied to
    /// `&self`) return in a backward-compatible way, letting callers keep
    /// the endpoint independently of the `EntityType` borrow.
    pub fn endpoint(&self) -> &'static str {
        match self {
            EntityType::Work => "works",
            EntityType::Author => "authors",
            EntityType::Institution => "institutions",
            EntityType::Topic => "topics",
            EntityType::Funder => "funders",
            EntityType::Source => "sources",
        }
    }
}
/// OpenAlex data source for the framework
pub struct OpenAlexSource {
    /// HTTP client used for all API calls
    client: OpenAlexClient,
    /// Source configuration (base URL, paging, entity types)
    config: OpenAlexConfig,
    /// API filter key/value pairs applied to every fetch
    filters: HashMap<String, String>,
}

impl OpenAlexSource {
    /// Create a new OpenAlex data source from the given configuration.
    pub fn new(config: OpenAlexConfig) -> Self {
        Self {
            client: OpenAlexClient::new(config.email.clone()),
            config,
            filters: HashMap::new(),
        }
    }

    /// Add a raw API filter (e.g. `"publication_year"` => `"2023"`).
    pub fn with_filter(mut self, key: &str, value: &str) -> Self {
        self.filters.insert(key.to_owned(), value.to_owned());
        self
    }

    /// Restrict results to an inclusive publication-year range.
    pub fn with_year_range(self, start: i32, end: i32) -> Self {
        let range = format!("{}-{}", start, end);
        self.with_filter("publication_year", &range)
    }

    /// Restrict results to a single primary topic.
    pub fn with_topic(self, topic_id: &str) -> Self {
        self.with_filter("primary_topic.id", topic_id)
    }

    /// Restrict results to open-access works only.
    pub fn open_access_only(self) -> Self {
        self.with_filter("open_access.is_oa", "true")
    }
}
#[async_trait]
impl DataSource for OpenAlexSource {
    fn source_id(&self) -> &str {
        "openalex"
    }

    /// Fetch one page of works, applying the configured filters.
    ///
    /// Returns the converted records plus the cursor for the next page
    /// (`None` when the result set is exhausted).
    async fn fetch_batch(
        &self,
        cursor: Option<String>,
        batch_size: usize,
    ) -> Result<(Vec<DataRecord>, Option<String>)> {
        // Build the `filter=` query-string fragment from configured filters.
        // (Was `let mut …`, but the vector is never mutated after `collect`,
        // which triggered `unused_mut` under `#![warn(clippy::all)]`.)
        let query_parts: Vec<String> = self
            .filters
            .iter()
            .map(|(k, v)| format!("{}:{}", k, v))
            .collect();
        let filter_str = if query_parts.is_empty() {
            String::new()
        } else {
            format!("filter={}", query_parts.join(","))
        };
        // Fetch works from the API, capping the page size at the configured
        // per-page maximum.
        let (works, next_cursor) = self
            .client
            .fetch_works_page(&filter_str, cursor, batch_size.min(self.config.per_page))
            .await
            .map_err(|e| FrameworkError::Ingestion(e.to_string()))?;
        // Convert to framework DataRecords.
        let records: Vec<DataRecord> = works.into_iter().map(work_to_record).collect();
        Ok((records, next_cursor))
    }

    async fn total_count(&self) -> Result<Option<u64>> {
        // OpenAlex reports a total in its `meta` object, but retrieving it
        // would require a separate API call; report "unknown" instead.
        Ok(None)
    }

    async fn health_check(&self) -> Result<bool> {
        self.client.health_check().await.map_err(|e| e.into())
    }
}
/// Convert an OpenAlex Work to a DataRecord
fn work_to_record(work: Work) -> DataRecord {
let mut relationships = Vec::new();
// Citations as relationships
for cited_id in &work.referenced_works {
relationships.push(Relationship {
target_id: cited_id.clone(),
rel_type: "cites".to_string(),
weight: 1.0,
properties: HashMap::new(),
});
}
// Author relationships
for authorship in &work.authorships {
relationships.push(Relationship {
target_id: authorship.author.id.clone(),
rel_type: "authored_by".to_string(),
weight: 1.0 / work.authorships.len() as f64,
properties: HashMap::new(),
});
// Institution relationships
for inst in &authorship.institutions {
relationships.push(Relationship {
target_id: inst.id.clone(),
rel_type: "affiliated_with".to_string(),
weight: 0.5,
properties: HashMap::new(),
});
}
}
// Topic relationships
if let Some(ref topic) = work.primary_topic {
relationships.push(Relationship {
target_id: topic.id.clone(),
rel_type: "primary_topic".to_string(),
weight: topic.score,
properties: HashMap::new(),
});
}
DataRecord {
id: work.id.clone(),
source: "openalex".to_string(),
record_type: "work".to_string(),
timestamp: work.publication_date.unwrap_or_else(Utc::now),
data: serde_json::to_value(&work).unwrap_or_default(),
embedding: None, // Would compute from title/abstract
relationships,
}
}
/// Topic-based citation graph for frontier detection
pub struct TopicGraph {
    /// Topics as nodes, keyed by OpenAlex topic ID
    pub topics: HashMap<String, TopicNode>,
    /// Topic-to-topic edges (via citations crossing topic boundaries)
    pub edges: Vec<TopicEdge>,
    /// Time window covered by the underlying works: (earliest, latest)
    pub time_window: (DateTime<Utc>, DateTime<Utc>),
}
/// A topic node in the graph
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopicNode {
    /// OpenAlex topic ID
    pub id: String,
    /// Topic display name
    pub name: String,
    /// Number of works in this topic
    pub work_count: usize,
    /// Average citation count across this topic's works
    pub avg_citations: f64,
    /// Growth rate (works per year over the graph's time window)
    pub growth_rate: f64,
}
/// An edge between topics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopicEdge {
    /// Source topic ID
    pub source: String,
    /// Target topic ID
    pub target: String,
    /// Number of citations across the topic boundary
    pub citation_count: usize,
    /// Weight normalized by the product of the two topics' work counts
    pub weight: f64,
}
impl TopicGraph {
    /// Build a topic graph from works.
    ///
    /// Each work contributes to its primary topic's node statistics (work
    /// count and a running average of citation counts) and to the overall
    /// time window. Cross-topic citation edges would require the topics of
    /// each referenced work, which are not available from the works alone,
    /// so in this simplified model `edges` ends up empty.
    pub fn from_works(works: &[Work]) -> Self {
        let mut topics: HashMap<String, TopicNode> = HashMap::new();
        // Placeholder for cross-topic citation tallies; never populated in
        // the simplified model. (Was `let mut …`, which triggered an
        // `unused_mut` warning under `#![warn(clippy::all)]`.)
        let edge_counts: HashMap<(String, String), usize> = HashMap::new();
        // Time-window trackers. If no work carries a date these remain
        // inverted (now, MIN_UTC); the growth-rate guard below then yields
        // 0.0 because the span is non-positive.
        let mut min_date = Utc::now();
        let mut max_date = DateTime::<Utc>::MIN_UTC;
        for work in works {
            if let Some(date) = work.publication_date {
                if date < min_date {
                    min_date = date;
                }
                if date > max_date {
                    max_date = date;
                }
            }
            // Works without a primary topic contribute only to the window.
            let source_topic = match &work.primary_topic {
                Some(t) => t.id.clone(),
                None => continue,
            };
            // Update or create the topic node.
            let node = topics.entry(source_topic.clone()).or_insert_with(|| TopicNode {
                id: source_topic.clone(),
                name: work
                    .primary_topic
                    .as_ref()
                    .map(|t| t.display_name.clone())
                    .unwrap_or_default(),
                work_count: 0,
                avg_citations: 0.0,
                growth_rate: 0.0,
            });
            node.work_count += 1;
            // Incremental (running) mean of citation counts.
            node.avg_citations = (node.avg_citations * (node.work_count - 1) as f64
                + work.cited_by_count as f64)
                / node.work_count as f64;
        }
        // Growth rate = works per year over the observed time window.
        let time_span_years = (max_date - min_date).num_days() as f64 / 365.0;
        for node in topics.values_mut() {
            node.growth_rate = if time_span_years > 0.0 {
                node.work_count as f64 / time_span_years
            } else {
                0.0
            };
        }
        // Build edges, normalizing each citation count by the product of the
        // two endpoint topics' work counts.
        let edges: Vec<TopicEdge> = edge_counts
            .into_iter()
            .map(|((src, tgt), count)| {
                let src_count = topics.get(&src).map(|n| n.work_count).unwrap_or(1);
                let tgt_count = topics.get(&tgt).map(|n| n.work_count).unwrap_or(1);
                let weight = count as f64 / (src_count * tgt_count) as f64;
                TopicEdge {
                    source: src,
                    target: tgt,
                    citation_count: count,
                    weight,
                }
            })
            .collect();
        Self {
            topics,
            edges,
            time_window: (min_date, max_date),
        }
    }

    /// Number of topic nodes in the graph.
    pub fn topic_count(&self) -> usize {
        self.topics.len()
    }

    /// Number of topic-to-topic edges.
    pub fn edge_count(&self) -> usize {
        self.edges.len()
    }

    /// The `top_k` topics with the highest growth rate, in descending order.
    pub fn fastest_growing(&self, top_k: usize) -> Vec<&TopicNode> {
        let mut nodes: Vec<_> = self.topics.values().collect();
        // `f64::total_cmp` is a proper total order (NaN-safe), replacing the
        // `partial_cmp().unwrap_or(Equal)` fallback; stability is irrelevant
        // for ranking, so the allocation-free unstable sort is used.
        nodes.sort_unstable_by(|a, b| b.growth_rate.total_cmp(&a.growth_rate));
        nodes.truncate(top_k);
        nodes
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Endpoint mapping is a pure function of the enum variant.
    #[test]
    fn test_entity_endpoints() {
        assert_eq!(EntityType::Work.endpoint(), "works");
        assert_eq!(EntityType::Author.endpoint(), "authors");
        assert_eq!(EntityType::Topic.endpoint(), "topics");
    }

    // Defaults must target the public API with cursor paging enabled.
    #[test]
    fn test_default_config() {
        let config = OpenAlexConfig::default();
        assert_eq!(config.base_url, "https://api.openalex.org");
        assert!(config.use_cursor);
    }

    // Builder-style filter methods accumulate entries in the `filters` map.
    #[test]
    fn test_source_with_filters() {
        let config = OpenAlexConfig::default();
        let source = OpenAlexSource::new(config)
            .with_year_range(2020, 2024)
            .open_access_only();
        assert!(source.filters.contains_key("publication_year"));
        assert!(source.filters.contains_key("open_access.is_oa"));
    }
}

View File

@@ -0,0 +1,627 @@
//! OpenAlex entity schemas
//!
//! Represents the core entity types from OpenAlex:
//! - Works (publications)
//! - Authors
//! - Institutions
//! - Topics/Concepts
//! - Funders
//! - Sources (journals, conferences)
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
/// A scholarly work (paper, book, dataset, etc.)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Work {
    /// OpenAlex ID (e.g., "W2741809807")
    pub id: String,
    /// DOI (if available)
    pub doi: Option<String>,
    /// Work title
    pub title: String,
    /// Publication date
    pub publication_date: Option<DateTime<Utc>>,
    /// Publication year
    pub publication_year: Option<i32>,
    /// Work type (article, book, dataset, etc.)
    #[serde(rename = "type")]
    pub work_type: Option<String>,
    /// Open access status
    pub open_access: Option<OpenAccessStatus>,
    /// Citation count (how many works cite this one)
    pub cited_by_count: u64,
    /// Authors and their affiliations
    #[serde(default)]
    pub authorships: Vec<Authorship>,
    /// Primary topic
    pub primary_topic: Option<TopicReference>,
    /// All associated topics
    #[serde(default)]
    pub topics: Vec<TopicReference>,
    /// Legacy concepts (deprecated upstream but still returned by the API)
    #[serde(default)]
    pub concepts: Vec<ConceptReference>,
    /// OpenAlex IDs of works this work cites (outgoing citations)
    #[serde(default)]
    pub referenced_works: Vec<String>,
    /// OpenAlex IDs of related works
    #[serde(default)]
    pub related_works: Vec<String>,
    /// Abstract as returned by the API — presumably an inverted index
    /// mapping words to positions; kept as raw JSON here (confirm format
    /// against the OpenAlex docs before parsing)
    pub abstract_inverted_index: Option<serde_json::Value>,
    /// Publication venue
    pub primary_location: Option<Location>,
    /// Grants/funding
    #[serde(default)]
    pub grants: Vec<Grant>,
    /// Bibliographic info (volume, issue, pages)
    pub biblio: Option<Biblio>,
    /// Last update time
    pub updated_date: Option<DateTime<Utc>>,
}
/// Open access status
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OpenAccessStatus {
    /// Is this work open access?
    pub is_oa: bool,
    /// OA status type (gold, green, hybrid, bronze)
    pub oa_status: Option<String>,
    /// OA URL if available
    pub oa_url: Option<String>,
}
/// Author and affiliation information for a single work
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Authorship {
    /// Author position (first, middle, last)
    pub author_position: AuthorPosition,
    /// Author details
    pub author: AuthorReference,
    /// Institutions the author was affiliated with at time of publication
    #[serde(default)]
    pub institutions: Vec<InstitutionReference>,
    /// Country codes associated with the affiliations
    #[serde(default)]
    pub countries: Vec<String>,
    /// Is this the corresponding author?
    #[serde(default)]
    pub is_corresponding: bool,
    /// Raw affiliation string as it appeared on the work
    pub raw_affiliation_string: Option<String>,
}
/// Author position in the author list (serialized lowercase: "first", …)
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum AuthorPosition {
    /// First author
    First,
    /// Middle author
    Middle,
    /// Last author
    Last,
}
/// Lightweight reference to an author (as embedded in a work's authorships)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuthorReference {
    /// OpenAlex author ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// ORCID (if available)
    pub orcid: Option<String>,
}
/// Lightweight reference to an institution
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InstitutionReference {
    /// OpenAlex institution ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// Institution type (education, company, etc.)
    #[serde(rename = "type")]
    pub institution_type: Option<String>,
    /// Country code
    pub country_code: Option<String>,
    /// ROR ID (Research Organization Registry)
    pub ror: Option<String>,
}
/// Lightweight reference to a topic
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopicReference {
    /// OpenAlex topic ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// Relevance score (0-1); defaults to 0.0 when absent
    #[serde(default)]
    pub score: f64,
    /// Subfield (one level above the topic in the hierarchy)
    pub subfield: Option<FieldReference>,
    /// Field
    pub field: Option<FieldReference>,
    /// Domain (top of the hierarchy)
    pub domain: Option<FieldReference>,
}
/// Lightweight reference to a concept (legacy taxonomy)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConceptReference {
    /// OpenAlex concept ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// Wikidata ID
    pub wikidata: Option<String>,
    /// Relevance score; defaults to 0.0 when absent
    #[serde(default)]
    pub score: f64,
    /// Hierarchy level (0 = root)
    #[serde(default)]
    pub level: u32,
}
/// Reference to a field/domain in the topic hierarchy
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldReference {
    /// OpenAlex ID
    pub id: String,
    /// Display name
    pub display_name: String,
}
/// Where a work was published or hosted
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Location {
    /// Is this the work's primary location?
    #[serde(default)]
    pub is_primary: bool,
    /// Landing page URL
    pub landing_page_url: Option<String>,
    /// PDF URL
    pub pdf_url: Option<String>,
    /// Source (journal/conference) hosting this location
    pub source: Option<SourceReference>,
    /// License
    pub license: Option<String>,
    /// Version (e.g. submitted/accepted/published — confirm against API)
    pub version: Option<String>,
}
/// Lightweight reference to a source (journal, conference, etc.)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SourceReference {
    /// OpenAlex source ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// Linking ISSN (ISSN-L)
    pub issn_l: Option<String>,
    /// Source type
    #[serde(rename = "type")]
    pub source_type: Option<String>,
    /// Is this an Open Access journal?
    #[serde(default)]
    pub is_oa: bool,
    /// Host organization (publisher)
    pub host_organization: Option<String>,
}
/// Grant/funding information attached to a work
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Grant {
    /// Funder
    pub funder: Option<FunderReference>,
    /// Funder display name
    pub funder_display_name: Option<String>,
    /// Award ID
    pub award_id: Option<String>,
}
/// Lightweight reference to a funder
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FunderReference {
    /// OpenAlex funder ID
    pub id: String,
    /// Display name
    pub display_name: String,
}
/// Bibliographic details (all kept as strings to match the API)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Biblio {
    /// Volume
    pub volume: Option<String>,
    /// Issue
    pub issue: Option<String>,
    /// First page
    pub first_page: Option<String>,
    /// Last page
    pub last_page: Option<String>,
}
/// Full author entity (as returned by the `/authors` endpoint)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Author {
    /// OpenAlex author ID
    pub id: String,
    /// ORCID
    pub orcid: Option<String>,
    /// Display name
    pub display_name: String,
    /// Alternative names
    #[serde(default)]
    pub display_name_alternatives: Vec<String>,
    /// Works count
    pub works_count: u64,
    /// Citation count
    pub cited_by_count: u64,
    /// Summary statistics (h-index, i10-index, …)
    pub summary_stats: Option<AuthorStats>,
    /// Most recent institution
    pub last_known_institution: Option<InstitutionReference>,
    /// All affiliations
    #[serde(default)]
    pub affiliations: Vec<Affiliation>,
    /// Topic areas
    #[serde(default)]
    pub topics: Vec<TopicReference>,
    /// API URL listing this author's works
    pub works_api_url: Option<String>,
    /// Updated date
    pub updated_date: Option<DateTime<Utc>>,
}
/// Author summary statistics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuthorStats {
    /// H-index
    pub h_index: Option<u32>,
    /// i10-index
    pub i10_index: Option<u32>,
    /// Two-year mean citedness (impact-factor-like metric)
    #[serde(rename = "2yr_mean_citedness")]
    pub two_year_mean_citedness: Option<f64>,
}
/// An author's affiliation with an institution over time
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Affiliation {
    /// Institution
    pub institution: InstitutionReference,
    /// Years affiliated
    #[serde(default)]
    pub years: Vec<i32>,
}
/// Full institution entity (as returned by the `/institutions` endpoint)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Institution {
    /// OpenAlex institution ID
    pub id: String,
    /// ROR ID
    pub ror: Option<String>,
    /// Display name
    pub display_name: String,
    /// Country code
    pub country_code: Option<String>,
    /// Institution type
    #[serde(rename = "type")]
    pub institution_type: Option<String>,
    /// Homepage URL
    pub homepage_url: Option<String>,
    /// Works count
    pub works_count: u64,
    /// Citation count
    pub cited_by_count: u64,
    /// Geographic info
    pub geo: Option<GeoLocation>,
    /// Parent institutions (IDs, root first — confirm ordering against API)
    #[serde(default)]
    pub lineage: Vec<String>,
    /// Associated institutions
    #[serde(default)]
    pub associated_institutions: Vec<InstitutionReference>,
    /// Updated date
    pub updated_date: Option<DateTime<Utc>>,
}
/// Geographic location of an institution
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeoLocation {
    /// City
    pub city: Option<String>,
    /// Region/state
    pub region: Option<String>,
    /// Country
    pub country: Option<String>,
    /// Country code
    pub country_code: Option<String>,
    /// Latitude
    pub latitude: Option<f64>,
    /// Longitude
    pub longitude: Option<f64>,
}
/// Full topic entity (as returned by the `/topics` endpoint)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Topic {
    /// OpenAlex topic ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// Description
    pub description: Option<String>,
    /// Keywords
    #[serde(default)]
    pub keywords: Vec<String>,
    /// Works count
    pub works_count: u64,
    /// Citation count
    pub cited_by_count: u64,
    /// Subfield
    pub subfield: Option<FieldReference>,
    /// Field
    pub field: Option<FieldReference>,
    /// Domain
    pub domain: Option<FieldReference>,
    /// Sibling topics (same subfield)
    #[serde(default)]
    pub siblings: Vec<TopicReference>,
    /// Updated date
    pub updated_date: Option<DateTime<Utc>>,
}
/// Legacy concept entity (superseded by topics, still served by the API)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Concept {
    /// OpenAlex concept ID
    pub id: String,
    /// Wikidata ID
    pub wikidata: Option<String>,
    /// Display name
    pub display_name: String,
    /// Description
    pub description: Option<String>,
    /// Hierarchy level (0 = root)
    pub level: u32,
    /// Works count
    pub works_count: u64,
    /// Citation count
    pub cited_by_count: u64,
    /// Parent concepts
    #[serde(default)]
    pub ancestors: Vec<ConceptReference>,
    /// Related concepts (despite the original "child concepts" comment,
    /// the field name says related — confirm semantics against the API)
    #[serde(default)]
    pub related_concepts: Vec<ConceptReference>,
    /// Updated date
    pub updated_date: Option<DateTime<Utc>>,
}
/// Full source entity (journal, conference, repository, …)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Source {
    /// OpenAlex source ID
    pub id: String,
    /// Linking ISSN (ISSN-L)
    pub issn_l: Option<String>,
    /// All ISSNs
    #[serde(default)]
    pub issn: Vec<String>,
    /// Display name
    pub display_name: String,
    /// Publisher / host organization
    pub host_organization: Option<String>,
    /// Source type (journal, conference, etc.)
    #[serde(rename = "type")]
    pub source_type: Option<String>,
    /// Is this an Open Access source?
    #[serde(default)]
    pub is_oa: bool,
    /// Homepage URL
    pub homepage_url: Option<String>,
    /// Works count
    pub works_count: u64,
    /// Citation count
    pub cited_by_count: u64,
    /// Topics commonly published in this source
    #[serde(default)]
    pub topics: Vec<TopicReference>,
    /// Updated date
    pub updated_date: Option<DateTime<Utc>>,
}
/// Full funder entity (as returned by the `/funders` endpoint)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Funder {
    /// OpenAlex funder ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// Alternative names
    #[serde(default)]
    pub alternate_titles: Vec<String>,
    /// Country code
    pub country_code: Option<String>,
    /// Description
    pub description: Option<String>,
    /// Homepage URL
    pub homepage_url: Option<String>,
    /// Grants count
    pub grants_count: u64,
    /// Works count
    pub works_count: u64,
    /// Citation count
    pub cited_by_count: u64,
    /// ROR ID
    pub ror: Option<String>,
    /// Updated date
    pub updated_date: Option<DateTime<Utc>>,
}
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal JSON payload must deserialize; all Vec fields rely on
    // `#[serde(default)]` or are present empty here.
    #[test]
    fn test_work_deserialization() {
        let json = r#"{
            "id": "W123",
            "title": "Test Paper",
            "cited_by_count": 10,
            "authorships": [],
            "topics": [],
            "concepts": [],
            "referenced_works": [],
            "related_works": [],
            "grants": []
        }"#;
        let work: Work = serde_json::from_str(json).unwrap();
        assert_eq!(work.id, "W123");
        assert_eq!(work.title, "Test Paper");
        assert_eq!(work.cited_by_count, 10);
    }

    // Variants round-trip from their lowercase serde representation.
    #[test]
    fn test_author_position() {
        let first = serde_json::from_str::<AuthorPosition>(r#""first""#).unwrap();
        assert_eq!(first, AuthorPosition::First);
        let last = serde_json::from_str::<AuthorPosition>(r#""last""#).unwrap();
        assert_eq!(last, AuthorPosition::Last);
    }
}