Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,267 @@
//! OpenAlex API client
use std::time::Duration;
use reqwest::{Client, StatusCode};
use serde::Deserialize;
use crate::{OpenAlexError, Work};
/// OpenAlex API client
///
/// Thin wrapper over a pre-configured `reqwest::Client` (30s timeout, gzip,
/// fixed user agent — see `new`).
pub struct OpenAlexClient {
    // Underlying HTTP client, built once in `new`.
    client: Client,
    // API root, e.g. "https://api.openalex.org"; overridable via `with_base_url` for tests.
    base_url: String,
    // Contact email appended as `mailto=` to every request URL; enables the
    // OpenAlex "polite pool" (higher rate limits).
    email: Option<String>,
}
/// Generic wrapper for OpenAlex list endpoints: pagination metadata plus one
/// page of deserialized results.
#[derive(Debug, Deserialize)]
pub struct ApiResponse<T> {
    /// Pagination and count metadata for this page
    pub meta: ApiMeta,
    /// The page of results
    pub results: Vec<T>,
}
/// Pagination metadata returned in the `meta` object of list responses.
#[derive(Debug, Deserialize)]
pub struct ApiMeta {
    /// Total number of matching records (across all pages)
    pub count: u64,
    /// Current page (page-based pagination only)
    pub page: Option<u32>,
    /// Results per page
    pub per_page: Option<u32>,
    /// Next cursor (cursor-based pagination); `None` when exhausted
    pub next_cursor: Option<String>,
}
impl OpenAlexClient {
/// Create a new OpenAlex client
///
/// Providing an email enables the "polite pool" with higher rate limits.
pub fn new(email: Option<String>) -> Self {
let client = Client::builder()
.timeout(Duration::from_secs(30))
.user_agent("RuVector/0.1.0")
.gzip(true)
.build()
.expect("Failed to build HTTP client");
Self {
client,
base_url: "https://api.openalex.org".to_string(),
email,
}
}
/// Set custom base URL (for testing)
pub fn with_base_url(mut self, url: &str) -> Self {
self.base_url = url.to_string();
self
}
/// Build URL with email parameter
fn build_url(&self, endpoint: &str, params: &str) -> String {
let mut url = format!("{}/{}?{}", self.base_url, endpoint, params);
if let Some(ref email) = self.email {
if !params.is_empty() {
url.push('&');
}
url.push_str(&format!("mailto={}", email));
}
url
}
/// Health check - verify API is accessible
pub async fn health_check(&self) -> Result<bool, OpenAlexError> {
let url = format!("{}/works?per_page=1", self.base_url);
let response = self.client.get(&url).send().await?;
Ok(response.status().is_success())
}
/// Fetch a page of works with pagination
pub async fn fetch_works_page(
&self,
filter: &str,
cursor: Option<String>,
per_page: usize,
) -> Result<(Vec<Work>, Option<String>), OpenAlexError> {
let mut params = format!("per_page={}", per_page);
if !filter.is_empty() {
params.push_str(&format!("&{}", filter));
}
if let Some(c) = cursor {
params.push_str(&format!("&cursor={}", c));
} else {
// Use cursor-based pagination for bulk
params.push_str("&cursor=*");
}
let url = self.build_url("works", &params);
let response = self.client.get(&url).send().await?;
match response.status() {
StatusCode::OK => {
let api_response: ApiResponse<Work> = response.json().await?;
Ok((api_response.results, api_response.meta.next_cursor))
}
StatusCode::TOO_MANY_REQUESTS => {
let retry_after = response
.headers()
.get("retry-after")
.and_then(|v| v.to_str().ok())
.and_then(|s| s.parse().ok())
.unwrap_or(60);
Err(OpenAlexError::RateLimited(retry_after))
}
status => Err(OpenAlexError::Api(format!(
"Unexpected status: {}",
status
))),
}
}
/// Fetch a single work by ID
pub async fn get_work(&self, id: &str) -> Result<Work, OpenAlexError> {
// Normalize ID format
let normalized_id = if id.starts_with("https://") {
id.to_string()
} else if id.starts_with("W") {
format!("https://openalex.org/{}", id)
} else {
return Err(OpenAlexError::InvalidId(id.to_string()));
};
let url = self.build_url(&format!("works/{}", normalized_id), "");
let response = self.client.get(&url).send().await?;
match response.status() {
StatusCode::OK => Ok(response.json().await?),
StatusCode::NOT_FOUND => Err(OpenAlexError::InvalidId(id.to_string())),
status => Err(OpenAlexError::Api(format!(
"Unexpected status: {}",
status
))),
}
}
/// Search works by query
pub async fn search_works(
&self,
query: &str,
per_page: usize,
) -> Result<Vec<Work>, OpenAlexError> {
let params = format!("search={}&per_page={}", urlencoding::encode(query), per_page);
let url = self.build_url("works", &params);
let response = self.client.get(&url).send().await?;
match response.status() {
StatusCode::OK => {
let api_response: ApiResponse<Work> = response.json().await?;
Ok(api_response.results)
}
status => Err(OpenAlexError::Api(format!(
"Unexpected status: {}",
status
))),
}
}
/// Fetch works by topic
pub async fn works_by_topic(
&self,
topic_id: &str,
per_page: usize,
) -> Result<Vec<Work>, OpenAlexError> {
let filter = format!("filter=primary_topic.id:{}", topic_id);
let (works, _) = self.fetch_works_page(&filter, None, per_page).await?;
Ok(works)
}
/// Fetch works by author
pub async fn works_by_author(
&self,
author_id: &str,
per_page: usize,
) -> Result<Vec<Work>, OpenAlexError> {
let filter = format!("filter=authorships.author.id:{}", author_id);
let (works, _) = self.fetch_works_page(&filter, None, per_page).await?;
Ok(works)
}
/// Fetch works by institution
pub async fn works_by_institution(
&self,
institution_id: &str,
per_page: usize,
) -> Result<Vec<Work>, OpenAlexError> {
let filter = format!(
"filter=authorships.institutions.id:{}",
institution_id
);
let (works, _) = self.fetch_works_page(&filter, None, per_page).await?;
Ok(works)
}
/// Fetch works citing a specific work
pub async fn citing_works(
&self,
work_id: &str,
per_page: usize,
) -> Result<Vec<Work>, OpenAlexError> {
let filter = format!("filter=cites:{}", work_id);
let (works, _) = self.fetch_works_page(&filter, None, per_page).await?;
Ok(works)
}
/// Fetch works cited by a specific work
pub async fn cited_by_work(&self, work_id: &str) -> Result<Vec<Work>, OpenAlexError> {
let work = self.get_work(work_id).await?;
// Fetch referenced works
let mut cited_works = Vec::new();
for ref_id in work.referenced_works.iter().take(100) {
// Limit to avoid too many requests
if let Ok(cited) = self.get_work(ref_id).await {
cited_works.push(cited);
}
}
Ok(cited_works)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // A client built without an email points at the public API root.
    #[test]
    fn test_client_creation() {
        let c = OpenAlexClient::new(None);
        assert_eq!(c.base_url, "https://api.openalex.org");
    }

    // A configured email is appended to every built URL.
    #[test]
    fn test_client_with_email() {
        let c = OpenAlexClient::new(Some(String::from("test@example.com")));
        let built = c.build_url("works", "per_page=10");
        assert!(built.contains("mailto=test@example.com"));
    }

    // Endpoint and parameters survive URL construction unchanged.
    #[test]
    fn test_url_building() {
        let c = OpenAlexClient::new(None);
        let built = c.build_url("works", "filter=publication_year:2023");
        assert!(built.starts_with("https://api.openalex.org/works"));
        assert!(built.contains("filter=publication_year:2023"));
    }
}

View File

@@ -0,0 +1,518 @@
//! Research frontier detection using coherence signals
use std::collections::HashMap;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use crate::{TopicEdge, TopicGraph, TopicNode, Work};
/// An emerging research frontier detected by comparing topic-graph snapshots.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmergingFrontier {
    /// Frontier identifier (e.g. "frontier_0", assigned in detection order)
    pub id: String,
    /// Primary topic name
    pub name: String,
    /// Related topic names (neighbors in the topic graph)
    pub related_topics: Vec<String>,
    /// Growth rate (works per year)
    pub growth_rate: f64,
    /// Coherence delta (change in local coherence between snapshots)
    pub coherence_delta: f64,
    /// Citation momentum (change in average citation count)
    pub citation_momentum: f64,
    /// Detected boundary nodes (topics at the frontier edge)
    pub boundary_topics: Vec<String>,
    /// Timestamp of the snapshot in which the frontier was first detected
    pub detected_at: DateTime<Utc>,
    /// Confidence score (0-1)
    pub confidence: f64,
    /// Evidence supporting this frontier
    pub evidence: Vec<FrontierEvidence>,
}
/// One piece of evidence supporting a frontier detection.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FrontierEvidence {
    /// Evidence type ("growth_rate", "coherence_delta", or "citation_momentum")
    pub evidence_type: String,
    /// Measured value for this evidence type
    pub value: f64,
    /// Human-readable explanation of the value
    pub explanation: String,
}
/// A cross-domain bridge connecting two research areas via citation edges.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossDomainBridge {
    /// Bridge identifier (e.g. "bridge_0", assigned in detection order)
    pub id: String,
    /// Source domain/topic
    pub source_domain: String,
    /// Target domain/topic
    pub target_domain: String,
    /// Bridge topics (connector nodes on either side of the boundary)
    pub bridge_topics: Vec<String>,
    /// Citation flow (source → target), summed edge weight
    pub citation_flow: f64,
    /// Reverse flow (target → source)
    pub reverse_flow: f64,
    /// Bridge strength (flow normalized by citation count)
    pub strength: f64,
    /// Is this a new connection (flow grew ≥50% vs. previous snapshot)?
    pub is_emerging: bool,
    /// Timestamp of the snapshot in which the bridge was observed
    pub first_observed: DateTime<Utc>,
    /// Key papers establishing the bridge
    pub key_works: Vec<String>,
}
/// Research frontier radar for detecting emerging fields.
///
/// Accumulates time-ordered topic-graph snapshots, then compares consecutive
/// snapshots to flag fast-growing topics (`detect_frontiers`) and
/// cross-domain citation bridges (`detect_bridges`).
pub struct FrontierRadar {
    /// Topic graph snapshots, kept sorted by timestamp
    snapshots: Vec<(DateTime<Utc>, TopicGraph)>,
    /// Minimum relative work-count growth between snapshots to consider
    min_growth_rate: f64,
    /// Minimum absolute local-coherence change to consider
    min_coherence_shift: f64,
    /// Frontiers found by the last `detect_frontiers` call
    frontiers: Vec<EmergingFrontier>,
    /// Bridges found by the last `detect_bridges` call
    bridges: Vec<CrossDomainBridge>,
}
impl FrontierRadar {
    /// Create a new frontier radar.
    ///
    /// `min_growth_rate` is the relative work-count growth between two
    /// consecutive snapshots required to consider a topic;
    /// `min_coherence_shift` is the minimum absolute change in local
    /// coherence required to flag it.
    pub fn new(min_growth_rate: f64, min_coherence_shift: f64) -> Self {
        Self {
            snapshots: Vec::new(),
            min_growth_rate,
            min_coherence_shift,
            frontiers: Vec::new(),
            bridges: Vec::new(),
        }
    }

    /// Add a topic graph snapshot; snapshots are kept sorted by timestamp.
    pub fn add_snapshot(&mut self, timestamp: DateTime<Utc>, graph: TopicGraph) {
        self.snapshots.push((timestamp, graph));
        self.snapshots.sort_by_key(|(ts, _)| *ts);
    }

    /// Build snapshots from works partitioned into consecutive windows of
    /// `window_days` days. Works without a publication date are excluded
    /// from every window.
    pub fn build_from_works(&mut self, works: &[Work], window_days: i64) {
        if works.is_empty() {
            return;
        }
        // Find the time range covered by dated works.
        let mut min_date = Utc::now();
        let mut max_date = DateTime::<Utc>::MIN_UTC;
        for work in works {
            if let Some(date) = work.publication_date {
                if date < min_date {
                    min_date = date;
                }
                if date > max_date {
                    max_date = date;
                }
            }
        }
        // Partition works into time windows; each non-empty window becomes
        // one snapshot. (If no work had a date, max_date stays MIN_UTC and
        // the loop never runs.)
        let window_duration = chrono::Duration::days(window_days);
        let mut current_start = min_date;
        while current_start < max_date {
            let current_end = current_start + window_duration;
            let window_works: Vec<_> = works
                .iter()
                .filter(|w| {
                    w.publication_date
                        .map(|d| d >= current_start && d < current_end)
                        .unwrap_or(false)
                })
                .cloned()
                .collect();
            if !window_works.is_empty() {
                let graph = TopicGraph::from_works(&window_works);
                self.add_snapshot(current_start, graph);
            }
            current_start = current_end;
        }
    }

    /// Detect emerging frontiers by comparing consecutive snapshots.
    ///
    /// A topic qualifies when its work count grew by more than
    /// `min_growth_rate` (topics new in a snapshot count as infinite growth),
    /// its local coherence shifted by more than `min_coherence_shift`, and
    /// the combined confidence is at least 0.3. Results are sorted by
    /// confidence (descending) and cached on the radar.
    pub fn detect_frontiers(&mut self) -> Vec<EmergingFrontier> {
        if self.snapshots.len() < 2 {
            return vec![];
        }
        let mut frontiers = Vec::new();
        let mut frontier_counter = 0;
        // Compare consecutive snapshots.
        for i in 1..self.snapshots.len() {
            // Previous timestamp is not needed, only the previous graph.
            let (_, prev_graph) = &self.snapshots[i - 1];
            let (curr_ts, curr_graph) = &self.snapshots[i];
            // Find topics with significant growth.
            for (topic_id, curr_node) in &curr_graph.topics {
                let prev_node = prev_graph.topics.get(topic_id);
                let growth = if let Some(prev) = prev_node {
                    if prev.work_count > 0 {
                        (curr_node.work_count as f64 - prev.work_count as f64)
                            / prev.work_count as f64
                    } else {
                        f64::INFINITY
                    }
                } else {
                    // Topic absent from the previous snapshot: treat as new.
                    f64::INFINITY
                };
                if growth > self.min_growth_rate {
                    // Calculate coherence shift between snapshots.
                    let coherence_delta = self.compute_topic_coherence_delta(
                        topic_id,
                        prev_graph,
                        curr_graph,
                    );
                    if coherence_delta.abs() > self.min_coherence_shift {
                        // Change in average citations since the last snapshot.
                        let citation_momentum = curr_node.avg_citations
                            - prev_node.map(|n| n.avg_citations).unwrap_or(0.0);
                        // Find boundary topics.
                        let boundary_topics = self.find_boundary_topics(topic_id, curr_graph);
                        // Build supporting evidence.
                        let mut evidence = vec![
                            FrontierEvidence {
                                evidence_type: "growth_rate".to_string(),
                                value: growth,
                                explanation: format!(
                                    "{:.0}% increase in works",
                                    growth * 100.0
                                ),
                            },
                            FrontierEvidence {
                                evidence_type: "coherence_delta".to_string(),
                                value: coherence_delta,
                                explanation: format!(
                                    "Coherence {} by {:.2}",
                                    if coherence_delta > 0.0 {
                                        "increased"
                                    } else {
                                        "decreased"
                                    },
                                    coherence_delta.abs()
                                ),
                            },
                        ];
                        if citation_momentum > 0.0 {
                            evidence.push(FrontierEvidence {
                                evidence_type: "citation_momentum".to_string(),
                                value: citation_momentum,
                                explanation: format!(
                                    "+{:.1} avg citations",
                                    citation_momentum
                                ),
                            });
                        }
                        // Calculate confidence based on evidence strength.
                        let confidence =
                            self.calculate_confidence(growth, coherence_delta, citation_momentum);
                        if confidence >= 0.3 {
                            frontiers.push(EmergingFrontier {
                                id: format!("frontier_{}", frontier_counter),
                                name: curr_node.name.clone(),
                                related_topics: self.find_related_topics(topic_id, curr_graph),
                                growth_rate: curr_node.growth_rate,
                                coherence_delta,
                                citation_momentum,
                                boundary_topics,
                                detected_at: *curr_ts,
                                confidence,
                                evidence,
                            });
                            frontier_counter += 1;
                        }
                    }
                }
            }
        }
        // Sort by confidence, descending (NaN compares equal).
        frontiers.sort_by(|a, b| {
            b.confidence
                .partial_cmp(&a.confidence)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        self.frontiers = frontiers.clone();
        frontiers
    }

    /// Detect cross-domain bridges in the most recent snapshot.
    ///
    /// Domains are approximated by `get_domain` (first word of the topic
    /// name). Edge groups crossing a domain boundary with at least 5
    /// citations become bridges; a bridge is "emerging" when its flow grew
    /// by ≥50% versus the previous snapshot (or when there is no previous
    /// snapshot). Results are sorted by strength and cached on the radar.
    pub fn detect_bridges(&mut self) -> Vec<CrossDomainBridge> {
        if self.snapshots.is_empty() {
            return vec![];
        }
        let mut bridges = Vec::new();
        let mut bridge_counter = 0;
        let (curr_ts, curr_graph) = self.snapshots.last().unwrap();
        // Group cross-domain edges by (source domain, target domain).
        let mut domain_flows: HashMap<(String, String), Vec<&TopicEdge>> = HashMap::new();
        for edge in &curr_graph.edges {
            let src_domain = self.get_domain(&edge.source, curr_graph);
            let tgt_domain = self.get_domain(&edge.target, curr_graph);
            if src_domain != tgt_domain {
                domain_flows
                    .entry((src_domain.clone(), tgt_domain.clone()))
                    .or_default()
                    .push(edge);
            }
        }
        // Create bridge records.
        for ((src_domain, tgt_domain), edges) in domain_flows {
            let total_flow: f64 = edges.iter().map(|e| e.weight).sum();
            let citation_count: usize = edges.iter().map(|e| e.citation_count).sum();
            if citation_count >= 5 {
                // Minimum threshold; also guarantees the division below is safe.
                let bridge_topics: Vec<String> = edges
                    .iter()
                    .flat_map(|e| vec![e.source.clone(), e.target.clone()])
                    .collect::<std::collections::HashSet<_>>()
                    .into_iter()
                    .collect();
                // Check if this is emerging (compare with previous snapshot).
                let is_emerging = if self.snapshots.len() >= 2 {
                    let (_, prev_graph) = &self.snapshots[self.snapshots.len() - 2];
                    let prev_flow: f64 = prev_graph
                        .edges
                        .iter()
                        .filter(|e| {
                            self.get_domain(&e.source, prev_graph) == src_domain
                                && self.get_domain(&e.target, prev_graph) == tgt_domain
                        })
                        .map(|e| e.weight)
                        .sum();
                    total_flow > prev_flow * 1.5 // 50% growth
                } else {
                    true
                };
                bridges.push(CrossDomainBridge {
                    id: format!("bridge_{}", bridge_counter),
                    source_domain: src_domain.clone(),
                    target_domain: tgt_domain.clone(),
                    bridge_topics,
                    citation_flow: total_flow,
                    reverse_flow: 0.0, // Would need to compute reverse direction
                    strength: total_flow / citation_count as f64,
                    is_emerging,
                    first_observed: *curr_ts,
                    key_works: vec![], // Would need work-level data
                });
                bridge_counter += 1;
            }
        }
        // Sort by strength, descending (NaN compares equal).
        bridges.sort_by(|a, b| {
            b.strength
                .partial_cmp(&a.strength)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        self.bridges = bridges.clone();
        bridges
    }

    /// Compute coherence delta for a topic between two snapshots.
    fn compute_topic_coherence_delta(
        &self,
        topic_id: &str,
        prev_graph: &TopicGraph,
        curr_graph: &TopicGraph,
    ) -> f64 {
        let prev_coherence = self.compute_local_coherence(topic_id, prev_graph);
        let curr_coherence = self.compute_local_coherence(topic_id, curr_graph);
        curr_coherence - prev_coherence
    }

    /// Compute local coherence for a topic: the average weight of edges
    /// incident to it (0.0 when the topic has no edges).
    fn compute_local_coherence(&self, topic_id: &str, graph: &TopicGraph) -> f64 {
        // Find edges involving this topic.
        let edges: Vec<_> = graph
            .edges
            .iter()
            .filter(|e| e.source == topic_id || e.target == topic_id)
            .collect();
        if edges.is_empty() {
            return 0.0;
        }
        // Coherence = mean edge weight (sum divided by edge count).
        edges.iter().map(|e| e.weight).sum::<f64>() / edges.len() as f64
    }

    /// Find topics at the boundary: up to 5 targets of outgoing edges.
    /// NOTE(review): only outgoing edges are considered; confirm whether
    /// incoming edges should also contribute boundary topics.
    fn find_boundary_topics(&self, topic_id: &str, graph: &TopicGraph) -> Vec<String> {
        graph
            .edges
            .iter()
            .filter(|e| e.source == topic_id)
            .map(|e| e.target.clone())
            .take(5)
            .collect()
    }

    /// Find up to 10 topics adjacent to `topic_id` in either edge direction.
    fn find_related_topics(&self, topic_id: &str, graph: &TopicGraph) -> Vec<String> {
        graph
            .edges
            .iter()
            .filter(|e| e.source == topic_id || e.target == topic_id)
            .flat_map(|e| {
                if e.source == topic_id {
                    vec![e.target.clone()]
                } else {
                    vec![e.source.clone()]
                }
            })
            .take(10)
            .collect()
    }

    /// Get the domain for a topic: the first word of its display name
    /// (a deliberate simplification), or "Unknown" for missing topics.
    fn get_domain(&self, topic_id: &str, graph: &TopicGraph) -> String {
        graph
            .topics
            .get(topic_id)
            .map(|n| {
                n.name
                    .split_whitespace()
                    .next()
                    .unwrap_or("Unknown")
                    .to_string()
            })
            .unwrap_or_else(|| "Unknown".to_string())
    }

    /// Calculate a confidence score in [0, 1], weighting growth 40%,
    /// coherence shift 40%, and citation momentum 20%.
    fn calculate_confidence(
        &self,
        growth: f64,
        coherence_delta: f64,
        citation_momentum: f64,
    ) -> f64 {
        // Growth saturates at 5x; momentum saturates at +10 avg citations.
        let growth_score = (growth.min(5.0) / 5.0).max(0.0);
        let coherence_score = (coherence_delta.abs().min(1.0)).max(0.0);
        let citation_score = (citation_momentum / 10.0).min(1.0).max(0.0);
        (growth_score * 0.4 + coherence_score * 0.4 + citation_score * 0.2).min(1.0)
    }

    /// Frontiers found by the last `detect_frontiers` call.
    pub fn frontiers(&self) -> &[EmergingFrontier] {
        &self.frontiers
    }

    /// Bridges found by the last `detect_bridges` call.
    pub fn bridges(&self) -> &[CrossDomainBridge] {
        &self.bridges
    }

    /// The `n` highest-confidence frontiers (already sorted descending).
    pub fn top_frontiers(&self, n: usize) -> Vec<&EmergingFrontier> {
        self.frontiers.iter().take(n).collect()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // A fresh radar has no cached detections.
    #[test]
    fn test_frontier_radar_creation() {
        let radar = FrontierRadar::new(0.1, 0.2);
        assert!(radar.frontiers().is_empty());
        assert!(radar.bridges().is_empty());
    }

    // Strong signals score high; weak signals stay below the 0.3 cutoff.
    #[test]
    fn test_confidence_calculation() {
        let radar = FrontierRadar::new(0.1, 0.2);
        assert!(radar.calculate_confidence(2.0, 0.5, 5.0) > 0.5);
        assert!(radar.calculate_confidence(0.05, 0.01, 0.1) < 0.3);
    }
}

View File

@@ -0,0 +1,476 @@
//! # RuVector OpenAlex Integration
//!
//! Integration with OpenAlex, the open catalog of scholarly works, authors,
//! institutions, and topics. Enables novel discovery through:
//!
//! - **Emerging Field Detection**: Find topic splits/merges as cut boundaries shift
//! - **Cross-Domain Bridges**: Identify connector subgraphs between disciplines
//! - **Funding-to-Output Causality**: Map funder → lab → venue → citation chains
//!
//! ## OpenAlex Data Model
//!
//! OpenAlex provides a rich graph structure:
//! - **Works**: 250M+ scholarly publications
//! - **Authors**: 90M+ researchers with affiliations
//! - **Institutions**: 100K+ universities, labs, companies
//! - **Topics**: Hierarchical concept taxonomy
//! - **Funders**: Research funding organizations
//! - **Sources**: Journals, conferences, repositories
//!
//! ## Quick Start
//!
//! ```rust,ignore
//! use ruvector_data_openalex::{OpenAlexClient, FrontierRadar, TopicGraph};
//!
//! // Initialize client
//! let client = OpenAlexClient::new(Some("your-email@example.com"));
//!
//! // Build topic citation graph
//! let graph = TopicGraph::build_from_works(
//! client.works_by_topic("machine learning", 2020..2024).await?
//! )?;
//!
//! // Detect emerging research frontiers
//! let radar = FrontierRadar::new(graph);
//! let frontiers = radar.detect_emerging_fields(0.3).await?;
//!
//! for frontier in frontiers {
//! println!("Emerging: {} (coherence shift: {:.2})",
//! frontier.name, frontier.coherence_delta);
//! }
//! ```
#![warn(missing_docs)]
#![warn(clippy::all)]
pub mod client;
pub mod frontier;
pub mod schema;
use std::collections::HashMap;
use async_trait::async_trait;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use thiserror::Error;
pub use client::OpenAlexClient;
pub use frontier::{CrossDomainBridge, EmergingFrontier, FrontierRadar};
pub use schema::{
Author, AuthorPosition, Authorship, Concept, Funder, Institution, Source, Topic, Work,
};
use ruvector_data_framework::{DataRecord, DataSource, FrameworkError, Relationship, Result};
/// OpenAlex-specific error types (Display impls come from `thiserror`).
#[derive(Error, Debug)]
pub enum OpenAlexError {
    /// API returned an unexpected status or payload
    #[error("API error: {0}")]
    Api(String),
    /// Rate limit exceeded (HTTP 429); payload is the retry delay in seconds
    #[error("Rate limit exceeded, retry after {0}s")]
    RateLimited(u64),
    /// Invalid or unknown OpenAlex entity ID (also used for HTTP 404)
    #[error("Invalid OpenAlex ID: {0}")]
    InvalidId(String),
    /// Response parsing failed
    #[error("Parse error: {0}")]
    Parse(String),
    /// Transport-level failure, converted from `reqwest::Error`
    #[error("Network error: {0}")]
    Network(#[from] reqwest::Error),
}
impl From<OpenAlexError> for FrameworkError {
    /// Surface any OpenAlex failure to the framework as an ingestion error,
    /// preserving only its display text.
    fn from(err: OpenAlexError) -> Self {
        FrameworkError::Ingestion(err.to_string())
    }
}
/// Configuration for the OpenAlex data source.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OpenAlexConfig {
    /// API base URL (default "https://api.openalex.org")
    pub base_url: String,
    /// Email for the "polite pool" (faster rate limits); None = anonymous
    pub email: Option<String>,
    /// Maximum results per page (upper bound on batch sizes)
    pub per_page: usize,
    /// Enable cursor-based pagination for bulk fetches
    pub use_cursor: bool,
    /// Restrict ingestion to specific entity types
    pub entity_types: Vec<EntityType>,
}
impl Default for OpenAlexConfig {
fn default() -> Self {
Self {
base_url: "https://api.openalex.org".to_string(),
email: None,
per_page: 200,
use_cursor: true,
entity_types: vec![EntityType::Work],
}
}
}
/// OpenAlex entity types, mapping onto the API's top-level endpoints.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum EntityType {
    /// Scholarly works (papers, books, datasets)
    Work,
    /// Authors
    Author,
    /// Institutions (universities, labs, companies)
    Institution,
    /// Topics/concepts
    Topic,
    /// Funding sources
    Funder,
    /// Publication venues (journals, conferences, repositories)
    Source,
}
impl EntityType {
    /// Get the REST API path segment for this entity type.
    ///
    /// The returned strings are compile-time constants, so the signature
    /// advertises `&'static str` instead of tying the borrow to `&self`
    /// (backward compatible: `'static` outlives any caller borrow).
    pub fn endpoint(&self) -> &'static str {
        match self {
            EntityType::Work => "works",
            EntityType::Author => "authors",
            EntityType::Institution => "institutions",
            EntityType::Topic => "topics",
            EntityType::Funder => "funders",
            EntityType::Source => "sources",
        }
    }
}
/// OpenAlex data source for the ingestion framework.
///
/// Wraps an `OpenAlexClient` plus a set of key:value filters that are
/// joined into the API's `filter=` expression on each fetch.
pub struct OpenAlexSource {
    // HTTP client, built from `config.email` in `new`.
    client: OpenAlexClient,
    // Source configuration (page size, entity types, etc.).
    config: OpenAlexConfig,
    // Filter key → value pairs, serialized as "key:value" and comma-joined.
    filters: HashMap<String, String>,
}
impl OpenAlexSource {
    /// Create a new OpenAlex data source from a configuration.
    pub fn new(config: OpenAlexConfig) -> Self {
        Self {
            client: OpenAlexClient::new(config.email.clone()),
            config,
            filters: HashMap::new(),
        }
    }

    /// Add a filter pair (e.g. "publication_year" => "2023"), builder-style.
    pub fn with_filter(mut self, key: &str, value: &str) -> Self {
        self.filters.insert(key.to_owned(), value.to_owned());
        self
    }

    /// Restrict results to an inclusive publication-year range.
    pub fn with_year_range(self, start: i32, end: i32) -> Self {
        let range = format!("{}-{}", start, end);
        self.with_filter("publication_year", &range)
    }

    /// Restrict results to a specific primary topic.
    pub fn with_topic(self, topic_id: &str) -> Self {
        self.with_filter("primary_topic.id", topic_id)
    }

    /// Restrict results to open-access works only.
    pub fn open_access_only(self) -> Self {
        self.with_filter("open_access.is_oa", "true")
    }
}
#[async_trait]
impl DataSource for OpenAlexSource {
    /// Stable identifier for this source within the framework.
    fn source_id(&self) -> &str {
        "openalex"
    }

    /// Fetch one batch of works and convert them into framework records.
    ///
    /// Returns the records plus the next pagination cursor (`None` when the
    /// result set is exhausted). Batch size is capped by `config.per_page`.
    async fn fetch_batch(
        &self,
        cursor: Option<String>,
        batch_size: usize,
    ) -> Result<(Vec<DataRecord>, Option<String>)> {
        // Build the filter expression from the configured key:value pairs.
        // (No `mut` needed: the vector is built in one shot by `collect`.)
        let query_parts: Vec<String> = self
            .filters
            .iter()
            .map(|(k, v)| format!("{}:{}", k, v))
            .collect();
        let filter_str = if query_parts.is_empty() {
            String::new()
        } else {
            format!("filter={}", query_parts.join(","))
        };
        // Fetch works from the API.
        let (works, next_cursor) = self
            .client
            .fetch_works_page(&filter_str, cursor, batch_size.min(self.config.per_page))
            .await
            .map_err(|e| FrameworkError::Ingestion(e.to_string()))?;
        // Convert to DataRecords.
        let records: Vec<DataRecord> = works.into_iter().map(work_to_record).collect();
        Ok((records, next_cursor))
    }

    /// Not pre-computed: reading `meta.count` would require a separate API
    /// call, so this always reports `None`.
    async fn total_count(&self) -> Result<Option<u64>> {
        Ok(None)
    }

    /// Verify the OpenAlex API is reachable.
    async fn health_check(&self) -> Result<bool> {
        self.client.health_check().await.map_err(|e| e.into())
    }
}
/// Convert an OpenAlex Work into a framework `DataRecord`.
///
/// Citations, authorships (with per-author weight 1/n), institutional
/// affiliations, and the primary topic all become graph relationships; the
/// full work is serialized into the record's `data` payload.
fn work_to_record(work: Work) -> DataRecord {
    let mut rels = Vec::new();

    // Each referenced work becomes a "cites" edge of unit weight.
    for cited_id in &work.referenced_works {
        rels.push(Relationship {
            target_id: cited_id.clone(),
            rel_type: "cites".to_string(),
            weight: 1.0,
            properties: HashMap::new(),
        });
    }

    // Authors share credit equally: weight 1/len (the loop body only runs
    // when at least one authorship exists, so no division by zero).
    let author_count = work.authorships.len();
    for authorship in &work.authorships {
        rels.push(Relationship {
            target_id: authorship.author.id.clone(),
            rel_type: "authored_by".to_string(),
            weight: 1.0 / author_count as f64,
            properties: HashMap::new(),
        });
        // Affiliations carry a fixed half weight.
        for inst in &authorship.institutions {
            rels.push(Relationship {
                target_id: inst.id.clone(),
                rel_type: "affiliated_with".to_string(),
                weight: 0.5,
                properties: HashMap::new(),
            });
        }
    }

    // Primary topic edge weighted by the topic's relevance score.
    if let Some(ref topic) = work.primary_topic {
        rels.push(Relationship {
            target_id: topic.id.clone(),
            rel_type: "primary_topic".to_string(),
            weight: topic.score,
            properties: HashMap::new(),
        });
    }

    DataRecord {
        id: work.id.clone(),
        source: "openalex".to_string(),
        record_type: "work".to_string(),
        // Undated works are stamped with the ingestion time.
        timestamp: work.publication_date.unwrap_or_else(Utc::now),
        data: serde_json::to_value(&work).unwrap_or_default(),
        embedding: None, // Would compute from title/abstract
        relationships: rels,
    }
}
/// Topic-based citation graph for frontier detection.
pub struct TopicGraph {
    /// Topics as nodes, keyed by OpenAlex topic ID
    pub topics: HashMap<String, TopicNode>,
    /// Topic-to-topic edges (via citations between works)
    pub edges: Vec<TopicEdge>,
    /// Time window covered by the underlying works (min, max publication date)
    pub time_window: (DateTime<Utc>, DateTime<Utc>),
}
/// A topic node in the graph.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopicNode {
    /// OpenAlex topic ID
    pub id: String,
    /// Topic display name
    pub name: String,
    /// Number of works whose primary topic is this node
    pub work_count: usize,
    /// Average `cited_by_count` over those works
    pub avg_citations: f64,
    /// Growth rate (works per year over the graph's time window)
    pub growth_rate: f64,
}
/// A directed edge between topics (citing topic → cited topic).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopicEdge {
    /// Source topic ID (citing side)
    pub source: String,
    /// Target topic ID (cited side)
    pub target: String,
    /// Number of citations crossing this topic boundary
    pub citation_count: usize,
    /// Weight normalized by the product of the endpoints' work counts
    pub weight: f64,
}
impl TopicGraph {
    /// Build a topic graph from a slice of works.
    ///
    /// Nodes are the works' primary topics. Edges are citation links between
    /// the primary topics of citing and cited works *within the given slice*;
    /// references to works outside the slice are ignored, since their topics
    /// are unknown. (Previously `edge_counts` was allocated but never
    /// populated, so the graph always had zero edges.)
    pub fn from_works(works: &[Work]) -> Self {
        let mut topics: HashMap<String, TopicNode> = HashMap::new();
        let mut edge_counts: HashMap<(String, String), usize> = HashMap::new();
        let mut min_date = Utc::now();
        let mut max_date = DateTime::<Utc>::MIN_UTC;

        // First pass: map each work's ID to its primary topic so that
        // citations can be resolved to topic-to-topic edges.
        let mut topic_of: HashMap<&str, &str> = HashMap::new();
        for work in works {
            if let Some(topic) = &work.primary_topic {
                topic_of.insert(work.id.as_str(), topic.id.as_str());
            }
        }

        for work in works {
            if let Some(date) = work.publication_date {
                if date < min_date {
                    min_date = date;
                }
                if date > max_date {
                    max_date = date;
                }
            }
            // Works without a primary topic contribute nothing to the graph.
            let source_topic = match &work.primary_topic {
                Some(t) => t.id.clone(),
                None => continue,
            };
            // Update or create the topic node.
            let node = topics
                .entry(source_topic.clone())
                .or_insert_with(|| TopicNode {
                    id: source_topic.clone(),
                    name: work
                        .primary_topic
                        .as_ref()
                        .map(|t| t.display_name.clone())
                        .unwrap_or_default(),
                    work_count: 0,
                    avg_citations: 0.0,
                    growth_rate: 0.0,
                });
            node.work_count += 1;
            // Incremental running mean of cited_by_count.
            node.avg_citations = (node.avg_citations * (node.work_count - 1) as f64
                + work.cited_by_count as f64)
                / node.work_count as f64;
            // Count cross-topic citation edges for references that resolve
            // to another work in this slice.
            for ref_id in &work.referenced_works {
                if let Some(&target_topic) = topic_of.get(ref_id.as_str()) {
                    if target_topic != source_topic.as_str() {
                        *edge_counts
                            .entry((source_topic.clone(), target_topic.to_string()))
                            .or_insert(0) += 1;
                    }
                }
            }
        }

        // Growth rate = works per year over the observed span (0 when the
        // span is degenerate, e.g. a single dated work or no dates at all).
        let time_span_years = (max_date - min_date).num_days() as f64 / 365.0;
        for node in topics.values_mut() {
            node.growth_rate = if time_span_years > 0.0 {
                node.work_count as f64 / time_span_years
            } else {
                0.0
            };
        }

        // Build edges, normalizing each count by the product of the endpoint
        // topics' work counts.
        let edges: Vec<TopicEdge> = edge_counts
            .into_iter()
            .map(|((src, tgt), count)| {
                let src_count = topics.get(&src).map(|n| n.work_count).unwrap_or(1);
                let tgt_count = topics.get(&tgt).map(|n| n.work_count).unwrap_or(1);
                let weight = count as f64 / (src_count * tgt_count) as f64;
                TopicEdge {
                    source: src,
                    target: tgt,
                    citation_count: count,
                    weight,
                }
            })
            .collect();

        Self {
            topics,
            edges,
            time_window: (min_date, max_date),
        }
    }

    /// Number of topic nodes in the graph.
    pub fn topic_count(&self) -> usize {
        self.topics.len()
    }

    /// Number of topic-to-topic edges in the graph.
    pub fn edge_count(&self) -> usize {
        self.edges.len()
    }

    /// The `top_k` topics with the highest growth rate, descending.
    pub fn fastest_growing(&self, top_k: usize) -> Vec<&TopicNode> {
        let mut nodes: Vec<_> = self.topics.values().collect();
        nodes.sort_by(|a, b| {
            b.growth_rate
                .partial_cmp(&a.growth_rate)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        nodes.into_iter().take(top_k).collect()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Each entity type maps to its REST path segment.
    #[test]
    fn test_entity_endpoints() {
        assert_eq!("works", EntityType::Work.endpoint());
        assert_eq!("authors", EntityType::Author.endpoint());
        assert_eq!("topics", EntityType::Topic.endpoint());
    }

    // Defaults point at the public API with cursor pagination enabled.
    #[test]
    fn test_default_config() {
        let cfg = OpenAlexConfig::default();
        assert_eq!(cfg.base_url, "https://api.openalex.org");
        assert!(cfg.use_cursor);
    }

    // Builder methods register their filter keys.
    #[test]
    fn test_source_with_filters() {
        let src = OpenAlexSource::new(OpenAlexConfig::default())
            .with_year_range(2020, 2024)
            .open_access_only();
        assert!(src.filters.contains_key("publication_year"));
        assert!(src.filters.contains_key("open_access.is_oa"));
    }
}

View File

@@ -0,0 +1,627 @@
//! OpenAlex entity schemas
//!
//! Represents the core entity types from OpenAlex:
//! - Works (publications)
//! - Authors
//! - Institutions
//! - Topics/Concepts
//! - Funders
//! - Sources (journals, conferences)
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
/// A scholarly work (paper, book, dataset, etc.)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Work {
    /// OpenAlex ID (e.g., "W2741809807")
    pub id: String,
    /// DOI (if available)
    pub doi: Option<String>,
    /// Work title
    pub title: String,
    /// Publication date
    ///
    /// NOTE(review): the OpenAlex API serves `publication_date` as a plain
    /// "YYYY-MM-DD" string; confirm it deserializes into `DateTime<Utc>`
    /// (which expects an RFC 3339 timestamp) against real payloads.
    pub publication_date: Option<DateTime<Utc>>,
    /// Publication year
    pub publication_year: Option<i32>,
    /// Work type (article, book, dataset, etc.); serialized as "type"
    #[serde(rename = "type")]
    pub work_type: Option<String>,
    /// Open access status
    pub open_access: Option<OpenAccessStatus>,
    /// Citation count
    pub cited_by_count: u64,
    /// Authors and their affiliations (empty when absent from the payload)
    #[serde(default)]
    pub authorships: Vec<Authorship>,
    /// Primary topic
    pub primary_topic: Option<TopicReference>,
    /// All associated topics
    #[serde(default)]
    pub topics: Vec<TopicReference>,
    /// Legacy concepts (deprecated but still in API)
    #[serde(default)]
    pub concepts: Vec<ConceptReference>,
    /// OpenAlex IDs of works this work cites
    #[serde(default)]
    pub referenced_works: Vec<String>,
    /// Related works
    #[serde(default)]
    pub related_works: Vec<String>,
    /// Abstract (kept as raw JSON: the API ships an inverted word index)
    pub abstract_inverted_index: Option<serde_json::Value>,
    /// Publication venue
    pub primary_location: Option<Location>,
    /// Grants/funding
    #[serde(default)]
    pub grants: Vec<Grant>,
    /// Bibliographic info (volume, issue, pages)
    pub biblio: Option<Biblio>,
    /// Last update time
    pub updated_date: Option<DateTime<Utc>>,
}
/// Open access status of a work.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OpenAccessStatus {
    /// Is this work open access?
    pub is_oa: bool,
    /// OA status type (gold, green, hybrid, bronze)
    pub oa_status: Option<String>,
    /// URL of an OA copy, if one exists
    pub oa_url: Option<String>,
}
/// Author and affiliation information for a single author of a work.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Authorship {
    /// Author position (first, middle, last)
    pub author_position: AuthorPosition,
    /// Author details
    pub author: AuthorReference,
    /// Institutions at time of publication (empty when absent)
    #[serde(default)]
    pub institutions: Vec<InstitutionReference>,
    /// Country codes of the affiliations
    #[serde(default)]
    pub countries: Vec<String>,
    /// Is this the corresponding author? (defaults to false when absent)
    #[serde(default)]
    pub is_corresponding: bool,
    /// Raw affiliation string as printed on the work
    pub raw_affiliation_string: Option<String>,
}
/// Author position in the author list; serialized lowercase
/// ("first"/"middle"/"last") to match the API.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum AuthorPosition {
    /// First author
    First,
    /// Middle author
    Middle,
    /// Last author
    Last,
}
/// Compact reference to an author, as embedded in an authorship.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuthorReference {
    /// OpenAlex author ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// ORCID (if available)
    pub orcid: Option<String>,
}
/// Reference to an institution, as embedded in authorships and author records.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InstitutionReference {
    /// OpenAlex institution ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// Institution type (education, company, etc.); the JSON key is `type`,
    /// which is a Rust keyword, hence the rename.
    #[serde(rename = "type")]
    pub institution_type: Option<String>,
    /// Country code
    pub country_code: Option<String>,
    /// ROR ID (Research Organization Registry)
    pub ror: Option<String>,
}
/// Reference to a topic, with its position in the subfield/field/domain hierarchy.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopicReference {
    /// OpenAlex topic ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// Relevance score (0-1); defaults to `0.0` when absent
    #[serde(default)]
    pub score: f64,
    /// Subfield (narrowest level above the topic)
    pub subfield: Option<FieldReference>,
    /// Field
    pub field: Option<FieldReference>,
    /// Domain (broadest level)
    pub domain: Option<FieldReference>,
}
/// Reference to a concept (legacy taxonomy — deprecated by OpenAlex in favor
/// of topics, but still present in API responses).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConceptReference {
    /// OpenAlex concept ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// Wikidata ID
    pub wikidata: Option<String>,
    /// Relevance score; defaults to `0.0` when absent
    #[serde(default)]
    pub score: f64,
    /// Hierarchy level (0 = root); defaults to `0` when absent
    #[serde(default)]
    pub level: u32,
}
/// Reference to a subfield, field, or domain in the topic hierarchy.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldReference {
    /// OpenAlex ID
    pub id: String,
    /// Display name
    pub display_name: String,
}
/// Publication location (where a version of the work is hosted).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Location {
    /// Is primary location; defaults to `false` when absent.
    /// NOTE(review): the OpenAlex location object does not appear to emit an
    /// `is_primary` key, so this may always be `false` — confirm against the API.
    #[serde(default)]
    pub is_primary: bool,
    /// Landing page URL
    pub landing_page_url: Option<String>,
    /// PDF URL
    pub pdf_url: Option<String>,
    /// Source (journal/conference) hosting this location
    pub source: Option<SourceReference>,
    /// License
    pub license: Option<String>,
    /// Version (e.g. submitted/accepted/published — confirm exact values against the API)
    pub version: Option<String>,
}
/// Reference to a source (journal, conference, etc.), as embedded in a [`Location`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SourceReference {
    /// OpenAlex source ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// ISSN-L (linking ISSN), if available
    pub issn_l: Option<String>,
    /// Source type; JSON key is `type` (a Rust keyword), hence the rename
    #[serde(rename = "type")]
    pub source_type: Option<String>,
    /// Is Open Access journal; defaults to `false` when absent
    #[serde(default)]
    pub is_oa: bool,
    /// Host organization (publisher)
    pub host_organization: Option<String>,
}
/// Grant/funding information attached to a work.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Grant {
    /// Funder reference (ID + name), when resolved
    pub funder: Option<FunderReference>,
    /// Funder display name (may be present even when `funder` is not)
    pub funder_display_name: Option<String>,
    /// Award ID
    pub award_id: Option<String>,
}
/// Reference to a funder, as embedded in a [`Grant`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FunderReference {
    /// OpenAlex funder ID
    pub id: String,
    /// Display name
    pub display_name: String,
}
/// Bibliographic details (volume/issue/pages). All fields are strings in the
/// API — page "numbers" can be non-numeric (e.g. article IDs).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Biblio {
    /// Volume
    pub volume: Option<String>,
    /// Issue
    pub issue: Option<String>,
    /// First page
    pub first_page: Option<String>,
    /// Last page
    pub last_page: Option<String>,
}
/// Full author entity, as returned by the `/authors` endpoint.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Author {
    /// OpenAlex author ID
    pub id: String,
    /// ORCID
    pub orcid: Option<String>,
    /// Display name
    pub display_name: String,
    /// Alternative names (empty when absent)
    #[serde(default)]
    pub display_name_alternatives: Vec<String>,
    /// Works count
    pub works_count: u64,
    /// Citation count
    pub cited_by_count: u64,
    /// Summary statistics (h-index, i10-index, mean citedness)
    pub summary_stats: Option<AuthorStats>,
    /// Most recent institution
    pub last_known_institution: Option<InstitutionReference>,
    /// All affiliations (empty when absent)
    #[serde(default)]
    pub affiliations: Vec<Affiliation>,
    /// Topic areas (empty when absent)
    #[serde(default)]
    pub topics: Vec<TopicReference>,
    /// Works API URL (to list this author's works)
    pub works_api_url: Option<String>,
    /// Updated date
    pub updated_date: Option<DateTime<Utc>>,
}
/// Author summary statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuthorStats {
    /// H-index
    pub h_index: Option<u32>,
    /// i10-index
    pub i10_index: Option<u32>,
    /// Two-year mean citedness; renamed because the JSON key
    /// `2yr_mean_citedness` starts with a digit and is not a valid
    /// Rust identifier.
    #[serde(rename = "2yr_mean_citedness")]
    pub two_year_mean_citedness: Option<f64>,
}
/// Author affiliation: an institution plus the years of association.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Affiliation {
    /// Institution
    pub institution: InstitutionReference,
    /// Years affiliated (empty when absent)
    #[serde(default)]
    pub years: Vec<i32>,
}
/// Full institution entity, as returned by the `/institutions` endpoint.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Institution {
    /// OpenAlex institution ID
    pub id: String,
    /// ROR ID (Research Organization Registry)
    pub ror: Option<String>,
    /// Display name
    pub display_name: String,
    /// Country code
    pub country_code: Option<String>,
    /// Institution type; JSON key is `type` (a Rust keyword), hence the rename
    #[serde(rename = "type")]
    pub institution_type: Option<String>,
    /// Homepage URL
    pub homepage_url: Option<String>,
    /// Works count
    pub works_count: u64,
    /// Citation count
    pub cited_by_count: u64,
    /// Geographic info
    pub geo: Option<GeoLocation>,
    /// Lineage: IDs of this institution and its ancestors (empty when absent)
    #[serde(default)]
    pub lineage: Vec<String>,
    /// Associated institutions (empty when absent)
    #[serde(default)]
    pub associated_institutions: Vec<InstitutionReference>,
    /// Updated date
    pub updated_date: Option<DateTime<Utc>>,
}
/// Geographic location of an institution.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeoLocation {
    /// City
    pub city: Option<String>,
    /// Region/state
    pub region: Option<String>,
    /// Country
    pub country: Option<String>,
    /// Country code
    pub country_code: Option<String>,
    /// Latitude (decimal degrees)
    pub latitude: Option<f64>,
    /// Longitude (decimal degrees)
    pub longitude: Option<f64>,
}
/// Full topic entity, as returned by the `/topics` endpoint.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Topic {
    /// OpenAlex topic ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// Description
    pub description: Option<String>,
    /// Keywords (empty when absent)
    #[serde(default)]
    pub keywords: Vec<String>,
    /// Works count
    pub works_count: u64,
    /// Citation count
    pub cited_by_count: u64,
    /// Subfield (one level above the topic)
    pub subfield: Option<FieldReference>,
    /// Field
    pub field: Option<FieldReference>,
    /// Domain (broadest level)
    pub domain: Option<FieldReference>,
    /// Sibling topics (empty when absent)
    #[serde(default)]
    pub siblings: Vec<TopicReference>,
    /// Updated date
    pub updated_date: Option<DateTime<Utc>>,
}
/// Legacy concept entity (deprecated by OpenAlex in favor of topics, but
/// still returned by the API).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Concept {
    /// OpenAlex concept ID
    pub id: String,
    /// Wikidata ID
    pub wikidata: Option<String>,
    /// Display name
    pub display_name: String,
    /// Description
    pub description: Option<String>,
    /// Hierarchy level (0 = root)
    pub level: u32,
    /// Works count
    pub works_count: u64,
    /// Citation count
    pub cited_by_count: u64,
    /// Ancestor (parent) concepts (empty when absent)
    #[serde(default)]
    pub ancestors: Vec<ConceptReference>,
    /// Related concepts (empty when absent).
    /// NOTE(review): previously documented as "child concepts", but the API
    /// field is `related_concepts` — confirm whether these are strictly children.
    #[serde(default)]
    pub related_concepts: Vec<ConceptReference>,
    /// Updated date
    pub updated_date: Option<DateTime<Utc>>,
}
/// Full source entity (journal, conference, repository, …), as returned by
/// the `/sources` endpoint.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Source {
    /// OpenAlex source ID
    pub id: String,
    /// ISSN-L (linking ISSN)
    pub issn_l: Option<String>,
    /// All ISSNs (empty when absent)
    #[serde(default)]
    pub issn: Vec<String>,
    /// Display name
    pub display_name: String,
    /// Publisher / host organization
    pub host_organization: Option<String>,
    /// Source type (journal, conference, etc.); JSON key is `type`
    /// (a Rust keyword), hence the rename
    #[serde(rename = "type")]
    pub source_type: Option<String>,
    /// Is Open Access; defaults to `false` when absent
    #[serde(default)]
    pub is_oa: bool,
    /// Homepage URL
    pub homepage_url: Option<String>,
    /// Works count
    pub works_count: u64,
    /// Citation count
    pub cited_by_count: u64,
    /// Topics published by this source (empty when absent)
    #[serde(default)]
    pub topics: Vec<TopicReference>,
    /// Updated date
    pub updated_date: Option<DateTime<Utc>>,
}
/// Full funder entity, as returned by the `/funders` endpoint.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Funder {
    /// OpenAlex funder ID
    pub id: String,
    /// Display name
    pub display_name: String,
    /// Alternative names (empty when absent)
    #[serde(default)]
    pub alternate_titles: Vec<String>,
    /// Country code
    pub country_code: Option<String>,
    /// Description
    pub description: Option<String>,
    /// Homepage URL
    pub homepage_url: Option<String>,
    /// Grants count
    pub grants_count: u64,
    /// Works count
    pub works_count: u64,
    /// Citation count
    pub cited_by_count: u64,
    /// ROR ID (Research Organization Registry)
    pub ror: Option<String>,
    /// Updated date
    pub updated_date: Option<DateTime<Utc>>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A minimal work payload — only the required fields, with every
    /// `#[serde(default)]` collection supplied empty — must deserialize.
    #[test]
    fn test_work_deserialization() {
        let payload = r#"{
        "id": "W123",
        "title": "Test Paper",
        "cited_by_count": 10,
        "authorships": [],
        "topics": [],
        "concepts": [],
        "referenced_works": [],
        "related_works": [],
        "grants": []
    }"#;
        let parsed: Work = serde_json::from_str(payload).expect("minimal work should parse");
        assert_eq!(parsed.id, "W123");
        assert_eq!(parsed.title, "Test Paper");
        assert_eq!(parsed.cited_by_count, 10);
    }

    /// The lowercase API strings map onto the `AuthorPosition` variants.
    #[test]
    fn test_author_position() {
        let cases = [
            (r#""first""#, AuthorPosition::First),
            (r#""last""#, AuthorPosition::Last),
        ];
        for (raw, expected) in cases {
            let parsed: AuthorPosition = serde_json::from_str(raw).unwrap();
            assert_eq!(parsed, expected);
        }
    }
}