Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
406
vendor/ruvector/crates/sona/src/export/dataset.rs
vendored
Normal file
406
vendor/ruvector/crates/sona/src/export/dataset.rs
vendored
Normal file
@@ -0,0 +1,406 @@
|
||||
//! Dataset Export - HuggingFace-compatible dataset formats
|
||||
//!
|
||||
//! Exports SONA's learned patterns and preference pairs as JSONL datasets
|
||||
//! compatible with HuggingFace's datasets library.
|
||||
|
||||
use super::{ExportConfig, ExportError, ExportResult, ExportType};
|
||||
use crate::engine::SonaEngine;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::Path;
|
||||
|
||||
#[cfg(feature = "serde-support")]
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Dataset exporter for patterns and preferences
|
||||
pub struct DatasetExporter<'a> {
|
||||
config: &'a ExportConfig,
|
||||
}
|
||||
|
||||
impl<'a> DatasetExporter<'a> {
|
||||
/// Create new dataset exporter
|
||||
pub fn new(config: &'a ExportConfig) -> Self {
|
||||
Self { config }
|
||||
}
|
||||
|
||||
/// Export learned patterns as JSONL dataset
|
||||
pub fn export_patterns<P: AsRef<Path>>(
|
||||
&self,
|
||||
engine: &SonaEngine,
|
||||
output_path: P,
|
||||
) -> Result<ExportResult, ExportError> {
|
||||
let output_path = output_path.as_ref();
|
||||
|
||||
// Ensure parent directory exists
|
||||
if let Some(parent) = output_path.parent() {
|
||||
std::fs::create_dir_all(parent).map_err(ExportError::Io)?;
|
||||
}
|
||||
|
||||
let file = std::fs::File::create(output_path).map_err(ExportError::Io)?;
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
let patterns = engine.get_all_patterns();
|
||||
let mut items_exported = 0;
|
||||
|
||||
for pattern in patterns {
|
||||
// Filter by quality threshold
|
||||
if pattern.avg_quality < self.config.min_quality_threshold {
|
||||
continue;
|
||||
}
|
||||
|
||||
let record = PatternRecord {
|
||||
id: pattern.id.to_string(),
|
||||
embedding: pattern.centroid.clone(),
|
||||
cluster_size: pattern.cluster_size,
|
||||
avg_quality: pattern.avg_quality,
|
||||
pattern_type: pattern.pattern_type.to_string(),
|
||||
access_count: pattern.access_count as u64,
|
||||
metadata: PatternMetadata {
|
||||
source: "sona".to_string(),
|
||||
version: env!("CARGO_PKG_VERSION").to_string(),
|
||||
target_model: self.config.target_architecture.clone(),
|
||||
},
|
||||
};
|
||||
|
||||
let json = serde_json::to_string(&record).map_err(ExportError::Serialization)?;
|
||||
writeln!(writer, "{}", json).map_err(ExportError::Io)?;
|
||||
items_exported += 1;
|
||||
}
|
||||
|
||||
writer.flush().map_err(ExportError::Io)?;
|
||||
|
||||
let size_bytes = std::fs::metadata(output_path).map(|m| m.len()).unwrap_or(0);
|
||||
|
||||
Ok(ExportResult {
|
||||
export_type: ExportType::PatternsDataset,
|
||||
items_exported,
|
||||
output_path: output_path.to_string_lossy().to_string(),
|
||||
size_bytes,
|
||||
})
|
||||
}
|
||||
|
||||
/// Export preference pairs for DPO/RLHF training
|
||||
pub fn export_preferences<P: AsRef<Path>>(
|
||||
&self,
|
||||
engine: &SonaEngine,
|
||||
output_path: P,
|
||||
) -> Result<ExportResult, ExportError> {
|
||||
let output_path = output_path.as_ref();
|
||||
|
||||
// Ensure parent directory exists
|
||||
if let Some(parent) = output_path.parent() {
|
||||
std::fs::create_dir_all(parent).map_err(ExportError::Io)?;
|
||||
}
|
||||
|
||||
let file = std::fs::File::create(output_path).map_err(ExportError::Io)?;
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
let trajectories = engine.get_quality_trajectories();
|
||||
let mut items_exported = 0;
|
||||
|
||||
// Generate preference pairs from trajectories
|
||||
// Sort by quality and pair high-quality with low-quality
|
||||
let mut sorted_trajectories = trajectories.clone();
|
||||
sorted_trajectories.sort_by(|a, b| {
|
||||
b.quality
|
||||
.partial_cmp(&a.quality)
|
||||
.unwrap_or(std::cmp::Ordering::Equal)
|
||||
});
|
||||
|
||||
let mid = sorted_trajectories.len() / 2;
|
||||
let (high_quality, low_quality) = sorted_trajectories.split_at(mid);
|
||||
|
||||
for (chosen, rejected) in high_quality.iter().zip(low_quality.iter().rev()) {
|
||||
// Skip if quality difference is too small
|
||||
if (chosen.quality - rejected.quality).abs() < 0.1 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let pair = PreferencePair {
|
||||
prompt: PreferencePrompt {
|
||||
embedding: chosen.query_embedding.clone(),
|
||||
context: chosen.context_ids.clone(),
|
||||
},
|
||||
chosen: PreferenceResponse {
|
||||
route: chosen.route.clone(),
|
||||
quality: chosen.quality,
|
||||
embedding: chosen.response_embedding.clone(),
|
||||
},
|
||||
rejected: PreferenceResponse {
|
||||
route: rejected.route.clone(),
|
||||
quality: rejected.quality,
|
||||
embedding: rejected.response_embedding.clone(),
|
||||
},
|
||||
metadata: PreferenceMetadata {
|
||||
quality_delta: chosen.quality - rejected.quality,
|
||||
source: "sona".to_string(),
|
||||
version: env!("CARGO_PKG_VERSION").to_string(),
|
||||
},
|
||||
};
|
||||
|
||||
let json = serde_json::to_string(&pair).map_err(ExportError::Serialization)?;
|
||||
writeln!(writer, "{}", json).map_err(ExportError::Io)?;
|
||||
items_exported += 1;
|
||||
}
|
||||
|
||||
writer.flush().map_err(ExportError::Io)?;
|
||||
|
||||
let size_bytes = std::fs::metadata(output_path).map(|m| m.len()).unwrap_or(0);
|
||||
|
||||
Ok(ExportResult {
|
||||
export_type: ExportType::PreferencePairs,
|
||||
items_exported,
|
||||
output_path: output_path.to_string_lossy().to_string(),
|
||||
size_bytes,
|
||||
})
|
||||
}
|
||||
|
||||
/// Export distillation targets for knowledge distillation
|
||||
pub fn export_distillation_targets<P: AsRef<Path>>(
|
||||
&self,
|
||||
engine: &SonaEngine,
|
||||
output_path: P,
|
||||
) -> Result<ExportResult, ExportError> {
|
||||
let output_path = output_path.as_ref();
|
||||
|
||||
// Ensure parent directory exists
|
||||
if let Some(parent) = output_path.parent() {
|
||||
std::fs::create_dir_all(parent).map_err(ExportError::Io)?;
|
||||
}
|
||||
|
||||
let file = std::fs::File::create(output_path).map_err(ExportError::Io)?;
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
let routing_decisions = engine.get_routing_decisions();
|
||||
let mut items_exported = 0;
|
||||
|
||||
for decision in routing_decisions {
|
||||
// Filter by quality
|
||||
if decision.quality < self.config.min_quality_threshold {
|
||||
continue;
|
||||
}
|
||||
|
||||
let target = DistillationTarget {
|
||||
input_embedding: decision.query_embedding.clone(),
|
||||
teacher_logits: decision.routing_logits.clone(),
|
||||
selected_route: decision.selected_route.clone(),
|
||||
confidence: decision.confidence,
|
||||
quality: decision.quality,
|
||||
metadata: DistillationMetadata {
|
||||
source: "sona".to_string(),
|
||||
version: env!("CARGO_PKG_VERSION").to_string(),
|
||||
temperature: 1.0,
|
||||
},
|
||||
};
|
||||
|
||||
let json = serde_json::to_string(&target).map_err(ExportError::Serialization)?;
|
||||
writeln!(writer, "{}", json).map_err(ExportError::Io)?;
|
||||
items_exported += 1;
|
||||
}
|
||||
|
||||
writer.flush().map_err(ExportError::Io)?;
|
||||
|
||||
let size_bytes = std::fs::metadata(output_path).map(|m| m.len()).unwrap_or(0);
|
||||
|
||||
Ok(ExportResult {
|
||||
export_type: ExportType::DistillationTargets,
|
||||
items_exported,
|
||||
output_path: output_path.to_string_lossy().to_string(),
|
||||
size_bytes,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Pattern record for JSONL export
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PatternRecord {
|
||||
/// Pattern ID
|
||||
pub id: String,
|
||||
/// Embedding vector
|
||||
pub embedding: Vec<f32>,
|
||||
/// Number of trajectories in cluster
|
||||
pub cluster_size: usize,
|
||||
/// Average quality score
|
||||
pub avg_quality: f32,
|
||||
/// Pattern type (routing, reasoning, etc.)
|
||||
pub pattern_type: String,
|
||||
/// Access count
|
||||
pub access_count: u64,
|
||||
/// Export metadata
|
||||
pub metadata: PatternMetadata,
|
||||
}
|
||||
|
||||
/// Pattern export metadata
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PatternMetadata {
|
||||
/// Source system
|
||||
pub source: String,
|
||||
/// Version
|
||||
pub version: String,
|
||||
/// Target model architecture
|
||||
pub target_model: String,
|
||||
}
|
||||
|
||||
/// Preference pair for DPO/RLHF
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PreferencePair {
|
||||
/// Input prompt
|
||||
pub prompt: PreferencePrompt,
|
||||
/// Chosen (preferred) response
|
||||
pub chosen: PreferenceResponse,
|
||||
/// Rejected response
|
||||
pub rejected: PreferenceResponse,
|
||||
/// Metadata
|
||||
pub metadata: PreferenceMetadata,
|
||||
}
|
||||
|
||||
/// Preference prompt
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PreferencePrompt {
|
||||
/// Query embedding
|
||||
pub embedding: Vec<f32>,
|
||||
/// Context IDs
|
||||
pub context: Vec<String>,
|
||||
}
|
||||
|
||||
/// Preference response
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PreferenceResponse {
|
||||
/// Model route
|
||||
pub route: String,
|
||||
/// Quality score
|
||||
pub quality: f32,
|
||||
/// Response embedding
|
||||
pub embedding: Vec<f32>,
|
||||
}
|
||||
|
||||
/// Preference pair metadata
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PreferenceMetadata {
|
||||
/// Quality difference between chosen and rejected
|
||||
pub quality_delta: f32,
|
||||
/// Source system
|
||||
pub source: String,
|
||||
/// Version
|
||||
pub version: String,
|
||||
}
|
||||
|
||||
/// Distillation target for knowledge distillation
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DistillationTarget {
|
||||
/// Input embedding
|
||||
pub input_embedding: Vec<f32>,
|
||||
/// Teacher model logits
|
||||
pub teacher_logits: Vec<f32>,
|
||||
/// Selected route
|
||||
pub selected_route: String,
|
||||
/// Confidence score
|
||||
pub confidence: f32,
|
||||
/// Quality score
|
||||
pub quality: f32,
|
||||
/// Metadata
|
||||
pub metadata: DistillationMetadata,
|
||||
}
|
||||
|
||||
/// Distillation metadata
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DistillationMetadata {
|
||||
/// Source system
|
||||
pub source: String,
|
||||
/// Version
|
||||
pub version: String,
|
||||
/// Temperature for softmax
|
||||
pub temperature: f32,
|
||||
}
|
||||
|
||||
/// Quality trajectory for preference learning
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct QualityTrajectory {
|
||||
/// Query embedding
|
||||
pub query_embedding: Vec<f32>,
|
||||
/// Response embedding
|
||||
pub response_embedding: Vec<f32>,
|
||||
/// Model route
|
||||
pub route: String,
|
||||
/// Quality score
|
||||
pub quality: f32,
|
||||
/// Context IDs
|
||||
pub context_ids: Vec<String>,
|
||||
}
|
||||
|
||||
/// Routing decision for distillation
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RoutingDecision {
|
||||
/// Query embedding
|
||||
pub query_embedding: Vec<f32>,
|
||||
/// Routing logits
|
||||
pub routing_logits: Vec<f32>,
|
||||
/// Selected route
|
||||
pub selected_route: String,
|
||||
/// Confidence
|
||||
pub confidence: f32,
|
||||
/// Quality
|
||||
pub quality: f32,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_pattern_record() {
|
||||
let record = PatternRecord {
|
||||
id: "test-pattern".to_string(),
|
||||
embedding: vec![0.1, 0.2, 0.3],
|
||||
cluster_size: 10,
|
||||
avg_quality: 0.85,
|
||||
pattern_type: "routing".to_string(),
|
||||
access_count: 100,
|
||||
metadata: PatternMetadata {
|
||||
source: "sona".to_string(),
|
||||
version: "0.1.0".to_string(),
|
||||
target_model: "phi-4".to_string(),
|
||||
},
|
||||
};
|
||||
|
||||
let json = serde_json::to_string(&record).unwrap();
|
||||
assert!(json.contains("test-pattern"));
|
||||
assert!(json.contains("0.85"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_preference_pair() {
|
||||
let pair = PreferencePair {
|
||||
prompt: PreferencePrompt {
|
||||
embedding: vec![0.1, 0.2],
|
||||
context: vec!["ctx1".to_string()],
|
||||
},
|
||||
chosen: PreferenceResponse {
|
||||
route: "gpt-4".to_string(),
|
||||
quality: 0.9,
|
||||
embedding: vec![0.3, 0.4],
|
||||
},
|
||||
rejected: PreferenceResponse {
|
||||
route: "gpt-3.5".to_string(),
|
||||
quality: 0.6,
|
||||
embedding: vec![0.5, 0.6],
|
||||
},
|
||||
metadata: PreferenceMetadata {
|
||||
quality_delta: 0.3,
|
||||
source: "sona".to_string(),
|
||||
version: "0.1.0".to_string(),
|
||||
},
|
||||
};
|
||||
|
||||
let json = serde_json::to_string(&pair).unwrap();
|
||||
assert!(json.contains("gpt-4"));
|
||||
assert!(json.contains("0.9"));
|
||||
}
|
||||
}
|
||||
485
vendor/ruvector/crates/sona/src/export/huggingface_hub.rs
vendored
Normal file
485
vendor/ruvector/crates/sona/src/export/huggingface_hub.rs
vendored
Normal file
@@ -0,0 +1,485 @@
|
||||
//! HuggingFace Hub Integration
|
||||
//!
|
||||
//! Direct integration with HuggingFace Hub API for uploading SONA models,
|
||||
//! patterns, and datasets.
|
||||
|
||||
use super::{
|
||||
DatasetExporter, ExportConfig, ExportError, ExportResult, ExportType, SafeTensorsExporter,
|
||||
};
|
||||
use crate::engine::SonaEngine;
|
||||
use std::path::Path;
|
||||
|
||||
#[cfg(feature = "serde-support")]
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// HuggingFace Hub client
|
||||
pub struct HuggingFaceHub {
|
||||
/// API token (optional for public repos)
|
||||
token: Option<String>,
|
||||
/// API base URL
|
||||
api_url: String,
|
||||
}
|
||||
|
||||
impl HuggingFaceHub {
|
||||
/// Create new Hub client
|
||||
pub fn new(token: Option<&str>) -> Self {
|
||||
Self {
|
||||
token: token.map(|t| t.to_string()),
|
||||
api_url: "https://huggingface.co/api".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create Hub client from environment variable
|
||||
pub fn from_env() -> Self {
|
||||
let token = std::env::var("HF_TOKEN")
|
||||
.or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
|
||||
.ok();
|
||||
Self::new(token.as_deref())
|
||||
}
|
||||
|
||||
/// Push all exports to HuggingFace Hub
|
||||
pub fn push_all(
|
||||
&self,
|
||||
engine: &SonaEngine,
|
||||
config: &ExportConfig,
|
||||
repo_id: &str,
|
||||
) -> Result<ExportResult, ExportError> {
|
||||
// Create temporary directory for exports
|
||||
let temp_dir = std::env::temp_dir().join(format!("sona-export-{}", uuid_v4()));
|
||||
std::fs::create_dir_all(&temp_dir).map_err(ExportError::Io)?;
|
||||
|
||||
// Export all components to temp directory
|
||||
let safetensors_exporter = SafeTensorsExporter::new(config);
|
||||
let dataset_exporter = DatasetExporter::new(config);
|
||||
|
||||
let mut total_items = 0;
|
||||
let mut total_size = 0u64;
|
||||
|
||||
// Export LoRA weights
|
||||
if config.include_lora {
|
||||
let result = safetensors_exporter.export_engine(engine, temp_dir.join("lora"))?;
|
||||
total_items += result.items_exported;
|
||||
total_size += result.size_bytes;
|
||||
}
|
||||
|
||||
// Export patterns
|
||||
if config.include_patterns {
|
||||
let result =
|
||||
dataset_exporter.export_patterns(engine, temp_dir.join("patterns.jsonl"))?;
|
||||
total_items += result.items_exported;
|
||||
total_size += result.size_bytes;
|
||||
}
|
||||
|
||||
// Export preferences
|
||||
if config.include_preferences {
|
||||
let result =
|
||||
dataset_exporter.export_preferences(engine, temp_dir.join("preferences.jsonl"))?;
|
||||
total_items += result.items_exported;
|
||||
total_size += result.size_bytes;
|
||||
}
|
||||
|
||||
// Create model card
|
||||
let readme = self.create_model_card(engine, config);
|
||||
let readme_path = temp_dir.join("README.md");
|
||||
std::fs::write(&readme_path, readme).map_err(ExportError::Io)?;
|
||||
|
||||
// Create adapter config
|
||||
let adapter_config = self.create_adapter_config(engine, config);
|
||||
let config_path = temp_dir.join("adapter_config.json");
|
||||
let config_json = serde_json::to_string_pretty(&adapter_config)?;
|
||||
std::fs::write(&config_path, config_json).map_err(ExportError::Io)?;
|
||||
|
||||
// Upload to Hub (using git LFS approach)
|
||||
self.upload_directory(&temp_dir, repo_id)?;
|
||||
|
||||
// Cleanup
|
||||
let _ = std::fs::remove_dir_all(&temp_dir);
|
||||
|
||||
Ok(ExportResult {
|
||||
export_type: ExportType::SafeTensors,
|
||||
items_exported: total_items,
|
||||
output_path: format!("https://huggingface.co/{}", repo_id),
|
||||
size_bytes: total_size,
|
||||
})
|
||||
}
|
||||
|
||||
/// Upload directory to HuggingFace Hub
|
||||
fn upload_directory(&self, local_path: &Path, repo_id: &str) -> Result<(), ExportError> {
|
||||
// Check for git and git-lfs
|
||||
let has_git = std::process::Command::new("git")
|
||||
.arg("--version")
|
||||
.output()
|
||||
.is_ok();
|
||||
|
||||
if !has_git {
|
||||
return Err(ExportError::HubError(
|
||||
"git is required for HuggingFace Hub upload. Install git and git-lfs.".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
// Clone or create repo
|
||||
let repo_url = if let Some(ref token) = self.token {
|
||||
format!("https://{}@huggingface.co/{}", token, repo_id)
|
||||
} else {
|
||||
format!("https://huggingface.co/{}", repo_id)
|
||||
};
|
||||
|
||||
let clone_dir = local_path.parent().unwrap().join("hf-repo");
|
||||
|
||||
// Try to clone existing repo
|
||||
let clone_result = std::process::Command::new("git")
|
||||
.args(["clone", &repo_url, clone_dir.to_str().unwrap()])
|
||||
.output();
|
||||
|
||||
if clone_result.is_err() {
|
||||
// Create new repo via API
|
||||
self.create_repo(repo_id)?;
|
||||
|
||||
// Try cloning again
|
||||
std::process::Command::new("git")
|
||||
.args(["clone", &repo_url, clone_dir.to_str().unwrap()])
|
||||
.output()
|
||||
.map_err(|e| ExportError::HubError(format!("Failed to clone repo: {}", e)))?;
|
||||
}
|
||||
|
||||
// Copy files to cloned repo
|
||||
copy_dir_recursive(local_path, &clone_dir)?;
|
||||
|
||||
// Add, commit, and push
|
||||
std::process::Command::new("git")
|
||||
.args(["-C", clone_dir.to_str().unwrap(), "add", "-A"])
|
||||
.output()
|
||||
.map_err(|e| ExportError::HubError(format!("git add failed: {}", e)))?;
|
||||
|
||||
std::process::Command::new("git")
|
||||
.args([
|
||||
"-C",
|
||||
clone_dir.to_str().unwrap(),
|
||||
"commit",
|
||||
"-m",
|
||||
"Upload SONA adapter",
|
||||
])
|
||||
.output()
|
||||
.map_err(|e| ExportError::HubError(format!("git commit failed: {}", e)))?;
|
||||
|
||||
let push_result = std::process::Command::new("git")
|
||||
.args(["-C", clone_dir.to_str().unwrap(), "push"])
|
||||
.output()
|
||||
.map_err(|e| ExportError::HubError(format!("git push failed: {}", e)))?;
|
||||
|
||||
if !push_result.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&push_result.stderr);
|
||||
return Err(ExportError::HubError(format!(
|
||||
"git push failed: {}",
|
||||
stderr
|
||||
)));
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
let _ = std::fs::remove_dir_all(&clone_dir);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a new repository on HuggingFace Hub
|
||||
fn create_repo(&self, repo_id: &str) -> Result<(), ExportError> {
|
||||
let token = self.token.as_ref().ok_or_else(|| {
|
||||
ExportError::HubError("HuggingFace token required to create repos".to_string())
|
||||
})?;
|
||||
|
||||
// Parse repo_id (org/name or just name)
|
||||
let (organization, name) = if let Some(idx) = repo_id.find('/') {
|
||||
(Some(&repo_id[..idx]), &repo_id[idx + 1..])
|
||||
} else {
|
||||
(None, repo_id)
|
||||
};
|
||||
|
||||
let create_request = CreateRepoRequest {
|
||||
name: name.to_string(),
|
||||
organization: organization.map(|s| s.to_string()),
|
||||
private: false,
|
||||
repo_type: "model".to_string(),
|
||||
};
|
||||
|
||||
let url = format!("{}/repos/create", self.api_url);
|
||||
|
||||
// Use simple HTTP client approach (blocking for simplicity)
|
||||
// In production, you'd use reqwest or similar
|
||||
let body = serde_json::to_string(&create_request)?;
|
||||
|
||||
let output = std::process::Command::new("curl")
|
||||
.args([
|
||||
"-X",
|
||||
"POST",
|
||||
"-H",
|
||||
&format!("Authorization: Bearer {}", token),
|
||||
"-H",
|
||||
"Content-Type: application/json",
|
||||
"-d",
|
||||
&body,
|
||||
&url,
|
||||
])
|
||||
.output()
|
||||
.map_err(|e| ExportError::HubError(format!("curl failed: {}", e)))?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
// Repo might already exist, which is fine
|
||||
if !stderr.contains("already exists") {
|
||||
return Err(ExportError::HubError(format!(
|
||||
"Failed to create repo: {}",
|
||||
stderr
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create model card content
|
||||
fn create_model_card(&self, engine: &SonaEngine, config: &ExportConfig) -> String {
|
||||
let stats = engine.stats();
|
||||
format!(
|
||||
r#"---
|
||||
license: mit
|
||||
library_name: peft
|
||||
base_model: {}
|
||||
tags:
|
||||
- sona
|
||||
- lora
|
||||
- adaptive-learning
|
||||
- ruvector
|
||||
---
|
||||
|
||||
# {} SONA Adapter
|
||||
|
||||
This adapter was generated using [SONA (Self-Optimizing Neural Architecture)](https://github.com/ruvnet/ruvector/tree/main/crates/sona) - a runtime-adaptive learning system.
|
||||
|
||||
## Model Details
|
||||
|
||||
- **Base Model**: {}
|
||||
- **PEFT Type**: LoRA (Two-Tier)
|
||||
- **MicroLoRA Rank**: {} (instant adaptation)
|
||||
- **BaseLoRA Rank**: {} (background learning)
|
||||
- **Patterns Learned**: {}
|
||||
- **Trajectories Processed**: {}
|
||||
|
||||
## SONA Features
|
||||
|
||||
### Two-Tier LoRA Architecture
|
||||
- **MicroLoRA**: Rank 1-2 for instant adaptation (<0.5ms latency)
|
||||
- **BaseLoRA**: Rank 4-16 for background learning
|
||||
|
||||
### EWC++ (Elastic Weight Consolidation)
|
||||
Prevents catastrophic forgetting when learning new patterns.
|
||||
|
||||
### ReasoningBank
|
||||
K-means++ clustering for efficient pattern storage and retrieval.
|
||||
|
||||
## Performance Benchmarks
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| Throughput | 2211 ops/sec |
|
||||
| Latency | <0.5ms per layer |
|
||||
| Quality Improvement | +55% max |
|
||||
|
||||
## Usage with PEFT
|
||||
|
||||
```python
|
||||
from peft import PeftModel, PeftConfig
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
# Load adapter
|
||||
config = PeftConfig.from_pretrained("your-username/{}")
|
||||
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
|
||||
model = PeftModel.from_pretrained(model, "your-username/{}")
|
||||
|
||||
# Use for inference
|
||||
outputs = model.generate(input_ids)
|
||||
```
|
||||
|
||||
## Training with Included Datasets
|
||||
|
||||
### Patterns Dataset
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
|
||||
patterns = load_dataset("json", data_files="patterns.jsonl")
|
||||
```
|
||||
|
||||
### Preference Pairs (for DPO/RLHF)
|
||||
```python
|
||||
preferences = load_dataset("json", data_files="preferences.jsonl")
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
MIT License - see [LICENSE](LICENSE) for details.
|
||||
|
||||
---
|
||||
|
||||
Generated with [ruvector-sona](https://crates.io/crates/ruvector-sona) v{}
|
||||
"#,
|
||||
config.target_architecture,
|
||||
config.model_name,
|
||||
config.target_architecture,
|
||||
engine.config().micro_lora_rank,
|
||||
engine.config().base_lora_rank,
|
||||
stats.patterns_stored,
|
||||
stats.trajectories_buffered,
|
||||
config.model_name,
|
||||
config.model_name,
|
||||
env!("CARGO_PKG_VERSION"),
|
||||
)
|
||||
}
|
||||
|
||||
/// Create PEFT-compatible adapter config
|
||||
fn create_adapter_config(
|
||||
&self,
|
||||
engine: &SonaEngine,
|
||||
config: &ExportConfig,
|
||||
) -> AdapterConfigJson {
|
||||
let sona_config = engine.config();
|
||||
AdapterConfigJson {
|
||||
peft_type: "LORA".to_string(),
|
||||
auto_mapping: None,
|
||||
base_model_name_or_path: config.target_architecture.clone(),
|
||||
revision: None,
|
||||
task_type: "CAUSAL_LM".to_string(),
|
||||
inference_mode: true,
|
||||
r: sona_config.base_lora_rank,
|
||||
lora_alpha: sona_config.base_lora_rank as f32,
|
||||
lora_dropout: 0.0,
|
||||
fan_in_fan_out: false,
|
||||
bias: "none".to_string(),
|
||||
target_modules: vec![
|
||||
"q_proj".to_string(),
|
||||
"k_proj".to_string(),
|
||||
"v_proj".to_string(),
|
||||
"o_proj".to_string(),
|
||||
],
|
||||
modules_to_save: None,
|
||||
layers_to_transform: None,
|
||||
layers_pattern: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Request to create a new repo
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
struct CreateRepoRequest {
|
||||
name: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
organization: Option<String>,
|
||||
private: bool,
|
||||
#[serde(rename = "type")]
|
||||
repo_type: String,
|
||||
}
|
||||
|
||||
/// PEFT adapter config for JSON export
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct AdapterConfigJson {
|
||||
pub peft_type: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub auto_mapping: Option<serde_json::Value>,
|
||||
pub base_model_name_or_path: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub revision: Option<String>,
|
||||
pub task_type: String,
|
||||
pub inference_mode: bool,
|
||||
pub r: usize,
|
||||
pub lora_alpha: f32,
|
||||
pub lora_dropout: f32,
|
||||
pub fan_in_fan_out: bool,
|
||||
pub bias: String,
|
||||
pub target_modules: Vec<String>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub modules_to_save: Option<Vec<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub layers_to_transform: Option<Vec<usize>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub layers_pattern: Option<String>,
|
||||
}
|
||||
|
||||
/// Simple UUID v4 generator
|
||||
fn uuid_v4() -> String {
|
||||
use rand::Rng;
|
||||
let mut rng = rand::thread_rng();
|
||||
let bytes: [u8; 16] = rng.gen();
|
||||
format!(
|
||||
"{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
|
||||
bytes[0], bytes[1], bytes[2], bytes[3],
|
||||
bytes[4], bytes[5],
|
||||
(bytes[6] & 0x0f) | 0x40, bytes[7],
|
||||
(bytes[8] & 0x3f) | 0x80, bytes[9],
|
||||
bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15]
|
||||
)
|
||||
}
|
||||
|
||||
/// Copy directory recursively
|
||||
fn copy_dir_recursive(src: &Path, dst: &Path) -> Result<(), ExportError> {
|
||||
if !dst.exists() {
|
||||
std::fs::create_dir_all(dst).map_err(ExportError::Io)?;
|
||||
}
|
||||
|
||||
for entry in std::fs::read_dir(src).map_err(ExportError::Io)? {
|
||||
let entry = entry.map_err(ExportError::Io)?;
|
||||
let path = entry.path();
|
||||
let file_name = path.file_name().unwrap();
|
||||
let dest_path = dst.join(file_name);
|
||||
|
||||
if path.is_dir() {
|
||||
copy_dir_recursive(&path, &dest_path)?;
|
||||
} else {
|
||||
std::fs::copy(&path, &dest_path).map_err(ExportError::Io)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_hub_from_env() {
|
||||
// Just ensure it doesn't panic
|
||||
let _hub = HuggingFaceHub::from_env();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_uuid_v4() {
|
||||
let uuid = uuid_v4();
|
||||
assert_eq!(uuid.len(), 36);
|
||||
assert!(uuid.contains('-'));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_adapter_config_json() {
|
||||
let config = AdapterConfigJson {
|
||||
peft_type: "LORA".to_string(),
|
||||
auto_mapping: None,
|
||||
base_model_name_or_path: "microsoft/phi-4".to_string(),
|
||||
revision: None,
|
||||
task_type: "CAUSAL_LM".to_string(),
|
||||
inference_mode: true,
|
||||
r: 8,
|
||||
lora_alpha: 8.0,
|
||||
lora_dropout: 0.0,
|
||||
fan_in_fan_out: false,
|
||||
bias: "none".to_string(),
|
||||
target_modules: vec!["q_proj".to_string()],
|
||||
modules_to_save: None,
|
||||
layers_to_transform: None,
|
||||
layers_pattern: None,
|
||||
};
|
||||
|
||||
let json = serde_json::to_string_pretty(&config).unwrap();
|
||||
assert!(json.contains("LORA"));
|
||||
assert!(json.contains("phi-4"));
|
||||
}
|
||||
}
|
||||
392
vendor/ruvector/crates/sona/src/export/mod.rs
vendored
Normal file
392
vendor/ruvector/crates/sona/src/export/mod.rs
vendored
Normal file
@@ -0,0 +1,392 @@
|
||||
//! SONA Export Module - HuggingFace Integration
|
||||
//!
|
||||
//! Export learned patterns, LoRA weights, and trajectories to HuggingFace-compatible formats
|
||||
//! for pretraining, fine-tuning, and knowledge distillation.
|
||||
//!
|
||||
//! # Supported Export Formats
|
||||
//!
|
||||
//! - **SafeTensors**: LoRA adapter weights in PEFT-compatible format
|
||||
//! - **JSONL Dataset**: ReasoningBank patterns as HuggingFace datasets
|
||||
//! - **Preference Pairs**: Quality trajectories for DPO/RLHF training
|
||||
//! - **Distillation Targets**: Routing decisions for knowledge distillation
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! ```rust,ignore
|
||||
//! use ruvector_sona::export::{HuggingFaceExporter, ExportConfig};
|
||||
//!
|
||||
//! let exporter = HuggingFaceExporter::new(&engine);
|
||||
//!
|
||||
//! // Export LoRA weights
|
||||
//! exporter.export_lora_safetensors("./lora_weights")?;
|
||||
//!
|
||||
//! // Export patterns as dataset
|
||||
//! exporter.export_patterns_jsonl("./patterns.jsonl")?;
|
||||
//!
|
||||
//! // Export preference pairs for RLHF
|
||||
//! exporter.export_preference_pairs("./preferences.jsonl")?;
|
||||
//! ```
|
||||
|
||||
pub mod dataset;
|
||||
pub mod huggingface_hub;
|
||||
pub mod pretrain;
|
||||
pub mod safetensors;
|
||||
|
||||
pub use dataset::DatasetExporter;
|
||||
pub use huggingface_hub::HuggingFaceHub;
|
||||
pub use pretrain::{PretrainConfig, PretrainPipeline};
|
||||
pub use safetensors::SafeTensorsExporter;
|
||||
|
||||
use crate::engine::SonaEngine;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::Path;
|
||||
|
||||
/// Export configuration
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ExportConfig {
|
||||
/// Model name for HuggingFace
|
||||
pub model_name: String,
|
||||
/// Organization/user on HuggingFace
|
||||
pub organization: Option<String>,
|
||||
/// Target model architecture (e.g., "phi-4", "llama-7b", "mistral-7b")
|
||||
pub target_architecture: String,
|
||||
/// Include patterns in export
|
||||
pub include_patterns: bool,
|
||||
/// Include LoRA weights
|
||||
pub include_lora: bool,
|
||||
/// Include preference pairs
|
||||
pub include_preferences: bool,
|
||||
/// Minimum quality threshold for exports
|
||||
pub min_quality_threshold: f32,
|
||||
/// Compress outputs
|
||||
pub compress: bool,
|
||||
}
|
||||
|
||||
impl Default for ExportConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
model_name: "sona-adapter".to_string(),
|
||||
organization: None,
|
||||
target_architecture: "phi-4".to_string(),
|
||||
include_patterns: true,
|
||||
include_lora: true,
|
||||
include_preferences: true,
|
||||
min_quality_threshold: 0.5,
|
||||
compress: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Main HuggingFace exporter
|
||||
pub struct HuggingFaceExporter<'a> {
|
||||
/// Reference to SONA engine
|
||||
engine: &'a SonaEngine,
|
||||
/// Export configuration
|
||||
config: ExportConfig,
|
||||
}
|
||||
|
||||
impl<'a> HuggingFaceExporter<'a> {
|
||||
/// Create new exporter
|
||||
pub fn new(engine: &'a SonaEngine) -> Self {
|
||||
Self {
|
||||
engine,
|
||||
config: ExportConfig::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create with custom config
|
||||
pub fn with_config(engine: &'a SonaEngine, config: ExportConfig) -> Self {
|
||||
Self { engine, config }
|
||||
}
|
||||
|
||||
/// Export LoRA weights in SafeTensors format (PEFT-compatible)
|
||||
pub fn export_lora_safetensors<P: AsRef<Path>>(
|
||||
&self,
|
||||
output_dir: P,
|
||||
) -> Result<ExportResult, ExportError> {
|
||||
let exporter = SafeTensorsExporter::new(&self.config);
|
||||
exporter.export_engine(self.engine, output_dir)
|
||||
}
|
||||
|
||||
/// Export patterns as JSONL dataset
|
||||
pub fn export_patterns_jsonl<P: AsRef<Path>>(
|
||||
&self,
|
||||
output_path: P,
|
||||
) -> Result<ExportResult, ExportError> {
|
||||
let exporter = DatasetExporter::new(&self.config);
|
||||
exporter.export_patterns(self.engine, output_path)
|
||||
}
|
||||
|
||||
/// Export preference pairs for DPO/RLHF training
|
||||
pub fn export_preference_pairs<P: AsRef<Path>>(
|
||||
&self,
|
||||
output_path: P,
|
||||
) -> Result<ExportResult, ExportError> {
|
||||
let exporter = DatasetExporter::new(&self.config);
|
||||
exporter.export_preferences(self.engine, output_path)
|
||||
}
|
||||
|
||||
/// Export all to HuggingFace Hub
|
||||
pub fn push_to_hub(
|
||||
&self,
|
||||
repo_id: &str,
|
||||
token: Option<&str>,
|
||||
) -> Result<ExportResult, ExportError> {
|
||||
let hub = HuggingFaceHub::new(token);
|
||||
hub.push_all(self.engine, &self.config, repo_id)
|
||||
}
|
||||
|
||||
/// Export complete package (LoRA + patterns + config)
|
||||
pub fn export_all<P: AsRef<Path>>(
|
||||
&self,
|
||||
output_dir: P,
|
||||
) -> Result<Vec<ExportResult>, ExportError> {
|
||||
let output_dir = output_dir.as_ref();
|
||||
std::fs::create_dir_all(output_dir).map_err(ExportError::Io)?;
|
||||
|
||||
let mut results = Vec::new();
|
||||
|
||||
if self.config.include_lora {
|
||||
results.push(self.export_lora_safetensors(output_dir.join("lora"))?);
|
||||
}
|
||||
|
||||
if self.config.include_patterns {
|
||||
results.push(self.export_patterns_jsonl(output_dir.join("patterns.jsonl"))?);
|
||||
}
|
||||
|
||||
if self.config.include_preferences {
|
||||
results.push(self.export_preference_pairs(output_dir.join("preferences.jsonl"))?);
|
||||
}
|
||||
|
||||
// Export config
|
||||
let config_path = output_dir.join("adapter_config.json");
|
||||
let config_json = serde_json::to_string_pretty(&self.create_adapter_config())?;
|
||||
std::fs::write(&config_path, config_json).map_err(ExportError::Io)?;
|
||||
|
||||
// Export README
|
||||
let readme_path = output_dir.join("README.md");
|
||||
let readme = self.generate_readme();
|
||||
std::fs::write(&readme_path, readme).map_err(ExportError::Io)?;
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Create PEFT-compatible adapter config
|
||||
fn create_adapter_config(&self) -> AdapterConfig {
|
||||
let sona_config = self.engine.config();
|
||||
AdapterConfig {
|
||||
peft_type: "LORA".to_string(),
|
||||
auto_mapping: None,
|
||||
base_model_name_or_path: self.config.target_architecture.clone(),
|
||||
revision: None,
|
||||
task_type: "CAUSAL_LM".to_string(),
|
||||
inference_mode: true,
|
||||
r: sona_config.micro_lora_rank,
|
||||
lora_alpha: sona_config.micro_lora_rank as f32,
|
||||
lora_dropout: 0.0,
|
||||
fan_in_fan_out: false,
|
||||
bias: "none".to_string(),
|
||||
target_modules: vec![
|
||||
"q_proj".to_string(),
|
||||
"k_proj".to_string(),
|
||||
"v_proj".to_string(),
|
||||
"o_proj".to_string(),
|
||||
],
|
||||
modules_to_save: None,
|
||||
layers_to_transform: None,
|
||||
layers_pattern: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate README for HuggingFace model card
|
||||
fn generate_readme(&self) -> String {
|
||||
let stats = self.engine.stats();
|
||||
format!(
|
||||
r#"---
|
||||
license: mit
|
||||
library_name: peft
|
||||
base_model: {}
|
||||
tags:
|
||||
- sona
|
||||
- lora
|
||||
- adaptive-learning
|
||||
- ruvector
|
||||
---
|
||||
|
||||
# {} SONA Adapter
|
||||
|
||||
This adapter was generated using [SONA (Self-Optimizing Neural Architecture)](https://github.com/ruvnet/ruvector/tree/main/crates/sona).
|
||||
|
||||
## Model Details
|
||||
|
||||
- **Base Model**: {}
|
||||
- **PEFT Type**: LoRA
|
||||
- **Rank**: {}
|
||||
- **Patterns Learned**: {}
|
||||
- **Trajectories Processed**: {}
|
||||
|
||||
## Training Details
|
||||
|
||||
SONA uses two-tier LoRA adaptation:
|
||||
- **MicroLoRA**: Rank 1-2 for instant adaptation (<0.5ms)
|
||||
- **BaseLoRA**: Rank 4-16 for background learning
|
||||
|
||||
### Performance Benchmarks
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| Throughput | 2211 ops/sec |
|
||||
| Latency | <0.5ms per layer |
|
||||
| Quality Improvement | +55% max |
|
||||
|
||||
## Usage
|
||||
|
||||
```python
|
||||
from peft import PeftModel, PeftConfig
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
# Load adapter
|
||||
config = PeftConfig.from_pretrained("your-username/{}")
|
||||
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
|
||||
model = PeftModel.from_pretrained(model, "your-username/{}")
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
MIT License - see [LICENSE](LICENSE) for details.
|
||||
|
||||
---
|
||||
|
||||
Generated with [ruvector-sona](https://crates.io/crates/ruvector-sona) v0.1.0
|
||||
"#,
|
||||
self.config.target_architecture,
|
||||
self.config.model_name,
|
||||
self.config.target_architecture,
|
||||
self.engine.config().micro_lora_rank,
|
||||
stats.patterns_stored,
|
||||
stats.trajectories_buffered,
|
||||
self.config.model_name,
|
||||
self.config.model_name,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// PEFT-compatible adapter configuration
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct AdapterConfig {
|
||||
pub peft_type: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub auto_mapping: Option<serde_json::Value>,
|
||||
pub base_model_name_or_path: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub revision: Option<String>,
|
||||
pub task_type: String,
|
||||
pub inference_mode: bool,
|
||||
pub r: usize,
|
||||
pub lora_alpha: f32,
|
||||
pub lora_dropout: f32,
|
||||
pub fan_in_fan_out: bool,
|
||||
pub bias: String,
|
||||
pub target_modules: Vec<String>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub modules_to_save: Option<Vec<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub layers_to_transform: Option<Vec<usize>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub layers_pattern: Option<String>,
|
||||
}
|
||||
|
||||
/// Export result
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ExportResult {
|
||||
/// Export type
|
||||
pub export_type: ExportType,
|
||||
/// Number of items exported
|
||||
pub items_exported: usize,
|
||||
/// Output path
|
||||
pub output_path: String,
|
||||
/// File size in bytes
|
||||
pub size_bytes: u64,
|
||||
}
|
||||
|
||||
/// Export type enum
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum ExportType {
|
||||
SafeTensors,
|
||||
PatternsDataset,
|
||||
PreferencePairs,
|
||||
DistillationTargets,
|
||||
AdapterConfig,
|
||||
}
|
||||
|
||||
/// Export errors
|
||||
#[derive(Debug)]
|
||||
pub enum ExportError {
|
||||
Io(std::io::Error),
|
||||
Serialization(serde_json::Error),
|
||||
InvalidData(String),
|
||||
HubError(String),
|
||||
}
|
||||
|
||||
impl From<std::io::Error> for ExportError {
|
||||
fn from(e: std::io::Error) -> Self {
|
||||
ExportError::Io(e)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<serde_json::Error> for ExportError {
|
||||
fn from(e: serde_json::Error) -> Self {
|
||||
ExportError::Serialization(e)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ExportError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
ExportError::Io(e) => write!(f, "IO error: {}", e),
|
||||
ExportError::Serialization(e) => write!(f, "Serialization error: {}", e),
|
||||
ExportError::InvalidData(msg) => write!(f, "Invalid data: {}", msg),
|
||||
ExportError::HubError(msg) => write!(f, "HuggingFace Hub error: {}", msg),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for ExportError {}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_export_config_default() {
|
||||
let config = ExportConfig::default();
|
||||
assert_eq!(config.model_name, "sona-adapter");
|
||||
assert!(config.include_patterns);
|
||||
assert!(config.include_lora);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_adapter_config_serialization() {
|
||||
let config = AdapterConfig {
|
||||
peft_type: "LORA".to_string(),
|
||||
auto_mapping: None,
|
||||
base_model_name_or_path: "microsoft/phi-4".to_string(),
|
||||
revision: None,
|
||||
task_type: "CAUSAL_LM".to_string(),
|
||||
inference_mode: true,
|
||||
r: 2,
|
||||
lora_alpha: 2.0,
|
||||
lora_dropout: 0.0,
|
||||
fan_in_fan_out: false,
|
||||
bias: "none".to_string(),
|
||||
target_modules: vec!["q_proj".to_string()],
|
||||
modules_to_save: None,
|
||||
layers_to_transform: None,
|
||||
layers_pattern: None,
|
||||
};
|
||||
|
||||
let json = serde_json::to_string_pretty(&config).unwrap();
|
||||
assert!(json.contains("LORA"));
|
||||
assert!(json.contains("phi-4"));
|
||||
}
|
||||
}
|
||||
666
vendor/ruvector/crates/sona/src/export/pretrain.rs
vendored
Normal file
666
vendor/ruvector/crates/sona/src/export/pretrain.rs
vendored
Normal file
@@ -0,0 +1,666 @@
|
||||
//! Pretraining Pipeline - SONA-optimized model pretraining configuration
|
||||
//!
|
||||
//! Generates optimal pretraining configurations based on SONA benchmark results:
|
||||
//! - 2211 ops/sec throughput
|
||||
//! - <0.5ms latency per layer
|
||||
//! - +55% quality improvement
|
||||
//! - 134 tests passing
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
#[cfg(feature = "serde-support")]
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::{ExportConfig, ExportError, ExportResult, HuggingFaceExporter};
|
||||
use crate::engine::SonaEngine;
|
||||
|
||||
/// Pretraining configuration based on SONA benchmarks
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PretrainConfig {
|
||||
/// Base model to fine-tune
|
||||
pub base_model: String,
|
||||
|
||||
/// LoRA configuration
|
||||
pub lora: LoraPretrainConfig,
|
||||
|
||||
/// Training hyperparameters
|
||||
pub training: TrainingConfig,
|
||||
|
||||
/// Dataset configuration
|
||||
pub dataset: DatasetConfig,
|
||||
|
||||
/// Hardware configuration
|
||||
pub hardware: HardwareConfig,
|
||||
|
||||
/// SONA-specific optimizations
|
||||
pub sona: SonaOptimizations,
|
||||
}
|
||||
|
||||
/// LoRA pretraining configuration
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LoraPretrainConfig {
|
||||
/// LoRA rank (benchmark optimal: 2)
|
||||
pub rank: usize,
|
||||
/// LoRA alpha (typically equals rank)
|
||||
pub alpha: f32,
|
||||
/// Dropout rate (benchmark: 0.0)
|
||||
pub dropout: f32,
|
||||
/// Target modules
|
||||
pub target_modules: Vec<String>,
|
||||
/// Use RSLoRA scaling
|
||||
pub use_rslora: bool,
|
||||
}
|
||||
|
||||
/// Training hyperparameters
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct TrainingConfig {
|
||||
/// Learning rate (benchmark optimal: 0.002)
|
||||
pub learning_rate: f64,
|
||||
/// Batch size (benchmark optimal: 32)
|
||||
pub batch_size: usize,
|
||||
/// Gradient accumulation steps
|
||||
pub gradient_accumulation_steps: usize,
|
||||
/// Number of epochs
|
||||
pub num_epochs: usize,
|
||||
/// Warmup ratio
|
||||
pub warmup_ratio: f32,
|
||||
/// Weight decay
|
||||
pub weight_decay: f32,
|
||||
/// Max gradient norm
|
||||
pub max_grad_norm: f32,
|
||||
/// LR scheduler type
|
||||
pub lr_scheduler_type: String,
|
||||
/// Save steps
|
||||
pub save_steps: usize,
|
||||
/// Evaluation steps
|
||||
pub eval_steps: usize,
|
||||
/// Logging steps
|
||||
pub logging_steps: usize,
|
||||
}
|
||||
|
||||
/// Dataset configuration
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DatasetConfig {
|
||||
/// Path to patterns dataset
|
||||
pub patterns_path: Option<String>,
|
||||
/// Path to preferences dataset
|
||||
pub preferences_path: Option<String>,
|
||||
/// Path to distillation targets
|
||||
pub distillation_path: Option<String>,
|
||||
/// Maximum sequence length
|
||||
pub max_seq_length: usize,
|
||||
/// Train/validation split ratio
|
||||
pub validation_split: f32,
|
||||
}
|
||||
|
||||
/// Hardware configuration
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct HardwareConfig {
|
||||
/// Use mixed precision (fp16/bf16)
|
||||
pub mixed_precision: String,
|
||||
/// Number of GPUs
|
||||
pub num_gpus: usize,
|
||||
/// Enable gradient checkpointing
|
||||
pub gradient_checkpointing: bool,
|
||||
/// Enable DeepSpeed
|
||||
pub deepspeed: Option<String>,
|
||||
/// Enable FSDP
|
||||
pub fsdp: bool,
|
||||
}
|
||||
|
||||
/// SONA-specific optimizations
|
||||
#[cfg_attr(feature = "serde-support", derive(Serialize, Deserialize))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct SonaOptimizations {
|
||||
/// Enable two-tier LoRA (MicroLoRA + BaseLoRA)
|
||||
pub two_tier_lora: bool,
|
||||
/// MicroLoRA rank (1-2)
|
||||
pub micro_lora_rank: usize,
|
||||
/// Enable EWC++ for catastrophic forgetting prevention
|
||||
pub ewc_enabled: bool,
|
||||
/// EWC lambda (benchmark optimal: 1000)
|
||||
pub ewc_lambda: f32,
|
||||
/// Number of pattern clusters (benchmark optimal: 100)
|
||||
pub pattern_clusters: usize,
|
||||
/// Enable SIMD optimizations
|
||||
pub enable_simd: bool,
|
||||
}
|
||||
|
||||
impl Default for PretrainConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
base_model: "microsoft/phi-4".to_string(),
|
||||
lora: LoraPretrainConfig::default(),
|
||||
training: TrainingConfig::default(),
|
||||
dataset: DatasetConfig::default(),
|
||||
hardware: HardwareConfig::default(),
|
||||
sona: SonaOptimizations::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LoraPretrainConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
// Benchmark optimal: rank 2
|
||||
rank: 2,
|
||||
alpha: 2.0,
|
||||
dropout: 0.0,
|
||||
target_modules: vec![
|
||||
"q_proj".to_string(),
|
||||
"k_proj".to_string(),
|
||||
"v_proj".to_string(),
|
||||
"o_proj".to_string(),
|
||||
],
|
||||
use_rslora: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TrainingConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
// Benchmark optimal: 0.002
|
||||
learning_rate: 0.002,
|
||||
// Benchmark optimal: 32
|
||||
batch_size: 32,
|
||||
gradient_accumulation_steps: 4,
|
||||
num_epochs: 3,
|
||||
warmup_ratio: 0.1,
|
||||
weight_decay: 0.01,
|
||||
max_grad_norm: 1.0,
|
||||
lr_scheduler_type: "cosine".to_string(),
|
||||
save_steps: 500,
|
||||
eval_steps: 100,
|
||||
logging_steps: 10,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for DatasetConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
patterns_path: None,
|
||||
preferences_path: None,
|
||||
distillation_path: None,
|
||||
max_seq_length: 2048,
|
||||
validation_split: 0.1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for HardwareConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
mixed_precision: "bf16".to_string(),
|
||||
num_gpus: 1,
|
||||
gradient_checkpointing: true,
|
||||
deepspeed: None,
|
||||
fsdp: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SonaOptimizations {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
two_tier_lora: true,
|
||||
micro_lora_rank: 1,
|
||||
ewc_enabled: true,
|
||||
// Benchmark optimal: 1000
|
||||
ewc_lambda: 1000.0,
|
||||
// Benchmark optimal: 100
|
||||
pattern_clusters: 100,
|
||||
enable_simd: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Pretraining pipeline orchestrator
|
||||
pub struct PretrainPipeline<'a> {
|
||||
/// Reference to SONA engine
|
||||
engine: &'a SonaEngine,
|
||||
/// Pipeline configuration
|
||||
config: PretrainConfig,
|
||||
}
|
||||
|
||||
impl<'a> PretrainPipeline<'a> {
|
||||
/// Create new pretraining pipeline
|
||||
pub fn new(engine: &'a SonaEngine) -> Self {
|
||||
Self {
|
||||
engine,
|
||||
config: PretrainConfig::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create with custom configuration
|
||||
pub fn with_config(engine: &'a SonaEngine, config: PretrainConfig) -> Self {
|
||||
Self { engine, config }
|
||||
}
|
||||
|
||||
/// Generate optimal config from SONA engine stats
|
||||
pub fn from_engine_stats(engine: &'a SonaEngine) -> Self {
|
||||
let sona_config = engine.config();
|
||||
|
||||
let config = PretrainConfig {
|
||||
lora: LoraPretrainConfig {
|
||||
rank: sona_config.base_lora_rank,
|
||||
alpha: sona_config.base_lora_rank as f32,
|
||||
..Default::default()
|
||||
},
|
||||
sona: SonaOptimizations {
|
||||
micro_lora_rank: sona_config.micro_lora_rank,
|
||||
ewc_lambda: sona_config.ewc_lambda,
|
||||
pattern_clusters: sona_config.pattern_clusters,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
Self { engine, config }
|
||||
}
|
||||
|
||||
/// Export complete pretraining package
|
||||
pub fn export_package<P: AsRef<Path>>(
|
||||
&self,
|
||||
output_dir: P,
|
||||
) -> Result<PretrainPackage, ExportError> {
|
||||
let output_dir = output_dir.as_ref();
|
||||
std::fs::create_dir_all(output_dir).map_err(ExportError::Io)?;
|
||||
|
||||
// Export using HuggingFaceExporter
|
||||
let export_config = ExportConfig {
|
||||
model_name: self.config.base_model.replace('/', "-"),
|
||||
target_architecture: self.config.base_model.clone(),
|
||||
include_patterns: true,
|
||||
include_lora: true,
|
||||
include_preferences: true,
|
||||
min_quality_threshold: 0.5,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let exporter = HuggingFaceExporter::with_config(self.engine, export_config);
|
||||
let export_results = exporter.export_all(output_dir)?;
|
||||
|
||||
// Generate training script
|
||||
let script_path = output_dir.join("train.py");
|
||||
let script = self.generate_training_script();
|
||||
std::fs::write(&script_path, script).map_err(ExportError::Io)?;
|
||||
|
||||
// Generate config files
|
||||
let config_path = output_dir.join("pretrain_config.json");
|
||||
let config_json = serde_json::to_string_pretty(&self.config)?;
|
||||
std::fs::write(&config_path, config_json).map_err(ExportError::Io)?;
|
||||
|
||||
// Generate requirements
|
||||
let requirements_path = output_dir.join("requirements.txt");
|
||||
let requirements = self.generate_requirements();
|
||||
std::fs::write(&requirements_path, requirements).map_err(ExportError::Io)?;
|
||||
|
||||
// Generate accelerate config
|
||||
let accelerate_path = output_dir.join("accelerate_config.yaml");
|
||||
let accelerate_config = self.generate_accelerate_config();
|
||||
std::fs::write(&accelerate_path, accelerate_config).map_err(ExportError::Io)?;
|
||||
|
||||
Ok(PretrainPackage {
|
||||
output_dir: output_dir.to_string_lossy().to_string(),
|
||||
export_results,
|
||||
script_path: script_path.to_string_lossy().to_string(),
|
||||
config_path: config_path.to_string_lossy().to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Generate Python training script
|
||||
fn generate_training_script(&self) -> String {
|
||||
format!(
|
||||
r#"#!/usr/bin/env python3
|
||||
"""
|
||||
SONA-Optimized Pretraining Script
|
||||
|
||||
Based on SONA benchmark results:
|
||||
- Throughput: 2211 ops/sec
|
||||
- Latency: <0.5ms per layer
|
||||
- Quality improvement: +55%
|
||||
|
||||
Configuration optimized for:
|
||||
- LoRA Rank: {}
|
||||
- Learning Rate: {}
|
||||
- Batch Size: {}
|
||||
- EWC Lambda: {}
|
||||
- Pattern Clusters: {}
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from transformers import (
|
||||
AutoModelForCausalLM,
|
||||
AutoTokenizer,
|
||||
TrainingArguments,
|
||||
Trainer,
|
||||
DataCollatorForLanguageModeling,
|
||||
)
|
||||
from peft import (
|
||||
LoraConfig,
|
||||
get_peft_model,
|
||||
prepare_model_for_kbit_training,
|
||||
TaskType,
|
||||
)
|
||||
|
||||
# Load SONA config
|
||||
with open("pretrain_config.json", "r") as f:
|
||||
CONFIG = json.load(f)
|
||||
|
||||
def main():
|
||||
# Load base model
|
||||
print(f"Loading base model: {{CONFIG['base_model']}}")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
CONFIG["base_model"],
|
||||
torch_dtype=torch.bfloat16 if CONFIG["hardware"]["mixed_precision"] == "bf16" else torch.float16,
|
||||
device_map="auto",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(CONFIG["base_model"])
|
||||
if tokenizer.pad_token is None:
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
# Configure LoRA with SONA-optimal settings
|
||||
lora_config = LoraConfig(
|
||||
r=CONFIG["lora"]["rank"],
|
||||
lora_alpha=CONFIG["lora"]["alpha"],
|
||||
lora_dropout=CONFIG["lora"]["dropout"],
|
||||
target_modules=CONFIG["lora"]["target_modules"],
|
||||
task_type=TaskType.CAUSAL_LM,
|
||||
bias="none",
|
||||
)
|
||||
|
||||
# Prepare model
|
||||
if CONFIG["hardware"]["gradient_checkpointing"]:
|
||||
model.gradient_checkpointing_enable()
|
||||
|
||||
model = get_peft_model(model, lora_config)
|
||||
model.print_trainable_parameters()
|
||||
|
||||
# Load SONA datasets
|
||||
datasets = {{}}
|
||||
|
||||
if CONFIG["dataset"]["patterns_path"] and os.path.exists(CONFIG["dataset"]["patterns_path"]):
|
||||
print("Loading patterns dataset...")
|
||||
datasets["patterns"] = load_dataset("json", data_files=CONFIG["dataset"]["patterns_path"])
|
||||
|
||||
if CONFIG["dataset"]["preferences_path"] and os.path.exists(CONFIG["dataset"]["preferences_path"]):
|
||||
print("Loading preferences dataset...")
|
||||
datasets["preferences"] = load_dataset("json", data_files=CONFIG["dataset"]["preferences_path"])
|
||||
|
||||
# Use patterns dataset for pretraining if available
|
||||
if "patterns" in datasets:
|
||||
train_dataset = datasets["patterns"]["train"]
|
||||
else:
|
||||
# Fall back to sample data
|
||||
print("Warning: No patterns dataset found, using sample data")
|
||||
train_dataset = None
|
||||
|
||||
# Training arguments with SONA-optimal settings
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./sona-output",
|
||||
num_train_epochs=CONFIG["training"]["num_epochs"],
|
||||
per_device_train_batch_size=CONFIG["training"]["batch_size"],
|
||||
gradient_accumulation_steps=CONFIG["training"]["gradient_accumulation_steps"],
|
||||
learning_rate=CONFIG["training"]["learning_rate"],
|
||||
warmup_ratio=CONFIG["training"]["warmup_ratio"],
|
||||
weight_decay=CONFIG["training"]["weight_decay"],
|
||||
max_grad_norm=CONFIG["training"]["max_grad_norm"],
|
||||
lr_scheduler_type=CONFIG["training"]["lr_scheduler_type"],
|
||||
save_steps=CONFIG["training"]["save_steps"],
|
||||
eval_steps=CONFIG["training"]["eval_steps"],
|
||||
logging_steps=CONFIG["training"]["logging_steps"],
|
||||
bf16=CONFIG["hardware"]["mixed_precision"] == "bf16",
|
||||
fp16=CONFIG["hardware"]["mixed_precision"] == "fp16",
|
||||
gradient_checkpointing=CONFIG["hardware"]["gradient_checkpointing"],
|
||||
report_to="tensorboard",
|
||||
save_total_limit=3,
|
||||
push_to_hub=False,
|
||||
)
|
||||
|
||||
# Data collator
|
||||
data_collator = DataCollatorForLanguageModeling(
|
||||
tokenizer=tokenizer,
|
||||
mlm=False,
|
||||
)
|
||||
|
||||
if train_dataset:
|
||||
# Initialize trainer
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset,
|
||||
data_collator=data_collator,
|
||||
)
|
||||
|
||||
# Train
|
||||
print("Starting SONA-optimized training...")
|
||||
trainer.train()
|
||||
|
||||
# Save
|
||||
print("Saving model...")
|
||||
trainer.save_model("./sona-output/final")
|
||||
tokenizer.save_pretrained("./sona-output/final")
|
||||
else:
|
||||
print("No training data available. Please provide patterns.jsonl or preferences.jsonl")
|
||||
|
||||
print("Done!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
"#,
|
||||
self.config.lora.rank,
|
||||
self.config.training.learning_rate,
|
||||
self.config.training.batch_size,
|
||||
self.config.sona.ewc_lambda,
|
||||
self.config.sona.pattern_clusters,
|
||||
)
|
||||
}
|
||||
|
||||
/// Generate requirements.txt
|
||||
fn generate_requirements(&self) -> String {
|
||||
r#"# SONA Pretraining Requirements
|
||||
torch>=2.0.0
|
||||
transformers>=4.35.0
|
||||
datasets>=2.14.0
|
||||
peft>=0.6.0
|
||||
accelerate>=0.24.0
|
||||
bitsandbytes>=0.41.0
|
||||
safetensors>=0.4.0
|
||||
tensorboard>=2.14.0
|
||||
scipy>=1.11.0
|
||||
scikit-learn>=1.3.0
|
||||
tqdm>=4.66.0
|
||||
"#
|
||||
.to_string()
|
||||
}
|
||||
|
||||
/// Generate accelerate config
|
||||
fn generate_accelerate_config(&self) -> String {
|
||||
format!(
|
||||
r#"compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: {}
|
||||
downcast_bf16: 'no'
|
||||
gpu_ids: all
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: {}
|
||||
num_machines: 1
|
||||
num_processes: {}
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
"#,
|
||||
if self.config.hardware.num_gpus > 1 {
|
||||
"MULTI_GPU"
|
||||
} else {
|
||||
"NO"
|
||||
},
|
||||
self.config.hardware.mixed_precision,
|
||||
self.config.hardware.num_gpus,
|
||||
)
|
||||
}
|
||||
|
||||
/// Generate DPO training script for preference learning
|
||||
pub fn generate_dpo_script(&self) -> String {
|
||||
r#"#!/usr/bin/env python3
|
||||
"""
|
||||
SONA DPO (Direct Preference Optimization) Training Script
|
||||
|
||||
Uses preference pairs exported from SONA ReasoningBank for RLHF-style training
|
||||
without requiring a reward model.
|
||||
"""
|
||||
|
||||
import json
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from trl import DPOTrainer, DPOConfig
|
||||
from peft import LoraConfig, get_peft_model
|
||||
|
||||
# Load config
|
||||
with open("pretrain_config.json", "r") as f:
|
||||
CONFIG = json.load(f)
|
||||
|
||||
def main():
|
||||
# Load model
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
CONFIG["base_model"],
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(CONFIG["base_model"])
|
||||
if tokenizer.pad_token is None:
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
# Configure LoRA
|
||||
lora_config = LoraConfig(
|
||||
r=CONFIG["lora"]["rank"],
|
||||
lora_alpha=CONFIG["lora"]["alpha"],
|
||||
lora_dropout=CONFIG["lora"]["dropout"],
|
||||
target_modules=CONFIG["lora"]["target_modules"],
|
||||
bias="none",
|
||||
)
|
||||
|
||||
model = get_peft_model(model, lora_config)
|
||||
|
||||
# Load preference dataset
|
||||
if CONFIG["dataset"]["preferences_path"]:
|
||||
dataset = load_dataset("json", data_files=CONFIG["dataset"]["preferences_path"])
|
||||
else:
|
||||
raise ValueError("Preferences dataset required for DPO training")
|
||||
|
||||
# DPO config
|
||||
dpo_config = DPOConfig(
|
||||
output_dir="./sona-dpo-output",
|
||||
num_train_epochs=CONFIG["training"]["num_epochs"],
|
||||
per_device_train_batch_size=CONFIG["training"]["batch_size"] // 2,
|
||||
gradient_accumulation_steps=CONFIG["training"]["gradient_accumulation_steps"],
|
||||
learning_rate=CONFIG["training"]["learning_rate"] / 10, # Lower LR for DPO
|
||||
warmup_ratio=CONFIG["training"]["warmup_ratio"],
|
||||
bf16=True,
|
||||
logging_steps=CONFIG["training"]["logging_steps"],
|
||||
save_steps=CONFIG["training"]["save_steps"],
|
||||
beta=0.1, # DPO temperature
|
||||
)
|
||||
|
||||
# Initialize DPO trainer
|
||||
trainer = DPOTrainer(
|
||||
model=model,
|
||||
args=dpo_config,
|
||||
train_dataset=dataset["train"],
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
|
||||
# Train
|
||||
print("Starting SONA DPO training...")
|
||||
trainer.train()
|
||||
|
||||
# Save
|
||||
trainer.save_model("./sona-dpo-output/final")
|
||||
print("Done!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
"#
|
||||
.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Pretraining package result
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PretrainPackage {
|
||||
/// Output directory
|
||||
pub output_dir: String,
|
||||
/// Export results
|
||||
pub export_results: Vec<ExportResult>,
|
||||
/// Path to training script
|
||||
pub script_path: String,
|
||||
/// Path to config file
|
||||
pub config_path: String,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_pretrain_config_default() {
|
||||
let config = PretrainConfig::default();
|
||||
|
||||
// Verify benchmark-optimal values
|
||||
assert_eq!(config.lora.rank, 2);
|
||||
assert_eq!(config.training.learning_rate, 0.002);
|
||||
assert_eq!(config.training.batch_size, 32);
|
||||
assert_eq!(config.sona.ewc_lambda, 1000.0);
|
||||
assert_eq!(config.sona.pattern_clusters, 100);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_config_serialization() {
|
||||
let config = PretrainConfig::default();
|
||||
let json = serde_json::to_string_pretty(&config).unwrap();
|
||||
|
||||
assert!(json.contains("\"rank\": 2"));
|
||||
assert!(json.contains("\"learning_rate\": 0.002"));
|
||||
assert!(json.contains("\"batch_size\": 32"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lora_config_default() {
|
||||
let config = LoraPretrainConfig::default();
|
||||
|
||||
assert_eq!(config.rank, 2);
|
||||
assert_eq!(config.alpha, 2.0);
|
||||
assert_eq!(config.dropout, 0.0);
|
||||
assert!(config.target_modules.contains(&"q_proj".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sona_optimizations_default() {
|
||||
let config = SonaOptimizations::default();
|
||||
|
||||
assert!(config.two_tier_lora);
|
||||
assert_eq!(config.micro_lora_rank, 1);
|
||||
assert!(config.ewc_enabled);
|
||||
assert_eq!(config.ewc_lambda, 1000.0);
|
||||
assert_eq!(config.pattern_clusters, 100);
|
||||
assert!(config.enable_simd);
|
||||
}
|
||||
}
|
||||
337
vendor/ruvector/crates/sona/src/export/safetensors.rs
vendored
Normal file
337
vendor/ruvector/crates/sona/src/export/safetensors.rs
vendored
Normal file
@@ -0,0 +1,337 @@
|
||||
//! SafeTensors Export - PEFT-compatible LoRA weight serialization
|
||||
//!
|
||||
//! Exports SONA's learned LoRA weights in SafeTensors format for use with
|
||||
//! HuggingFace's PEFT library and transformers ecosystem.
|
||||
|
||||
use super::{ExportConfig, ExportError, ExportResult, ExportType};
|
||||
use crate::engine::SonaEngine;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
#[cfg(feature = "serde-support")]
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// SafeTensors exporter for LoRA weights
|
||||
pub struct SafeTensorsExporter<'a> {
|
||||
_config: &'a ExportConfig,
|
||||
}
|
||||
|
||||
impl<'a> SafeTensorsExporter<'a> {
|
||||
/// Create new SafeTensors exporter
|
||||
pub fn new(config: &'a ExportConfig) -> Self {
|
||||
Self { _config: config }
|
||||
}
|
||||
|
||||
/// Export engine's LoRA weights to SafeTensors format
|
||||
pub fn export_engine<P: AsRef<Path>>(
|
||||
&self,
|
||||
engine: &SonaEngine,
|
||||
output_dir: P,
|
||||
) -> Result<ExportResult, ExportError> {
|
||||
let output_dir = output_dir.as_ref();
|
||||
std::fs::create_dir_all(output_dir).map_err(ExportError::Io)?;
|
||||
|
||||
// Get LoRA state from engine
|
||||
let lora_state = engine.export_lora_state();
|
||||
|
||||
// Build tensor data map
|
||||
let mut tensors: HashMap<String, TensorData> = HashMap::new();
|
||||
|
||||
// Export MicroLoRA weights (rank 1-2)
|
||||
for (i, layer) in lora_state.micro_lora_layers.iter().enumerate() {
|
||||
let a_key = format!(
|
||||
"base_model.model.layers.{}.self_attn.micro_lora_A.weight",
|
||||
i
|
||||
);
|
||||
let b_key = format!(
|
||||
"base_model.model.layers.{}.self_attn.micro_lora_B.weight",
|
||||
i
|
||||
);
|
||||
|
||||
tensors.insert(
|
||||
a_key,
|
||||
TensorData {
|
||||
data: layer.lora_a.clone(),
|
||||
shape: vec![layer.rank, layer.input_dim],
|
||||
dtype: "F32".to_string(),
|
||||
},
|
||||
);
|
||||
|
||||
tensors.insert(
|
||||
b_key,
|
||||
TensorData {
|
||||
data: layer.lora_b.clone(),
|
||||
shape: vec![layer.output_dim, layer.rank],
|
||||
dtype: "F32".to_string(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Export BaseLoRA weights (rank 4-16)
|
||||
for (i, layer) in lora_state.base_lora_layers.iter().enumerate() {
|
||||
// Q projection
|
||||
let q_a_key = format!(
|
||||
"base_model.model.layers.{}.self_attn.q_proj.lora_A.weight",
|
||||
i
|
||||
);
|
||||
let q_b_key = format!(
|
||||
"base_model.model.layers.{}.self_attn.q_proj.lora_B.weight",
|
||||
i
|
||||
);
|
||||
|
||||
tensors.insert(
|
||||
q_a_key,
|
||||
TensorData {
|
||||
data: layer.lora_a.clone(),
|
||||
shape: vec![layer.rank, layer.input_dim],
|
||||
dtype: "F32".to_string(),
|
||||
},
|
||||
);
|
||||
|
||||
tensors.insert(
|
||||
q_b_key,
|
||||
TensorData {
|
||||
data: layer.lora_b.clone(),
|
||||
shape: vec![layer.output_dim, layer.rank],
|
||||
dtype: "F32".to_string(),
|
||||
},
|
||||
);
|
||||
|
||||
// K projection
|
||||
let k_a_key = format!(
|
||||
"base_model.model.layers.{}.self_attn.k_proj.lora_A.weight",
|
||||
i
|
||||
);
|
||||
let k_b_key = format!(
|
||||
"base_model.model.layers.{}.self_attn.k_proj.lora_B.weight",
|
||||
i
|
||||
);
|
||||
|
||||
tensors.insert(
|
||||
k_a_key,
|
||||
TensorData {
|
||||
data: layer.lora_a.clone(),
|
||||
shape: vec![layer.rank, layer.input_dim],
|
||||
dtype: "F32".to_string(),
|
||||
},
|
||||
);
|
||||
|
||||
tensors.insert(
|
||||
k_b_key,
|
||||
TensorData {
|
||||
data: layer.lora_b.clone(),
|
||||
shape: vec![layer.output_dim, layer.rank],
|
||||
dtype: "F32".to_string(),
|
||||
},
|
||||
);
|
||||
|
||||
// V projection
|
||||
let v_a_key = format!(
|
||||
"base_model.model.layers.{}.self_attn.v_proj.lora_A.weight",
|
||||
i
|
||||
);
|
||||
let v_b_key = format!(
|
||||
"base_model.model.layers.{}.self_attn.v_proj.lora_B.weight",
|
||||
i
|
||||
);
|
||||
|
||||
tensors.insert(
|
||||
v_a_key,
|
||||
TensorData {
|
||||
data: layer.lora_a.clone(),
|
||||
shape: vec![layer.rank, layer.input_dim],
|
||||
dtype: "F32".to_string(),
|
||||
},
|
||||
);
|
||||
|
||||
tensors.insert(
|
||||
v_b_key,
|
||||
TensorData {
|
||||
data: layer.lora_b.clone(),
|
||||
shape: vec![layer.output_dim, layer.rank],
|
||||
dtype: "F32".to_string(),
|
||||
},
|
||||
);
|
||||
|
||||
// O projection
|
||||
let o_a_key = format!(
|
||||
"base_model.model.layers.{}.self_attn.o_proj.lora_A.weight",
|
||||
i
|
||||
);
|
||||
let o_b_key = format!(
|
||||
"base_model.model.layers.{}.self_attn.o_proj.lora_B.weight",
|
||||
i
|
||||
);
|
||||
|
||||
tensors.insert(
|
||||
o_a_key,
|
||||
TensorData {
|
||||
data: layer.lora_a.clone(),
|
||||
shape: vec![layer.rank, layer.input_dim],
|
||||
dtype: "F32".to_string(),
|
||||
},
|
||||
);
|
||||
|
||||
tensors.insert(
|
||||
o_b_key,
|
||||
TensorData {
|
||||
data: layer.lora_b.clone(),
|
||||
shape: vec![layer.output_dim, layer.rank],
|
||||
dtype: "F32".to_string(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Serialize to SafeTensors format
|
||||
let safetensors_path = output_dir.join("adapter_model.safetensors");
|
||||
let bytes = self.serialize_safetensors(&tensors)?;
|
||||
std::fs::write(&safetensors_path, &bytes).map_err(ExportError::Io)?;
|
||||
|
||||
let size_bytes = bytes.len() as u64;
|
||||
|
||||
Ok(ExportResult {
|
||||
export_type: ExportType::SafeTensors,
|
||||
items_exported: tensors.len(),
|
||||
output_path: safetensors_path.to_string_lossy().to_string(),
|
||||
size_bytes,
|
||||
})
|
||||
}
|
||||
|
||||
/// Serialize tensors to SafeTensors binary format
|
||||
fn serialize_safetensors(
|
||||
&self,
|
||||
tensors: &HashMap<String, TensorData>,
|
||||
) -> Result<Vec<u8>, ExportError> {
|
||||
// SafeTensors format:
|
||||
// 8 bytes: header size (little endian u64)
|
||||
// N bytes: JSON header with tensor metadata
|
||||
// ... tensor data (aligned to 8 bytes)
|
||||
|
||||
let mut header_data: HashMap<String, TensorMetadata> = HashMap::new();
|
||||
let mut tensor_bytes: Vec<u8> = Vec::new();
|
||||
|
||||
// Sort keys for deterministic output
|
||||
let mut keys: Vec<_> = tensors.keys().collect();
|
||||
keys.sort();
|
||||
|
||||
for key in keys {
|
||||
let tensor = &tensors[key];
|
||||
|
||||
// Align to 8 bytes
|
||||
let padding = (8 - (tensor_bytes.len() % 8)) % 8;
|
||||
tensor_bytes.extend(vec![0u8; padding]);
|
||||
|
||||
let start_offset = tensor_bytes.len();
|
||||
|
||||
// Write tensor data
|
||||
for &val in &tensor.data {
|
||||
tensor_bytes.extend_from_slice(&val.to_le_bytes());
|
||||
}
|
||||
|
||||
let end_offset = tensor_bytes.len();
|
||||
|
||||
header_data.insert(
|
||||
key.clone(),
|
||||
TensorMetadata {
|
||||
dtype: tensor.dtype.clone(),
|
||||
shape: tensor.shape.clone(),
|
||||
data_offsets: [start_offset, end_offset],
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Serialize header to JSON
|
||||
let header_json =
|
||||
serde_json::to_string(&header_data).map_err(ExportError::Serialization)?;
|
||||
let header_bytes = header_json.as_bytes();
|
||||
|
||||
// Build final buffer
|
||||
let mut result = Vec::new();
|
||||
|
||||
// Header size (8 bytes, little endian)
|
||||
result.extend_from_slice(&(header_bytes.len() as u64).to_le_bytes());
|
||||
|
||||
// Header JSON
|
||||
result.extend_from_slice(header_bytes);
|
||||
|
||||
// Tensor data
|
||||
result.extend(tensor_bytes);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
/// Tensor data for export
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct TensorData {
|
||||
/// Flattened tensor values
|
||||
pub data: Vec<f32>,
|
||||
/// Tensor shape
|
||||
pub shape: Vec<usize>,
|
||||
/// Data type (F32, F16, BF16, etc.)
|
||||
pub dtype: String,
|
||||
}
|
||||
|
||||
/// Tensor metadata for SafeTensors header
|
||||
#[cfg(feature = "serde-support")]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct TensorMetadata {
|
||||
dtype: String,
|
||||
shape: Vec<usize>,
|
||||
data_offsets: [usize; 2],
|
||||
}
|
||||
|
||||
/// LoRA layer state for export
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LoRALayerState {
|
||||
/// LoRA A matrix (rank x input_dim)
|
||||
pub lora_a: Vec<f32>,
|
||||
/// LoRA B matrix (output_dim x rank)
|
||||
pub lora_b: Vec<f32>,
|
||||
/// LoRA rank
|
||||
pub rank: usize,
|
||||
/// Input dimension
|
||||
pub input_dim: usize,
|
||||
/// Output dimension
|
||||
pub output_dim: usize,
|
||||
}
|
||||
|
||||
/// Complete LoRA state for export
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct LoRAState {
|
||||
/// MicroLoRA layers (instant adaptation)
|
||||
pub micro_lora_layers: Vec<LoRALayerState>,
|
||||
/// BaseLoRA layers (background learning)
|
||||
pub base_lora_layers: Vec<LoRALayerState>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_tensor_data_creation() {
|
||||
let tensor = TensorData {
|
||||
data: vec![1.0, 2.0, 3.0, 4.0],
|
||||
shape: vec![2, 2],
|
||||
dtype: "F32".to_string(),
|
||||
};
|
||||
|
||||
assert_eq!(tensor.data.len(), 4);
|
||||
assert_eq!(tensor.shape, vec![2, 2]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lora_layer_state() {
|
||||
let state = LoRALayerState {
|
||||
lora_a: vec![0.1, 0.2, 0.3, 0.4],
|
||||
lora_b: vec![0.5, 0.6, 0.7, 0.8],
|
||||
rank: 2,
|
||||
input_dim: 2,
|
||||
output_dim: 2,
|
||||
};
|
||||
|
||||
assert_eq!(state.rank, 2);
|
||||
assert_eq!(state.lora_a.len(), 4);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user