Files
wifi-densepose/npm/packages/agentic-synth/examples/self-learning/continual-learning.ts
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

726 lines
23 KiB
TypeScript

/**
* Continual Learning Dataset Generation
*
* This example demonstrates:
* - Incremental training data generation
* - Domain adaptation scenarios
* - Catastrophic forgetting prevention data
* - Transfer learning datasets
*/
import { AgenticSynth, createSynth } from '../../src/index.js';
import type { GenerationResult } from '../../src/types.js';
// ============================================================================
// Example 1: Incremental Training Data
// ============================================================================
/**
* Generate incremental training batches for continual learning
*/
export async function generateIncrementalData() {
console.log('\n📈 Example 1: Incremental Training Data\n');
const synth = createSynth({
provider: 'gemini',
apiKey: process.env.GEMINI_API_KEY || 'demo-key',
cacheStrategy: 'memory',
});
// Generate data for multiple training phases
const phases = [];
for (let phase = 1; phase <= 5; phase++) {
console.log(`\nGenerating Phase ${phase} data...`);
const phaseData = await synth.generateStructured({
count: 200,
schema: {
sample_id: 'UUID',
phase: `number (${phase})`,
// Features (gradually evolving)
features: {
core_feature_1: 'number (0-100)',
core_feature_2: 'number (0-100)',
// New features introduced in later phases
phase_specific_feature: `number (0-100) or null (null if phase < ${Math.min(phase + 1, 3)})`,
evolving_feature: `number (${phase * 10}-${(phase + 1) * 10})`,
},
// Label (distribution shifts over phases)
label: 'number (0-4)',
label_distribution_bias: `number (${phase - 1}-${phase}, bias toward class ${phase - 1})`,
// Data characteristics
noise_level: `number (${0.05 * phase}-${0.05 * (phase + 1)}, increasing noise)`,
difficulty: 'easy | medium | hard',
// Metadata
timestamp: 'ISO timestamp',
data_source: `source_${phase}`,
},
constraints: [
`Label distribution should be biased toward class ${phase - 1}`,
'Noise level should increase with phase number',
'Difficulty should vary across phases',
'phase_specific_feature should be null for early phases',
],
});
console.log(` - Generated ${phaseData.data.length} samples`);
console.log(` - Avg noise level: ${calculateAverage(phaseData.data, 'noise_level').toFixed(3)}`);
console.log(` - Label distribution:`, getLabelDistribution(phaseData.data));
phases.push(phaseData);
}
console.log('\n✅ Incremental data generation complete');
console.log(`Total phases: ${phases.length}`);
console.log(`Total samples: ${phases.reduce((sum, p) => sum + p.data.length, 0)}`);
return phases;
}
// ============================================================================
// Example 2: Domain Adaptation Scenarios
// ============================================================================
/**
* Generate source and target domain data for domain adaptation
*/
export async function generateDomainAdaptationData() {
console.log('\n🌍 Example 2: Domain Adaptation Data\n');
const synth = createSynth({
provider: 'gemini',
apiKey: process.env.GEMINI_API_KEY || 'demo-key',
});
// Source domain: Product reviews from electronics
console.log('Generating source domain data (Electronics Reviews)...');
const sourceData = await synth.generateStructured({
count: 300,
schema: {
review_id: 'UUID',
domain: 'string (source_electronics)',
// Text data
review_text: 'product review for electronics (2-4 sentences)',
sentiment: 'positive | negative | neutral',
// Domain-specific features
domain_features: {
mentions_battery: 'boolean',
mentions_screen: 'boolean',
mentions_performance: 'boolean',
technical_terms_count: 'number (0-10)',
},
// Labels
rating: 'number (1-5)',
helpful_votes: 'number (0-100)',
// Feature representation
feature_vector: ['array of 50 numbers (0-1, embedding)'],
timestamp: 'ISO timestamp',
},
constraints: [
'Sentiment should correlate with rating',
'Technical terms should be common (avg 3-5)',
'Electronics-specific vocabulary',
],
});
console.log(` - Source samples: ${sourceData.data.length}`);
console.log(` - Avg rating: ${calculateAverage(sourceData.data, 'rating').toFixed(2)}`);
// Target domain: Product reviews from home goods (different distribution)
console.log('\nGenerating target domain data (Home Goods Reviews)...');
const targetData = await synth.generateStructured({
count: 300,
schema: {
review_id: 'UUID',
domain: 'string (target_home_goods)',
// Text data
review_text: 'product review for home goods/furniture (2-4 sentences)',
sentiment: 'positive | negative | neutral',
// Domain-specific features (different from source)
domain_features: {
mentions_comfort: 'boolean',
mentions_quality: 'boolean',
mentions_design: 'boolean',
technical_terms_count: 'number (0-3, fewer technical terms)',
},
// Labels (same task, different domain)
rating: 'number (1-5)',
helpful_votes: 'number (0-100)',
// Feature representation (different distribution)
feature_vector: ['array of 50 numbers (0-1, embedding with distribution shift)'],
timestamp: 'ISO timestamp',
},
constraints: [
'Sentiment should correlate with rating',
'Fewer technical terms than source domain',
'Home goods-specific vocabulary',
'Feature vectors should have different distribution than source',
],
});
console.log(` - Target samples: ${targetData.data.length}`);
console.log(` - Avg rating: ${calculateAverage(targetData.data, 'rating').toFixed(2)}`);
// Generate small labeled target set for adaptation
console.log('\nGenerating labeled target samples for adaptation...');
const labeledTargetData = await synth.generateStructured({
count: 50, // Small labeled set
schema: {
review_id: 'UUID',
domain: 'string (target_home_goods_labeled)',
review_text: 'product review for home goods/furniture (2-4 sentences)',
sentiment: 'positive | negative | neutral',
domain_features: {
mentions_comfort: 'boolean',
mentions_quality: 'boolean',
mentions_design: 'boolean',
technical_terms_count: 'number (0-3)',
},
rating: 'number (1-5)',
helpful_votes: 'number (0-100)',
feature_vector: ['array of 50 numbers (0-1)'],
// Adaptation metadata
used_for_adaptation: 'boolean (true)',
similarity_to_source: 'number (0-1, measure of domain similarity)',
timestamp: 'ISO timestamp',
},
});
console.log(` - Labeled target samples: ${labeledTargetData.data.length}`);
console.log('\n✅ Domain adaptation data generated');
return {
source: sourceData,
target: targetData,
labeledTarget: labeledTargetData,
};
}
// ============================================================================
// Example 3: Catastrophic Forgetting Prevention Data
// ============================================================================
/**
* Generate replay buffer and interleaved training data
*/
export async function generateAntiCatastrophicData() {
console.log('\n🧠 Example 3: Catastrophic Forgetting Prevention\n');
const synth = createSynth({
provider: 'gemini',
apiKey: process.env.GEMINI_API_KEY || 'demo-key',
});
// Task 1: Image classification (animals)
console.log('Generating Task 1 data (Animal Classification)...');
const task1Data = await synth.generateStructured({
count: 200,
schema: {
sample_id: 'UUID',
task_id: 'number (1)',
task_name: 'string (animal_classification)',
// Image features (simulated)
image_features: ['array of 100 numbers (0-1, CNN features)'],
// Labels
category: 'cat | dog | bird | fish',
subcategory: 'specific breed or species',
// Importance for replay
importance_score: 'number (0-1, for experience replay)',
difficulty: 'number (0-1)',
timestamp: 'ISO timestamp',
},
});
console.log(` - Task 1 samples: ${task1Data.data.length}`);
// Task 2: Image classification (vehicles) - New task
console.log('\nGenerating Task 2 data (Vehicle Classification)...');
const task2Data = await synth.generateStructured({
count: 200,
schema: {
sample_id: 'UUID',
task_id: 'number (2)',
task_name: 'string (vehicle_classification)',
// Image features (different distribution)
image_features: ['array of 100 numbers (0-1, CNN features)'],
// Labels (different classes)
category: 'car | truck | motorcycle | bicycle',
subcategory: 'specific model or type',
importance_score: 'number (0-1)',
difficulty: 'number (0-1)',
timestamp: 'ISO timestamp',
},
});
console.log(` - Task 2 samples: ${task2Data.data.length}`);
// Generate replay buffer (selected samples from Task 1)
console.log('\nGenerating replay buffer...');
const replayBuffer = await synth.generateStructured({
count: 50, // 25% of Task 1
schema: {
sample_id: 'UUID',
task_id: 'number (1)',
task_name: 'string (animal_classification)',
image_features: ['array of 100 numbers (0-1)'],
category: 'cat | dog | bird | fish',
subcategory: 'specific breed or species',
// Replay metadata
importance_score: 'number (0.5-1.0, high importance)',
replay_count: 'number (0-5)',
last_replayed: 'ISO timestamp',
is_replay_sample: 'boolean (true)',
timestamp: 'ISO timestamp',
},
constraints: [
'importance_score should be high (>0.5)',
'Select diverse and difficult samples for replay',
],
});
console.log(` - Replay buffer size: ${replayBuffer.data.length}`);
// Generate interleaved training data
console.log('\nGenerating interleaved training batches...');
const interleavedBatch = await synth.generateStructured({
count: 100,
schema: {
batch_id: 'UUID',
batch_number: 'number (1-20)',
// Mix of Task 2 (new) and Task 1 (replay)
samples: [
{
sample_id: 'UUID',
task_id: 'number (1 or 2)',
is_replay: 'boolean (true for task_id=1)',
features: ['array of 100 numbers'],
label: 'string',
},
],
// Batch composition
task1_ratio: 'number (0.2-0.3, 20-30% replay)',
task2_ratio: 'number (0.7-0.8, 70-80% new task)',
// Forgetting metrics
task1_performance_estimate: 'number (0.7-0.95, should stay high)',
timestamp: 'ISO timestamp',
},
constraints: [
'Each batch should contain 20-30% Task 1 samples (replay)',
'Replay samples should maintain Task 1 performance',
],
});
console.log(` - Interleaved batches: ${interleavedBatch.data.length}`);
console.log('\n✅ Anti-catastrophic forgetting data generated');
return {
task1: task1Data,
task2: task2Data,
replay: replayBuffer,
interleaved: interleavedBatch,
};
}
// ============================================================================
// Example 4: Transfer Learning Datasets
// ============================================================================
/**
* Generate pre-training and fine-tuning datasets
*/
export async function generateTransferLearningData() {
console.log('\n🔄 Example 4: Transfer Learning Datasets\n');
const synth = createSynth({
provider: 'gemini',
apiKey: process.env.GEMINI_API_KEY || 'demo-key',
});
// Pre-training data: Large, general dataset
console.log('Generating pre-training data (General Text)...');
const pretrainingData = await synth.generateStructured({
count: 1000,
schema: {
sample_id: 'UUID',
stage: 'string (pretraining)',
// General text data
text: 'general text passage (3-5 sentences)',
domain: 'news | wikipedia | books | web',
// Self-supervised labels
masked_tokens: ['array of masked token positions'],
next_sentence_label: 'boolean (is next sentence)',
// Features
embedding: ['array of 768 numbers (transformer embedding)'],
token_count: 'number (50-200)',
timestamp: 'ISO timestamp',
},
constraints: [
'Diverse domains and topics',
'General language patterns',
'High-quality, grammatical text',
],
});
console.log(` - Pre-training samples: ${pretrainingData.data.length}`);
// Fine-tuning data: Smaller, task-specific dataset
console.log('\nGenerating fine-tuning data (Sentiment Analysis)...');
const finetuningData = await synth.generateStructured({
count: 200,
schema: {
sample_id: 'UUID',
stage: 'string (finetuning)',
// Task-specific text
text: 'product or movie review (2-4 sentences)',
domain: 'string (reviews)',
// Supervised labels
sentiment: 'positive | negative | neutral',
confidence: 'number (0-1)',
// Features (initialized from pre-trained model)
embedding: ['array of 768 numbers (fine-tuned embedding)'],
token_count: 'number (30-150)',
// Fine-tuning metadata
learning_phase: 'early | middle | late',
layer_to_finetune: 'all | last_2 | last_4 | classifier_only',
timestamp: 'ISO timestamp',
},
constraints: [
'Domain-specific vocabulary',
'Clear sentiment labels',
'Smaller dataset than pre-training',
],
});
console.log(` - Fine-tuning samples: ${finetuningData.data.length}`);
// Generate few-shot learning data
console.log('\nGenerating few-shot learning data...');
const fewShotData = await synth.generateStructured({
count: 50, // Very small
schema: {
sample_id: 'UUID',
stage: 'string (few_shot)',
// Task-specific examples
text: 'specialized domain text (legal, medical, technical)',
domain: 'legal | medical | scientific',
// Labels
category: 'specialized category',
requires_expertise: 'boolean (true)',
// Few-shot metadata
support_set: 'boolean (used as few-shot example)',
shot_number: 'number (1-5, which shot in few-shot set)',
embedding: ['array of 768 numbers'],
timestamp: 'ISO timestamp',
},
constraints: [
'Highly specialized domain',
'Very limited samples (few-shot)',
'Clear, prototypical examples',
],
});
console.log(` - Few-shot samples: ${fewShotData.data.length}`);
console.log('\n✅ Transfer learning data generated');
console.log('Data pipeline: Pre-training → Fine-tuning → Few-shot');
return {
pretraining: pretrainingData,
finetuning: finetuningData,
fewShot: fewShotData,
};
}
// ============================================================================
// Example 5: Curriculum Learning Data
// ============================================================================
/**
* Generate data organized by difficulty for curriculum learning
*/
export async function generateCurriculumData() {
console.log('\n🎓 Example 5: Curriculum Learning Data\n');
const synth = createSynth({
provider: 'gemini',
apiKey: process.env.GEMINI_API_KEY || 'demo-key',
});
const difficulties = ['easy', 'medium', 'hard', 'expert'];
const curriculum = [];
for (const difficulty of difficulties) {
console.log(`\nGenerating ${difficulty} difficulty data...`);
const difficultyData = await synth.generateStructured({
count: 150,
schema: {
sample_id: 'UUID',
difficulty_level: `string (${difficulty})`,
curriculum_stage: `number (${difficulties.indexOf(difficulty) + 1})`,
// Math problem (example task)
problem: {
question: `math word problem (${difficulty} difficulty)`,
steps_required: `number (${difficulties.indexOf(difficulty) + 1}-${difficulties.indexOf(difficulty) + 3})`,
concepts: [`array of ${difficulties.indexOf(difficulty) + 1}-${difficulties.indexOf(difficulty) + 2} math concepts`],
},
// Solution
solution: {
answer: 'correct numerical answer',
explanation: 'step-by-step solution',
intermediate_steps: ['array of solution steps'],
},
// Difficulty metrics
estimated_time_seconds: `number (${(difficulties.indexOf(difficulty) + 1) * 30}-${(difficulties.indexOf(difficulty) + 2) * 30})`,
concept_complexity: `number (${difficulties.indexOf(difficulty) + 1}-${difficulties.indexOf(difficulty) + 2})`,
prerequisite_skills: [`array of required skills (more for harder problems)`],
// Learning metadata
success_rate_expected: `number (${0.9 - difficulties.indexOf(difficulty) * 0.15}-${0.95 - difficulties.indexOf(difficulty) * 0.15})`,
timestamp: 'ISO timestamp',
},
constraints: [
`Problems should be ${difficulty} difficulty`,
'Success rate should decrease with difficulty',
'More concepts required for harder problems',
'Prerequisite skills accumulate',
],
});
console.log(` - ${difficulty} samples: ${difficultyData.data.length}`);
console.log(` - Avg steps: ${calculateAverage(difficultyData.data, (d: any) => d.problem.steps_required)}`);
curriculum.push({
stage: difficulties.indexOf(difficulty) + 1,
difficulty,
data: difficultyData,
});
}
console.log('\n✅ Curriculum learning data generated');
console.log(`Curriculum stages: ${curriculum.length}`);
console.log('Learning progression: easy → medium → hard → expert');
return curriculum;
}
// ============================================================================
// Example 6: Online Learning Stream
// ============================================================================
/**
* Generate streaming data for online learning
*/
export async function generateOnlineLearningStream() {
console.log('\n📡 Example 6: Online Learning Stream\n');
const synth = createSynth({
provider: 'gemini',
apiKey: process.env.GEMINI_API_KEY || 'demo-key',
});
// Generate time-series data stream
const streamData = await synth.generateTimeSeries({
count: 500, // 500 time points
interval: '1m', // One sample per minute
schema: {
timestamp: 'ISO timestamp',
sequence_number: 'number (sequential)',
// Incoming data point
features: ['array of 20 numbers (0-1)'],
label: 'number (0-4)',
// Distribution characteristics
distribution_shift: 'number (0-1, gradual increase over time)',
concept_drift_indicator: 'boolean',
// Model state
current_model_accuracy: 'number (0.7-0.95, may degrade over time)',
should_update_model: 'boolean (true if drift detected)',
// Online learning metadata
learning_rate: 'number (0.0001-0.01)',
update_applied: 'boolean',
samples_since_update: 'number (0-100)',
// Performance tracking
prediction_error: 'number (0-1)',
cumulative_regret: 'number (increasing)',
},
trend: 'stable',
seasonality: false,
constraints: [
'Distribution shift should gradually increase',
'Model accuracy should correlate inversely with drift',
'should_update_model when accuracy drops or drift detected',
'cumulative_regret increases when predictions are wrong',
],
});
const driftPoints = streamData.data.filter((d: any) => d.concept_drift_indicator);
console.log('Online Learning Stream:');
console.log(`- Stream length: ${streamData.data.length} samples`);
console.log(`- Concept drift points: ${driftPoints.length}`);
console.log(`- Avg accuracy: ${calculateAverage(streamData.data, 'current_model_accuracy').toFixed(3)}`);
console.log(`- Model updates: ${streamData.data.filter((d: any) => d.update_applied).length}`);
console.log('\n✅ Online learning stream generated');
return streamData;
}
// ============================================================================
// Utility Functions
// ============================================================================
function calculateAverage(data: any[], field: string | ((d: any) => number)): number {
const values =
typeof field === 'string'
? data.map((d) => d[field]).filter((v) => typeof v === 'number')
: data.map(field).filter((v) => typeof v === 'number');
if (values.length === 0) return 0;
return values.reduce((a, b) => a + b, 0) / values.length;
}
function getLabelDistribution(data: any[]): Record<string, number> {
const dist: Record<string, number> = {};
data.forEach((d) => {
const label = d.label.toString();
dist[label] = (dist[label] || 0) + 1;
});
return dist;
}
// ============================================================================
// Complete Continual Learning Pipeline
// ============================================================================
/**
* Demonstrate complete continual learning pipeline
*/
export async function completeContinualLearningPipeline() {
console.log('\n🚀 Complete Continual Learning Pipeline\n');
console.log('='.repeat(60));
console.log('\nStage 1: Initial Training with Curriculum');
const curriculum = await generateCurriculumData();
console.log('\nStage 2: Domain Adaptation');
const domainData = await generateDomainAdaptationData();
console.log('\nStage 3: Incremental Learning');
const incrementalData = await generateIncrementalData();
console.log('\nStage 4: Catastrophic Forgetting Prevention');
const antiForgetData = await generateAntiCatastrophicData();
console.log('\nStage 5: Online Learning');
const onlineData = await generateOnlineLearningStream();
console.log('\n' + '='.repeat(60));
console.log('✅ Complete continual learning pipeline executed');
console.log('\nPipeline Summary:');
console.log(' 1. Curriculum Learning (easy → hard)');
console.log(' 2. Domain Adaptation (source → target)');
console.log(' 3. Incremental Learning (phase 1 → phase N)');
console.log(' 4. Experience Replay (prevent forgetting)');
console.log(' 5. Online Learning (continuous stream)');
}
// ============================================================================
// Run All Examples
// ============================================================================
export async function runAllContinualLearningExamples() {
console.log('🎯 Continual Learning Dataset Generation\n');
console.log('='.repeat(60));
try {
await generateIncrementalData();
console.log('='.repeat(60));
await generateDomainAdaptationData();
console.log('='.repeat(60));
await generateAntiCatastrophicData();
console.log('='.repeat(60));
await generateTransferLearningData();
console.log('='.repeat(60));
await generateCurriculumData();
console.log('='.repeat(60));
await generateOnlineLearningStream();
console.log('='.repeat(60));
console.log('\n✅ All continual learning examples completed!\n');
} catch (error: any) {
console.error('❌ Error:', error.message);
}
}
// Run if executed directly
if (import.meta.url === `file://${process.argv[1]}`) {
runAllContinualLearningExamples().catch(console.error);
}