git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
386 lines
12 KiB
TypeScript
386 lines
12 KiB
TypeScript
/**
 * @fileoverview Unit tests for the embeddings integration module
 *
 * @author ruv.io Team <info@ruv.io>
 * @license MIT
 */
|
|
|
|
import { describe, it, mock } from 'node:test';
|
|
import assert from 'node:assert';
|
|
import {
|
|
EmbeddingProvider,
|
|
OpenAIEmbeddings,
|
|
CohereEmbeddings,
|
|
AnthropicEmbeddings,
|
|
HuggingFaceEmbeddings,
|
|
type BatchEmbeddingResult,
|
|
type EmbeddingError,
|
|
} from '../src/embeddings.js';
|
|
|
|
// ============================================================================
// Mock Implementation for Testing
// ============================================================================
|
|
|
|
/**
 * In-memory stand-in for a real embedding backend.
 *
 * Returns one randomly filled vector per input text and never performs I/O.
 * The token count reported for each text is simply its character length.
 */
class MockEmbeddingProvider extends EmbeddingProvider {
  private dimension: number;
  private batchSize: number;

  /**
   * @param dimension - Length of every generated vector (defaults to 384).
   * @param batchSize - Value reported by {@link getMaxBatchSize} (defaults to 10).
   */
  constructor(dimension = 384, batchSize = 10) {
    super();
    this.batchSize = batchSize;
    this.dimension = dimension;
  }

  /** @returns The vector length configured at construction. */
  getDimension(): number {
    return this.dimension;
  }

  /** @returns The batch size configured at construction. */
  getMaxBatchSize(): number {
    return this.batchSize;
  }

  /**
   * Produces a random embedding for each text in a single pass.
   *
   * @param texts - Inputs to embed; an empty array yields an empty result.
   * @returns One entry per input (carrying the input's position as `index`),
   *   the summed character count as `totalTokens`, and fixed mock metadata.
   */
  async embedTexts(texts: string[]): Promise<BatchEmbeddingResult> {
    const randomVector = (): number[] =>
      Array.from({ length: this.dimension }, () => Math.random());

    // Accumulate the total while building the entries instead of a second pass.
    let totalTokens = 0;
    const embeddings = texts.map((text, index) => {
      totalTokens += text.length;
      return {
        embedding: randomVector(),
        index,
        tokens: text.length,
      };
    });

    return {
      embeddings,
      totalTokens,
      metadata: {
        provider: 'mock',
        model: 'mock-model',
      },
    };
  }
}
|
|
|
|
// ============================================================================
// Tests for Base EmbeddingProvider
// ============================================================================
|
|
|
|
/**
 * Drives the base-class surface through MockEmbeddingProvider: single-text
 * embedding (inherited `embedText`), multi-text embedding, empty input, and
 * index coverage when the input count exceeds the configured batch size.
 */
describe('EmbeddingProvider (Abstract Base)', () => {
  const DIMENSION = 384;

  it('should embed single text', async () => {
    const mockProvider = new MockEmbeddingProvider(DIMENSION);

    const vector = await mockProvider.embedText('Hello, world!');

    assert.strictEqual(vector.length, 384);
    assert.ok(Array.isArray(vector));
    assert.ok(vector.every(component => typeof component === 'number'));
  });

  it('should embed multiple texts', async () => {
    const mockProvider = new MockEmbeddingProvider(DIMENSION);

    const outcome = await mockProvider.embedTexts([
      'First text',
      'Second text',
      'Third text',
    ]);

    assert.strictEqual(outcome.embeddings.length, 3);
    assert.ok(outcome.totalTokens > 0);
    assert.strictEqual(outcome.metadata?.provider, 'mock');
  });

  it('should handle empty text array', async () => {
    const mockProvider = new MockEmbeddingProvider(DIMENSION);

    const { embeddings } = await mockProvider.embedTexts([]);

    assert.strictEqual(embeddings.length, 0);
  });

  it('should create batches correctly', async () => {
    const inputCount = 12;
    const mockProvider = new MockEmbeddingProvider(DIMENSION, 5);
    const inputs = Array.from({ length: inputCount }, (_, i) => `Text ${i}`);

    const { embeddings } = await mockProvider.embedTexts(inputs);

    assert.strictEqual(embeddings.length, inputCount);

    // Every input position must appear exactly once, whatever the arrival order.
    const seenIndices = embeddings.map(entry => entry.index);
    seenIndices.sort((left, right) => left - right);
    const expectedIndices = inputs.map((_, position) => position);
    assert.deepStrictEqual(seenIndices, expectedIndices);
  });
});
|
|
|
|
// ============================================================================
// Tests for OpenAI Provider (Mock)
// ============================================================================
|
|
|
|
/**
 * Tests for OpenAIEmbeddings in an environment where the OpenAI SDK is absent.
 *
 * NOTE(review): the two constructor tests expect construction to throw
 * "OpenAI SDK not found"; they will presumably fail once the SDK is installed —
 * confirm the intended test environment.
 */
describe('OpenAIEmbeddings', () => {
  /** Attempts construction with a dummy key; the instance itself is not needed. */
  const construct = (): void => {
    new OpenAIEmbeddings({ apiKey: 'test-key' });
  };

  it('should throw error if OpenAI SDK not installed', () => {
    assert.throws(construct, /OpenAI SDK not found/);
  });

  it('should have correct default configuration', () => {
    // This would work if OpenAI SDK is installed.
    // For now, we test the error case.
    //
    // assert.throws replaces the former try / assert.fail / catch sequence: a
    // constructor that stops throwing is now reported as "missing expected
    // exception" instead of failing on an unrelated message check.
    assert.throws(construct, (error: unknown) => {
      const message = (error as { message?: unknown } | null)?.message;
      return typeof message === 'string' && message.includes('OpenAI SDK not found');
    });
  });

  it('should return correct dimensions', () => {
    // Mock test - would need OpenAI SDK installed.
    // NOTE(review): this compares a local table with a literal and never calls
    // OpenAIEmbeddings; it documents the expected values rather than verifying them.
    const expectedDimensions = {
      'text-embedding-3-small': 1536,
      'text-embedding-3-large': 3072,
      'text-embedding-ada-002': 1536,
    };

    assert.strictEqual(expectedDimensions['text-embedding-3-small'], 1536);
  });

  it('should have correct max batch size', () => {
    // OpenAI supports up to 2048 inputs per request.
    // NOTE(review): placeholder — the constant is compared with itself.
    const expectedBatchSize = 2048;
    assert.strictEqual(expectedBatchSize, 2048);
  });
});
|
|
|
|
// ============================================================================
// Tests for Cohere Provider (Mock)
// ============================================================================
|
|
|
|
/**
 * Tests for CohereEmbeddings: construction is expected to throw when the
 * Cohere SDK is missing; the remaining cases pin the documented constants.
 */
describe('CohereEmbeddings', () => {
  it('should throw error if Cohere SDK not installed', () => {
    const attemptConstruction = (): void => {
      new CohereEmbeddings({ apiKey: 'test-key' });
    };

    assert.throws(attemptConstruction, /Cohere SDK not found/);
  });

  it('should return correct dimensions', () => {
    // Cohere v3 models use 1024 dimensions
    const cohereDimension = 1024;

    assert.strictEqual(cohereDimension, 1024);
  });

  it('should have correct max batch size', () => {
    // Cohere supports up to 96 texts per request
    const cohereBatchLimit = 96;

    assert.strictEqual(cohereBatchLimit, 96);
  });
});
|
|
|
|
// ============================================================================
// Tests for Anthropic Provider (Mock)
// ============================================================================
|
|
|
|
/**
 * Tests for AnthropicEmbeddings: construction is expected to throw when the
 * Anthropic SDK is missing; the remaining cases pin the documented constants.
 */
describe('AnthropicEmbeddings', () => {
  it('should throw error if Anthropic SDK not installed', () => {
    const attemptConstruction = (): void => {
      new AnthropicEmbeddings({ apiKey: 'test-key' });
    };

    assert.throws(attemptConstruction, /Anthropic SDK not found/);
  });

  it('should return correct dimensions', () => {
    // Voyage-2 uses 1024 dimensions
    const voyageDimension = 1024;

    assert.strictEqual(voyageDimension, 1024);
  });

  it('should have correct max batch size', () => {
    const voyageBatchLimit = 128;

    assert.strictEqual(voyageBatchLimit, 128);
  });
});
|
|
|
|
// ============================================================================
// Tests for HuggingFace Provider (Mock)
// ============================================================================
|
|
|
|
/**
 * Tests for HuggingFaceEmbeddings: default and custom configuration values
 * reported by the accessors, and that construction alone does not throw.
 */
describe('HuggingFaceEmbeddings', () => {
  it('should create with default config', () => {
    const localProvider = new HuggingFaceEmbeddings();

    assert.strictEqual(localProvider.getDimension(), 384);
    assert.strictEqual(localProvider.getMaxBatchSize(), 32);
  });

  it('should create with custom config', () => {
    const customConfig = { batchSize: 64 };
    const localProvider = new HuggingFaceEmbeddings(customConfig);

    assert.strictEqual(localProvider.getMaxBatchSize(), 64);
  });

  it('should handle initialization lazily', async () => {
    // Construction must succeed without any prior setup step.
    const localProvider = new HuggingFaceEmbeddings();

    assert.ok(localProvider);
  });
});
|
|
|
|
// ============================================================================
// Tests for Retry Logic
// ============================================================================
|
|
|
|
/**
 * Tests for the retry behaviour provided by the EmbeddingProvider base class.
 *
 * Each case subclasses MockEmbeddingProvider with an `embedTexts` that counts
 * its invocations and fails in a controlled way. Calling that override
 * directly can never trigger a retry, because the override itself is the
 * method being invoked, so every case goes through `runWithRetry` instead.
 *
 * NOTE(review): `runWithRetry` assumes the base class exposes a non-public
 * `withRetry(fn, context)` helper returning a promise — confirm the name and
 * signature against src/embeddings. If the base class applies real backoff
 * delays between attempts, these tests will wait for them; consider injecting
 * a short retry configuration if one is supported.
 */
describe('Retry Logic', () => {
  /** Runs `provider.embedTexts(texts)` under the base-class retry wrapper. */
  const runWithRetry = (
    provider: MockEmbeddingProvider,
    texts: string[],
  ): Promise<BatchEmbeddingResult> =>
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- non-public member access
    (provider as any).withRetry(() => provider.embedTexts(texts), 'Retry test');

  it('should retry on retryable errors', async () => {
    let attempts = 0;

    class RetryTestProvider extends MockEmbeddingProvider {
      async embedTexts(texts: string[]): Promise<BatchEmbeddingResult> {
        attempts++;
        // Fail the first two attempts with a rate-limit error, then succeed.
        if (attempts < 3) {
          throw new Error('Rate limit exceeded');
        }
        return super.embedTexts(texts);
      }
    }

    const provider = new RetryTestProvider();
    const result = await runWithRetry(provider, ['Test']);

    assert.strictEqual(attempts, 3);
    assert.strictEqual(result.embeddings.length, 1);
  });

  it('should not retry on non-retryable errors', async () => {
    let attempts = 0;

    class NonRetryableProvider extends MockEmbeddingProvider {
      async embedTexts(texts: string[]): Promise<BatchEmbeddingResult> {
        attempts++;
        throw new Error('Invalid API key');
      }
    }

    const provider = new NonRetryableProvider();

    // assert.rejects fails the test when nothing is thrown; the former
    // try / assert.fail / catch form swallowed that failure in its own catch.
    await assert.rejects(runWithRetry(provider, ['Test']));

    // Should fail on first attempt only
    assert.strictEqual(attempts, 1);
  });

  it('should respect max retries', async () => {
    let attempts = 0;

    class MaxRetriesProvider extends MockEmbeddingProvider {
      async embedTexts(texts: string[]): Promise<BatchEmbeddingResult> {
        attempts++;
        throw new Error('Rate limit exceeded');
      }
    }

    const provider = new MaxRetriesProvider();

    await assert.rejects(runWithRetry(provider, ['Test']));

    // Default maxRetries is 3, so should try 4 times total (initial + 3 retries)
    assert.strictEqual(attempts, 4);
  });
});
|
|
|
|
// ============================================================================
// Tests for Error Handling
// ============================================================================
|
|
|
|
/**
 * Tests for the provider's error helpers: classification of errors as
 * retryable or not, and wrapping of an error with context. Both helpers are
 * reached through an `any` cast on the provider instance.
 */
describe('Error Handling', () => {
  it('should identify retryable errors', () => {
    const provider = new MockEmbeddingProvider();
    const retryableErrors = [
      'Rate limit exceeded',
      'Request timeout',
      '503 Service Unavailable',
      '429 Too Many Requests',
      'Connection refused',
    ].map(text => new Error(text));

    for (const error of retryableErrors) {
      const verdict = (provider as any).isRetryableError(error);
      assert.strictEqual(verdict, true, `Should be retryable: ${error.message}`);
    }
  });

  it('should identify non-retryable errors', () => {
    const provider = new MockEmbeddingProvider();
    const nonRetryableErrors = [
      'Invalid API key',
      'Authentication failed',
      'Invalid request',
      'Resource not found',
    ].map(text => new Error(text));

    for (const error of nonRetryableErrors) {
      const verdict = (provider as any).isRetryableError(error);
      assert.strictEqual(verdict, false, `Should not be retryable: ${error.message}`);
    }
  });

  it('should create embedding error with context', () => {
    const provider = new MockEmbeddingProvider();
    const cause = new Error('Test error');

    const wrapped = (provider as any).createEmbeddingError(
      cause,
      'Test context',
      true
    ) as EmbeddingError;

    // The wrapper prefixes the context, records the flag, and keeps the cause.
    assert.strictEqual(wrapped.message, 'Test context: Test error');
    assert.strictEqual(wrapped.retryable, true);
    assert.strictEqual(wrapped.error, cause);
  });
});
|
|
|
|
// ============================================================================
// Tests for Batch Processing
// ============================================================================
|
|
|
|
/**
 * Tests that every input receives an embedding with the right index when the
 * input count is above, below, or not a multiple of the configured batch size.
 *
 * NOTE(review): MockEmbeddingProvider.embedTexts maps all inputs in one pass,
 * so these cases check index coverage and ordering rather than actual
 * splitting into batches.
 */
describe('Batch Processing', () => {
  const DIMENSION = 384;

  /** Builds `count` inputs named "Text 0", "Text 1", ... */
  const makeTexts = (count: number): string[] =>
    Array.from({ length: count }, (_, i) => `Text ${i}`);

  it('should split large datasets into batches', async () => {
    const inputCount = 35;
    const mockProvider = new MockEmbeddingProvider(DIMENSION, 10);

    const { embeddings } = await mockProvider.embedTexts(makeTexts(inputCount));

    assert.strictEqual(embeddings.length, inputCount);

    // Every input position must be present exactly once.
    const seenIndices = embeddings.map(entry => entry.index);
    seenIndices.sort((left, right) => left - right);
    const expectedIndices = Array.from({ length: inputCount }, (_, i) => i);
    assert.deepStrictEqual(seenIndices, expectedIndices);
  });

  it('should handle single batch correctly', async () => {
    const mockProvider = new MockEmbeddingProvider(DIMENSION, 100);

    const { embeddings } = await mockProvider.embedTexts(makeTexts(50));

    assert.strictEqual(embeddings.length, 50);
  });

  it('should preserve order across batches', async () => {
    const mockProvider = new MockEmbeddingProvider(DIMENSION, 5);

    const { embeddings } = await mockProvider.embedTexts(makeTexts(12));

    // Each entry's recorded index must equal its position in the result.
    for (const [position, entry] of embeddings.entries()) {
      assert.strictEqual(entry.index, position);
    }
  });
});
|
|
|
|
// This statement executes unconditionally when the module is loaded, whatever
// the tests report, so it must not claim that they passed.
console.log('Embeddings test suites registered; see the node:test report for results.');
|