Performance Tuning Guide
Optimize Trinity applications for maximum performance. This guide covers SIMD optimization, memory management, VSA operation tuning, and benchmarking best practices.
Overview​
Trinity's balanced ternary architecture provides unique performance characteristics:
| Aspect | Performance | Notes |
|---|---|---|
| Memory Density | 1.58 bits/trit | 20x more compact than float32 |
| Compute | Add-only operations | No multiplication needed for binding |
| SIMD Potential | High | Ternary operations vectorize well |
| Cache Efficiency | Excellent | Packed representation reduces cache misses |
SIMD Optimization​
Understanding SIMD in Trinity​
SIMD (Single Instruction, Multiple Data) allows processing multiple trits simultaneously. Zig's @Vector() type is the key to unlocking this performance.
Basic SIMD Operations​
Vectorized Binding​
const std = @import("std");
const vsa = @import("trinity/vsa");
// Scalar version (slow): one trit multiply per loop iteration.
// Baseline reference implementation for the SIMD variant below.
fn bindScalar(a: []const i2, b: []const i2, result: []i2) void {
    var i: usize = 0;
    while (i < a.len) : (i += 1) {
        result[i] = a[i] * b[i];
    }
}
// SIMD version (8x faster on AVX2): multiplies 16 trits per iteration,
// then falls back to scalar multiplies for the tail that does not fill
// a full vector.
fn bindSIMD(a: []const i2, b: []const i2, result: []i2) void {
    const lanes = 16;
    const Vec = @Vector(lanes, i2);
    const full_chunks = a.len / lanes;

    var chunk: usize = 0;
    while (chunk < full_chunks) : (chunk += 1) {
        const base = chunk * lanes;
        const va: Vec = a[base..][0..lanes].*;
        const vb: Vec = b[base..][0..lanes].*;
        result[base..][0..lanes].* = va * vb;
    }

    // Scalar tail: a.len is not necessarily a multiple of the lane count.
    var i: usize = full_chunks * lanes;
    while (i < a.len) : (i += 1) {
        result[i] = a[i] * b[i];
    }
}
Vectorized Similarity Calculation​
// Cosine similarity with SIMD.
//
// NOTE(review): this actually computes dot(a, b) / a.len, which equals
// cosine similarity only under an assumption about the vectors' norms
// (e.g. dense unit-magnitude trits) — TODO confirm that holds for callers.
//
// Fix vs. previous revision: `@intCast` cannot take an array operand; each
// 16-trit chunk is first coerced to a narrow vector, then widened
// element-wise. The manual reduction loop is replaced with `@reduce`.
fn cosineSimilaritySIMD(a: []const i2, b: []const i2) f64 {
    const lanes = 16;
    const NarrowVec = @Vector(lanes, i2);
    const Vec = @Vector(lanes, i32); // i32 accumulator avoids overflow
    const full_chunks = a.len / lanes;

    var dot_vec: Vec = @splat(0);
    var i: usize = 0;
    while (i < full_chunks) : (i += 1) {
        const va: Vec = @intCast(@as(NarrowVec, a[i * lanes ..][0..lanes].*));
        const vb: Vec = @intCast(@as(NarrowVec, b[i * lanes ..][0..lanes].*));
        dot_vec += va * vb;
    }

    // Horizontal reduction of the vector accumulator to a scalar.
    var dot_sum: i32 = @reduce(.Add, dot_vec);

    // Scalar tail for lengths that are not a multiple of the lane count.
    for (full_chunks * lanes..a.len) |j| {
        dot_sum += @as(i32, a[j]) * @as(i32, b[j]);
    }

    return @as(f64, @floatFromInt(dot_sum)) / @as(f64, @floatFromInt(a.len));
}
SIMD Optimization Tips​
| Tip | Benefit | Example |
|---|---|---|
| Align to 16/32 bytes | Prevents cross-cache-line loads | var data: [1024]i2 align(32) = undefined |
| Use power-of-2 sizes | Enables loop unrolling | Process 16/32/64 trits at once |
| Prefetch memory | Hides latency | @prefetch(&data[i + 8], .{}) |
| Avoid branches | Keeps vector pipeline full | Use ternary operators instead of if |
| Batch operations | Amortizes overhead | Process 1000+ trits at once |
Compiler Hints​
// Tell Zig to vectorize.
//
// Fix vs. previous revision: `@setOptimizationMode` is not a Zig builtin —
// the optimization mode is chosen per build (e.g. `-Doptimize=ReleaseFast`).
// The placeholder body is replaced with a working 32-lane implementation so
// the example compiles and demonstrates the intended pattern.
fn optimizedBind(a: []const i2, b: []const i2, result: []i2) void {
    @setRuntimeSafety(false); // Disable bounds checking in this scope
    const lanes = 32;
    const Vec = @Vector(lanes, i2);
    const full_chunks = a.len / lanes;

    var chunk: usize = 0;
    while (chunk < full_chunks) : (chunk += 1) {
        const base = chunk * lanes;
        const va: Vec = a[base..][0..lanes].*;
        const vb: Vec = b[base..][0..lanes].*;
        result[base..][0..lanes].* = va * vb;
    }
    // Scalar tail for the remainder.
    for (full_chunks * lanes..a.len) |i| {
        result[i] = a[i] * b[i];
    }
}
Memory Management​
HybridBigInt Memory Efficiency​
Trinity's HybridBigInt uses a packed representation that's 20x more memory-efficient than float32 arrays.
Memory Comparison​
| Representation | 10,000 Trits | Memory |
|---|---|---|
[]f32 | 10,000 × 32-bit | 40 KB |
[]i8 (ternary) | 10,000 × 8-bit | 10 KB |
HybridBigInt (packed) | 10,000 × 1.58-bit | 2 KB |
Pool Allocation for Frequent Operations​
// Fixed-size object pool for HybridBigInt vectors — avoids per-operation
// heap allocation on hot paths.
//
// Fixes vs. previous revision:
//  - unused loop-index captures removed (unused captures are compile
//    errors in Zig);
//  - `release` compared vectors with `==`, which is not defined for
//    structs; it now matches by slot address, which is exactly what
//    `acquire` hands out.
const VsaPool = struct {
    const VEC_SIZE = 10000;
    const POOL_SIZE = 100;

    allocator: std.mem.Allocator,
    // null = free slot; non-null = vector currently loaned out.
    pool: [POOL_SIZE]?vsa.HybridBigInt,

    fn init(allocator: std.mem.Allocator) VsaPool {
        return .{
            .allocator = allocator,
            .pool = [_]?vsa.HybridBigInt{null} ** POOL_SIZE,
        };
    }

    /// Returns a pointer to a freshly initialized vector stored inside the
    /// pool, or error.PoolExhausted when every slot is occupied.
    fn acquire(pool: *VsaPool) !*vsa.HybridBigInt {
        // Linear scan for the first free slot.
        for (&pool.pool) |*slot| {
            if (slot.* == null) {
                slot.* = try vsa.HybridBigInt.init(pool.allocator, VEC_SIZE);
                return &slot.*.?;
            }
        }
        return error.PoolExhausted;
    }

    /// Deinitializes `vec` and marks its slot free. `vec` must be a pointer
    /// previously returned by `acquire` on this pool.
    fn release(pool: *VsaPool, vec: *vsa.HybridBigInt) void {
        vec.deinit(pool.allocator);
        for (&pool.pool) |*slot| {
            if (slot.*) |*v| {
                // Identity check: acquire() returned the address of the
                // slot payload, so pointer equality finds the owning slot.
                if (v == vec) {
                    slot.* = null;
                    return;
                }
            }
        }
    }
};
Cache-Friendly Data Structures​
Structure of Arrays vs. Array of Structures​
// BAD: Array of Structures (cache misses)
// NOTE: renamed from TrinaryVectorSoA — the previous revision gave this
// counter-example the same name as the SoA struct below (a redeclaration
// error) and mislabeled it: this is the AoS layout being warned against.
const TrinaryVectorAoS = struct {
    data: []vsa.HybridBigInt,
};
// GOOD: Structure of Arrays (cache friendly)
const TrinaryVectorSoA = struct {
    // Store trits contiguously
    // (one flat allocation keeps all trits in adjacent cache lines)
    trits: []i2,
    // NOTE(review): relationship between `length` and `trits.len` is not
    // shown here — confirm intended semantics with callers.
    length: usize,
    /// Frees the trit storage. `allocator` must be the same allocator
    /// that created `trits`.
    fn deinit(self: *const TrinaryVectorSoA, allocator: std.mem.Allocator) void {
        allocator.free(self.trits);
    }
};
Memory Profiling​
# Build with memory profiling
zig build -Drelease -Dmemory-profile
# Run with memory tracker
./zig-out/bin/tri --profile-memory
# Analyze heap usage
./zig-out/bin/tri --profile-heap > heap.log
zig tools/analyze-profile heap.log
VSA Operation Optimization​
Batch Binding​
// Process multiple bindings in one pass (better cache locality).
//
// Fix vs. previous revision: `allocator` was referenced but never declared;
// it is now an explicit parameter. Partially initialized results are freed
// on the error path.
fn batchBind(
    allocator: std.mem.Allocator,
    vectors: []const vsa.HybridBigInt,
    keys: []const vsa.HybridBigInt,
    results: []vsa.HybridBigInt,
) !void {
    // Pre-allocate all results up front; on failure, release the ones
    // already created so the caller is not left with a partial batch.
    var initialized: usize = 0;
    errdefer for (results[0..initialized]) |*r| r.deinit(allocator);
    while (initialized < vectors.len) : (initialized += 1) {
        results[initialized] = try vsa.HybridBigInt.init(allocator, vectors[initialized].len);
    }
    // Batch process (better cache locality).
    for (0..vectors.len) |i| {
        _ = try vsa.bind(&vectors[i], &keys[i], &results[i]);
    }
}
Similarity Search Optimization​
// Use spatial partitioning (locality-sensitive hashing) for faster
// nearest-neighbor search.
//
// Fixes vs. previous revision:
//  - `insert` leaked the superseded bucket list on every append; the old
//    list is now freed (the initial empty literal is a comptime constant
//    and is never freed);
//  - `findNearest` referenced `allocator` and `vectors` that were not in
//    scope; both are now explicit parameters.
const LshTable = struct {
    tables: []std.AutoHashMap(u64, []usize),
    num_tables: usize,
    num_hashes: usize,

    fn init(allocator: std.mem.Allocator, num_tables: usize, num_hashes: usize) !LshTable {
        const tables = try allocator.alloc(std.AutoHashMap(u64, []usize), num_tables);
        for (tables) |*table| {
            table.* = std.AutoHashMap(u64, []usize).init(allocator);
        }
        return .{
            .tables = tables,
            .num_tables = num_tables,
            .num_hashes = num_hashes,
        };
    }

    /// Registers vector `idx` in every hash table. O(bucket size) per table
    /// because buckets use a copy-on-append representation.
    fn insert(lsh: *LshTable, allocator: std.mem.Allocator, idx: usize, vec: *const vsa.HybridBigInt) !void {
        for (0..lsh.num_tables) |t| {
            const hash = computeHash(vec, t);
            const entry = try lsh.tables[t].getOrPut(hash);
            if (!entry.found_existing) {
                entry.value_ptr.* = &[_]usize{};
            }
            // Append index: copy into a list one element longer.
            const old = entry.value_ptr.*;
            const new_list = try allocator.alloc(usize, old.len + 1);
            @memcpy(new_list[0..old.len], old);
            new_list[old.len] = idx;
            entry.value_ptr.* = new_list;
            // Free the superseded list; the initial empty bucket is a
            // comptime constant and must not be freed.
            if (old.len > 0) allocator.free(old);
        }
    }

    /// Returns the index (into `vectors`) of the candidate most similar to
    /// `query`, or null when no bucket matches or all similarities are <= 0.
    fn findNearest(
        lsh: *LshTable,
        allocator: std.mem.Allocator,
        vectors: []const vsa.HybridBigInt,
        query: *const vsa.HybridBigInt,
    ) !?usize {
        var candidates = std.ArrayList(usize).init(allocator);
        defer candidates.deinit();
        for (0..lsh.num_tables) |t| {
            const hash = computeHash(query, t);
            if (lsh.tables[t].get(hash)) |indices| {
                try candidates.appendSlice(indices);
            }
        }
        // Filter candidates by exact similarity.
        var best_idx: ?usize = null;
        var best_sim: f64 = 0.0;
        for (candidates.items) |idx| {
            const sim = try vsa.cosineSimilarity(query, &vectors[idx]);
            if (sim > best_sim) {
                best_sim = sim;
                best_idx = idx;
            }
        }
        return best_idx;
    }
};
Permutation Caching​
// Cache frequently used permutations
// NOTE(review): module-level cache starts `undefined`; confirm an
// initialization call exists elsewhere before first use.
var perm_cache: std.AutoHashMap(usize, vsa.HybridBigInt) = undefined;
/// Returns a permutation of `vec` shifted by `count`, serving repeated
/// requests from `perm_cache`.
/// NOTE(review): the cache key XORs the vector's data pointer with `count`;
/// distinct (vec, count) pairs can collide, and entries go stale if a
/// vector is freed and its address reused — TODO confirm acceptable here.
fn getCachedPermutation(vec: *const vsa.HybridBigInt, count: usize) !vsa.HybridBigInt {
    const key = @intFromPtr(vec.ptr) ^ count;
    if (perm_cache.get(key)) |cached| {
        // Hit: hand back a copy so callers never mutate or free the
        // cached value itself.
        return cached.clone();
    }
    const result = try vsa.permute(vec, count);
    // Miss: cache a clone so the returned value and the cached entry have
    // independent lifetimes.
    try perm_cache.put(key, result.clone());
    return result;
}
Benchmarking Guidelines​
Microbenchmarking Template​
const std = @import("std");
const vsa = @import("trinity/vsa");
/// Microbenchmark for vsa.bind(): warms up, times `iterations` calls, and
/// prints average latency and throughput to stderr.
///
/// Fixes vs. previous revision: `Timer.lap()`/`reset()` take `*Timer`, so
/// the timer must be `var`, not `const`; and mixing `lap()` with `read()`
/// did not measure the timed loop — `reset()` + `read()` does.
fn benchmarkBind(allocator: std.mem.Allocator, iterations: usize) !void {
    var timer = try std.time.Timer.start();
    // Setup
    const vec_a = try vsa.HybridBigInt.random(allocator, 10000);
    defer vec_a.deinit(allocator);
    const vec_b = try vsa.HybridBigInt.random(allocator, 10000);
    defer vec_b.deinit(allocator);
    var result = try vsa.HybridBigInt.init(allocator, 10000);
    defer result.deinit(allocator);
    // Warmup: populate caches and train the branch predictor.
    for (0..100) |_| {
        _ = try vsa.bind(&vec_a, &vec_b, &result);
    }
    // Benchmark: measure only the timed loop.
    timer.reset();
    for (0..iterations) |_| {
        _ = try vsa.bind(&vec_a, &vec_b, &result);
    }
    const elapsed_ns = timer.read();
    // Results
    const avg_ns = @as(f64, @floatFromInt(elapsed_ns)) / @as(f64, @floatFromInt(iterations));
    const ops_per_sec = 1_000_000_000.0 / avg_ns;
    std.debug.print(
        \\bind() Benchmark:
        \\ Iterations: {d}
        \\ Total time: {d:.2} ms
        \\ Avg/op: {d:.3} ns
        \\ Ops/sec: {d:.0}
        \\
    , .{ iterations, @as(f64, @floatFromInt(elapsed_ns)) / 1_000_000.0, avg_ns, ops_per_sec });
}
Performance Regression Testing​
# Create baseline
zig build bench --baseline
# Compare with current
zig build bench --compare
# Output:
# bind(): +2.3% (was 45.2 ns/op, now 46.3 ns/op) [REGRESSION]
# similarity(): -1.8% (was 32.1 ns/op, now 31.5 ns/op) [IMPROVEMENT]
Benchmarking Best Practices​
| Practice | Why | Example |
|---|---|---|
| Warmup iterations | CPU cache and branch prediction | Run 100+ iterations before measuring |
| Statistical significance | Variance in measurements | Use 1000+ iterations, repeat 5+ times |
| Isolate variables | Measure one thing at a time | Don't benchmark bind+similarity together |
| Use realistic data | Synthetic data can mislead | Use actual corpus data |
| Check assembly | Verify compiler optimization | objdump -d binary or build with -femit-asm |
Advanced Techniques​
Multi-threading for Batch Operations​
/// Splits the bind workload across CPU cores, writing into `results`.
///
/// Fixes vs. previous revision:
///  - the nested worker referenced outer locals (`vectors`, `keys`) and an
///    undefined `results` — Zig nested functions cannot capture, so all
///    data is now passed as spawn arguments and `results` is a parameter;
///  - the fixed 16-entry thread array could be overrun on machines with
///    more than 16 cores — the thread count is now clamped.
fn parallelBind(
    vectors: []const vsa.HybridBigInt,
    keys: []const vsa.HybridBigInt,
    results: []vsa.HybridBigInt,
) !void {
    var threads: [16]std.Thread = undefined;
    const cpu_count = try std.Thread.getCpuCount();
    // At least one thread; never more than the thread array or the work.
    const num_threads = @max(1, @min(cpu_count, threads.len, vectors.len));
    const chunk_size = vectors.len / num_threads;

    const Worker = struct {
        // Binds one contiguous chunk; errors are logged, not propagated,
        // since a worker thread has no caller to return them to.
        fn run(
            vecs: []const vsa.HybridBigInt,
            ks: []const vsa.HybridBigInt,
            outs: []vsa.HybridBigInt,
        ) void {
            for (vecs, ks, outs) |*v, *k, *out| {
                _ = vsa.bind(v, k, out) catch |err| {
                    std.log.err("parallelBind worker failed: {s}", .{@errorName(err)});
                    continue;
                };
            }
        }
    };

    for (0..num_threads) |i| {
        const start = i * chunk_size;
        // Last thread absorbs the remainder of the division.
        const end = if (i == num_threads - 1) vectors.len else (i + 1) * chunk_size;
        threads[i] = try std.Thread.spawn(.{}, Worker.run, .{
            vectors[start..end], keys[start..end], results[start..end],
        });
    }
    for (threads[0..num_threads]) |t| t.join();
}
GPU Offloading (Future)​
// Pseudo-code for GPU acceleration
// NOTE(review): this is illustrative only and does not compile — `gpu`,
// `bindKernel`, `results`, and `gpu_results` are placeholders for an API
// that does not exist yet.
fn gpuBindBatch(vectors: []vsa.HybridBigInt, keys: []vsa.HybridBigInt) !void {
    // 1. Copy data to GPU
    const gpu_vectors = try gpu.copyToGpu(vectors);
    defer gpu.free(gpu_vectors);
    const gpu_keys = try gpu.copyToGpu(keys);
    defer gpu.free(gpu_keys);
    // 2. Launch kernel
    try gpu.launchKernel(bindKernel, .{ gpu_vectors, gpu_keys });
    // 3. Copy results back
    try gpu.copyFromGpu(results, gpu_results);
}
Performance Checklist​
Use this checklist before deploying to production:
- All hot paths use SIMD operations
- Memory is aligned to cache line boundaries
- Object pools for frequent allocations
- Benchmark suite covers critical paths
- Performance regression tests pass
- Memory usage is stable (no leaks)
- CPU utilization is >70% (not bottlenecked)
- Cache hit rate is >80%
Further Reading​
- VSA API Reference — Core operations and signatures
- Benchmarks — Performance metrics and comparisons
- JIT Performance Guide — Just-in-time compilation
- Memory Efficiency Report — Detailed analysis
Need more performance tips? Check the community forum or open a GitHub issue.