Skip to main content

Performance Tuning Guide

Optimize Trinity applications for maximum performance. This guide covers SIMD optimization, memory management, VSA operation tuning, and benchmarking best practices.

Overview​

Trinity's balanced ternary architecture provides unique performance characteristics:

| Aspect | Performance | Notes |
| --- | --- | --- |
| Memory Density | 1.58 bits/trit | 20x more compact than float32 |
| Compute | Add-only operations | No multiplication needed for binding |
| SIMD Potential | High | Ternary operations vectorize well |
| Cache Efficiency | Excellent | Packed representation reduces cache misses |

SIMD Optimization​

Understanding SIMD in Trinity​

SIMD (Single Instruction, Multiple Data) allows processing multiple trits simultaneously. Zig's @Vector() type is the key to unlocking this performance.

Basic SIMD Operations​

Vectorized Binding​

const std = @import("std");
const vsa = @import("trinity/vsa");


// SIMD version (8x faster on AVX2)
fn bindSIMD(a: []const i2, b: []const i2, result: []i2) void {
const Vec = @Vector(16, i2); // Process 16 trits at once
const len = a.len / 16;

var i: usize = 0;
while (i < len) : (i += 1) {
const va: Vec = a[i*16..][0..16].*;
const vb: Vec = b[i*16..][0..16].*;
result[i*16..][0..16].* = va * vb;
}

// Handle remainder
const remainder = a.len % 16;
if (remainder > 0) {
const start = len * 16;
for (0..remainder) |j| {
result[start + j] = a[start + j] * b[start + j];
}
}
}

Vectorized Similarity Calculation​

// Cosine similarity with SIMD
fn cosineSimilaritySIMD(a: []const i2, b: []const i2) f64 {
const Vec = @Vector(16, i32); // Use i32 to avoid overflow
const len = a.len / 16;

var dotVec: Vec = @splat(0);
var i: usize = 0;

while (i < len) : (i += 1) {
const va: Vec = @intCast(a[i*16..][0..16].*);
const vb: Vec = @intCast(b[i*16..][0..16].*);
dotVec += va * vb;
}

// Reduce vector to scalar
var dotSum: i32 = 0;
for (0..16) |j| {
dotSum += dotVec[j];
}

// Handle remainder
for (len * 16..a.len) |j| {
dotSum += @as(i32, a[j]) * @as(i32, b[j]);
}

return @as(f64, @floatFromInt(dotSum)) / @as(f64, @floatFromInt(a.len));
}

SIMD Optimization Tips​

| Tip | Benefit | Example |
| --- | --- | --- |
| Align to 16/32 bytes | Prevents cross-cache-line loads | `var data: [1024]i2 align(32)` (Zig uses `align(N)`, not C++ `alignas`) |
| Use power-of-2 sizes | Enables loop unrolling | Process 16/32/64 trits at once |
| Prefetch memory | Hides latency | `@prefetch(ptr + i + 8, .{})` |
| Avoid branches | Keeps vector pipeline full | Use `if`/`switch` expressions or `@select` instead of branching statements |
| Batch operations | Amortizes overhead | Process 1000+ trits at once |

Compiler Hints​

// Tell Zig to vectorize
fn optimizedBind(a: []const i2, b: []const i2, result: []i2) void {
@setRuntimeSafety(false); // Disable bounds checking
@setOptimizationMode(.Optimized); // Force optimization

const Vec = @Vector(32, i2);
// ... implementation
}

Memory Management​

HybridBigInt Memory Efficiency​

Trinity's HybridBigInt uses a packed representation that's 20x more memory-efficient than float32 arrays.

Memory Comparison​

| Representation | 10,000 Trits | Memory |
| --- | --- | --- |
| `[]f32` | 10,000 × 32-bit | 40 KB |
| `[]i8` (ternary) | 10,000 × 8-bit | 10 KB |
| `HybridBigInt` (packed) | 10,000 × 1.58-bit | ~2 KB |

Pool Allocation for Frequent Operations​

const VsaPool = struct {
const VEC_SIZE = 10000;
const POOL_SIZE = 100;

allocator: std.mem.Allocator,
pool: [POOL_SIZE]?vsa.HybridBigInt,

fn init(allocator: std.mem.Allocator) VsaPool {
return .{
.allocator = allocator,
.pool = [_]?vsa.HybridBigInt{null} ** POOL_SIZE,
};
}

fn acquire(pool: *VsaPool) !*vsa.HybridBigInt {
// Find free slot
for (&pool.pool, 0..) |*slot, i| {
if (slot.* == null) {
slot.* = try vsa.HybridBigInt.init(pool.allocator, VEC_SIZE);
return &slot.*.?;
}
}
return error.PoolExhausted;
}

fn release(pool: *VsaPool, vec: *vsa.HybridBigInt) void {
vec.deinit(pool.allocator);
for (&pool.pool, 0..) |*slot, i| {
if (slot.*) |v| {
if (v == vec.*) {
slot.* = null;
return;
}
}
}
}
};

Cache-Friendly Data Structures​

Structure of Arrays vs. Array of Structures​

// BAD: Array of Structures (cache misses)
const TrinaryVectorSoA = struct {
data: []vsa.HybridBigInt,
};

// GOOD: Structure of Arrays (cache friendly)
const TrinaryVectorSoA = struct {
// Store trits contiguously
trits: []i2,
length: usize,

fn deinit(self: *const TrinaryVectorSoA, allocator: std.mem.Allocator) void {
allocator.free(self.trits);
}
};

Memory Profiling​

# Build with memory profiling
zig build -Drelease -Dmemory-profile

# Run with memory tracker
./zig-out/bin/tri --profile-memory

# Analyze heap usage
./zig-out/bin/tri --profile-heap > heap.log
zig tools/analyze-profile heap.log

VSA Operation Optimization​

Batch Binding​

// Process multiple bindings in one pass
fn batchBind(vectors: []const vsa.HybridBigInt, keys: []const vsa.HybridBigInt, results: []vsa.HybridBigInt) !void {
// Pre-allocate all results
for (0..vectors.len) |i| {
results[i] = try vsa.HybridBigInt.init(allocator, vectors[i].len);
}

// Batch process (better cache locality)
for (0..vectors.len) |i| {
_ = try vsa.bind(&vectors[i], &keys[i], &results[i]);
}
}

Similarity Search Optimization​

// Use spatial partitioning for faster nearest-neighbor search
const LshTable = struct {
tables: []std.AutoHashMap(u64, []usize),
num_tables: usize,
num_hashes: usize,

fn init(allocator: std.mem.Allocator, num_tables: usize, num_hashes: usize) !LshTable {
var tables = try allocator.alloc(std.AutoHashMap(u64, []usize), num_tables);
for (tables) |*table| {
table.* = std.AutoHashMap(u64, []usize).init(allocator);
}
return .{
.tables = tables,
.num_tables = num_tables,
.num_hashes = num_hashes,
};
}

fn insert(lsh: *LshTable, allocator: std.mem.Allocator, idx: usize, vec: *const vsa.HybridBigInt) !void {
for (0..lsh.num_tables) |t| {
const hash = computeHash(vec, t);
const entry = try lsh.tables[t].getOrPut(hash);
if (!entry.found_existing) {
entry.value_ptr.* = &[_]usize{};
}
// Append index
const new_list = try allocator.alloc(usize, entry.value_ptr.len + 1);
@memcpy(new_list[0..entry.value_ptr.len], entry.value_ptr.*);
new_list[entry.value_ptr.len] = idx;
entry.value_ptr.* = new_list;
}
}

fn findNearest(lsh: *LshTable, query: *const vsa.HybridBigInt) !?usize {
var candidates = std.ArrayList(usize).init(allocator);
defer candidates.deinit();

for (0..lsh.num_tables) |t| {
const hash = computeHash(query, t);
if (lsh.tables[t].get(hash)) |indices| {
try candidates.appendSlice(indices);
}
}

// Filter by actual similarity
var best_idx: ?usize = null;
var best_sim: f64 = 0.0;

for (candidates.items) |idx| {
const sim = try vsa.cosineSimilarity(query, &vectors[idx]);
if (sim > best_sim) {
best_sim = sim;
best_idx = idx;
}
}

return best_idx;
}
};

Permutation Caching​

// Cache frequently used permutations
var perm_cache: std.AutoHashMap(usize, vsa.HybridBigInt) = undefined;

fn getCachedPermutation(vec: *const vsa.HybridBigInt, count: usize) !vsa.HybridBigInt {
const key = @intFromPtr(vec.ptr) ^ count;

if (perm_cache.get(key)) |cached| {
return cached.clone();
}

const result = try vsa.permute(vec, count);
try perm_cache.put(key, result.clone());
return result;
}

Benchmarking Guidelines​

Microbenchmarking Template​

const std = @import("std");
const vsa = @import("trinity/vsa");

fn benchmarkBind(allocator: std.mem.Allocator, iterations: usize) !void {
const timer = try std.time.Timer.start();

// Setup
const vec_a = try vsa.HybridBigInt.random(allocator, 10000);
defer vec_a.deinit(allocator);
const vec_b = try vsa.HybridBigInt.random(allocator, 10000);
defer vec_b.deinit(allocator);
var result = try vsa.HybridBigInt.init(allocator, 10000);
defer result.deinit(allocator);

// Warmup
for (0..100) |_| {
_ = try vsa.bind(&vec_a, &vec_b, &result);
}

// Benchmark
const start = timer.lap();
for (0..iterations) |_| {
_ = try vsa.bind(&vec_a, &vec_b, &result);
}
const end = timer.read();

// Results
const elapsed_ns = end - start;
const avg_ns = @as(f64, @floatFromInt(elapsed_ns)) / @as(f64, @floatFromInt(iterations));
const ops_per_sec = 1_000_000_000.0 / avg_ns;

std.debug.print(
\\bind() Benchmark:
\\ Iterations: {d}
\\ Total time: {d:.2} ms
\\ Avg/op: {d:.3} ns
\\ Ops/sec: {d:.0}
\\
, .{ iterations, @as(f64, @floatFromInt(elapsed_ns)) / 1_000_000.0, avg_ns, ops_per_sec });
}

Performance Regression Testing​

# Create baseline
zig build bench --baseline

# Compare with current
zig build bench --compare

# Output:
# bind(): +2.3% (was 45.2 ns/op, now 46.3 ns/op) [REGRESSION]
# similarity(): -1.8% (was 32.1 ns/op, now 31.5 ns/op) [IMPROVEMENT]

Benchmarking Best Practices​

| Practice | Why | Example |
| --- | --- | --- |
| Warmup iterations | CPU cache and branch prediction | Run 100+ iterations before measuring |
| Statistical significance | Variance in measurements | Use 1000+ iterations, repeat 5+ times |
| Isolate variables | Measure one thing at a time | Don't benchmark bind+similarity together |
| Use realistic data | Synthetic data can mislead | Use actual corpus data |
| Check assembly | Verify compiler optimization | `llvm-objdump -d zig-out/bin/app` (zig has no `objdump` subcommand) |

Advanced Techniques​

Multi-threading for Batch Operations​

fn parallelBind(vectors: []const vsa.HybridBigInt, keys: []const vsa.HybridBigInt) !void {
const num_threads = try std.Thread.getCpuCount();
const chunk_size = vectors.len / num_threads;

var threads: [16]std.Thread = undefined;

for (0..num_threads) |i| {
const start = i * chunk_size;
const end = if (i == num_threads - 1) vectors.len else (i + 1) * chunk_size;

threads[i] = try std.Thread.spawn(.{}, struct {
fn worker(start: usize, end: usize) !void {
for (start..end) |j| {
_ = try vsa.bind(&vectors[j], &keys[j], &results[j]);
}
}.worker, .{ start, end });
}

for (0..num_threads) |i| {
threads[i].join();
}
}

GPU Offloading (Future)​

// Pseudo-code for GPU acceleration
fn gpuBindBatch(vectors: []vsa.HybridBigInt, keys: []vsa.HybridBigInt) !void {
// 1. Copy data to GPU
const gpu_vectors = try gpu.copyToGpu(vectors);
defer gpu.free(gpu_vectors);
const gpu_keys = try gpu.copyToGpu(keys);
defer gpu.free(gpu_keys);

// 2. Launch kernel
try gpu.launchKernel(bindKernel, .{ gpu_vectors, gpu_keys });

// 3. Copy results back
try gpu.copyFromGpu(results, gpu_results);
}

Performance Checklist​

Use this checklist before deploying to production:

  • All hot paths use SIMD operations
  • Memory is aligned to cache line boundaries
  • Object pools for frequent allocations
  • Benchmark suite covers critical paths
  • Performance regression tests pass
  • Memory usage is stable (no leaks)
  • CPU utilization is >70% (not bottlenecked)
  • Cache hit rate is >80%

Further Reading​


Need more performance tips? Check the community forum or open a GitHub issue.