Skip to content

API Reference (Algorithms)

This reference documents the core algorithmic implementations that back the theoretical explanations throughout the book. All implementations are thoroughly commented specifically for educational reading.


1. C++ Distance Metrics

The backbone of any vector database is mathematical distance calculation. At production scale, raw scalar for-loops are too slow; kernels accelerated with AVX2/AVX-512 SIMD (Single Instruction, Multiple Data) intrinsics are required.

src/cpp/distances.cpp
/**
 * SIMD-optimized distance computations for vector search.
 *
 * Demonstrates how production vector databases accelerate
 * distance calculations using AVX2/AVX-512 intrinsics.
 *
 * Compile: g++ -O3 -mavx2 -mfma -o distances distances.cpp
 */

#include <cmath>
#include <cstddef>
#include <vector>
#include <algorithm>
#include <numeric>

/**
 * Scalar (non-SIMD) squared Euclidean distance between two
 * d-dimensional float arrays.
 *
 * Serves as the portable reference implementation that the SIMD
 * kernels below are validated against.
 *
 * @param x first vector (length d)
 * @param y second vector (length d)
 * @param d dimensionality
 * @return  sum over i of (x[i] - y[i])^2 — note: NOT square-rooted
 *
 * Complexity: O(d)
 */
float l2_distance_naive(const float* x, const float* y, size_t d) {
    float acc = 0.0f;
    for (size_t idx = 0; idx != d; ++idx) {
        const float delta = x[idx] - y[idx];
        acc += delta * delta;
    }
    return acc;
}


#ifdef __AVX2__
#include <immintrin.h>

/**
 * AVX2-optimized L2 squared distance.
 *
 * Processes 8 floats per iteration using 256-bit SIMD registers,
 * accumulating squared differences with fused multiply-add.
 * ~4-8x faster than scalar on modern CPUs (actual speedup varies
 * with dimensionality and memory bandwidth).
 *
 * Uses unaligned loads (loadu), so x and y need no special alignment.
 * Returns the SQUARED distance, matching l2_distance_naive.
 */
float l2_distance_avx2(const float* x, const float* y, size_t d) {
    __m256 sum = _mm256_setzero_ps();
    size_t i = 0;

    // Main loop: 8 lanes per iteration; stops before overrunning d
    for (; i + 8 <= d; i += 8) {
        __m256 vx = _mm256_loadu_ps(x + i);
        __m256 vy = _mm256_loadu_ps(y + i);
        __m256 diff = _mm256_sub_ps(vx, vy);
        sum = _mm256_fmadd_ps(diff, diff, sum);  // FMA: sum += diff * diff
    }

    // Horizontal sum: fold the 8 per-lane partial sums into one scalar.
    // Split the 256-bit register into two 128-bit halves, add them,
    // then two hadd steps collapse 4 floats -> 2 -> 1.
    __m128 hi = _mm256_extractf128_ps(sum, 1);
    __m128 lo = _mm256_castps256_ps128(sum);
    __m128 sum128 = _mm_add_ps(lo, hi);
    sum128 = _mm_hadd_ps(sum128, sum128);
    sum128 = _mm_hadd_ps(sum128, sum128);
    float result = _mm_cvtss_f32(sum128);

    // Scalar tail: handle the d % 8 leftover elements
    for (; i < d; ++i) {
        float diff = x[i] - y[i];
        result += diff * diff;
    }

    return result;
}
#endif


#ifdef __AVX2__
/**
 * AVX2-optimized inner (dot) product.
 *
 * Same structure as l2_distance_avx2: 8-wide FMA accumulation,
 * 256->128-bit horizontal reduction, then a scalar tail for the
 * d % 8 leftover elements.
 *
 * Note: a LARGER inner product means MORE similar, so callers
 * typically negate this value when ranking by "distance".
 */
float inner_product_avx2(const float* x, const float* y, size_t d) {
    __m256 sum = _mm256_setzero_ps();
    size_t i = 0;

    // 8 floats per iteration: sum += x[i..i+7] * y[i..i+7]
    for (; i + 8 <= d; i += 8) {
        __m256 vx = _mm256_loadu_ps(x + i);
        __m256 vy = _mm256_loadu_ps(y + i);
        sum = _mm256_fmadd_ps(vx, vy, sum);
    }

    // Horizontal reduction of the 8 partial sums to a single scalar
    __m128 hi = _mm256_extractf128_ps(sum, 1);
    __m128 lo = _mm256_castps256_ps128(sum);
    __m128 sum128 = _mm_add_ps(lo, hi);
    sum128 = _mm_hadd_ps(sum128, sum128);
    sum128 = _mm_hadd_ps(sum128, sum128);
    float result = _mm_cvtss_f32(sum128);

    // Scalar tail for remaining elements
    for (; i < d; ++i) {
        result += x[i] * y[i];
    }

    return result;
}
#endif


/**
 * Exact brute-force k-NN — the ground-truth baseline that every
 * approximate index (LSH, PQ, HNSW, IVF) is measured against.
 */
struct SearchResult {
    size_t index;
    float distance;
    bool operator<(const SearchResult& other) const {
        return distance < other.distance;
    }
};

/**
 * Scan all n database vectors, score each against the query with
 * squared L2 distance, and return the k closest (sorted ascending).
 *
 * @param query     query vector (length d)
 * @param database  row-major matrix of n vectors, each of length d
 * @param n         number of database vectors
 * @param d         dimensionality
 * @param k         number of neighbors to return (clamped to n)
 * @return          up to k results sorted by squared distance
 */
std::vector<SearchResult> brute_force_knn(
    const float* query,
    const float* database,  // row-major: n × d
    size_t n,
    size_t d,
    size_t k
) {
    std::vector<SearchResult> scored(n);
    for (size_t row = 0; row < n; ++row) {
        const float* vec = database + row * d;
        float dist = 0.0f;
        for (size_t j = 0; j < d; ++j) {
            const float diff = query[j] - vec[j];
            dist += diff * diff;
        }
        scored[row] = {row, dist};
    }

    // Only the first `top` entries need to end up in sorted order.
    const size_t top = std::min(k, n);
    std::partial_sort(scored.begin(), scored.begin() + top, scored.end());
    scored.resize(top);
    return scored;
}

2. C++ Locality-Sensitive Hashing (LSH)

Locality-Sensitive Hashing uses random hyperplane projection to build probabilistic buckets. Similar vectors have a mathematically high probability of landing in the same bucket via identical binary signature patterns.

src/cpp/lsh.hpp
/**
 * Locality-Sensitive Hashing (LSH) for approximate nearest neighbor search.
 *
 * Implements random-hyperplane LSH (cosine similarity) and
 * p-stable distribution LSH (Euclidean distance).
 *
 * Compile: g++ -std=c++17 -O3 -o lsh lsh.cpp
 */

#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <functional>
#include <numeric>
#include <random>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

/**
 * Hash signature — the sequence of per-hyperplane bits (or, for
 * p-stable LSH below, integer bucket indices) that identifies one
 * bucket in one hash table.
 */
struct HashSignature {
  std::vector<int> bits;

  bool operator==(const HashSignature &other) const {
    return bits == other.bits;
  }
};

/**
 * Hash functor so HashSignature can key an unordered_map.
 * Uses the boost-style hash_combine mixing constant 0x9e3779b9.
 */
struct HashSignatureHash {
  size_t operator()(const HashSignature &sig) const {
    size_t seed = sig.bits.size();
    for (int b : sig.bits) {
      seed ^= std::hash<int>{}(b) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    }
    return seed;
  }
};

/**
 * Random Hyperplane LSH for cosine similarity.
 *
 * Each hash function: h(x) = sign(r · x)
 * where r is a random Gaussian vector.
 *
 * Collision probability:
 *   Pr[h(x) = h(y)] = 1 - θ(x,y) / π
 *
 * Parameters:
 *   dim         — dimensionality of input vectors
 *   num_tables  — number of hash tables L (more tables → higher recall)
 *   num_hashes  — hash bits per table k (more bits → higher precision)
 */
class RandomHyperplaneLSH {
public:
  RandomHyperplaneLSH(size_t dim, size_t num_tables = 10, size_t num_hashes = 8)
      : dim_(dim), num_tables_(num_tables), num_hashes_(num_hashes) {
    // Fixed seed: index construction is reproducible across runs.
    std::mt19937 rng(42);
    std::normal_distribution<float> normal(0.0f, 1.0f);

    // One flat array of (num_hashes × dim) Gaussian entries per table.
    hyperplanes_.resize(num_tables);
    for (auto &planes : hyperplanes_) {
      planes.resize(num_hashes * dim);
      for (size_t j = 0; j < planes.size(); ++j) {
        planes[j] = normal(rng);
      }
    }

    tables_.resize(num_tables);
  }

  /**
   * Index a dataset: keep a copy of the vectors and bucket every
   * vector into each of the L tables under its per-table signature.
   * Calling build() again replaces the previous contents.
   */
  void build(const std::vector<std::vector<float>> &vectors) {
    vectors_ = vectors;
    for (auto &table : tables_) {
      table.clear();
    }

    for (size_t t = 0; t < num_tables_; ++t) {
      for (size_t i = 0; i < vectors.size(); ++i) {
        tables_[t][hash(vectors[i], t)].push_back(i);
      }
    }
  }

  /**
   * Query for k approximate nearest neighbors:
   * 1. hash the query into every table,
   * 2. union the ids from each matching bucket,
   * 3. re-rank candidates by exact cosine similarity.
   */
  std::vector<size_t> query(const std::vector<float> &q, size_t k) const {
    std::unordered_set<size_t> candidates;
    for (size_t t = 0; t < num_tables_; ++t) {
      auto bucket = tables_[t].find(hash(q, t));
      if (bucket == tables_[t].end()) {
        continue;
      }
      candidates.insert(bucket->second.begin(), bucket->second.end());
    }

    // (-similarity, id) pairs sort most-similar-first; ties broken
    // by the smaller id.
    std::vector<std::pair<float, size_t>> scored;
    scored.reserve(candidates.size());
    for (size_t idx : candidates) {
      scored.emplace_back(-cosine_sim(q, vectors_[idx]), idx);
    }
    std::sort(scored.begin(), scored.end());

    const size_t out_n = std::min(k, scored.size());
    std::vector<size_t> result(out_n);
    for (size_t i = 0; i < out_n; ++i) {
      result[i] = scored[i].second;
    }
    return result;
  }

  size_t num_vectors() const { return vectors_.size(); }

private:
  // Signature bit h is 1 iff vec lies on the positive side of
  // hyperplane h of the given table.
  HashSignature hash(const std::vector<float> &vec, size_t table_idx) const {
    HashSignature sig;
    sig.bits.reserve(num_hashes_);
    for (size_t h = 0; h < num_hashes_; ++h) {
      const float *plane = hyperplanes_[table_idx].data() + h * dim_;
      float dot = 0.0f;
      for (size_t d = 0; d < dim_; ++d) {
        dot += plane[d] * vec[d];
      }
      sig.bits.push_back(dot > 0.0f ? 1 : 0);
    }
    return sig;
  }

  // Exact cosine similarity, used only for candidate re-ranking.
  static float cosine_sim(const std::vector<float> &a,
                          const std::vector<float> &b) {
    float dot = 0, na = 0, nb = 0;
    for (size_t i = 0; i < a.size(); ++i) {
      dot += a[i] * b[i];
      na += a[i] * a[i];
      nb += b[i] * b[i];
    }
    float denom = std::sqrt(na) * std::sqrt(nb);
    return (denom > 1e-10f) ? dot / denom : 0.0f;
  }

  size_t dim_, num_tables_, num_hashes_;
  std::vector<std::vector<float>> hyperplanes_; // [table][hash * dim + d]
  std::vector<
      std::unordered_map<HashSignature, std::vector<size_t>, HashSignatureHash>>
      tables_;
  std::vector<std::vector<float>> vectors_;
};

/**
 * p-stable (Gaussian) distribution LSH for Euclidean distance.
 *
 * Hash function: h(x) = floor((a · x + b) / w)
 * with a ~ N(0, I) and b ~ Uniform(0, w), so nearby points
 * (relative to bucket width w) tend to share bucket indices.
 *
 * Parameters:
 *   bucket_width — width of hash buckets (w)
 */
class EuclideanLSH {
public:
  EuclideanLSH(size_t dim, size_t num_tables = 10, size_t num_hashes = 8,
               float bucket_width = 4.0f)
      : dim_(dim), num_tables_(num_tables), num_hashes_(num_hashes),
        w_(bucket_width) {
    // Fixed seed for reproducible index construction.
    std::mt19937 rng(123);
    std::normal_distribution<float> normal(0.0f, 1.0f);
    std::uniform_real_distribution<float> uniform(0.0f, bucket_width);

    projections_.resize(num_tables);
    offsets_.resize(num_tables);
    for (size_t t = 0; t < num_tables; ++t) {
      projections_[t].resize(num_hashes * dim);
      offsets_[t].resize(num_hashes);
      // Per table: draw all projection entries first, then offsets.
      for (size_t j = 0; j < projections_[t].size(); ++j)
        projections_[t][j] = normal(rng);
      for (size_t j = 0; j < offsets_[t].size(); ++j)
        offsets_[t][j] = uniform(rng);
    }
    tables_.resize(num_tables);
  }

  /**
   * Store the dataset and bucket every vector into all tables.
   * Calling build() again replaces the previous contents.
   */
  void build(const std::vector<std::vector<float>> &vectors) {
    vectors_ = vectors;
    for (auto &table : tables_)
      table.clear();

    for (size_t t = 0; t < num_tables_; ++t) {
      for (size_t i = 0; i < vectors.size(); ++i) {
        tables_[t][hash(vectors[i], t)].push_back(i);
      }
    }
  }

  /**
   * Query: gather candidate ids from the query's bucket in every
   * table, then re-rank candidates by exact squared L2 distance.
   */
  std::vector<size_t> query(const std::vector<float> &q, size_t k) const {
    std::unordered_set<size_t> candidates;
    for (size_t t = 0; t < num_tables_; ++t) {
      auto bucket = tables_[t].find(hash(q, t));
      if (bucket == tables_[t].end())
        continue;
      candidates.insert(bucket->second.begin(), bucket->second.end());
    }

    // (distance², id) pairs sort closest-first; ties by smaller id.
    std::vector<std::pair<float, size_t>> scored;
    scored.reserve(candidates.size());
    for (size_t idx : candidates)
      scored.emplace_back(l2_squared(q, vectors_[idx]), idx);
    std::sort(scored.begin(), scored.end());

    const size_t out_n = std::min(k, scored.size());
    std::vector<size_t> result(out_n);
    for (size_t i = 0; i < out_n; ++i)
      result[i] = scored[i].second;
    return result;
  }

private:
  // Bucket index h = floor((a·vec + b) / w) for hash h of the table.
  HashSignature hash(const std::vector<float> &vec, size_t table_idx) const {
    HashSignature sig;
    sig.bits.reserve(num_hashes_);
    for (size_t h = 0; h < num_hashes_; ++h) {
      const float *a = projections_[table_idx].data() + h * dim_;
      float proj = offsets_[table_idx][h];
      for (size_t d = 0; d < dim_; ++d)
        proj += a[d] * vec[d];
      sig.bits.push_back(static_cast<int>(std::floor(proj / w_)));
    }
    return sig;
  }

  // Squared L2 — monotone in true distance, so ranking is unaffected.
  static float l2_squared(const std::vector<float> &a,
                          const std::vector<float> &b) {
    float sum = 0;
    for (size_t i = 0; i < a.size(); ++i) {
      float diff = a[i] - b[i];
      sum += diff * diff;
    }
    return sum;
  }

  size_t dim_, num_tables_, num_hashes_;
  float w_;
  std::vector<std::vector<float>> projections_;
  std::vector<std::vector<float>> offsets_;
  std::vector<
      std::unordered_map<HashSignature, std::vector<size_t>, HashSignatureHash>>
      tables_;
  std::vector<std::vector<float>> vectors_;
};

3. C++ Product Quantization (PQ)

Product Quantization splits high-dimensional vectors into $M$ sub-spaces and trains a k-means dictionary codebook for each chunk. In the configuration shown (128-dim float32 → 8 one-byte codes), the memory footprint shrinks by 64x.

src/cpp/pq.hpp
/**
 * Product Quantization (PQ) for vector compression and approximate search.
 *
 * Decomposes d-dimensional vectors into M subspaces and quantizes
 * each independently using k-means clustering.
 *
 * Compile: g++ -std=c++17 -O3 -o pq pq.hpp
 */

#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>
#include <numeric>
#include <random>
#include <vector>

/**
 * Product Quantizer.
 *
 * Compresses d-dimensional float vectors into M bytes (one code per subspace).
 *
 * Compression ratio: d × 4 bytes → M bytes (e.g. 128-dim × 4B = 512B → 8B).
 *
 * Workflow:
 *   1. train()  — learn M codebooks of K centroids each via k-means
 *   2. encode() — compress vectors to PQ codes (M uint8 per vector)
 *   3. search() — approximate distance using ADC lookup tables
 *
 * Asymmetric Distance Computation (ADC):
 *   dist(q, x̃)² ≈ Σ_m || q^(m) - c_{code_m}^(m) ||²
 *   Precompute a (M × K) distance table, then M lookups per candidate.
 */
class ProductQuantizer {
public:
  /**
   * @param dim input dimensionality; must be divisible by M
   * @param M   number of subspaces (= bytes per encoded vector)
   * @param K   centroids per codebook; must be <= 256 so a centroid
   *            index fits in one uint8_t code
   */
  ProductQuantizer(size_t dim, size_t M = 8, size_t K = 256)
      : dim_(dim), M_(M), K_(K), ds_(dim / M) {
    assert(dim % M == 0 && "dim must be divisible by M");
    // BUG GUARD: codes are stored as uint8_t, so K > 256 would make
    // encode() silently wrap centroid indices and corrupt all codes.
    assert(K <= 256 && "K must be <= 256 to fit in uint8_t codes");
    codebooks_.resize(
        M, std::vector<std::vector<float>>(K, std::vector<float>(ds_)));
  }

  /**
   * Train codebooks with Lloyd's k-means, independently per subspace.
   *
   * @param data   training vectors, each of length dim
   * @param n_iter number of assign/update iterations
   *
   * Centroids are seeded from a shuffled sample of the data (fixed
   * RNG seed 42 for reproducibility). If data.size() < K, seeds wrap
   * around and some centroids start out duplicated.
   */
  void train(const std::vector<std::vector<float>> &data, size_t n_iter = 25) {
    assert(!data.empty() && "train() needs at least one vector");
    std::mt19937 rng(42);
    size_t n = data.size();

    for (size_t m = 0; m < M_; ++m) {
      // Extract sub-vectors for subspace m
      std::vector<std::vector<float>> sub(n, std::vector<float>(ds_));
      for (size_t i = 0; i < n; ++i) {
        for (size_t d = 0; d < ds_; ++d) {
          sub[i][d] = data[i][m * ds_ + d];
        }
      }

      // Initialize centroids from a random permutation of the data
      std::vector<size_t> indices(n);
      std::iota(indices.begin(), indices.end(), 0);
      std::shuffle(indices.begin(), indices.end(), rng);
      for (size_t k = 0; k < K_; ++k) {
        codebooks_[m][k] = sub[indices[k % n]];
      }

      // k-means iterations
      std::vector<size_t> assignments(n);
      for (size_t iter = 0; iter < n_iter; ++iter) {
        // Assign: each sub-vector goes to its nearest centroid
        // (ties keep the lowest centroid index).
        for (size_t i = 0; i < n; ++i) {
          float best_dist = std::numeric_limits<float>::max();
          for (size_t k = 0; k < K_; ++k) {
            float d = l2_sq(sub[i], codebooks_[m][k]);
            if (d < best_dist) {
              best_dist = d;
              assignments[i] = k;
            }
          }
        }

        // Update: move each centroid to the mean of its members;
        // empty centroids keep their previous position.
        std::vector<std::vector<float>> sums(K_, std::vector<float>(ds_, 0));
        std::vector<size_t> counts(K_, 0);
        for (size_t i = 0; i < n; ++i) {
          size_t k = assignments[i];
          counts[k]++;
          for (size_t d = 0; d < ds_; ++d) {
            sums[k][d] += sub[i][d];
          }
        }
        for (size_t k = 0; k < K_; ++k) {
          if (counts[k] > 0) {
            for (size_t d = 0; d < ds_; ++d) {
              codebooks_[m][k][d] = sums[k][d] / counts[k];
            }
          }
        }
      }
    }
    trained_ = true;
  }

  /**
   * Encode vectors to PQ codes: for each subspace, the index of the
   * nearest centroid (M uint8 per vector).
   */
  std::vector<std::vector<uint8_t>>
  encode(const std::vector<std::vector<float>> &data) const {
    assert(trained_);
    size_t n = data.size();
    std::vector<std::vector<uint8_t>> codes(n, std::vector<uint8_t>(M_));

    for (size_t i = 0; i < n; ++i) {
      for (size_t m = 0; m < M_; ++m) {
        float best_dist = std::numeric_limits<float>::max();
        uint8_t best_k = 0;
        for (size_t k = 0; k < K_; ++k) {
          float d = 0;
          for (size_t dd = 0; dd < ds_; ++dd) {
            float diff = data[i][m * ds_ + dd] - codebooks_[m][k][dd];
            d += diff * diff;
          }
          if (d < best_dist) {
            best_dist = d;
            // Safe: constructor guarantees K_ <= 256, so k < 256 here.
            best_k = static_cast<uint8_t>(k);
          }
        }
        codes[i][m] = best_k;
      }
    }
    return codes;
  }

  /**
   * Decode PQ codes back to the approximate (centroid-concatenated)
   * vector — the reconstruction k-means quantized to.
   */
  std::vector<float> decode(const std::vector<uint8_t> &code) const {
    assert(trained_);
    std::vector<float> vec(dim_);
    for (size_t m = 0; m < M_; ++m) {
      for (size_t d = 0; d < ds_; ++d) {
        vec[m * ds_ + d] = codebooks_[m][code[m]][d];
      }
    }
    return vec;
  }

  /**
   * Asymmetric Distance Computation (ADC) search.
   *
   * 1. Build distance table: dist_table[m][k] = ||q_m - c_mk||²
   * 2. For each database code, sum M table lookups
   * 3. Return the top-k nearest (true-L2, i.e. square-rooted) results
   *
   * Complexity: O(M·K·ds) to build table + O(n·M) to scan
   */
  struct SearchResult {
    float distance;
    size_t id;
    bool operator<(const SearchResult &o) const {
      return distance < o.distance;
    }
  };

  std::vector<SearchResult>
  search_adc(const std::vector<float> &query,
             const std::vector<std::vector<uint8_t>> &codes, size_t k) const {
    assert(trained_);
    size_t n = codes.size();

    // Precompute distance table: M × K
    std::vector<std::vector<float>> dist_table(M_, std::vector<float>(K_));
    for (size_t m = 0; m < M_; ++m) {
      for (size_t kk = 0; kk < K_; ++kk) {
        float d = 0;
        for (size_t dd = 0; dd < ds_; ++dd) {
          float diff = query[m * ds_ + dd] - codebooks_[m][kk][dd];
          d += diff * diff;
        }
        dist_table[m][kk] = d;
      }
    }

    // Approximate distances: M table lookups per database code
    std::vector<SearchResult> results(n);
    for (size_t i = 0; i < n; ++i) {
      float d = 0;
      for (size_t m = 0; m < M_; ++m) {
        d += dist_table[m][codes[i][m]];
      }
      results[i] = {std::sqrt(d), i};
    }

    // Partial sort for top-k
    k = std::min(k, n);
    std::partial_sort(results.begin(), results.begin() + k, results.end());
    results.resize(k);
    return results;
  }

private:
  // Squared L2 between two equal-length vectors.
  static float l2_sq(const std::vector<float> &a, const std::vector<float> &b) {
    float s = 0;
    for (size_t i = 0; i < a.size(); ++i) {
      float d = a[i] - b[i];
      s += d * d;
    }
    return s;
  }

  size_t dim_, M_, K_, ds_;
  bool trained_ = false;
  // codebooks_[m][k][d] = centroid value
  std::vector<std::vector<std::vector<float>>> codebooks_;
};

4. C++ HNSW Graph

Hierarchical Navigable Small World graphs are multi-layer proximity graphs. Search jumps layer by layer (skipping massive distances like an interstate highway) until dropping to Layer 0 to execute dense local navigation.

src/cpp/hnsw.hpp
/**
 * Hierarchical Navigable Small World (HNSW) graph index.
 *
 * A complete C++ implementation of the HNSW algorithm.
 * Reference: Malkov & Yashunin, "Efficient and Robust Approximate
 *            Nearest Neighbor Search Using Hierarchical Navigable
 *            Small World Graphs" (IEEE TPAMI, 2020)
 *
 * Compile: g++ -std=c++17 -O3 -o hnsw hnsw.hpp
 */

#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <limits>
#include <queue>
#include <random>
#include <unordered_set>
#include <vector>

/**
 * HNSW Index for approximate nearest neighbor search.
 *
 * Builds a multi-layer navigable small-world graph.
 * - Layer 0 is the densest (all nodes present)
 * - Higher layers are exponentially sparser
 * - Search descends from top layer greedily, then does
 *   beam search at layer 0.
 *
 * Template parameters are avoided for clarity; uses float vectors.
 *
 * Key parameters:
 *   M               — max edges per node per layer (default 16)
 *   M_max0          — max edges at layer 0 (default 2*M)
 *   ef_construction — beam width during build
 *   ef_search       — beam width during query
 *   mL              — level generation factor = 1/ln(M)
 *
 * NOTE(review): no synchronization is present anywhere in this class,
 * so concurrent insert/search would need external locking. Deletions
 * are not supported; ids are append-only indices into vectors_.
 */
class HNSWIndex {
public:
  // (distance, id) pair. Distances are SQUARED L2 internally; only
  // search() converts them to true Euclidean before returning.
  // Both comparison directions are provided so the pair works as a
  // min-heap (via std::greater) and a max-heap (default) element.
  struct SearchResult {
    float distance;
    size_t id;
    bool operator>(const SearchResult &o) const {
      return distance > o.distance;
    }
    bool operator<(const SearchResult &o) const {
      return distance < o.distance;
    }
  };

  // Fixed RNG seed (42) makes level assignment — and therefore graph
  // construction — deterministic for a given insertion order.
  HNSWIndex(size_t dim, size_t M = 16, size_t ef_construction = 200,
            size_t ef_search = 50)
      : dim_(dim), M_(M), M_max0_(2 * M), ef_construction_(ef_construction),
        ef_search_(ef_search), mL_(1.0 / std::log(static_cast<double>(M))),
        entry_point_(NONE), max_layer_(0), rng_(42), uniform_(0.0, 1.0) {}

  /**
   * Insert a single vector into the index; returns its id
   * (the vector's position in insertion order).
   *
   * Algorithm (simplified):
   * 1. Draw random level l ~ Geometric(mL)
   * 2. From entry point, greedily descend to layer l+1
   * 3. At each layer [l..0], beam-search for ef_construction
   *    neighbors and add bidirectional edges
   */
  size_t insert(const std::vector<float> &vec) {
    size_t id = vectors_.size();
    vectors_.push_back(vec);

    int level = random_level();

    // Grow graph layers if needed
    while (static_cast<int>(graph_.size()) <= level) {
      graph_.emplace_back();
    }

    // First element: becomes the entry point; no edges to create.
    if (entry_point_ == NONE) {
      entry_point_ = id;
      max_layer_ = level;
      for (int l = 0; l <= level; ++l) {
        if (graph_[l].size() <= id)
          graph_[l].resize(id + 1);
      }
      return id;
    }

    // Ensure node has adjacency lists at all its layers
    for (int l = 0; l <= level; ++l) {
      if (graph_[l].size() <= id)
        graph_[l].resize(id + 1);
    }

    size_t current = entry_point_;

    // Phase 1: Greedy descent from max_layer to level+1
    // (beam width 1 — just track the single closest node per layer)
    for (int l = max_layer_; l > level; --l) {
      auto nearest = search_layer(vec, current, 1, l);
      if (!nearest.empty())
        current = nearest[0].id;
    }

    // Phase 2: Insert at layers [min(level, max_layer)..0]
    for (int l = std::min(level, max_layer_); l >= 0; --l) {
      auto candidates = search_layer(vec, current, ef_construction_, l);
      // Layer 0 allows more edges (M_max0_) than upper layers (M_)
      size_t M_max = (l == 0) ? M_max0_ : M_;
      auto neighbors = select_neighbors(candidates, M_max);

      // Add bidirectional edges
      for (auto &nb : neighbors) {
        graph_[l][id].push_back(nb.id);
        graph_[l][nb.id].push_back(id);

        // Prune neighbor if over capacity (keeps its M_max closest)
        if (graph_[l][nb.id].size() > M_max) {
          prune(nb.id, l, M_max);
        }
      }

      // candidates are sorted ascending, so [0] is the closest —
      // use it as the entry point for the next (lower) layer
      if (!candidates.empty())
        current = candidates[0].id;
    }

    // New top-level node becomes the global entry point
    if (level > max_layer_) {
      entry_point_ = id;
      max_layer_ = level;
    }

    return id;
  }

  /**
   * Search for k approximate nearest neighbors.
   *
   * 1. Greedy descent from top layer to layer 1
   * 2. Beam search at layer 0 with ef = max(ef_search, k)
   * 3. Return top-k results sorted by (true, non-squared) distance
   */
  std::vector<SearchResult> search(const std::vector<float> &query,
                                   size_t k) const {
    if (entry_point_ == NONE)
      return {};

    size_t current = entry_point_;

    // Descend from top
    for (int l = max_layer_; l > 0; --l) {
      auto nearest = search_layer(query, current, 1, l);
      if (!nearest.empty())
        current = nearest[0].id;
    }

    size_t ef = std::max(ef_search_, k);
    auto results = search_layer(query, current, ef, 0);

    // Truncate to k (results are sorted ascending), then take sqrt
    // to convert internal squared distances to actual Euclidean.
    if (results.size() > k)
      results.resize(k);
    for (auto &r : results) {
      r.distance = std::sqrt(r.distance);
    }
    return results;
  }

  /**
   * Bulk insert all vectors (one insert() per vector, in order).
   */
  void build(const std::vector<std::vector<float>> &vectors) {
    for (const auto &v : vectors)
      insert(v);
  }

  size_t size() const { return vectors_.size(); }
  size_t num_layers() const { return graph_.size(); }

  // Raise for higher recall at query time, lower for speed.
  void set_ef_search(size_t ef) { ef_search_ = ef; }

private:
  // Sentinel for "no entry point yet" (empty index).
  static constexpr size_t NONE = std::numeric_limits<size_t>::max();

  // Squared L2 over the first dim_ components (no sqrt; ordering
  // is unaffected, and search() converts at the end).
  float distance_sq(const std::vector<float> &a,
                    const std::vector<float> &b) const {
    float sum = 0.0f;
    for (size_t i = 0; i < dim_; ++i) {
      float d = a[i] - b[i];
      sum += d * d;
    }
    return sum;
  }

  // Geometric-like level draw: floor(-ln(U) * mL), so each successive
  // level is exponentially less likely (the HNSW layer distribution).
  int random_level() {
    return static_cast<int>(-std::log(uniform_(rng_)) * mL_);
  }

  /**
   * Beam search in a single layer.
   * Returns up to ef nearest elements, sorted by distance (ascending).
   *
   * Maintains two heaps:
   *   - candidates: min-heap of nodes still to expand (closest first)
   *   - results:    max-heap of the best ef found so far; its top is
   *                 the current worst, used as the pruning bound
   */
  std::vector<SearchResult> search_layer(const std::vector<float> &query,
                                         size_t entry, size_t ef,
                                         int layer) const {
    // Guard against layers/nodes that don't exist yet
    if (layer >= static_cast<int>(graph_.size()))
      return {};
    if (entry >= graph_[layer].size())
      return {};

    std::unordered_set<size_t> visited;
    visited.insert(entry);

    float d = distance_sq(query, vectors_[entry]);

    // candidates: min-heap (closest first)
    std::priority_queue<SearchResult, std::vector<SearchResult>,
                        std::greater<SearchResult>>
        candidates;
    // results: max-heap (farthest first, for bounding)
    std::priority_queue<SearchResult> results;

    candidates.push({d, entry});
    results.push({d, entry});

    while (!candidates.empty()) {
      auto [c_dist, c_id] = candidates.top();
      candidates.pop();

      // Termination: the closest unexpanded candidate is already
      // farther than the worst kept result — no improvement possible.
      float farthest = results.top().distance;
      if (c_dist > farthest)
        break;

      // Expand neighbors
      if (c_id < graph_[layer].size()) {
        for (size_t nb : graph_[layer][c_id]) {
          if (visited.count(nb))
            continue;
          visited.insert(nb);

          float nb_dist = distance_sq(query, vectors_[nb]);
          // Re-read the bound: results may have changed this iteration
          farthest = results.top().distance;

          // Keep nb if it beats the current worst, or if the result
          // set isn't full yet; evict the worst when over ef.
          if (nb_dist < farthest || results.size() < ef) {
            candidates.push({nb_dist, nb});
            results.push({nb_dist, nb});
            if (results.size() > ef)
              results.pop();
          }
        }
      }
    }

    // Extract sorted results
    std::vector<SearchResult> out;
    out.reserve(results.size());
    while (!results.empty()) {
      out.push_back(results.top());
      results.pop();
    }
    std::sort(out.begin(), out.end());
    return out;
  }

  // Simple neighbor selection: just the M closest candidates.
  // (The paper's heuristic variant additionally considers diversity.)
  std::vector<SearchResult>
  select_neighbors(const std::vector<SearchResult> &candidates,
                   size_t M) const {
    auto sorted = candidates;
    std::sort(sorted.begin(), sorted.end());
    if (sorted.size() > M)
      sorted.resize(M);
    return sorted;
  }

  // Shrink a node's adjacency list back to its M_max closest neighbors
  // (distance-based pruning; edges to farther nodes are dropped,
  // which may leave the reverse edge dangling one-way — acceptable
  // for this educational implementation).
  void prune(size_t node, int layer, size_t M_max) {
    auto &adj = graph_[layer][node];
    std::vector<SearchResult> scored;
    for (size_t nb : adj) {
      scored.push_back({distance_sq(vectors_[node], vectors_[nb]), nb});
    }
    std::sort(scored.begin(), scored.end());
    adj.clear();
    for (size_t i = 0; i < std::min(M_max, scored.size()); ++i) {
      adj.push_back(scored[i].id);
    }
  }

  size_t dim_;
  size_t M_, M_max0_;
  size_t ef_construction_, ef_search_;
  double mL_;             // level-generation factor 1/ln(M)
  size_t entry_point_;    // id of the node at the top layer (NONE if empty)
  int max_layer_;         // highest layer currently in use

  std::mt19937 rng_;
  std::uniform_real_distribution<double> uniform_;

  std::vector<std::vector<float>> vectors_;
  // graph_[layer][node_id] = list of neighbor IDs
  std::vector<std::vector<std::vector<size_t>>> graph_;
};

5. C++ Inverted File Index (IVF)

IVF uses k-means to partition the embedding space into distinct "Voronoi cells". At query time, the database routes the query to the nprobe nearest cell centroids, skipping computation in faraway regions entirely.

src/cpp/ivf.hpp
/**
 * Inverted File Index (IVF) for approximate nearest neighbor search.
 *
 * Partitions vector space into Voronoi cells using k-means,
 * then searches only the nprobe nearest cells at query time.
 *
 * Compile: g++ -std=c++17 -O3 -o ivf ivf.hpp
 */

#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <limits>
#include <numeric>
#include <random>
#include <vector>

/**
 * Inverted File Index.
 *
 * Search complexity: O(nprobe × n/nlist × d) vs O(n × d) brute-force.
 *
 * Parameters:
 *   nlist  — number of Voronoi cells (centroids)
 *   nprobe — number of cells to search (trade-off: recall vs speed)
 */
class IVFIndex {
public:
  struct SearchResult {
    float distance;
    size_t id;
    bool operator<(const SearchResult &o) const {
      return distance < o.distance;
    }
  };

  IVFIndex(size_t dim, size_t nlist = 100, size_t nprobe = 10)
      : dim_(dim), nlist_(nlist), nprobe_(nprobe) {
    inverted_lists_.resize(nlist);
  }

  /**
   * Train centroids using k-means.
   */
  void train(const std::vector<std::vector<float>> &data, size_t n_iter = 20) {
    size_t n = data.size();
    centroids_.resize(nlist_, std::vector<float>(dim_));

    std::mt19937 rng(42);
    std::vector<size_t> indices(n);
    std::iota(indices.begin(), indices.end(), 0);
    std::shuffle(indices.begin(), indices.end(), rng);

    for (size_t c = 0; c < nlist_; ++c) {
      centroids_[c] = data[indices[c % n]];
    }

    std::vector<size_t> assignments(n);
    for (size_t iter = 0; iter < n_iter; ++iter) {
      // Assign to nearest centroid
      for (size_t i = 0; i < n; ++i) {
        float best = std::numeric_limits<float>::max();
        for (size_t c = 0; c < nlist_; ++c) {
          float d = l2_sq(data[i], centroids_[c]);
          if (d < best) {
            best = d;
            assignments[i] = c;
          }
        }
      }
      // Update centroids
      std::vector<std::vector<float>> sums(nlist_, std::vector<float>(dim_, 0));
      std::vector<size_t> counts(nlist_, 0);
      for (size_t i = 0; i < n; ++i) {
        counts[assignments[i]]++;
        for (size_t d = 0; d < dim_; ++d)
          sums[assignments[i]][d] += data[i][d];
      }
      for (size_t c = 0; c < nlist_; ++c) {
        if (counts[c] > 0) {
          for (size_t d = 0; d < dim_; ++d)
            centroids_[c][d] = sums[c][d] / counts[c];
        }
      }
    }
    trained_ = true;
  }

  /**
   * Add vectors to the inverted lists.
   */
  void add(const std::vector<std::vector<float>> &data) {
    assert(trained_);
    vectors_ = data;
    for (auto &list : inverted_lists_)
      list.clear();

    for (size_t i = 0; i < data.size(); ++i) {
      float best = std::numeric_limits<float>::max();
      size_t best_c = 0;
      for (size_t c = 0; c < nlist_; ++c) {
        float d = l2_sq(data[i], centroids_[c]);
        if (d < best) {
          best = d;
          best_c = c;
        }
      }
      inverted_lists_[best_c].push_back(i);
    }
  }

  /**
   * Search for k approximate nearest neighbors.
   *
   * 1. Find nprobe nearest centroids
   * 2. Scan vectors in those cells
   * 3. Return top-k
   */
  std::vector<SearchResult> search(const std::vector<float> &query,
                                   size_t k) const {
    assert(trained_);

    // Find nearest centroids
    std::vector<std::pair<float, size_t>> centroid_dists(nlist_);
    for (size_t c = 0; c < nlist_; ++c) {
      centroid_dists[c] = {l2_sq(query, centroids_[c]), c};
    }
    std::partial_sort(centroid_dists.begin(),
                      centroid_dists.begin() + std::min(nprobe_, nlist_),
                      centroid_dists.end());

    // Collect and score candidates
    std::vector<SearchResult> candidates;
    for (size_t p = 0; p < std::min(nprobe_, nlist_); ++p) {
      size_t cell = centroid_dists[p].second;
      for (size_t idx : inverted_lists_[cell]) {
        float d = std::sqrt(l2_sq(query, vectors_[idx]));
        candidates.push_back({d, idx});
      }
    }

    k = std::min(k, candidates.size());
    if (k > 0) {
      std::partial_sort(candidates.begin(), candidates.begin() + k,
                        candidates.end());
      candidates.resize(k);
    }
    return candidates;
  }

  void set_nprobe(size_t nprobe) { nprobe_ = nprobe; }
  size_t size() const { return vectors_.size(); }

private:
  static float l2_sq(const std::vector<float> &a, const std::vector<float> &b) {
    float s = 0;
    for (size_t i = 0; i < a.size(); ++i) {
      float d = a[i] - b[i];
      s += d * d;
    }
    return s;
  }

  size_t dim_, nlist_, nprobe_;
  bool trained_ = false;
  std::vector<std::vector<float>> centroids_;
  std::vector<std::vector<size_t>> inverted_lists_;
  std::vector<std::vector<float>> vectors_;
};