p0_C++_Primer

Author: lorixyu

Created: 2026-01-13

Updated: 2026-01-17

count_min_sketch.h

//===----------------------------------------------------------------------===//
//
//                         BusTub
//
// count_min_sketch.h
//
// Identification: src/include/primer/count_min_sketch.h
//
// Copyright (c) 2015-2025, Carnegie Mellon University Database Group
//
//===----------------------------------------------------------------------===//

#pragma once

#include <cstdint>

#include <functional>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <utility>
#include <vector>

#include "common/util/hash_util.h"

namespace bustub {

template <typename KeyType>
class CountMinSketch {
 public:
  /** @brief Constructs a count-min sketch with specified dimensions
   * @param width Number of buckets
   * @param depth Number of hash functions
   */
  explicit CountMinSketch(uint32_t width, uint32_t depth);

  CountMinSketch() = delete;                                            // Default constructor deleted
  CountMinSketch(const CountMinSketch &) = delete;                      // Copy constructor deleted
  auto operator=(const CountMinSketch &) -> CountMinSketch & = delete;  // Copy assignment deleted

  CountMinSketch(CountMinSketch &&other) noexcept;                      // Move constructor
  auto operator=(CountMinSketch &&other) noexcept -> CountMinSketch &;  // Move assignment

  /**
   * @brief Inserts an item into the count-min sketch
   *
   * @param item The item to increment the count for
   * @note Updates the min-heap at the same time
   */
  void Insert(const KeyType &item);

  /**
   * @brief Gets the estimated count of an item
   *
   * @param item The item to look up
   * @return The estimated count
   */
  auto Count(const KeyType &item) const -> uint32_t;

  /**
   * @brief Resets the sketch to initial empty state
   *
   * @note Clears the sketch matrix, item set, and top-k min-heap
   */
  void Clear();

  /**
   * @brief Merges the current CountMinSketch with another, updating the current sketch
   * with combined data from both sketches.
   *
   * @param other The other CountMinSketch to merge with.
   * @throws std::invalid_argument if the sketches' dimensions are incompatible.
   */
  void Merge(const CountMinSketch<KeyType> &other);

  /**
   * @brief Gets the top k items based on estimated counts from a list of candidates.
   *
   * @param k Number of top items to return (will be capped at initial k)
   * @param candidates List of candidate items to consider for top k
   * @return Vector of (item, count) pairs in descending count order
   */
  auto TopK(uint16_t k, const std::vector<KeyType> &candidates) -> std::vector<std::pair<KeyType, uint32_t>>;

 private:
  /** Dimensions of the count-min sketch matrix */
  uint32_t width_;  // Number of buckets for each hash function
  uint32_t depth_;  // Number of independent hash functions
  /** Pre-computed hash functions for each row */
  std::vector<std::function<size_t(const KeyType &)>> hash_functions_;

  // lorixyu
  std::vector<std::vector<uint32_t>> table_;
  mutable std::vector<std::unique_ptr<std::shared_mutex>> row_locks_;

  /** @spring2026 PLEASE DO NOT MODIFY THE FOLLOWING */
  constexpr static size_t SEED_BASE = 15445;

  /**
   * @brief Seeded hash function generator
   *
   * @param seed Used for creating independent hash functions
   * @return A function that maps items to column indices
   */
  inline auto HashFunction(size_t seed) -> std::function<size_t(const KeyType &)> {
    return [seed, this](const KeyType &item) -> size_t {
      auto h1 = std::hash<KeyType>{}(item);
      auto h2 = bustub::HashUtil::CombineHashes(seed, SEED_BASE);
      return bustub::HashUtil::CombineHashes(h1, h2) % width_;
    };
  }

  /** @todo (student) can add their data structures that support count-min sketch operations */
};

}  // namespace bustub

好的，我帮你把今天围绕 CountMinSketch 项目 的学习内容、问题、解决方案和收获做一个系统总结。内容会分模块、问题、解决方案、学到的知识点和代码规范建议。

1️⃣ 主要问题回顾

1. 插入（Insert）函数的并发安全

问题：最初尝试给整个函数加锁或用 std::vector<std::shared_lock> 来一次性锁住多行。
风险：可能出现死锁，尤其是两个线程交叉锁不同的行：
- 线程 A 锁行 0 → 等待行 1
- 线程 B 锁行 1 → 等待行 0
  → 典型死锁。
解决方案：
- 每次只锁当前行 std::unique_lock<std::shared_mutex> lock(*row_locks_[i]);
- 操作完立即释放锁，下一行再单独加锁
- 这样即便多线程并发，每行只锁一次，不会交叉等待 → 无死锁。
收获：
- 理解了死锁产生的条件：多线程、多锁、交叉等待、锁不释放
- 学会了分粒度锁设计：尽量缩小锁的作用域，提高并发性能。

2. Merge 函数的锁

问题：合并两个 CountMinSketch，需要锁住两份数据的所有行。
解决方案：
- 固定顺序锁定两份 sketch 的行，保证全局顺序一致：
  1
  2
  3
  const CountMinSketch* first = this;
  const CountMinSketch* second = &other;
  if (first > second) std::swap(first, second);
- 使用 std::vector<std::unique_lock<std::shared_mutex>> locks; 来按顺序锁行，避免死锁。
收获：
- 学会了 多对象多行锁的死锁预防策略：按照地址或固定顺序锁，保证线程之间一致。
- 学会了在并发操作中锁的顺序和粒度的重要性。

3. TopK 和 Count 函数的锁

问题：
- TopK 要统计多行的频率，需要对行加读锁。
- 如果 TopK 用写锁，插入会被阻塞，性能下降。
解决方案：
- 使用 std::shared_lock<std::shared_mutex> 对每行加读锁
- 插入时用写锁，TopK/Count 时用读锁 → 读写分离。
收获：
- 学会了 读写锁（shared_mutex） 的使用：
  - std::unique_lock → 写锁
  - std::shared_lock → 读锁
- 能够实现高并发读、独占写的设计。

4. std::vector 和 unique_ptr 使用问题

问题：
- row_locks_ 用 std::vector<std::shared_mutex> 会无法拷贝/移动
- emplace_back 对 {} 的推导问题
解决方案：
- 使用 std::vector<std::unique_ptr<std::shared_mutex>> row_locks_;
- 对每行锁做 make_unique 初始化
- 使用时 *row_locks_[i] 获取原 mutex
收获：
- 理解了 指针容器 vs 对象容器 在移动语义和多态上的差别
- 理解了 emplace_back 和 push_back 在模板推导上的细微差异

5. Clear 函数

问题：
- 清零表格需要锁吗？
解决方案：
- 如果 Clear 会在多线程使用中调用：
  - 需要给每行加写锁
  - 或者全局写锁
- 否则单线程调用无需锁
收获：
- 理解了 锁的作用域，锁不只是保护“写操作”，还保护“数据一致性”

6. 并发性能问题

问题：
- ContentRatioTest 测试多线程插入速度
- 你的版本多线程比单线程还慢（0.91x）
原因：
- Insert 中每次循环都加/释放写锁，导致锁频繁申请和释放，开销大
- 锁粒度还可以优化，比如 批量锁或者减少重复计算
收获：
- 高并发结构不仅要安全，还要考虑锁开销和热点行竞争
- 设计思路：
  - 粒度锁：每行独立
  - 避免重复加锁
  - 读写分离

2️⃣ 今天学到的新知识点

死锁的产生条件
- 多线程 + 多锁 + 锁顺序不一致 + 未释放 → 死锁
- 解决：单锁粒度、固定顺序、及时释放
读写锁 shared_mutex
- std::unique_lock<std::shared_mutex> → 写锁
- std::shared_lock<std::shared_mutex> → 读锁
- 可提高读多写少场景性能
智能指针容器
- std::vector<std::unique_ptr<T>> 支持移动，避免对象拷贝/拷贝构造问题
- 对象容器 vs 指针容器在 C++ 中的区别
Lock 的粒度设计
- 单行锁 > 全局锁，避免死锁，提高并发
- Lock 的作用域要尽量小，减少锁持有时间
TopK 实现
- 先统计 Count → 排序 → 截取前 k
- 注意锁住行，保证统计一致
并发性能调优
- 了解了锁申请/释放开销
- 多线程不一定快，需要减少竞争和锁开销

3️⃣ 代码规范/建议

锁作用域

{
    std::unique_lock<std::shared_mutex> lock(*row_locks_[i]);
    // 只操作这一行
} // 离开作用域立即释放锁

容器使用
- std::vector<std::unique_ptr<std::shared_mutex>> 推荐用于不可拷贝对象
- std::vector<std::pair<KeyType,uint32_t>> 使用 emplace_back 或 push_back
循环内不要重复加锁
- 如果只锁一行就操作一行，避免在循环中多余锁
读写锁分离
- 插入 → 写锁
- 查询 → 读锁
- 避免读阻塞写，写阻塞读
Merge 等操作顺序固定
- 对多个对象操作时，按地址/固定顺序加锁，防止死锁