C++里如何把vector<bool>或vector<unsigned char>对象的数据保存到文件,以及从文件装载回内存

我用vector做一个bloom filter,需要把生成的过滤器持久化到文件,再从文件恢复到内存。但在这个环节遇到了问题。
我写了以下把vetor bit_table_ 内容写入文件,以及从文件重载回内存的两个函数,但装载后的内容与实际内容是不一样的,以致不能通过bloom filter的检测。
#2021-12-06晚间更新:
通过网上查资料,自己琢磨测试,找到了一种方法能勉强解决上述问题。但有一个地方的写法有点蠢:其中反序列化装载函数的实现,先从文件把内容读入一个临时vector,再swap到类成员变量的写法实在是很LOW,特别是对于巨大的,非常耗内存的大型bloom filter,这种写法要占用双倍的内存,非常不可取。但我这种C++新手,一时半会又找不到更好且可行的写法,还请各路大神继续帮助,给出更好的实现方法。

img

     详见下文回复内容。

#----------------------------------


//把过滤器内存持久化到文件
       inline bool dump_tofile(string filename)
       {
           ofstream fout(filename.c_str(), ios::binary);
           if(fout.is_open()==false)
           {
               std::cout<<"Open file:" << filename <<" fail!"<<std::endl;
               return false;
           }
           std::vector<bool>::size_type n = bit_table_.size();
           fout.write((const char*)&n, sizeof(std::vector<bool>::size_type));
           for(std::vector<bool>::size_type i = 0; i < n;)
           {
               unsigned char aggr = 0;
               for(unsigned char mask = 1; mask > 0 && i < n; ++i, mask <<= 1)
                   if(bit_table_.at(i))
                       aggr |= mask;
               fout.write((const char*)&aggr, sizeof(unsigned char));
           }
           fout.close();
           std::cout << "已把"<< n/(1024*1024) <<" MB 的过滤器内容保存到文件:"<< filename << std::endl;
           return true;
       }
       //从文件中加载过滤器
       inline bool load_fromfile(string filename)
       {
           ifstream fin(filename.c_str(), ios::binary);
           if(fin.is_open()==false)
           {
               std::cout<<"Open file:" << filename <<" fail!"<<std::endl;
               return false;
           }
           std::vector<bool>::size_type n;
           fin.read((char*)&n, sizeof(std::vector<bool>::size_type));
           bit_table_.resize(n);
           for(std::vector<bool>::size_type i = 0; i < n;)
           {
               unsigned char aggr;
               fin.read((char*)&aggr, sizeof(unsigned char));
               for(unsigned char mask = 1; mask > 0 && i < n; ++i, mask <<= 1)
                   bit_table_.at(i) = aggr & mask;
           }
           fin.close();
           std::cout << "已把"<< n/(1024*1024) <<" MB 的过滤器从文件:"<< filename << "装载到内存。" << std::endl;
       }

求大神指导

我记得网上有说不要往vector里面放bool,会遇到坑的

** 通过网上查资料,自己琢磨测试,找到了一种方法能勉强解决上述问题。但有一个地方的写法有点蠢,要在内存里复制。先把我这个能跑通的代码贴出来,恳请各路大神继续帮助优化!
以下是我要实现的,把内存里的bloom filter持久化到文件,以及从文件反序列化的两个函数。其中反序列化装载的函数,先从文件把内容读入一个临时vector,再swap到类成员变量的写法实在是很LOW,特别是对于巨大的,非常耗内存的大型过滤器,这种写法也要占用双倍的内存,非常不可取。但我这种C++新手,一时半会又找不到更好且可行的写法,还请各路大神帮助给出更好的实现方法:**

        //把布隆过滤器的内容从内存持久化到文件
       inline bool dump_tofile(string filename)
       {
           ofstream fout(filename.c_str(), ios::binary);
           if(fout.is_open()==false)
           {
               std::cout<<"Open file:" << filename <<" fail!"<<std::endl;
               return false;
           }

           save_bf_struct s_bf_info;
           s_bf_info.salt_count = salt_count_;
           s_bf_info.max_capability = max_capability;
           s_bf_info.capability = projected_element_count_;
           s_bf_info.count = inserted_element_count_;
           s_bf_info.random_seed = random_seed_;
           s_bf_info.error_rate = error_rate;

           const int file_check_code = 0xabcd1234;                                //file_check_code 写入文件的开头用于简单判断是否正确格式
           fout.write((char*)&file_check_code, sizeof(int));                    //写入文件头标识符
           unsigned long long  length = bit_table_.size();                      //向量表的大小
           fout.write((char*)&length, sizeof(unsigned long long));              //vector<unsigned char>的元素个数
           unsigned long long  totalSize = length * sizeof(unsigned char);      //正文内容的大小(单位是个字节)
           fout.write((char*)&totalSize, sizeof(unsigned long long));           //写入文件内容的长度
           fout.write((char*)&s_bf_info, sizeof(save_bf_struct));               //写入过滤器的结构信息

           //cout << "要写入的文件大小为" << totalSize << "字节,有" << length << "个元素"<< endl;
           cout << "要保存的滤器设计容量为" << s_bf_info.capability << ",hash次数为" << s_bf_info.salt_count << ",错误率为"<< s_bf_info.error_rate;
           cout << ",已写入" << s_bf_info.count << "个元素" << endl;
           std::copy(bit_table_.cbegin(), bit_table_.cend(),std::ostream_iterator<unsigned char>(fout));
           fout.close();
           std::cout << "已把"<< totalSize <<" 字节的过滤器内容保存到文件:"<< filename << std::endl;
           return true;
       }
       //从文件中加载布隆过滤器的内容到内存的向量
       inline bool load_fromfile(string filename)
       {
           const int file_check_code = 0xabcd1234;                                        //file_check_code 放在文件的开头用于简单判断是否正确读取数据
           //std::ios_base::
           ifstream ifile(filename.c_str(), ios::binary);
           if(ifile.is_open()==false)
           {
               std::cout<<"Open file:" << filename <<" fail!"<<std::endl;
               return false;
           }
           int tmp_check_code;
           unsigned long long length, totalSize;
           ifile.read((char*)&tmp_check_code, sizeof(int));
           if (tmp_check_code!=file_check_code){
               cout<<"Unknow format at the begin of file...";
               ifile.close();
               return false;
           }
           save_bf_struct s_bf_info;
           ifile.read((char*)&length, sizeof(unsigned long long));
           ifile.read((char*)&totalSize, sizeof(unsigned long long));
           //读入过滤器的结构信息
           ifile.read((char*)&s_bf_info, sizeof(save_bf_struct));

           cout << "要载入的文件有"<< length << "个元素," << totalSize << "个字节" << endl;
           cout << "要载入的过滤器设计容量为" << s_bf_info.capability << "个,hash次数为" << s_bf_info.salt_count << ",错误率为"<< s_bf_info.error_rate;
           cout << "要,过滤器中已写入" << s_bf_info.count << "个元素" << endl;
           bit_table_.clear();
           bit_table_.resize(1);
           //以下这个写法能正确地从文件读入数据,但读入后再复制的做法实在太愚蠢了,怎么一次读入到类成员 bit_table_呢?
           vector<unsigned char> file_data((istreambuf_iterator<char>(ifile)),istreambuf_iterator<char>());
           bit_table_.swap(file_data);
           file_data.clear();
           ifile.close();

           this->salt_count_ = s_bf_info.salt_count  ;
           this-> max_capability = s_bf_info.max_capability;
           this-> projected_element_count_ = s_bf_info.capability ;
           this -> inserted_element_count_ = s_bf_info.count ;
           this -> random_seed_ = s_bf_info.random_seed  ;
           this -> error_rate  = s_bf_info.error_rate  ;
           std::cout << "已把"<<  bit_table_.size() << "个元素的过滤器从"<<totalSize<<" 字节的文件:"<< filename << "装载到内存。" << std::endl;
           return true;
       }

以下是bloom filter完整的实现代码 bloom_filter.hpp,这个版本可以支持100GB内存(约800亿条记录),甚至更大的容量,并且可以把过滤器内容从内存持久化到文件保存,需要时再从磁盘文件反序列化到内存中。

#ifndef INCLUDE_BLOOM_FILTER_HPP
#define INCLUDE_BLOOM_FILTER_HPP
#include <iostream>
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <string>
#include <fstream>
#include <istream>
#include <sstream>
#include <cstring>
#include <vector>


static const std::size_t bits_per_char = 0x08;    // 8 bits in 1 char(unsigned)

static const unsigned char bit_mask[bits_per_char] = {
                                                       0x01,  //00000001
                                                       0x02,  //00000010
                                                       0x04,  //00000100
                                                       0x08,  //00001000
                                                       0x10,  //00010000
                                                       0x20,  //00100000
                                                       0x40,  //01000000
                                                       0x80   //10000000
                                                     };

//设定过滤器的参数
class bloom_parameters
{
    public:
       bloom_parameters()
       : minimum_size(1),
         maximum_size(std::numeric_limits<unsigned long long>::max()),
         minimum_number_of_hashes(1),
         maximum_number_of_hashes(std::numeric_limits<unsigned int>::max()),
         projected_element_count(10000),
         error_rate(1.0 / projected_element_count),
         random_seed(0xA5A5A5A55A5A5A5AULL)
       {}

       virtual ~bloom_parameters()
       {}

       inline bool operator!()
       {
          return (minimum_size > maximum_size)      ||
                 (minimum_number_of_hashes > maximum_number_of_hashes) ||
                 (minimum_number_of_hashes < 1)     ||
                 (0 == maximum_number_of_hashes)    ||
                 (0 == projected_element_count)     ||
                 (error_rate < 0.0) ||
                 (std::numeric_limits<double>::infinity() == std::abs(error_rate)) ||
                 (0 == random_seed)                 ||
                 (0xFFFFFFFFFFFFFFFFULL == random_seed);
       }

       // Allowable min/max size of the bloom filter in bits
       unsigned long long minimum_size;
       unsigned long long maximum_size;

       // Allowable min/max number of hash functions
       unsigned int minimum_number_of_hashes;
       unsigned int maximum_number_of_hashes;

       //错误率,一般要求小于0.1%
       double error_rate;

       unsigned long long random_seed;

       struct optimal_parameters_t
       {
          optimal_parameters_t()
          : number_of_hashes(0),
            table_size(0)
          {}

          unsigned int number_of_hashes;
          unsigned long long table_size;
       };

       optimal_parameters_t optimal_parameters;

       virtual bool compute_optimal_parameters()
       {
          if (!(*this))
             return false;

          double min_m  = std::numeric_limits<double>::infinity();
          double min_k  = 0.0;
          double k      = 1.0;

          while (k < 1000.0)
          {
             const double numerator   = (- k * projected_element_count);
             const double denominator = std::log(1.0 - std::pow(error_rate, 1.0 / k));

             const double curr_m = numerator / denominator;

             if (curr_m < min_m)
             {
                min_m = curr_m;
                min_k = k;
             }

             k += 1.0;
          }

          optimal_parameters_t& optp = optimal_parameters;

          optp.number_of_hashes = static_cast<unsigned int>(min_k);

          optp.table_size = static_cast<unsigned long long>(min_m);

          optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0);

          if (optp.number_of_hashes < minimum_number_of_hashes)
             optp.number_of_hashes = minimum_number_of_hashes;
          else if (optp.number_of_hashes > maximum_number_of_hashes)
             optp.number_of_hashes = maximum_number_of_hashes;

          if (optp.table_size < minimum_size)
             optp.table_size = minimum_size;
          else if (optp.table_size > maximum_size)
             optp.table_size = maximum_size;

          return true;
       }

};
using namespace std;
class bloom_filter
{
    protected:

       typedef unsigned int bloom_type;
       typedef unsigned char cell_type;
       typedef std::vector<unsigned char> table_type;

    public:

       bloom_filter()
       : salt_count_(0),
         max_capability(0),
         projected_element_count_(0),
         inserted_element_count_ (0),
         random_seed_(0),
         error_rate(0.0)
       {}

       bloom_filter(const bloom_parameters& p)
       : projected_element_count_(p.projected_element_count),
         inserted_element_count_(0),
         random_seed_((p.random_seed * 0xA5A5A5A5) + 1),
         error_rate(p.error_rate)
       {
          //哈希次数
          salt_count_ = p.optimal_parameters.number_of_hashes;
          //元素个数上限
          max_capability = p.optimal_parameters.table_size;

          generate_unique_salt();

          bit_table_.resize(max_capability / bits_per_char, static_cast<unsigned char>(0x00));
       }

       bloom_filter(const bloom_filter& filter)
       {
          this->operator=(filter);
       }

       inline bool operator == (const bloom_filter& f) const
       {
          if (this != &f)
          {
             return
                (salt_count_                         == f.salt_count_                        ) &&
                (max_capability                      == f.max_capability                     ) &&
                (bit_table_.size()                   == f.bit_table_.size()                  ) &&
                (projected_element_count_            == f.projected_element_count_           ) &&
                (inserted_element_count_             == f.inserted_element_count_            ) &&
                (random_seed_                        == f.random_seed_                       ) &&
                (error_rate                          == f.error_rate                         ) &&
                (salt_                               == f.salt_                              ) &&
                (bit_table_                          == f.bit_table_                         ) ;
          }
          else
             return true;
       }

       inline bool operator != (const bloom_filter& f) const
       {
          return !operator==(f);
       }

       inline bloom_filter& operator = (const bloom_filter& f)
       {
          if (this != &f)
          {
             salt_count_ = f.salt_count_;
             max_capability = f.max_capability;
             bit_table_  = f.bit_table_;
             salt_       = f.salt_;
             projected_element_count_ = f.projected_element_count_;
             inserted_element_count_  = f.inserted_element_count_;

             random_seed_ = f.random_seed_;

             error_rate = f.error_rate;
          }

          return *this;
       }

       virtual ~bloom_filter()
       {}

       inline bool operator!() const
       {
          return (0 == max_capability);
       }

       inline void clear()
       {
          std::fill(bit_table_.begin(), bit_table_.end(), static_cast<unsigned char>(0x00));
          inserted_element_count_ = 0;
       }
       struct save_bf_struct
       {
           unsigned long long     salt_count;
           unsigned long long     max_capability;
           unsigned long long     capability;
           unsigned long long     count;
           unsigned long long     random_seed;
           double                 error_rate;
       };
       //把过滤器内容从内存持久化到文件
       inline bool dump_tofile(string filename)
       {
           ofstream fout(filename.c_str(), ios::binary);
           if(fout.is_open()==false)
           {
               std::cout<<"Open file:" << filename <<" fail!"<<std::endl;
               return false;
           }

           save_bf_struct s_bf_info;
           s_bf_info.salt_count = salt_count_;
           s_bf_info.max_capability = max_capability;
           s_bf_info.capability = projected_element_count_;
           s_bf_info.count = inserted_element_count_;
           s_bf_info.random_seed = random_seed_;
           s_bf_info.error_rate = error_rate;

           const int file_check_code = 0xabcd1234;                                //file_check_code 写入文件头以用于简单判断是否正确格式
           fout.write((char*)&file_check_code, sizeof(int));                    //写入文件头标识符
           unsigned long long  length = bit_table_.size();                      //向量表的大小
           fout.write((char*)&length, sizeof(unsigned long long));              //vector<unsigned char>的元素个数
           unsigned long long  totalSize = length * sizeof(unsigned char);      //正文内容的大小(单位是个字节)
           fout.write((char*)&totalSize, sizeof(unsigned long long));           //写入文件内容的长度
           fout.write((char*)&s_bf_info, sizeof(save_bf_struct));               //写入过滤器的结构信息

           //cout << "要写入的文件大小为" << totalSize << "字节,有" << length << "个元素"<< endl;
           cout << "要保存的滤器设计容量为" << s_bf_info.capability << ",hash次数为" << s_bf_info.salt_count << ",错误率为"<< s_bf_info.error_rate;
           cout << ",已写入" << s_bf_info.count << "个元素" << endl;
           std::copy(bit_table_.cbegin(), bit_table_.cend(),std::ostream_iterator<unsigned char>(fout));
           fout.close();
           std::cout << "已把"<< totalSize <<" 字节的过滤器内容保存到文件:"<< filename << std::endl;
           return true;
       }
       //从文件中加载过滤器
       inline bool load_fromfile(string filename)
       {
           const int file_check_code = 0xabcd1234;                                        //file_check_code 放在文件的开头用于简单判断是否正确读取数据
           //std::ios_base::
           ifstream ifile(filename.c_str(), ios::binary);
           if(ifile.is_open()==false)
           {
               std::cout<<"Open file:" << filename <<" fail!"<<std::endl;
               return false;
           }
           int tmp_check_code;
           unsigned long long length, totalSize;
           ifile.read((char*)&tmp_check_code, sizeof(int));
           if (tmp_check_code!=file_check_code){
               cout<<"Unknow format at the begin of file...";
               ifile.close();
               return false;
           }
           save_bf_struct s_bf_info;
           ifile.read((char*)&length, sizeof(unsigned long long));
           ifile.read((char*)&totalSize, sizeof(unsigned long long));
           //读入过滤器的结构信息
           ifile.read((char*)&s_bf_info, sizeof(save_bf_struct));

           cout << "要载入的文件有"<< length << "个元素," << totalSize << "个字节" << endl;
           cout << "要载入的过滤器设计容量为" << s_bf_info.capability << "个,hash次数为" << s_bf_info.salt_count << ",错误率为"<< s_bf_info.error_rate;
           cout << "要,过滤器中已写入" << s_bf_info.count << "个元素" << endl;
           bit_table_.clear();
           bit_table_.resize(1);
           //以下这个写法能正确地从文件读入数据,但读入后再复制的做法实在太愚蠢了,怎么一次读入到类成员 bit_table_呢?
           vector<unsigned char> file_data((istreambuf_iterator<char>(ifile)),istreambuf_iterator<char>());
           bit_table_.swap(file_data);
           file_data.clear();
           ifile.close();

           this->salt_count_ = s_bf_info.salt_count  ;
           this-> max_capability = s_bf_info.max_capability;
           this-> projected_element_count_ = s_bf_info.capability ;
           this -> inserted_element_count_ = s_bf_info.count ;
           this -> random_seed_ = s_bf_info.random_seed  ;
           this -> error_rate  = s_bf_info.error_rate  ;
           std::cout << "已把"<<  bit_table_.size() << "个元素的过滤器从"<<totalSize<<" 字节的文件:"<< filename << "装载到内存。" << std::endl;
           return true;
       }
       //在过滤器插入元素
       inline void insert(const unsigned char* key_begin, const std::size_t& length)
       {
          std::size_t bit_index = 0;
          std::size_t bit       = 0;

          for (std::size_t i = 0; i < salt_.size(); ++i)
          {
             compute_indices(hash_ap(key_begin, length, salt_[i]), bit_index, bit);

             bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
          }

          ++inserted_element_count_;
       }

       template <typename T>
       inline void insert(const T& t)
       {
          // Note: T must be a C++ POD type.
          insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
       }
       //把一个字符型的值插入过滤器
       inline void insert(const std::string& key)
       {
          insert(reinterpret_cast<const unsigned char*>(key.data()),key.size());
       }

       inline void insert(const char* data, const std::size_t& length)
       {
          insert(reinterpret_cast<const unsigned char*>(data),length);
       }

       template <typename InputIterator>
       inline void insert(const InputIterator begin, const InputIterator end)
       {
          InputIterator itr = begin;

          while (end != itr)
          {
             insert(*(itr++));
          }
       }

       inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
       {
          std::size_t bit_index = 0;
          std::size_t bit       = 0;

          for (std::size_t i = 0; i < salt_.size(); ++i)
          {
             compute_indices(hash_ap(key_begin, length, salt_[i]), bit_index, bit);

             if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
             {
                return false;
             }
          }

          return true;
       }

       template <typename T>
       inline bool contains(const T& t) const
       {
          return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
       }

       inline bool contains(const std::string& key) const
       {
          return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
       }

       inline bool contains(const char* data, const std::size_t& length) const
       {
          return contains(reinterpret_cast<const unsigned char*>(data),length);
       }

       template <typename InputIterator>
       inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
       {
          InputIterator itr = begin;

          while (end != itr)
          {
             if (!contains(*itr))
             {
                return itr;
             }

             ++itr;
          }

          return end;
       }

       template <typename InputIterator>
       inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
       {
          InputIterator itr = begin;

          while (end != itr)
          {
             if (contains(*itr))
             {
                return itr;
             }

             ++itr;
          }

          return end;
       }

       inline virtual unsigned long long size() const
       {
          return max_capability;
       }

       inline unsigned long long element_count() const
       {
          return inserted_element_count_;
       }

       inline double effective_fpp() const
       {
          /*
            Note:
            The effective false positive probability is calculated using the
            designated table size and hash function count in conjunction with
            the current number of inserted elements - not the user defined
            predicated/expected number of inserted elements.
          */
          return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
       }

       inline bloom_filter& operator &= (const bloom_filter& f)
       {
          /* intersection */
          if (
               (salt_count_  == f.salt_count_ ) &&
               (max_capability  == f.max_capability ) &&
               (random_seed_ == f.random_seed_)
             )
          {
             for (std::size_t i = 0; i < bit_table_.size(); ++i)
             {
                bit_table_[i] &= f.bit_table_[i];
             }
          }

          return *this;
       }

       inline bloom_filter& operator |= (const bloom_filter& f)
       {
          /* union */
          if (
               (salt_count_  == f.salt_count_ ) &&
               (max_capability  == f.max_capability ) &&
               (random_seed_ == f.random_seed_)
             )
          {
             for (std::size_t i = 0; i < bit_table_.size(); ++i)
             {
                bit_table_[i] |= f.bit_table_[i];
             }
          }

          return *this;
       }

       inline bloom_filter& operator ^= (const bloom_filter& f)
       {
          /* difference */
          if (
               (salt_count_  == f.salt_count_ ) &&
               (max_capability  == f.max_capability ) &&
               (random_seed_ == f.random_seed_)
             )
          {
             for (std::size_t i = 0; i < bit_table_.size(); ++i)
             {
                bit_table_[i] ^= f.bit_table_[i];
             }
          }

          return *this;
       }

       inline const cell_type* table() const
       {
          return bit_table_.data();
       }

       inline std::size_t hash_count()
       {
          return salt_.size();
       }

    protected:

       inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
       {
          bit_index = hash % max_capability;
          bit       = bit_index % bits_per_char;
       }

       void generate_unique_salt()
       {
          const unsigned int predef_salt_count = 128;

          static const bloom_type predef_salt[predef_salt_count] =
                                     {
                                        0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
                                        0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
                                        0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
                                        0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
                                        0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
                                        0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
                                        0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
                                        0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
                                        0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
                                        0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
                                        0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
                                        0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
                                        0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
                                        0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
                                        0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
                                        0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
                                        0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
                                        0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
                                        0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
                                        0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
                                        0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
                                        0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
                                        0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
                                        0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
                                        0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
                                        0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
                                        0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
                                        0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
                                        0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
                                        0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
                                        0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
                                        0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
                                     };

          if (salt_count_ <= predef_salt_count)
          {
             std::copy(predef_salt,
                       predef_salt + salt_count_,
                       std::back_inserter(salt_));

             for (std::size_t i = 0; i < salt_.size(); ++i)
             {
                /*
                   Note:
                   This is done to integrate the user defined random seed,
                   so as to allow for the generation of unique bloom filter
                   instances.
                */
                salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast<bloom_type>(random_seed_);
             }
          }
          else
          {
             std::copy(predef_salt, predef_salt + predef_salt_count, std::back_inserter(salt_));

             srand(static_cast<unsigned int>(random_seed_));

             while (salt_.size() < salt_count_)
             {
                bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());

                if (0 == current_salt)
                   continue;

                if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
                {
                   salt_.push_back(current_salt);
                }
             }
          }
       }

       inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
       {
          const unsigned char* itr = begin;
          unsigned int loop        = 0;

          while (remaining_length >= 8)
          {
             const unsigned int& i1 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);
             const unsigned int& i2 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);

             hash ^= (hash <<  7) ^  i1 * (hash >> 3) ^
                  (~((hash << 11) + (i2 ^ (hash >> 5))));

             remaining_length -= 8;
          }

          if (remaining_length)
          {
             if (remaining_length >= 4)
             {
                const unsigned int& i = *(reinterpret_cast<const unsigned int*>(itr));

                if (loop & 0x01)
                   hash ^=    (hash <<  7) ^  i * (hash >> 3);
                else
                   hash ^= (~((hash << 11) + (i ^ (hash >> 5))));

                ++loop;

                remaining_length -= 4;

                itr += sizeof(unsigned int);
             }

             if (remaining_length >= 2)
             {
                const unsigned short& i = *(reinterpret_cast<const unsigned short*>(itr));

                if (loop & 0x01)
                   hash ^=    (hash <<  7) ^  i * (hash >> 3);
                else
                   hash ^= (~((hash << 11) + (i ^ (hash >> 5))));

                ++loop;

                remaining_length -= 2;

                itr += sizeof(unsigned short);
             }

             if (remaining_length)
             {
                hash += ((*itr) ^ (hash * 0xA5A5A5A5)) + loop;
             }
          }

          return hash;
       }
       std::vector<bloom_type>    salt_;
       std::vector<unsigned char> bit_table_;
       unsigned long long     salt_count_;
       unsigned long long     max_capability;
       unsigned long long     projected_element_count_;
       unsigned long long     inserted_element_count_;
       unsigned long long     random_seed_;
       double                 error_rate;
};

inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b)
{
   bloom_filter result = a;
   result &= b;
   return result;
}

inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b)
{
   bloom_filter result = a;
   result |= b;
   return result;
}

inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b)
{
   bloom_filter result = a;
   result ^= b;
   return result;
}

class compressible_bloom_filter : public bloom_filter
{
public:

   compressible_bloom_filter(const bloom_parameters& p)
   : bloom_filter(p)
   {
      size_list.push_back(max_capability);
   }

   inline unsigned long long size() const
   {
      return size_list.back();
   }

   inline bool compress(const double& percentage)
   {
      if (
           (percentage <    0.0) ||
           (percentage >= 100.0)
         )
      {
         return false;
      }

      unsigned long long original_table_size = size_list.back();
      unsigned long long new_table_size = static_cast<unsigned long long>((size_list.back() * (1.0 - (percentage / 100.0))));

      new_table_size -= new_table_size % bits_per_char;

      if (
           (bits_per_char  >       new_table_size) ||
           (new_table_size >= original_table_size)
         )
      {
         return false;
      }

      error_rate = effective_fpp();

      const unsigned long long new_tbl_raw_size = new_table_size / bits_per_char;

      table_type tmp(new_tbl_raw_size);

      std::copy(bit_table_.begin(), bit_table_.begin() + new_tbl_raw_size, tmp.begin());

      typedef table_type::iterator itr_t;

      itr_t itr     = bit_table_.begin() + (new_table_size      / bits_per_char);
      itr_t end     = bit_table_.begin() + (original_table_size / bits_per_char);
      itr_t itr_tmp = tmp.begin();

      while (end != itr)
      {
         *(itr_tmp++) |= (*itr++);
      }

      std::swap(bit_table_, tmp);

      size_list.push_back(new_table_size);

      return true;
   }

private:

   inline void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
   {
      bit_index = hash;

      for (std::size_t i = 0; i < size_list.size(); ++i)
      {
         bit_index %= size_list[i];
      }

      bit = bit_index % bits_per_char;
   }

   std::vector<unsigned long long> size_list;
};

#endif

以下我调试用的主执行程序代码以及运行结果:

#include <iostream>
#include <bitset>
#include <string>
#include "bloom_filter.hpp"

int main()
{
    string bloom_file = "./tmp_bf.bloom";
    bloom_parameters parameters;
   // How many elements roughly do we expect to insert?
   //要确保系统的内存容量足够
   parameters.projected_element_count = 10000;
   // Maximum tolerable false positive probability? (0,1)
   parameters.error_rate = 0.0001; // 1 in 10000
   // Simple randomizer (optional)
   parameters.random_seed = 0xA5A5A5A5;
   if (!parameters)
   {
      std::cout << "Error - Invalid set of bloom filter parameters!" << std::endl;
      return 1;
   }
   //计算合理的过滤器参数
   parameters.compute_optimal_parameters();

   //Instantiate Bloom Filter
   bloom_filter filter(parameters);

   std::string str_list[] = { "AbC", "iJk", "XYZ" };
   unsigned have_insert = 0;
   // Insert into Bloom Filter
   {
      // Insert some strings
      for (std::size_t i = 0; i < (sizeof(str_list) / sizeof(std::string)); ++i)
      {
         filter.insert(str_list[i]);
          have_insert += 1;
      }

      // Insert some numbers
      for (std::size_t i = 0; i < 10; ++i)
      {
         filter.insert(i);
         have_insert += 1;
      }
   }
   cout << "系统已经把"<<have_insert << "个元素写入过滤器"<< endl;
   filter.dump_tofile(bloom_file);
   filter.load_fromfile(bloom_file);
   for (std::size_t i = 16; i < 21; ++i)
   {
        filter.insert(i);
        have_insert += 1;
   }
   // Query Bloom Filter
   {
      // Query the existence of strings
      for (std::size_t i = 0; i < (sizeof(str_list) / sizeof(std::string)); ++i)
      {
         if (filter.contains(str_list[i]))
         {
            std::cout << "BF 已包含: " << str_list[i] << std::endl;
         }
      }

      // Query the existence of numbers
      for (std::size_t i = 0; i < 25; ++i)
      {
         if (filter.contains(i))
         {
            std::cout << "Bloom filter 已包含: " << i << std::endl;
         }
         else
         {
             std::cout << "Bloom filter 未包含: " << i << std::endl;
         }
      }
      std::string invalid_str_list[] = { "AbCX", "iJkX", "XYZX" };
      // Query the existence of invalid strings
      for (std::size_t i = 0; i < (sizeof(invalid_str_list) / sizeof(std::string)); ++i)
      {
         if (filter.contains(invalid_str_list[i])== false)
         {
            std::cout << "Bloom filter falsely contains: " << invalid_str_list[i] << std::endl;
         }
      }
      // Query the existence of invalid numbers
      for (int i = -1; i > -10; --i)
      {
         if (filter.contains(i) == false)
         {
            std::cout << "Bloom filter  falsely contains: " << i << std::endl;
         }
      }
   }
   filter.dump_tofile(bloom_file);
   return 0;
}

运行的结果:

系统已经把8个元素写入过滤器
要保存的滤器设计容量为10000,hash次数为13,错误率为0.0001,已写入8个元素
已把23967 字节的过滤器内容保存到文件:./tmp_bf.bloom
要载入的文件有23967个元素,23967个字节
要载入的过滤器设计容量为10000个,hash次数为13,错误率为0.0001要,过滤器中已写入8个元素
已把23967个元素的过滤器从23967 字节的文件:./tmp_bf.bloom装载到内存。
BF 已包含: AbC
BF 已包含: iJk
BF 已包含: XYZ
BF 已包含: 0
BF 已包含: 1
BF 已包含: 2
BF 已包含: 3
BF 已包含: 4
BF 未包含: 5
BF 未包含: 6
BF 未包含: 7
BF 已包含: 8
BF 已包含: 9
BF 已包含: 10
BF 已包含: 11
BF 已包含: 12
BF 已包含: 13
BF 已包含: 14
BF 未包含: 15
BF 未包含: 16
BF 未包含: 17
BF 未包含: 18
BF 未包含: 19
Bloom filter falsely contains: AbCX
Bloom filter falsely contains: iJkX
Bloom filter falsely contains: XYZX
Bloom filter  falsely contains: -1
Bloom filter  falsely contains: -2
Bloom filter  falsely contains: -3
Bloom filter  falsely contains: -4
要保存的滤器设计容量为10000,hash次数为13,错误率为0.0001,已写入15个元素
已把23967 字节的过滤器内容保存到文件:./tmp_bf.bloom

进程已结束,退出代码为 0

你可以看看Boost Dynamic Bitset (https://www.boost.org/doc/libs/1_77_0/libs/dynamic_bitset/dynamic_bitset.html)