我用vector做一个bloom filter,需要把生成的过滤器持久化到文件,再从文件恢复到内存。但在这个环节遇到了问题。
我写了以下把vetor bit_table_ 内容写入文件,以及从文件重载回内存的两个函数,但装载后的内容与实际内容是不一样的,以致不能通过bloom filter的检测。
#2021-12-06晚间更新:
通过网上查资料,自己琢磨测试,找到了一种方法能勉强解决上述问题。但有一个地方的写法有点蠢:其中反序列化装载函数的实现,先从文件把内容读入一个临时vector,再swap到类成员变量的写法实在是很LOW,特别是对于巨大的,非常耗内存的大型bloom filter,这种写法要占用双倍的内存,非常不可取。但我这种C++新手,一时半会又找不到更好且可行的写法,还请各路大神继续帮助,给出更好的实现方法。
详见下文回复内容。
#----------------------------------
//把过滤器内存持久化到文件
inline bool dump_tofile(string filename)
{
ofstream fout(filename.c_str(), ios::binary);
if(fout.is_open()==false)
{
std::cout<<"Open file:" << filename <<" fail!"<<std::endl;
return false;
}
std::vector<bool>::size_type n = bit_table_.size();
fout.write((const char*)&n, sizeof(std::vector<bool>::size_type));
for(std::vector<bool>::size_type i = 0; i < n;)
{
unsigned char aggr = 0;
for(unsigned char mask = 1; mask > 0 && i < n; ++i, mask <<= 1)
if(bit_table_.at(i))
aggr |= mask;
fout.write((const char*)&aggr, sizeof(unsigned char));
}
fout.close();
std::cout << "已把"<< n/(1024*1024) <<" MB 的过滤器内容保存到文件:"<< filename << std::endl;
return true;
}
//从文件中加载过滤器
inline bool load_fromfile(string filename)
{
ifstream fin(filename.c_str(), ios::binary);
if(fin.is_open()==false)
{
std::cout<<"Open file:" << filename <<" fail!"<<std::endl;
return false;
}
std::vector<bool>::size_type n;
fin.read((char*)&n, sizeof(std::vector<bool>::size_type));
bit_table_.resize(n);
for(std::vector<bool>::size_type i = 0; i < n;)
{
unsigned char aggr;
fin.read((char*)&aggr, sizeof(unsigned char));
for(unsigned char mask = 1; mask > 0 && i < n; ++i, mask <<= 1)
bit_table_.at(i) = aggr & mask;
}
fin.close();
std::cout << "已把"<< n/(1024*1024) <<" MB 的过滤器从文件:"<< filename << "装载到内存。" << std::endl;
}
求大神指导
我记得网上有说不要往vector里面放bool,会遇到坑的
** 通过网上查资料,自己琢磨测试,找到了一种方法能勉强解决上述问题。但有一个地方的写法有点蠢,要在内存里复制。先把我这个能跑通的代码贴出来,恳请各路大神继续帮助优化!
以下是我要实现的,把内存里的bloom filter持久化到文件,以及从文件反序列化的两个函数。其中反序列化装载的函数,先从文件把内容读入一个临时vector,再swap到类成员变量的写法实在是很LOW,特别是对于巨大的,非常耗内存的大型过滤器,这种写法也要占用双倍的内存,非常不可取。但我这种C++新手,一时半会又找不到更好且可行的写法,还请各路大神帮助给出更好的实现方法:**
//把布隆过滤器的内容从内存持久化到文件
inline bool dump_tofile(string filename)
{
ofstream fout(filename.c_str(), ios::binary);
if(fout.is_open()==false)
{
std::cout<<"Open file:" << filename <<" fail!"<<std::endl;
return false;
}
save_bf_struct s_bf_info;
s_bf_info.salt_count = salt_count_;
s_bf_info.max_capability = max_capability;
s_bf_info.capability = projected_element_count_;
s_bf_info.count = inserted_element_count_;
s_bf_info.random_seed = random_seed_;
s_bf_info.error_rate = error_rate;
const int file_check_code = 0xabcd1234; //file_check_code 写入文件的开头用于简单判断是否正确格式
fout.write((char*)&file_check_code, sizeof(int)); //写入文件头标识符
unsigned long long length = bit_table_.size(); //向量表的大小
fout.write((char*)&length, sizeof(unsigned long long)); //vector<unsigned char>的元素个数
unsigned long long totalSize = length * sizeof(unsigned char); //正文内容的大小(单位是个字节)
fout.write((char*)&totalSize, sizeof(unsigned long long)); //写入文件内容的长度
fout.write((char*)&s_bf_info, sizeof(save_bf_struct)); //写入过滤器的结构信息
//cout << "要写入的文件大小为" << totalSize << "字节,有" << length << "个元素"<< endl;
cout << "要保存的滤器设计容量为" << s_bf_info.capability << ",hash次数为" << s_bf_info.salt_count << ",错误率为"<< s_bf_info.error_rate;
cout << ",已写入" << s_bf_info.count << "个元素" << endl;
std::copy(bit_table_.cbegin(), bit_table_.cend(),std::ostream_iterator<unsigned char>(fout));
fout.close();
std::cout << "已把"<< totalSize <<" 字节的过滤器内容保存到文件:"<< filename << std::endl;
return true;
}
//从文件中加载布隆过滤器的内容到内存的向量
inline bool load_fromfile(string filename)
{
const int file_check_code = 0xabcd1234; //file_check_code 放在文件的开头用于简单判断是否正确读取数据
//std::ios_base::
ifstream ifile(filename.c_str(), ios::binary);
if(ifile.is_open()==false)
{
std::cout<<"Open file:" << filename <<" fail!"<<std::endl;
return false;
}
int tmp_check_code;
unsigned long long length, totalSize;
ifile.read((char*)&tmp_check_code, sizeof(int));
if (tmp_check_code!=file_check_code){
cout<<"Unknow format at the begin of file...";
ifile.close();
return false;
}
save_bf_struct s_bf_info;
ifile.read((char*)&length, sizeof(unsigned long long));
ifile.read((char*)&totalSize, sizeof(unsigned long long));
//读入过滤器的结构信息
ifile.read((char*)&s_bf_info, sizeof(save_bf_struct));
cout << "要载入的文件有"<< length << "个元素," << totalSize << "个字节" << endl;
cout << "要载入的过滤器设计容量为" << s_bf_info.capability << "个,hash次数为" << s_bf_info.salt_count << ",错误率为"<< s_bf_info.error_rate;
cout << "要,过滤器中已写入" << s_bf_info.count << "个元素" << endl;
bit_table_.clear();
bit_table_.resize(1);
//以下这个写法能正确地从文件读入数据,但读入后再复制的做法实在太愚蠢了,怎么一次读入到类成员 bit_table_呢?
vector<unsigned char> file_data((istreambuf_iterator<char>(ifile)),istreambuf_iterator<char>());
bit_table_.swap(file_data);
file_data.clear();
ifile.close();
this->salt_count_ = s_bf_info.salt_count ;
this-> max_capability = s_bf_info.max_capability;
this-> projected_element_count_ = s_bf_info.capability ;
this -> inserted_element_count_ = s_bf_info.count ;
this -> random_seed_ = s_bf_info.random_seed ;
this -> error_rate = s_bf_info.error_rate ;
std::cout << "已把"<< bit_table_.size() << "个元素的过滤器从"<<totalSize<<" 字节的文件:"<< filename << "装载到内存。" << std::endl;
return true;
}
以下是bloom filter完整的实现代码 bloom_filter.hpp,这个版本可以支持100GB内存(约800亿条记录),甚至更大的容量,并且可以把过滤器内容从内存持久化到文件保存,需要时再从磁盘文件反序列化到内存中。
#ifndef INCLUDE_BLOOM_FILTER_HPP
#define INCLUDE_BLOOM_FILTER_HPP
#include <iostream>
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <string>
#include <fstream>
#include <istream>
#include <sstream>
#include <cstring>
#include <vector>
static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
static const unsigned char bit_mask[bits_per_char] = {
0x01, //00000001
0x02, //00000010
0x04, //00000100
0x08, //00001000
0x10, //00010000
0x20, //00100000
0x40, //01000000
0x80 //10000000
};
//设定过滤器的参数
class bloom_parameters
{
public:
bloom_parameters()
: minimum_size(1),
maximum_size(std::numeric_limits<unsigned long long>::max()),
minimum_number_of_hashes(1),
maximum_number_of_hashes(std::numeric_limits<unsigned int>::max()),
projected_element_count(10000),
error_rate(1.0 / projected_element_count),
random_seed(0xA5A5A5A55A5A5A5AULL)
{}
virtual ~bloom_parameters()
{}
inline bool operator!()
{
return (minimum_size > maximum_size) ||
(minimum_number_of_hashes > maximum_number_of_hashes) ||
(minimum_number_of_hashes < 1) ||
(0 == maximum_number_of_hashes) ||
(0 == projected_element_count) ||
(error_rate < 0.0) ||
(std::numeric_limits<double>::infinity() == std::abs(error_rate)) ||
(0 == random_seed) ||
(0xFFFFFFFFFFFFFFFFULL == random_seed);
}
// Allowable min/max size of the bloom filter in bits
unsigned long long minimum_size;
unsigned long long maximum_size;
// Allowable min/max number of hash functions
unsigned int minimum_number_of_hashes;
unsigned int maximum_number_of_hashes;
//错误率,一般要求小于0.1%
double error_rate;
unsigned long long random_seed;
struct optimal_parameters_t
{
optimal_parameters_t()
: number_of_hashes(0),
table_size(0)
{}
unsigned int number_of_hashes;
unsigned long long table_size;
};
optimal_parameters_t optimal_parameters;
virtual bool compute_optimal_parameters()
{
if (!(*this))
return false;
double min_m = std::numeric_limits<double>::infinity();
double min_k = 0.0;
double k = 1.0;
while (k < 1000.0)
{
const double numerator = (- k * projected_element_count);
const double denominator = std::log(1.0 - std::pow(error_rate, 1.0 / k));
const double curr_m = numerator / denominator;
if (curr_m < min_m)
{
min_m = curr_m;
min_k = k;
}
k += 1.0;
}
optimal_parameters_t& optp = optimal_parameters;
optp.number_of_hashes = static_cast<unsigned int>(min_k);
optp.table_size = static_cast<unsigned long long>(min_m);
optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0);
if (optp.number_of_hashes < minimum_number_of_hashes)
optp.number_of_hashes = minimum_number_of_hashes;
else if (optp.number_of_hashes > maximum_number_of_hashes)
optp.number_of_hashes = maximum_number_of_hashes;
if (optp.table_size < minimum_size)
optp.table_size = minimum_size;
else if (optp.table_size > maximum_size)
optp.table_size = maximum_size;
return true;
}
};
using namespace std;
class bloom_filter
{
protected:
typedef unsigned int bloom_type;
typedef unsigned char cell_type;
typedef std::vector<unsigned char> table_type;
public:
bloom_filter()
: salt_count_(0),
max_capability(0),
projected_element_count_(0),
inserted_element_count_ (0),
random_seed_(0),
error_rate(0.0)
{}
bloom_filter(const bloom_parameters& p)
: projected_element_count_(p.projected_element_count),
inserted_element_count_(0),
random_seed_((p.random_seed * 0xA5A5A5A5) + 1),
error_rate(p.error_rate)
{
//哈希次数
salt_count_ = p.optimal_parameters.number_of_hashes;
//元素个数上限
max_capability = p.optimal_parameters.table_size;
generate_unique_salt();
bit_table_.resize(max_capability / bits_per_char, static_cast<unsigned char>(0x00));
}
bloom_filter(const bloom_filter& filter)
{
this->operator=(filter);
}
inline bool operator == (const bloom_filter& f) const
{
if (this != &f)
{
return
(salt_count_ == f.salt_count_ ) &&
(max_capability == f.max_capability ) &&
(bit_table_.size() == f.bit_table_.size() ) &&
(projected_element_count_ == f.projected_element_count_ ) &&
(inserted_element_count_ == f.inserted_element_count_ ) &&
(random_seed_ == f.random_seed_ ) &&
(error_rate == f.error_rate ) &&
(salt_ == f.salt_ ) &&
(bit_table_ == f.bit_table_ ) ;
}
else
return true;
}
inline bool operator != (const bloom_filter& f) const
{
return !operator==(f);
}
inline bloom_filter& operator = (const bloom_filter& f)
{
if (this != &f)
{
salt_count_ = f.salt_count_;
max_capability = f.max_capability;
bit_table_ = f.bit_table_;
salt_ = f.salt_;
projected_element_count_ = f.projected_element_count_;
inserted_element_count_ = f.inserted_element_count_;
random_seed_ = f.random_seed_;
error_rate = f.error_rate;
}
return *this;
}
virtual ~bloom_filter()
{}
inline bool operator!() const
{
return (0 == max_capability);
}
inline void clear()
{
std::fill(bit_table_.begin(), bit_table_.end(), static_cast<unsigned char>(0x00));
inserted_element_count_ = 0;
}
struct save_bf_struct
{
unsigned long long salt_count;
unsigned long long max_capability;
unsigned long long capability;
unsigned long long count;
unsigned long long random_seed;
double error_rate;
};
//把过滤器内容从内存持久化到文件
inline bool dump_tofile(string filename)
{
ofstream fout(filename.c_str(), ios::binary);
if(fout.is_open()==false)
{
std::cout<<"Open file:" << filename <<" fail!"<<std::endl;
return false;
}
save_bf_struct s_bf_info;
s_bf_info.salt_count = salt_count_;
s_bf_info.max_capability = max_capability;
s_bf_info.capability = projected_element_count_;
s_bf_info.count = inserted_element_count_;
s_bf_info.random_seed = random_seed_;
s_bf_info.error_rate = error_rate;
const int file_check_code = 0xabcd1234; //file_check_code 写入文件头以用于简单判断是否正确格式
fout.write((char*)&file_check_code, sizeof(int)); //写入文件头标识符
unsigned long long length = bit_table_.size(); //向量表的大小
fout.write((char*)&length, sizeof(unsigned long long)); //vector<unsigned char>的元素个数
unsigned long long totalSize = length * sizeof(unsigned char); //正文内容的大小(单位是个字节)
fout.write((char*)&totalSize, sizeof(unsigned long long)); //写入文件内容的长度
fout.write((char*)&s_bf_info, sizeof(save_bf_struct)); //写入过滤器的结构信息
//cout << "要写入的文件大小为" << totalSize << "字节,有" << length << "个元素"<< endl;
cout << "要保存的滤器设计容量为" << s_bf_info.capability << ",hash次数为" << s_bf_info.salt_count << ",错误率为"<< s_bf_info.error_rate;
cout << ",已写入" << s_bf_info.count << "个元素" << endl;
std::copy(bit_table_.cbegin(), bit_table_.cend(),std::ostream_iterator<unsigned char>(fout));
fout.close();
std::cout << "已把"<< totalSize <<" 字节的过滤器内容保存到文件:"<< filename << std::endl;
return true;
}
//从文件中加载过滤器
inline bool load_fromfile(string filename)
{
const int file_check_code = 0xabcd1234; //file_check_code 放在文件的开头用于简单判断是否正确读取数据
//std::ios_base::
ifstream ifile(filename.c_str(), ios::binary);
if(ifile.is_open()==false)
{
std::cout<<"Open file:" << filename <<" fail!"<<std::endl;
return false;
}
int tmp_check_code;
unsigned long long length, totalSize;
ifile.read((char*)&tmp_check_code, sizeof(int));
if (tmp_check_code!=file_check_code){
cout<<"Unknow format at the begin of file...";
ifile.close();
return false;
}
save_bf_struct s_bf_info;
ifile.read((char*)&length, sizeof(unsigned long long));
ifile.read((char*)&totalSize, sizeof(unsigned long long));
//读入过滤器的结构信息
ifile.read((char*)&s_bf_info, sizeof(save_bf_struct));
cout << "要载入的文件有"<< length << "个元素," << totalSize << "个字节" << endl;
cout << "要载入的过滤器设计容量为" << s_bf_info.capability << "个,hash次数为" << s_bf_info.salt_count << ",错误率为"<< s_bf_info.error_rate;
cout << "要,过滤器中已写入" << s_bf_info.count << "个元素" << endl;
bit_table_.clear();
bit_table_.resize(1);
//以下这个写法能正确地从文件读入数据,但读入后再复制的做法实在太愚蠢了,怎么一次读入到类成员 bit_table_呢?
vector<unsigned char> file_data((istreambuf_iterator<char>(ifile)),istreambuf_iterator<char>());
bit_table_.swap(file_data);
file_data.clear();
ifile.close();
this->salt_count_ = s_bf_info.salt_count ;
this-> max_capability = s_bf_info.max_capability;
this-> projected_element_count_ = s_bf_info.capability ;
this -> inserted_element_count_ = s_bf_info.count ;
this -> random_seed_ = s_bf_info.random_seed ;
this -> error_rate = s_bf_info.error_rate ;
std::cout << "已把"<< bit_table_.size() << "个元素的过滤器从"<<totalSize<<" 字节的文件:"<< filename << "装载到内存。" << std::endl;
return true;
}
//在过滤器插入元素
inline void insert(const unsigned char* key_begin, const std::size_t& length)
{
std::size_t bit_index = 0;
std::size_t bit = 0;
for (std::size_t i = 0; i < salt_.size(); ++i)
{
compute_indices(hash_ap(key_begin, length, salt_[i]), bit_index, bit);
bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
}
++inserted_element_count_;
}
template <typename T>
inline void insert(const T& t)
{
// Note: T must be a C++ POD type.
insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
}
//把一个字符型的值插入过滤器
inline void insert(const std::string& key)
{
insert(reinterpret_cast<const unsigned char*>(key.data()),key.size());
}
inline void insert(const char* data, const std::size_t& length)
{
insert(reinterpret_cast<const unsigned char*>(data),length);
}
template <typename InputIterator>
inline void insert(const InputIterator begin, const InputIterator end)
{
InputIterator itr = begin;
while (end != itr)
{
insert(*(itr++));
}
}
inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
{
std::size_t bit_index = 0;
std::size_t bit = 0;
for (std::size_t i = 0; i < salt_.size(); ++i)
{
compute_indices(hash_ap(key_begin, length, salt_[i]), bit_index, bit);
if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
{
return false;
}
}
return true;
}
template <typename T>
inline bool contains(const T& t) const
{
return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
}
inline bool contains(const std::string& key) const
{
return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
}
inline bool contains(const char* data, const std::size_t& length) const
{
return contains(reinterpret_cast<const unsigned char*>(data),length);
}
template <typename InputIterator>
inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
{
InputIterator itr = begin;
while (end != itr)
{
if (!contains(*itr))
{
return itr;
}
++itr;
}
return end;
}
template <typename InputIterator>
inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
{
InputIterator itr = begin;
while (end != itr)
{
if (contains(*itr))
{
return itr;
}
++itr;
}
return end;
}
inline virtual unsigned long long size() const
{
return max_capability;
}
inline unsigned long long element_count() const
{
return inserted_element_count_;
}
inline double effective_fpp() const
{
/*
Note:
The effective false positive probability is calculated using the
designated table size and hash function count in conjunction with
the current number of inserted elements - not the user defined
predicated/expected number of inserted elements.
*/
return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
}
inline bloom_filter& operator &= (const bloom_filter& f)
{
/* intersection */
if (
(salt_count_ == f.salt_count_ ) &&
(max_capability == f.max_capability ) &&
(random_seed_ == f.random_seed_)
)
{
for (std::size_t i = 0; i < bit_table_.size(); ++i)
{
bit_table_[i] &= f.bit_table_[i];
}
}
return *this;
}
inline bloom_filter& operator |= (const bloom_filter& f)
{
/* union */
if (
(salt_count_ == f.salt_count_ ) &&
(max_capability == f.max_capability ) &&
(random_seed_ == f.random_seed_)
)
{
for (std::size_t i = 0; i < bit_table_.size(); ++i)
{
bit_table_[i] |= f.bit_table_[i];
}
}
return *this;
}
inline bloom_filter& operator ^= (const bloom_filter& f)
{
/* difference */
if (
(salt_count_ == f.salt_count_ ) &&
(max_capability == f.max_capability ) &&
(random_seed_ == f.random_seed_)
)
{
for (std::size_t i = 0; i < bit_table_.size(); ++i)
{
bit_table_[i] ^= f.bit_table_[i];
}
}
return *this;
}
inline const cell_type* table() const
{
return bit_table_.data();
}
inline std::size_t hash_count()
{
return salt_.size();
}
protected:
inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
{
bit_index = hash % max_capability;
bit = bit_index % bits_per_char;
}
void generate_unique_salt()
{
const unsigned int predef_salt_count = 128;
static const bloom_type predef_salt[predef_salt_count] =
{
0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
};
if (salt_count_ <= predef_salt_count)
{
std::copy(predef_salt,
predef_salt + salt_count_,
std::back_inserter(salt_));
for (std::size_t i = 0; i < salt_.size(); ++i)
{
/*
Note:
This is done to integrate the user defined random seed,
so as to allow for the generation of unique bloom filter
instances.
*/
salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast<bloom_type>(random_seed_);
}
}
else
{
std::copy(predef_salt, predef_salt + predef_salt_count, std::back_inserter(salt_));
srand(static_cast<unsigned int>(random_seed_));
while (salt_.size() < salt_count_)
{
bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
if (0 == current_salt)
continue;
if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
{
salt_.push_back(current_salt);
}
}
}
}
inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
{
const unsigned char* itr = begin;
unsigned int loop = 0;
while (remaining_length >= 8)
{
const unsigned int& i1 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);
const unsigned int& i2 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);
hash ^= (hash << 7) ^ i1 * (hash >> 3) ^
(~((hash << 11) + (i2 ^ (hash >> 5))));
remaining_length -= 8;
}
if (remaining_length)
{
if (remaining_length >= 4)
{
const unsigned int& i = *(reinterpret_cast<const unsigned int*>(itr));
if (loop & 0x01)
hash ^= (hash << 7) ^ i * (hash >> 3);
else
hash ^= (~((hash << 11) + (i ^ (hash >> 5))));
++loop;
remaining_length -= 4;
itr += sizeof(unsigned int);
}
if (remaining_length >= 2)
{
const unsigned short& i = *(reinterpret_cast<const unsigned short*>(itr));
if (loop & 0x01)
hash ^= (hash << 7) ^ i * (hash >> 3);
else
hash ^= (~((hash << 11) + (i ^ (hash >> 5))));
++loop;
remaining_length -= 2;
itr += sizeof(unsigned short);
}
if (remaining_length)
{
hash += ((*itr) ^ (hash * 0xA5A5A5A5)) + loop;
}
}
return hash;
}
std::vector<bloom_type> salt_;
std::vector<unsigned char> bit_table_;
unsigned long long salt_count_;
unsigned long long max_capability;
unsigned long long projected_element_count_;
unsigned long long inserted_element_count_;
unsigned long long random_seed_;
double error_rate;
};
inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b)
{
bloom_filter result = a;
result &= b;
return result;
}
inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b)
{
bloom_filter result = a;
result |= b;
return result;
}
inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b)
{
bloom_filter result = a;
result ^= b;
return result;
}
class compressible_bloom_filter : public bloom_filter
{
public:
compressible_bloom_filter(const bloom_parameters& p)
: bloom_filter(p)
{
size_list.push_back(max_capability);
}
inline unsigned long long size() const
{
return size_list.back();
}
inline bool compress(const double& percentage)
{
if (
(percentage < 0.0) ||
(percentage >= 100.0)
)
{
return false;
}
unsigned long long original_table_size = size_list.back();
unsigned long long new_table_size = static_cast<unsigned long long>((size_list.back() * (1.0 - (percentage / 100.0))));
new_table_size -= new_table_size % bits_per_char;
if (
(bits_per_char > new_table_size) ||
(new_table_size >= original_table_size)
)
{
return false;
}
error_rate = effective_fpp();
const unsigned long long new_tbl_raw_size = new_table_size / bits_per_char;
table_type tmp(new_tbl_raw_size);
std::copy(bit_table_.begin(), bit_table_.begin() + new_tbl_raw_size, tmp.begin());
typedef table_type::iterator itr_t;
itr_t itr = bit_table_.begin() + (new_table_size / bits_per_char);
itr_t end = bit_table_.begin() + (original_table_size / bits_per_char);
itr_t itr_tmp = tmp.begin();
while (end != itr)
{
*(itr_tmp++) |= (*itr++);
}
std::swap(bit_table_, tmp);
size_list.push_back(new_table_size);
return true;
}
private:
inline void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
{
bit_index = hash;
for (std::size_t i = 0; i < size_list.size(); ++i)
{
bit_index %= size_list[i];
}
bit = bit_index % bits_per_char;
}
std::vector<unsigned long long> size_list;
};
#endif
以下我调试用的主执行程序代码以及运行结果:
#include <iostream>
#include <bitset>
#include <string>
#include "bloom_filter.hpp"
int main()
{
string bloom_file = "./tmp_bf.bloom";
bloom_parameters parameters;
// How many elements roughly do we expect to insert?
//要确保系统的内存容量足够
parameters.projected_element_count = 10000;
// Maximum tolerable false positive probability? (0,1)
parameters.error_rate = 0.0001; // 1 in 10000
// Simple randomizer (optional)
parameters.random_seed = 0xA5A5A5A5;
if (!parameters)
{
std::cout << "Error - Invalid set of bloom filter parameters!" << std::endl;
return 1;
}
//计算合理的过滤器参数
parameters.compute_optimal_parameters();
//Instantiate Bloom Filter
bloom_filter filter(parameters);
std::string str_list[] = { "AbC", "iJk", "XYZ" };
unsigned have_insert = 0;
// Insert into Bloom Filter
{
// Insert some strings
for (std::size_t i = 0; i < (sizeof(str_list) / sizeof(std::string)); ++i)
{
filter.insert(str_list[i]);
have_insert += 1;
}
// Insert some numbers
for (std::size_t i = 0; i < 10; ++i)
{
filter.insert(i);
have_insert += 1;
}
}
cout << "系统已经把"<<have_insert << "个元素写入过滤器"<< endl;
filter.dump_tofile(bloom_file);
filter.load_fromfile(bloom_file);
for (std::size_t i = 16; i < 21; ++i)
{
filter.insert(i);
have_insert += 1;
}
// Query Bloom Filter
{
// Query the existence of strings
for (std::size_t i = 0; i < (sizeof(str_list) / sizeof(std::string)); ++i)
{
if (filter.contains(str_list[i]))
{
std::cout << "BF 已包含: " << str_list[i] << std::endl;
}
}
// Query the existence of numbers
for (std::size_t i = 0; i < 25; ++i)
{
if (filter.contains(i))
{
std::cout << "Bloom filter 已包含: " << i << std::endl;
}
else
{
std::cout << "Bloom filter 未包含: " << i << std::endl;
}
}
std::string invalid_str_list[] = { "AbCX", "iJkX", "XYZX" };
// Query the existence of invalid strings
for (std::size_t i = 0; i < (sizeof(invalid_str_list) / sizeof(std::string)); ++i)
{
if (filter.contains(invalid_str_list[i])== false)
{
std::cout << "Bloom filter falsely contains: " << invalid_str_list[i] << std::endl;
}
}
// Query the existence of invalid numbers
for (int i = -1; i > -10; --i)
{
if (filter.contains(i) == false)
{
std::cout << "Bloom filter falsely contains: " << i << std::endl;
}
}
}
filter.dump_tofile(bloom_file);
return 0;
}
运行的结果:
系统已经把8个元素写入过滤器
要保存的滤器设计容量为10000,hash次数为13,错误率为0.0001,已写入8个元素
已把23967 字节的过滤器内容保存到文件:./tmp_bf.bloom
要载入的文件有23967个元素,23967个字节
要载入的过滤器设计容量为10000个,hash次数为13,错误率为0.0001要,过滤器中已写入8个元素
已把23967个元素的过滤器从23967 字节的文件:./tmp_bf.bloom装载到内存。
BF 已包含: AbC
BF 已包含: iJk
BF 已包含: XYZ
BF 已包含: 0
BF 已包含: 1
BF 已包含: 2
BF 已包含: 3
BF 已包含: 4
BF 未包含: 5
BF 未包含: 6
BF 未包含: 7
BF 已包含: 8
BF 已包含: 9
BF 已包含: 10
BF 已包含: 11
BF 已包含: 12
BF 已包含: 13
BF 已包含: 14
BF 未包含: 15
BF 未包含: 16
BF 未包含: 17
BF 未包含: 18
BF 未包含: 19
Bloom filter falsely contains: AbCX
Bloom filter falsely contains: iJkX
Bloom filter falsely contains: XYZX
Bloom filter falsely contains: -1
Bloom filter falsely contains: -2
Bloom filter falsely contains: -3
Bloom filter falsely contains: -4
要保存的滤器设计容量为10000,hash次数为13,错误率为0.0001,已写入15个元素
已把23967 字节的过滤器内容保存到文件:./tmp_bf.bloom
进程已结束,退出代码为 0
你可以看看Boost Dynamic Bitset (https://www.boost.org/doc/libs/1_77_0/libs/dynamic_bitset/dynamic_bitset.html)