【C】 布隆过滤器BloomFilter

在实现BloomFilter,首先实现一个位图;

BitMap

在位图中,每个元素为“0”或“1”,表示其对应的元素不存在或者存在。


typedef struct BitMap 
{ 
	size_t* _bits; 
	size_t _range; 
}BitMap; 

void BitMapInit(BitMap* bm, size_t range) ;
void BitMapSet(BitMap* bm, size_t x) ;
void BitMapReset(BitMap* bm, size_t x) ;
int BitMapTest(BitMap* bm, size_t x) ;

void BitMapInit(BitMap* bm, size_t range) {
	assert(bm);
	bm->_bits=(size_t*)malloc(sizeof(size_t)*((range>>5)+1));
	bm->_range=(range>>5)+1;
	memset(bm->_bits,0,sizeof(size_t)*((range>>5)+1));
}

void BitMapSet(BitMap* bm, size_t x) {
	size_t index,num;
	index=x>>5;
	num=x%32;
	bm->_bits[index]|=(1<<num);
}

void BitMapReset(BitMap* bm, size_t x) {
	size_t index,num;
	index=x>>5;
	num=x%32;
	bm->_bits[index]&=(~(1<<num));
}
void BitMapDestroy(BitMap* bm){
 	free(bm->_bits); 
}

// x存在返回0,不存在返回-1 
int BitMapTest(BitMap* bm, size_t x){
	size_t index,num;
	index=x>>5;
	num=x%32;
	if(bm->_bits[index]&(1<<num))
		return 0;
	return -1;
}

写好位图后,可以快速的在海量数据中查找一个数据是否在其中

笔试题: 
给40亿个不重复的无符号整数,没排过序。给一个无符号整数,如何快速判断一个数是否在这40亿个数中。 【腾讯】

#include"BitMap.h"

int main(){
	BitMap bm;
	BitMapInit(&bm,-1);//-1 无符号整型中为最大值
	BitMapSet(&bm, 2);
	BitMapSet(&bm, 33);
	BitMapSet(&bm, 120);
	BitMapSet(&bm, 9);

	printf("1?   %d 
",BitMapTest(&bm,1));
	printf("2?   %d 
",BitMapTest(&bm,2));
	printf("33?  %d 
",BitMapTest(&bm,33));
	printf("120? %d 
",BitMapTest(&bm,120));
	printf("9?   %d 
",BitMapTest(&bm,9));

	BitMapReset(&bm, 33) ;
	BitMapReset(&bm, 120) ;
	printf("33?  %d 
",BitMapTest(&bm,33));
	printf("120? %d 
",BitMapTest(&bm,120));
	system("pause");
	return 0;
}


BloomFilter

如果我们要存入的不是一个整数,而是字符串,那么引起冲突的可能性将大大增加,为了解决这个问题,就需要使用的布隆过滤器,但布隆过滤器并不是完全准确的,他只能只能保证不在是准确的,在则有可能是误判;

为降低存在的误判可能,采取多个HashFunc,这样,只要查找多次找到都为1,那么可以近似认为他就是存在;

反之,只要有一个为0,那么他必定不存在;

#include"BitMap.h"

typedef const char* KeyValue;
typedef size_t(*HASH_FUNC)(const char* str);

typedef struct{
	BitMap _bm;
	HASH_FUNC hashfunc1;
	HASH_FUNC hashfunc2;
	HASH_FUNC hashfunc3;
}BloomFilter;

void BloomFilterInit(BloomFilter* bf,size_t range);
void BloomFilterSet(BloomFilter* bf, KeyValue x) ;
void BloomFilterReset(BloomFilter* bf, KeyValue x) ;
void BloomFilterTest() ;
void BloomFilterDestory(BloomFilter* bf); 

static size_t BKDRHash(KeyValue str)
{
	size_t seed = 131;  // 31 131 1313 13131 131313
	size_t hash = 0;
	while (*str )
	{
		hash = hash * seed + (*str++);
	}
	return (hash & 0x7FFFFFFF);
}

size_t SDBMHash(KeyValue str)  
{  
	size_t ch;
	size_t hash = 0;  
	while (ch = (size_t)*str++)  
	{  
		hash = 65599 * hash + ch;         
		//hash = (size_t)ch + (hash << 6) + (hash << 16) - hash;  
	}  
	return hash;  
}

size_t RSHash(KeyValue str)  
{  
	size_t ch;
	size_t hash = 0;  
	size_t magic = 63689;     
	while (ch = (size_t)*str++)  
	{  
		hash = hash * magic + ch;  
		magic *= 378551;  
	}
	return hash;  
}

void BloomFilterInit(BloomFilter* bf,size_t range){
	assert(bf);
	BitMapInit(&bf->_bm,range);
	bf->hashfunc1=BKDRHash;
	bf->hashfunc2=SDBMHash;
	bf->hashfunc3=RSHash;
}

void BloomFilterSet(BloomFilter* bf, KeyValue x){
	size_t hash1,hash2,hash3;
	hash1=bf->hashfunc1(x)%bf->_bm._range;
	hash2=bf->hashfunc2(x)%bf->_bm._range;
	hash3=bf->hashfunc3(x)%bf->_bm._range;
	BitMapSet(&bf->_bm,hash1);
	BitMapSet(&bf->_bm,hash2);
	BitMapSet(&bf->_bm,hash3);
}

void BloomFilterReset(BitMap* bm, KeyValue x) ;

int BloomFilterFind(BloomFilter* bf,KeyValue x){

	size_t hash1,hash2,hash3;
	hash1=bf->hashfunc1(x)%bf->_bm._range;
	hash2=bf->hashfunc2(x)%bf->_bm._range;
	hash3=bf->hashfunc3(x)%bf->_bm._range;

	if(BitMapTest(&bf->_bm,hash1)==-1)
		return -1;
	if(BitMapTest(&bf->_bm,hash2)==-1)
		return -1;
	if(BitMapTest(&bf->_bm,hash3)==-1)
		return -1;

	return 0;
}

void BloomFilterDestory(BloomFilter* bf){
	assert(bf);
	BitMapDestroy(&bf->_bm);
}


void BloomFilterTest(){
	BloomFilter bf;
	BloomFilterInit(&bf,10000);
	BloomFilterSet(&bf,"sort");
	BloomFilterSet(&bf,"srot");
	BloomFilterSet(&bf,"return");
	BloomFilterSet(&bf,"find");
	BloomFilterSet(&bf,"sort123456789");

	printf("sort?  %d
",BloomFilterFind(&bf,"sort"));
	printf("srot?  %d
",BloomFilterFind(&bf,"srot"));
	printf("return?  %d
",BloomFilterFind(&bf,"return"));
	printf("sort123456789?  %d
",BloomFilterFind(&bf,"sort123456789"));
	printf("find?  %d
",BloomFilterFind(&bf,"find"));
	printf("find1?  %d
",BloomFilterFind(&bf,"find1"));
	printf("asd?  %d
",BloomFilterFind(&bf,"asd"));
	BloomFilterDestory(&bf);
}


原文地址:https://www.cnblogs.com/yongtaochang/p/13615364.html