第五章_散列

散列表ADT只支持二叉查找树所允许的一部分操作，比如插入、删除、查找等。那些需要元素间排序信息的操作将不会得到支持，比如FindMin、FindMax以及以线性时间按排序顺序打印整个表的操作都是不支持的。

散列函数在理想状态下应能将任何两个不同的关键字映射到不同的单元，但这是不可能的，因为关键字的数目实际上是无穷的，而散列表的大小是有限的。所以我们要找的散列函数应该能够在单元之间均匀地分配关键字，而且需要有解决冲突的办法。

通常保证散列表的大小是素数。

解决冲突的方法：分离链接法和开放定址法

分离链接法

将散列到同一个值的所有元素保存在一个链表中

/* Public interface of a separate-chaining hash table. */
#ifndef HASH_SEP_H
#define HASH_SEP_H

/* Type of the keys stored in the table. */
typedef int ElementType;
/* Chain node; the layout is defined by the implementation file. */
struct ListNode;
/* Pointer to one node of a chain. */
typedef struct ListNode *Position;
/* A chain is referred to through a pointer to its first node. */
typedef Position List;

/* Table descriptor; the layout is defined by the implementation file. */
struct HashTbl;
typedef struct HashTbl *HashTable;

/*
 * Creates a table whose bucket count is derived from TableSize by the
 * implementation. Returns NULL when an allocation fails.
 */
HashTable InitializeTable(int TableSize);
/* Returns the node holding key, or NULL when key is not in the table. */
Position Find(ElementType key, HashTable H);
/* Adds key to the table; a key that is already present is left alone. */
void Insert(ElementType key, HashTable H);
/* Writes every stored element to standard output. */
void PrintHashSep(HashTable H);

#endif

#include "HashSep.h"
#include <cstdlib>
#include <iostream>
using namespace std;

/* One node of a bucket's singly linked chain. */
struct ListNode {
	/* Key stored in this node. */
	ElementType Element;
	/* Following node in the chain, or NULL at the end of the chain. */
	Position next;
};

/* Descriptor of a separate-chaining hash table. */
struct HashTbl {
	/* Number of buckets, i.e. number of entries in TheLists. */
	int TableSize;
	/* Array of TableSize chains, one per bucket. */
	List *TheLists;
};

/*
 * Maps key n to a bucket index in the range [0, TableSize).
 *
 * TableSize must be positive; a zero TableSize would divide by zero.
 */
int Hash(int n, int TableSize)
{
	int index = n % TableSize;

	/*
	 * The remainder takes the sign of the dividend, so a negative key
	 * would otherwise yield a negative index and an out-of-bounds access.
	 */
	if (index < 0)
		index += TableSize;
	return index;
}

/*
 * Returns true when n is a prime number, false otherwise.
 *
 * Every value below 2 (zero and negatives included) is reported as not
 * prime.
 */
bool IsPrime(int n)
{
	if (n < 2)
		return false;
	if (n < 4)
		return true;	/* 2 and 3 */
	if ((n & 1) == 0)
		return false;

	/*
	 * Trial division by odd candidates up to sqrt(n). The bound is written
	 * as i <= n / i because i * i can overflow int when n is near INT_MAX.
	 */
	for (int i = 3; i <= n / i; i += 2) {
		if (n % i == 0)
			return false;
	}
	return true;
}

/*
 * Returns the smallest prime that is greater than or equal to n.
 *
 * Any n at or below 2 (zero and negatives included) yields 2, so the result
 * is always a positive value that can be used as a table size.
 */
int NextPrime(int n)
{
	if (n <= 2)
		return 2;

	/* Above 2 only odd numbers can be prime, so start on an odd value. */
	if ((n & 1) == 0)
		++n;

	/* Step through the odd candidates until IsPrime accepts one. */
	while (!IsPrime(n))
		n += 2;
	return n;
}

/*
 * Creates an empty separate-chaining table.
 *
 * The bucket count is NextPrime(TableSize). Each bucket gets a header node
 * whose next pointer starts out NULL; elements are chained behind it.
 *
 * Returns the new table, or NULL after printing "out of space" when any
 * allocation fails. On failure everything allocated so far is released.
 */
HashTable InitializeTable(int TableSize)
{
	HashTable H = (HashTable)malloc(sizeof(struct HashTbl));
	if (H==NULL) {
		cout<<"out of space"<<endl;
		return NULL;
	}

	H->TableSize = NextPrime(TableSize);
	H->TheLists = (List*)malloc(sizeof(List)*(H->TableSize));
	if (H->TheLists==NULL) {
		cout<<"out of space"<<endl;
		free(H);
		return NULL;
	}

	for (int i=0; i<H->TableSize; ++i) {
		H->TheLists[i] = (List)malloc(sizeof(struct ListNode));
		if (H->TheLists[i]==NULL) {
			cout<<"out of space"<<endl;
			/* Undo the headers created so far, then the table itself. */
			for (int j=0; j<i; ++j)
				free(H->TheLists[j]);
			free(H->TheLists);
			free(H);
			return NULL;
		}
		(H->TheLists[i])->next = NULL;
	}
	return H;
}

/*
 * Looks key up in the chain of the bucket it hashes to.
 *
 * Returns the node that stores key, or NULL when the chain does not
 * contain it. The bucket's header node is skipped.
 */
Position Find(ElementType key, HashTable H)
{
	Position node = H->TheLists[Hash(key, H->TableSize)]->next;

	for (; node!=NULL; node = node->next) {
		if (node->Element==key)
			break;
	}
	return node;
}

/*
 * Inserts key at the front of its bucket's chain.
 *
 * Does nothing when key is already stored. Prints "out of space" and leaves
 * the table unchanged when the new node cannot be allocated.
 */
void Insert(ElementType key, HashTable H)
{
	if (Find(key, H)!=NULL)
		return;

	Position fresh = (Position)malloc(sizeof(struct ListNode));
	if (fresh==NULL) {
		cout<<"out of space"<<endl;
		return;
	}

	/* Link the new node directly behind the bucket's header node. */
	List head = H->TheLists[Hash(key, H->TableSize)];
	fresh->Element = key;
	fresh->next = head->next;
	head->next = fresh;
}
			  
/*
 * Prints every stored element followed by a space, bucket by bucket in
 * index order and front to back within each chain. No newline is written.
 */
void PrintHashSep(HashTable H)
{
	for (int bucket=0; bucket<H->TableSize; ++bucket) {
		/* Start behind the header node of the bucket. */
		for (Position node = H->TheLists[bucket]->next; node!=NULL; node = node->next)
			cout<<node->Element<<" ";
	}
}
int main(int argc, char **argv)
{
	HashTable H = InitializeTable(6);
	for (int i=0; i<H->TableSize; ++i)
		Insert(i, H);
	Insert(13, H);
	Insert(16, H);
	Insert(9, H);

    PrintHashSep(H);
	cout<<endl;

	if (Find(9, H)) {
		cout<<Find(9, H)->Element<<endl;
	}
	
	system("pause");
	return 0;
}

装填因子

定义装填因子λ为散列表中元素个数与散列表大小的比值。

开放定址法

不需要用链表解决冲突。在开放定址散列法中，当发现冲突时就尝试选择另外的单元，直到找出空的单元为止。更一般地，单元h0(x), h1(x), h2(x)…相继被试选，其中hi(x) = (Hash(x)+F(i)) mod TableSize，且F(0) = 0。因为所有的数据都要放入表内，所以开放定址法比分离链接法所需要的散列表大。一般对于开放定址散列算法，装填因子λ应该低于0.5。

线性探测法

函数F是i的线性函数，典型的情况是F(i) = i。即逐个探测每个单元以查找一个空单元。容易产生一次聚集效应。

如果一个表可以有多于一半被填满的话，那么线性探测就不是个好办法。

平方探测法

可以消除线性探测中的一次聚集问题。F(i) = i^2。

对于平方探测，如果表的大小是素数，那么当表至少还有一半是空的时候，总能插入一个新元素。一旦表被填满超过一半，或者当表的大小不是素数时甚至在表被填满一半之前，就不能保证一定能找到一个空单元了。

哪怕表有比一半多一个的位置被填满，插入都有可能失败。表的大小是素数也很重要，因为若不是素数，备选单元的个数可能会锐减。

在开放定址散列表中，标准的删除操作不能进行，因为相应的单元可能已经引起过冲突，元素绕过它存到了别处。开放定址散列表需要懒惰删除。

散列到同一位置上的那些元素将探测相同的备选单元，这叫做二次聚集。

/*
 * Public interface of an open-addressing hash table.
 *
 * The include guard deliberately has no leading underscore: identifiers
 * that start with an underscore followed by an uppercase letter are
 * reserved for the implementation.
 */
#ifndef HASH_QUAD_H
#define HASH_QUAD_H

/* Type of the keys stored in the table. */
typedef int ElementType;
/* Index of a cell in the table's cell array. */
typedef unsigned int Index;
typedef Index Position;
/* One table cell; the layout is defined by the implementation file. */
typedef struct HashEntry Cell;
/* Table descriptor; the layout is defined by the implementation file. */
typedef struct HashTbl *HashTable;

/* Creates an empty table; returns NULL when an allocation fails. */
HashTable InitializeTable(int TableSize);
/* Returns the index of the cell where the probe sequence for Key stops. */
Position Find(ElementType Key, HashTable H);
/* Stores Key in the cell chosen by Find unless that cell already holds a live element. */
void Insert(ElementType Key, HashTable H);
#endif

#include "HashQuad.h"
#include <iostream>
using namespace std;
/* State of one cell of the open-addressing table. */
enum KindOfEntry {
	/* The cell holds a live element. */
	Legitimate,
	/* The cell has not been used; every cell starts in this state. */
	Empty,
	/* NOTE(review): presumably marks a lazily deleted cell; nothing visible here assigns it - confirm. */
	Deleted
};

/* One cell of the open-addressing table. */
struct HashEntry {
	/* Stored key; only consulted when Info is not Empty. */
	ElementType Element;
	/* State of the cell. */
	enum KindOfEntry Info;
};

/* Descriptor of an open-addressing hash table. */
struct HashTbl {
	/* Number of cells in TheCells. */
	int TableSize;
	/* Array of TableSize cells. */
	Cell *TheCells;
};

/*
 * Reports whether n is a prime number.
 *
 * Zero, one and all negative values are not prime.
 */
bool IsPrime(int n)
{
	if (n < 2)
		return false;
	if (2==n || 3==n)
		return true;
	if ((n&1)==0)
		return false;

	/*
	 * Only odd divisors up to sqrt(n) need checking. Comparing i with n/i
	 * instead of i*i with n keeps the test free of signed overflow for
	 * values of n close to INT_MAX.
	 */
	for (int i=3; i<=n/i; i+=2) {
		if (n%i==0)
			return false;
	}
	return true;
}

/*
 * Returns the smallest prime that is not below n.
 *
 * Inputs at or below 2, including zero and negative values, give 2, so the
 * result can always serve as a positive table size.
 */
int NextPrime(int n)
{
	if (n<=2)
		return 2;

	/* Every prime above 2 is odd, so move an even start to the next odd value. */
	if ((n&1)==0)
		++n;

	/* Try consecutive odd candidates until one is accepted by IsPrime. */
	while (!IsPrime(n))
		n += 2;
	return n;
}

/*
 * Maps key to a cell index in the range [0, TableSize).
 *
 * TableSize must be positive; a zero TableSize would divide by zero.
 */
int Hash(int key, int TableSize) {
	int index = key%TableSize;

	/*
	 * The remainder has the sign of key. A negative result converted to an
	 * unsigned position would become a huge out-of-range index, so fold it
	 * back into the table.
	 */
	if (index < 0)
		index += TableSize;
	return index;
}

/*
 * Creates an open-addressing table in which every cell is marked Empty.
 *
 * The cell count is NextPrime(TableSize).
 *
 * Returns the new table, or NULL after printing "out of space" when an
 * allocation fails. Nothing stays allocated on failure.
 */
HashTable InitializeTable(int TableSize)
{
	HashTable H = (HashTable)malloc(sizeof(struct HashTbl));
	if (H==NULL) {
		cout<<"out of space"<<endl;
		return NULL;
	}
	H->TableSize = NextPrime(TableSize);

	H->TheCells = (Cell*)malloc(sizeof(Cell)*H->TableSize);
	if (H->TheCells==NULL) {
		cout<<"out of space"<<endl;
		/* Release the descriptor so the failed call does not leak it. */
		free(H);
		return NULL;
	}

	for (int i=0; i<H->TableSize; ++i)
		H->TheCells[i].Info = Empty;

	return H;
}

/*
 * Follows the quadratic probe sequence of Key and returns the index of the
 * first cell that is either Empty or stores Key.
 *
 * A returned cell is not necessarily a live match: the caller has to look
 * at its Info field, as Insert does.
 *
 * The loop only stops at an Empty cell or at a cell whose Element equals
 * Key, so it does not terminate when the probe sequence reaches neither.
 */
Position Find(ElementType Key, HashTable H)
{
	const Position Size = (Position)H->TableSize;
	/* Reduced again so the start index is in range even for a negative hash value. */
	Position CurrentPos = (Position)Hash(Key, H->TableSize) % Size;
	/*
	 * Distance from the previous probe. The i-th collision moves 2*i - 1
	 * cells forward, which puts probe i at offset i*i from the start.
	 */
	Position Step = 1 % Size;

	while (H->TheCells[CurrentPos].Info!=Empty &&
		H->TheCells[CurrentPos].Element!=Key) {
			/*
			 * Wrap with a modulo: once the step grows to TableSize or
			 * more, a single subtraction would leave the index out of
			 * bounds.
			 */
			CurrentPos = (CurrentPos + Step) % Size;
			/* Keeping the step reduced changes no probe position and avoids overflow. */
			Step = (Step + 2) % Size;
	}
	return CurrentPos;
}

/*
 * Stores Key in the cell that Find selects for it.
 *
 * A cell that already holds a live element is left untouched, so inserting
 * a key that is present has no effect.
 */
void Insert(ElementType Key, HashTable H)
{
	Cell *slot = &H->TheCells[Find(Key, H)];

	if (slot->Info==Legitimate)
		return;

	slot->Info = Legitimate;
	slot->Element = Key;
}


/*
 * Placeholder entry point: runs the shell command "pause" and exits with
 * status 0 without touching the hash table.
 */
int main(int argc, char **argv)
{
	/* The command-line arguments are not used. */
	(void)argc;
	(void)argv;

	system("pause");
	return 0;
}

双散列

对于双散列，一种流行的选择是F(i) = i * hash2(X)。我们将第二个散列函数应用到X，并在距离hash2(X)、2hash2(X)等处探测。注意，hash2函数的值一定不能为0。

再散列

如果表填的太满，操作的运行时间将开始消耗过长，且Insert操作可能失败。一种解决方法是建立另外一个大约两倍大的表（而且使用一个相关的新散列函数），扫描整个原始散列表，计算每个（未删除的）元素的新散列值并将其插入到新表中。