EPANET中的哈希文件—

EPANET中的哈希文件——hash.c

/*-----------------------------------------------------------------------------
**   hash.c
**
**   Implementation of a simple Hash Table for string storage & retrieval
**
**   Written by L. Rossman
**   Last Updated on 6/19/03
**
**   The hash table data structure (HTable) is defined in "hash.h".
**   Interface Functions:
**      HTcreate() - creates a hash table
**      HTinsert() - inserts a string & its index value into a hash table
**      HTfind()   - retrieves the index value of a string from a table
**      HTfree()   - frees a hash table
**
*********************************************************************
**   NOTE: This is a modified version of the original HASH.C module.
*********************************************************************
*/

/*-----------------------------------------------------------------------------
**   关于哈希表这一数据结构的介绍，可以参考博文：
**   http://www.cnblogs.com/KingOfFreedom/archive/2012/12/11/2812505.html
**
**   这里采用的哈希函数使用Fletcher校验和（Fletcher's checksum）算法计算字符串的2字节哈希值
**   这里的哈希冲突解决方法是采用上述博文中的第3种方法“链地址法”
**   将所有关键字为同义词的记录存储在同一线性链表中。该线性链表的定义在hash.h中的HTentry
*/

#include <malloc.h>
#include <string.h>
#include "hash.h"

/*
**--------------------------------------------------------------
**  Input:   str = NUL-terminated "ID label" string to be hashed
**  Output:  returns a bucket index in the range [0, HTMAXSIZE)
**  Purpose: uses Fletcher's checksum to compute a 2-byte hash of
**           the string, then reduces it modulo HTMAXSIZE
**--------------------------------------------------------------
*/
unsigned int hash(char *str)
{
    unsigned int sum1= 0, check1;
    unsigned long sum2= 0L;

    /* Walk the string up to (not including) its terminating NUL.
    ** NOTE(review): *str is a plain char, so where char is signed, bytes
    ** >= 0x80 are added as negative values (wrapping the unsigned sum);
    ** the final result is still reduced modulo HTMAXSIZE below. */
    while ( '\0' != *str )
    {
        sum1 += (*str);
        str++;
        if ( 255 <= sum1 ) sum1 -= 255;
        sum2 += sum1;
    }

    /* Derive the two check bytes from the running sums. */
    check1= sum2;
    check1 %= 255;
    check1= 255 - (sum1+check1) % 255;
    sum1= 255 - (sum1+check1) % 255;

    /* Pack the two bytes into 16 bits and map onto the table size. */
    return( ( ( check1 << 8 ) | sum1 ) % HTMAXSIZE);
}

/*
**--------------------------------------------------------------
**  Input:   none
**  Output:  returns a pointer to the new hash table,
**           or NULL if the allocation failed
**  Purpose: allocates a hash table with HTMAXSIZE buckets and
**           marks every bucket as empty
**--------------------------------------------------------------
*/
HTtable *HTcreate()
{
        HTtable *table = calloc(HTMAXSIZE, sizeof *table);
        int slot;

        if (table == NULL)
            return(NULL);

        /* calloc zero-fills the block; the explicit NULL stores from the
        ** original implementation are retained unchanged. */
        for (slot = 0; slot < HTMAXSIZE; slot++)
            table[slot] = NULL;

        return(table);
}

/*
**--------------------------------------------------------------
**  Input:   ht   = hash table
**           key  = "ID label" string
**           data = index value to associate with the key
**  Output:  returns 1 if the entry was inserted, 0 otherwise
**  Purpose: inserts a string and its index value into the hash table
**
**  Note:    the key pointer is stored as given (the string is not
**           copied)
**--------------------------------------------------------------
*/
int     HTinsert(HTtable *ht, char *key, int data)
{
        struct HTentry *node;
        unsigned int bucket = hash(key);

        if ( bucket < HTMAXSIZE )
        {
            node = (struct HTentry *) malloc(sizeof(struct HTentry));
            if (node != NULL)   /* allocation succeeded */
            {
                node->key  = key;
                node->data = data;

                /* Push onto the front of the bucket's chain: the existing
                ** chain hangs off the new node, which becomes the head. */
                node->next = ht[bucket];
                ht[bucket] = node;
                return(1);
            }
        }
        return(0);
}

/*
**--------------------------------------------------------------
**  Input:   ht  = hash table
**           key = "ID label" string to look up
**  Output:  returns the index value stored with the key,
**           or NOTFOUND if the key is not in the table
**  Purpose: retrieves the index value associated with a string
**--------------------------------------------------------------
*/
int     HTfind(HTtable *ht, char *key)
{
        struct HTentry *node;
        unsigned int bucket = hash(key);

        if ( bucket >= HTMAXSIZE ) return(NOTFOUND);

        /* Collision handling by separate chaining: scan this bucket's list. */
        for (node = ht[bucket]; node != NULL; node = node->next)
        {
            if ( !strcmp(node->key, key) ) return(node->data);
        }
        return(NOTFOUND);
}

/*
**--------------------------------------------------------------
**  Input:   ht  = hash table
**           key = "ID label" string to look up
**  Output:  returns the key pointer stored in the table for a
**           matching entry, or NULL if no entry matches
**  Purpose: checks whether a given string is present in the table
**--------------------------------------------------------------
*/
char    *HTfindKey(HTtable *ht, char *key)
{
        struct HTentry *node;
        unsigned int bucket = hash(key);

        if ( bucket >= HTMAXSIZE ) return(NULL);

        /* Scan the bucket's chain for an entry whose key compares equal. */
        for (node = ht[bucket]; node != NULL; node = node->next)
        {
            if ( !strcmp(node->key, key) ) return(node->key);
        }
        return(NULL);
}

/*
**--------------------------------------------------------------
**  Input:   ht = hash table (may be NULL)
**  Output:  none
**  Purpose: releases the memory used by a hash table
**
**  Note:    only the chain entries and the bucket array are freed;
**           the key strings referenced by the entries are not.
**--------------------------------------------------------------
*/
void    HTfree(HTtable *ht)
{
        struct HTentry *entry,
                       *nextentry;
        int i;

        /* HTcreate() returns NULL when allocation fails; accept that value
        ** here, like free(NULL), instead of dereferencing it. */
        if (ht == NULL) return;

        for (i=0; i<HTMAXSIZE; i++)
        {
            entry = ht[i];
            while (entry != NULL)
            {
                /* Save the successor before the current entry is released. */
                nextentry = entry->next;
                free(entry);
                entry = nextentry;
            }
        }
        free(ht);
}

--------------------------------------------------------

哈希表是一种以内存空间换取时间效率的数据结构，理想情况下（不存在冲突）哈希查找的时间复杂度是常数O(1)。但实际情况是，即便开辟了足够多的连续内存空间，如果哈希函数选取不当，还是会发生冲突。EPANET中的哈希函数使用Fletcher's checksum算法来处理32位长的字符串以获得散列值，这个哈希算法的优劣本人还无法评断。但是注意，EPANET中的冲突处理采用的是链地址法，而EPANET默认提供的哈希地址个数是HTMAXSIZE个，在hash.h中是这样定义的：#define HTMAXSIZE 1999。如果我们的模型有5万个左右的节点与管段，那么这约2000个地址空间中，平均每个地址会挂有一个长度约为25的线性链表。而哈希函数的散列效果不一定这么理想，可能某个地址挂了一个长度为上百甚至上千的线性链表，那么查询效率就低下了。所以，如果运行EPANET的机器有足够多的内存，比如8G以上，那么就可以试着修改hash.h中的#define HTMAXSIZE 1999，将1999这个值改得大些，运行效率也就可以提高了。需要注意的是，hash()的结果在取模之前只有2字节（最大为65535），因此把HTMAXSIZE调到65536以上不会再带来额外的收益。