Ketama hash 算法的 C# 实现

Ketama hash 是分布式解决方案中经常见到的算法，网上已经有很多文章介绍这个算法或者其他的一致性 hash 算法。

前一阵子正好在做一个分布式系统，需要实现该算法。在网上找了找，发现用 C# 实现的都不是很好。

搜索结果中排在最前面、出现最多的那个实现，性能没有优化过，代码可读性也不是很好。

而各个 C# 的 memcached library 中的实现又耦合得太紧，所以自己写了下面的这段代码（参考了这位朋友的实现 http://www.cnblogs.com/daizhj/archive/2010/08/24/1807324.html ，还有 Beit 的实现）。

using System;
using System.Collections.Generic;
using System.Text;
using System.Security.Cryptography;

namespace Clover
{
    /// <summary>
    /// Consistent-hash ring in the Ketama style: every node is placed on a ring of
    /// 32-bit positions at several MD5-derived points, and a key is served by the
    /// node that owns the first ring position greater than or equal to the key's hash.
    /// </summary>
    public sealed class KetamaHash
    {
        // Upper bound on (distinct nodes * copyNodes) accepted by Refresh.
        private const int MaxRingPoints = 320 * 1000;

        // A 16-byte MD5 digest is cut into this many 4-byte ring positions.
        private const int PointsPerDigest = 4;

        // Ring positions in ascending order; Nodes[i] is the node that owns Values[i].
        private int[] Values = null;
        private string[] Nodes = null;

        /// <summary>
        /// Builds the ring for the given nodes. See <see cref="Refresh"/> for the
        /// argument rules and the exceptions that can be thrown.
        /// </summary>
        /// <param name="nodes">Node names; null entries and duplicates are ignored.</param>
        /// <param name="copyNodes">Number of ring positions generated per node.</param>
        public KetamaHash(IEnumerable<string> nodes, int copyNodes = 10000)
        {
            Refresh(nodes, copyNodes);
        }

        /// <summary>
        /// Rebuilds the ring from scratch. This method is not thread-safe: the two
        /// lookup arrays are replaced by two separate assignments, so a concurrent
        /// <see cref="GetNodeByKey"/> may observe them half-swapped. It is expected to
        /// be called rarely, only when the server list changes.
        /// </summary>
        /// <param name="nodes">Node names; null entries and duplicates are ignored.</param>
        /// <param name="copyNodes">
        /// Number of ring positions generated per node. Colliding positions are stored
        /// once, so the final ring may hold slightly fewer entries.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="nodes"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="copyNodes"/> is not positive, or the number of distinct nodes
        /// multiplied by <paramref name="copyNodes"/> exceeds 320 * 1000.
        /// </exception>
        public void Refresh(IEnumerable<string> nodes, int copyNodes = 10000)
        {
            if (nodes == null)
            {
                throw new ArgumentNullException("nodes");
            }
            if (copyNodes <= 0)
            {
                throw new ArgumentOutOfRangeException("copyNodes");
            }

            HashSet<string> distinctNodes = new HashSet<string>();
            foreach (string item in nodes)
            {
                if (item != null)
                {
                    distinctNodes.Add(item);
                }
            }

            // Multiply as long: the int product can overflow and slip past the limit.
            if ((long)distinctNodes.Count * copyNodes > MaxRingPoints)
            {
                throw new ArgumentOutOfRangeException(
                    "copyNodes",
                    "There is too many copyNodes or real nodes! nodes.Count multiply copyNodes must be not greater than 320*1000 ");
            }

            // Process nodes in a fixed order so that a position collision between two
            // nodes is resolved the same way regardless of the order of the input.
            List<string> orderedNodes = new List<string>(distinctNodes);
            orderedNodes.Sort(StringComparer.Ordinal);

            Dictionary<int, string> points = new Dictionary<int, string>();
            using (MD5 md5 = MD5.Create())
            {
                foreach (string node in orderedNodes)
                {
                    for (int produced = 0; produced < copyNodes; produced += PointsPerDigest)
                    {
                        byte[] digest = Hash(md5, node + "_" + (produced / PointsPerDigest));

                        // The last digest is only partially used when copyNodes is not
                        // a multiple of four, so exactly copyNodes positions are produced.
                        int take = Math.Min(PointsPerDigest, copyNodes - produced);
                        for (int h = 0; h < take; h++)
                        {
                            // Each 4-byte slice of the digest is a separate ring position.
                            int position = BitConverter.ToInt32(digest, h * 4);
                            points[position] = node;
                        }
                    }
                }
            }

            int[] newValues = new int[points.Count];
            string[] newNodes = new string[points.Count];
            points.Keys.CopyTo(newValues, 0);
            points.Values.CopyTo(newNodes, 0);

            // Sort the positions and reorder the owning nodes alongside them.
            Array.Sort(newValues, newNodes);

            Values = newValues; // not synchronized with readers
            Nodes = newNodes;   // not synchronized with readers
        }

        /// <summary>
        /// Returns the node responsible for <paramref name="key"/>: the owner of the
        /// first ring position that is greater than or equal to the key's hash, wrapping
        /// around to the first position when the hash lies beyond the last one.
        /// </summary>
        /// <param name="key">Key to locate on the ring.</param>
        /// <returns>The owning node name, or null when the ring has no nodes.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="key"/> is null.</exception>
        public string GetNodeByKey(string key)
        {
            if (key == null)
            {
                throw new ArgumentNullException("key");
            }
            if (Nodes.Length == 0)
            {
                return null;
            }

            int value;
            using (MD5 md5 = MD5.Create())
            {
                // The first four digest bytes are the key's ring position.
                value = BitConverter.ToInt32(Hash(md5, key), 0);
            }

            int index = Array.BinarySearch<int>(Values, value);
            if (index < 0)
            {
                // No exact match: the complement is the index of the next larger position.
                index = ~index;
            }
            if (index >= Nodes.Length)
            {
                // Past the last position: the ring wraps around to its first entry.
                index = 0;
            }
            return Nodes[index];
        }

        #region Private Supported Method
        /// <summary>Hashes the UTF-8 bytes of <paramref name="s"/> with the supplied algorithm.</summary>
        private static byte[] Hash(HashAlgorithm algorithm, string s)
        {
            return algorithm.ComputeHash(Encoding.UTF8.GetBytes(s));
        }
        #endregion
    }
}

测试代码如下：

  /// <summary>
  /// Smoke test: builds a ring of 16 randomly named nodes, routes one million random
  /// keys through it, then prints how uneven the per-node counts are and how long
  /// the lookups took.
  /// </summary>
  static void Main(string[] args)
        {
            // Random GUID strings stand in for real server names.
            List<string> serverNames = new List<string>();
            int created = 0;
            while (created < 16)
            {
                serverNames.Add(Guid.NewGuid().ToString());
                created++;
            }
            KetamaHash ring = new KetamaHash(serverNames, 10000);

            Dictionary<string, int> hitsPerNode = new Dictionary<string, int>();
            Stopwatch timer = Stopwatch.StartNew();
            // One million lookups, each with a fresh random key.
            for (int round = 0; round < 1000 * 1000; round++)
            {
                string owner = ring.GetNodeByKey(Guid.NewGuid().ToString());
                if (owner == null)
                {
                    throw new Exception("没取到数据");
                }

                // A missing entry leaves the counter at zero, so one path covers both cases.
                int seen;
                hitsPerNode.TryGetValue(owner, out seen);
                hitsPerNode[owner] = seen + 1;
            }
            timer.Stop();

            long busiest = hitsPerNode.Values.Max();
            long quietest = hitsPerNode.Values.Min();

            // Relative gap between the most and the least loaded node.
            double spread = (busiest - quietest) / (double)busiest;
            long elapsedMs = timer.ElapsedMilliseconds;

            Console.WriteLine(spread);
            Console.WriteLine(elapsedMs);

            if (spread >= 0.1)
            {
                Console.WriteLine("数据分布不均匀，尝试增加虚拟节点会更均匀点");
            }
            if (elapsedMs >= 12 * 1000)
            {
                Console.WriteLine("跑的太慢....当然 也有可能是你的机器太烂。。。。哈哈～");
            }


        }

虚拟节点越多，数据分配越均匀，不过性能也相对差一点。这里推荐使用 10000，分配会比较均匀，速度也不慢。

经过性能测试，95% 以上的性能消耗在 MD5 算法上。

如果把 MD5 换成其他 hash 算法，性能会好一些。