利用偏移量快速定位数据内容

本项目需要把数据存档为二进制文件,载入时只载入文件索引,再通过索引快速定位到数据内容,从而实现最小存储、最快速查找。下面的代码是初步实现;在此基础上扩展,还可以实现搜索引擎的关键字匹配度、权重、分词等效果,这是后话,这里先把最基础的“通过偏移量快速查找”分享一下。

/// <summary>
    /// One entry of the index file: maps a record ID to the location of that
    /// record's bytes inside the data file.
    /// </summary>
    struct Token
    {
        /// <summary>
        /// Record key; used as the dictionary key when the index is loaded.
        /// </summary>
        public string ID;

        /// <summary>
        /// Byte offset of the record, measured from the start of the stored content.
        /// </summary>
        public int Offset;

        /// <summary>
        /// Length of the record in bytes (UTF-8 encoded).
        /// </summary>
        public int Length;

    }

  

/// <summary>
    /// Demonstrates offset-based lookup: a small index file maps each record ID to
    /// the byte offset and byte length of that record inside a separate data file,
    /// so a record can be fetched with a single seek instead of scanning the data.
    /// </summary>
    /// <remarks>
    /// Both files are written with <see cref="BinaryWriter.Write(string)"/>, which
    /// stores every string as a 7-bit-encoded byte-length prefix followed by the
    /// UTF-8 bytes. The index file holds one such string per record
    /// ("id,offset,length"); the data file holds a single string containing all
    /// records back to back. Offsets are relative to the first payload byte of the
    /// data file, i.e. they do not include the length prefix.
    /// </remarks>
    class Search
    {
        private const string IndexFileName = "index.txt";
        private const string DataFileName = "data.txt";

        // Number of demo records produced by BuildFile.
        private const int RecordCount = 15;

        // A 7-bit-encoded Int32 never occupies more than 5 bytes.
        private const int MaxLengthPrefixBytes = 5;

        // Index entries are machine-readable, so they must not depend on the current culture.
        private static readonly IFormatProvider Invariant = System.Globalization.CultureInfo.InvariantCulture;

        /// <summary>
        /// Generates the demo index file and data file, replacing any existing ones.
        /// </summary>
        /// <remarks>
        /// Each record is "{id}|test programe{random number}", where the ID is the
        /// record number followed by today's date as yyyyMMdd.
        /// </remarks>
        public void BuildFile()
        {
            Random random = new Random();

            // Local buffer: content left over from an earlier call must never leak
            // into the data file, otherwise the recorded offsets would be wrong.
            StringBuilder content = new StringBuilder();

            // Evaluated once so every ID of one run carries the same date.
            string datePart = DateTime.Today.ToString("yyyyMMdd", Invariant);

            // FileMode.Create truncates an existing file, so no explicit delete is needed.
            using (FileStream indexFile = new FileStream(IndexFileName, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
            using (BinaryWriter indexWriter = new BinaryWriter(indexFile, Encoding.UTF8))
            {
                // Running byte position of the next record inside the data payload.
                int offset = 0;

                for (int i = 0; i < RecordCount; i++)
                {
                    string id = i.ToString(Invariant) + datePart;
                    string record = id + "|test programe" + random.Next(10, 305000).ToString(Invariant);

                    // Offsets and lengths are counted in UTF-8 bytes, not characters,
                    // because the reader seeks in the raw file.
                    int length = Encoding.UTF8.GetByteCount(record);

                    indexWriter.Write(id + "," + offset.ToString(Invariant) + "," + length.ToString(Invariant));

                    content.Append(record);
                    offset += length;
                }
            }

            using (FileStream dataFile = new FileStream(DataFileName, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
            using (BinaryWriter dataWriter = new BinaryWriter(dataFile, Encoding.UTF8))
            {
                dataWriter.Write(content.ToString());
            }
        }

        /// <summary>
        /// Loads the whole index file into memory.
        /// </summary>
        /// <returns>A dictionary that maps each record ID to its index entry.</returns>
        /// <exception cref="FormatException">An offset or length field is not a valid integer.</exception>
        /// <exception cref="ArgumentException">The index contains the same ID twice.</exception>
        public Dictionary<string, Token> GetTokenDic()
        {
            Dictionary<string, Token> tokens = new Dictionary<string, Token>();

            using (FileStream indexFile = new FileStream(IndexFileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            using (BinaryReader reader = new BinaryReader(indexFile, Encoding.UTF8))
            {
                // Read until the end of the file instead of assuming a fixed entry count.
                while (indexFile.Position < indexFile.Length)
                {
                    string entry = reader.ReadString();
                    string[] parts = entry.Split(',');

                    // Skip entries that do not have the "id,offset,length" shape.
                    if (parts.Length < 3)
                    {
                        continue;
                    }

                    Token token = new Token();
                    token.ID = parts[0];
                    token.Offset = int.Parse(parts[1], Invariant);
                    token.Length = int.Parse(parts[2], Invariant);

                    tokens.Add(token.ID, token);
                }
            }

            return tokens;
        }

        /// <summary>
        /// Looks up <paramref name="key"/> in the index, seeks straight to the matching
        /// record in the data file and writes the record to the console.
        /// </summary>
        /// <param name="key">ID of the record to print.</param>
        /// <remarks>
        /// When the key is not present in the index an empty line is written.
        /// </remarks>
        public void ReadFile(string key)
        {
            Dictionary<string, Token> tokens = GetTokenDic();

            Token token;
            if (!tokens.TryGetValue(key, out token))
            {
                // Unknown key: nothing to read, so the data file is not touched.
                Console.WriteLine(string.Empty);
                return;
            }

            using (FileStream dataFile = new FileStream(DataFileName, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (BinaryReader reader = new BinaryReader(dataFile, Encoding.UTF8))
            {
                // The payload starts after the length prefix, whose size depends on
                // the payload length, so measure it instead of assuming 2 bytes.
                long payloadStart = GetPayloadStart(dataFile);
                dataFile.Seek(payloadStart + token.Offset, SeekOrigin.Begin);

                // ReadBytes keeps reading until the requested count or end of file,
                // unlike a single Stream.Read call which may return early.
                byte[] payload = reader.ReadBytes(token.Length);
                Console.WriteLine(Encoding.UTF8.GetString(payload));
            }
        }

        /// <summary>
        /// Returns the position of the first payload byte of a string that was
        /// written by <see cref="BinaryWriter.Write(string)"/> at the start of
        /// <paramref name="stream"/>.
        /// </summary>
        private static long GetPayloadStart(Stream stream)
        {
            stream.Seek(0, SeekOrigin.Begin);

            for (int i = 0; i < MaxLengthPrefixBytes; i++)
            {
                int value = stream.ReadByte();

                // A cleared high bit marks the last prefix byte; -1 means end of file.
                if (value < 0 || (value & 0x80) == 0)
                {
                    break;
                }
            }

            return stream.Position;
        }
    }

  

static void Main(string[] args)
        {
            // index.txt and data.txt must already exist; they are produced by a
            // prior call to Search.BuildFile(). The key below has the
            // "<record number><yyyyMMdd>" shape that BuildFile generates
            // (record 14, built on 2013-08-25).
            new Search().ReadFile("1420130825");

            // Keep the console window open until a key is pressed.
            Console.Read();
        }

  

原文地址:https://www.cnblogs.com/kevinke/p/3281627.html