java实现的Trie树数据结构

最近在学习的时候，经常看到用Trie树数据结构来解决这样的问题。比如：“有一个1G大小的文件，里面每一行是一个词，词的大小不超过16字节，内存大小限制是1M，要求返回频数最高的100个词。”该怎样解决？有一种方案就是使用Trie树加排序实现。

什么是Trie树呢？也就是常说的字典树，网上对此讲得也很多，这里简单补充一下个人理解：它实际上相当于把单词的公共前缀提取出来共用，这样一层一层往上提取，直到每一个节点都是不可再分的最小单元！

比如网上的一个例子：

一组单词，inn, int, at, age, adv, ant, 我们能够得到以下的Trie：

这里的节点上标的是一个单词，实际上，从根节点到每一个节点所走过的路径就是该节点代表的单词！其他不多扯了~~~

Trie树有什么优点呢？

它是一种非常快的单词查询结构。当然，对于单词去重统计也是非常好的选择！比如搜索引擎的关键词联想功能，一种非常好的实现方式就是使用Trie树：比如你输入了in，通过上面的图我们应该提示inn和int，这样就能够轻松实现。另外，对于单词出现频率的统计以及查找公共前缀等问题，它都能够很好地解决！

本文不讲理论，仅仅是给出用java自己实现的Trie树数据结构，其中实现了插入、查找、遍历、单词联想（找公共前缀）等基本功能，其他功能大家可以自己加入~~~~

下面是Trie树类：

package com.algorithms;

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;


/**
 * A trie (prefix tree) over the 26 lowercase Latin letters that counts how
 * often each word was inserted.
 *
 * <p>All input is lower-cased with {@link Locale#ROOT} before use, so the
 * trie is case-insensitive. Only the letters a-z (in either case) are
 * supported; see the individual methods for how other characters are
 * handled. This class is not synchronized.
 */
public class Trie_Tree{

	/** Number of child slots per node: one for each letter 'a'..'z'. */
	private static final int ALPHABET_SIZE=26;

	/**
	 * A single trie node. The letter a node stands for is not stored in the
	 * node; it is implied by the node's index in its parent's child array.
	 */
	private static class Node{
		/** How many times the word ending at this node was inserted (0 if none). */
		private int dumpli_num;
		/** How many inserted words (duplicates included) pass through this node. */
		private int prefix_num;
		/** Children indexed by {@code letter - 'a'}; {@code null} means no such child. */
		private final Node[] childs=new Node[ALPHABET_SIZE];
		/** True when at least one inserted word ends exactly at this node. */
		private boolean isLeaf;
	}

	/** Root of the trie; it represents the empty prefix. */
	private final Node root=new Node();

	/** Creates an empty trie. */
	public Trie_Tree(){
	}

	/**
	 * Inserts a word, or increments its count if it is already present.
	 * The walk is iterative rather than recursive. The empty string is
	 * ignored.
	 *
	 * @param words the word to insert; letters a-z in any case
	 * @throws IllegalArgumentException if {@code words} contains a character
	 *         outside a-z after lower-casing; the trie is left unchanged
	 * @throws NullPointerException if {@code words} is null
	 */
	public void insert(String words){
		// Validate the whole word first so a bad character cannot leave
		// half-updated counters along the path.
		char[] chrs=normalize(words);
		if(chrs==null){
			throw new IllegalArgumentException(
					"Only the letters a-z (any case) are supported: "+words);
		}
		if(chrs.length==0){
			return;
		}

		Node node=root;
		// The root stands for the empty prefix, which every word shares.
		node.prefix_num++;
		for(char c:chrs){
			int index=c-'a';
			if(node.childs[index]==null){
				node.childs[index]=new Node();
			}
			node=node.childs[index];
			node.prefix_num++;
		}
		// node is now the last letter of the word: mark it and count it.
		node.isLeaf=true;
		node.dumpli_num++;
	}

	/**
	 * Returns every stored word together with its insertion count.
	 *
	 * @return a new map from (lower-cased) word to insertion count; empty
	 *         when nothing has been inserted
	 */
	public HashMap<String,Integer> getAllWords(){
		HashMap<String,Integer> map=new HashMap<String,Integer>();
		collect(root,new StringBuilder(),map);
		return map;
	}

	/**
	 * Tells whether a word was inserted as a complete word. A string that is
	 * only a prefix of stored words does not count.
	 *
	 * @param word the word to look up; compared case-insensitively
	 * @return true if {@code word} was inserted; false otherwise, including
	 *         for the empty string and for input containing characters
	 *         outside a-z
	 */
	public boolean isExist(String word){
		Node node=findNode(word);
		return node!=null && node.isLeaf;
	}

	/**
	 * Returns all stored words that start with the given prefix, the prefix
	 * itself included when it is a stored word. This is the "auto-complete"
	 * style lookup.
	 *
	 * @param prefix the prefix to search for; compared case-insensitively
	 * @return a new map from (lower-cased) word to insertion count, or
	 *         {@code null} if no stored word starts with {@code prefix}
	 */
	public HashMap<String,Integer> getWordsForPrefix(String prefix){
		Node start=findNode(prefix);
		if(start==null){
			return null;
		}
		HashMap<String,Integer> map=new HashMap<String,Integer>();
		// Seed the path with the lower-cased prefix so that the keys match
		// the form in which the words are stored.
		collect(start,new StringBuilder(prefix.toLowerCase(Locale.ROOT)),map);
		return map;
	}

	/**
	 * Returns how many inserted words start with the given prefix. Duplicate
	 * insertions are counted each time.
	 *
	 * @param prefix the prefix to count; compared case-insensitively
	 * @return the number of insertions whose word starts with {@code prefix};
	 *         0 if there are none
	 */
	public int countWordsWithPrefix(String prefix){
		Node node=findNode(prefix);
		return node==null ? 0 : node.prefix_num;
	}

	/**
	 * Lower-cases the text and checks that it consists of a-z only.
	 *
	 * @return the lower-cased characters, or {@code null} if any character
	 *         falls outside a-z
	 */
	private static char[] normalize(String text){
		char[] chrs=text.toLowerCase(Locale.ROOT).toCharArray();
		for(char c:chrs){
			if(c<'a' || c>'z'){
				return null;
			}
		}
		return chrs;
	}

	/**
	 * Walks down from the root along the letters of the text.
	 *
	 * @return the node reached, or {@code null} if the path does not exist
	 *         or the text contains characters outside a-z
	 */
	private Node findNode(String text){
		char[] chrs=normalize(text);
		if(chrs==null){
			return null;
		}
		Node node=root;
		for(char c:chrs){
			node=node.childs[c-'a'];
			if(node==null){
				return null;
			}
		}
		return node;
	}

	/**
	 * Pre-order traversal that adds every word found in the subtree to
	 * {@code out}.
	 *
	 * @param node root of the subtree to visit
	 * @param path letters on the way from the trie root to {@code node};
	 *             restored to its incoming content before returning
	 * @param out  receives word-to-count entries
	 */
	private static void collect(Node node,StringBuilder path,Map<String,Integer> out){
		if(node.isLeaf){
			out.put(path.toString(),node.dumpli_num);
		}
		for(int i=0;i<ALPHABET_SIZE;i++){
			Node child=node.childs[i];
			if(child!=null){
				path.append((char)('a'+i));
				collect(child,path,out);
				// Backtrack: drop the letter appended for this child.
				path.setLength(path.length()-1);
			}
		}
	}

}

下面是测试类：

package com.algorithm.test;

import java.util.HashMap;

import com.algorithms.Trie_Tree;

public class Trie_Test {

	 public static void main(String args[])  //Just used for test
	    {
	    Trie_Tree trie = new Trie_Tree();
	    trie.insert("I");
	    trie.insert("Love");
	    trie.insert("China");
	    trie.insert("China");
	    trie.insert("China");
	    trie.insert("China");
	    trie.insert("China");
	    trie.insert("xiaoliang");
	    trie.insert("xiaoliang");
	    trie.insert("man");
	    trie.insert("handsome");
	    trie.insert("love");
	    trie.insert("chinaha");
	    trie.insert("her");
	    trie.insert("know");
	  
	    HashMap<String,Integer> map=trie.getAllWords();
	    
	    for(String key:map.keySet()){
	    	System.out.println(key+" 出现: "+ map.get(key)+"次");
	    }
	    
	    
	    map=trie.getWordsForPrefix("chin");
	    
	    System.out.println("

包括chin（包括本身）前缀的单词及出现次数：");
	    for(String key:map.keySet()){
	    	System.out.println(key+" 出现: "+ map.get(key)+"次");
	    }
	    
	    if(trie.isExist("xiaoming")==false){
	    	System.out.println("

字典树中不存在：xiaoming ");
	    }
	    
	    
	    }
}

执行结果：

love 出现: 2次
chinaha 出现: 1次
her 出现: 1次
handsome 出现: 1次
know 出现: 1次
man 出现: 1次
xiaoliang 出现: 2次
i 出现: 1次
china 出现: 5次

包括chin（包括本身）前缀的单词及出现次数：
chinaha 出现: 1次
china 出现: 5次

字典树中不存在：xiaoming

总结：在实现的时候，主要是想好怎样设计每一个节点的结构。这里针对英文字母总共26个，使用了一个长度为26的节点数组来记录子节点，事实上完全可以用list或其他容器来实现，这样也就能够容纳更复杂的对象了！另外一个方面就是，一个节点的prefix_num属性实际上是指从根到该节点所经过的路径（也就是字串）作为前缀出现的次数，而不是该节点自身的重复数（因为一个节点的child域并非指某个单词，这样prefix_num对该节点本身没有意义）。最后，遍历使用了前序遍历的递归实现，相信对学过一点数据结构的人来说不难。。。