Java 实现字符串词频统计

package com.gpdi.action;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Counts how often each word occurs in a piece of text.
 *
 * <p>The text is lower-cased, possessive {@code 's} suffixes and a small set of
 * punctuation characters ({@code , - . ' : !}) are removed, and the remainder is
 * split on whitespace. The result is ordered by descending frequency; words with
 * the same frequency are ordered alphabetically (case-insensitively).
 */
public class WordsStatistics {

	/**
	 * Possessive {@code 's} that ends a word. The negative look-ahead keeps an
	 * apostrophe-s that merely starts a quoted word (e.g. {@code 'sun'}) intact,
	 * so only the quotes are stripped from it later.
	 */
	private static final java.util.regex.Pattern POSSESSIVE =
			java.util.regex.Pattern.compile("'s(?![\\p{L}\\p{N}])");

	/** Punctuation characters that are deleted from the text before splitting. */
	private static final java.util.regex.Pattern PUNCTUATION =
			java.util.regex.Pattern.compile("[,\\-.':!]");

	/** Word separator: any run of spaces, tabs or line breaks. */
	private static final java.util.regex.Pattern WHITESPACE =
			java.util.regex.Pattern.compile("\\s+");

	/**
	 * Orders by count (highest first), then by word ignoring case. Unlike a
	 * comparator that never reports equality, this one returns 0 for equal
	 * elements, as the {@link java.util.Comparator} contract requires.
	 */
	private static final java.util.Comparator<WordCount> BY_COUNT_DESC_THEN_WORD =
			new java.util.Comparator<WordCount>() {
		@Override
		public int compare(WordCount o1, WordCount o2) {
			if (o1.getCount() > o2.getCount()) {
				return -1;
			}
			if (o1.getCount() < o2.getCount()) {
				return 1;
			}
			return o1.getWord().compareToIgnoreCase(o2.getWord());
		}
	};

	/** Mutable counter stored as the map value so a hit can be incremented in place. */
	class Obj {
		int count ;
		Obj(int count){
			this.count = count;
		}
	}

	/**
	 * Computes the frequency of every word in the given text.
	 *
	 * @param word the text to analyse; may be {@code null}
	 * @return one {@code WordCount} per distinct word, most frequent first and
	 *         alphabetical among equal counts; an empty list when the text
	 *         contains no words; {@code null} when {@code word} is {@code null}
	 *         (kept for backward compatibility with existing callers)
	 */
	public List<WordCount> statistics(String word) {
		if (word == null) {
			return null;
		}

		Map<String, Obj> counts = new HashMap<String, Obj>();
		for (String token : WHITESPACE.split(normalize(word))) {
			// trim() also drops stray control characters that \s does not cover.
			String simpleWord = token.trim();
			if (simpleWord.isEmpty()) {
				continue;
			}
			Obj cnt = counts.get(simpleWord);
			if (cnt == null) {
				counts.put(simpleWord, new Obj(1));
			} else {
				cnt.count++;
			}
		}

		List<WordCount> rs = new ArrayList<WordCount>(counts.size());
		for (Map.Entry<String, Obj> entry : counts.entrySet()) {
			rs.add(new WordCount(entry.getKey(), entry.getValue().count));
		}

		Collections.sort(rs, BY_COUNT_DESC_THEN_WORD);
		return rs;
	}

	/**
	 * Lower-cases the text and strips possessive suffixes and punctuation.
	 * Whitespace (including line breaks) is left untouched so that it can act
	 * as the word separator afterwards.
	 */
	private static String normalize(String text) {
		// Locale.ROOT keeps the result independent of the JVM's default locale.
		String lowered = text.toLowerCase(java.util.Locale.ROOT);
		String withoutPossessive = POSSESSIVE.matcher(lowered).replaceAll("");
		return PUNCTUATION.matcher(withoutPossessive).replaceAll("");
	}

	/** Small demo: prints each word of a sample sentence with its count. */
	public static void main(String[] args) {
		String word = "Pinterest is might be aa ab aa ab marketer's dream  - ths site is largely used to curate products " ;
		WordsStatistics s = new WordsStatistics();
		List<WordCount> rs = s.statistics(word);
		for (WordCount word1 : rs) {
			System.out.println(word1.getWord()+"*"+word1.getCount());
		}
	}

}
原文地址:https://www.cnblogs.com/treemanfm/p/2989924.html