单词 统计续(补)

短语统计

我们在处理文本的时候,只需将提取出来的文本数据进行特殊的分割处理,比如只需将英文的“,”、“.”、“?”、“!”以及回车符设为分隔符,并将一些无用单词作为间断,比如

"a",  "it", "the", "and", "this"等。

package analyse_word;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Scanner;
import java.util.Set;
/**
 * Phrase-statistics demo: tokenises an English text file, counts the distinct
 * tokens, and prints each token for which {@link #useless(String)} returns
 * {@code true}.
 *
 * <p>Both input files are read from fixed Windows paths. Files are decoded with
 * the platform default charset, as {@link Scanner#Scanner(File)} does.
 */
public class recognize_sentence {

    /**
     * Location of the stop-word list. The backslash is doubled: a single
     * backslash directly followed by the letter u would start an (invalid)
     * Unicode escape and make the file uncompilable.
     */
    private static final String USELESS_WORDS_PATH = "D:\\useless.txt";

    /** Location of the English text to analyse (backslash doubled for the same reason). */
    private static final String TEXT_PATH = "D:\\Englis_letters.txt";

    /** Message printed when an input file is missing (runtime text, kept in Chinese). */
    private static final String FILE_MISSING_MESSAGE = "文件不存在";

    /**
     * Reads {@code file} line by line, splits every line with
     * {@code delimiterRegex}, and counts how often each resulting token occurs.
     * Empty tokens produced by adjacent delimiters are counted like any other.
     *
     * @param file           the file to read; must exist
     * @param delimiterRegex regular expression passed to {@link String#split(String)}
     * @return a map from token to number of occurrences
     * @throws FileNotFoundException if the file cannot be opened
     */
    private static HashMap<String, Integer> countTokens(File file, String delimiterRegex)
            throws FileNotFoundException {
        HashMap<String, Integer> counts = new HashMap<String, Integer>();
        Scanner in = new Scanner(file);
        try {
            while (in.hasNextLine()) {
                String[] tokens = in.nextLine().split(delimiterRegex);
                for (int k = 0; k < tokens.length; k++) {
                    Integer seen = counts.get(tokens[k]);
                    counts.put(tokens[k], seen == null ? 1 : seen + 1);
                }
            }
        } finally {
            // Always release the file handle; this method runs once per token.
            in.close();
        }
        return counts;
    }

    /**
     * Checks {@code str} against the stop-word list stored at
     * {@code USELESS_WORDS_PATH}.
     *
     * <p>If the file does not exist, a message is printed and {@code false} is
     * returned. Otherwise the space-separated entries are loaded; every entry
     * except the empty string, {@code "a"}, {@code "the"} and a two-space string
     * is tested for whether {@code str} begins with {@code " entry "}.
     *
     * <p>NOTE(review): the method returns {@code true} both when such a prefix
     * is found and when none is found, so the result only reflects whether the
     * stop-word file exists. This mirrors the original logic; the intended
     * fall-through value is unclear and should be confirmed before changing it.
     *
     * @param str the text to test; must not be {@code null}
     * @return {@code false} if the stop-word file is missing, {@code true} otherwise
     * @throws FileNotFoundException if the file exists but cannot be opened
     */
    public static boolean useless(String str) throws FileNotFoundException {
        File file = new File(USELESS_WORDS_PATH);
        if (!file.exists()) {
            System.out.println(FILE_MISSING_MESSAGE);
            return false;
        }

        HashMap<String, Integer> stopWords = countTokens(file, "[ ]");
        for (String word : stopWords.keySet()) {
            boolean skipped = "".equals(word)
                    || "a".equals(word)
                    || "the".equals(word)
                    || "  ".equals(word);
            if (skipped) {
                continue;
            }
            if (str.indexOf(" " + word + " ") == 0) {
                return true;
            }
        }
        return true;
    }

    /**
     * Reads the text at {@code TEXT_PATH}, splits each line on tabs, spaces and
     * the punctuation characters in the delimiter class below, and prints every
     * distinct token for which {@link #useless(String)} returns {@code true},
     * one per line. Prints a message and returns if the text file does not exist.
     *
     * @throws FileNotFoundException if an existing input file cannot be opened
     */
    public static void recognize() throws FileNotFoundException {
        File file = new File(TEXT_PATH);
        if (!file.exists()) {
            System.out.println(FILE_MISSING_MESSAGE);
            return;
        }

        HashMap<String, Integer> counts = countTokens(file, "[\t+;.,“”‘’?! +]");
        for (String word : counts.keySet()) {
            if (useless(word)) {
                System.out.println(word);
            }
        }
    }

    /**
     * Program entry point; runs {@link #recognize()}.
     *
     * @param args ignored
     * @throws FileNotFoundException if an existing input file cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException {
        recognize();
    }
}
原文地址:https://www.cnblogs.com/goubb/p/11031048.html