[NLP自然语言处理]计算熵和KL距离，java实现汉字和英文单词的识别，UTF8变长字符读取

算法任务：

1. 给定一个文件，统计这个文件中所有字符的相对频率（相对频率就是这些字符出现的概率，即该字符出现次数除以字符总个数），并计算该文件的熵。

2. 给定另外一个文件，按上述同样的方法计算字符分布的概率，然后计算两个文件中的字符分布的KL距离。

（熵和KL距离都是NLP自然语言处理中的术语，仅仅涉及一两个公式而已，不影响您对代码的理解，so just try！）

说明：

1. 给定的文件可以是两个中文文件或两个英文文件，也可以是两个中英文混合文件。对于中文，计算字符，对于英文，计算词。

2. 有效字符不包括空格、换行符和标点符号。

3.将中文字符、英文单词、其他非有效字符及其出现次数，分别写入三个文件中。

4.代码用java完成。

文章的重点：

1.如何判断一个字符是汉字，而不是ASCII、标点、日文、阿拉伯文……

2.了解汉字是如何编码的。“UTF8”绝逼是要花你一整个下午时间去弄明白的东西。

3.正则表达式。对于计算机科班出身的人应该不陌生，在此我就不造次了。

代码如下：

  1 import java.io.BufferedReader;
  2 import java.io.FileInputStream;
  3 import java.io.FileReader;
  4 import java.io.FileWriter;
  5 import java.util.HashMap;
  6 import java.util.Iterator;
  7 import java.util.Map.Entry;
  8 import java.util.regex.Matcher;
  9 import java.util.regex.Pattern;
 10 
 11 public class NLPFileUnit {
 12     public HashMap<String, Integer> WordOccurrenceNumber;//The Occurrence Number of the single Chinese character
 13     //or Single English word in the file
 14     public HashMap<String, Float> WordProbability;//The probability of single Chinese character or English word
 15     public HashMap<String, Integer> Punctuations;//The punctuation that screened out from the file 
 16     public float entropy;//熵，本文主要计算单个汉字，或者单个英文单词的熵值
 17     private String filePath;
 18 
 19     //构造函数
 20     public NLPFileUnit(String filePath) throws Exception {
 21         this.filePath = filePath;
 22         WordOccurrenceNumber = createHash(createReader(filePath));
 23         Punctuations = filterPunctuation(WordOccurrenceNumber);
 24         WordProbability = calProbability(WordOccurrenceNumber);
 25         this.entropy = calEntropy(this.WordProbability);
 26 
 27         System.out.println("all punctuations were saved at " + filePath.replace(".", "_punctuation.") + "!");
 28         this.saveFile(Punctuations, filePath.replace(".", "_punctuation."));
 29         System.out.println("all words(En & Ch) were saved at " + filePath.replace(".", "_AllWords.") + "!");
 30         this.saveFile(this.WordOccurrenceNumber, filePath.replace(".", "_AllWords."));
 31     }
 32 
 33     /**
 34      * get the English words form the file to HashMap
 35      * @param hash
 36      * @param path
 37      * @throws Exception
 38      */
 39     public void getEnWords(HashMap<String, Integer> hash, String path) throws Exception {
 40         FileReader fr = new FileReader(path);
 41         BufferedReader br = new BufferedReader(fr);
 42         
 43         //read all lines into content
 44         String content = "";
 45         String line = null;
 46         while((line = br.readLine())!=null){
 47             content+=line;
 48         }
 49         br.close();
 50         
 51         //extract words by regex正则表达式
 52         Pattern enWordsPattern = Pattern.compile("([A-Za-z]+)");
 53         Matcher matcher = enWordsPattern.matcher(content);
 54         while (matcher.find()) {
 55             String word = matcher.group();
 56             if(hash.containsKey(word))
 57                 hash.put(word, 1 + hash.get(word));
 58             else{
 59                 hash.put(word, 1);
 60             }
 61         }
 62     }
 63 
 64     private boolean isPunctuation(String tmp) {
 65         //Punctuation should not be EN words/ Chinese
 66         final String cnregex = "\p{InCJK Unified Ideographs}";
 67         final String enregex = "[A-Za-z]+"; 
 68         return !(tmp.matches(cnregex) || tmp.matches(enregex)) ;
 69     }
 70 
 71     /**
 72      * judge whether the file is encoded by UTF-8 (UCS Transformation Format)format.
 73      * @param fs
 74      * @return
 75      * @throws Exception
 76      */
 77     private boolean isUTF8(FileInputStream fs) throws Exception {
 78         if (fs.read() == 0xEF && fs.read() == 0xBB && fs.read() == 0xBF)//所有utf8编码的文件前三个字节为0xEFBBBF
 79             return true;
 80         return false;
 81     }
 82 
 83     /**
 84      * utf8格式编码的字符，其第一个byte的二进制编码可以判断该字符的长度（汉字一般占三个字节）ASCII占一byte
 85      * @param b
 86      * @return
 87      */
 88     private int getlength(byte b) {
 89         int v = b & 0xff;//byte to 十六进制数
 90         if (v > 0xF0) {
 91             return 4;
 92         }
 93         // 110xxxxx
 94         else if (v > 0xE0) {
 95             return 3;
 96         } else if (v > 0xC0) {
 97             return 2;//该字符长度占2byte
 98         }
 99         return 1;
100     }
101 
102     /**
103      * 通过读取头一个byte来判断该字符占用字节数，并读取该字符，如1110xxxx，表示这个字符占三个byte
104      * @param fs
105      * @return
106      * @throws Exception
107      */
108     private String readUnit(FileInputStream fs) throws Exception {
109         byte b = (byte) fs.read();
110         if (b == -1)
111             return null;
112         int len = getlength(b);
113         byte[] units = new byte[len];
114         units[0] = b;
115         for (int i = 1; i < len; i++) {
116             units[i] = (byte) fs.read();
117         }
118         String ret = new String(units, "UTF-8");
119         return ret;
120     }
121 
122     /**
123      * 把单词，标点，汉字等全都读入hashmap
124      * @param inputStream
125      * @return
126      * @throws Exception
127      */
128     private HashMap<String, Integer> createHash(FileInputStream inputStream)
129             throws Exception {
130         HashMap<String, Integer> hash = new HashMap<String, Integer>();
131         String key = null;
132         while ((key = readUnit(inputStream)) != null) {
133             if (hash.containsKey(key)) {
134                 hash.put(key, 1 + (int) hash.get(key));
135             } else {
136                 hash.put(key, 1);
137             }
138         }
139         inputStream.close();
140         getEnWords(hash, this.filePath);
141         return hash;
142     }
143 
144     /**
145      * FileInputStream读取文件，若文件不是UTF8编码，返回null
146      * @param path
147      * @return
148      * @throws Exception
149      */
150     private FileInputStream createReader(String path) throws Exception {
151         FileInputStream br = new FileInputStream(path);
152         if (!isUTF8(br))
153             return null;
154         return br;
155     }
156 
157     /**
158      * save punctuation filtered form (HashMap)hash into (HashMap)puncs,
159      * @param hash;remove punctuation form (HashMap)hash at the same time
160      * @return
161      */
162     private HashMap<String, Integer> filterPunctuation(
163             HashMap<String, Integer> hash) {
164         HashMap<String, Integer> puncs = new HashMap<String, Integer>();
165         Iterator<?> iterator = hash.entrySet().iterator();
166 
167         while (iterator.hasNext()) {
168             Entry<?, ?> entry = (Entry<?, ?>) iterator.next();
169             String key = entry.getKey().toString();
170             if (isPunctuation(key)) {
171                 puncs.put(key, hash.get(key));
172                 iterator.remove();
173             }
174         }
175         return puncs;
176     }
177 
178     /**
179      * calculate the probability of the word in hash
180      * @param hash
181      * @return
182      */
183     private HashMap<String, Float> calProbability(HashMap<String, Integer> hash) {
184         float count = countWords(hash);
185         HashMap<String, Float> prob = new HashMap<String, Float>();
186         Iterator<?> iterator = hash.entrySet().iterator();
187         while (iterator.hasNext()) {
188             Entry<?, ?> entry = (Entry<?, ?>) iterator.next();
189             String key = entry.getKey().toString();
190             prob.put(key, hash.get(key) / count);
191         }
192         return prob;
193     }
194 
195     /**
196      * save the content in the hash into file.txt
197      * @param hash
198      * @param path
199      * @throws Exception
200      */
201     private void saveFile(HashMap<String, Integer> hash, String path)
202             throws Exception {
203         FileWriter fw = new FileWriter(path);
204         fw.write(hash.toString());
205         fw.close();
206     }
207 
208     /**
209      * calculate the total words in hash
210      * @param hash
211      * @return
212      */
213     private int countWords(HashMap<String, Integer> hash) {
214         int count = 0;
215         for (Entry<String, Integer> entry : hash.entrySet()) {
216             count += entry.getValue();
217         }
218         return count;
219     }
220 
221     /**
222      * calculate the entropy（熵） of the characters
223      * @param hash
224      * @return
225      */
226     private float calEntropy(HashMap<String, Float> hash) {
227         float entropy = 0;
228         Iterator<Entry<String, Float>> iterator = hash.entrySet().iterator();
229         while (iterator.hasNext()) {
230             Entry<String, Float> entry = (Entry<String, Float>) iterator.next();
231             Float prob = entry.getValue();//get the probability of the characters
232             entropy += 0 - (prob * Math.log(prob));//calculate the entropy of the characters
233         }
234         return entropy;
235     }
236 }
237 
238 
239 
240 
241 
242 
243 
244 import java.io.BufferedReader;
245 import java.io.FileNotFoundException;
246 import java.io.IOException;
247 import java.io.InputStreamReader;
248 import java.util.HashMap;
249 import java.util.Iterator;
250 import java.util.Map.Entry;
251 
252 public class NLPWork {
253  
254     /**
255      * calculate the KL distance form file u1 to file u2
256      * @param u1
257      * @param u2
258      * @return
259      */
260     public static float calKL(NLPFileUnit u1, NLPFileUnit u2) {
261         HashMap<String, Float> hash1 = u1.WordProbability;
262         HashMap<String, Float> hash2 = u2.WordProbability;
263         float KLdistance = 0;
264         Iterator<Entry<String, Float>> iterator = hash1.entrySet().iterator();
265         while (iterator.hasNext()) {
266             Entry<String, Float> entry = iterator.next();
267             String key = entry.getKey().toString();
268 
269             if (hash2.containsKey(key)) {
270                 Float value1 = entry.getValue();
271                 Float value2 = hash2.get(key);
272                 KLdistance += value1 * Math.log(value1 / value2);
273             }
274         }
275         return KLdistance;
276     }
277 
278     public static void main(String[] args) throws IOException, Exception {
279         //all punctuation will be saved under working directory
280         System.out.println("Now only UTF8 encoded file is supported!!!");
281         System.out.println("PLS input file 1 path:");
282         BufferedReader cin = new BufferedReader(
283                 new InputStreamReader(System.in));
284         String file1 = cin.readLine();
285         System.out.println("PLS input file 2 path:");
286         String file2 = cin.readLine();
287         NLPFileUnit u1 = null;
288         NLPFileUnit u2 = null;
289         try{
290             u1 = new NLPFileUnit(file1);//NLP:Nature Language Processing
291             u2 = new NLPFileUnit(file2);
292         }
293         catch(FileNotFoundException e){
294             System.out.println("File Not Found!!");
295             e.printStackTrace();
296             return;
297         }
298         float KLdistance = calKL(u1, u2);
299         System.out.println("KLdistance is :" + KLdistance);
300         System.out.println("File 1 Entropy: " + u1.entropy);
301         System.out.println("File 2 Entropy: " + u2.entropy);
302     }
303 }

计算结果：