A Java implementation of TF-IDF

One remaining issue: the word-segmentation library this code depends on still has problems.

This implementation follows this article:

http://www.cnblogs.com/ywl925/archive/2013/08/26/3275878.html

The code:

(The complete project lives on my old Linux machine.)

The following class does two things: (1) list all file names under a directory, recursively; (2) read the contents of a given file.

package com.bobo.paper.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

public class FileUtil {
 
    public static ArrayList<String> FileList = new ArrayList<String>(); // all files found; note this static list is shared across calls
/**
 * List all files under a directory and its subdirectories.
 * @param filepath directory path
 * @return the list of absolute file names under that path and its subpaths
 * @throws FileNotFoundException
 * @throws IOException
 */
    public static List<String> readDirs(String filepath) throws FileNotFoundException, IOException
    {
        try
        {
            File file = new File(filepath);
            if(!file.isDirectory())
            {
                System.out.println("输入的不是目錄名称;");
                System.out.println("filepath:" + file.getAbsolutePath());
            }
            else
            {
                String[] flist = file.list();
                for(int i = 0; i < flist.length; i++)
                {
                    File newfile = new File(filepath + "/" + flist[i]);
                    if(!newfile.isDirectory())
                    {
                        FileList.add(newfile.getAbsolutePath());
                    }
                    else if(newfile.isDirectory()) // if it is a directory, recurse
                    {
                        readDirs(filepath + "/" + flist[i]);
                    }                    
                }
            }
        }catch(FileNotFoundException e)
        {
            System.out.println(e.getMessage());
        }
        return FileList;
    }
    /**
     * Read a file's contents and return them as a single string.
     * @param file the name of the file to read
     * @return the file contents as one string, with lines joined by \n
     * @throws FileNotFoundException
     * @throws IOException
     */
    public static String readFile(String file) throws FileNotFoundException, IOException
    {
        StringBuffer strSb = new StringBuffer(); //String is constant, StringBuffer can be changed.
        InputStreamReader inStrR = new InputStreamReader(new FileInputStream(file), "gbk"); //byte streams to character streams
        BufferedReader br = new BufferedReader(inStrR); 
        String line = br.readLine();
        while(line != null){
            strSb.append(line).append("
");
            line = br.readLine();    
        }
        br.close(); // release the file handle
        
        return strSb.toString();
    }
    
    

}
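Before moving on, here is a minimal sketch for exercising FileUtil by itself (the FileUtilDemo class is my own illustration, not part of the original project; D:/testfiles matches the directory used by the main program further below):

package com.bobo.paper.util;

import java.io.IOException;
import java.util.List;

// Hypothetical smoke test for FileUtil; the directory path is a placeholder.
public class FileUtilDemo {
    public static void main(String[] args) throws IOException {
        List<String> files = FileUtil.readDirs("D:/testfiles");
        for (String f : files) {
            System.out.println(f); // absolute path of every file found, recursively
        }
        if (!files.isEmpty()) {
            System.out.println(FileUtil.readFile(files.get(0))); // dump the first file's contents
        }
    }
}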

The next class handles word segmentation (this is the part with the dependency problem mentioned above).

package com.bobo.paper.util;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class CutWordsUtil {

    /**
     * Segment the contents of a file into a list of words.
     * Note: this method was left unfinished because the segmenter jar's further
     * dependencies were missing. The body below is a sketch that assumes the
     * IK Analyzer 2012 API (IKSegmenter/Lexeme from the core package, which does
     * not require Lucene); swap in whatever segmenter is actually on the classpath.
     * @param file the file to segment
     * @return the list of words in the file
     * @throws IOException
     */
    public static ArrayList<String> cutWords(String file) throws IOException {
        ArrayList<String> words = new ArrayList<String>();
        String text = FileUtil.readFile(file);
        // IKSegmenter(reader, useSmart): true selects the coarse-grained "smart" mode
        IKSegmenter segmenter = new IKSegmenter(new StringReader(text), true);
        Lexeme lexeme;
        while ((lexeme = segmenter.next()) != null) {
            words.add(lexeme.getLexemeText());
        }
        return words;
    }
}
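Once the segmenter jar and its dependencies are in place, CutWordsUtil can be checked in isolation with something like this (CutWordsDemo and the file path are illustrative assumptions):

package com.bobo.paper.util;

import java.io.IOException;
import java.util.ArrayList;

// Hypothetical smoke test for CutWordsUtil; the file path is a placeholder.
public class CutWordsDemo {
    public static void main(String[] args) throws IOException {
        ArrayList<String> words = CutWordsUtil.cutWords("D:/testfiles/1.txt");
        System.out.println(words); // the token list produced by the segmenter
    }
}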

The following class implements the TF-IDF algorithm itself.

package com.bobo.paper.athology;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import com.bobo.paper.util.CutWordsUtil;
import com.bobo.paper.util.FileUtil;

public class TfIdfAthology {
 
     /**
      * Count how many times each word occurs in a word list.
      * @param cutwords the word list produced by segmentation
      * @return a HashMap whose key is a word and whose value is that word's occurrence count
      */
    public static HashMap<String, Integer> normalTF(ArrayList<String> cutwords){
        HashMap<String, Integer> resTF = new HashMap<String, Integer>();
        
        for (String word : cutwords) {
            if (resTF.get(word) == null) {
                resTF.put(word, 1);
            } else {
                resTF.put(word, resTF.get(word) + 1);
            }
            System.out.println(word);
        }
        return resTF;
    }
    /**
     * Compute term frequency (tf) for a word list: count / total number of words.
     * @param cutwords the word list produced by segmentation
     * @return a HashMap whose key is a word and whose value is that word's frequency in the list
     */
    public static HashMap<String, Float> tf(ArrayList<String> cutwords){
        HashMap<String, Float> resTF = new HashMap<String, Float>();
        
        int wordLen = cutwords.size();
        HashMap<String, Integer> intTF = normalTF(cutwords); 
        
        Iterator<Map.Entry<String, Integer>> iter = intTF.entrySet().iterator(); // iterate over the raw counts
        while (iter.hasNext()) {
            Map.Entry<String, Integer> entry = iter.next();
            float freq = entry.getValue() / (float) wordLen;
            resTF.put(entry.getKey(), freq);
            System.out.println(entry.getKey() + " = " + freq);
        }
        return resTF;
    } 
   /**
    * Segment all files under a directory and return a HashMap<String, HashMap<String, Integer>>:
    * the outer key is a file name, the inner key is a word, and the value is that word's occurrence count.
    * @param dirc the directory name
    * @return per-file word counts
    * @throws IOException
    */
    public static HashMap<String, HashMap<String, Integer>> normalTFAllFiles(String dirc) throws IOException{
        HashMap<String, HashMap<String, Integer>> allNormalTF = new HashMap<String, HashMap<String,Integer>>();
        List<String> filelist = FileUtil.readDirs(dirc);
        for (String file : filelist) {
            ArrayList<String> cutwords = CutWordsUtil.cutWords(file); // segment one file
            HashMap<String, Integer> dict = normalTF(cutwords);
            allNormalTF.put(file, dict);
        }
        return allNormalTF;
    }
    /**
     * Compute the term frequencies of all words in every file under a directory.
     * @param dirc the directory name
     * @return a HashMap<String, HashMap<String, Float>>: the outer key is a file name,
     *         the inner key is a word, and the value is that word's frequency in that file
     * @throws IOException
     */
    public static HashMap<String,HashMap<String, Float>> tfAllFiles(String dirc) throws IOException{
        HashMap<String, HashMap<String, Float>> allTF = new HashMap<String, HashMap<String, Float>>();
        List<String> filelist = FileUtil.readDirs(dirc);
        
        for (String file : filelist) {
            ArrayList<String> cutwords = CutWordsUtil.cutWords(file); // segment one file
            HashMap<String, Float> dict = tf(cutwords);
            allTF.put(file, dict);
        }
        return allTF;
    }
    /**
     * Compute each word's idf value: log(|D| / df), where |D| is the number of documents
     * and df is the number of documents containing the word. A common variant smooths the
     * denominator to (df + 1) so that words absent from the corpus do not divide by zero;
     * here df is counted from the corpus itself, so it is never zero.
     * @param all_tf HashMap<String, HashMap<String, Float>>: the outer key is a file name,
     *        the inner key is a word, and the value is that word's frequency in that file
     * @return a HashMap from each word to its idf value
     */
    public static HashMap<String, Float> idf(HashMap<String,HashMap<String, Float>> all_tf){
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        // dict maps each word to the number of documents that contain it
        HashMap<String, Integer> dict = new HashMap<String, Integer>();
        int docNum = FileUtil.FileList.size();
        // loop over all files
        for (int i = 0; i < docNum; i++) {
            // the word -> frequency map recorded for this file in all_tf
            HashMap<String, Float> temp = all_tf.get(FileUtil.FileList.get(i));
            Iterator<Map.Entry<String, Float>> iter = temp.entrySet().iterator();

            while (iter.hasNext()) {
                // each distinct word of this file counts once toward its document frequency
                Map.Entry<String, Float> entry = iter.next();
                String word = entry.getKey();
                // IDF formula: idf_i = log(|D| / |{j : t_i in d_j}|), where |D| is the total number of documents
                // in the corpus and |{j : t_i in d_j}| is the number of documents containing word t_i. For a word
                // that never occurs in the corpus the denominator would be zero, so the smoothed form
                // 1 + |{j : t_i in d_j}| is commonly used instead.
                if(dict.get(word) == null){
                    dict.put(word, 1);
                }else {
                    dict.put(word, dict.get(word) + 1);
                }
            }
        }
        System.out.println("IDF for every word is:");
        Iterator<Map.Entry<String, Integer>> iter_dict = dict.entrySet().iterator();
        while (iter_dict.hasNext()) {
            Map.Entry<String, Integer> entry = iter_dict.next();
            float value = (float) Math.log(docNum / (float) entry.getValue());
            resIdf.put(entry.getKey(), value);
            System.out.println(entry.getKey() + " = " + value);
        }
        return resIdf;
    }
    /**
     * Compute the tf-idf value (tf * idf) of every word in every file and print the result.
     * @param all_tf the tf values of all words: the outer key is a file name, the inner key is a word
     * @param idfs the idf values of all words, keyed by word
     */
    public static void tf_idf(HashMap<String,HashMap<String, Float>> all_tf,HashMap<String, Float> idfs){
        HashMap<String, HashMap<String, Float>> resTfIdf = new HashMap<String, HashMap<String, Float>>();
        int docNum = FileUtil.FileList.size();
        for(int i = 0; i < docNum; i++){
            String filepath = FileUtil.FileList.get(i);
            HashMap<String, Float> tfidf = new HashMap<String, Float>();
            HashMap<String, Float> temp = all_tf.get(filepath);
            Iterator<Map.Entry<String, Float>> iter = temp.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry<String, Float> entry = iter.next();
                String word = entry.getKey();
                Float value = entry.getValue() * idfs.get(word); // tf-idf = tf * idf
                tfidf.put(word, value);
            }
            resTfIdf.put(filepath, tfidf);
        }
        System.out.println("TF-IDF for Every file is :");
        DisTfIdf(resTfIdf);
    }
    // Display the final tf-idf values, one file per line
    public static void DisTfIdf(HashMap<String, HashMap<String, Float>> tfidf){
        Iterator<Map.Entry<String, HashMap<String, Float>>> iter1 = tfidf.entrySet().iterator();
        while (iter1.hasNext()) {
            Map.Entry<String, HashMap<String, Float>> entrys = iter1.next();
            System.out.println("FileName: " + entrys.getKey());
            System.out.print("{");
            Iterator<Map.Entry<String, Float>> iter2 = entrys.getValue().entrySet().iterator();
            while (iter2.hasNext()) {
                Map.Entry<String, Float> entry = iter2.next();
                System.out.print(entry.getKey() + " = " + entry.getValue() + ", ");
            }
            System.out.println("}");
        }
    }
}
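To sanity-check the numbers, here is a tiny worked example (my own, using the same formulas as the code; Math.log is the natural logarithm). Suppose the corpus holds |D| = 2 documents, d1 = "a b a" and d2 = "a c". For the word a in d1: tf = 2/3 ≈ 0.667, and since a appears in both documents, idf = log(2/2) = 0, so its tf-idf is 0; a word that occurs in every document carries no discriminating power. For the word b in d1: tf = 1/3 ≈ 0.333 and idf = log(2/1) ≈ 0.693, giving tf-idf ≈ 0.231.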

Finally, everything is wired together like this:

package com.bobo.paper;

import java.io.IOException;
import java.util.HashMap;

import com.bobo.paper.athology.TfIdfAthology;

public class Welcome {

    /**
     * @param args
     */
    public static void main(String[] args) {

        String file = "D:/testfiles"; // directory holding the test corpus

        HashMap<String, HashMap<String, Float>> all_tf;
        try {
            all_tf = TfIdfAthology.tfAllFiles(file);   // tf per word per file
            System.out.println();
            HashMap<String, Float> idfs = TfIdfAthology.idf(all_tf); // idf per word
            System.out.println();
            TfIdfAthology.tf_idf(all_tf, idfs);        // tf * idf, printed per file
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}
Original article: https://www.cnblogs.com/bobodeboke/p/3493035.html