TF-IDF

参考源（写的很明了）:
http://www.ruanyifeng.com/blog/2013/03/tf-idf.html
package com.data.text.tfidf;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

public class TF_IDF {

    /** Total number of documents in the corpus (read from line 1 of the IDF file). */
    private double NUM_DOCS;

    /** term -> number of corpus documents containing the term (document frequency). */
    private Map<String, Integer> idf_map;

    /**
     * Loads IDF statistics from a file.
     * Format: the first line is the total document count; every following
     * line is "term : documentFrequency".
     *
     * @param fileName path to the IDF statistics file
     */
    public TF_IDF(String fileName){
        idf_map = new HashMap<String, Integer>();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File(fileName)));

            // First line holds the total number of documents.
            String tempString = reader.readLine();
            NUM_DOCS = (double) Integer.parseInt(tempString.trim());

            // Read one "term : frequency" entry per line until EOF.
            while ((tempString = reader.readLine()) != null) {
                String[] arr = tempString.split(" : ");
                // BUG FIX: a malformed line used to throw an uncaught
                // ArrayIndexOutOfBoundsException; skip it instead.
                if (arr.length < 2) {
                    continue;
                }
                idf_map.put(arr[0], Integer.parseInt(arr[1].trim()));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Single close in finally (the original closed twice: in try and finally).
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // best-effort close; nothing useful to do here
                }
            }
        }
    }

    /**
     * Computes the TF-IDF weight of every term in one document.
     *
     * @param tf_map term -> raw occurrence count within the document
     * @return features sorted by weight, highest first; empty list for an empty document
     */
    public List<Feature> cacu(Map<String, Integer> tf_map) {

        // Total number of tokens in the document (TF denominator).
        int word_num_sum = 0;
        for (Entry<String, Integer> entry : tf_map.entrySet()) {
            word_num_sum += entry.getValue();
        }

        List<Feature> list_fea = new ArrayList<Feature>();
        // BUG FIX: an empty document used to divide by zero below.
        if (word_num_sum == 0) {
            return list_fea;
        }

        for (Entry<String, Integer> entry : tf_map.entrySet()) {
            String word = entry.getKey();
            Integer num = entry.getValue();
            double tf = (double) num / word_num_sum;
            // BUG FIX: idf_map.get(word) returned null (NullPointerException on
            // auto-unboxing) for terms absent from the corpus; treat unseen
            // terms as appearing in one document.
            Integer df = idf_map.get(word);
            double idf = Math.log(NUM_DOCS / (df == null ? 1 : df) + 1); // +1 平滑 (smoothing), 逆文档频率
            double weight = tf * idf;
            list_fea.add(new Feature(word, num, weight));
        }

        // Sort by weight, descending (Feature's natural order).
        Collections.sort(list_fea);

        return list_fea;
    }

    public static void main(String[] args) {
        // intentionally empty; this class is used programmatically
    }

}


package com.data.text.tfidf;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

public class StopWord {

    /**
     * Loads the stop-word set from "stopwords.txt" in the working directory.
     *
     * @return the set of stop words; empty if the file cannot be read
     */
    public static Set<String> GetStopWords(){
        return readwords("stopwords.txt");
    }

    /**
     * Reads one stop word per line from the given file, trimming whitespace.
     * I/O errors are printed and swallowed, yielding an empty set.
     *
     * @param fileName path of the stop-word file
     * @return the words read, possibly empty
     */
    private static Set<String> readwords(String fileName){
        Set<String> words = new HashSet<String>();
        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader(new File(fileName)));
            // Read until EOF, one word per line.
            for (String line = in.readLine(); line != null; line = in.readLine()) {
                words.add(line.trim());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // best-effort close
                }
            }
        }
        return words;
    }
}


package com.data.text.tfidf;

/**
 * 特征词
 * @author root
 *
 */
public class Feature implements Comparable<Feature> {
    private String word;
    private Integer num;
    private double weight;

    public Feature(String word, Integer num, double weight) {
        this.word = word;
        this.num = num;
        this.weight = weight;
    }

    public String getWord() {
        return word;
    }

    public Integer getNum() {
        return num;
    }

    public double getWeight() {
        return weight;
    }

    @Override
    public int compareTo(Feature o) {
        if(this.getWeight() == o.getWeight()){
            return 0;
        }else if(this.getWeight() > o.getWeight()){
            return -1;
        }else{
            return 1;
        }
    }
    
    public String toString(){
        return this.word + " freq: " + num + " weight: " + weight;
    }
}
__author__ = 'dell'

import math
import re
from operator import itemgetter


class TfIdf:
    """Computes TF-IDF keyword weights for documents against a corpus.

    Corpus file format: the first line is the total document count; each
    subsequent line is "term:document_frequency".
    """

    def __init__(self, corpus_filename = None, stopword_filename = None, DEFAULT_IDF = 1.5):
        self.num_docs = 0
        self.term_num_docs = {}  # term -> number of documents containing the term
        self.stopwords = []
        self.idf_default = DEFAULT_IDF  # IDF assigned to terms never seen in the corpus

        if corpus_filename:
            # BUG FIX: the file handle was never closed; use a context manager.
            with open(corpus_filename, 'r') as corpus_file:
                # First line holds the number of documents.
                self.num_docs = int(corpus_file.readline())
                # Remaining lines: "term:frequency".
                for line in corpus_file:
                    tokens = line.split(':')
                    term = tokens[0].strip()
                    frequency = int(tokens[1].strip())
                    self.term_num_docs[term] = frequency

        if stopword_filename:
            # BUG FIX: close the stop-word file as well.
            with open(stopword_filename) as stopword_file:
                self.stopwords = [line.strip() for line in stopword_file]

    def get_tokens(self, str):
        """Tokenize: lowercased word-ish tokens (letters/digits/_/'/@/#) and HTML-like tags.

        BUG FIX: the original pattern contained [w'@#]+ — the backslash of
        \\w was lost, so the class matched only the literal characters
        w, ', @, # and dropped almost every real word.
        """
        return re.findall(r"<a.*?/a>|<[^>]*>|[\w'@#]+", str.lower())

    def add_input_document(self, input):
        """Count one new document: bump df for each distinct term it contains."""
        self.num_docs += 1
        for word in set(self.get_tokens(input)):
            self.term_num_docs[word] = self.term_num_docs.get(word, 0) + 1

    def get_num_docs(self):
        return self.num_docs

    def get_idf(self, term):
        """Smoothed inverse document frequency.

        Stopwords get 0 (never selected); terms unseen in the corpus get the
        configured default.
        """
        if term in self.stopwords:
            return 0
        if term not in self.term_num_docs:
            return self.idf_default
        return math.log(float(1 + self.get_num_docs()) / (1 + self.term_num_docs[term]))

    def get_doc_keywords(self, curr_doc):
        """Return (term, tf*idf) pairs for curr_doc, sorted by weight descending."""
        tokens = self.get_tokens(curr_doc)
        # BUG FIX: an empty document used to raise ZeroDivisionError.
        if not tokens:
            return []
        # float() on the denominator so the division is true division even on
        # Python 2 (the original applied float() AFTER an int//int division).
        total = float(len(tokens))
        # Count once in O(n) instead of calling tokens.count() per distinct token.
        counts = {}
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1
        tfidf = {}
        for word, count in counts.items():
            tfidf[word] = (count / total) * self.get_idf(word)
        return sorted(tfidf.items(), key=itemgetter(1), reverse=True)
原文地址: https://www.cnblogs.com/i80386/p/3240601.html