weka特征选择(IG、chi-square)

一、说明

  IG 是 Information Gain 的缩写,中文名称是信息增益,是一种很有效的特征选择方法(特别是在使用 SVM 分类时)。这里不做详细介绍,有兴趣的读者可以自行搜索相关资料。

  chi-square(卡方检验)是一种常用的特征筛选方法,在种子词扩展那篇文章中已有详细说明,这里不再赘述。

二、weka中的使用方法

  1、特征筛选代码

package com.lvxinjian.alg.models.feature;

import java.nio.charset.Charset;
import java.util.ArrayList;

import weka.attributeSelection.ASEvaluation;
import weka.attributeSelection.AttributeEvaluator;
import weka.attributeSelection.Ranker;
import weka.core.Instances;

import com.iminer.tool.common.util.FileTool;
/**
 * Feature selection built on Weka's attribute evaluators (the callers in this
 * project use it with information gain and chi-square).
 */
public class FeatureSelectorByWeka {

    /**
     * Scores every attribute of {@code data} with {@code eval}, ranks them with
     * Weka's {@link Ranker}, and saves the top-ranked attribute names to a file.
     *
     * <p>One line is written per kept attribute, in rank order, with the form
     * {@code rank<TAB>attributeName<TAB>1}, where {@code rank} starts at 0.
     * Writing stops at the first attribute whose score is exactly 0 or once
     * {@code maxNumberOfAttribute} attributes have been collected, whichever
     * comes first.
     *
     * @param eval evaluator instance used to score attributes; it must also
     *        implement {@link AttributeEvaluator}
     * @param data data set (loaded from ARFF by the callers) to evaluate
     * @param maxNumberOfAttribute maximum number of attributes to keep
     * @param outputPath path of the lexicon file to write
     * @throws IllegalArgumentException if {@code eval} or {@code data} is null,
     *         or {@code eval} is not an {@link AttributeEvaluator}
     * @throws Exception if Weka fails to build the evaluator or rank the
     *         attributes, or the output file cannot be written
     */
    public void EvalueAndRank(ASEvaluation eval , Instances data ,int maxNumberOfAttribute , String outputPath) throws Exception
    {
        // Fail fast with a readable message instead of a late NPE/ClassCastException.
        if (eval == null || data == null) {
            throw new IllegalArgumentException("eval and data must not be null");
        }
        if (!(eval instanceof AttributeEvaluator)) {
            throw new IllegalArgumentException(
                    eval.getClass().getName() + " is not an AttributeEvaluator");
        }
        AttributeEvaluator scorer = (AttributeEvaluator) eval;

        eval.buildEvaluator(data);

        // Ranker performs no subset search; it only orders the attributes by
        // the evaluator's score. One search is enough: the previous version
        // ran it twice and discarded the first result.
        Ranker rank = new Ranker();
        int[] attrIndex = rank.search(eval, data);

        // Collect the ranked attribute names until the limit or a zero score is hit.
        ArrayList<String> attributeWords = new ArrayList<String>();
        for (int i = 0; i < attrIndex.length && i < maxNumberOfAttribute; i++) {
            // The ranking is by score, so the first zero-score attribute ends
            // the useful part of the list.
            if (scorer.evaluateAttribute(attrIndex[i]) == 0) {
                break;
            }
            attributeWords.add(i + "\t"
                    + data.attribute(attrIndex[i]).name() + "\t" + "1");
        }
        FileTool.SaveListToFile(attributeWords, outputPath, false,
                Charset.forName("utf8"));
    }

}
package com.lvxinjian.alg.models.feature;

import java.io.IOException;

import weka.attributeSelection.ASEvaluation;
import weka.attributeSelection.ChiSquaredAttributeEval;
import weka.attributeSelection.InfoGainAttributeEval;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

import com.iminer.alg.models.generatefile.ParameterUtils;

/**
 * @Description : IG、Chi-square特征筛选
 *
 */
public class WekaFeatureSelector extends FeatureSelector{        

    /**
     * 最大的特征个数
     */
    private int maxFeatureNum = 10000;
    /**
     * 特征文件保存路径
     */
    private String outputPath = null;
    /**
     * @Fields rule 对于特征过滤的规则
     */
    private String classname = "CLASS";
    /**
     * 特征筛选方法,默认为IG
     */
    private String selectMethod = "IG";
    
    private boolean Initialization(String options){        
        try {
            String [] paramArrayOfString = options.split(" ");
            
            //初始化特征最大个数 
            String maxFeatureNum = ParameterUtils.getOption("maxFeatureNum", paramArrayOfString);
            if(maxFeatureNum.length() != 0)
                this.maxFeatureNum = Integer.parseInt(maxFeatureNum);
            //初始化类别
            String classname = ParameterUtils.getOption("class", paramArrayOfString);
            if(classname.length() != 0)
                this.classname = classname;
            else{
                System.out.println("use default class name("CLASS")");
            }
            //初始化特征保存路径
            String outputPath = ParameterUtils.getOption("outputPath", paramArrayOfString);
            if(outputPath.length() != 0)
                this.outputPath = outputPath;
            else{
                System.out.println("please initialze output path.");
                return false;
            }
            String selectMethod = ParameterUtils.getOption("selectMethod", paramArrayOfString);
            if(selectMethod.length() != 0)
                this.selectMethod = selectMethod;
            else{
                System.out.println("use default select method(IG)");
            }
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }            
        return true;
    }
    @Override
    public boolean selectFeature(Object obj ,String options) throws IOException {        
        try {
            if(!Initialization(options))
                return false;        
            Instances data = (Instances)obj;
            data.setClass(data.attribute(this.classname));
            ASEvaluation selector = null;
            if(this.selectMethod.equals("IG"))
                selector = new InfoGainAttributeEval();
            else if(this.selectMethod.equals("CHI"))
                selector = new ChiSquaredAttributeEval();
            FeatureSelectorByWeka attributeSelector = new FeatureSelectorByWeka();    
            attributeSelector.EvalueAndRank(selector, data ,this.maxFeatureNum ,this.outputPath);
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        
        return true;            
    }
    
    public static void main(String [] args) throws Exception
    {
        String root = "C:\Users\Administrator\Desktop\12_05\模型训练\1219\";
        WekaFeatureSelector selector = new WekaFeatureSelector();
        Instances data = DataSource.read(root + "train.Bigram.arff");
        String options = "-maxFeatureNum 10000 -outputPath lex.txt";
        selector.selectFeature(data, options);
    }
}

参考:

weka数据挖掘拾遗(二)---- 特征选择(IG、chi-square)

Weka学习四(属性选择)

原文地址:https://www.cnblogs.com/549294286/p/3644098.html