文本小票的一种无监督聚类方法

基于ostu的无监督文本聚类 对于区分不同店铺的小票效果良好 同店铺小票不同类别区分效果一般,但是对于离群点定位(小样本类别很精准),借鉴了TF/IDF的思想,还有词处理时的去停词,词频因素的考虑优化,分类的阈值计算为每次基于相似度集合的前后背景最大分割点,不断二分类(其实也可以随机选取几个小票作为聚类的种子,类似kmeans那种,但是这样聚类最后会有部分偏差,只是局部最优解,而且当时觉得还是迭代二分干净明了吧)
ostu为图像二值化处理时的一种算法(类间最大方差),图像专业,前年第一次尝试做聚类时的一个想法和实现,前后设计调优用了三天,不足地方还请见谅。
转载请标明原址,谢谢



public class ClusterHelper {
    private static Logger log = Logger
            .getLogger(ClusterHelper.class);

    private String txt = "";
    private String filekey = "";
    private int classify = 0;

    public int getClassify() {
        return classify;
    }

    public void setClassify(int classify) {
        this.classify = classify;
    }

    public void setClusterHelper(String txt, String filekey, int classify) {
        this.txt = txt;
        this.filekey = filekey;
        this.classify = classify;
    }

    public String getTxt() {
        return txt;
    }

    public void setTxt(String txt) {
        this.txt = txt;
    }

    public String getFilekey() {
        return filekey;
    }

    public void setFilekey(String filekey) {
        this.filekey = filekey;
    }

    /**
     * 得到存有所有cell的bitmap:bf
     * 
     * @param alltxt
     * @return
     */
    public static SimpleBloomFilterHelper allBloomFilter(
            ArrayList<String> alltxt) {
        Set<String> txtcell = new TreeSet<String>();
        String txt = "";
        SimpleBloomFilterHelper bf = new SimpleBloomFilterHelper();
        for (int i = 0; i < alltxt.size(); i++) {
            txt = alltxt.get(i);
            txtcell.clear();
            txtcell = txtToCell(txt);
            for (String s : txtcell) {
                if (!bf.contains(s))
                    bf.add(s);
            }
        }
        return bf;

    }

    public static ArrayList<ClusterHelper>cluster(Set<String> filekeys) {
        ArrayList<ClusterHelper>allcluster=new ArrayList<ClusterHelper>();
        for(String i:filekeys){
            ClusterHelper cluster=new ClusterHelper();
            cluster.setFilekey(i);
            cluster.setClassify(0);
            cluster.setTxt( TemplateCompare.filekeyToString(i));
        }
        String txt = "";
        String ctxt = "";
        int k = 1;
        int[][] similar = new int[allcluster.size()][allcluster.size()];
        for (int i = 0; i < allcluster.size(); i++) {
            if (allcluster.get(i).classify == 0) {
                txt = allcluster.get(i).getTxt();
                SimpleBloomFilterHelper bf = initBf(txt);
                for (int j = 0; j < allcluster.size(); j++) {
                    if (j == i) {
                        similar[i][j] = 1000000;
                    } else if (i != 0 && allcluster.get(j).classify != 0) {
                        similar[i][j] = 1000000;
                    } else {
                        ctxt = allcluster.get(j).getTxt();
                        similar[i][j] = similar(similar[i][j], ctxt, bf);
                    }
                }

                log.info("similar");
                for (int t : similar[i]) {
                    log.info(t);
                }

                ArrayList<Integer> s = new ArrayList<Integer>();
                for (int q = 0; q < similar[i].length; q++) {
                    if (similar[i][q] < 1000000) {
                        s.add(similar[i][q]);
                    }
                }
                s.add(0);
                if (s.size() == 1) {
                    return allcluster;
                }
                int[] ss = listtoint(s);
                int[] sort = ss.clone();
                Arrays.sort(sort);
                double T = ostu(sort, ss);
                log.info("T:" + T);
                allcluster = classifyByT(T, similar[i], allcluster, k);
                allcluster.get(i).setClassify(k);
                k++;
                int n = 0;
                log.info(i + "-classify:");
                for (ClusterHelper h : allcluster) {
                    log.info(n + ":" + h.classify);
                    n++;
                }
            }
        }
        int n = 0;

        for (int i = 0; i < allcluster.size(); i++) {
            if (allcluster.get(i).classify == 0) {
                allcluster = clusterString(allcluster);
            } else {
                n++;
            }
            if (n == allcluster.size()) {
                return allcluster;
            }
        }
        return allcluster;

    }

    /**
     * 基于ostu的无监督文本聚类 对于区分不同店铺的小票效果良好 同店铺小票不同类别区分效果一般,
     * 但是对于离群点定位(小样本类别很精准)
     * 
     * @param alltxt
     */
    public static ArrayList<ClusterHelper> clusterString(
            ArrayList<ClusterHelper> allcluster) {
        String txt = "";
        String ctxt = "";
        int k = 1;
        int[][] similar = new int[allcluster.size()][allcluster.size()];
        for (int i = 0; i < allcluster.size(); i++) {
            if (allcluster.get(i).classify == 0) {
                txt = allcluster.get(i).getTxt();
                SimpleBloomFilterHelper bf = initBf(txt);
                for (int j = 0; j < allcluster.size(); j++) {
                    if (j == i) {
                        similar[i][j] = 1000000;
                    } else if (i != 0 && allcluster.get(j).classify != 0) {
                        similar[i][j] = 1000000;
                    } else {
                        ctxt = allcluster.get(j).getTxt();
                        similar[i][j] = similar(similar[i][j], ctxt, bf);
                    }
                }

                log.info("similar");
                for (int t : similar[i]) {
                    log.info(t);
                }

                ArrayList<Integer> s = new ArrayList<Integer>();
                for (int q = 0; q < similar[i].length; q++) {
                    if (similar[i][q] < 1000000) {
                        s.add(similar[i][q]);
                    }
                }
                s.add(0);
                if (s.size() == 1) {
                    return allcluster;
                }
                int[] ss = listtoint(s);
                int[] sort = ss.clone();
                Arrays.sort(sort);
                double T = ostu(sort, ss);
                log.info("T:" + T);
                allcluster = classifyByT(T, similar[i], allcluster, k);
                allcluster.get(i).setClassify(k);
                k++;
                int n = 0;
                log.info(i + "-classify:");
                for (ClusterHelper h : allcluster) {
                    log.info(n + ":" + h.classify);
                    n++;
                }
            }
        }
        int n = 0;

        for (int i = 0; i < allcluster.size(); i++) {
            if (allcluster.get(i).classify == 0) {
                allcluster = clusterString(allcluster);
            } else {
                n++;
            }
            if (n == allcluster.size()) {
                return allcluster;
            }
        }
        return allcluster;

    }

    public static ArrayList<ClusterHelper> classifyByT(double T, int[] similar,
            ArrayList<ClusterHelper> allcluster, int k) {
        for (int i = 0; i < similar.length; i++) {
            if (similar[i] >= T && similar[i] < 1000000) {
                allcluster.get(i).classify = k;
            }
        }

        return allcluster;
    }

    /**
     * 初始化bf
     * 
     * @param txt
     * @param bf
     */
    public static SimpleBloomFilterHelper initBf(String txt) {
        SimpleBloomFilterHelper bf = new SimpleBloomFilterHelper();
        Set<String> txtcell = txtToCell(txt);
        for (String s : txtcell) {
            bf.add(s);
        }
        return bf;
    }

    /**
     * 计算相似度similar
     * 
     * @param similar
     * @param ctxt
     * @param bf
     * @return
     */
    public static int similar(int similar, String ctxt,
            SimpleBloomFilterHelper bf) {
        Set<String> cxtcell = txtToCell(ctxt);
        for (String c : cxtcell) {
            if (bf.contains(c)) {
                similar++;
            }
        }
        return similar;
    }

    /**
     * 大津法阈值分割
     * 
     * @param sort
     * @param ss
     * @return
     */
    public static int ostu(int[] sort, int[] ss) {
        // 设当前景与背景的分割阈值为t时,前景点占图像比例为w0,均值为u0,背景点占图像比例为w1,均值为u1。
        // 则整个图像的均值为u = w0*u0+w1*u1。建立目标函数g(t)=w0*(u0-u)^2+w1*(u1-u)^2,
        int T = 0;
        double gt = 0;
        double g = 0;
        int mc1 = 0;
        int mc2 = 0;
        int m = mean(ss);
        ArrayList<Integer> c1 = new ArrayList<Integer>();
        ArrayList<Integer> c2 = new ArrayList<Integer>();
        for (int i = sort[0]; i < sort[sort.length - 1]; i++) {
            for (int j = 0; j < ss.length; j++) {
                if (ss[j] <= i) {
                    c1.add(ss[j]);
                } else {
                    c2.add(ss[j]);
                }
            }
            int[] sc1 = listtoint(c1);
            int[] sc2 = listtoint(c2);
            mc1 = mean(sc1);
            mc2 = mean(sc2);
            g = (sc1.length * (Math.pow(mc1 - m, 2)) + sc2.length
                    * (Math.pow(mc2 - m, 2)))
                    / ss.length;
            if (g > gt) {
                T = i;
                gt = g;
            }
        }
        return T;
    }

    public static int[] listtoint(ArrayList<Integer> c) {
        int[] s = new int[c.size()];
        for (int i = 0; i < c.size(); i++) {
            s[i] = (int) c.get(i);
        }
        return s;
    }

    public static int mean(int[] s) {
        int mean = 0;
        int sum = 0;
        for (int i = 0; i < s.length; i++) {
            sum = sum + s[i];
        }
        mean = sum / s.length;
        return mean;
    }

    /**
     * 将txt分解为最小单元集合 以“ ”和":"为分割标记 且不含有“0123456789.-*”
     * 
     * @param txt
     * @return
     */
    public static Set<String> txtToCell(String txt) {
        Set<String> txtcell = new TreeSet<String>();
        ArrayList<Integer> num = new ArrayList<Integer>();
        int k = 0;
        int begin = 0;
        String cell = "";
        String cc = "";
        txt = txt.replaceAll("
", " ");
        txt = txt.replaceAll("1", " ");
        txt = txt.replaceAll("2", " ");
        txt = txt.replaceAll("3", " ");
        txt = txt.replaceAll("4", " ");
        txt = txt.replaceAll("5", " ");
        txt = txt.replaceAll("6", " ");
        txt = txt.replaceAll("7", " ");
        txt = txt.replaceAll("8", " ");
        txt = txt.replaceAll("9", " ");
        txt = txt.replaceAll("0", " ");
        txt = txt.replaceAll("\.", " ");
        // txt = txt.replaceAll("F", " ");
        // txt = txt.replaceAll("=", " ");
        // txt = txt.replaceAll("--", " ");
        txt = txt.replaceAll(":", " ");
        if (!txt.substring(0, 1).equals(" ")) {
            num.add(0);
        }

        for (int j = 0; j < txt.length(); j++) {
            cc = txt.substring(j, j + 1);
            if (cc.equals(" ")) {
                if (j != begin + 1) {
                    begin = j;
                    num.add(begin);
                    k = 0;
                } else {
                    k++;
                    begin = begin + k;
                }
            }
        }
        for (int i = 0; i < num.size() - 1; i++) {
            cell = txt.substring(num.get(i), num.get(i + 1));
            cell = cell.replaceAll(" +", "");
            if (!cell.equals("")) {
                txtcell.add(cell);
            }
        }

        return txtcell;
    }

    /**
     * 判断cell中是否含有“0123456789.-*”
     * 
     * @param cell
     * @return
     */
    public static boolean numberjudge(String cell) {
        if (cell.indexOf("0") == -1 && cell.indexOf("1") == -1
                && cell.indexOf("2") == -1 && cell.indexOf("3") == -1
                && cell.indexOf("4") == -1 && cell.indexOf("5") == -1
                && cell.indexOf("6") == -1 && cell.indexOf("7") == -1
                && cell.indexOf("8") == -1 && cell.indexOf("9") == -1
                && cell.indexOf(".") == -1 && cell.indexOf("-") == -1
                && cell.indexOf("*") == -1) {
            return true;
        } else
            return false;
    }

    public static String txtread(String fileName) throws IOException {
        String result = "";
        BufferedReader br = new BufferedReader(new FileReader(fileName));
        String line = "";
        StringBuffer buffer = new StringBuffer();
        while ((line = br.readLine()) != null) {
            buffer.append(line + "
");
        }
        result = buffer.toString();
        br.close();

        return result;
    }

    public static void testtxtToCell() {
        try {
            String txt = txtread("/Users/zhangdebin/Documents/txt/0.txt");
            log.info(txt);
            Set<String> allcell = txtToCell(txt);
            log.info("allcell:");
            for (String s : allcell) {
                log.info(s);
            }
        } catch (IOException e) {

            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws IOException {
        String filepath = "/Users/Documents/3000testtxt";
        String filepath2 = "/Users/Documents/imgtxt";
        String filepath3 = "/Users/Documents/testtail";
        File file = new File(filepath2);
        int len = file.list().length - 1;
        ArrayList<String> alltxt = new ArrayList<String>();
        for (int i = 0; i < len; i++) {
            String readfile = file + "/" + i + ".txt";
            try {
                alltxt.add(txtread(readfile));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        ArrayList<ClusterHelper> cluster = new ArrayList<ClusterHelper>();
        for (int i = 0; i < alltxt.size(); i++) {
            ClusterHelper c = new ClusterHelper();
            c.setTxt(alltxt.get(i));
            c.setClassify(0);
            c.setFilekey("123");
            cluster.add(c);
        }
        long start = System.currentTimeMillis();
        cluster = clusterString(cluster);
        log.info("cluster.size:" + cluster.size());
        long end = System.currentTimeMillis();
        log.info("time:" + (end - start));
        Set<Integer> cc = new TreeSet<Integer>();
        int a[] = new int[100];
        for (int i = 0; i < cluster.size(); i++) {
            cc.add(cluster.get(i).classify);
            a[cluster.get(i).classify]++;
        }
        log.info("cc:");
        for (int i : cc) {
            log.info(i + "-" + a[i]);
        }

    }

}
原文地址:https://www.cnblogs.com/zhangdebin/p/5567902.html