最佳阈值划分问题

分类问题有时是个回归问题。这就需要找到阈值来将样本准确地划分到类别。

例如一个文本情感分类问题：情感有0（消极）、1（中性）、2（积极）三种类别。回归器返回的情感的分值分别为0.2,0.3,0.4,0.45,0.66,1.2,1.3,1.4，它们对应的类别分别为0，0，1，2，1，1，2，2，需要找到两个阈值x，y，小于x的表示0类别，x和y之间的表示1类别，大于y的表示2类别。

如果寻找最佳答案，复杂度为O(样本数^类别数)。

如果使用贪心法，问题复杂度可以降为O（样本数）的复杂度。
把每次寻找阈值当做一个二分类问题。这种方法能够牺牲准确性换来时间效率的提升。

import java.util.Arrays;
import java.util.Comparator;
import java.util.Random;

public class Main {
Random r = new Random();

class Sample {
    int score;//样本的得分
    int type;//样本的类别

    Sample(int score, int type) {
        this.score = score;
        this.type = type;
    }

    @Override
    public String toString() {
        return "(" + score + "," + type + ")";
    }
}

Sample[] generateProblem() {
    int n = r.nextInt(4) + 2;
    Sample[] a = new Sample[n];
    for (int i = 0; i < n; i++) {
        a[i] = new Sample(r.nextInt(50), r.nextInt(3));
    }
    Arrays.sort(a, Comparator.comparingInt(x -> x.score));
    return a;
}

int bruteforceScore(Sample[] a) {
    int bestI = 0, bestJ = 0;
    int bestScore = 0;
    for (int i = 0; i <= a.length; i++) {//第一个阈值
        for (int j = i; j <= a.length; j++) {//第二个阈值
            int score = getScore(a, i, j);
            if (score > bestScore) {
                bestScore = score;
                bestI = i;
                bestJ = j;
            }
        }
    }
    System.out.println("ans i: " + bestI + " ans j:" + bestJ);
    return bestScore;
}

int getScore(Sample[] a, int i, int j) {
    int rightCount = 0;
    for (int k = 0; k < a.length; k++) {
        if (k < i && a[k].type == 0) {
            rightCount++;
        } else if (k >= i && k < j && a[k].type == 1) {
            rightCount++;
        } else if (k >= j && a[k].type == 2) {
            rightCount++;
        }
    }
    return rightCount;
}

int mine(Sample[] a) {
    int bestI = 0;
    long bestRightCOunt = 0;
    long rightCount = Arrays.stream(a).filter(x -> x.type != 0).count();
    for (int i = 0; i < a.length; i++) {
        if (rightCount >= bestRightCOunt) {
            bestRightCOunt = rightCount;
            bestI = i;
        }
        if (a[i].type == 0) rightCount++;
        else rightCount--;
    }
    if (rightCount >= bestRightCOunt) {
        bestI = a.length;
    }
    bestRightCOunt = 0;
    final int goodI = bestI;
    if (goodI == a.length) {
        //全0的情况
        return getScore(a, a.length, a.length);
    }
    rightCount = Arrays.stream(a).filter(x -> x.score >= a[goodI].score && x.type == 2).count();
    int bestJ = 0;
    for (int i = bestI; i < a.length; i++) {
        if (rightCount > bestRightCOunt) {
            bestRightCOunt = rightCount;
            bestJ = i;
        }
        if (a[i].type == 1) rightCount++;
        else if (a[i].type == 2) rightCount--;
    }
    if (rightCount > bestRightCOunt) {
        bestJ = a.length;
    }
    System.out.println();
    System.out.println("bestI " + bestI + " bestJ " + bestJ);
    return getScore(a, bestI, bestJ);
}

Main() {
    while (true) {
        Sample a[] = generateProblem();
        for (Sample i : a) {
            System.out.print(i);
        }
        System.out.println();
        int x = bruteforceScore(a);
        int y = mine(a);
        System.out.println(x + " " + y);
        if (x != y) break;
    }
}

public static void main(String[] args) {
    new Main();
}
}