Java:用户刷屏检测(基于相似字符串检测)

背景

近期有几个业务方提出了一个需求:希望能判断一个用户在短期内是否存在刷屏现象,一旦出现就对其做出限制,并上报。

刷屏定义:取出用户最近的 20 条评论,如果其中超过 50% 的评论是"相似"的,则认为该用户在刷屏。

相似定义:两条评论的最小编辑距离 / 较长串的长度 < 0.2(即两串有超过 80% 的内容相同),则认为两串相似。

关于最小编辑距离

/**
 * Review filter that detects comment flooding ("brushing") by a single user.
 *
 * <p>The user's most recent comments are kept in a per-user Redis list. Each new comment is
 * compared with that history; when enough neighbouring comments (after sorting) are near
 * duplicates, the user is banned from commenting for {@code BAN_SECOND} seconds.
 *
 * <p>Two comments are "similar" when {@code editDistance / longerLength < 0.2}, which is
 * expressed here as a similarity score strictly greater than {@code SIMILARITY_THRESHOLD}.
 *
 * <p>NOTE(review): the Redis calls assume {@code BrowserRedisService} mirrors the standard
 * Redis commands (inclusive, possibly negative indices for LRANGE/LTRIM) - confirm.
 */
@Slf4j
public class SimpleBrushDetectionFilter implements ReviewFilter {

    /** Comments shorter than this many characters are never treated as similar. */
    private static final int MIN_COMPARABLE_LENGTH = 5;

    // TODO: make these parameters tunable at runtime.
    /** Number of recent comments kept per user and inspected on every new comment. */
    private int USER_RECENT_REVIEW_LIST_SIZE = 20;
    /** Similarity score (0-100) that must be exceeded for two comments to count as similar. */
    private int SIMILARITY_THRESHOLD = 80;
    /**
     * Fraction of the list size that the similar-pair count must exceed to trigger a ban.
     * Must not be lower than 0.5, otherwise a user can end up being banned in a loop.
     */
    private double BRUSH_THRESHOLD = 0.5;
    /** Ban duration in seconds (one day). */
    private int BAN_SECOND = 3600 * 24;
    /** Time-to-live of the per-user comment list in seconds (three days). */
    private int LIST_EXPIRE_SECOND = 3600 * 24 * 3;

    /**
     * Rejects the review if the user is currently banned; otherwise runs flood detection.
     *
     * @param reviewFilterModel the review being filtered; returned as-is when it is already ended
     *                          or when its user id is not positive
     * @return the same model instance, marked ended / not passed when the user is (or becomes) banned
     */
    @Override
    public ReviewFilterModel filter(ReviewFilterModel reviewFilterModel) {
        if (reviewFilterModel.isEnd()) {
            return reviewFilterModel;
        }

        long userId = reviewFilterModel.getReviewInfo().getUserId();
        if (userId <= 0) {
            log.info("错误的userId {}", userId);
            return reviewFilterModel;
        }

        String userKey = String.valueOf(userId);
        BrowserRedisService banRedisInstance = BrowserRedisService
                .getRedisService(RedisPrefix.REVIEW_SIMPLE_BRUSH_DETECTION_BAN);
        String bannedAt = banRedisInstance.get(userKey);

        if (StrUtil.isNotBlank(bannedAt)) {
            // The ban key is written with a non-atomic set + expire, so the key may outlive
            // the intended ban; the stored timestamp is therefore checked explicitly.
            if (isBanStillActive(bannedAt, userId)) {
                banReview(reviewFilterModel, userId);
                return reviewFilterModel;
            }
            // Expired (or unreadable) ban marker: clean it up and continue with detection.
            banRedisInstance.del(userKey);
        }

        return simpleBrushDetect(reviewFilterModel);
    }

    /**
     * Tells whether a stored ban timestamp is still within the ban window.
     *
     * @param bannedAt epoch-millisecond timestamp as stored in Redis
     * @param userId   user the timestamp belongs to (used for logging only)
     * @return {@code true} while less than {@code BAN_SECOND} seconds have elapsed;
     *         {@code false} when the ban has elapsed or the value is not a valid number
     */
    private boolean isBanStillActive(String bannedAt, long userId) {
        long bannedAtMillis;
        try {
            bannedAtMillis = Long.parseLong(bannedAt);
        } catch (NumberFormatException e) {
            // A corrupt marker must not make every later review of this user fail.
            log.warn("user {} 的封禁时间戳无法解析: {}", userId, bannedAt, e);
            return false;
        }
        // 1000L forces long arithmetic so that larger ban durations cannot overflow int.
        return System.currentTimeMillis() - bannedAtMillis < BAN_SECOND * 1000L;
    }

    /** Marks the review as finished and rejected because the user is suspected of flooding. */
    private void banReview(ReviewFilterModel reviewFilterModel, long userId) {
        log.info("user {} 疑似刷屏,限制发表评论", userId);
        reviewFilterModel.setEnd(true);
        reviewFilterModel.setPass(false);
        reviewFilterModel.setReason("该用户疑似近期出现恶意刷屏,限制发表评论");
    }

    /**
     * Compares the current comment with the user's recent comments, bans the user when too many
     * of them are similar, and appends the current comment to the stored history.
     *
     * @param reviewFilterModel the review being filtered
     * @return the same model instance, rejected when the ban was triggered by this comment
     */
    private ReviewFilterModel simpleBrushDetect(ReviewFilterModel reviewFilterModel) {
        BrowserRedisService listRedisInstance = BrowserRedisService
                .getRedisService(RedisPrefix.REVIEW_SIMPLE_BRUSH_DETECTION_LIST);
        long userId = reviewFilterModel.getReviewInfo().getUserId();
        String userKey = String.valueOf(userId);
        String currentDocument = reviewFilterModel.getReviewInfo().getDocuments();

        // LRANGE bounds are inclusive, so 0..size-1 yields at most `size` entries.
        List<String> storedReviews = listRedisInstance
                .lrange(userKey, 0, USER_RECENT_REVIEW_LIST_SIZE - 1);
        if (null == storedReviews) {
            // No history yet: just record the current comment.
            listRedisInstance.rpush(userKey, currentDocument);
            listRedisInstance.expire(userKey, LIST_EXPIRE_SECOND);
            return reviewFilterModel;
        }

        // Work on a copy so the list handed back by the Redis client is never mutated
        // (it may be unmodifiable, depending on the client).
        List<String> candidates = new java.util.ArrayList<>(storedReviews);
        candidates.add(currentDocument);

        // The exact approach would compare every pair, which is too expensive. As a shortcut
        // the comments are sorted lexicographically and only neighbours are compared.
        Collections.sort(candidates);
        int similarPairs = 0;
        for (int i = 0; i + 1 < candidates.size(); i++) {
            if (twoStringSimilarity(candidates.get(i), candidates.get(i + 1)) > SIMILARITY_THRESHOLD) {
                similarPairs++;
            }
        }

        if (similarPairs > BRUSH_THRESHOLD * USER_RECENT_REVIEW_LIST_SIZE) {
            log.info("user {} 疑似刷屏,禁止发言{}秒", userId, BAN_SECOND);
            BrowserRedisService banRedisInstance = BrowserRedisService
                    .getRedisService(RedisPrefix.REVIEW_SIMPLE_BRUSH_DETECTION_BAN);
            banRedisInstance.set(userKey, String.valueOf(System.currentTimeMillis()));
            banRedisInstance.expire(userKey, BAN_SECOND);

            // Drop two thirds of the history (keep only the newest third) so the user is not
            // banned again immediately after the ban expires.
            int keep = USER_RECENT_REVIEW_LIST_SIZE / 3;
            if (keep > 0) {
                // LTRIM takes (start, stop); "-keep .. -1" selects the last `keep` entries.
                listRedisInstance.ltrim(userKey, -keep, -1);
            } else {
                listRedisInstance.del(userKey);
            }

            banReview(reviewFilterModel, userId);
        }

        // Record the current comment and cap the list at the newest N entries.
        listRedisInstance.rpush(userKey, currentDocument);
        listRedisInstance.ltrim(userKey, -USER_RECENT_REVIEW_LIST_SIZE, -1);

        // Refresh the time-to-live of the whole list.
        listRedisInstance.expire(userKey, LIST_EXPIRE_SECOND);

        return reviewFilterModel;
    }

    /**
     * Returns the similarity of two strings as a percentage. Strings shorter than
     * {@code MIN_COMPARABLE_LENGTH} characters are considered not comparable and yield 0.
     *
     * @return a value in [0, 100]; 100 means identical
     */
    private static int twoStringSimilarity(String word1, String word2) {
        if (word1.length() < MIN_COMPARABLE_LENGTH || word2.length() < MIN_COMPARABLE_LENGTH) {
            return 0;
        }
        int distance = twoStringMinDistance(word1, word2);
        int longer = Math.max(word1.length(), word2.length());

        // Multiply before dividing: with int operands, distance / longer truncates to 0
        // unless the strings differ completely.
        return 100 - distance * 100 / longer;
    }

    /**
     * Returns the minimum edit (Levenshtein) distance between two strings, i.e. the smallest
     * number of single-character insertions, deletions and substitutions that turn
     * {@code word2} into {@code word1}.
     *
     * <p>Dynamic programming over two rolling rows: O(m * n) time, O(n) memory.
     */
    private static int twoStringMinDistance(String word1, String word2) {
        int m = word1.length();
        int n = word2.length();
        if (m == 0) {
            return n;
        }
        if (n == 0) {
            return m;
        }

        // previous[j] holds the distance between word1[0..i-1) and word2[0..j).
        int[] previous = new int[n + 1];
        int[] current = new int[n + 1];
        for (int j = 0; j <= n; j++) {
            previous[j] = j;
        }

        for (int i = 1; i <= m; i++) {
            current[0] = i;
            char c = word1.charAt(i - 1);
            for (int j = 1; j <= n; j++) {
                if (c == word2.charAt(j - 1)) {
                    current[j] = previous[j - 1];
                } else {
                    int best = Math.min(previous[j - 1], Math.min(previous[j], current[j - 1]));
                    current[j] = best + 1;
                }
            }
            int[] swap = previous;
            previous = current;
            current = swap;
        }

        return previous[n];
    }

}
原文地址:https://www.cnblogs.com/acbingo/p/9645746.html