数据挖掘聚类算法--DBSCAN

数据集如下所示:

1,1,1
2,1.5,1
3,0.5,1
3,5,-1
7,0.75,-1
7,4,2
8,5,2
8,5.5,2

数据集的每一行包含三个属性:前两个属性是该点在二维坐标系中的x坐标和y坐标,第三个属性是该点所属的类别(-1表示孤立点,即噪声点)。各点在坐标系中的分布如下图所示:

源代码如下:

package neugle.dbscan;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * A small DBSCAN (density-based clustering) implementation for 2-D points.
 *
 * <p>Input is a text file with one sample per line in the form {@code x,y,label}.
 * The label is carried along only for printing; it does not influence clustering.
 *
 * <p>Density convention used here: a point is a core point when it has at least
 * {@code minPts} <em>other</em> samples within distance {@code eps} (the point itself
 * is not counted).
 *
 * <p>Unvisited points are picked in random order, so the order of clusters (and the
 * cluster a border point shared by two clusters ends up in) may vary between runs.
 */
public class DBScan {
    /** Samples loaded so far; every {@link #ReadFile(String)} call appends to this list. */
    private List<Point> pointList = new ArrayList<DBScan.Point>();

    /** Final clustering result: one inner list per cluster. */
    private List<List<Point>> clusterList = new ArrayList<List<Point>>();

    /** Points that ended up in no cluster. */
    private List<Point> noiseList = new ArrayList<DBScan.Point>();

    /** Indices (into {@link #pointList}) of points that have not been visited yet. */
    private List<Integer> unvisitedList = new ArrayList<Integer>();

    /** Shared random source used to pick the next unvisited point. */
    private final Random random = new Random();

    private double eps; // neighbourhood radius
    private int minPts; // minimum number of neighbours (self excluded) for a core point

    /** One input sample. */
    class Point {
        public double x;
        public double y;
        public String point_type;
        public boolean isVisited = false;
        /** True once the point has been assigned to a cluster. */
        public boolean isClustered = false;
    }

    /**
     * @param eps    neighbourhood radius
     * @param minPts minimum number of neighbours (excluding the point itself) a point
     *               needs within {@code eps} to start or extend a cluster
     */
    public DBScan(double eps, int minPts) {
        this.eps = eps;
        this.minPts = minPts;
    }

    /**
     * Reads samples from a {@code x,y,label} text file and appends them to the internal
     * sample list.
     *
     * <p>This method is best-effort and does not throw on bad input: blank lines are
     * skipped, malformed lines are reported on {@code System.err} and skipped, and an
     * I/O failure is reported via its stack trace, leaving whatever was read so far.
     *
     * @param filePath path of the data file
     * @return the internal sample list (including samples from earlier calls)
     */
    public List<Point> ReadFile(String filePath) {
        if (filePath == null) {
            System.err.println("ReadFile: filePath is null, nothing read");
            return pointList;
        }
        // try-with-resources closes the reader even when opening or parsing fails.
        try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
            String line;
            int lineNo = 0;
            while ((line = br.readLine()) != null) {
                lineNo++;
                if (line.trim().isEmpty()) {
                    continue; // tolerate blank / trailing empty lines
                }
                String[] fields = line.split(",");
                if (fields.length < 3) {
                    System.err.println("ReadFile: skipping malformed line " + lineNo + ": " + line);
                    continue;
                }
                try {
                    Point point = new Point();
                    point.x = Double.parseDouble(fields[0]);
                    point.y = Double.parseDouble(fields[1]);
                    point.point_type = fields[2];
                    this.pointList.add(point);
                } catch (NumberFormatException e) {
                    System.err.println("ReadFile: skipping non-numeric line " + lineNo + ": " + line);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return pointList;
    }

    /**
     * Loads the file and runs DBSCAN over all loaded samples, filling the cluster and
     * noise lists.
     *
     * @param filePath path of the data file, see {@link #ReadFile(String)}
     */
    public void DBScanFun(String filePath) {
        this.ReadFile(filePath);
        while (this.refreshUnvisited()) {
            Point p = this.pointList.get(this.RandomNum()); // random unvisited point
            p.isVisited = true;
            List<Point> neighborList = this.GetNeighbors(p);
            if (neighborList.size() < this.minPts) {
                // Provisional noise: p may still be adopted later as a border point.
                this.noiseList.add(p);
            } else {
                this.clusterList.add(this.expandCluster(p, neighborList));
            }
        }
    }

    /** Grows a new cluster from core point {@code core} and its neighbourhood. */
    private List<Point> expandCluster(Point core, List<Point> neighbors) {
        List<Point> cluster = new ArrayList<DBScan.Point>();
        cluster.add(core);
        core.isClustered = true;

        // Work list of candidates; it grows while we iterate, hence the index loop.
        List<Point> seeds = new ArrayList<DBScan.Point>(neighbors);
        for (int i = 0; i < seeds.size(); i++) {
            Point q = seeds.get(i);
            if (!q.isVisited) {
                q.isVisited = true;
                List<Point> qNeighbors = this.GetNeighbors(q);
                if (qNeighbors.size() >= this.minPts) {
                    for (Point r : qNeighbors) {
                        // Already visited and clustered points can contribute nothing more.
                        if (!r.isVisited || !r.isClustered) {
                            seeds.add(r);
                        }
                    }
                }
            }
            // Membership is independent of "visited": a point visited earlier and
            // marked as noise is a border point of this cluster and must be adopted.
            if (!q.isClustered) {
                q.isClustered = true;
                this.noiseList.remove(q);
                cluster.add(q);
            }
        }
        return cluster;
    }

    /** Returns the index (into the sample list) of a randomly chosen unvisited point. */
    private int RandomNum() {
        int pick = this.random.nextInt(this.unvisitedList.size());
        return this.unvisitedList.get(pick);
    }

    /**
     * Returns every other sample within {@code eps} of {@code p}. The point itself is
     * excluded by identity, so distinct samples with identical coordinates still count
     * as neighbours of each other.
     */
    private List<Point> GetNeighbors(Point p) {
        List<Point> list = new ArrayList<DBScan.Point>();
        for (Point candidate : this.pointList) {
            if (candidate != p && this.DistanceCalculate(candidate, p) <= this.eps) {
                list.add(candidate);
            }
        }
        return list;
    }

    /** Euclidean distance between two points. */
    private double DistanceCalculate(Point iris1, Point iris2) {
        double dx = iris1.x - iris2.x;
        double dy = iris1.y - iris2.y;
        return Math.sqrt(dx * dx + dy * dy);
    }

    /**
     * Rebuilds the list of unvisited indices.
     *
     * @return {@code true} while at least one point is still unvisited
     */
    private boolean refreshUnvisited() {
        this.unvisitedList = new ArrayList<Integer>();
        for (int i = 0; i < this.pointList.size(); i++) {
            if (!this.pointList.get(i).isVisited) {
                this.unvisitedList.add(i);
            }
        }
        return !this.unvisitedList.isEmpty();
    }

    /** Returns a copy of the clusters found so far (one inner list per cluster). */
    public List<List<Point>> getClusterList() {
        List<List<Point>> copy = new ArrayList<List<Point>>();
        for (List<Point> cluster : this.clusterList) {
            copy.add(new ArrayList<DBScan.Point>(cluster));
        }
        return copy;
    }

    /** Returns a copy of the points currently classified as noise. */
    public List<Point> getNoiseList() {
        return new ArrayList<DBScan.Point>(this.noiseList);
    }

    /** Prints every cluster (members, then size) followed by the noise points. */
    public void Print() {
        System.out.println("聚为" + this.clusterList.size() + "类");
        for (List<Point> c : this.clusterList) {
            System.out.println("------------");
            for (Point member : c) {
                System.out.println(member.x + " " + member.y + " "
                        + member.point_type);
            }
            System.out.println(c.size());
            System.out.println("------------");
        }

        System.out.println("噪声点有" + this.noiseList.size() + "个");
        System.out.println("------------");
        for (Point noise : this.noiseList) {
            System.out.println(noise.x + " " + noise.y + " "
                    + noise.point_type);
        }
        System.out.println("------------");
    }

    /**
     * Runs the demo with eps = 2.5 and minPts = 2.
     *
     * @param args optional: {@code args[0]} is the data file path; defaults to the
     *             original hard-coded Windows path
     */
    public static void main(String[] args) {
        // Backslashes must be escaped in Java string literals.
        String path = args.length > 0 ? args[0] : "D:\\data\\DBScan\\test.data";
        DBScan c = new DBScan(2.5, 2);
        c.DBScanFun(path);
        c.Print();
    }
}

实验结果如下所示:

聚为2类
------------
8.0 5.5 2
7.0 4.0 2
8.0 5.0 2
3
------------
------------
3.0 0.5 1
1.0 1.0 1
2.0 1.5 1
3
------------
噪声点有2个
------------
3.0 5.0 -1
7.0 0.75 -1
------------

  

原文地址:https://www.cnblogs.com/niuxiaoha/p/4661935.html