K-Means

//转自:https://www.cnblogs.com/LCcnblogs/p/6000934.html
// ConsoleApplication1.cpp : 此文件包含 "main" 函数。程序执行将在此处开始并结束。 // #pragma warning(disable:4996) #include <iostream> #include<stdio.h> #include<stdlib.h> #include<string.h> #include<math.h> #define MAX_ROUNDS 100 //最大允许的聚类次数 #define _CRT_S //“点”的结构体 typedef struct Point { double x_value; //用于存放点在X轴上的值 double y_value; //用于存放点在Y轴上的值 int cluster_id; //用于存放该点所属的cluster id }Point; Point* data; //cluster center的结构体 typedef struct ClusterCenter { double x_value; double y_value; int cluster_id; }ClusterCenter; ClusterCenter* cluster_center; //计算cluster center的结构体 typedef struct CenterCalc { double x_value; double y_value; }CenterCalc; CenterCalc *center_calc; int is_continue; //kmeans 运算是否继续 int* cluster_center_init_index; //记录每个cluster center最初用的是哪个“点” double* distance_from_center; //记录一个“点”到所有cluster center的距离 int* data_size_per_cluster; //每个cluster点的个数 int data_size_total; //设定点的个数 char filename[200]; //要读取的点的数据的文件名 int cluster_count; //设定的cluster的个数 void memoryAlloc(); void memoryFree(); void readDataFromFile(); void initialCluster(); void calcDistance2OneCenter(int pointID, int centerID); void calcDistance2AllCenters(int pointID); void partition4OnePoint(int pointID); void partition4AllPointOneCluster(); void calcClusterCenter(); void kmeans(); void compareNewOldClusterCenter(CenterCalc* center_calc); int main(int argc, char* argv[]) { if (argc != 4) { printf("This application needs 3 parameters to run:" " the 1st is the size of data set," " the 2nd is the file name that contains data" " the 3rd indicates the cluster_count" " "); exit(1); } data_size_total = atoi(argv[1]); strcat_s(filename, argv[2]); cluster_count = atoi(argv[3]); //1, memory alloc memoryAlloc(); //2, read point data from file readDataFromFile(); //3, initial cluster initialCluster(); //4, run k-means kmeans(); //5, memory free & end memoryFree(); return 0; } void memoryAlloc() { data = (Point*)malloc(sizeof(struct Point) * (data_size_total)); if (!data) { printf("malloc error:data!"); exit(1); } cluster_center_init_index = (int*)malloc(sizeof(int) * (cluster_count)); if (!cluster_center_init_index) { printf("malloc error:cluster_center! "); exit(1); } distance_from_center = (double*)malloc(sizeof(double) * (cluster_count)); if (!distance_from_center) { printf("malloc error: distance_from_center! "); exit(1); } cluster_center = (ClusterCenter*)malloc(sizeof(struct ClusterCenter) * (cluster_count)); if (!cluster_center) { printf("malloc cluster center new error! "); exit(1); } center_calc = (CenterCalc*)malloc(sizeof(CenterCalc) * cluster_count); if (!center_calc) { printf("malloc error: center_calc! "); exit(1); } data_size_per_cluster = (int*)malloc(sizeof(int) * (cluster_count)); if (!data_size_per_cluster) { printf("malloc error: data_size_per_cluster "); exit(1); } } void memoryFree() { free(data); data = NULL; free(cluster_center_init_index); cluster_center_init_index = NULL; free(distance_from_center); distance_from_center = NULL; free(cluster_center); cluster_center = NULL; free(center_calc); center_calc = NULL; free(data_size_per_cluster); data_size_per_cluster = NULL; } //从文件中读入每个点的x和y值 void readDataFromFile() { int i; FILE* fread; if (NULL == (fread = fopen(filename, "r"))) { printf("open file(%s) error! ", filename); exit(1); } for (i = 0; i < data_size_total; i++) { if (2 != fscanf(fread, "%lf %lf ", &data[i].x_value, &data[i].y_value)) { printf("fscanf error: %d ", i); } data[i].cluster_id = -1; //初始时每个点所属的cluster id均置为-1 printf("After reading, point index:%d, X:%d, Y:%d, cluster_id:%d ", i, data[i].x_value, data[i].y_value, data[i].cluster_id); } } //根据传入的cluster_count来随机的选择一个点作为 一个cluster的center void initialCluster() { int i, j; int random; //产生初始化的cluster_count个聚类 for (i = 0; i < cluster_count; i++) { cluster_center_init_index[i] = -1; } //随机选择一个点作为每个cluster的center(不重复) for (i = 0; i < cluster_count; i++) { Reselect: random = rand() % (data_size_total - 1); for (j = 0; j < i; j++) { if (random == cluster_center_init_index[j]) goto Reselect; } cluster_center_init_index[i] = random; printf("cluster_id: %d, located in point index:%d ", i, random); } //将随机选择的点作为center,同时这个点的cluster id也就确定了 for (i = 0; i < cluster_count; i++) { cluster_center[i].x_value = data[cluster_center_init_index[i]].x_value; cluster_center[i].y_value = data[cluster_center_init_index[i]].y_value; cluster_center[i].cluster_id = i; data[cluster_center_init_index[i]].cluster_id = i; printf("cluster_id:%d, index:%d, x_value:%f, y_value:%f ", cluster_center[i].cluster_id, cluster_center_init_index[i], cluster_center[i].x_value, cluster_center[i].y_value); } } //计算一个点到一个cluster center的distance void calcDistance2OneCenter(int point_id, int center_id) { distance_from_center[center_id] = sqrt((data[point_id].x_value - cluster_center[center_id].x_value)*(double)(data[point_id].x_value - cluster_center[center_id].x_value) + (double)(data[point_id].y_value - cluster_center[center_id].y_value) * (data[point_id].y_value - cluster_center[center_id].y_value)); } //计算一个点到每个cluster center的distance void calcDistance2AllCenters(int point_id) { int i; for (i = 0; i < cluster_count; i++) { calcDistance2OneCenter(point_id, i); } } //确定一个点属于哪一个cluster center(取距离最小的) void partition4OnePoint(int point_id) { int i; int min_index = 0; double min_value = distance_from_center[0]; for (i = 0; i < cluster_count; i++) { if (distance_from_center[i] < min_value) { min_value = distance_from_center[i]; min_index = i; } } data[point_id].cluster_id = cluster_center[min_index].cluster_id; } //在一轮的聚类中得到所有的point所属于的cluster center void partition4AllPointOneCluster() { int i; for (i = 0; i < data_size_total; i++) { if (data[i].cluster_id != -1) //这个点就是center,不需要计算 continue; else { calcDistance2AllCenters(i); //计算第i个点到所有center的distance partition4OnePoint(i); //根据distance对第i个点进行partition } } } //重新计算新的cluster center void calcClusterCenter() { int i; memset(center_calc, 0, sizeof(CenterCalc) * cluster_count); memset(data_size_per_cluster, 0, sizeof(int) * cluster_count); //分别对每个cluster内的每个点的X和Y求和,并计每个cluster内点的个数 for (i = 0; i < data_size_total; i++) { center_calc[data[i].cluster_id].x_value += data[i].x_value; center_calc[data[i].cluster_id].y_value += data[i].y_value; data_size_per_cluster[data[i].cluster_id]++; } //计算每个cluster内点的X和Y的均值作为center for (i = 0; i < cluster_count; i++) { if (data_size_per_cluster[i] != 0) { center_calc[i].x_value = center_calc[i].x_value / (double)(data_size_per_cluster[i]); center_calc[i].y_value = center_calc[i].y_value / (double)(data_size_per_cluster[i]); printf(" cluster %d point cnt:%d ", i, data_size_per_cluster[i]); printf(" cluster %d center: X:%f, Y:%f ", i, center_calc[i].x_value, center_calc[i].y_value); } else printf(" cluster %d count is zero ", i); } //比较新的和旧的cluster center值的差别。如果是相等的,则停止K-means算法。 compareNewOldClusterCenter(center_calc); //将新的cluster center的值放入cluster_center结构体中 for (i = 0; i < cluster_count; i++) { cluster_center[i].x_value = center_calc[i].x_value; cluster_center[i].y_value = center_calc[i].y_value; cluster_center[i].cluster_id = i; } //在重新计算了新的cluster center之后,要重新来为每一个Point进行聚类,所以data中用于表示聚类ID的cluster_id要都重新置为-1。 for (i = 0; i < data_size_total; i++) { data[i].cluster_id = -1; } } //比较新旧的cluster center的值,完全一样表示聚类完成 void compareNewOldClusterCenter(CenterCalc* center_calc) { int i; is_continue = 0; //等于0表示不要继续,1表示要继续 for (i = 0; i < cluster_count; i++) { if (center_calc[i].x_value != cluster_center[i].x_value || center_calc[i].y_value != cluster_center[i].y_value) { is_continue = 1; break; } } } //K-means算法 void kmeans() { int rounds; for (rounds = 0; rounds < MAX_ROUNDS; rounds++) { printf(" Rounds : %d ", rounds + 1); partition4AllPointOneCluster(); calcClusterCenter(); if (0 == is_continue) { printf(" after %d rounds, the classification is ok and can stop. ", rounds + 1); break; } } }

  

// ConsoleApplication1.cpp : 此文件包含 "main" 函数。程序执行将在此处开始并结束。//#pragma warning(disable:4996)#include <iostream>#include<stdio.h>#include<stdlib.h>#include<string.h>#include<math.h>
#define MAX_ROUNDS 100    //最大允许的聚类次数#define _CRT_S//“点”的结构体  typedef struct Point {double x_value;           //用于存放点在X轴上的值double y_value;           //用于存放点在Y轴上的值int cluster_id;        //用于存放该点所属的cluster id}Point;Point* data;
//cluster center的结构体typedef struct ClusterCenter {double x_value;double y_value;int cluster_id;}ClusterCenter;ClusterCenter* cluster_center;
//计算cluster center的结构体typedef struct CenterCalc {double x_value;double y_value;}CenterCalc;CenterCalc *center_calc;
int is_continue;                               //kmeans 运算是否继续int* cluster_center_init_index;        //记录每个cluster center最初用的是哪个“点”double* distance_from_center;      //记录一个“点”到所有cluster center的距离int* data_size_per_cluster;            //每个cluster点的个数int data_size_total;                        //设定点的个数char filename[200];                       //要读取的点的数据的文件名int cluster_count;                          //设定的cluster的个数
void memoryAlloc();void memoryFree();void readDataFromFile();void initialCluster();void calcDistance2OneCenter(int pointID, int centerID);void calcDistance2AllCenters(int pointID);void partition4OnePoint(int pointID);void partition4AllPointOneCluster();void calcClusterCenter();void kmeans();void compareNewOldClusterCenter(CenterCalc* center_calc);
int main(int argc, char* argv[]){if (argc != 4){printf("This application needs 3 parameters to run:"" the 1st is the size of data set,"" the 2nd is the file name that contains data"" the 3rd indicates the cluster_count"" ");exit(1);}
data_size_total = atoi(argv[1]);strcat_s(filename, argv[2]);cluster_count = atoi(argv[3]);//1, memory allocmemoryAlloc();//2, read point data from filereadDataFromFile();//3, initial clusterinitialCluster();//4, run k-meanskmeans();//5, memory free & endmemoryFree();
return 0;}
void memoryAlloc(){data = (Point*)malloc(sizeof(struct Point) * (data_size_total));if (!data){printf("malloc error:data!");exit(1);}cluster_center_init_index = (int*)malloc(sizeof(int) * (cluster_count));if (!cluster_center_init_index){printf("malloc error:cluster_center! ");exit(1);}distance_from_center = (double*)malloc(sizeof(double) * (cluster_count));if (!distance_from_center){printf("malloc error: distance_from_center! ");exit(1);}cluster_center = (ClusterCenter*)malloc(sizeof(struct ClusterCenter) * (cluster_count));if (!cluster_center){printf("malloc cluster center new error! ");exit(1);}
center_calc = (CenterCalc*)malloc(sizeof(CenterCalc) * cluster_count);if (!center_calc){printf("malloc error: center_calc! ");exit(1);}
data_size_per_cluster = (int*)malloc(sizeof(int) * (cluster_count));if (!data_size_per_cluster){printf("malloc error: data_size_per_cluster ");exit(1);}
}
void memoryFree(){free(data);data = NULL;free(cluster_center_init_index);cluster_center_init_index = NULL;free(distance_from_center);distance_from_center = NULL;free(cluster_center);cluster_center = NULL;free(center_calc);center_calc = NULL;free(data_size_per_cluster);data_size_per_cluster = NULL;}
//从文件中读入每个点的x和y值void readDataFromFile(){int i;FILE* fread;
if (NULL == (fread = fopen(filename, "r"))){printf("open file(%s) error! ", filename);exit(1);}
for (i = 0; i < data_size_total; i++){if (2 != fscanf(fread, "%lf %lf ", &data[i].x_value, &data[i].y_value)){printf("fscanf error: %d ", i);}data[i].cluster_id = -1;    //初始时每个点所属的cluster id均置为-1
printf("After reading, point index:%d, X:%d, Y:%d, cluster_id:%d ", i, data[i].x_value, data[i].y_value, data[i].cluster_id);}}

//根据传入的cluster_count来随机的选择一个点作为 一个cluster的center  void initialCluster(){int i, j;int random;
//产生初始化的cluster_count个聚类  for (i = 0; i < cluster_count; i++){cluster_center_init_index[i] = -1;}//随机选择一个点作为每个cluster的center(不重复)for (i = 0; i < cluster_count; i++){Reselect:random = rand() % (data_size_total - 1);for (j = 0; j < i; j++) {if (random == cluster_center_init_index[j])goto Reselect;}
cluster_center_init_index[i] = random;printf("cluster_id: %d, located in point index:%d ", i, random);}//将随机选择的点作为center,同时这个点的cluster id也就确定了for (i = 0; i < cluster_count; i++){cluster_center[i].x_value = data[cluster_center_init_index[i]].x_value;cluster_center[i].y_value = data[cluster_center_init_index[i]].y_value;cluster_center[i].cluster_id = i;data[cluster_center_init_index[i]].cluster_id = i;
printf("cluster_id:%d, index:%d, x_value:%f, y_value:%f ", cluster_center[i].cluster_id, cluster_center_init_index[i], cluster_center[i].x_value, cluster_center[i].y_value);}}

//计算一个点到一个cluster center的distancevoid calcDistance2OneCenter(int point_id, int center_id){distance_from_center[center_id] = sqrt((data[point_id].x_value - cluster_center[center_id].x_value)*(double)(data[point_id].x_value - cluster_center[center_id].x_value) + (double)(data[point_id].y_value - cluster_center[center_id].y_value) *              (data[point_id].y_value - cluster_center[center_id].y_value));}
//计算一个点到每个cluster center的distancevoid calcDistance2AllCenters(int point_id){int i;for (i = 0; i < cluster_count; i++){calcDistance2OneCenter(point_id, i);}}
//确定一个点属于哪一个cluster center(取距离最小的)void partition4OnePoint(int point_id){int i;int min_index = 0;double min_value = distance_from_center[0];for (i = 0; i < cluster_count; i++){if (distance_from_center[i] < min_value){min_value = distance_from_center[i];min_index = i;}}
data[point_id].cluster_id = cluster_center[min_index].cluster_id;}
//在一轮的聚类中得到所有的point所属于的cluster centervoid partition4AllPointOneCluster(){int i;for (i = 0; i < data_size_total; i++){if (data[i].cluster_id != -1)  //这个点就是center,不需要计算continue;else{calcDistance2AllCenters(i);  //计算第i个点到所有center的distancepartition4OnePoint(i);          //根据distance对第i个点进行partition}}}
//重新计算新的cluster centervoid calcClusterCenter(){int i;
memset(center_calc, 0, sizeof(CenterCalc) * cluster_count);memset(data_size_per_cluster, 0, sizeof(int) * cluster_count);//分别对每个cluster内的每个点的X和Y求和,并计每个cluster内点的个数for (i = 0; i < data_size_total; i++){center_calc[data[i].cluster_id].x_value += data[i].x_value;center_calc[data[i].cluster_id].y_value += data[i].y_value;data_size_per_cluster[data[i].cluster_id]++;}//计算每个cluster内点的X和Y的均值作为centerfor (i = 0; i < cluster_count; i++){if (data_size_per_cluster[i] != 0) {center_calc[i].x_value = center_calc[i].x_value / (double)(data_size_per_cluster[i]);center_calc[i].y_value = center_calc[i].y_value / (double)(data_size_per_cluster[i]);
printf(" cluster %d point cnt:%d ", i, data_size_per_cluster[i]);printf(" cluster %d center: X:%f, Y:%f ", i, center_calc[i].x_value, center_calc[i].y_value);}elseprintf(" cluster %d count is zero ", i);}
//比较新的和旧的cluster center值的差别。如果是相等的,则停止K-means算法。compareNewOldClusterCenter(center_calc);
//将新的cluster center的值放入cluster_center结构体中for (i = 0; i < cluster_count; i++){cluster_center[i].x_value = center_calc[i].x_value;cluster_center[i].y_value = center_calc[i].y_value;cluster_center[i].cluster_id = i;}
//在重新计算了新的cluster center之后,要重新来为每一个Point进行聚类,所以data中用于表示聚类ID的cluster_id要都重新置为-1。for (i = 0; i < data_size_total; i++){data[i].cluster_id = -1;}}
//比较新旧的cluster center的值,完全一样表示聚类完成void compareNewOldClusterCenter(CenterCalc* center_calc){int i;is_continue = 0;       //等于0表示不要继续,1表示要继续for (i = 0; i < cluster_count; i++){if (center_calc[i].x_value != cluster_center[i].x_value || center_calc[i].y_value != cluster_center[i].y_value){is_continue = 1;break;}}}
//K-means算法void kmeans(){int rounds;for (rounds = 0; rounds < MAX_ROUNDS; rounds++){printf(" Rounds : %d              ", rounds + 1);partition4AllPointOneCluster();calcClusterCenter();if (0 == is_continue){printf(" after %d rounds, the classification is ok and can stop. ", rounds + 1);break;}}}

原文地址:https://www.cnblogs.com/xuhongfei0021/p/13812668.html