gotea 2012-12-27
K中心点算法(K-medoids)
前面介绍了k-means算法,并列举了该算法的缺点。而K中心点算法(K-medoids)正好能解决k-means算法中的“噪声”敏感这个问题。
如何解决的呢?
首先,我们得介绍下k-means算法为什么会对“噪声”敏感。还记得K-means寻找质点的过程吗?对某类簇中所有的样本点维度求平均值,即获得该类簇质点的维度。当聚类的样本点中有“噪声”(离群点)时,在计算类簇质点的过程中会受到噪声异常维度的干扰,造成所得质点和实际质点位置偏差过大,从而使类簇发生“畸变”。
Eg: 类簇C1中已经包含点A(1,1)、B(2,2)、C(1,2)、 D(2,1),假设N(100,100)为异常点,当它纳入类簇C1时,计算质点Centroid((1+2+1+2+100)/5,(1+2+2+1+100)/5)=centroid(21,21),此时可能造成了类簇C1质点的偏移,在下一轮迭代重新划分样本点的时候,将大量不属于类簇C1的样本点纳入,因此得到不准确的聚类结果。
为了解决该问题,K中心点算法(K-medoids)提出了新的质点选取方式,而不是简单像k-means算法采用均值计算法。在K中心点算法中,每次迭代后的质点都是从聚类的样本点中选取,而选取的标准就是当该样本点成为新的质点后能提高类簇的聚类质量,使得类簇更紧凑。该算法使用绝对误差标准来定义一个类簇的紧凑程度。
如果某样本点成为质点后,绝对误差能小于原质点所造成的绝对误差,那么K中心点算法认为该样本点是可以取代原质点的,在一次迭代重计算类簇质点的时候,我们选择绝对误差最小的那个样本点成为新的质点。
Eg:样本点A–>E1=10
样本点B –>E2=11
样本点C –>E3=12
原质点O–>E4=13,那我们选举A作为类簇的新质点。
与K-means算法一样,K-medoids也是采用欧几里得距离来衡量某个样本点到底是属于哪个类簇。终止条件是,当所有的类簇的质点都不在发生变化时,即认为聚类结束。
该算法除了改善K-means的“噪声”敏感以后,其他缺点和K-means一致,并且由于采用新的质点计算规则,也使得算法的时间复杂度上升:O(k(n-k)2)
Java实现代码如下:
package com.kmedoids;
importjava.util.ArrayList;
publicclassCluster{
privateStringclusterName;//类簇名
privateMedoidmedoid;//类簇的质点
privateArrayList<DataPoint>dataPoints;//类簇中各样本点
publicCluster(StringclusterName){
this.clusterName=clusterName;
this.medoid=null;//willbesetbycallingsetCentroid()
dataPoints=newArrayList<DataPoint>();
}
publicvoidsetMedoid(Medoidc){
medoid=c;
}
publicMedoidgetMedoid(){
returnmedoid;
}
publicvoidaddDataPoint(DataPointdp){//calledfromCAInstance
dp.setCluster(this);//标注该类簇属于某点,计算欧式距离
this.dataPoints.add(dp);
}
publicvoidremoveDataPoint(DataPointdp){
this.dataPoints.remove(dp);
}
publicintgetNumDataPoints(){
returnthis.dataPoints.size();
}
publicDataPointgetDataPoint(intpos){
return(DataPoint)this.dataPoints.get(pos);
}
publicStringgetName(){
returnthis.clusterName;
}
publicArrayList<DataPoint>getDataPoints(){
returnthis.dataPoints;
}
}------------------------------------
package com.kmedoids;
importjava.util.ArrayList;
publicclassDataPoint{
privatedoubledimension[];//样本点的维度
privateStringpointName;//样本点名字
privateClustercluster;//类簇
privatedoubleeuDt;//样本点到质点的距离
publicDataPoint(doubledimension[],StringpointName){
this.dimension=dimension;
this.pointName=pointName;
this.cluster=null;
}
publicvoidsetCluster(Clustercluster){
this.cluster=cluster;
}
publicdoublecalEuclideanDistanceSum(){
doublesum=0.0;
Clustercluster=this.getCluster();
ArrayList<DataPoint>dataPoints=cluster.getDataPoints();
for(inti=0;i<dataPoints.size();i++){
double[]dims=dataPoints.get(i).getDimensioin();
for(intj=0;j<dims.length;j++){
doubletemp=Math.pow((dims[j]-this.dimension[j]),2);
sum=sum+temp;
}
}
returnMath.sqrt(sum);
}
publicdoubletestEuclideanDistance(Medoidc){
doublesum=0.0;
double[]cDim=c.getDimensioin();
for(inti=0;i<dimension.length;i++){
doubletemp=Math.pow((dimension[i]-cDim[i]),2);
sum=sum+temp;
}
returnMath.sqrt(sum);
}
publicdouble[]getDimensioin(){
returnthis.dimension;
}
publicClustergetCluster(){
returnthis.cluster;
}
publicdoublegetCurrentEuDt(){
returnthis.euDt;
}
publicStringgetPointName(){
returnthis.pointName;
}
}
-------------------------------package com.kmedoids;
importjava.util.ArrayList;
publicclassMedoid{
privatedoubledimension[];//质点的维度
privateClustercluster;//所属类簇
privatedoubleetdDisSum;//Medoid到本类簇中所有的欧式距离之和
publicMedoid(doubledimension[]){
this.dimension=dimension;
}
publicvoidsetCluster(Clusterc){
this.cluster=c;
}
publicdouble[]getDimensioin(){
returnthis.dimension;
}
publicClustergetCluster(){
returnthis.cluster;
}
publicvoidcalcMedoid(){//取代价最小的点
calcEtdDisSum();
doubleminEucDisSum=this.etdDisSum;
ArrayList<DataPoint>dps=this.cluster.getDataPoints();
for(inti=0;i<dps.size();i++){
doubletempeucDisSum=dps.get(i).calEuclideanDistanceSum();
if(tempeucDisSum<minEucDisSum){
dimension=dps.get(i).getDimensioin();
minEucDisSum=tempeucDisSum;
}
}
}
//计算该Medoid到同类簇所有样本点的欧斯距离和
privatevoidcalcEtdDisSum(){
doublesum=0.0;
Clustercluster=this.getCluster();
ArrayList<DataPoint>dataPoints=cluster.getDataPoints();
for(inti=0;i<dataPoints.size();i++){
double[]dims=dataPoints.get(i).getDimensioin();
for(intj=0;j<dims.length;j++){
doubletemp=Math.abs(dims[j]-this.dimension[j]);
sum=sum+temp;
}
}
etdDisSum=sum;
}
}--------------------------
package com.kmedoids;
importjava.util.ArrayList;
publicclassClusterAnalysis{
privateCluster[]clusters;//所有类簇
privateintmiter;//迭代次数
privateArrayList<DataPoint>dataPoints=newArrayList<DataPoint>();//所有样本点
privateintdimNum;//维度
publicClusterAnalysis(intk,intiter,ArrayList<DataPoint>dataPoints,intdimNum){
clusters=newCluster[k];//类簇种类数
for(inti=0;i<k;i++){
clusters[i]=newCluster("Cluster:"+i);
}
this.miter=iter;
this.dataPoints=dataPoints;
this.dimNum=dimNum;
}
publicintgetIterations(){
returnmiter;
}
publicArrayList<DataPoint>[]getClusterOutput(){
ArrayList<DataPoint>v[]=newArrayList[clusters.length];
for(inti=0;i<clusters.length;i++){
v[i]=clusters[i].getDataPoints();
}
returnv;
}
publicvoidstartAnalysis(double[][]medoids){
setInitialMedoids(medoids);
double[][]newMedoids=medoids;
double[][]oldMedoids=newdouble[medoids.length][this.dimNum];
while(!isEqual(oldMedoids,newMedoids)){
for(intm=0;m<clusters.length;m++){//每次迭代开始情况各类簇的点
clusters[m].getDataPoints().clear();
}
for(intj=0;j<dataPoints.size();j++){
intclusterIndex=0;
doubleminDistance=Double.MAX_VALUE;
for(intk=0;k<clusters.length;k++){//判断样本点属于哪个类簇
doubleeucDistance=dataPoints.get(j).testEuclideanDistance(clusters[k].getMedoid());
if(eucDistance<minDistance){
minDistance=eucDistance;
clusterIndex=k;
}
}
//将该样本点添加到该类簇
clusters[clusterIndex].addDataPoint(dataPoints.get(j));
}
for(intm=0;m<clusters.length;m++){
clusters[m].getMedoid().calcMedoid();//重新计算各类簇的质点
}
for(inti=0;i<medoids.length;i++){
for(intj=0;j<this.dimNum;j++){
oldMedoids[i][j]=newMedoids[i][j];
}
}
for(intn=0;n<clusters.length;n++){
newMedoids[n]=clusters[n].getMedoid().getDimensioin();
}
this.miter++;
}
}
privatevoidsetInitialMedoids(double[][]medoids){
for(intn=0;n<clusters.length;n++){
Medoidmedoid=newMedoid(medoids[n]);
clusters[n].setMedoid(medoid);
medoid.setCluster(clusters[n]);
}
}
privatebooleanisEqual(double[][]oldMedoids,double[][]newMedoids){
booleanflag=false;
for(inti=0;i<oldMedoids.length;i++){
for(intj=0;j<oldMedoids[i].length;j++){
if(oldMedoids[i][j]!=newMedoids[i][j]){
returnflag;
}
}
}
flag=true;
returnflag;
}
}
--------------------------------------------package com.kmedoids;
importjava.util.ArrayList;
importjava.util.Iterator;
publicclassTestMain{
publicstaticvoidmain(Stringargs[]){
ArrayList<DataPoint>dataPoints=newArrayList<DataPoint>();
double[]a={2,3};
double[]b={2,4};
double[]c={1,4};
double[]d={1,3};
double[]e={2,2};
double[]f={3,2};
double[]g={8,7};
double[]h={8,6};
double[]i={7,7};
double[]j={7,6};
double[]k={8,5};
double[]l={100,2};//孤立点
double[]m={8,20};
double[]n={8,19};
double[]o={7,18};
double[]p={7,17};
double[]q={7,20};
dataPoints.add(newDataPoint(a,"a"));
dataPoints.add(newDataPoint(b,"b"));
dataPoints.add(newDataPoint(c,"c"));
dataPoints.add(newDataPoint(d,"d"));
dataPoints.add(newDataPoint(e,"e"));
dataPoints.add(newDataPoint(f,"f"));
dataPoints.add(newDataPoint(g,"g"));
dataPoints.add(newDataPoint(h,"h"));
dataPoints.add(newDataPoint(i,"i"));
dataPoints.add(newDataPoint(j,"j"));
dataPoints.add(newDataPoint(k,"k"));
dataPoints.add(newDataPoint(l,"l"));
dataPoints.add(newDataPoint(m,"m"));
dataPoints.add(newDataPoint(n,"n"));
dataPoints.add(newDataPoint(o,"o"));
dataPoints.add(newDataPoint(p,"p"));
dataPoints.add(new DataPoint(q,"q"));ClusterAnalysis ca=new ClusterAnalysis(3,0,dataPoints,2);
double[][]cen={{8,7},{8,6},{7,7}};
ca.startAnalysis(cen);ArrayList<DataPoint>[] v =ca.getClusterOutput();
for(intii=0;ii<v.length;ii++){
ArrayListtempV=v[ii];
System.out.println("-----------Cluster"+ii+"---------");
Iteratoriter=tempV.iterator();
while(iter.hasNext()){
DataPointdpTemp=(DataPoint)iter.next();
System.out.println(dpTemp.getPointName());
}
}
}
}