原创:数据挖掘十大算法之4——Apriori算法+编码

  1. 基本Apriori算法:

主要思想是:连接步+Apriori性质

  • 统计单项在的支持度,再利用连接步生成2项;
  • 对2项采取Apriori性质剪枝,得到可能的2项;
  • 再将2项的再原始中统计得出其支持度,并减去达不到支持度的项。
  • 按照上面步骤重复,直到不能产生新的更多项。

因为从网上找的程序都不太好调试,特贴出自己编写的程序,当然这个针对的是固定的一个对象,通过修改能达到同样的目的,欢迎指出错误!

主程序:

#include <iostream>
#include <string>
#include <math.h>
#include "Apri.h"
int main(){
    Apri aa(2);
    First_Iteam First_set[9];
    First_set[0].name = "T100";
    First_set[0].Iteams[0] = "I1";
    First_set[0].Iteams[1] = "I2";
    First_set[0].Iteams[2] = "I5";
    
    First_set[1].name = "T200";
    First_set[1].Iteams[0] = "I2";
    First_set[1].Iteams[1] = "I4";
    
    First_set[2].name = "T300";
    First_set[2].Iteams[0] = "I2";
    First_set[2].Iteams[1] = "I3";
    
    First_set[3].name = "T400";
    First_set[3].Iteams[0] = "I1";
    First_set[3].Iteams[1] = "I2";
    First_set[3].Iteams[2] = "I4";
    
    First_set[4].name = "T500";
    First_set[4].Iteams[0] = "I1";
    First_set[4].Iteams[1] = "I3";
    
    First_set[5].name = "T600";
    First_set[5].Iteams[0] = "I2";
    First_set[5].Iteams[1] = "I3";
    
    First_set[6].name = "T700";
    First_set[6].Iteams[0] = "I1";
    First_set[6].Iteams[1] = "I3";
    
    First_set[7].name = "T800";
    First_set[7].Iteams[0] = "I1";
    First_set[7].Iteams[1] = "I2";
    First_set[7].Iteams[2] = "I3";
    First_set[7].Iteams[3] = "I5";
    
    First_set[8].name = "T900";
    First_set[8].Iteams[0] = "I1";
    First_set[8].Iteams[1] = "I2";
    First_set[8].Iteams[2] = "I3";
    
    aa.setFirstSet(First_set);
    aa.Find_frequent_1_itemsets();
    
    
    
    aa.print();
    aa.apriori_gen();
    
    cin.get();
    return 0;   
}

 Apriori类

.h文件:

#include <iostream>
#include <string>
using namespace std;

struct First_Iteam{
       string name;
       string Iteams[5];
       };
       
struct Ci{
       string Items[5];
       int confident;
       };
        
class Apri{
private:
        First_Iteam First_set[9];//存放事物,相当于从数据库中读取的信息,固定为9 
        Ci Ck_Items[100];//计算的K-1的统计信息 
        Ci Cl_Items[100];//新计算的第k个统计信息 
        int Items_k;//k-1的统计信息的数目 
        int Items_l;//k个统计信息的数目 
        int Gen;//现在的代数,开始值为1 
        int min_sup;//最小支持度 
        
public:
       Apri(int);
       void print();
       void setFirstSet(First_Iteam*);
       void Find_frequent_1_itemsets();//查找第一代的数据及其信息
       void Find_ck(string item);//查找是否在K-1中存在此对象
       void apriori_gen();//连接步,将K-1中的数据连接成K的数据 
       bool Apriori_check(Ci);//在K-1中检查一个k-1的字符串是否达到了min_sup,或者是否存在这样的K-1 
       bool Ci_Equal(Ci,Ci);//判断两个串是否相等 
       int ContainCi(Ci);//判断在原集合中对Ci的支持度 
        
};

 .cpp文件:

#include "Apri.h"
Apri::Apri(int a){
    min_sup = a; 
}
void Apri::print(){
     cout << "原始项目集:" << endl; 
     for(int i = 0; i < 9; i++)
     {
         cout << First_set[i].name << ": ";
         for( int j = 0; j < 5; j++)
             if(First_set[i].Iteams[j] != "")
                 cout << First_set[i].Iteams[j] << " ";
         cout << endl;
     }
}
     
     
void Apri::setFirstSet(First_Iteam a[]){
     for(int i = 0; i < 9; i++)
             First_set[i] = a[i];
}
void Apri::Find_frequent_1_itemsets(){
     Gen = 1;
     Items_k = 0;
     for(int i = 0; i < 9; i++)
     {
         int j = 0;
         while(First_set[i].Iteams[j] != ""){
             Find_ck(First_set[i].Iteams[j]);
             j++;
         }
     }
}
//查找是否在第一次中存在这样的项,并统计信息,
//在统计信息时要做到判断准确,不能失误。 
void Apri::Find_ck(string item){
     int i;
     bool isbreak = false;
     for(i = 0; i < Items_k; i++)
         if(item == Ck_Items[i].Items[0])
         {
             isbreak =!isbreak;
             break;
         }
     if(isbreak)
         Ck_Items[i].confident++;
     else
     {
         Ck_Items[i].Items[0] = item;
         Ck_Items[i].confident = 1;
         Items_k++;
     }
}
void Apri::apriori_gen(){
    while(Items_k != 0)
    {
         cout << "-----------------" << "L" << Gen << "------------------" << endl;
         for(int i = 0; i < Items_k; i++)
         {
             for(int j = 0; j < 5; j++)
                 if(Ck_Items[i].Items[j] != "")
                     cout << Ck_Items[i].Items[j] <<" ";
             cout << Ck_Items[i].confident<< endl; 
         }           
               
    
         Items_l = 0;
         for(int i = 0; i < Items_k-1; i++)
         {
             for(int j = i+1; j < Items_k; j++)
             {
                 bool isLink = true;
                 //判断是否能够连接,前面k-2个都相同,但是K-1是不同的 
                 for(int m = 0; m < Gen-1; m++)
                 {
                     if(Ck_Items[i].Items[m] != Ck_Items[j].Items[m])
                     {
                         isLink = false;   
                         break;
                     }                  
                 }
                 if(isLink)
                 {
                     if(Ck_Items[i].Items[Gen-1] != Ck_Items[j].Items[Gen-1])
                     //连接条件成立,进行连接; 
                     {
                         Ci newItem;//生成新连接后得到的串 
                         for(int n = 0; n < Gen; n++)
                             newItem.Items[n] = Ck_Items[i].Items[n];
                         newItem.Items[Gen] = Ck_Items[j].Items[Gen-1]; 
                         //对新生成的串进行数据Apriori剪枝,判断是否合法
                         Ci AllItem;//生成k的所有的k-1子串,并进行检查 
                         bool ThroughApriori = true;//是否通过Apriori检查 
                         for(int g = 0; g <= Gen; g++)
                         {
                              int start = 0;
                              for(int h=0; h <= Gen; h++)
                              {
                                   if(h != g)
                                   {
                                        AllItem.Items[start] = newItem.Items[h]; 
                                        start++;
                                   }      
                              }        
                              //对K-1的子串检查是否达到min_sup
                              if(!Apriori_check(AllItem))
                              {
                                  ThroughApriori = false;
                                  break;                           
                              }
                         } 
                         if(ThroughApriori)//通过了Apriori性质检查,将此条加入到K中 
                         {
                              Cl_Items[Items_l] = newItem;
                              Cl_Items[Items_l].confident = 0;
                              Items_l++;                     
                         } 
                     }
                 }
             } 
         }
         Gen++;
         //接下来处理L的数据的支持度,数据存放在Cl_Items中 
         for(int i = 0; i < Items_l; i++)
         {
             Cl_Items[i].confident =  ContainCi(Cl_Items[i]);
         }
         
         //剪去小于最小支持度的并将CL_Items中的数据转移到Ck_Items中 
         for(int m = 0; m < Items_k; m++)
         {
              for(int j = 0; j < 5; j++)
                  Ck_Items[m].Items[j] = "";
              Ck_Items[m].confident = 0;             
         }
         int new_Items_l = 0;
         for(int i = 0; i < Items_l; i++)
         {
              if(Cl_Items[i].confident >= min_sup)
              { 
                  for(int j = 0; j < 5; j++)
                      Ck_Items[new_Items_l].Items[j] = Cl_Items[i].Items[j];
                  Ck_Items[new_Items_l].confident = Cl_Items[i].confident;
                  new_Items_l++;
              } 
         }
         Items_k = new_Items_l;
         Items_l = 0;
    }  
} 
//当检查时不存在这样的K-1或者存在相等的K-1但是支持度小于min_sup则返回falsse 
bool Apri::Apriori_check(Ci OldCi){
     bool pass = false;
     for(int i = 0; i < Items_k; i++)
     {
         /*if(Ci_Equal(OldCi,Ck_Items[i] && Ck_Items[i].confident < min_sup)
         {
             pass = false;
             break;
         } 
         */
         if(Ci_Equal(OldCi,Ck_Items[i])) 
         {
             pass = true;
             break;
         } 
     }
     return pass;
} 
//判断该两个Ci是否相等 
bool Apri::Ci_Equal(Ci a,Ci b){
    bool isequal  = false;
    int num_a = 0,num_b = 0; 
    while(a.Items[num_a] != "")
        num_a++;
    while(b.Items[num_b] != "")
        num_b++;
    if(num_a != num_b)
        return false;
        
    for(int i = 0;i < Gen; i++)
    {
        isequal = false;
        for(int j = 0; j < Gen; j++)
            if(a.Items[i] == b.Items[j])
            {
                isequal = true;
                break;
            }
        if(!isequal)
            break;                
    }
    return isequal;
}
int Apri::ContainCi(Ci NewCi){
    int num_new = 0, num_item, num_contain = 0; 
    while(NewCi.Items[num_new] != "")
        num_new++;
    bool isequal  = false;
    for(int i = 0; i < 9; i++)
    {
         num_item = 0;          
         while(First_set[i].Iteams[num_item] != "")
             num_item++;
         if(num_new > num_item)
             continue;
         else
         {
             for(int j = 0; j < num_new; j++)
             {
                 isequal = false;
                 for(int m = 0; m < num_item; m++)
                 {
                      if(NewCi.Items[j] == First_set[i].Iteams[m])
                      {
                          isequal = true;
                          break;                  
                      }
                 }
                 if(!isequal)
                     break;     
             }
             if(isequal)
                 num_contain++;
         } 
    }
    return num_contain;
}

宿舍

原文地址:https://www.cnblogs.com/wyqx/p/2258579.html