数据挖掘学习相关

一直以来对机器学习、数据挖掘这块挺感兴趣,最近有时间就买了本《Python数据挖掘入门与实践》。这篇文章主要贴一些书里面的代码,以及我自己的一些理解与注释,供以后复习查看。

商品推荐:

#coding=utf-8
#如果开头不声明源文件的编码格式,Python 2 会默认按 ASCII 编码解析文件,这时如果你的代码中有中文就会出错,即使中文只出现在注释里面。所以在文件开头加上编码声明很重要。

import numpy as np
from collections import defaultdict

def calS(X, n_feature):
    """Compute support and confidence for every pairwise rule "premise -> conclusion".

    A rule (premise, conclusion) is counted as fulfilled by a sample when the
    sample has a non-zero value in column ``premise`` and the value 1 in
    column ``conclusion``.

    Args:
        X: Iterable of samples; each sample is indexable by column number
            for indices ``0 .. n_feature - 1``.
        n_feature: Number of feature columns to consider, both as premise
            and as conclusion.

    Returns:
        A ``(support, confidence)`` tuple of defaultdicts keyed by
        ``(premise, conclusion)``:
        support maps a rule to the number of samples that fulfil it;
        confidence maps a rule to support divided by the number of samples
        in which the premise is present.  Only rules fulfilled at least once
        are stored; looking up any other rule yields the defaultdict
        default (0 / 0.0).
    """
    valid_rules = defaultdict(int)     # times each rule was fulfilled
    num_occurances = defaultdict(int)  # times each premise was present

    for sample in X:
        # Use the caller-supplied column count rather than a fixed 5 so the
        # function works for datasets of any width.
        for premise in range(n_feature):
            if sample[premise] == 0:  # premise absent: nothing to count
                continue
            num_occurances[premise] += 1
            for conclusion in range(n_feature):
                if premise == conclusion:  # a rule "x -> x" is meaningless
                    continue
                if sample[conclusion] == 1:
                    valid_rules[(premise, conclusion)] += 1

    # Support is simply the number of times the rule was fulfilled.
    support = valid_rules

    confidence = defaultdict(float)
    for rule, hits in valid_rules.items():
        # float() keeps true division under Python 2 as well.  The premise
        # count is never zero here: a fulfilled rule implies its premise
        # was seen at least once.
        confidence[rule] = float(hits) / num_occurances[rule[0]]

    return support, confidence


def print_rule(premise, conclusion, support, confidence, features):
    """Print a human-readable summary of one rule.

    Args:
        premise: Index into ``features`` of the rule's premise.
        conclusion: Index into ``features`` of the rule's conclusion.
        support: Mapping from ``(premise, conclusion)`` to the support value.
        confidence: Mapping from ``(premise, conclusion)`` to the confidence
            value (formatted with six decimal places).
        features: Sequence of feature names, indexed by feature number.
    """
    antecedent = features[premise]
    consequent = features[conclusion]
    headline = "Rule: If a person buys {0} they will also buy {1} "
    print(headline.format(antecedent, consequent))

    # Both mappings are keyed by the same (premise, conclusion) tuple.
    key = (premise, conclusion)
    print(" - Support: {0} ".format(support[key]))
    print("-Confidence:{0:.6f}".format(confidence[key]))


if __name__ == '__main__':
    from operator import itemgetter

    # Load the transaction matrix: one row per sample, one column per feature.
    # ndmin=2 keeps the array two-dimensional even when the file holds a
    # single row, so the shape unpacking below cannot fail on that input.
    dataset_filename = "affinity_dataset.txt"
    X = np.loadtxt(dataset_filename, ndmin=2)
    n_sample, n_features = X.shape
    print(n_features)
    print(X[:5])  # peek at the first five samples

    # Compute the statistics for every rule, then show one hand-picked rule.
    premise = 1
    conclusion = 3
    support, confidence = calS(X, n_features)
    features = ['bread', 'cow', 'cheese', 'apple', 'banana']
    print(support)
    print(confidence)
    print_rule(premise, conclusion, support, confidence, features)

    # Number of best rules to list in each ranking.  Slicing (instead of
    # indexing 0..4) avoids an IndexError when fewer rules were found.
    top_n = 5

    # Rules with the highest support.
    sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)
    for index, (rule, _) in enumerate(sorted_support[:top_n]):
        print("Rule #{0}".format(index + 1))
        premise, conclusion = rule
        print_rule(premise, conclusion, support, confidence, features)

    # Rules with the highest confidence.
    sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)
    for index, (rule, _) in enumerate(sorted_confidence[:top_n]):
        print("Rule #{0}".format(index + 1))
        premise, conclusion = rule
        print_rule(premise, conclusion, support, confidence, features)
原文地址:https://www.cnblogs.com/jhmu0613/p/6873481.html