代码1:
# coding=gbk
"""Compare two definitions of p(f, i) for recommending books by user age.

Data set:
  BX-Users.csv         user ID, location and age.
  BX-Books.csv         ISBN, title, author, year of publication, publisher
                       and thumbnails.
  BX-Book-Ratings.csv  the ratings users gave to books.

Both definitions are used to recommend books to the [age < 25] and the
[age > 50] user groups.  The task statement asks for the top 10 books; the
script keeps the list length it has always printed (TOP_N = 5).
"""
from operator import itemgetter

import pandas as pd

TOP_N = 5   # number of titles recommended per group
ALPHA = 10  # smoothing constant added to the denominator in method 2


def count_ratings(rates):
    """Return ``{ISBN: number of rating rows}`` for a ratings frame.

    The frame must have an ``ISBN`` column.  Keys come out in ascending ISBN
    order, which makes tie-breaking in ``top_titles`` deterministic.
    """
    # Group by the scalar column name: grouping by a one-element list makes
    # recent pandas versions hand back 1-tuples as group keys.
    sizes = rates.groupby('ISBN').size()
    return {isbn: int(n) for isbn, n in zip(sizes.index, sizes.values)}


def smoothed_share(group_counts, total_counts, alpha=ALPHA):
    """Return ``{ISBN: group_count / (total_count + alpha)}``.

    ``alpha`` damps books with very few readers: without it a book with a
    single reader who belongs to the group would score 1.
    """
    # float() forces true division; with plain ints Python 2 would floor
    # every score to 0.
    return {
        isbn: float(n) / (total_counts[isbn] + alpha)
        for isbn, n in group_counts.items()
    }


def top_titles(scores, titles, top_n=TOP_N):
    """Return the titles of the ``top_n`` highest scoring ISBNs, best first."""
    ranked = sorted(scores.items(), key=itemgetter(1), reverse=True)
    return [titles[isbn] for isbn, _ in ranked[:top_n]]


def recommend_for_age_groups(users, books, rates, top_n=TOP_N, alpha=ALPHA):
    """Recommend books to users under 25 and users over 50 in two ways.

    Method 1 ranks books by how many ratings they got inside the age group,
    so books that are popular with every age show up for both groups.
    Method 2 ranks by the group's share of all ratings of the book (smoothed
    with ``alpha``), which favours books specific to the group.

    Args:
        users: frame whose first column is the user ID and which has an
            ``Age`` column.
        books: frame whose first two columns are ISBN and title.
        rates: frame with ``User-ID`` and ``ISBN`` columns, one row per rating.
        top_n: number of titles per recommendation list.
        alpha: smoothing constant for method 2.

    Returns:
        A 4-tuple of title lists:
        ``(popular_under_25, popular_over_50, share_under_25, share_over_50)``.
    """
    user_ids = users.iloc[:, 0]
    ages = users['Age']
    under_25 = set(user_ids[ages < 25])
    over_50 = set(user_ids[ages > 50])  # was "> 25", which mixed in ages 26-50

    # ISBN -> title lookup; ratings of books missing from it are dropped.
    titles = dict(zip(books.iloc[:, 0], books.iloc[:, 1]))
    rates = rates[rates['ISBN'].isin(list(titles))]

    counts_under_25 = count_ratings(rates[rates['User-ID'].isin(list(under_25))])
    counts_over_50 = count_ratings(rates[rates['User-ID'].isin(list(over_50))])
    counts_all = count_ratings(rates)

    popular_under_25 = top_titles(counts_under_25, titles, top_n)
    popular_over_50 = top_titles(counts_over_50, titles, top_n)
    share_under_25 = top_titles(
        smoothed_share(counts_under_25, counts_all, alpha), titles, top_n)
    share_over_50 = top_titles(
        smoothed_share(counts_over_50, counts_all, alpha), titles, top_n)
    return popular_under_25, popular_over_50, share_under_25, share_over_50


def main_age_groups():
    """Load the CSV files, compute both recommendation variants, print them."""
    users = pd.read_csv('../data/BX-Users.csv', sep=';', dtype={'Age': float})
    # The escape character is a single backslash; the original literal '\'
    # was unterminated.
    books = pd.read_csv('../data/BX-Books.csv', sep=';', escapechar='\\')
    rates = pd.read_csv('../data/BX-Book-Ratings.csv', sep=';', nrows=80000,
                        dtype={'Book-Rating': float})

    (popular_under_25, popular_over_50,
     share_under_25, share_over_50) = recommend_for_age_groups(users, books, rates)

    # Method 1.  Output recorded by the original author, produced while the
    # older group was still selected with "Age > 25" (so the second list is
    # expected to change); three titles were shared because they are popular
    # with every age:
    # ['Wild Animus', 'The Lovely Bones: A Novel', 'The Da Vinci Code',
    #  "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))",
    #  "She's Come Undone (Oprah's Book Club)"]
    # ['Wild Animus', 'The Lovely Bones: A Novel', 'The Da Vinci Code',
    #  'Life of Pi', 'Divine Secrets of the Ya-Ya Sisterhood: A Novel']
    print(popular_under_25)
    print(popular_over_50)

    # Method 2: books whose readers are mostly in the group.
    print(share_under_25)
    print(share_over_50)


if __name__ == '__main__':
    main_age_groups()
代码2:
"""Recommend books per (country, 5-year age class) and measure precision.

Ratings are split into a training and a test part.  Every user class gets
the 20 best scoring books of the training part as its recommendation list;
the printed figure is the fraction of recommended books that the test users
actually rated.
"""
from collections import defaultdict
from operator import itemgetter

import pandas as pd

RATING_COLUMNS = ['User-ID', 'ISBN', 'Book-Rating']


def dealLocation(x):
    """Return the country part of ``x['Location']``.

    A location is expected to look like ``"city, state, country"``.  The
    last comma-separated field is returned with surrounding blanks removed.
    Locations with fewer than three fields, and missing (non-string)
    locations, are mapped to the placeholder string ``'False'``.
    """
    try:
        parts = x['Location'].split(',')
    except AttributeError:
        # Missing values are read as float NaN, which has no split().
        return 'False'
    if len(parts) < 3:
        return 'False'
    return parts[-1].strip()


def age_class(age):
    """Map an age to its 5-year class index: 1-5 -> 0, 6-10 -> 1, ... 96-99 -> 19."""
    return int((age - 1) / 5)


def prepare_users(users):
    """Keep users with a known age in the open range (0, 100), reduce Location to country.

    Returns a new frame; the frame passed in is not modified.
    """
    plausible = (pd.notnull(users['Age'])
                 & (users['Age'] > 0)
                 & (users['Age'] < 100))
    # copy() so the column assignment below does not write into a view of
    # the caller's frame.
    kept = users[plausible].copy()
    kept['Location'] = [dealLocation({'Location': loc}) for loc in kept['Location']]
    return kept


def build_user_index(users):
    """Return ``{user_id: (country, age)}`` for a frame from ``prepare_users``."""
    return {
        user_id: (country, age)
        for user_id, country, age
        in zip(users['User-ID'], users['Location'], users['Age'])
    }


def split_ratings(rates, test_size=0.2):
    """Randomly split the ratings into ``(train, test)`` frames."""
    # Imported here so the rest of the module works without scikit-learn.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # scikit-learn releases that predate model_selection
        from sklearn.cross_validation import train_test_split
    train, test = train_test_split(rates, test_size=test_size)
    # Rebuild frames: the split may hand back plain arrays without column names.
    return (pd.DataFrame(train, columns=RATING_COLUMNS),
            pd.DataFrame(test, columns=RATING_COLUMNS))


def score_books_by_class(train, userdict, smoothing=5):
    """Score every book for every (country, age class) that rated it.

    Each distinct user who rated a book adds ``1 / (readers + smoothing)`` to
    the book's score in that user's class, where ``readers`` is the number of
    distinct users who rated the book in ``train``.

    Returns:
        ``{(country, age_class): {ISBN: score}}``.
    """
    scores = defaultdict(lambda: defaultdict(float))
    # Scalar group key: a one-element list would make recent pandas versions
    # yield 1-tuples instead of the ISBN itself.
    for isbn, group in train.groupby('ISBN'):
        readers = set(group['User-ID'])
        weight = 1.0 / (len(readers) + smoothing)
        for user_id in readers:
            country, age = userdict[user_id]
            scores[(country, age_class(age))][isbn] += weight
    return {key: dict(book_scores) for key, book_scores in scores.items()}


def top_books_by_class(scores, top_n=20):
    """Return ``{(country, age_class): [ISBN, ...]}`` with the best ``top_n`` books."""
    return {
        key: [isbn for isbn, _ in
              sorted(book_scores.items(), key=itemgetter(1), reverse=True)[:top_n]]
        for key, book_scores in scores.items()
    }


def evaluate_precision(test, userdict, recommendations):
    """Return hits / recommended over all users that appear in ``test``.

    For every test user the whole recommendation list of the user's class is
    counted as recommended; a hit is a recommended book the user rated in
    ``test``.  Returns 0.0 when nothing was recommended at all.
    """
    recommended = 0
    hits = 0
    for user_id, group in test.groupby('User-ID'):
        country, age = userdict[user_id]
        books = recommendations.get((country, age_class(age)), [])
        recommended += len(books)
        hits += sum(1 for isbn in set(group['ISBN']) if isbn in books)
    if not recommended:
        return 0.0
    return float(hits) / recommended


def main_location_age():
    """Load the data, build per-class recommendations and print their precision."""
    users = pd.read_csv('data/BX-Users.csv', sep=';', dtype={'Age': float})
    rates = pd.read_csv('data/BX-Book-Ratings.csv', sep=';',
                        dtype={'Book-Rating': float})

    # Drop users without a plausible age, and the ratings they gave.
    users = prepare_users(users)
    rates = rates[rates['User-ID'].isin(users['User-ID'])]
    userdict = build_user_index(users)

    train, test = split_ratings(rates, test_size=0.2)
    recommendations = top_books_by_class(score_books_by_class(train, userdict))
    print(evaluate_precision(test, userdict, recommendations))


if __name__ == '__main__':
    main_location_age()