推荐系统之基于物品的协同过滤算法(ItemCF)




前端时间已经把基于用户的推荐系统给弄出来了,详情见我的另一篇文章:点击打开链接
<http://blog.csdn.net/sinat_35866463/article/details/79428898>
,(建议先看懂UserCF后再来看这篇文章,当然大佬可以忽视)其实理解了基于用户的协同过滤算法,再来看基于物品的协同过滤算法,就会感觉没啥太大差异,

具体的思路,通俗的讲:用户A 喜欢了一个物品s集合,那么推荐的时候就把与物品s集合里最相似的前N个物品推荐给用户A,结束。



是不是言简意赅?哈哈,其实道理都差不多,看懂了UserCF再来看ItemCF,就会感觉基本差不多。具体的步骤呢:


一、计算物品之间的相似度。




二、根据物品的相似度和用户的历史行为给用户生成推荐列表




同样,计算相似度的时候公式用的也是余弦相似度,详情就看我写的UserCF吧:点击打开链接
<http://blog.csdn.net/sinat_35866463/article/details/79428898>
,因为都差不多就不重复写了,对照着上一篇博客然后在看看书,就知道基本完全一样了




这里就贴上书上给的一个案例,最下面就是系统的推荐TOP N ,很好理解:
依旧是大牛的ItemCF 代码,贴上供大家学习:#-*- coding: utf-8 -*- ''' Created on 2015-06-22
@author: Lockvictor ''' import sys import random import math import os from
operator import itemgetter random.seed(0) class ItemBasedCF(object): ''' TopN
recommendation - Item Based Collaborative Filtering ''' def __init__(self):
self.trainset = {} self.testset = {} self.n_sim_movie = 20 self.n_rec_movie =
10 self.movie_sim_mat = {} self.movie_popular = {} self.movie_count = 0
print('Similar movie number = %d' % self.n_sim_movie, file=sys.stderr)
print('Recommended movie number = %d' % self.n_rec_movie, file=sys.stderr)
@staticmethod def loadfile(filename): ''' load a file, return a generator. '''
fp = open(filename, 'r') for i, line in enumerate(fp): yield line.strip('\r\n')
if i % 100000 == 0: print ('loading %s(%s)' % (filename, i), file=sys.stderr)
fp.close() print ('load %s succ' % filename, file=sys.stderr) def
generate_dataset(self, filename, pivot=0.7): ''' load rating data and split it
to training set and test set ''' trainset_len = 0 testset_len = 0 for line in
self.loadfile(filename): user, movie, rating, _ = line.split('::') # split the
data by pivot if random.random() < pivot: self.trainset.setdefault(user, {})
self.trainset[user][movie] = int(rating) trainset_len += 1 else:
self.testset.setdefault(user, {}) self.testset[user][movie] = int(rating)
testset_len += 1 print ('split training set and test set succ',
file=sys.stderr) print ('train set = %s' % trainset_len, file=sys.stderr) print
('test set = %s' % testset_len, file=sys.stderr) def calc_movie_sim(self): '''
calculate movie similarity matrix ''' print('counting movies number and
popularity...', file=sys.stderr) for user, movies in self.trainset.items(): for
movie in movies: # count item popularity if movie not in self.movie_popular:
self.movie_popular[movie] = 0 self.movie_popular[movie] += 1 print('count
movies number and popularity succ', file=sys.stderr) # save the total number of
movies self.movie_count = len(self.movie_popular) print('total movie number =
%d' % self.movie_count, file=sys.stderr) # count co-rated users between items
itemsim_mat = self.movie_sim_mat print('building co-rated users matrix...',
file=sys.stderr) for user, movies in self.trainset.items(): for m1 in movies:
for m2 in movies: if m1 == m2: continue itemsim_mat.setdefault(m1, {})
itemsim_mat[m1].setdefault(m2, 0) itemsim_mat[m1][m2] += 1 print('build
co-rated users matrix succ', file=sys.stderr) # calculate similarity matrix
print('calculating movie similarity matrix...', file=sys.stderr)
simfactor_count = 0 PRINT_STEP = 2000000 for m1, related_movies in
itemsim_mat.items(): for m2, count in related_movies.items():
itemsim_mat[m1][m2] = count / math.sqrt( self.movie_popular[m1] *
self.movie_popular[m2]) simfactor_count += 1 if simfactor_count % PRINT_STEP ==
0: print('calculating movie similarity factor(%d)' % simfactor_count,
file=sys.stderr) print('calculate movie similarity matrix(similarity factor)
succ', file=sys.stderr) print('Total similarity factor number = %d' %
simfactor_count, file=sys.stderr) def recommend(self, user): ''' Find K similar
movies and recommend N movies. ''' K = self.n_sim_movie N = self.n_rec_movie
rank = {} watched_movies = self.trainset[user] for movie, rating in
watched_movies.items(): for related_movie, similarity_factor in
sorted(self.movie_sim_mat[movie].items(), key=itemgetter(1), reverse=True)[:K]:
if related_movie in watched_movies: continue rank.setdefault(related_movie, 0)
rank[related_movie] += similarity_factor * rating # return the N best movies
return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N] def
evaluate(self): ''' print evaluation result: precision, recall, coverage and
popularity ''' print('Evaluation start...', file=sys.stderr) N =
self.n_rec_movie # varables for precision and recall hit = 0 rec_count = 0
test_count = 0 # varables for coverage all_rec_movies = set() # varables for
popularity popular_sum = 0 for i, user in enumerate(self.trainset): if i % 500
== 0: print ('recommended for %d users' % i, file=sys.stderr) test_movies =
self.testset.get(user, {}) rec_movies = self.recommend(user) for movie, _ in
rec_movies: if movie in test_movies: hit += 1 all_rec_movies.add(movie)
popular_sum += math.log(1 + self.movie_popular[movie]) rec_count += N
test_count += len(test_movies) precision = hit / (1.0 * rec_count) recall = hit
/ (1.0 * test_count) coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count) print
('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' % (precision,
recall, coverage, popularity), file=sys.stderr) if __name__ == '__main__':
ratingfile = os.path.join('ml-1m', 'ratings.dat') itemcf = ItemBasedCF()
itemcf.generate_dataset(ratingfile) itemcf.calc_movie_sim() itemcf.evaluate()
代码来源:https://github.com/Lockvictor/MovieLens-RecSys
<https://github.com/Lockvictor/MovieLens-RecSys>




友情链接
ioDraw流程图
API参考文档
OK工具箱
云服务器优惠
阿里云优惠券
腾讯云优惠券
华为云优惠券
站点信息
问题反馈
邮箱:ixiaoyang8@qq.com
QQ群:637538335
关注微信