python3實現基于用戶的協同過濾
本文實例為大家分享了python3實現基于用戶協同過濾的具體代碼,供大家參考,具體內容如下
廢話不多說,直接看代碼。
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#20170916號協同過濾電影推薦基稿
#字典等格式數據處理及直接寫入文件
##from numpy import *
import time
from math import sqrt
##from texttable import Texttable
class CF:
    """User-based collaborative filtering for movie recommendation.

    Rating rows are sequences ``[UserID, MovieID, Rating, Timestamp]``; only
    the first three fields are read.  Two lookup tables are derived from a
    rating set by :meth:`formatRate`:

    * ``userDict``: ``{user_id: [(movie_id, rating), ...]}``
      e.g. ``{'3': [('3421', 4.0), ('1641', 2.0), ...]}``
    * ``ItemUser``: ``{movie_id: [user_id, ...]}``
      e.g. ``{'1653': ['5', '8', '9'], '194': ['5'], ...}``

    Both tables are rebuilt every time a rating set is processed, so they
    always describe whichever set (train or test) was formatted last.
    """

    def __init__(self, movies, ratings, k=5, n=20):
        """Store the data sets and reset all working state.

        Args:
            movies: movie rows ``[MovieID, Title, Genres]`` (stored only).
            ratings: indexable pair ``(train_rows, test_rows)``.
            k: number of nearest neighbours to keep.
            n: number of recommendations to keep.
        """
        self.movies = movies
        (self.train_data, self.test_data) = (ratings[0], ratings[1])
        self.k = k
        self.n = n
        # Per-user ratings and per-movie raters (see class docstring).
        self.userDict = {}
        self.ItemUser = {}
        # Nearest neighbours as [[similarity, user_id], ...], best first.
        self.neighbors = []
        # Recommendations as [[accumulated similarity, movie_id], ...].
        self.recommandList = []
        # Movie ids counted as hits by the last precision/recall computation.
        self.recommand = []
        # Movie ids the target user rated in the train / test set.
        self.train_user = []
        self.test_user = []
        # Movie ids recommended to the target user from the train / test set.
        self.train_rec = []
        self.test_rec = []
        # {movie_id: [[similarity, rating, neighbour_id], ...]}
        self.forecast = {}
        # {movie_id: [predicted]} and, after cal_Mae, [predicted, actual].
        self.score = {}
        # Precision (pre) and recall (z); index 0 / 1 = evaluation method 1 / 2.
        self.pre = [0.0, 0.0]
        self.z = [0.0, 0.0]

    def formatRate(self, train_or_test):
        """Rebuild ``userDict`` and ``ItemUser`` from the given rating rows."""
        self.userDict = {}
        self.ItemUser = {}
        for row in train_or_test:
            user_id, movie_id = row[0], row[1]
            # Ratings are kept on their original scale (no normalisation).
            rating = float(row[2])
            self.userDict.setdefault(user_id, []).append((movie_id, rating))
            self.ItemUser.setdefault(movie_id, []).append(user_id)

    def formatuserDict(self, userId, p):
        """Return ``{movie_id: [rating by userId, rating by p]}``.

        The keys are the union of the movies rated by either user; a missing
        rating is represented by 0.
        """
        user = {}
        for movie_id, rating in self.userDict[userId]:
            user[movie_id] = [rating, 0]
        for movie_id, rating in self.userDict[p]:
            if movie_id in user:
                user[movie_id][1] = rating  # rated by both users
            else:
                user[movie_id] = [0, rating]  # rated by p only
        return user

    def getCost(self, userId, p):
        """Return the cosine similarity between the ratings of two users.

        Returns 0 when the users have no overlapping non-zero ratings.
        """
        pairs = list(self.formatuserDict(userId, p).values())
        dot = sum(float(a) * float(b) for a, b in pairs)
        if dot == 0.0:
            return 0
        norm_a = sum(float(a) * float(a) for a, _ in pairs)
        norm_b = sum(float(b) * float(b) for _, b in pairs)
        return dot / sqrt(norm_a * norm_b)

    def getNearestNeighbor(self, userId):
        """Store the ``k`` users most similar to ``userId`` in ``neighbors``.

        Candidates are the users who rated at least one movie that ``userId``
        rated.  A user with no ratings in the current data gets no neighbours.
        """
        # The set gives O(1) duplicate checks; the list keeps discovery order.
        seen = {userId}
        candidates = []
        for movie_id, _ in self.userDict.get(userId, []):
            for other in self.ItemUser[movie_id]:
                if other not in seen:
                    seen.add(other)
                    candidates.append(other)
        ranked = [[self.getCost(userId, other), other] for other in candidates]
        ranked.sort(reverse=True)  # most similar first
        self.neighbors = ranked[:self.k]

    def getrecommandList(self, userId):
        """Rank the movies rated by the current neighbours.

        Each movie's score is the sum of the similarities of the neighbours
        who rated it; the best ``n`` are stored in ``recommandList``.
        ``userId`` is accepted for interface compatibility and is not used.
        """
        totals = {}
        for similarity, neighbor_id in self.neighbors:
            for movie_id, _ in self.userDict[neighbor_id]:
                totals[movie_id] = totals.get(movie_id, 0) + similarity
        ranked = [[total, movie_id] for movie_id, total in totals.items()]
        ranked.sort(reverse=True)
        self.recommandList = ranked[:self.n]

    @staticmethod
    def _ratio(hits, total):
        """Return ``hits / total``, or 0.0 when ``total`` is zero."""
        return hits / total if total else 0.0

    def getPrecision(self, userId):
        """Compute precision (``pre``) and recall (``z``) for ``userId``.

        Two evaluation schemes are computed; an empty denominator yields 0.0.
        """
        # The test set is processed first so that neighbors/userDict/ItemUser
        # are left describing the train set, which foreCast relies on.
        (self.test_user, self.test_rec) = self.getRecommand(self.test_data, userId)
        (self.train_user, self.train_rec) = self.getRecommand(self.train_data, userId)

        # Method 1 (per the original note: Zhang Haipeng, Xidian University,
        # 2015): hits are movies recommended from both the test and train set.
        train_rec = set(self.train_rec)
        self.recommand = [m for m in self.test_rec if m in train_rec]
        self.pre[0] = self._ratio(len(self.recommand), len(self.train_rec))
        self.z[0] = self._ratio(len(self.recommand), len(self.test_rec))

        # Method 2 (per the original note: Huang Yu, Beijing Jiaotong
        # University, 2015): hits are train recommendations that the user
        # actually rated in the test set.
        test_user = set(self.test_user)
        self.recommand = [m for m in self.train_rec if m in test_user]
        self.pre[1] = self._ratio(len(self.recommand), len(self.train_rec))
        self.z[1] = self._ratio(len(self.recommand), len(self.test_user))

    def getRecommand(self, train_or_test, userId):
        """Run the pipeline on one rating set.

        Returns:
            ``(rated, recommended)``: the movie ids ``userId`` rated in that
            set and the movie ids recommended to them (both lists).
        """
        self.formatRate(train_or_test)
        self.getNearestNeighbor(userId)
        self.getrecommandList(userId)
        rated = [entry[0] for entry in self.userDict.get(userId, [])]
        recommended = [entry[1] for entry in self.recommandList]
        return (rated, recommended)

    def foreCast(self):
        """Predict ratings for the movies in ``test_user``.

        A movie that is also in ``train_rec`` gets the similarity-weighted
        average of the current neighbours' ratings, rounded to 3 decimals.
        Every other movie in ``test_user`` gets 0.  Results are stored in
        ``score`` as ``{movie_id: [predicted]}``.  Expects ``getPrecision``
        to have been called so that the train-set state is in place.
        """
        self.forecast = {}
        similarity_of = {neighbor_id: dist for dist, neighbor_id in self.neighbors}
        recommended = set(self.train_rec)
        same_movie_id = []
        for movie_id in self.test_user:
            if movie_id not in recommended:
                continue
            same_movie_id.append(movie_id)
            for rater in self.ItemUser.get(movie_id, []):
                if rater not in similarity_of:
                    continue
                # First rating this neighbour gave to the movie.
                rating = next(r for m, r in self.userDict[rater] if m == movie_id)
                self.forecast.setdefault(movie_id, []).append(
                    [similarity_of[rater], rating, rater]
                )

        self.score = {}
        for movie_id in same_movie_id:
            entries = self.forecast.get(movie_id, [])
            weighted = sum(d[0] * d[1] for d in entries)
            weight = sum(d[0] for d in entries)
            # Without any positive-weight neighbour rating there is nothing to
            # average, so fall back to 0 instead of dividing by zero.
            self.score[movie_id] = [round(weighted / weight, 3)] if weight else [0]
        # Test movies that were not scored above are predicted as 0.
        for movie_id in self.test_user:
            if movie_id not in self.score:
                self.score[movie_id] = [0]

    def cal_Mae(self, userId):
        """Attach the actual test ratings to the predictions and return them.

        Despite the name, no mean absolute error is computed: each
        ``score[movie_id]`` becomes ``[predicted, actual]`` and the dict is
        returned.  This rebuilds ``userDict``/``ItemUser`` from the test set,
        and calling it again appends the actual rating again.
        """
        self.formatRate(self.test_data)
        for movie_id, rating in self.userDict.get(userId, []):
            if movie_id in self.score:
                self.score[movie_id].append(rating)
        return self.score
# 獲取數據
def readFile(filename):
    """Read a ``::``-delimited UTF-8 text file.

    Args:
        filename: path of the file to read.

    Returns:
        A list with one entry per line; each entry is the list of fields
        obtained by stripping the line and splitting it on ``"::"``.
    """
    # The context manager closes the file on every path; previously the
    # close() call sat after the return statement and was never reached.
    with open(filename, "r", encoding="utf-8") as files:
        return [line.strip().split("::") for line in files]
def load_dict_from_file(filepath):
    """Load a ``key:value``-per-line UTF-8 text file into a dict.

    Args:
        filepath: path of the file to read.

    Returns:
        ``{key: value}`` with both sides as strings.  If the file cannot be
        opened or read, a message is printed and the entries collected so
        far (possibly none) are returned.

    Raises:
        ValueError: if a non-blank line contains no ``':'``.
    """
    _dict = {}
    try:
        with open(filepath, 'r', encoding="utf-8") as dict_file:
            for line in dict_file:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines
                # Split on the first colon only so values may contain ':'.
                (key, value) = line.split(':', 1)
                _dict[key] = value
    except IOError:
        print ("文件 %s 不存在" % (filepath))
    return _dict
def save_dict_to_file(_dict, filepath):
    """Write a dict to a UTF-8 text file as one ``key:value`` line per item.

    Args:
        _dict: mapping to save; keys and values are formatted with ``%s``.
        filepath: destination path; an existing file is overwritten.

    If the file cannot be created or written, a message is printed and the
    function returns normally (best effort).
    """
    try:
        # Canonical codec name instead of the malformed "utf - 8".
        with open(filepath, 'w', encoding="utf-8") as dict_file:
            for (key, value) in _dict.items():
                dict_file.write('%s:%s\n' % (key, value))
    except IOError:
        print ("文件 %s 無法創建" % (filepath))
def writeFile(data, filename):
    """Write the string ``data`` to ``filename`` as UTF-8, replacing its content."""
    with open(filename, 'w', encoding="utf-8") as handle:
        handle.write(data)
# -------------------------開始-------------------------------
def start3():
    """Evaluate the recommender for one hard-coded user and print the results.

    Loads the movie, train and test files from fixed paths, then prints the
    predicted/actual score table, the number of rating rows processed, the
    precision and recall lists, and the elapsed time.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    started = time.perf_counter()
    movies = readFile("D:/d/movies.dat")
    ratings = [readFile("D:/d/201709train.txt"), readFile("D:/d/201709test.txt")]
    demo = CF(movies, ratings, k=20)
    userId = '1000'
    demo.getPrecision(userId)
    demo.foreCast()
    print(demo.cal_Mae(userId))
    print("處理的數據為%d條" % (len(ratings[0]) + len(ratings[1])))
    print(demo.pre)
    print(demo.z)
    elapsed = time.perf_counter() - started
    print("耗費時間: %f s" % elapsed)
def start1():
    """Build the lookup tables from the train set and dump them to text files.

    Writes ``str(userDict)`` and ``str(ItemUser)`` to fixed paths under
    ``D:/d/dd/`` and prints a completion message.
    """
    # The former time.clock() call was dropped: the function no longer exists
    # (removed in Python 3.8) and its result was never used here.
    movies = readFile("D:/d/movies.dat")
    ratings = [readFile("D:/d/201709train.txt"), readFile("D:/d/201709test.txt")]
    demo = CF(movies, ratings, k=20)
    demo.formatRate(ratings[0])
    writeFile(str(demo.userDict), "D:/d/dd/userDict.txt")
    writeFile(str(demo.ItemUser), "D:/d/dd/ItemUser.txt")
    print("處理結束")
def start2():
    """Build a per-movie table from the train set and dump it to a text file.

    Writes ``str(demo.movieDict)`` to ``D:/d/dd/movieDict.txt`` and prints a
    completion message.
    """
    # The former time.clock() call was dropped: the function no longer exists
    # (removed in Python 3.8) and its result was never used here.
    movies = readFile("D:/d/movies.dat")
    ratings = [readFile("D:/d/201709train.txt"), readFile("D:/d/201709test.txt")]
    demo = CF(movies, ratings, k=20)
    # NOTE(review): CF.formatRate_toMovie and CF.movieDict are not defined in
    # the class as shown in this file, so this call raises AttributeError
    # unless they are provided elsewhere - confirm before using start2().
    demo.formatRate_toMovie(ratings[0])
    writeFile(str(demo.movieDict), "D:/d/dd/movieDict.txt")
    print("處理結束")
# Script entry point: dump the train-set lookup tables when run directly.
if "__main__" == __name__:
    start1()
以上就是本文的全部內容,希望對大家的學習有所幫助,也希望大家多多支持腳本之家。
相關文章
python通過cmd創建虛擬環境的實現(pip方式)
Python的虛擬環境是正常的現實環境相對應的,在虛擬環境中安裝的包是與現實環境隔離的,本文主要介紹了python通過cmd創建虛擬環境的實現,感興趣的可以了解一下 2023-11-11
Python進程間通信 multiProcessing Queue隊列實現詳解
這篇文章主要介紹了python進程間通信 multiProcessing Queue隊列實現詳解,文中通過示例代碼介紹的非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友可以參考下 2019-09-09
python mongo 向數據中的數組類型新增數據操作
這篇文章主要介紹了python mongo 向數據中的數組類型新增數據操作,具有很好的參考價值,希望對大家有所幫助。一起跟隨小編過來看看吧 2020-12-12
Python&Matlab實現螞蟻群算法求解最短路徑問題的示例
本文主要介紹了Python&Matlab實現螞蟻群算法求解最短路徑問題的示例,文中通過示例代碼介紹的非常詳細,具有一定的參考價值,感興趣的小伙伴們可以參考一下 2022-03-03
Python開發常用的一些開源Package分享
這篇文章主要介紹了Python開發常用的一些開源Package分享,常用的開源項目包括WEB開發框架、工具包、數據庫操作包、網絡操作包等,需要的朋友可以參考下 2015-02-02
解決在pycharm運行代碼,調用CMD窗口的命令運行顯示亂碼問題
今天小編就為大家分享一篇解決在pycharm運行代碼,調用CMD窗口的命令運行顯示亂碼問題,具有很好的參考價值,希望對大家有所幫助。一起跟隨小編過來看看吧 2019-08-08
Python爬蟲之pandas基本安裝與使用方法示例
這篇文章主要介紹了Python爬蟲之pandas基本安裝與使用方法,結合實例形式分析了Python爬蟲操作中pandas的pip命令安裝與HTML、Excel等格式文件保存相關操作技巧,需要的朋友可以參考下 2018-08-08
Python實現PS濾鏡中的USM銳化效果
這篇文章主要介紹了Python實現PS濾鏡中的USM銳化效果,幫助大家更好的利用python處理圖片,感興趣的朋友可以了解下 2020-12-12
Python Pandas pandas.read_sql函數實例用法
在本篇文章里小編給大家整理的是一篇關于Python Pandas pandas.read_sql函數詳解內容,有需要的朋友們可以學習下。 2021-06-06

