I wanted to recreate a very simple collaborative filtering example with the MovieLens 1M dataset from Kaggle (https://www.kaggle.com/datasets/odedgolden/movielens-1m-dataset) and then calculate the mean average precision and mean recall at M. Everywhere I read, the benchmark for mean average precision at M (MAP at M) is between 0.1 and 0.2, but when I calculate mine it comes out much higher. Looking at other people's code, I can't see where my error is. I used a regression model and treated every movie with a true rating higher than 3 as relevant. Can anyone spot what I am doing wrong in my MAP at M calculation?
```python
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os.path as op
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, Dot, Concatenate
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
#get the data
column_names = ['User_ID', 'MovieID', 'Rating', 'Timestamp']
ratings = (pd.read_csv('recommender/archive/ratings.dat', sep='::',
                       names=column_names, engine='python')
           .rename(columns={'MovieID': 'item_id'})
           .drop(columns=['Timestamp'])
           )
column_names = ['MovieID', 'Title', 'Genres']
items = (pd.read_csv('recommender/archive/movies.dat', sep='::',
                     names=column_names, encoding='latin-1', engine='python')
         .rename(columns={'MovieID': 'item_id', 'Title': 'title'})
         .drop(columns=['Genres'])
         )
all_ratings = (ratings.merge(items, on='item_id', how='left')
               .rename(columns={'User_ID': 'user_id', 'Rating': 'rating'})
               )
#split into train and test sets
ratings_train, ratings_test = train_test_split(
    all_ratings, test_size=0.4, random_state=0)
max_user_id = all_ratings['user_id'].max()
max_item_id = all_ratings['item_id'].max()
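# the embedding tables are sized max_id + 1 so the raw ids can be used as indices directly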
#collaborative filtering model
class CustomModel(Model):
    def __init__(self, embedding_size, max_user_id, max_item_id):
        super().__init__()
        self.user_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_user_id + 1,
                                        input_length=1,
                                        name='user_embedding')
        self.item_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_item_id + 1,
                                        input_length=1,
                                        name='item_embedding')
        # The following two layers don't have parameters.
        self.flatten = Flatten()
        self.dot = Dot(axes=1)

    # training=False is only used with the deep model
    def call(self, inputs, training=False):
        user_inputs = inputs[0]
        item_inputs = inputs[1]
        user_vecs = self.flatten(self.user_embedding(user_inputs))
        item_vecs = self.flatten(self.item_embedding(item_inputs))
        y = self.dot([user_vecs, item_vecs])
        return y
model = CustomModel(64, max_user_id, max_item_id)
model.compile(optimizer='adam', loss='mae')
history = model.fit([np.array(ratings_train['user_id']), np.array(ratings_train['item_id'])],
                    np.array(ratings_train['rating']),
                    batch_size=64, epochs=10, validation_split=0.1,
                    shuffle=True)
test_preds = model.predict([np.array(ratings_test['user_id']), np.array(ratings_test['item_id'])])
print("test MSE: %0.3f" % mean_squared_error(test_preds, np.array(ratings_test['rating'])))
print("test MAE: %0.3f" % mean_absolute_error(test_preds, np.array(ratings_test['rating'])))
#calculation of MAP and mean recall at M
ratings_test['rating_pred'] = np.hstack(test_preds)
ratings_test = ratings_test.rename(columns = {'rating':'rating_true'})
#consider a movie rating of higher than limit=3 relevant
limit = 3
map_at_M = []
recall_at_M = []
M_range = [1, 2, 5, 10, 20, 40, 50, 60, 100]
ratings_test_sorted = (ratings_test
                       .sort_values(by=['user_id', 'rating_pred'], ascending=False)
                       .assign(relevant=lambda x: x.rating_true > limit)
                       .assign(sum_of_recommended_and_relevant=lambda x: x.groupby('user_id').relevant.transform('cumsum'))
                       .assign(item_no_of_user=lambda x: x.groupby('user_id').user_id.transform('cumcount') + 1)
                       .assign(total_relevant=lambda x: x.groupby('user_id').relevant.transform('sum'))
                       )
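# For every user the loop below computes average precision at M as
#   AP@M = sum over ranks k <= M of P(k) * rel(k) / size_at_M
# where P(k) = sum_of_recommended_and_relevant / item_no_of_user is the
# precision at rank k and rel(k) = relevant marks whether the item at
# rank k is relevant; MAP at M is then the mean of AP@M over all users.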
#loop over all Ms (mean average precision at M/mean recall at M)
for M in M_range:
    ratings_test_sorted = (ratings_test_sorted
                           # size_at_M is the number of test items within the top M for each user_id
                           .assign(size_at_M=lambda x: x[x.item_no_of_user <= M].groupby('user_id').relevant.transform('size'))
                           # only consider the top M recommendations, hence the factor (x.item_no_of_user <= M),
                           # and divide by x.size_at_M
                           .assign(p_times_relevance=lambda x: np.where(x.size_at_M.notnull() & (x.size_at_M != 0),
                                                                        x.sum_of_recommended_and_relevant / x.item_no_of_user * x.relevant * (x.item_no_of_user <= M) / x.size_at_M,
                                                                        0))
                           .assign(relevant_at_M=lambda x: x[x.item_no_of_user <= M].groupby('user_id').relevant.transform('sum'))
                           .assign(recall=lambda x: np.where(x.total_relevant != 0, x.relevant_at_M / x.total_relevant, 0))
                           )
    MAP = (ratings_test_sorted
           # precision at M for each user_id (sum, because p_times_relevance is already divided by size_at_M)
           .groupby('user_id').p_times_relevance.sum()
           # mean of precision at M over all user_ids
           .mean()
           )
    mean_recall = (ratings_test_sorted
                   # recall is now the same in every row of the same user, so just drop duplicates by user_id
                   .drop_duplicates(subset=['user_id'])
                   .recall
                   # mean of recall over all users
                   .mean()
                   )
    map_at_M.append(MAP)
    recall_at_M.append(mean_recall)
#my results of MAP and recall at M (which are too high)
df = pd.DataFrame({'M':M_range, 'MAP':map_at_M, 'recall':recall_at_M})
df.plot(x='M')
plt.show()
df
```
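For reference, this is the plain per-user loop I would sanity-check the vectorised calculation against. It is only a sketch of the same definition as above (ranking only each user's test items and normalising by the number of items in the top M); the function names are my own, not from any library:

```python
import numpy as np

def average_precision_at_m(relevant_sorted, m):
    # relevant_sorted: booleans for one user's test items,
    # ordered by descending predicted rating
    top = np.asarray(relevant_sorted, dtype=float)[:m]
    if len(top) == 0:
        return 0.0
    # precision at each rank k = (# relevant in top k) / k
    precisions = np.cumsum(top) / (np.arange(len(top)) + 1)
    # normalise by the number of items considered, matching size_at_M above;
    # note that many definitions divide by min(m, total relevant) instead
    return float((precisions * top).sum() / len(top))

def map_at_m(test_df, m):
    # test_df needs the columns user_id, rating_pred and relevant
    return (test_df.sort_values('rating_pred', ascending=False)
                   .groupby('user_id')['relevant']
                   .apply(lambda rel: average_precision_at_m(rel.to_numpy(), m))
                   .mean())
```

If `map_at_m(ratings_test_sorted, 10)` matches the M=10 entry in `df`, the bookkeeping in the vectorised version is at least self-consistent, and any gap to the published 0.1-0.2 numbers would have to come from the evaluation setup rather than from the arithmetic.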