I wanted to recreate a very simple collaborative filtering example with the MovieLens 1M dataset from Kaggle (https://www.kaggle.com/datasets/odedgolden/movielens-1m-dataset) and then calculate the mean average precision and mean recall at M. Everywhere I read, the benchmark for mean average precision at M (MAP at M) is between 0.1 and 0.2, but when I calculate mine it comes out much higher. Looking at other people's code, I can't see where my error is. I used a regression model and treated every movie with a true rating higher than 3 as relevant. Can anyone spot what I am doing wrong in my MAP at M calculation?
```python
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os.path as op
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, Dot, Concatenate
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
#get the data
column_names = ['User_ID', 'MovieID', 'Rating', 'Timestamp']
ratings = (pd.read_csv('recommender/archive/ratings.dat', sep='::',
                       names=column_names, engine='python')
           .rename(columns={'MovieID': 'item_id'})
           .drop(columns=['Timestamp'])
           )
column_names = ['MovieID', 'Title', 'Genres']
items = (pd.read_csv('recommender/archive/movies.dat', sep='::',
                     names=column_names, encoding='latin-1', engine='python')
         .rename(columns={'MovieID': 'item_id', 'Title': 'title'})
         .drop(columns=['Genres'])
         )
all_ratings = (ratings.merge(items, on='item_id', how='left')
               .rename(columns={'User_ID': 'user_id', 'Rating': 'rating'})
               )
#split into train and test sets
ratings_train, ratings_test = train_test_split(
    all_ratings, test_size=0.4, random_state=0)
max_user_id = all_ratings['user_id'].max()
max_item_id = all_ratings['item_id'].max()
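# the embedding tables are sized max_id + 1 so the raw ids can be used as indices directly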
#collaborative filtering model
class CustomModel(Model):
    def __init__(self, embedding_size, max_user_id, max_item_id):
        super().__init__()
        self.user_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_user_id + 1,
                                        input_length=1,
                                        name='user_embedding')
        self.item_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_item_id + 1,
                                        input_length=1,
                                        name='item_embedding')
        # The following two layers don't have parameters.
        self.flatten = Flatten()
        self.dot = Dot(axes=1)

    # training=False is only used with the deep model
    def call(self, inputs, training=False):
        user_inputs = inputs[0]
        item_inputs = inputs[1]
        user_vecs = self.flatten(self.user_embedding(user_inputs))
        item_vecs = self.flatten(self.item_embedding(item_inputs))
        y = self.dot([user_vecs, item_vecs])
        return y
model = CustomModel(64, max_user_id, max_item_id)
model.compile(optimizer='adam', loss='mae')
history = model.fit([np.array(ratings_train['user_id']), np.array(ratings_train['item_id'])],
                    np.array(ratings_train['rating']),
                    batch_size=64, epochs=10, validation_split=0.1,
                    shuffle=True)
test_preds = model.predict([np.array(ratings_test['user_id']), np.array(ratings_test['item_id'])])
print("test MSE: %0.3f" % mean_squared_error(test_preds, np.array(ratings_test['rating'])))
print("test MAE: %0.3f" % mean_absolute_error(test_preds, np.array(ratings_test['rating'])))
#calculation of MAP and mean recall at M
ratings_test['rating_pred'] = np.hstack(test_preds)
ratings_test = ratings_test.rename(columns = {'rating':'rating_true'})
#consider a movie rating of higher than limit=3 relevant
limit = 3
map_at_M = []
recall_at_M = []
M_range = [1, 2, 5, 10, 20, 40, 50, 60, 100]
ratings_test_sorted = (ratings_test
                       .sort_values(by=['user_id', 'rating_pred'], ascending=False)
                       .assign(relevant=lambda x: x.rating_true > limit)
                       .assign(sum_of_recommended_and_relevant=lambda x: x.groupby('user_id').relevant.transform('cumsum'))
                       .assign(item_no_of_user=lambda x: x.groupby('user_id').user_id.transform('cumcount') + 1)
                       .assign(total_relevant=lambda x: x.groupby('user_id').relevant.transform('sum'))
                       )
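# For every user the loop below computes average precision at M as
#   AP@M = sum over ranks k <= M of P(k) * rel(k) / size_at_M
# where P(k) = sum_of_recommended_and_relevant / item_no_of_user is the
# precision at rank k and rel(k) = relevant marks whether the item at
# rank k is relevant; MAP at M is then the mean of AP@M over all users.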
#loop over all Ms (mean average precision at M/mean recall at M)
for M in M_range:
    ratings_test_sorted = (ratings_test_sorted
                           # size_at_M is the number of test items within the top M for each user_id
                           .assign(size_at_M=lambda x: x[x.item_no_of_user <= M].groupby('user_id').relevant.transform('size'))
                           # only consider the top M recommendations, hence the factor (x.item_no_of_user <= M),
                           # and divide by x.size_at_M
                           .assign(p_times_relevance=lambda x: np.where(x.size_at_M.notnull() & (x.size_at_M != 0),
                                                                        x.sum_of_recommended_and_relevant / x.item_no_of_user * x.relevant * (x.item_no_of_user <= M) / x.size_at_M,
                                                                        0))
                           .assign(relevant_at_M=lambda x: x[x.item_no_of_user <= M].groupby('user_id').relevant.transform('sum'))
                           .assign(recall=lambda x: np.where(x.total_relevant != 0, x.relevant_at_M / x.total_relevant, 0))
                           )
    MAP = (ratings_test_sorted
           # precision at M for each user_id (sum, because p_times_relevance is already divided by size_at_M)
           .groupby('user_id').p_times_relevance.sum()
           # mean of precision at M over all user_ids
           .mean()
           )
    mean_recall = (ratings_test_sorted
                   # recall is now the same in every row of the same user, so just drop duplicates by user_id
                   .drop_duplicates(subset=['user_id'])
                   .recall
                   # mean of recall over all users
                   .mean()
                   )
    map_at_M.append(MAP)
    recall_at_M.append(mean_recall)
#my results of MAP and recall at M (which are too high)
df = pd.DataFrame({'M':M_range, 'MAP':map_at_M, 'recall':recall_at_M})
df.plot(x='M')
plt.show()
df
```
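For reference, this is the plain per-user loop I would sanity-check the vectorised calculation against. It is only a sketch of the same definition as above (ranking only each user's test items and normalising by the number of items in the top M); the function names are my own, not from any library:

```python
import numpy as np

def average_precision_at_m(relevant_sorted, m):
    # relevant_sorted: booleans for one user's test items,
    # ordered by descending predicted rating
    top = np.asarray(relevant_sorted, dtype=float)[:m]
    if len(top) == 0:
        return 0.0
    # precision at each rank k = (# relevant in top k) / k
    precisions = np.cumsum(top) / (np.arange(len(top)) + 1)
    # normalise by the number of items considered, matching size_at_M above;
    # note that many definitions divide by min(m, total relevant) instead
    return float((precisions * top).sum() / len(top))

def map_at_m(test_df, m):
    # test_df needs the columns user_id, rating_pred and relevant
    return (test_df.sort_values('rating_pred', ascending=False)
                   .groupby('user_id')['relevant']
                   .apply(lambda rel: average_precision_at_m(rel.to_numpy(), m))
                   .mean())
```

If `map_at_m(ratings_test_sorted, 10)` matches the M=10 entry in `df`, the bookkeeping in the vectorised version is at least self-consistent, and any gap to the published 0.1-0.2 numbers would have to come from the evaluation setup rather than from the arithmetic.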