# ml_proj_1_book_recom -- January 8, 2023
#
# Read the books dataset and explore it
import numpy as np
import pandas as pd

# Read book user data.
# low_memory=False parses the file in a single pass so every column gets one
# consistently inferred dtype; without it the recorded run emitted
# "DtypeWarning: Columns (0) have mixed types".
df_user = pd.read_csv('BX-Users.csv', encoding='latin-1', low_memory=False)
df_user.head()

# Read book data.
# Same option for the same reason: the recorded run emitted
# "DtypeWarning: Columns (3) have mixed types" for this file.
df_books = pd.read_csv('BX-Books.csv', encoding='latin-1', low_memory=False)
df_books.head()
# Clean up NaN values

# Report, per column, whether any value is missing.
df_user.isna().any()

# Discard every row that has a missing value, then confirm none remain.
df_user1 = df_user.dropna()
df_user1.isna().any()

# Read the data where ratings are given by users

# Only the first 20000 rating rows are loaded.
df_ratings = pd.read_csv(
    'BX-Book-Ratings.csv',
    encoding='latin-1',
    nrows=20000,
)
df_ratings.head()

# Attach the book metadata to each rating via the shared 'isbn' column
# (default inner join: ratings whose ISBN is not in df_books are dropped).
df_book_ratings = df_ratings.merge(df_books, on='isbn')
df_book_ratings.head()

# Take a quick look at the number of unique users and books

# Counts over the raw ratings sample.
n_users = df_ratings['user_id'].nunique()
n_books = df_ratings['isbn'].nunique()
print(f'Num. of Users: {n_users}')
print(f'Num of Books: {n_books}')

# Counts over the merged frame. These deliberately overwrite n_users and
# n_books, which are the values used from here on.
n_users = df_book_ratings['user_id'].nunique()
n_books = df_book_ratings['isbn'].nunique()
print(f'Num. of Users: {n_users}')
print(f'Num of Books: {n_books}')
# Convert both user_id and ISBN to the ordered list, i.e., from 0...n-1

# Convert ISBN variables to numeric numbers in the correct order
isbn_list = df_book_ratings.isbn.unique()
print(" Size of isbn List:", len(isbn_list))

# ISBN -> position in isbn_list, built once so each lookup is a dictionary
# access instead of a full scan of isbn_list for every row.
_isbn_position = {isbn: position for position, isbn in enumerate(isbn_list)}


def get_isbn_numeric_id(isbn):
    """Return the zero-based position of *isbn* within ``isbn_list``.

    Raises:
        KeyError: If *isbn* does not occur in ``isbn_list``.
    """
    return _isbn_position[isbn]


df_book_ratings['isbn_id'] = df_book_ratings['isbn'].apply(get_isbn_numeric_id)
df_book_ratings.head()

# Convert the user_id variable to numeric numbers in the correct order
userid_list = df_book_ratings.user_id.unique()
print(" Size of user_id List:", len(userid_list))

# user_id -> position in userid_list, built once for the same reason.
_user_position = {
    user_id: position for position, user_id in enumerate(userid_list)
}


def get_user_id_numeric_id(user_id):
    """Return the zero-based position of *user_id* within ``userid_list``.

    Raises:
        KeyError: If *user_id* does not occur in ``userid_list``.
    """
    return _user_position[user_id]


df_book_ratings['user_id_order'] = df_book_ratings['user_id'].apply(
    get_user_id_numeric_id
)
df_book_ratings.head()
# Re-index the columns to build a matrix
new_col_order = [
    'user_id_order',
    'isbn_id',
    'rating',
    'book_title',
    'book_author',
    'year_of_publication',
    'publisher',
    'isbn',
    'user_id',
]
df_book_ratings = df_book_ratings.reindex(columns=new_col_order)
df_book_ratings.head()

# Split your data into two sets (training and testing)
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df_book_ratings, test_size=0.30)


def _build_rating_matrix(ratings, n_rows, n_cols):
    """Return an (n_rows, n_cols) user-by-book array filled from *ratings*.

    Args:
        ratings: DataFrame with 'user_id_order', 'isbn_id' and 'rating'
            columns.
        n_rows: Number of users (matrix rows).
        n_cols: Number of books (matrix columns).

    Returns:
        A float array that is zero wherever *ratings* has no entry.
    """
    matrix = np.zeros((n_rows, n_cols))
    for row in ratings.itertuples(index=False):
        # user_id_order and isbn_id already start at 0, so they are used
        # directly as indices. Subtracting 1 would turn id 0 into index -1,
        # which NumPy resolves to the LAST row/column.
        matrix[row.user_id_order, row.isbn_id] = row.rating
    return matrix


# Create user-book matrix from training data
train_data_matrix = _build_rating_matrix(train_data, n_users, n_books)
train_data_matrix

# Create user-book matrix from testing data
test_data_matrix = _build_rating_matrix(test_data, n_users, n_books)
test_data_matrix
# Make predictions based on user and item variables

# Importing pairwise_distances function
from sklearn.metrics.pairwise import pairwise_distances

# pairwise_distances(..., metric='cosine') returns the cosine DISTANCE
# (0 for identical directions), so it is converted to a similarity with
# 1 - distance before being used as a weight. Using the distance directly
# would give the least similar neighbours the largest weights.
user_similarity = 1.0 - pairwise_distances(train_data_matrix, metric='cosine')
user_similarity

item_similarity = 1.0 - pairwise_distances(train_data_matrix.T, metric='cosine')
item_similarity


# Prediction function based on the user and item similarity
def predict_similarity(ratings_martix, similarity_matrix, type='user'):
    """Predict ratings as a similarity-weighted combination of known ratings.

    Args:
        ratings_martix: 2-D array of ratings, one row per user and one
            column per item.
        similarity_matrix: Square array of weights; user-by-user when
            ``type='user'``, item-by-item when ``type='item'``.
        type: ``'user'`` for user-based or ``'item'`` for item-based
            prediction.

    Returns:
        Array with the same shape as *ratings_martix* holding the predicted
        ratings.

    Raises:
        ValueError: If *type* is neither ``'user'`` nor ``'item'``.
    """
    # Normaliser: sum of absolute weights in each row of the similarity matrix.
    weight_totals = np.abs(similarity_matrix).sum(axis=1)
    # A row whose weights are all zero would divide by zero and produce NaN.
    # Its weighted sum is zero as well (for the item case this relies on the
    # similarity matrix being symmetric), so dividing by 1 yields a clean 0.
    weight_totals = np.where(weight_totals == 0, 1.0, weight_totals)

    if type == 'user':
        mean_user_rating = ratings_martix.mean(axis=1)
        # np.newaxis turns the per-user mean into a column so it broadcasts
        # across that user's row of ratings.
        ratings_diff = ratings_martix - mean_user_rating[:, np.newaxis]
        weighted_diff = similarity_matrix.dot(ratings_diff)
        return (
            mean_user_rating[:, np.newaxis]
            + weighted_diff / weight_totals[:, np.newaxis]
        )

    if type == 'item':
        weighted_ratings = ratings_martix.dot(similarity_matrix)
        return weighted_ratings / weight_totals[np.newaxis, :]

    raise ValueError(f"type must be 'user' or 'item', got {type!r}")


# Carry out item prediction
predicted_items = predict_similarity(
    train_data_matrix, item_similarity, type='item'
)

# Carry out user prediction
predicted_users = predict_similarity(
    train_data_matrix, user_similarity, type='user'
)

predicted_items
predicted_users

# Use RMSE to evaluate the predictions

# Import RMSE package
from sklearn.metrics import mean_squared_error
from math import sqrt


def compute_rmse(prediction, ground_truth):
    """Return the RMSE between two arrays, restricted to rated entries.

    Only positions where *ground_truth* is non-zero are compared; zeros in
    the user-book matrix stand for "no rating recorded".

    Args:
        prediction: Array of predicted ratings.
        ground_truth: Array of actual ratings, indexed the same way as
            *prediction*.

    Returns:
        The root-mean-squared error over the non-zero positions, as a float.
    """
    rated_positions = ground_truth.nonzero()
    predicted_values = prediction[rated_positions].flatten()
    actual_values = ground_truth[rated_positions].flatten()
    return sqrt(mean_squared_error(predicted_values, actual_values))


# User based RMSE error calculation
user_rmse = compute_rmse(predicted_users, test_data_matrix)
print(f'User-based RMSE: {user_rmse}')

# Item based RMSE error calculation
item_rmse = compute_rmse(predicted_items, test_data_matrix)
print(f'Item-based RMSE: {item_rmse}')