Source Code
library(recommenderlab)
library(ggplot2)
library(data.table)
library(reshape2)
str(movie_data)
summary(movie_data)
head(movie_data)
summary(rating_data)
head(rating_data)
# Data pre-processing
# Create a one-hot encoding: a matrix that records the corresponding genres
# for each of the films.
library(data.table)
type.convert=TRUE),
stringsAsFactors=FALSE)
genre_mat1[index+1,gen_col] <- 1
str(genre_mat2)
head(SearchMatrix)
ratingMatrix
# Review some important parameters for building recommendation systems for movies
names(recommendation_model)
recommendation_model$IBCF_realRatingMatrix$parameters
# Collaborative filtering suggests movies to a user based on preferences
# collected from many other users.
method = "cosine",
which = "users")
as.matrix(similarity_mat)
as.matrix(movie_similarity)
Table_of_Ratings
library(ggplot2)
table_views$title <- NA
movie_data$movieId == table_views[index,1])$title)
table_views[1:6,]
# Visualize a bar plot for the total number of views of the top films
image(ratingMatrix[1:20, 1:25], axes = FALSE, main = "Heatmap of the first 25 rows and 25 columns")
# Data Preparation
movie_ratings
# Data Normalization
# Data Binarization
# Splitting the dataset into 80% training set and 20% test set
size = nrow(movie_ratings),
replace = TRUE,
recommendation_system$IBCF_realRatingMatrix$parameters
method = "IBCF",
recommen_model
class(recommen_model)
class(model_info$sim)
dim(model_info$sim)
top_items <- 20
image(model_info$sim[1:top_items, 1:top_items],
# Visualize sum of rows and columns with the similarity of the objects above 0
table(sum_rows)
top_recommendations <- 10
n = top_recommendations)
predicted_recommendations
movie_data$movieId == movies_user1[index])$title)
movies_user2
function(x){ as.integer(colnames(movie_ratings)[x]) })
#dim(recc_matrix)
recommendation_matrix[,1:4]
number_of_items_top)
for(i in 1:4) {
movie_data$movieId == table_top[i,1])$title)
}
colnames(table_top) <- c("Movie Title", "No. of Items")
head(table_top)
Output:
> library(recommenderlab)
> library(ggplot2)
> library(data.table)
> library(reshape2)
> # Retrieve and display data
>
> movie_data <- read.csv("movies.csv",stringsAsFactors=FALSE)
> rating_data <- read.csv("ratings.csv")
> str(movie_data)
'data.frame': 10329 obs. of 3 variables:
$ movieId: int 1 2 3 4 5 6 7 8 9 10 ...
$ title : chr "Toy Story (1995)" "Jumanji (1995)" "Grumpier Old Men
(1995)" "Waiting to Exhale (1995)" ...
$ genres : chr "Adventure|Animation|Children|Comedy|Fantasy" "Adventure|
Children|Fantasy" "Comedy|Romance" "Comedy|Drama|Romance" ...
> # Overview the summary
> summary(movie_data)
movieId title genres
Min. : 1 Length:10329 Length:10329
1st Qu.: 3240 Class :character Class :character
Median : 7088 Mode :character Mode :character
Mean : 31924
3rd Qu.: 59900
Max. :149532
> head(movie_data)
movieId title
1 1 Toy Story (1995)
2 2 Jumanji (1995)
3 3 Grumpier Old Men (1995)
4 4 Waiting to Exhale (1995)
5 5 Father of the Bride Part II (1995)
6 6 Heat (1995)
genres
1 Adventure|Animation|Children|Comedy|Fantasy
2 Adventure|Children|Fantasy
3 Comedy|Romance
4 Comedy|Drama|Romance
5 Comedy
6 Action|Crime|Thriller
> summary(rating_data)
userId movieId rating
Min. : 1.0 Min. : 1 Min. :0.500
1st Qu.:192.0 1st Qu.: 1073 1st Qu.:3.000
Median :383.0 Median : 2497 Median :3.500
Mean :364.9 Mean : 13381 Mean :3.517
3rd Qu.:557.0 3rd Qu.: 5991 3rd Qu.:4.000
Max. :668.0 Max. :149532 Max. :5.000
timestamp
Min. :8.286e+08
1st Qu.:9.711e+08
Median :1.115e+09
Mean :1.130e+09
3rd Qu.:1.275e+09
Max. :1.452e+09
> head(rating_data)
userId movieId rating timestamp
1 1 16 4.0 1217897793
2 1 24 1.5 1217895807
3 1 32 4.0 1217896246
4 1 47 4.0 1217896556
5 1 50 4.0 1217896523
6 1 110 4.0 1217896150
> # Data pre-processing
> # Create a one-hot encoding: a matrix that records the corresponding genres for each of the films.
> movie_genre <- as.data.frame(movie_data$genres, stringsAsFactors=FALSE)
> library(data.table)
> movie_genre2 <- as.data.frame(tstrsplit(movie_genre[,1], '[|]',
+ type.convert=TRUE),
+ stringsAsFactors=FALSE)
> colnames(movie_genre2) <- c(1:10)
> list_genre <- c("Action", "Adventure", "Animation", "Children",
+ "Comedy", "Crime","Documentary", "Drama", "Fantasy",
+ "Film-Noir", "Horror", "Musical", "Mystery","Romance",
+ "Sci-Fi", "Thriller", "War", "Western")
> genre_mat1 <- matrix(0,10330,18)
> genre_mat1[1,] <- list_genre
> colnames(genre_mat1) <- list_genre
> for (index in 1:nrow(movie_genre2)) {
+ for (col in 1:ncol(movie_genre2)) {
+ gen_col = which(genre_mat1[1,] == movie_genre2[index,col])
+ genre_mat1[index+1,gen_col] <- 1
+ }
+ }
> genre_mat2 <- as.data.frame(genre_mat1[-1,], stringsAsFactors=FALSE)
#remove first row, which was the genre list
> for (col in 1:ncol(genre_mat2)) {
+ genre_mat2[,col] <- as.integer(genre_mat2[,col]) #convert from characters to integers
+ }
> str(genre_mat2)
'data.frame': 10329 obs. of 18 variables:
$ Action : int 0 0 0 0 0 1 0 0 1 1 ...
$ Adventure : int 1 1 0 0 0 0 0 1 0 1 ...
$ Animation : int 1 0 0 0 0 0 0 0 0 0 ...
$ Children : int 1 1 0 0 0 0 0 1 0 0 ...
$ Comedy : int 1 0 1 1 1 0 1 0 0 0 ...
$ Crime : int 0 0 0 0 0 1 0 0 0 0 ...
$ Documentary: int 0 0 0 0 0 0 0 0 0 0 ...
$ Drama : int 0 0 0 1 0 0 0 0 0 0 ...
$ Fantasy : int 1 1 0 0 0 0 0 0 0 0 ...
$ Film-Noir : int 0 0 0 0 0 0 0 0 0 0 ...
$ Horror : int 0 0 0 0 0 0 0 0 0 0 ...
$ Musical : int 0 0 0 0 0 0 0 0 0 0 ...
$ Mystery : int 0 0 0 0 0 0 0 0 0 0 ...
$ Romance : int 0 0 1 1 0 0 1 0 0 0 ...
$ Sci-Fi : int 0 0 0 0 0 0 0 0 0 0 ...
$ Thriller : int 0 0 0 0 0 1 0 0 0 1 ...
$ War : int 0 0 0 0 0 0 0 0 0 0 ...
$ Western : int 0 0 0 0 0 0 0 0 0 0 ...
> # Creating a ‘search matrix’ - searching films by specifying the genre
> SearchMatrix <- cbind(movie_data[,1:2], genre_mat2[])
> head(SearchMatrix)
movieId title Action Adventure
1 1 Toy Story (1995) 0 1
2 2 Jumanji (1995) 0 1
3 3 Grumpier Old Men (1995) 0 0
4 4 Waiting to Exhale (1995) 0 0
5 5 Father of the Bride Part II (1995) 0 0
6 6 Heat (1995) 1 0
Animation Children Comedy Crime Documentary Drama Fantasy
1 1 1 1 0 0 0 1
2 0 1 0 0 0 0 1
3 0 0 1 0 0 0 0
4 0 0 1 0 0 1 0
5 0 0 1 0 0 0 0
6 0 0 0 1 0 0 0
Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War
1 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0
3 0 0 0 0 1 0 0 0
4 0 0 0 0 1 0 0 0
5 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 1 0
Western
1 0
2 0
3 0
4 0
5 0
6 0
> ratingMatrix <- dcast(rating_data, userId~movieId, value.var = "rating",
na.rm=FALSE)
> ratingMatrix <- as.matrix(ratingMatrix[,-1]) #remove userIds
> #Convert rating matrix into a recommenderlab sparse matrix
> ratingMatrix <- as(ratingMatrix, "realRatingMatrix")
> ratingMatrix
668 x 10325 rating matrix of class ‘realRatingMatrix’ with 105339 ratings.
> # Review some important parameters for building recommendation systems for movies
> recommendation_model <- recommenderRegistry$get_entries(dataType =
"realRatingMatrix")
> names(recommendation_model)
[1] "HYBRID_realRatingMatrix"
[2] "ALS_realRatingMatrix"
[3] "ALS_implicit_realRatingMatrix"
[4] "IBCF_realRatingMatrix"
[5] "LIBMF_realRatingMatrix"
[6] "POPULAR_realRatingMatrix"
[7] "RANDOM_realRatingMatrix"
[8] "RERECOMMEND_realRatingMatrix"
[9] "SVD_realRatingMatrix"
[10] "SVDF_realRatingMatrix"
[11] "UBCF_realRatingMatrix"
> lapply(recommendation_model, "[[", "description")
$HYBRID_realRatingMatrix
[1] "Hybrid recommender that aggegates several recommendation strategies
using weighted averages."
$ALS_realRatingMatrix
[1] "Recommender for explicit ratings based on latent factors, calculated
by alternating least squares algorithm."
$ALS_implicit_realRatingMatrix
[1] "Recommender for implicit data based on latent factors, calculated by
alternating least squares algorithm."
$IBCF_realRatingMatrix
[1] "Recommender based on item-based collaborative filtering."
$LIBMF_realRatingMatrix
[1] "Matrix factorization with LIBMF via package recosystem
(https://cran.r-project.org/web/packages/recosystem/vignettes/introduction.html)."
$POPULAR_realRatingMatrix
[1] "Recommender based on item popularity."
$RANDOM_realRatingMatrix
[1] "Produce random recommendations (real ratings)."
$RERECOMMEND_realRatingMatrix
[1] "Re-recommends highly rated items (real ratings)."
$SVD_realRatingMatrix
[1] "Recommender based on SVD approximation with column-mean imputation."
$SVDF_realRatingMatrix
[1] "Recommender based on Funk SVD with gradient descend
(https://sifter.org/~simon/journal/20061211.html)."
$UBCF_realRatingMatrix
[1] "Recommender based on user-based collaborative filtering."
$method
[1] "cosine"
$normalize
[1] "center"
$normalize_sim_matrix
[1] FALSE
$alpha
[1] 0.5
$na_as_zero
[1] FALSE
$method
[1] "cosine"
$normalize
[1] "center"
$normalize_sim_matrix
[1] FALSE
$alpha
[1] 0.5
$na_as_zero
[1] FALSE