0% found this document useful (0 votes)
97 views7 pages

Hive Code BDM2 Assignment

The document describes steps to analyze movie rating data using Hive: 1. Data files are stored in HDFS and tables are created in Hive to load the data. This includes ratings, movies, and user data. 2.Queries are written to find the top 30 most and highest rated movies. Top movies are also identified by minimum number of raters. 3. Data is joined to associate ratings to users and movies to analyze average rating by gender and genre. 4. Additional queries find average rating by occupation and count/average rating for each genre.

Uploaded by

Seshadri Deb Roy
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
97 views7 pages

Hive Code BDM2 Assignment

The document describes steps to analyze movie rating data using Hive: 1. Data files are stored in HDFS and tables are created in Hive to load the data. This includes ratings, movies, and user data. 2.Queries are written to find the top 30 most and highest rated movies. Top movies are also identified by minimum number of raters. 3. Data is joined to associate ratings to users and movies to analyze average rating by gender and genre. 4. Additional queries find average rating by occupation and count/average rating for each genre.

Uploaded by

Seshadri Deb Roy
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 7

Q2 a) Hive solutions:

#Creating a directory in hadoop:

hdfs dfs -ls /user/

hdfs dfs -mkdir /user/seshadri

hdfs dfs -mkdir /user/seshadri/data

#Storing the files on hadoop:

hdfs dfs -put u.data /user/Seshadri/data/

hdfs dfs -put u.item /user/Seshadri/data/

hdfs dfs -put u.user /user/Seshadri/data/

hdfs dfs -ls /user/Seshadri/data

#Starting Hive session:

hive

#Creating Hive tables:

create table ratings(user_id INT, item_id INT , rating INT , timestamp INT)

row format delimited

fields terminated by “|”;

load data inpath ‘/user/Seshadri/data/u.data’ overwrite into table ratings;

create table movies(movie_id INT , movie_title STRING, release_date STRING ,video release date
STRING, IMDb_URL STRING , unknown INT, Action INT, Adventure INT, Animation INT, Children INT ,
Comedy INT, Crime INT, Documentry INT, Drama INT, Fantasy INT, Film_Noir INT, Horror INT, Musical
INT, Mystery INT, Romance INT, Sci-Fi INT, Thriller INT , War INT, Western INT)

row format delimited

fields terminated by “| \t”

load data inpath ‘/user/Seshadri/data/u.item’ overwrite into table movies;

create table user(user_id INT, age INT, gender STRING, occupation STRING, zip code STRING)
row format delimited

fields terminated by “|”

load data inpath ‘/user/Seshadri/data/u.user ‘ overwrite into table user;

#The files has been loaded as tables in HIVE datawarehouse

hdfs dfs -ls /user/hive/warehouse

Q2 b)

i) Top 30 most rated movies

Create table most_rated_movies as

Select a.user_id,b.movie_id,b.movie_title,a.rating from ratings a left join movies b on


(a.item_id=b.movie_id) ;

Create table top30_rated_movies as

Select movie_title,count(movie_title) as rating_count from most_rated_movies group by movie_title

Order by rating_count desc limit 30;

select * from top30_rated_movies;

ii) Top 30 Highest rated movies

create table highest_rated_movies as

select movie_title,avg(rating) as avg_rating from most_rated_movies group by movie_title order by


avg_rating desc limit 30;

select * from highest_rated_movies;

iii) Top 30 movies having atleast 50,20 and 100 raters with their average rating

atleast 50 users:
create table t1 as

select movie_title, count(distinct(user_id)) as count_user, avg(rating) as avg_rating from


most_rated_movies group by movie_title having count_user >=50 order by avg_rating desc limit 30;

select * from t1;

#atleast 20 users:

create table t2 as

select movie_title, count(distinct(user_id)) as count_user, avg(rating) as avg_rating from


most_rated_movies group by movie_title having count_user >=20 order by avg_rating desc limit 30;

select * from t2;

#atleast 100 users:

create table t3 as

select movie_title, count(distinct(user_id)) as count_user, avg(rating) as avg_rating from


most_rated_movies group by movie_title having count_user >=100 order by avg_rating desc limit 30;

select * from t3;

Q2 c)

# Joining the ratings table and the user table

create table t4 as

select a.user_id,b.gender,b.occupation,a.item_id,a.rating from ratings a left join user b on


(a.user_id=b.user_id );

#Joining table4 with movies table

create table t5 as

select a.*,b.movie_title,( case

when b.unknown = 1 then "Unknown"

else " "

end) as Unknown,
( case

when b.Action = 1 then "Action"

else " "

end) as Action,

( case

when b.Adventure = 1 then "Adventure"

else " "

end) as Adventure,

( case

when b.Animation = 1 then "Animation"

else " "

end) as Animation,

( case

when b.Children = 1 then "Children"

else " "

end) as Children,

( case

when b.Comedy = 1 then "Comedy"

else " "

end) as Comedy,

( case

when b.Crime = 1 then "Crime"

else " "

end) as Crime,

( case

when b.Documentry = 1 then "Documentry"

else " "

end) as Documentry,

( case
when b.Drama = 1 then "Drama"

else " "

end) as Drama,

( case

when b.Fantasy = 1 then "Fantasy"

else " "

end) as Fantasy,

( case

when b.Film_Noir = 1 then "Film_Noir"

else " "

end) as Film_Noir,

( case

when b.Horror = 1 then "Horror"

else " "

end) as Horror,

( case

when b.Musical = 1 then "Musical"

else " "

end) as Musical,

( case

when b.Mystery = 1 then "Mystery"

else " "

end) as Mystery,

( case

when b.Romance = 1 then "Romance"

else " "

end) as Romance,

( case
when b.Sci_Fi = 1 then "Sci-Fi"

else " "

end) as Sci_Fi,

( case

when b.Thriller = 1 then "Thriller"

else " "

end) as Thriller,

( case

when b.War = 1 then "War"

else " "

end) as War,

( case

when b.Western = 1 then "Western"

else " "

end) as Western

from t4 a left join movies b on (a.item_id=b.movie_id);

# Average rating per gender:

create table t6 as

select gender , avg(rating) as avg_rating from t5 group by gender;

Q2 c) a)

#Creating a column column for genre by concatenating the all the genres.

create table t7 as

select *, Concat(TRIM(unknown),TRIM(Action),TRIM(Adventure) , TRIM(Animation) , TRIM(Children) ,


TRIM(Comedy) , TRIM(Crime) , TRIM(Documentry) , TRIM(Drama) , TRIM(Fantasy) , TRIM(Film_Noir) ,
TRIM(Horror) , TRIM(Musical) , TRIM(Mystery) , TRIM(Romance) , TRIM(Sci_Fi) , TRIM(Thriller) ,
TRIM(War) , TRIM(Western)) as Genre from t5;
#The average rating per gender and genre

#For Males

create table t8 as

select gender,Genre,avg(rating) as avg_rating from t7 where gender ="M" group by gender,Genre order
by avg_rating desc limit 30;

#For females

create table t9 as

select gender,Genre,avg(rating) as avg_rating from t7 where gender ="F" group by gender,Genre order
by avg_rating desc limit 30;

Q2 d) The average rating given by users from each occupations

create table t10 as

select occupation,avg(rating) as avg_rating from t4 group by occupation order by avg_rating desc ;

Q2 e) The count of ratings and average rating of each genre.

create table t11 as

select Genre,count(rating) as count_rating,avg(rating) as avg_rating from t7 group by Genre order by


avg_rating desc ;

You might also like