0% found this document useful (0 votes)
76 views4 pages

Data Analytics of Theatres Using Seaborn and Plotly

The document discusses preprocessing and exploring a movie revenue dataset to build predictive models. It loads the data, creates visualizations to understand relationships between variables like budget, language and revenue. It extracts features from movie titles, descriptions and release dates to understand their impact on revenue. Plots and visualizations are created to analyze trends in revenue over time and across different movie characteristics to help explain drivers of box office performance.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
76 views4 pages

Data Analytics of Theatres Using Seaborn and Plotly

The document discusses preprocessing and exploring a movie revenue dataset to build predictive models. It loads the data, creates visualizations to understand relationships between variables like budget, language and revenue. It extracts features from movie titles, descriptions and release dates to understand their impact on revenue. Plots and visualizations are created to analyze trends in revenue over time and across different movie characteristics to help explain drivers of box office performance.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 4

import numpy as np

import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
import datetime
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nlrk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import nltk
nltk.download('stopwords')
stop=set(stopwords.words('english'))
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import ployly.graph_objs as go
import plotly.tools as tls
import json
import ast
from urllib.request import urlopen
from PIL import Image

Task 1: Data loading and exploration


# Read both dataset splits from the local data directory, then preview the
# first rows of the training split.
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')
train.head(5)

Task 2: Visualize Data


# Store the log1p-transformed target once; later cells read 'log_revenue'.
train['log_revenue'] = np.log1p(train['revenue'])

# Side-by-side histograms of raw and log-transformed revenue.
# plt.figure() is used instead of plt.subplots(): the latter would also
# create a full-figure axes that the two plt.subplot() panels then overlap.
fig = plt.figure(figsize=(16, 6))
plt.subplot(1, 2, 1)
sns.distplot(train['revenue'], kde=False)
plt.title('Distribution of revenue')
plt.subplot(1, 2, 2)
sns.distplot(train['log_revenue'], kde=False)
plt.title('Distribution of log-transformed revenue')

Task 3: Relationship between film revenue and budget


# Log-transform the budget in both splits first so the plot and the stored
# feature are computed from the same expression.
train['log_budget'] = np.log1p(train['budget'])
test['log_budget'] = np.log1p(test['budget'])

# Revenue against budget, on the raw scale (left) and the log scale (right).
# x/y are passed by keyword: recent seaborn releases reject positional
# vectors, while keywords work on older releases too.
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
sns.scatterplot(x=train['budget'], y=train['revenue'])
plt.title('Revenue vs Budget');
plt.subplot(1, 2, 2)
sns.scatterplot(x=train['log_budget'], y=train['log_revenue'])
plt.title('Log Revenue vs Log Budget');

Task 4: Does having an official homepage affect revenue?


# Ten most frequent values in the homepage column.
train['homepage'].value_counts().head(10)

# Binary feature: 1 if the movie has a homepage, 0 if the field is missing.
train['has_homepage'] = train['homepage'].notnull().astype('int64')
test['has_homepage'] = test['homepage'].notnull().astype('int64')

sns.catplot(x='has_homepage', y='revenue', data=train);
plt.title('Revenue for films with and without a homepage')

Task 5: Distribution of languages in film


# Keep only the films whose original language is among the ten most common.
top_languages = train['original_language'].value_counts().head(10).index
language_data = train.loc[train['original_language'].isin(top_languages)]

# Both box plots share one figure; opening a second figure before the
# right-hand panel would leave each figure half empty.
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
sns.boxplot(x='original_language', y='revenue', data=language_data)
plt.title('Mean revenue per language');
plt.subplot(1, 2, 2)
sns.boxplot(x='original_language', y='log_revenue', data=language_data)
plt.title('Mean Log revenue per language');

Task 6: Frequent Words in film titles and descriptions


# Draw one word cloud per text column. Missing values are replaced with ''
# for both columns, because str.join raises TypeError on a NaN entry.
for column, title in (
    ('original_title', 'Top words across movie titles'),
    ('overview', 'Top words across movie overviews'),
):
    plt.figure(figsize=(12, 12))
    text = ' '.join(train[column].fillna('').values)
    wordcloud = WordCloud(max_font_size=None,
                          background_color='white',
                          width=1200, height=1000).generate(text)
    plt.imshow(wordcloud)
    plt.title(title)
    plt.axis('off')
    plt.show()

Task 7: Do film descriptions impact revenue?


import eli5
from sklearn.linear_model import LinearRegression

# TF-IDF over single words and word pairs, ignoring terms that appear in
# fewer than five overviews.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    min_df=5,
)
overview_text = vectorizer.fit_transform(train['overview'].fillna(''))

# Fit a linear model of log revenue on the TF-IDF features.
linreg = LinearRegression()
linreg.fit(overview_text, train['log_revenue'])

# Show the 20 highest-weighted features, leaving out the intercept entry.
eli5.show_weights(
    linreg,
    vec=vectorizer,
    top=20,
    feature_filter=lambda name: name != '<BIAS>',
)

Task 8: Analyzing movie release dates


# Preview the first few non-missing release dates of the test split.
test.loc[test['release_date'].notnull(), 'release_date'].head()

Task 9: preprocessing features


# The release_date column stores two-digit years, so the century is
# ambiguous; fix_date expands each year to four digits.
def fix_date(x, pivot_year=19):
    """Expand the two-digit year of a slash-separated date string.

    Args:
        x: Date string whose third '/'-separated field is a two-digit
            year, e.g. '2/20/15'.
        pivot_year: Two-digit years less than or equal to this value are
            placed in the 2000s; larger ones in the 1900s.

    Returns:
        The date string with a four-digit year. Non-string values (such as
        NaN) and dates whose year field is not exactly two characters are
        returned unchanged, so applying the function twice is harmless.
    """
    if not isinstance(x, str):
        return x
    year = x.split('/')[2]
    if len(year) != 2:
        return x
    century = '20' if int(year) <= pivot_year else '19'
    return x[:-2] + century + year
# Inspect the test rows whose release date is missing.
test.loc[test['release_date'].isnull()].head()
# Give those rows a placeholder date; its '00' year is expanded to 2000 by
# fix_date below.
test.loc[test['release_date'].isnull(), 'release_date'] = '05/01/00'
train['release_date'] = train['release_date'].apply(fix_date)
test['release_date'] = test['release_date'].apply(fix_date)

Task 10: creating features based on release date


# Convert the release_date strings of both splits to datetime values.
for frame in (train, test):
    frame['release_date'] = pd.to_datetime(frame['release_date'])
def process_date(df):
    """Add integer calendar features derived from ``df['release_date']``.

    For each of year, weekday, month, weekofyear, day and quarter, a column
    named ``release_date_<part>`` is written into ``df``.

    Args:
        df: DataFrame with a datetime ``release_date`` column.

    Returns:
        The same DataFrame, modified in place.
    """
    date_parts = ['year', 'weekday', 'month', 'weekofyear', 'day', 'quarter']
    dates = df['release_date'].dt
    for part in date_parts:
        if part == 'weekofyear' and hasattr(dates, 'isocalendar'):
            # .dt.weekofyear is gone from recent pandas; isocalendar().week
            # is the replacement where available.
            values = dates.isocalendar().week
        else:
            values = getattr(dates, part)
        df['release_date' + '_' + part] = values.astype(int)
    return df
# Add the release-date calendar features to both splits.
train, test = map(process_date, (train, test))

Task 11: Using plotly to visualize the number of films per year
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Films released per year, counted separately for each split.
d1 = train['release_date_year'].value_counts().sort_index()
d2 = test['release_date_year'].value_counts().sort_index()

# One line trace per split.
data = [
    go.Scatter(x=counts.index, y=counts.values, name=label)
    for label, counts in (('train', d1), ('test', d2))
]

axes_and_title = dict(
    title='Number of films per year',
    xaxis=dict(title='Year'),
    yaxis=dict(title='Count'),
)
layout = go.Layout(axes_and_title, legend=dict(orientation='v'))
py.iplot(dict(data=data, layout=layout))

Task 12: Number of films and revenue per year


# Per-year film count and per-year total revenue from the training split.
d1 = train['release_date_year'].value_counts().sort_index()
d2 = train.groupby(['release_date_year'])['revenue'].sum()

# The revenue trace is bound to a secondary y-axis ('y2') because its scale
# differs from the film count.
data = [
    go.Scatter(x=d1.index, y=d1.values, name='filmcount'),
    go.Scatter(x=d2.index, y=d2.values, name='total_revenue', yaxis='y2'),
]

# The secondary axis is configured under 'yaxis2'; repeating the 'yaxis'
# keyword is a syntax error and would not match the trace's yaxis='y2'.
layout = go.Layout(
    dict(
        title='Number of films and Total Revenue per year',
        xaxis=dict(title='Year'),
        yaxis=dict(title='Count'),
        yaxis2=dict(title='Total Revenue', overlaying='y', side='right'),
    ),
    legend=dict(orientation='v'),
)
py.iplot(dict(data=data, layout=layout))

Task 13: Do release days impact revenue?


# Revenue of each film, grouped by the weekday of its release.
sns.catplot(data=train, x='release_date_weekday', y='revenue')
plt.title('Revenue of different days in the week');

Task 14: Relationship between runtime and revenue


# Runtime converted to hours; films with a missing runtime count as 0.
runtime_hours = train['runtime'].fillna(0) / 60

# Each plot gets its own figure so the scatter is not drawn on top of the
# histogram when both statements run in the same cell.
plt.figure()
sns.distplot(runtime_hours, bins=40, kde=False);
plt.title('Distribution of the length of films in hours');

plt.figure()
# x/y are passed by keyword: recent seaborn releases reject positional
# vectors, while keywords work on older releases too.
sns.scatterplot(x=runtime_hours, y=train['revenue']);
plt.title('runtime vs revenue');

You might also like