Data Analytics of Theatres Using Seaborn and Plotly
Data Analytics of Theatres Using Seaborn and Plotly
import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
import datetime
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nlrk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import nltk
nltk.download('stopwords')
stop=set(stopwords.words('english'))
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import ployly.graph_objs as go
import plotly.tools as tls
import json
import ast
from urllib.request import urlopen
from PIL import Image
# Binary feature: has_homepage is 1 when the 'homepage' value is present and
# 0 when it is missing. Computed the same way for the train and test frames.
# int64 matches the dtype produced by the previous "fill with 0, then set 1"
# approach.
train['has_homepage'] = train['homepage'].notnull().astype('int64')
test['has_homepage'] = test['homepage'].notnull().astype('int64')
# Word cloud of the movie overviews in the training frame.
# Missing overviews are replaced by empty strings, then every overview is
# joined into one space-separated string for the word cloud generator.
text = ' '.join(train['overview'].fillna('').values)

cloud_options = {
    'max_font_size': None,
    'background_color': 'white',
    'width': 1200,
    'height': 1000,
}
wordcloud = WordCloud(**cloud_options).generate(text)

# Render the cloud on a 12x12 figure with a title and no axes.
plt.figure(figsize=(12, 12))
plt.imshow(wordcloud)
plt.title('Top words across movie overviews')
plt.axis('off')
plt.show()
# Fit a linear regression of log_revenue on TF-IDF features of the overviews,
# then display the 20 highest-weighted features with eli5.
# NOTE(review): LinearRegression and eli5 are not imported in the visible part
# of this notebook -- confirm an earlier cell imports them.
overview_docs = train['overview'].fillna('')

# Word-level unigrams and bigrams; min_df=5 is the minimum document frequency
# a term needs to be kept in the vocabulary.
tfidf_options = {
    'sublinear_tf': True,
    'analyzer': 'word',
    'token_pattern': r'\w{1,}',
    'ngram_range': (1, 2),
    'min_df': 5,
}
vectorizer = TfidfVectorizer(**tfidf_options)
overview_text = vectorizer.fit_transform(overview_docs)

linreg = LinearRegression()
linreg.fit(overview_text, train['log_revenue'])


def _is_not_bias(feature_name):
    """Return True for every feature except the '<BIAS>' entry."""
    return feature_name != '<BIAS>'


eli5.show_weights(linreg, vec=vectorizer, top=20, feature_filter=_is_not_bias)
Task 11: Using plotly to visualize the number of films per year
# Occurrences of each release_date_year value, ordered by year:
# d1 comes from the train frame, d2 from the test frame.
d1, d2 = (
    frame['release_date_year'].value_counts().sort_index()
    for frame in (train, test)
)
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go