Set-A
Set-A
Q-1)
import pandas as pd
# Movie records, one row per film: [title, genres, rating, duration, votes].
# Defined in this cell so it runs on a fresh kernel; previously `movie_data`
# was read before any definition and the cell raised NameError.
movie_data = [
    ["The Shawshank Redemption", "Drama", 9.3, 142, 2500000],
    ["The Godfather", "Crime, Drama", 9.2, 175, 1800000],
    ["The Dark Knight", "Action, Crime, Drama", 9.0, 152, 2400000],
    ["Inception", "Action, Adventure, Sci-Fi", 8.8, 148, 2100000],
    ["Pulp Fiction", "Crime, Drama", 8.9, 154, 1900000],
    ["Fight Club", "Drama", 8.8, 139, 1950000],
    ["Forrest Gump", "Drama, Romance", 8.8, 142, 1850000],
    ["Matrix", "Action, Sci-Fi", 8.7, 136, 1750000],
    ["Goodfellas", "Biography, Crime, Drama", 8.7, 146, 1650000],
    ["The Silence of the Lambs", "Crime, Drama, Thriller", 8.6, 118, 1350000],
    ["Interstellar", "Adventure, Drama, Sci-Fi", 8.6, 169, 1600000],
    ["Saving Private Ryan", "Drama, War", 8.6, 169, 1400000],
    ["The Green Mile", "Crime, Drama, Fantasy", 8.6, 189, 1200000],
    ["Gladiator", "Action, Adventure, Drama", 8.5, 155, 1300000],
    ["The Departed", "Crime, Drama, Thriller", 8.5, 151, 1250000],
    ["The Prestige", "Drama, Mystery, Sci-Fi", 8.5, 130, 1150000],
    ["The Lion King", "Animation, Adventure, Drama", 8.5, 88, 950000],
    ["Whiplash", "Drama, Music", 8.5, 106, 850000],
    ["The Usual Suspects", "Crime, Mystery, Thriller", 8.5, 106, 1050000],
    ["Eternal Sunshine of the Spotless Mind", "Drama, Romance, Sci-Fi", 8.3, 108, 950000],
]
# Create DataFrame; `columns` labels the record fields in order.
columns = ['Title', 'Genre', 'Rating', 'Duration', 'Votes']
df = pd.DataFrame(movie_data, columns=columns)
# Save to Excel. index=False leaves the row index out of the sheet.
# NOTE: writing .xlsx needs an Excel writer engine (e.g. openpyxl) installed.
df.to_excel('movie_data.xlsx', index=False)
Q-2)
import matplotlib.pyplot as plt
import pandas as pd
# Column labels, given in the same order as the fields of every record below.
columns = ['Title', 'Genre', 'Rating', 'Duration', 'Votes']

# Raw records. A film with several genres keeps them in a single
# comma-separated string.
movie_data = [
    ["The Shawshank Redemption", "Drama", 9.3, 142, 2500000],
    ["The Godfather", "Crime, Drama", 9.2, 175, 1800000],
    ["The Dark Knight", "Action, Crime, Drama", 9.0, 152, 2400000],
    ["Inception", "Action, Adventure, Sci-Fi", 8.8, 148, 2100000],
    ["Pulp Fiction", "Crime, Drama", 8.9, 154, 1900000],
    ["Fight Club", "Drama", 8.8, 139, 1950000],
    ["Forrest Gump", "Drama, Romance", 8.8, 142, 1850000],
    ["Matrix", "Action, Sci-Fi", 8.7, 136, 1750000],
    ["Goodfellas", "Biography, Crime, Drama", 8.7, 146, 1650000],
    ["The Silence of the Lambs", "Crime, Drama, Thriller", 8.6, 118, 1350000],
    ["Interstellar", "Adventure, Drama, Sci-Fi", 8.6, 169, 1600000],
    ["Saving Private Ryan", "Drama, War", 8.6, 169, 1400000],
    ["The Green Mile", "Crime, Drama, Fantasy", 8.6, 189, 1200000],
    ["Gladiator", "Action, Adventure, Drama", 8.5, 155, 1300000],
    ["The Departed", "Crime, Drama, Thriller", 8.5, 151, 1250000],
    ["The Prestige", "Drama, Mystery, Sci-Fi", 8.5, 130, 1150000],
    ["The Lion King", "Animation, Adventure, Drama", 8.5, 88, 950000],
    ["Whiplash", "Drama, Music", 8.5, 106, 850000],
    ["The Usual Suspects", "Crime, Mystery, Thriller", 8.5, 106, 1050000],
    ["Eternal Sunshine of the Spotless Mind", "Drama, Romance, Sci-Fi", 8.3, 108, 950000],
]

# Assemble the working DataFrame from the raw records.
df = pd.DataFrame(data=movie_data, columns=columns)
# One-hot encode the ratings: first bucket every rating into one of four
# labelled bands, then expand the bands into indicator columns whose names
# start with "Rating_".
rating_edges = [0, 3, 5, 7, 10]
rating_labels = ['0-3', '3-5', '5-7', '7-10']
rating_bins = pd.cut(df['Rating'], bins=rating_edges, labels=rating_labels)
rating_dummies = pd.get_dummies(rating_bins, prefix='Rating')
# Measures of central tendency for the two numeric columns of interest.
rating_col = df['Rating']
duration_col = df['Duration']

rating_mean, rating_median = rating_col.mean(), rating_col.median()
# mode() returns a Series (there may be ties); keep its first entry.
rating_mode = rating_col.mode().iloc[0]

duration_mean, duration_median = duration_col.mean(), duration_col.median()
duration_mode = duration_col.mode().iloc[0]
# Measures of variation. The printout below used to reference these six
# names without ever computing them (NameError), so derive them from the
# DataFrame first.
rating_range = df['Rating'].max() - df['Rating'].min()
rating_std = df['Rating'].std()        # pandas default: sample std (ddof=1)
rating_variance = df['Rating'].var()   # pandas default: sample variance (ddof=1)
duration_range = df['Duration'].max() - df['Duration'].min()
duration_std = df['Duration'].std()
duration_variance = df['Duration'].var()

print("\nMeasures of Variation:")
print(f"Rating - Range: {rating_range}, Std: {rating_std}, Variance: {rating_variance}")
print(f"Duration - Range: {duration_range}, Std: {duration_std}, Variance: {duration_variance}")

# Skewness of the rating distribution (computed only; not printed here).
rating_skewness = df['Rating'].skew()
# Histogram of the ratings, drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['Rating'], bins=10, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Movie Ratings')
ax.set_xlabel('Ratings')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()
# Side-by-side boxplots: ratings in the left panel, durations in the right.
fig, (rating_ax, duration_ax) = plt.subplots(1, 2, figsize=(12, 6))

rating_ax.boxplot(df['Rating'])
rating_ax.set_title('Boxplot of Ratings')

duration_ax.boxplot(df['Duration'])
duration_ax.set_title('Boxplot of Duration')

fig.tight_layout()
plt.show()
# Scatter plot with vote counts on the x-axis and ratings on the y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['Votes'], df['Rating'], color='blue', alpha=0.7)
ax.set_title('Scatter Plot of Ratings vs. Votes')
ax.set_xlabel('Votes')
ax.set_ylabel('Ratings')
ax.grid(True)
plt.show()