# Experiment 7 ML
# Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the ratings sample. The bare expressions below (head/describe/
# isnull().sum()) only show output in an interactive session such as a
# notebook; data.info() prints in any context.
data = pd.read_csv('ratings_sample.csv')
data.head()
# '+' is a regex metacharacter, so request a literal replacement explicitly
# instead of relying on the pandas-version-dependent default of `regex`.
data['movie_id'] = data['movie_id'].str.replace('+', ' ', regex=False)
data.describe()
data.info()
data.isnull().sum()
# Drop every row that has at least one missing value, then re-check.
data = data.dropna()
data.isnull().sum()
data.info()
# Replace each distinct value of these columns with an integer code
# (codes are assigned in order of first appearance by pd.factorize).
data['movie_id'] = pd.factorize(data['movie_id'])[0]
data['production_companies'] = pd.factorize(data['production_companies'])[0]
data['production_countries'] = pd.factorize(data['production_countries'])[0]
import nltk
# Resources needed by the text preprocessing below: the English stop-word
# list and the WordNet data used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# word_tokenize depends on the Punkt tokenizer data, which was never fetched
# and would raise LookupError on a clean machine. Newer NLTK releases look
# for 'punkt_tab' rather than 'punkt', so request both; nltk.download()
# returns False instead of raising when an identifier is unknown.
nltk.download('punkt')
nltk.download('punkt_tab')
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Initialize WordNet lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
# A set gives O(1) membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
# Function to preprocess text
def preprocess_text(text):
    """Clean a raw text string and return its lemmatized tokens as one string.

    Steps, in order: remove ``<...>`` tag-like spans, replace every character
    that is not an ASCII letter with a space, lower-case the result, tokenize
    it with ``word_tokenize``, discard tokens found in ``stop_words`` and
    lemmatize the remaining tokens with ``lemmatizer``.

    Args:
        text: The string to clean.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    without_tags = re.sub(r'<[^>]+>', '', text)
    letters_only = re.sub(r'[^a-zA-Z]', ' ', without_tags).lower()
    lemmas = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)
# Apply preprocessing to the 'overview' column
# (each overview string is replaced in place by its cleaned, lemmatized form)
data['overview'] = data['overview'].apply(preprocess_text)
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn the cleaned overviews into TF-IDF features; max_features=1000 caps the
# vocabulary at 1000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
# Learn the vocabulary from the overviews and vectorize them in one step.
overview_features = tfidf_vectorizer.fit_transform(data['overview'])
# Materialize the vectorizer output as a dense array (one row per overview).
overview_features_array = overview_features.toarray()
# Split the genres into individual genres.
# Splitting on runs of whitespace (no argument) avoids the empty-string
# tokens that split(' ') yields for repeated or leading/trailing spaces.
genres_list = data['genres'].str.split()
# Get unique genres
unique_genres = {genre for tokens in genres_list for genre in tokens}
# Add one 0/1 indicator column per genre token.
# - Exact token membership is used instead of str.contains, which would
#   treat the genre as a regular expression and also match substrings.
# - Iterating in sorted order keeps the column order the same on every run
#   (plain set iteration order for strings is not stable across runs).
for genre in sorted(unique_genres):
    data[genre] = [int(genre in tokens) for tokens in genres_list]
# The raw genre string is no longer needed once the indicators exist.
data.drop('genres', axis=1, inplace=True)
data.info()