0% found this document useful (0 votes)
9 views · 3 pages

Experiment 7 ML

Uploaded by

Rishubh Gandhi
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
9 views · 3 pages

Experiment 7 ML

Uploaded by

Rishubh Gandhi
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 3

Experiment 7

Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the movie-ratings sample and take a first look.
data = pd.read_csv('ratings_sample.csv')
data.head()

# URL-style '+' separators in movie titles become spaces.
# regex=False is required: '+' is a regex metacharacter ("one or more"),
# so the default regex interpretation of str.replace would raise
# ("nothing to repeat") or silently misbehave depending on pandas version.
data['movie_id'] = data['movie_id'].str.replace('+', ' ', regex=False)

data.describe()
data.info()

# Count missing values, drop any incomplete rows, then re-check to
# confirm the frame is fully populated before feature engineering.
data.isnull().sum()
data = data.dropna()
data.isnull().sum()
data.info()

# Assign unique integer IDs to each distinct categorical value
# (label encoding via factorize; codes are 0..k-1 in order of appearance).
data['movie_id'] = pd.factorize(data['movie_id'])[0]
data['production_companies'] = pd.factorize(data['production_companies'])[0]
data['production_countries'] = pd.factorize(data['production_countries'])[0]
import nltk

# Resources needed by the text preprocessing below.
# 'punkt' is required by word_tokenize — the original script omitted it,
# so the first call to word_tokenize would raise a LookupError.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Loop-invariant NLP objects: build once here, reuse for every row.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Function to preprocess text
def preprocess_text(text):
    """Clean one overview string for TF-IDF vectorization.

    Steps: strip HTML tags, drop non-alphabetic characters, lowercase,
    tokenize, remove English stopwords, and lemmatize each word.
    Returns the cleaned tokens re-joined into a single space-separated
    string (the form TfidfVectorizer expects).
    """
    text = re.sub(r'<[^>]+>', '', text)      # remove HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text)   # keep letters only
    text = text.lower()
    words = word_tokenize(text)
    # Drop stopwords and reduce each remaining word to its lemma.
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)
# Apply preprocessing to the 'overview' column
data['overview'] = data['overview'].apply(preprocess_text)

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF bag-of-words over the cleaned overviews (top 1000 terms).
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
overview_features = tfidf_vectorizer.fit_transform(data['overview'])
overview_features_array = overview_features.toarray()

# One-hot encode genres (space-separated in the 'genres' column).
genres_list = data['genres'].str.split(' ')
unique_genres = set(genre for sublist in genres_list for genre in sublist)
for genre in unique_genres:
    # Exact token membership rather than str.contains: contains() would
    # (a) interpret the genre name as a regular expression and
    # (b) match substrings, e.g. 'War' would wrongly flag 'Warfare'.
    data[genre] = genres_list.apply(lambda tokens: int(genre in tokens))
data.drop('genres', axis=1, inplace=True)
data.info()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Reuse the TF-IDF matrix already computed above — the original script
# fitted a second, identical TfidfVectorizer on the same column, doing
# the same work twice for the same result.
# NOTE(review): the factorized ID columns and the one-hot genre columns
# are built but never joined into the feature matrix; only the overview
# text is used here — confirm this is intentional.
combined_features = overview_features

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    combined_features, data['rating'], test_size=0.2, random_state=42
)

# Ordinary least-squares baseline on the sparse TF-IDF features.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
Output:

You might also like