Unstructured Data Classification Fresco

The document loads IMDB movie review data, preprocesses it by tokenizing, lemmatizing, and removing stop words, then splits it into training and test sets. It then trains two classifiers, an SVM classifier and an SGD classifier, on the training data and evaluates their performance on the test set. Key steps include data loading and preprocessing, feature extraction using CountVectorizer, model training on the training set, and evaluation on the test set.


#Data Loading

import pandas as pd

imdb = pd.read_csv('imdb.csv')
imdb.columns = ["index", "text", "label"]
print(imdb.head(5))

-------------------------------------------------------------

data_size = imdb.shape
print(data_size)

imdb_col_names = list(imdb.columns)
print(imdb_col_names)

print(imdb.groupby('label').describe())
print(imdb.head(3))

-------------------------------------------------------------

imdb_target=imdb['label']

print(imdb_target)

-------------------------------------------------------------

import nltk
from nltk.tokenize import word_tokenize

nltk.download('all')  # downloads all NLTK data (large; punkt, wordnet and stopwords are what this script needs)

def split_tokens(text):
    # Lowercase the review, then split it into word tokens.
    message = text.lower()
    word_tokens = word_tokenize(message)
    return word_tokens

imdb['tokenized_message'] = imdb.apply(lambda row: split_tokens(row['text']), axis=1)
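
As a quick illustration (the sample sentence below is made up, not taken from the dataset), NLTK's word_tokenize splits a lowercased review into word and punctuation tokens:

# Illustrative sketch only; the sample text is invented.
from nltk.tokenize import word_tokenize
sample = "this movie was great!"
print(word_tokenize(sample))
# ['this', 'movie', 'was', 'great', '!']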

-------------------------------------------------------------

from nltk.stem.wordnet import WordNetLemmatizer

def split_into_lemmas(text):
    # Reduce each token to its WordNet lemma.
    lemma = []
    lemmatizer = WordNetLemmatizer()
    for word in text:
        a = lemmatizer.lemmatize(word)
        lemma.append(a)
    return lemma

imdb['lemmatized_message'] = imdb.apply(lambda row: split_into_lemmas(row['tokenized_message']), axis=1)

print('Tokenized message:', imdb['tokenized_message'][55])

print('Lemmatized message:', imdb['lemmatized_message'][55])
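
Worth noting: WordNetLemmatizer assumes the noun part of speech unless told otherwise, so with the call used above verb forms pass through unchanged. A small sketch (the example words are chosen for illustration):

from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('movies'))        # 'movie'
print(lemmatizer.lemmatize('running'))       # 'running' (treated as a noun by default)
print(lemmatizer.lemmatize('running', 'v'))  # 'run' (explicit verb POS)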

-------------------------------------------------------------

from nltk.corpus import stopwords

def stopword_removal(text):
    # Drop English stop words, then join the surviving tokens back into one string.
    stop_words = set(stopwords.words('english'))
    filtered_sentence = ' '.join([word for word in text if word not in stop_words])
    return filtered_sentence

imdb['preprocessed_message'] = imdb.apply(lambda row: stopword_removal(row['lemmatized_message']), axis=1)

print('Preprocessed message:', imdb['preprocessed_message'])

training_data = pd.Series(list(imdb['preprocessed_message']))
training_label = pd.Series(list(imdb['label']))
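
To see what the stop-word filter does on its own, here is a minimal sketch with a made-up token list; words such as 'this' and 'was' are in NLTK's English stop-word set and get dropped:

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
tokens = ['this', 'movie', 'was', 'great', '!']
print(' '.join(word for word in tokens if word not in stop_words))
# 'movie great !'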

-------------------------------------------------------------

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Term-document matrix of unigram and bigram counts.
tf_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                min_df=(1 / len(training_label)),
                                max_df=0.7)

Total_Dictionary_TDM = tf_vectorizer.fit(training_data)

message_data_TDM = Total_Dictionary_TDM.transform(training_data)
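
With min_df set to 1/len(training_label), a term is kept if it appears in at least one document, while max_df=0.7 discards terms present in more than 70% of reviews. A toy sketch of the resulting term-document matrix (the three mini-documents are invented; get_feature_names_out assumes scikit-learn >= 1.0):

from sklearn.feature_extraction.text import CountVectorizer
toy = ['good movie', 'bad movie', 'good plot']
cv = CountVectorizer(ngram_range=(1, 2))
tdm = cv.fit_transform(toy)
print(cv.get_feature_names_out())  # unigrams and bigrams
print(tdm.toarray())               # raw counts per document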

-------------------------------------------------------------
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                   min_df=(1 / len(training_label)),
                                   max_df=0.7)

Total_Dictionary_TFIDF = tfidf_vectorizer.fit(training_data)

message_data_TFIDF = Total_Dictionary_TFIDF.transform(training_data)
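
TF-IDF rescales raw counts by how rare a term is across the corpus, down-weighting terms that appear nearly everywhere. Note that the classifiers below are trained on message_data_TDM (raw counts); message_data_TFIDF is built but not used further. A quick check that both matrices cover the same documents and vocabulary parameters:

print(message_data_TDM.shape, message_data_TFIDF.shape)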

-------------------------------------------------------------

# Splitting the data for training and testing
from sklearn.model_selection import train_test_split

train_data, test_data, train_label, test_label = train_test_split(
    message_data_TDM, training_label, test_size=0.1)
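
test_size=0.1 holds out 10% of the rows for evaluation. The split is random on each run; to make the reported scores reproducible one could pass a fixed random_state (an optional tweak, not in the original):

train_data, test_data, train_label, test_label = train_test_split(
    message_data_TDM, training_label, test_size=0.1, random_state=9)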

-------------------------------------------------------------

seed = 9
from sklearn.svm import SVC

train_data_shape = train_data.shape
test_data_shape = test_data.shape

print("The shape of train data : ", train_data.shape)
print("The shape of test data : ", test_data.shape)

# Linear-kernel SVM; the small C value favours a wider margin (stronger regularization).
classifier = SVC(kernel="linear", C=0.025, random_state=seed)
classifier = classifier.fit(train_data, train_label)

# score() returns mean accuracy on the held-out test set.
score = classifier.score(test_data, test_label)
print('SVM Classifier : ', score)

with open('output.txt', 'w') as file:
    file.write(str((imdb['tokenized_message'][55], imdb['lemmatized_message'][55])))
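
Once fitted, the same vectorizer and classifier can label a new review. A hedged sketch with an invented, already-preprocessed review string (real input would have to go through the same tokenize/lemmatize/stop-word pipeline first):

new_review = ["movie great plot"]            # made-up, pre-cleaned text
new_vec = tf_vectorizer.transform(new_review)
print(classifier.predict(new_vec))           # predicted label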

-------------------------------------------------------------

from sklearn.linear_model import SGDClassifier

train_data, test_data, train_label, test_label = train_test_split(
    message_data_TDM, training_label, test_size=0.1)

train_data_shape = train_data.shape
test_data_shape = test_data.shape

print("The shape of train data : ", train_data.shape)
print("The shape of test data : ", test_data.shape)

# Linear model trained with stochastic gradient descent;
# modified_huber is a smooth loss that is tolerant of outliers.
classifier = SGDClassifier(loss='modified_huber', shuffle=True, random_state=seed)
classifier = classifier.fit(train_data, train_label)

score = classifier.score(test_data, test_label)
print('SGD classifier : ', score)

with open('output1.txt', 'w') as file:
    file.write(str((imdb['preprocessed_message'][55])))
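
Because modified_huber is a smooth loss, SGDClassifier also exposes probability estimates via predict_proba, which the default hinge loss would not. For example:

proba = classifier.predict_proba(test_data)
print(proba[:3])  # class probabilities for the first three test reviews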

-------------------------------------------------------------
