The document outlines a machine learning workflow for text classification using tweets, including data preprocessing, tokenization, and model training with a neural network. It employs techniques such as stemming, TF-IDF vectorization, and LSTM architecture for multi-class classification. The model's performance is evaluated and visualized through accuracy and loss plots, and the trained model is saved for future use.


import os
import re
import pickle

import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, Flatten, Dropout
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# import nltk; nltk.download('stopwords')  # run once if the stopwords corpus is missing

df = pd.read_csv("train.csv")  # load the labeled tweets

df.head()

df.info()

check_null = df.isnull().sum()

check_null

df.shape

df["Tweet"].describe()

df["Category"].describe()

len(df["Tweet"])

for i in range(len(df["Tweet"])):
    # remove HTML tags
    df.loc[i, "Tweet"] = re.sub(r'<[^<>]+>', " ", df.loc[i, "Tweet"])
    # remove special characters (keep letters, digits and whitespace)
    df.loc[i, "Tweet"] = re.sub(r'[^a-zA-Z0-9\s]', " ", df.loc[i, "Tweet"])
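# Illustrative effect of the two regexes on a made-up tweet:
# "<b>Great</b> phone!! #deal"  ->  " Great  phone   deal"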

df.head()

df["Tweet"][1]

port_stem = PorterStemmer()
english_stopwords = set(stopwords.words('english'))  # build the stop-word set once

def stemming(content):
    # replace any non-alphabetic characters in the content with a space
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    # convert all words to lower case
    stemmed_content = stemmed_content.lower()
    # split the text into a list of words
    stemmed_content = stemmed_content.split()
    # stem each word, excluding any stop words
    stemmed_content = [port_stem.stem(word) for word in stemmed_content
                       if word not in english_stopwords]
    # join the stemmed words into a single string separated by spaces
    stemmed_content = " ".join(stemmed_content)
    return stemmed_content

df["Tweet"]= df["Tweet"].apply(stemming)
df["Tweet"]

tokenizer = Tokenizer(num_words=5000) # unique words limit set to 5000

tokenizer.fit_on_texts(df['Tweet'])

X = tokenizer.texts_to_sequences(df['Tweet'])

X[0]

len(X[0])

# pad/truncate so that every tweet sequence has length 500
X = pad_sequences(X, maxlen=500)

X[0]

len(X[0])
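# pad_sequences pads (and truncates) at the start by default, e.g.:
# pad_sequences([[5, 12, 7]], maxlen=5)  ->  [[0, 0, 5, 12, 7]]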

# Re-encode each padded sequence as a space-separated string of token IDs
# (dropping the 0 padding tokens) so it can be fed to TfidfVectorizer
documents = []
for sequence in X:
    text = " ".join(str(token) for token in sequence if token != 0)
    documents.append(text)

# Create a TfidfVectorizer and compute TF-IDF scores over the ID strings
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(documents)

print(x)
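# x is a SciPy sparse matrix; a quick look at its size and sparsity
# before densifying it for the network:
print(x.shape)  # (number of tweets, TF-IDF vocabulary size)
print(x.nnz)    # number of non-zero entries actually stored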

Y = df['Category']

X_train, X_test, Y_train, Y_test = train_test_split(x, Y, test_size=0.2)
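# Note: train_test_split shuffles randomly on every run; passing
# random_state=<some int> makes the split reproducible, and stratify=Y
# keeps the class proportions equal across the two splits.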

X_train.shape

# persist the fitted tokenizer so the exact same vocabulary can be reused later
with open('tokenizer.pickle', "wb") as pickle_out:
    pickle.dump(tokenizer, pickle_out)
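# To reuse the fitted tokenizer later (e.g. at inference time), reload it
# with pickle -- a minimal sketch:
# with open('tokenizer.pickle', 'rb') as handle:
#     loaded_tokenizer = pickle.load(handle)
# seq = loaded_tokenizer.texts_to_sequences([stemming("some new tweet")])
# padded = pad_sequences(seq, maxlen=500)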

# one-hot encode the category labels
y_train = pd.get_dummies(Y_train)
y_test = pd.get_dummies(Y_test)

y_train
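# get_dummies one-hot encodes the labels: e.g. categories ['b', 'a', 'b']
# become columns a=[0, 1, 0] and b=[1, 0, 1] (columns sorted alphabetically)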

vocab_size = len(tokenizer.word_index) + 1  # +1 reserves the 0 (padding) index for an Embedding layer

vocab_size

print(y_train.shape)
print(y_test.shape)
y_train.head()

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Reshape
# Sequential, Dense, Bidirectional and LSTM were already imported above

# Convert the sparse TF-IDF matrices to dense NumPy arrays
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Convert the one-hot label DataFrames to NumPy arrays
y_train_np = y_train.to_numpy()
y_test_np = y_test.to_numpy()

# Define the model architecture with the correct output shape for
# multi-class classification
num_classes = 4   # actual number of classes in the data
timesteps = 64    # desired number of timesteps for the LSTM input

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_dense.shape[1],)),
    Dense(64, activation='relu'),
    # reshape the 64-dim Dense output to (timesteps, features)
    Reshape((timesteps, -1)),
    Bidirectional(LSTM(64)),
    # softmax activation for multi-class classification
    Dense(num_classes, activation='softmax')
])
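# Shape walk-through: Dense(64) emits a 64-dim vector per sample;
# Reshape((64, -1)) turns it into a (64, 1) sequence, i.e. 64 timesteps of
# 1 feature each, which is the 3-D input the Bidirectional LSTM expects.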

# Compile the model with categorical_crossentropy loss for multi-class classification
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

earlyStopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                              patience=8)

# fit the model, monitoring validation metrics on the held-out test split
modelTraining = model.fit(X_train_dense, y_train_np,
                          batch_size=64,
                          epochs=15,
                          validation_data=(X_test_dense, y_test_np),
                          callbacks=[earlyStopping])
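# Note: EarlyStopping also accepts restore_best_weights=True, which rolls the
# model back to the weights from the epoch with the best monitored value.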

# Evaluate the model on the held-out test split
score = model.evaluate(X_test_dense, y_test_np, verbose=0)

print("Test_accuracy = ", score[1])

# Store the per-epoch metrics from the training history
epochs = range(1, len(modelTraining.history['accuracy']) + 1)
accuracy = modelTraining.history['accuracy']
loss = modelTraining.history['loss']
val_accuracy = modelTraining.history['val_accuracy']
val_loss = modelTraining.history['val_loss']

import matplotlib.pyplot as plt

# Plotting
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(epochs, accuracy, 'r', label='Training Accuracy')
plt.plot(epochs, val_accuracy, 'b', label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

# save the trained model (TensorFlow SavedModel directory) for future use
model.save('my')
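# Minimal inference sketch, assuming the objects above are still in memory
# and mirroring the training-time preprocessing (the sample tweet is made up):
new_seq = tokenizer.texts_to_sequences([stemming("great phone but the battery dies fast")])
new_padded = pad_sequences(new_seq, maxlen=500)
new_doc = " ".join(str(token) for token in new_padded[0] if token != 0)
new_features = vectorizer.transform([new_doc]).toarray()
loaded = load_model('my')                    # reload the saved model
prediction = loaded.predict(new_features)
print(y_train.columns[prediction.argmax()])  # map the argmax back to a category name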
