code text

The document outlines a machine learning workflow for text classification using tweets, including data preprocessing, tokenization, and model training with a neural network. It employs techniques such as stemming, TF-IDF vectorization, and an LSTM architecture for multi-class classification. The model's performance is evaluated and visualized through accuracy and loss plots, and the trained model is saved for future use.

import os

import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, Flatten, Dropout
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
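Note that the NLTK stop-word list used below is not bundled with a default NLTK install; if it has never been fetched on the machine, a one-time download is needed (a minimal sketch, assuming network access):

import nltk

# one-time download of the English stop-word corpus used by stemming() below;
# safe to re-run, since NLTK skips corpora that are already present
nltk.download('stopwords')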

df=pd.read_csv("train.csv")

df.head()

df.info()

check_null = df.isnull().sum()

check_null

df.shape

df["Tweet"].describe()

df["Category"].describe()

len(df["Tweet"])

for i in range(len(df["Tweet"])):
    # remove HTML tags
    df.loc[i, "Tweet"] = re.sub(r'<[^<>]+>', " ", df.loc[i, "Tweet"])
    # replace special characters (anything but letters, digits, whitespace) with spaces
    df.loc[i, "Tweet"] = re.sub(r'[^a-zA-Z0-9\s]', " ", df.loc[i, "Tweet"])

df.head()

df["Tweet"][1]

port_stem = PorterStemmer()

def stemming(content):
    # replace any non-alphabetic characters in the content with a space
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    # convert all words to lower case
    stemmed_content = stemmed_content.lower()
    # split the string into a list of words
    stemmed_content = stemmed_content.split()
    # stem each word, excluding English stop words
    stemmed_content = [port_stem.stem(word) for word in stemmed_content
                       if word not in stopwords.words('english')]
    # join the stemmed words back into a single space-separated string
    stemmed_content = " ".join(stemmed_content)
    return stemmed_content

df["Tweet"]= df["Tweet"].apply(stemming)
df["Tweet"]

tokenizer = Tokenizer(num_words=5000) # unique words limit set to 5000

tokenizer.fit_on_texts(df['Tweet'])

X = tokenizer.texts_to_sequences(df['Tweet'])

X[0]

len(X[0])
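To see what the tokenizer is doing, a small illustration on a toy corpus (the exact integer IDs depend on word frequencies, with more frequent words getting smaller IDs):

# illustration only: a fresh Tokenizer fitted on two toy texts
toy = Tokenizer(num_words=5000)
toy.fit_on_texts(["good game", "bad game"])
print(toy.word_index)                         # {'game': 1, 'good': 2, 'bad': 3}
print(toy.texts_to_sequences(["good game"]))  # [[2, 1]]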

# padding so that all tweets will be of length 500
X = pad_sequences(X, maxlen=500)

X[0]

len(X[0])
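By default pad_sequences pre-pads (and pre-truncates) with zeros, so short tweets become mostly-zero vectors; a quick illustration:

# illustration: zeros are added at the front by default
print(pad_sequences([[5, 8, 3]], maxlen=6))  # [[0 0 0 5 8 3]]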

# Join each padded sequence of token IDs into a space-separated string
# (note: these are integer token IDs, not the original words)
documents = []
for sequence in X:
    text = " ".join([str(token) for token in sequence if token != 0])
    documents.append(text)
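Note that documents now holds strings of integer token IDs rather than words. If word-level TF-IDF features were wanted instead, the fitted tokenizer offers an inverse mapping (an alternative sketch, not used by the rest of this script):

# alternative (unused here): map ID sequences back to words before vectorizing,
# so TF-IDF features correspond to words rather than token IDs
word_documents = tokenizer.sequences_to_texts(
    [[token for token in sequence if token != 0] for sequence in X]
)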

# Create an instance of TfidfVectorizer
vectorizer = TfidfVectorizer()

# Compute TF-IDF scores on the entire dataset
x = vectorizer.fit_transform(documents)

print(x)
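For reference, this is how TfidfVectorizer treats the ID strings (a toy illustration; note that the default token_pattern only keeps tokens of two or more characters, so single-digit token IDs are silently dropped):

# illustration: TF-IDF over two toy 'documents' of token-ID strings
toy_vec = TfidfVectorizer()
toy_x = toy_vec.fit_transform(["12 25 25", "12 307"])
print(toy_vec.get_feature_names_out())  # ['12' '25' '307'] (get_feature_names in older scikit-learn)
print(toy_x.shape)                      # (2, 3)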

Y = df['Category']

X_train, X_test, Y_train, Y_test = train_test_split(x, Y, test_size=0.2)

X_train.shape

with open('tokenizer.pickle', "wb") as pickle_out:
    pickle.dump(tokenizer, pickle_out)
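The pickled tokenizer can later be restored for inference (a minimal sketch):

# reload the tokenizer saved above
with open('tokenizer.pickle', 'rb') as pickle_in:
    loaded_tokenizer = pickle.load(pickle_in)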

y_train=pd.get_dummies(Y_train)
y_test=pd.get_dummies(Y_test)

y_train
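get_dummies one-hot encodes the category labels, which is the layout categorical_crossentropy expects; a toy illustration (recent pandas shows boolean columns, older versions show 0/1):

# illustration: one column per class, one 'hot' entry per row
print(pd.get_dummies(pd.Series(["sport", "tech", "sport"])))
#    sport   tech
# 0   True  False
# 1  False   True
# 2   True  False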

vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding

vocab_size

print(y_train.shape)
print(y_test.shape)
y_train.head()

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Reshape
# (Sequential, Dense, Bidirectional and LSTM are already imported above)

# Convert sparse matrix to dense NumPy array
X_train_dense = X_train.toarray()

# Convert DataFrame to NumPy array
y_train_np = y_train.to_numpy()

# Define the model architecture with the correct output shape
# for multi-class classification
num_classes = 4   # actual number of classes in the data
timesteps = 64    # desired number of timesteps for the recurrent layer

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_dense.shape[1],)),
    Dense(64, activation='relu'),
    Reshape((timesteps, -1)),  # reshape the Dense output to (timesteps, features)
    Bidirectional(LSTM(64)),
    Dense(num_classes, activation='softmax')  # softmax for multi-class classification
])
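Since the preceding Dense layer outputs 64 features and timesteps is also 64, Reshape((timesteps, -1)) yields one feature per timestep for the LSTM; a quick illustration:

import tensorflow as tf

# illustration: a batch of 64-feature vectors becomes (batch, 64, 1)
print(tf.keras.layers.Reshape((64, -1))(tf.zeros((1, 64))).shape)  # (1, 64, 1)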

# Compile the model with categorical_crossentropy loss for multi-class classification
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

earlyStopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                              patience=8)

# convert the held-out test split the same way as the training split
X_test_dense = X_test.toarray()
y_test_np = y_test.to_numpy()

# fitting the model with the updated architecture, validating on the test split
modelTraining = model.fit(X_train_dense, y_train_np,
                          batch_size=64,
                          epochs=15,
                          validation_data=(X_test_dense, y_test_np),
                          callbacks=[earlyStopping])

# Evaluate the model on the held-out test split
score = model.evaluate(X_test_dense, y_test_np, verbose=0)
print("Test accuracy =", score[1])

# Storing epoch values in variables
epochs = range(1, len(modelTraining.history['accuracy']) + 1)
accuracy = modelTraining.history['accuracy']
loss = modelTraining.history['loss']
val_accuracy = modelTraining.history['val_accuracy']
val_loss = modelTraining.history['val_loss']

import matplotlib.pyplot as plt

# Plotting
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(epochs, accuracy, 'r', label='Training Accuracy')
plt.plot(epochs, val_accuracy, 'b', label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

model.save('my')  # saves the trained model to the 'my' directory (TensorFlow SavedModel format)
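The saved model can be restored later with the load_model imported at the top (a sketch, assuming TensorFlow's SavedModel format; any new input must pass through the same stemming, tokenization, padding and TF-IDF steps as the training data):

# reload the saved model and confirm its architecture
loaded_model = load_model('my')
loaded_model.summary()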
