code text
code text
import pickle
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import (
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    LSTM,
    Reshape,
)
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# Load the training data from train.csv in the working directory; the
# statements below read its "Tweet" and "Category" columns.
df=pd.read_csv("train.csv")
# Most of the following lines are bare expressions: they display a value in an
# interactive session (e.g. a notebook) but produce no output when the file is
# run as a plain script. df.info() is the exception - it prints by itself.
df.head()
df.info()
# Per-column count of missing values, kept in check_null for inspection.
check_null = df.isnull().sum()
check_null
df.shape
# Summary statistics of the text column and of the label column.
df["Tweet"].describe()
df["Category"].describe()
len(df["Tweet"])
df.head()
# Look up the entry of the "Tweet" column stored under index label 1.
df["Tweet"][1]
port_stem = PorterStemmer()
# Build the stop-word lookup once. stopwords.words() hands back a list, so
# calling it for every word (as before) rebuilt that list each time and made
# every membership test a linear scan; a frozenset gives constant-time lookups.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
# Matches every character that is not an ASCII letter.
_NON_ALPHA = re.compile(r'[^a-zA-Z]')


def stemming(content):
    """Reduce a text to lower-cased Porter stems with stop words removed.

    Args:
        content: The text to clean, as a string.

    Returns:
        The stemmed words joined by single spaces; an empty string when no
        word is left after filtering.
    """
    # Replace any non-alphabetic character with a space character.
    letters_only = _NON_ALPHA.sub(' ', content)
    # Convert to lower case, then split on whitespace into a list of words.
    words = letters_only.lower().split()
    # Stem every word that is not an English stop word.
    stems = [
        port_stem.stem(word)
        for word in words
        if word not in _ENGLISH_STOP_WORDS
    ]
    # Join the stems back into a single space-separated string.
    return " ".join(stems)
# Clean every tweet in place with the stemming() helper.
df["Tweet"] = df["Tweet"].apply(stemming)
df["Tweet"]

# The original script called tokenizer.fit_on_texts() without ever creating
# the tokenizer; build it here before fitting the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Tweet'])
# Map each tweet to its list of integer word indices.
X = tokenizer.texts_to_sequences(df['Tweet'])
X[0]
len(X[0])

# NOTE(review): the padding step was missing from the original. The default
# pad_sequences() call pads every sequence to the longest tweet - confirm the
# intended maxlen / padding side.
X = pad_sequences(X)
X[0]
len(X[0])
print(X)  # was print(x): a NameError, no lowercase x exists

Y = df['Category']

# NOTE(review): the train/test split was missing from the original although
# X_train, Y_train and Y_test are used below. test_size and random_state are
# assumptions - confirm against the intended experiment setup.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
X_train.shape

# Persist the fitted tokenizer; the with-block closes the file even when
# pickling fails.
with open('tokenizer.pickle', "wb") as pickle_out:
    pickle.dump(tokenizer, pickle_out)

# One-hot encode the labels. Both splits share one categorical dtype built
# from all labels, so they always get the same columns in the same order, even
# if a class happens to be absent from one split.
class_dtype = pd.CategoricalDtype(categories=sorted(Y.dropna().unique()))
y_train = pd.get_dummies(Y_train.astype(class_dtype))
y_test = pd.get_dummies(Y_test.astype(class_dtype))
y_train

# Index 0 is reserved by the Keras tokenizer, hence the +1.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

print(y_train.shape)
print(y_test.shape)
y_train.head()
# Define the model architecture with the correct output shape for multi-class
# classification.
num_classes = 4  # Number of actual classes in the data
timesteps = 64  # Desired number of timesteps fed to the LSTM

# NOTE(review): X_train_dense is not defined anywhere in the visible script;
# it must be a 2-D feature matrix created by an earlier step - confirm where
# it comes from before running this cell.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_dense.shape[1],)),
    Dense(64, activation='relu'),
    # Reshape the output of the previous Dense layer to
    # (None, timesteps, features). With 64 units and timesteps = 64 the -1
    # resolves to a single feature per timestep.
    Reshape((timesteps, -1)),
    Bidirectional(LSTM(64)),
    # Softmax activation for multi-class classification.
    Dense(num_classes, activation='softmax'),
])
model.summary()
# Plot the training/validation curves: accuracy on the left panel, loss on the
# right one. plt, epochs and the four metric series are expected to exist
# already (they are not created in the visible part of the script).
plt.figure(figsize=(12, 6))
for _slot, _train, _val, _train_label, _val_label, _title, _ylabel in (
    (1, accuracy, val_accuracy,
     'Training Accuracy', 'Validation Accuracy',
     'Training and Validation Accuracy', 'Accuracy'),
    (2, loss, val_loss,
     'Training Loss', 'Validation Loss',
     'Training and Validation Loss', 'Loss'),
):
    plt.subplot(1, 2, _slot)
    # Training curve in red, validation curve in blue.
    plt.plot(epochs, _train, 'r', label=_train_label)
    plt.plot(epochs, _val, 'b', label=_val_label)
    plt.title(_title)
    plt.xlabel('Epochs')
    plt.ylabel(_ylabel)
    plt.legend()
plt.tight_layout()
plt.show()
# Save the model to the path 'my'.
# NOTE(review): the path has no file extension. Newer Keras releases appear to
# require a ".keras" or ".h5" suffix for model.save() - confirm against the
# installed version before relying on this call.
model.save('my')