The document outlines a machine learning workflow for text classification using tweets, including data preprocessing, tokenization, and model training with a neural network. It employs techniques such as stemming, TF-IDF vectorization, and LSTM architecture for multi-class classification. The model's performance is evaluated and visualized through accuracy and loss plots, and the trained model is saved for future use.


import os
import re
import pickle

import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, Flatten, Dropout
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# import nltk; nltk.download('stopwords')  # run once if the stopwords corpus is missing

df = pd.read_csv("train.csv")  # load the labeled tweets

df.head()

df.info()

check_null = df.isnull().sum()

check_null

df.shape

df["Tweet"].describe()

df["Category"].describe()

len(df["Tweet"])

for i in range(len(df["Tweet"])):
    # remove HTML tags
    df.loc[i, "Tweet"] = re.sub(r'<[^<>]+>', " ", df.loc[i, "Tweet"])
    # remove special characters (keep letters, digits and whitespace)
    df.loc[i, "Tweet"] = re.sub(r'[^a-zA-Z0-9\s]', " ", df.loc[i, "Tweet"])
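# Illustrative effect of the two regexes on a made-up tweet:
# "<b>Great</b> phone!! #deal"  ->  " Great  phone   deal"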

df.head()

df["Tweet"][1]

port_stem = PorterStemmer()
english_stopwords = set(stopwords.words('english'))  # build the stop-word set once

def stemming(content):
    # replace any non-alphabetic characters in the content with a space
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    # convert all words to lower case
    stemmed_content = stemmed_content.lower()
    # split the text into a list of words
    stemmed_content = stemmed_content.split()
    # stem each word, excluding any stop words
    stemmed_content = [port_stem.stem(word) for word in stemmed_content
                       if word not in english_stopwords]
    # join the stemmed words into a single string separated by spaces
    stemmed_content = " ".join(stemmed_content)
    return stemmed_content

df["Tweet"]= df["Tweet"].apply(stemming)
df["Tweet"]

tokenizer = Tokenizer(num_words=5000) # unique words limit set to 5000

tokenizer.fit_on_texts(df['Tweet'])

X = tokenizer.texts_to_sequences(df['Tweet'])

X[0]

len(X[0])

# pad/truncate so that every tweet sequence has length 500
X = pad_sequences(X, maxlen=500)

X[0]

len(X[0])
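# pad_sequences pads (and truncates) at the start by default, e.g.:
# pad_sequences([[5, 12, 7]], maxlen=5)  ->  [[0, 0, 5, 12, 7]]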

# Re-encode each padded sequence as a space-separated string of token IDs
# (dropping the 0 padding tokens) so it can be fed to TfidfVectorizer
documents = []
for sequence in X:
    text = " ".join(str(token) for token in sequence if token != 0)
    documents.append(text)

# Create a TfidfVectorizer and compute TF-IDF scores over the ID strings
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(documents)

print(x)
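# x is a SciPy sparse matrix; a quick look at its size and sparsity
# before densifying it for the network:
print(x.shape)  # (number of tweets, TF-IDF vocabulary size)
print(x.nnz)    # number of non-zero entries actually stored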

Y = df['Category']

X_train, X_test, Y_train, Y_test = train_test_split(x, Y, test_size=0.2)
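# Note: train_test_split shuffles randomly on every run; passing
# random_state=<some int> makes the split reproducible, and stratify=Y
# keeps the class proportions equal across the two splits.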

X_train.shape

# persist the fitted tokenizer so the exact same vocabulary can be reused later
with open('tokenizer.pickle', "wb") as pickle_out:
    pickle.dump(tokenizer, pickle_out)
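# To reuse the fitted tokenizer later (e.g. at inference time), reload it
# with pickle -- a minimal sketch:
# with open('tokenizer.pickle', 'rb') as handle:
#     loaded_tokenizer = pickle.load(handle)
# seq = loaded_tokenizer.texts_to_sequences([stemming("some new tweet")])
# padded = pad_sequences(seq, maxlen=500)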

# one-hot encode the category labels
y_train = pd.get_dummies(Y_train)
y_test = pd.get_dummies(Y_test)

y_train
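# get_dummies one-hot encodes the labels: e.g. categories ['b', 'a', 'b']
# become columns a=[0, 1, 0] and b=[1, 0, 1] (columns sorted alphabetically)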

vocab_size = len(tokenizer.word_index) + 1  # +1 reserves the 0 (padding) index for an Embedding layer

vocab_size

print(y_train.shape)
print(y_test.shape)
y_train.head()

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Reshape
# Sequential, Dense, Bidirectional and LSTM were already imported above

# Convert the sparse TF-IDF matrices to dense NumPy arrays
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Convert the one-hot label DataFrames to NumPy arrays
y_train_np = y_train.to_numpy()
y_test_np = y_test.to_numpy()

# Define the model architecture with the correct output shape for
# multi-class classification
num_classes = 4   # actual number of classes in the data
timesteps = 64    # desired number of timesteps for the LSTM input

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_dense.shape[1],)),
    Dense(64, activation='relu'),
    # reshape the 64-dim Dense output to (timesteps, features)
    Reshape((timesteps, -1)),
    Bidirectional(LSTM(64)),
    # softmax activation for multi-class classification
    Dense(num_classes, activation='softmax')
])
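# Shape walk-through: Dense(64) emits a 64-dim vector per sample;
# Reshape((64, -1)) turns it into a (64, 1) sequence, i.e. 64 timesteps of
# 1 feature each, which is the 3-D input the Bidirectional LSTM expects.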

# Compile the model with categorical_crossentropy loss for multi-class classification
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

earlyStopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                              patience=8)

# fit the model, monitoring validation metrics on the held-out test split
modelTraining = model.fit(X_train_dense, y_train_np,
                          batch_size=64,
                          epochs=15,
                          validation_data=(X_test_dense, y_test_np),
                          callbacks=[earlyStopping])
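# Note: EarlyStopping also accepts restore_best_weights=True, which rolls the
# model back to the weights from the epoch with the best monitored value.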

# Evaluate the model on the held-out test split
score = model.evaluate(X_test_dense, y_test_np, verbose=0)

print("Test_accuracy = ", score[1])

# Store the per-epoch metrics from the training history
epochs = range(1, len(modelTraining.history['accuracy']) + 1)
accuracy = modelTraining.history['accuracy']
loss = modelTraining.history['loss']
val_accuracy = modelTraining.history['val_accuracy']
val_loss = modelTraining.history['val_loss']

import matplotlib.pyplot as plt

# Plotting
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(epochs, accuracy, 'r', label='Training Accuracy')
plt.plot(epochs, val_accuracy, 'b', label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

# save the trained model (TensorFlow SavedModel directory) for future use
model.save('my')
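# Minimal inference sketch, assuming the objects above are still in memory
# and mirroring the training-time preprocessing (the sample tweet is made up):
new_seq = tokenizer.texts_to_sequences([stemming("great phone but the battery dies fast")])
new_padded = pad_sequences(new_seq, maxlen=500)
new_doc = " ".join(str(token) for token in new_padded[0] if token != 0)
new_features = vectorizer.transform([new_doc]).toarray()
loaded = load_model('my')                    # reload the saved model
prediction = loaded.predict(new_features)
print(y_train.columns[prediction.argmax()])  # map the argmax back to a category name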
