0% found this document useful (0 votes)
6 views3 pages

GloVe Embedding Code

Uploaded by

valachi b-boy
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
6 views3 pages

GloVe Embedding Code

Uploaded by

valachi b-boy
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 3

import pandas as pd

import numpy as np
from sklearn.metrics import confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, LSTM,
Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

print('import done')

# Define hyperparameters
MAX_SEQ_LENGTH = 100
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
FILTER_SIZES = [3, 5, 7]
NUM_FILTERS = 256
LSTM_UNITS = 256
DENSE_UNITS = 1
DROPOUT_RATE = 0.5
PATIENCE = 10

print('config done')

# Load the dataset


df =
pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv",
encoding='latin-1', header=None)
df.columns = ["sentiment", "id", "date", "query", "user", "text"]
df = df[["sentiment", "text"]]
df["sentiment"] = df["sentiment"].replace({0: "negative", 4: "positive"})
texts = df["text"].values
labels = df["sentiment"].values
labels = np.array([1 if label == "positive" else 0 for label in labels])
print(df.head(10))

# Preprocess text data


tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)

# Load pre-trained word embeddings (e.g., GloVe)


embedding_dim = 100
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
for line in f:
values = line.split()
word = values[0]
coefficients = np.asarray(values[1:], dtype='float32')
embedding_index[word] = coefficients

# Create embedding matrix


num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
if i >= MAX_NB_WORDS:
continue
embedding_vector = embedding_index.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector

# Define model architecture


inputs = Input(shape=(MAX_SEQ_LENGTH,))
embedding = Embedding(input_dim=num_words, output_dim=embedding_dim,
input_length=MAX_SEQ_LENGTH, weights=[embedding_matrix], trainable=False)(inputs)
conv_layers = []
for filter_size in FILTER_SIZES:
conv = Conv1D(filters=NUM_FILTERS, kernel_size=filter_size, activation='relu')
(embedding)
pool = MaxPooling1D(pool_size=MAX_SEQ_LENGTH - filter_size + 1)(conv)
conv_layers.append(pool)
concat = Concatenate()(conv_layers)
lstm = LSTM(units=LSTM_UNITS)(concat)
dropout = Dropout(rate=DROPOUT_RATE)(lstm)
outputs = Dense(units=DENSE_UNITS, activation='sigmoid')(dropout)
model = Model(inputs=inputs, outputs=outputs)
optimizer = RMSprop(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy',
metrics=['accuracy'])
model.summary()

# Train the model with early stopping


es = EarlyStopping(monitor='val_accuracy', patience=PATIENCE, mode='max',
min_delta=0.01, baseline=0.85)
history = model.fit(data, labels, epochs=50, validation_split=0.3, callbacks=[es])

# Plot accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Make predictions on new data


new_texts = ["is upset that he can't update his Facebook by texting it... and might
cry as a result School today ...",
"@Kenichan I dived many times for the ball. Managed to save 50% The
rest go out of bounds",
"my whole body feels itchy and like its on fire",
"@nationwideclass no, it's not behaving at all. i'm mad. why am i
here? because I can't see you all o...",
"@Kwesidei not the whole crew",
"@LettyA ahh ive always wanted to see rent love the soundtrack!!",
"@FakerPattyPattz Oh dear. Were you drinking out of the forgotten
table drinks? "]
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_data = pad_sequences(new_sequences, maxlen=MAX_SEQ_LENGTH)
predictions = model.predict(new_data)
# Evaluate the model
y_pred = np.round(predictions)
y_true = np.array([0, 0, 0, 0, 0, 0, 0]) # true labels of new data
cm = confusion_matrix(y_true, y_pred)
print(cm)

You might also like