0% found this document useful (0 votes)
1 views

Word2Vec code

Uploaded by

valachi b-boy
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
1 views

Word2Vec code

Uploaded by

valachi b-boy
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 2

#Word2Vec + BiLSTM

#code 0 updated with early stop


import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, LSTM,
Dense, Dropout, Bidirectional, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
print('import done')

# define hyperparameters
MAX_SEQ_LENGTH = 100
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
FILTER_SIZES = [3, 4, 5]
NUM_FILTERS = 128
LSTM_UNITS = 64
DENSE_UNITS = 1
DROPOUT_RATE = 0.5
PATIENCE = 10
EPOCHS = 50
print('config done')

# load data
df =
pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv",
encoding='latin-1', header=None)
df.columns = ["sentiment", "id", "date", "query", "user", "text"]
df = df[["sentiment", "text"]]
df["sentiment"] = df["sentiment"].replace({0: "negative", 4: "positive"})
texts = df["text"].values
labels = df["sentiment"].values
labels = np.array([1 if label == "positive" else 0 for label in labels])
print(df.head(10))

# preprocess text data


tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)
print('preprocessing done')

# define model architecture


inputs = Input(shape=(MAX_SEQ_LENGTH,))
embedding = Embedding(input_dim=len(word_index) + 1, output_dim=EMBEDDING_DIM,
input_length=MAX_SEQ_LENGTH)(inputs)
conv_layers = []
for filter_size in FILTER_SIZES:
conv = Conv1D(filters=NUM_FILTERS, kernel_size=filter_size, activation='relu')
(embedding)
pool = MaxPooling1D(pool_size=MAX_SEQ_LENGTH - filter_size + 1)(conv)
conv_layers.append(pool)
concat = Concatenate()(conv_layers)
lstm = Bidirectional(LSTM(units=LSTM_UNITS))(concat)
dropout = Dropout(rate=DROPOUT_RATE)(lstm)
outputs = Dense(units=DENSE_UNITS, activation='sigmoid')(dropout)
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# train the model with early stopping


es = EarlyStopping(monitor='val_accuracy', patience=PATIENCE, mode='max',
min_delta=0.01, baseline=0.85)
history = model.fit(data, labels, epochs= EPOCHS, validation_split=0.4,
callbacks=[es])

# Plot accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Make predictions on new data


new_texts = ["is upset that he can't update his Facebook by texting it... and might
cry as a result School today ...",
"@Kenichan I dived many times for the ball. Managed to save 50% The
rest go out of bounds",
"my whole body feels itchy and like its on fire",
"@nationwideclass no, it's not behaving at all. i'm mad. why am i
here? because I can't see you all o...",
"@Kwesidei not the whole crew",
"@LettyA ahh ive always wanted to see rent love the soundtrack!!",
"@FakerPattyPattz Oh dear. Were you drinking out of the forgotten
table drinks? "]
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_data = pad_sequences(new_sequences, maxlen=MAX_SEQ_LENGTH)
predictions = model.predict(new_data)

# Evaluate the model


y_pred = np.round(predictions)
y_true = np.array([0, 0, 0, 0, 0, 0, 0]) # true labels of new data
cm = confusion_matrix(y_true, y_pred)
print(cm)

You might also like