0% found this document useful (0 votes)
3 views · 2 pages

Bertweet Tokenizer

Uploaded by

valachi b-boy
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
3 views · 2 pages

Bertweet Tokenizer

Uploaded by

valachi b-boy
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1 / 2

import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (
    Concatenate,
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    GlobalMaxPooling1D,
    Input,
    MaxPooling1D,
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import BertweetTokenizer

# Load the Sentiment140 dataset.
# The CSV is read with header=None (no row is consumed as a header), then the
# six columns are named explicitly.
df = pd.read_csv(
    "/path/to/sentiment140_dataset.csv",
    header=None,
    encoding='latin-1',
)
df.columns = ["sentiment", "id", "date", "query", "user", "text"]

# Preprocess the data: pull out the raw tweet text and a binary target.
texts = df["text"].values

# Sentiment140 stores polarity as 0 (negative) / 4 (positive). The model in
# this script ends in a single sigmoid unit trained with binary cross-entropy
# and is scored against rounded 0/1 predictions, so the target must be 0/1;
# left as 0/4 the positive class could never be predicted correctly.
# NOTE(review): any neutral rows (polarity 2) would be folded into class 0 --
# confirm the CSV in use only contains polarities 0 and 4.
labels = (df["sentiment"].values == 4).astype("int32")

# Split the data into train and test sets.
# 20% of the rows are held out for testing; random_state pins the shuffle so
# the split is the same on every run.
split = train_test_split(texts, labels, random_state=42, test_size=0.2)
texts_train, texts_test, labels_train, labels_test = split

# Text normalization
# Matches explicit http(s):// links and stand-alone "www." hosts. The word
# boundary and literal dot keep elongated words such as "awwww" intact, which
# a bare `www\S+` would partially delete.
_URL_PATTERN = re.compile(r"https?://\S+|\bwww\.\S+")
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def normalize_text(text):
    """Return *text* with URLs and ASCII punctuation removed, lowercased.

    Steps are applied in this order: URL removal, punctuation removal,
    lowercasing. URLs must be removed first because stripping punctuation
    would destroy the "://" and "." that identify them. Whitespace is left
    untouched, so removing a URL can leave consecutive spaces behind.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    text = _URL_PATTERN.sub("", text)
    text = text.translate(_PUNCTUATION_TABLE)
    return text.lower()

# Run every tweet of both splits through normalize_text; each result is
# materialised as a plain list.
texts_train = list(map(normalize_text, texts_train))
texts_test = list(map(normalize_text, texts_test))

# Tokenize the text data using the BERTweet tokenizer.
# The tokenizer needs its vocabulary and BPE merge files, so it is loaded from
# the published "vinai/bertweet-base" checkpoint instead of being constructed
# with no arguments.
tokenizer = BertweetTokenizer.from_pretrained("vinai/bertweet-base")

# padding="max_length" pads every sequence to exactly 100 ids. padding=True
# only pads to the longest sequence within each call, so the train and test
# encodings could come out with different widths, neither guaranteed to match
# the fixed-length Input(shape=(100,)) the model expects.
sequences_train = tokenizer(
    texts_train, padding="max_length", truncation=True, max_length=100
)["input_ids"]
sequences_test = tokenizer(
    texts_test, padding="max_length", truncation=True, max_length=100
)["input_ids"]

# Convert the token-id sequences of both splits to NumPy arrays.
X_train, X_test = (np.array(ids) for ids in (sequences_train, sequences_test))

# Define the model architecture: parallel 1-D convolutions of several kernel
# widths over a learned token embedding, merged into one sigmoid output.
input_dim = tokenizer.vocab_size
embedding_dim = 100
num_filters = 128
filter_sizes = [3, 4, 5]
dropout_rate = 0.5
output_units = 1

inputs = Input(shape=(100,))
embedding = Embedding(input_dim=input_dim, output_dim=embedding_dim)(inputs)

# One convolution branch per kernel width, all reading the same embedding.
conv_layers = []
for filter_size in filter_sizes:
    conv = Conv1D(
        filters=num_filters, kernel_size=filter_size, activation='relu'
    )(embedding)
    # Global max pooling keeps the strongest activation of each filter
    # whatever the convolution output length is (98/97/96 steps for widths
    # 3/4/5); a fixed pool_size=98 only fits the width-3 branch.
    pool = GlobalMaxPooling1D()(conv)
    conv_layers.append(pool)

# Concatenate joins the per-branch feature vectors into a single vector;
# Flatten operates on one tensor and cannot merge a list of them.
concat = Concatenate()(conv_layers)
dropout = Dropout(rate=dropout_rate)(concat)
outputs = Dense(units=output_units, activation='sigmoid')(dropout)

model = Model(inputs=inputs, outputs=outputs)

# Compile the model with the Adamax optimizer, binary cross-entropy loss and
# accuracy as the tracked metric, then print the layer summary.
optimizer = Adamax(learning_rate=0.001)
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=optimizer,
)
model.summary()

# Train the model.
batch_size, epochs = 64, 10

# Early stopping watches val_loss with a patience of 3 epochs and is asked to
# restore the best weights when it triggers.
es = EarlyStopping(restore_best_weights=True, patience=3, monitor='val_loss')

# 20% of the training data is set aside for validation via validation_split.
history = model.fit(
    X_train,
    labels_train,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[es],
    validation_split=0.2,
)

# Evaluate the model on the held-out test split.
# The sigmoid outputs are rounded to hard predictions and flattened to 1-D
# before being compared with the test labels.
predictions = np.round(model.predict(X_test)).flatten()
accuracy = accuracy_score(labels_test, predictions)
print("Test Accuracy:", accuracy)

# Plot the training and validation curves recorded in `history`: first a
# figure for accuracy, then a figure for loss.
for train_key, val_key, title, y_label in (
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
):
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

You might also like