Sentence Embedding Code
import pandas as pd
import json
import re
import numpy as np
import spacy
import tqdm
import xgboost as xgb
import lightgbm as lgb
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from nltk import pos_tag, word_tokenize
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
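# Assumed loading step for the sarcasm-headlines dataset: `df` is used below
# but never loaded here. The filename and lines=True JSON format are
# assumptions, inferred from the article_link/headline/is_sarcastic columns.
df = pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)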
# Preprocessing
df.drop(columns=['article_link'], inplace=True) # Drop the 'article_link' column
df.dropna(inplace=True) # Drop any rows with missing values
df['headline'] = df['headline'].str.lower() # Convert text to lowercase
stop_words = set(stopwords.words('english'))  # requires nltk.download('stopwords')

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove punctuation
    text = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', '', text)
    # Tokenize (requires nltk.download('punkt'))
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Rejoin the filtered tokens; returning the raw `text` here would
    # silently discard the stopword removal
    return ' '.join(tokens)
df['headline'] = df['headline'].apply(preprocess_text)
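# X_train/X_test are assumed to come from a split like the following
# (test_size and random_state are assumptions):
X_train, X_test, y_train, y_test = train_test_split(
    df['headline'], df['is_sarcastic'], test_size=0.2, random_state=42)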
# Tokenization (imports added: Tokenizer and pad_sequences are used below
# without being imported; paths follow classic Keras 2.x)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
# Padding
maxlen = 100 # You can adjust this value
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)
# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1
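# vocab_size is not used again below; in a token-sequence pipeline it would
# size a trainable Embedding layer, e.g. (a sketch; the 128-dim output is an
# assumption):
# model = Sequential()
# model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=maxlen))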
# SentenceTransformer.encode works on plain Python lists of strings, so the
# pandas Series from the split are converted before embedding
print(type(X_train))
print(type(X_test))
X_train = X_train.tolist()
X_test = X_test.tolist()
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from keras.models import Sequential
from keras.layers import (Dense, LSTM, Bidirectional, GRU, Conv1D,
                          GlobalMaxPooling1D, Embedding, SimpleRNN)
from keras.utils import to_categorical
# Prepare data
X = df['headline'].values
y = df['is_sarcastic'].values
# Encode labels
le = LabelEncoder()
y = le.fit_transform(y)
y = to_categorical(y)
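# Assumed sentence-embedding step (the 'all-MiniLM-L6-v2' model name, the
# reshape, and the split parameters are all assumptions; a sketch):
embedder = SentenceTransformer('all-MiniLM-L6-v2')
X_emb = embedder.encode(list(X), show_progress_bar=True)  # (samples, 384)
# The Keras models below expect 3-D input (samples, timesteps, features);
# treating each embedding dimension as a timestep keeps Conv1D's kernel
# size of 5 valid
X_emb = X_emb.reshape(X_emb.shape[0], X_emb.shape[1], 1)
X_train, X_test, y_train, y_test = train_test_split(
    X_emb, y, test_size=0.2, random_state=42)
input_shape = (X_train.shape[1], X_train.shape[2])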
def create_bilstm_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(LSTM(128, return_sequences=True),
                            input_shape=input_shape))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

def create_gru_model(input_shape):
    model = Sequential()
    model.add(GRU(128, return_sequences=True, input_shape=input_shape))
    model.add(GRU(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

def create_bigru_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(GRU(128, return_sequences=True),
                            input_shape=input_shape))
    model.add(Bidirectional(GRU(128)))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

def create_cnn_model(input_shape):
    model = Sequential()
    model.add(Conv1D(128, 5, activation='relu', input_shape=input_shape))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

def create_rnn_model(input_shape):
    model = Sequential()
    model.add(SimpleRNN(128, return_sequences=True, input_shape=input_shape))
    model.add(SimpleRNN(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
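# `results` is assumed to be built by a training/evaluation loop like the
# following (epochs and batch size are assumptions):
builders = {
    'BiLSTM': create_bilstm_model,
    'GRU': create_gru_model,
    'BiGRU': create_bigru_model,
    'CNN': create_cnn_model,
    'RNN': create_rnn_model,
}
results = {}
for name, build in builders.items():
    model = build(input_shape)
    model.fit(X_train, y_train, epochs=5, batch_size=64,
              validation_split=0.1, verbose=0)
    _, acc = model.evaluate(X_test, y_test, verbose=0)
    results[name] = acc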
# Print results
for name, acc in results.items():
    print(f"{name} model trained and evaluated. Test accuracy: {acc:.4f}")