Sentence Embedding Code

# Install sentence-transformers (notebook cell)
!pip install sentence-transformers

import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, LSTM, Bidirectional, GRU, Conv1D,
                                     MaxPooling1D, Flatten, Dense, Dropout, SimpleRNN)
from tensorflow.keras.optimizers import Adam
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.fasttext import FastText
import transformers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torchtext.vocab import GloVe
from sentence_transformers import SentenceTransformer

import json
import re
import spacy
import tqdm
import xgboost as xgb
import lightgbm as lgb
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier,
                              AdaBoostClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
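# On recent NLTK releases, word_tokenize also needs the 'punkt_tab' tables
# (an extra download; harmless if the running NLTK version does not use it):
nltk.download('punkt_tab')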

# Function to read a JSON Lines file (one JSON object per line)
def read_json_lines(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return data
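# The helper above is kept for reference; below, pandas' built-in JSON Lines
# reader achieves the same result in one call. Equivalent usage (a sketch):
#   records = read_json_lines(file1_path)
#   df1 = pd.DataFrame(records)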

# Load the datasets
file1_path = '/kaggle/input/sarcasm/Sarcasm_Headlines_Dataset.json'
file2_path = '/kaggle/input/sarcasm/Sarcasm_Headlines_Dataset_v2.json'

df1 = pd.read_json(file1_path, lines=True)
df2 = pd.read_json(file2_path, lines=True)

# Concatenate the datasets
df = pd.concat([df1, df2], ignore_index=True)
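# Assumption: v2 of this dataset overlaps substantially with v1, so dropping
# exact-duplicate headlines avoids training on repeated examples:
df = df.drop_duplicates(subset='headline').reset_index(drop=True)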

# Preprocessing
df.drop(columns=['article_link'], inplace=True) # Drop the 'article_link' column
df.dropna(inplace=True) # Drop any rows with missing values
df['headline'] = df['headline'].str.lower() # Convert text to lowercase

# Basic text preprocessing
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove punctuation
    text = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Rejoin the filtered tokens into a single string so the stopword
    # removal actually takes effect in the returned value
    return ' '.join(tokens)

df['headline'] = df['headline'].apply(preprocess_text)
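# Quick sanity check on a made-up headline (illustrative values only):
#   preprocess_text("Scientists FINALLY discover why http://example.com")
#   -> 'scientists finally discover'   ('why' is an NLTK stopword)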

# Check for any missing values
df.isnull().sum()

# Display the first few rows after preprocessing
print("\nAfter Preprocessing:")
print(df.head())

# Prepare features and labels
X = df['headline']
y = df['is_sarcastic']

# Split the dataset first so the tokenizer is fit only on training text
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding
maxlen = 100 # You can adjust this value
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)
# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1
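# The padded sequences above are not consumed anywhere below (the script
# switches to sentence embeddings instead). A minimal sketch of how they
# would feed a learned-embedding classifier; layer sizes are illustrative
# assumptions, not taken from the original:
def create_embedding_lstm_model(vocab_size):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128))  # learned word vectors
    model.add(LSTM(128))
    model.add(Dense(1, activation='sigmoid'))  # binary target: is_sarcastic
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
# Usage (sketch):
#   seq_model = create_embedding_lstm_model(vocab_size)
#   seq_model.fit(X_train_pad, y_train, epochs=5, batch_size=32)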

print(type(X_train))
print(type(X_test))
# Convert the pandas Series to plain Python lists of strings, the input
# format SentenceTransformer.encode works with
X_train = X_train.tolist()
X_test = X_test.tolist()

from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import (Dense, LSTM, Bidirectional, GRU, Conv1D,
                          GlobalMaxPooling1D, Embedding, SimpleRNN)
from keras.utils import to_categorical

# Prepare data
X = df['headline'].values
y = df['is_sarcastic'].values

# Encode labels
le = LabelEncoder()
y = le.fit_transform(y)
y = to_categorical(y)
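# Note: is_sarcastic is already 0/1, so LabelEncoder is effectively a no-op
# here; to_categorical then one-hot encodes the labels for the two-unit
# softmax outputs used by the models below.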

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Initialize Sentence Transformer model
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
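# Note: 'bert-base-nli-mean-tokens' is an older SBERT checkpoint; the
# sentence-transformers documentation now recommends newer models such as
# 'all-MiniLM-L6-v2' (smaller and generally stronger on benchmarks).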

# Generate sentence embeddings
X_train_embeddings = sbert_model.encode(X_train)
X_test_embeddings = sbert_model.encode(X_test)

# encode() returns 2-D arrays of shape (n_samples, embedding_dim); the
# recurrent and convolutional models below expect 3-D input of shape
# (batch, embedding_dim, 1), so add a trailing feature axis:
X_train_embeddings = np.expand_dims(X_train_embeddings, axis=-1)
X_test_embeddings = np.expand_dims(X_test_embeddings, axis=-1)

# Define model architectures using sentence embeddings

def create_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(128, input_shape=input_shape, return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

def create_bilstm_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(LSTM(128, return_sequences=True),
                            input_shape=input_shape))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

def create_gru_model(input_shape):
    model = Sequential()
    model.add(GRU(128, return_sequences=True, input_shape=input_shape))
    model.add(GRU(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

def create_bigru_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(GRU(128, return_sequences=True),
                            input_shape=input_shape))
    model.add(Bidirectional(GRU(128)))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

def create_cnn_model(input_shape):
    model = Sequential()
    model.add(Conv1D(128, 5, activation='relu', input_shape=input_shape))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

def create_rnn_model(input_shape):
    model = Sequential()
    model.add(SimpleRNN(128, return_sequences=True, input_shape=input_shape))
    model.add(SimpleRNN(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
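# The models above treat each embedding dimension (768 for this SBERT model)
# as a "timestep". A simpler baseline, sketched here as an assumption rather
# than part of the original pipeline, feeds the flat embedding vector to a
# small feed-forward net:
def create_mlp_model(input_dim):
    model = Sequential()
    model.add(Dense(256, activation='relu', input_shape=(input_dim,)))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
# Usage (sketch): this variant takes the 2-D embeddings directly, i.e. the
# arrays as returned by encode() before the expand_dims step above.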

# Train and evaluate models

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)
    print(classification_report(y_test_classes, y_pred_classes))
    cm = confusion_matrix(y_test_classes, y_pred_classes)
    print("Confusion Matrix:\n", cm)
    return model
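# With 100 epochs per model, early stopping is worth considering. A sketch
# using Keras's standard EarlyStopping callback (the patience value is an
# illustrative assumption):
#   from keras.callbacks import EarlyStopping
#   early_stop = EarlyStopping(monitor='val_loss', patience=5,
#                              restore_best_weights=True)
#   model.fit(X_train, y_train, epochs=100, batch_size=32,
#             validation_split=0.2, callbacks=[early_stop])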

# Create a dictionary of models

models = {
    "LSTM": create_lstm_model((X_train_embeddings.shape[1], 1)),
    "Bi-LSTM": create_bilstm_model((X_train_embeddings.shape[1], 1)),
    "GRU": create_gru_model((X_train_embeddings.shape[1], 1)),
    "Bi-GRU": create_bigru_model((X_train_embeddings.shape[1], 1)),
    "CNN": create_cnn_model((X_train_embeddings.shape[1], 1)),
    "RNN": create_rnn_model((X_train_embeddings.shape[1], 1))
}

# Train and evaluate each model

results = {}
for name, model in models.items():
    print(f"Training {name}...")
    trained_model = train_and_evaluate_model(model, X_train_embeddings, y_train,
                                             X_test_embeddings, y_test)
    results[name] = trained_model

# Print results
for name, result in results.items():
    print(f"{name} model trained and evaluated.")
