Sentence Embedding Code

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, GRU, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, SimpleRNN
from tensorflow.keras.optimizers import Adam
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import FastText
import transformers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torchtext.vocab import GloVe

!pip install sentence-transformers
from sentence_transformers import SentenceTransformer

import pandas as pd
import json
import re
import numpy as np
import spacy
import tqdm
import xgboost as xgb
import lightgbm as lgb
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download NLTK data


nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

# Function to read JSON lines file


def read_json_lines(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return data
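
This helper is kept for reference; it does the same job as the pd.read_json(..., lines=True) calls below. A minimal sketch of using it directly (the file path matches the Kaggle input used later, and df1_alt is a hypothetical name for this sketch):

records = read_json_lines('/kaggle/input/sarcasm/Sarcasm_Headlines_Dataset.json')
df1_alt = pd.DataFrame(records)  # hypothetical alternative to pd.read_json below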

# Load the datasets


file1_path = '/kaggle/input/sarcasm/Sarcasm_Headlines_Dataset.json'
file2_path = '/kaggle/input/sarcasm/Sarcasm_Headlines_Dataset_v2.json'

df1 = pd.read_json(file1_path, lines=True)


df2 = pd.read_json(file2_path, lines=True)

# Concatenate the datasets


df = pd.concat([df1, df2], ignore_index=True)

# Preprocessing
df.drop(columns=['article_link'], inplace=True)  # Drop the 'article_link' column
df.dropna(inplace=True)  # Drop any rows with missing values
df['headline'] = df['headline'].str.lower()  # Convert text to lowercase
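
The v2 file largely overlaps with v1, so the concatenation above usually contains duplicate headlines. A hedged optional step, assuming exact duplicates should be removed:

# Optional: drop duplicate headlines introduced by concatenating both dataset versions
df.drop_duplicates(subset=['headline'], inplace=True)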

# Basic text preprocessing


stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove punctuation
    text = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Rejoin the tokens so the stopword removal actually takes effect
    return ' '.join(tokens)

df['headline'] = df['headline'].apply(preprocess_text)
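
As a quick sanity check, here is a hypothetical headline run through the function; the exact output depends on NLTK's English stopword list:

print(preprocess_text("Scientists Discover That The Moon Is Made Of Cheese"))
# Expected (approximately): "scientists discover moon made cheese"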

# Check for any missing values


df.isnull().sum()

# Display the first few rows after preprocessing


print("\nAfter Preprocessing:")
print(df.head())

# Prepare features and labels
X = df['headline']
y = df['is_sarcastic']

# Split before fitting the tokenizer so it only sees training text
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding
maxlen = 100  # You can adjust this value
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

print(type(X_train))
print(type(X_test))
X_train = X_train.tolist()
X_test = X_test.tolist()
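
To make the tokenization and pre-padding behaviour concrete, a small toy example (indices shown are what Keras assigns for this toy corpus):

# Toy illustration of Tokenizer + pad_sequences
toy = Tokenizer()
toy.fit_on_texts(["cats chase mice", "mice run"])
seqs = toy.texts_to_sequences(["cats chase mice", "mice run"])
print(pad_sequences(seqs, maxlen=4))
# [[0 2 3 1]
#  [0 0 1 4]]   # 'mice' -> 1 (most frequent); zeros pad on the left by default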

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, GRU, Conv1D, GlobalMaxPooling1D, Embedding, SimpleRNN
from tensorflow.keras.utils import to_categorical

# Prepare data
X = df['headline'].values
y = df['is_sarcastic'].values

# Encode labels
le = LabelEncoder()
y = le.fit_transform(y)
y = to_categorical(y)

# Split the dataset into train and test sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Sentence Transformer model


# Note: 'bert-base-nli-mean-tokens' is an older SBERT checkpoint; newer
# general-purpose models such as 'all-MiniLM-L6-v2' are usually preferred
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Generate sentence embeddings


X_train_embeddings = sbert_model.encode(X_train)
X_test_embeddings = sbert_model.encode(X_test)

# Add a trailing feature axis so the 2-D embeddings match the
# (timesteps, features) input shape the models below expect
X_train_embeddings = X_train_embeddings[..., np.newaxis]
X_test_embeddings = X_test_embeddings[..., np.newaxis]
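
A quick shape check; the 768 in the comment assumes the BERT-base checkpoint above:

# Expect (n_train, 768, 1) and (n_test, 768, 1) with this checkpoint
print(X_train_embeddings.shape, X_test_embeddings.shape)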

# Define model architectures using sentence embeddings


def create_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(128, input_shape=input_shape, return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_bilstm_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=input_shape))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_gru_model(input_shape):
    model = Sequential()
    model.add(GRU(128, return_sequences=True, input_shape=input_shape))
    model.add(GRU(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_bigru_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(GRU(128, return_sequences=True), input_shape=input_shape))
    model.add(Bidirectional(GRU(128)))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_cnn_model(input_shape):
    model = Sequential()
    model.add(Conv1D(128, 5, activation='relu', input_shape=input_shape))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_rnn_model(input_shape):
    model = Sequential()
    model.add(SimpleRNN(128, return_sequences=True, input_shape=input_shape))
    model.add(SimpleRNN(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Train and evaluate models


def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)
    print(classification_report(y_test_classes, y_pred_classes))
    cm = confusion_matrix(y_test_classes, y_pred_classes)
    print("Confusion Matrix:\n", cm)
    return model
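
100 epochs with no callback is likely to overfit these small headline vectors; a sketch of adding early stopping (the patience value is an arbitrary choice for illustration):

from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Then pass it in the fit call above: model.fit(..., callbacks=[early_stop])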

# Create a dictionary of models


models = {
    "LSTM": create_lstm_model((X_train_embeddings.shape[1], 1)),
    "Bi-LSTM": create_bilstm_model((X_train_embeddings.shape[1], 1)),
    "GRU": create_gru_model((X_train_embeddings.shape[1], 1)),
    "Bi-GRU": create_bigru_model((X_train_embeddings.shape[1], 1)),
    "CNN": create_cnn_model((X_train_embeddings.shape[1], 1)),
    "RNN": create_rnn_model((X_train_embeddings.shape[1], 1))
}

# Train and evaluate each model


results = {}
for name, model in models.items():
    print(f"Training {name}...")
    trained_model = train_and_evaluate_model(model, X_train_embeddings, y_train, X_test_embeddings, y_test)
    results[name] = trained_model

# Print results
for name, result in results.items():
    print(f"{name} model trained and evaluated.")
