import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, LSTM, Bidirectional, GRU, Conv1D,
                                     MaxPooling1D, Flatten, Dense, Dropout, SimpleRNN)
from tensorflow.keras.optimizers import Adam
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import FastText
import transformers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, Dropout
from torchtext.vocab import GloVe  # assumed source of the GloVe loader; original module path was lost
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
import pandas as pd
import json
import re
import numpy as np
import spacy
import tqdm
import xgboost as xgb
import lightgbm as lgb
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from nltk import pos_tag, word_tokenize
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
# Download NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
# Function to read JSON lines file
def read_json_lines(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return data
# Load the datasets
file1_path = '/kaggle/input/sarcasm/Sarcasm_Headlines_Dataset.json'
file2_path = '/kaggle/input/sarcasm/Sarcasm_Headlines_Dataset_v2.json'
df1 = pd.read_json(file1_path, lines=True)
df2 = pd.read_json(file2_path, lines=True)
# Concatenate the datasets
df = pd.concat([df1, df2], ignore_index=True)
# Preprocessing
df.drop(columns=['article_link'], inplace=True)  # Drop the 'article_link' column
df.dropna(inplace=True)  # Drop any rows with missing values
df['headline'] = df['headline'].str.lower()  # Convert text to lowercase
# Basic text preprocessing
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove punctuation
    text = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Rejoin the filtered tokens so downstream steps receive cleaned strings
    return ' '.join(tokens)
df['headline'] = df['headline'].apply(preprocess_text)
# Check for any missing values
df.isnull().sum()
# Display the first few rows after preprocessing
print("\nAfter Preprocessing:")
print(df.head())
# Example model training (a Logistic Regression baseline sketched below as a placeholder)
X = df['headline']
y = df['is_sarcastic']
# Split into train and test sets before fitting the tokenizer
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
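# Minimal classical baseline, added as a sketch: the listing imports TfidfVectorizer and
# LogisticRegression and mentions a Logistic Regression placeholder but never uses them.
# The hyperparameters below (max_features, ngram_range, max_iter) are illustrative
# assumptions, not taken from the original.
baseline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
    ('clf', LogisticRegression(max_iter=1000)),
])
baseline.fit(X_train, y_train)
print("TF-IDF + LogisticRegression accuracy:",
      accuracy_score(y_test, baseline.predict(X_test)))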
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
# Padding
maxlen = 100 # You can adjust this value
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)
# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print(type(X_train))
print(type(X_test))
X_train = X_train.tolist()
X_test = X_test.tolist()
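# Sketch (not in the original listing): the padded sequences and vocab_size built above
# could feed a trainable Embedding + Bi-LSTM classifier. Layer sizes, epochs and the
# optimizer settings below are illustrative assumptions.
embed_bilstm = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    Bidirectional(LSTM(64)),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
embed_bilstm.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-3),
                     metrics=['accuracy'])
embed_bilstm.fit(X_train_pad, np.asarray(y_train), epochs=5, batch_size=32,
                 validation_split=0.2)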
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Dense, LSTM, Bidirectional, GRU, Conv1D,
                                     GlobalMaxPooling1D, Embedding, SimpleRNN)
from tensorflow.keras.utils import to_categorical
# Prepare data
X = df['headline'].values
y = df['is_sarcastic'].values
# Encode labels
le = LabelEncoder()
y = le.fit_transform(y)
y = to_categorical(y)
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize Sentence Transformer model
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
# Generate sentence embeddings
X_train_embeddings = sbert_model.encode(X_train)
X_test_embeddings = sbert_model.encode(X_test)
# Add a trailing feature axis so the sequence models below receive 3D input
# of shape (samples, embedding_dim, 1), matching input_shape=(embedding_dim, 1)
X_train_embeddings = np.expand_dims(X_train_embeddings, axis=-1)
X_test_embeddings = np.expand_dims(X_test_embeddings, axis=-1)
# Define model architectures using sentence embeddings
def create_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(128, input_shape=input_shape, return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_bilstm_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=input_shape))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_gru_model(input_shape):
    model = Sequential()
    model.add(GRU(128, return_sequences=True, input_shape=input_shape))
    model.add(GRU(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_bigru_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(GRU(128, return_sequences=True), input_shape=input_shape))
    model.add(Bidirectional(GRU(128)))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_cnn_model(input_shape):
    model = Sequential()
    model.add(Conv1D(128, 5, activation='relu', input_shape=input_shape))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_rnn_model(input_shape):
    model = Sequential()
    model.add(SimpleRNN(128, return_sequences=True, input_shape=input_shape))
    model.add(SimpleRNN(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# Train and evaluate models
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)
    print(classification_report(y_test_classes, y_pred_classes))
    cm = confusion_matrix(y_test_classes, y_pred_classes)
    print("Confusion Matrix:\n", cm)
    return model
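# Optional refinement (not in the original listing): with 100 epochs per model, an
# EarlyStopping callback can halt training once the validation loss stops improving;
# the patience value below is an illustrative assumption.
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Usage: model.fit(X_train, y_train, epochs=100, batch_size=32,
#                  validation_split=0.2, callbacks=[early_stop])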
# Create a dictionary of models
models = {
"LSTM": create_lstm_model((X_train_embeddings.shape[1], 1)),
"Bi-LSTM": create_bilstm_model((X_train_embeddings.shape[1], 1)),
"GRU": create_gru_model((X_train_embeddings.shape[1], 1)),
"Bi-GRU": create_bigru_model((X_train_embeddings.shape[1], 1)),
"CNN": create_cnn_model((X_train_embeddings.shape[1], 1)),
"RNN": create_rnn_model((X_train_embeddings.shape[1], 1))
}
# Train and evaluate each model
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    trained_model = train_and_evaluate_model(model, X_train_embeddings, y_train,
                                             X_test_embeddings, y_test)
    results[name] = trained_model
# Print results
for name, result in results.items():
    print(f"{name} model trained and evaluated.")