SC Project Kaggle

Loading Dependencies

!pip install distance

Collecting distance
  Downloading Distance-0.1.3.tar.gz (180 kB)
  Preparing metadata (setup.py) ... done
  Created wheel for distance: filename=Distance-0.1.3-py3-none-any.whl size=16258
  Stored in directory: /root/.cache/pip/wheels/e8/bb/de/f71bf63559ea9a921059a5405806f7ff6ed612a9231c4a9309
Successfully built distance
Installing collected packages: distance
Successfully installed distance-0.1.3

import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import zipfile
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,accuracy_score
from scipy.sparse import hstack,csr_matrix
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
import distance
import nltk
import spacy
import string
import re
from textblob import TextBlob

!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Note: you may need to restart the kernel to use updated packages.

Opening the zip files

zip_path = "/kaggle/input/quora-question-pairs/train.csv.zip"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall()

Loading the dataset

train_df = pd.read_csv("/kaggle/working/train.csv")
test_df = pd.read_csv("/kaggle/input/quora-question-pairs/test.csv")

train_df_req = train_df.iloc[:3000,:]
print(train_df_req.shape)
train_df_req.head()

(3000, 6)

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...
2   2     5     6  How can I increase the speed of my internet co...
3   3     7     8  Why am I mentally very lonely? How can I solve...
4   4     9    10  Which one dissolve in water quikly sugar, salt...

                                           question2  is_duplicate
0  What is the step by step guide to invest in sh...             0
1  What would happen if the Indian government sto...             0
2  How can Internet speed be increased by hacking...             0
3  Find the remainder when [math]23^{24}[/math] i...             0
4            Which fish would survive in salt water?             0

train_df_req = train_df_req.copy()
train_df_req.dropna(inplace=True)
print(train_df_req.shape)
train_df_req.isnull().sum()

(3000, 6)

id 0
qid1 0
qid2 0
question1 0
question2 0
is_duplicate 0
dtype: int64

y = train_df_req['is_duplicate']

Preprocessing the data

# Load spaCy model
nlp = spacy.load('en_core_web_sm')
exclude = string.punctuation

def preprocess(text):
# Convert text to lowercase and strip whitespace
text = text.lower().strip()

# Remove HTML tags


text = re.sub(r'<.*?>', '', text)

# Replace specific symbols


text = text.replace("%", 'percent')
text = text.replace("$", 'dollar')
text = text.replace("@", 'at')
text = text.replace('[math]', " ")

# Replace large numbers with suffixes


text = re.sub(r'([0-9]+)000000000', r'\1b', text)
text = re.sub(r'([0-9]+)000000', r'\1m', text)
text = re.sub(r'([0-9]+)000', r'\1k', text)

# Expand contractions
contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he has",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has",
"I'd": "I had",
"I'd've": "I would have",
"I'll": "I shall",
"I'll've": "I shall have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had",
"it'd've": "it would have",
"it'll": "it shall",
"it'll've": "it shall have",
"it's": "it has",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that had",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall",
"what'll've": "what shall have",
"what're": "what are",
"what's": "what has",
"what've": "what have",
"when's": "when has",
"when've": "when have",
"where'd": "where did",
"where's": "where has",
"where've": "where have",
"who'll": "who shall",
"who'll've": "who shall have",
"who's": "who has",
"who've": "who have",
"why's": "why has",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had",
"you'd've": "you would have",
"you'll": "you shall",
"you'll've": "you shall have",
"you're": "you are",
"you've": "you have"
}

# Replace contractions (note: keys containing capital letters, e.g. "I'd",
# will not match because the text was already lowercased above)
for contraction, expansion in contractions.items():
    text = text.replace(contraction, expansion)

# Remove punctuation
text = text.translate(str.maketrans('', '', exclude))

# Optional spelling correction (TextBlob; slow on large data)
text = str(TextBlob(text).correct())

# Tokenize and lemmatize with spaCy
doc = nlp(text)
tokens = [token.lemma_ for token in doc]

return tokens
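Before applying it to the full columns below, the function can be sanity-checked on a single question. This check is not part of the original notebook, and the exact tokens returned depend on the spaCy model and on TextBlob's spelling correction:

# Illustrative one-off check of preprocess(); the result is a list of
# lowercased, lemmatized tokens (exact values depend on the models used).
sample = "What's the best way to invest $100 in share market?"
print(preprocess(sample))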

train_df_req['question1'] = train_df_req['question1'].apply(preprocess)
train_df_req['question2'] = train_df_req['question2'].apply(preprocess)
train_df_req.head()

   id  qid1  qid2                                          question1  \
0   0     1     2  [what, be, the, step, by, step, guide, to, inv...
1   1     3     4  [what, be, the, story, of, kohinoor, kohinoor,...
2   2     5     6  [how, can, I, increase, the, speed, of, my, in...
3   3     7     8  [why, be, I, mentally, very, lonely, how, can,...
4   4     9    10  [which, one, dissolve, in, water, quickly, sug...

                                           question2  is_duplicate
0  [what, be, the, step, by, step, guide, to, inv...             0
1  [what, would, happen, if, the, indian, governm...             0
2  [how, can, internet, speed, be, increase, by, ...             0
3  [find, the, remainder, when,  , 2324math, be, ...             0
4     [which, fish, would, survive, in, salt, water]             0

Feature Engineering (PART-1) Basic

def cnt_chr(text):
    # Function to count characters
    return len(text) if isinstance(text, str) else 0

def common_words(q1, q2):
    words_q1 = set(q1.lower().split())
    words_q2 = set(q2.lower().split())
    return len(words_q1.intersection(words_q2))

def basic_features(df):
    df['question1'] = df['question1'].fillna('unknown').astype(str)
    df['question2'] = df['question2'].fillna('unknown').astype(str)
    df['len_q1'] = df['question1'].str.len()
    df['len_q2'] = df['question2'].str.len()
    df['diff_len'] = abs(df['len_q1'] - df['len_q2'])
    df['len_word_q1'] = df['question1'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
    df['len_word_q2'] = df['question2'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
    df['common_word'] = df.apply(lambda row: common_words(row['question1'], row['question2']), axis=1)
    df['chr_cnt_q1'] = df['question1'].apply(cnt_chr)
    df['chr_cnt_q2'] = df['question2'].apply(cnt_chr)
    df['total_words'] = df['len_word_q1'] + df['len_word_q2']
    df['word_share'] = df['common_word'] / df['total_words']
    return df

train_preprocessed = basic_features(train_df_req)
train_preprocessed.head()

   id  qid1  qid2                                          question1  \
0   0     1     2  ['what', 'be', 'the', 'step', 'by', 'step', 'g...
1   1     3     4  ['what', 'be', 'the', 'story', 'of', 'kohinoor...
2   2     5     6  ['how', 'can', 'I', 'increase', 'the', 'speed'...
3   3     7     8  ['why', 'be', 'I', 'mentally', 'very', 'lonely...
4   4     9    10  ['which', 'one', 'dissolve', 'in', 'water', 'q...

                                           question2  is_duplicate  len_q1  len_q2  diff_len
0  ['what', 'be', 'the', 'step', 'by', 'step', 'g...             0     108      93        15
1  ['what', 'would', 'happen', 'if', 'the', 'indi...             0      71     123        52
2  ['how', 'can', 'internet', 'speed', 'be', 'inc...             0     113      85        28
3  ['find', 'the', 'remainder', 'when', ' ', '232...             0      82      83         1
4  ['which', 'fish', 'would', 'survive', 'in', 's...             0     114      60        54

   len_word_q1  len_word_q2  common_word  chr_cnt_q1  chr_cnt_q2  total_words  word_share
0           14           12           10         108          93           26    0.384615
1            8           13            3          71         123           21    0.142857
2           14           10            5         113          85           24    0.208333
3           11           11            1          82          83           22    0.045455
4           13            7            3         114          60           20    0.150000

Feature Engineering (PART-2) Fuzzy Features

def fetch_fuzzy_features(row):

q1 = row['question1']
q2 = row['question2']

fuzzy_features = [0.0]*4
fuzzy_features[0] = fuzz.QRatio(q1,q2)
fuzzy_features[1] = fuzz.partial_ratio(q1,q2)
fuzzy_features[2] = fuzz.token_sort_ratio(q1,q2)
fuzzy_features[3] = fuzz.token_set_ratio(q1,q2)
return fuzzy_features

train_fuzz = train_preprocessed.apply(fetch_fuzzy_features, axis=1)
train_preprocessed['fuzz_ratio'] = list(map(lambda x: x[0], train_fuzz))
train_preprocessed['fuzz_partial'] = list(map(lambda x: x[1], train_fuzz))
train_preprocessed['token_sort_ratio'] = list(map(lambda x: x[2], train_fuzz))
train_preprocessed['token_set_ratio'] = list(map(lambda x: x[3], train_fuzz))
train_preprocessed.head(5)

   id  qid1  qid2                                          question1  \
0   0     1     2  ['what', 'be', 'the', 'step', 'by', 'step', 'g...
1   1     3     4  ['what', 'be', 'the', 'story', 'of', 'kohinoor...
2   2     5     6  ['how', 'can', 'I', 'increase', 'the', 'speed'...
3   3     7     8  ['why', 'be', 'I', 'mentally', 'very', 'lonely...
4   4     9    10  ['which', 'one', 'dissolve', 'in', 'water', 'q...

                                           question2  is_duplicate  len_q1  len_q2  diff_len
0  ['what', 'be', 'the', 'step', 'by', 'step', 'g...             0     108      93        15
1  ['what', 'would', 'happen', 'if', 'the', 'indi...             0      71     123        52
2  ['how', 'can', 'internet', 'speed', 'be', 'inc...             0     113      85        28
3  ['find', 'the', 'remainder', 'when', ' ', '232...             0      82      83         1
4  ['which', 'fish', 'would', 'survive', 'in', 's...             0     114      60        54

   len_word_q1  len_word_q2  common_word  chr_cnt_q1  chr_cnt_q2  total_words  word_share
0           14           12           10         108          93           26    0.384615
1            8           13            3          71         123           21    0.142857
2           14           10            5         113          85           24    0.208333
3           11           11            1          82          83           22    0.045455
4           13            7            3         114          60           20    0.150000

   fuzz_ratio  fuzz_partial  token_sort_ratio  token_set_ratio
0          92            99                93              100
1          65            70                60               81
2          49            53                63               73
3          28            37                27               33
4          45            58                46               67

Feature Engineering (PART-3) Token Based

def token_features(row):
    # NOTE: this version is overridden by the simpler token_features defined below,
    # which is the one actually applied to the dataframe.
    q1 = row['question1']
    q2 = row['question2']

    q1_token = q1.split()
    q2_token = q2.split()

    STOP_WORDS = set(stopwords.words('english'))
    STEP = 0.0001
    new_features = [0.0] * 8

    if len(q1_token) == 0 or len(q2_token) == 0:
        return new_features

    q1_non_stop = {word for word in q1_token if word not in STOP_WORDS}
    q2_non_stop = {word for word in q2_token if word not in STOP_WORDS}

    q1_word = {word for word in q1_token if word in STOP_WORDS}
    q2_word = {word for word in q2_token if word in STOP_WORDS}

    common_tokens = len(set(q1_token) & set(q2_token))
    common_non_stop = len(q1_non_stop & q2_non_stop)
    common_word = len(q1_word & q2_word)

    new_features[0] = common_tokens / (min(len(q1_token), len(q2_token)) + STEP)
    new_features[1] = common_tokens / (max(len(q1_token), len(q2_token)) + STEP)
    new_features[2] = common_non_stop / (min(len(q1_non_stop), len(q2_non_stop)) + STEP)
    new_features[3] = common_non_stop / (max(len(q1_non_stop), len(q2_non_stop)) + STEP)
    new_features[4] = common_word / (min(len(q1_word), len(q2_word)) + STEP)
    new_features[5] = common_word / (max(len(q1_word), len(q2_word)) + STEP)

    new_features[6] = q1_token[0] == q2_token[0]
    new_features[7] = q1_token[-1] == q2_token[-1]
    return new_features

def token_features(row):
    q1 = row['question1']
    q2 = row['question2']

    q1_token = q1.split()
    q2_token = q2.split()

    # Example token features (replace with your actual logic)
    cts = len(q1_token)
    ctl = len(q2_token)
    css = sum(1 for word in q1_token if word in q2_token)
    csl = sum(len(word) for word in q1_token if word in q2_token)
    cws = len(set(q1_token) - STOP_WORDS)
    cwl = len(set(q2_token) - STOP_WORDS)
    fws = q1_token[0] if q1_token else None
    lws = q2_token[-1] if q2_token else None

    return [cts, ctl, css, csl, cws, cwl, fws, lws]

import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')
STOP_WORDS = set(stopwords.words('english'))

# Apply the function
token_features_df = train_preprocessed.apply(token_features, axis=1, result_type='expand')

# Assign columns
train_preprocessed[['cts', 'ctl', 'css', 'csl', 'cws', 'cwl', 'fws', 'lws']] = token_features_df

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

print(train_preprocessed.head())

   id  qid1  qid2                                          question1  \
0   0     1     2  ['what', 'be', 'the', 'step', 'by', 'step', 'g...
1   1     3     4  ['what', 'be', 'the', 'story', 'of', 'kohinoor...
2   2     5     6  ['how', 'can', 'I', 'increase', 'the', 'speed'...
3   3     7     8  ['why', 'be', 'I', 'mentally', 'very', 'lonely...
4   4     9    10  ['which', 'one', 'dissolve', 'in', 'water', 'q...

                                           question2  is_duplicate  len_q1  \
0  ['what', 'be', 'the', 'step', 'by', 'step', 'g...             0     108
1  ['what', 'would', 'happen', 'if', 'the', 'indi...             0      71
2  ['how', 'can', 'internet', 'speed', 'be', 'inc...             0     113
3  ['find', 'the', 'remainder', 'when', ' ', '232...             0      82
4  ['which', 'fish', 'would', 'survive', 'in', 's...             0     114

   len_q2  diff_len  len_word_q1  ...  token_sort_ratio  token_set_ratio  cts  \
0      93        15           14  ...                93              100   14
1     123        52            8  ...                60               81    8
2      85        28           14  ...                63               73   14
3      83         1           11  ...                27               33   11
4      60        54           13  ...                46               67   13

   ctl  css  csl  cws  cwl        fws        lws
0   12   12   78   12   11  ['what',  'market']
1   13    4   36    7   11  ['what',    'back']
2   10    5   43   14   10   ['how',     'des']
3   11    1    5   10   11   ['why',    '2423']
4    7    3   21   13    7  ['which',  'water']

[5 rows x 28 columns]

Feature Engineering (PART-4) Distance Based

def distance_features(row):
    q1 = row['question1']
    q2 = row['question2']

    q1_token = q1.split()
    q2_token = q2.split()

    length_features = [0.0] * 2

    if len(q1_token) == 0 or len(q2_token) == 0:
        return length_features

    length_features[0] = (len(q1_token) + len(q2_token)) / 2

    # Longest common substring(s) between the two questions
    strs = list(distance.lcsubstrings(q1, q2))
    if strs:  # ensure strs is not empty to avoid IndexError
        length_features[1] = len(strs[0]) / (min(len(q1), len(q2)) + 0.0001)
    return length_features

length_features = train_preprocessed.apply(distance_features, axis=1)

# Unpack length_features into separate columns
train_preprocessed['avg_words'] = length_features.apply(lambda x: x[0])
train_preprocessed['com_sub'] = length_features.apply(lambda x: x[1])
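The next cell prints a frame called x_graph, but the cell that creates it did not survive in this export. Judging from the dtypes listing below (25 columns, with the id, qid and raw question columns gone), it was presumably built by dropping those columns from train_preprocessed; a minimal sketch under that assumption:

# Assumed reconstruction of the missing cell: keep the engineered features and the
# label, drop identifiers and raw question text (yields the 25 columns printed below).
x_graph = train_preprocessed.drop(columns=['id', 'qid1', 'qid2', 'question1', 'question2'])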

print(x_graph.dtypes)

is_duplicate int64
len_q1 int64
len_q2 int64
diff_len int64
len_word_q1 int64
len_word_q2 int64
common_word int64
chr_cnt_q1 int64
chr_cnt_q2 int64
total_words int64
word_share float64
fuzz_ratio int64
fuzz_partial int64
token_sort_ratio int64
token_set_ratio int64
cts int64
ctl int64
css int64
csl int64
cws int64
cwl int64
fws object
lws object
avg_words float64
com_sub float64
dtype: object

Visualisation

print(x_graph.info())
print(x_graph.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 25 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 is_duplicate 3000 non-null int64
1 len_q1 3000 non-null int64
2 len_q2 3000 non-null int64
3 diff_len 3000 non-null int64
4 len_word_q1 3000 non-null int64
5 len_word_q2 3000 non-null int64
6 common_word 3000 non-null int64
7 chr_cnt_q1 3000 non-null int64
8 chr_cnt_q2 3000 non-null int64
9 total_words 3000 non-null int64
10 word_share 3000 non-null float64
11 fuzz_ratio 3000 non-null int64
12 fuzz_partial 3000 non-null int64
13 token_sort_ratio 3000 non-null int64
14 token_set_ratio 3000 non-null int64
15 cts 3000 non-null int64
16 ctl 3000 non-null int64
17 css 3000 non-null int64
18 csl 3000 non-null int64
19 cws 3000 non-null int64
20 cwl 3000 non-null int64
21 fws 3000 non-null object
22 lws 3000 non-null object
23 avg_words 3000 non-null float64
24 com_sub 3000 non-null float64
dtypes: float64(3), int64(20), object(2)
memory usage: 586.1+ KB
None
   is_duplicate  len_q1  len_q2  diff_len  len_word_q1  len_word_q2  \
0             0     108      93        15           14           12
1             0      71     123        52            8           13
2             0     113      85        28           14           10
3             0      82      83         1           11           11
4             0     114      60        54           13            7

   common_word  chr_cnt_q1  chr_cnt_q2  total_words  ...  cts  ctl  css  csl  \
0           10         108          93           26  ...   14   12   12   78
1            3          71         123           21  ...    8   13    4   36
2            5         113          85           24  ...   14   10    5   43
3            1          82          83           22  ...   11   11    1    5
4            3         114          60           20  ...   13    7    3   21

   cws  cwl        fws        lws  avg_words   com_sub
0   12   11  ['what',  'market']       13.0  0.989246
1    7   11  ['what',    'back']       10.5  0.507042
2   14   10   ['how',     'des']       12.0  0.188235
3   10   11   ['why',    '2423']       11.0  0.121951
4   13    7  ['which',  'water']       10.0  0.200000

[5 rows x 25 columns]

import ast  # To safely parse string representations of lists

# Function to handle string-to-list conversion
def safely_convert_to_list(value):
    try:
        return ast.literal_eval(value)  # Safely evaluate the string as a Python literal
    except (ValueError, SyntaxError):
        return []  # Return an empty list if conversion fails

# Apply conversion for `fws` and `lws` columns
x_graph['fws'] = x_graph['fws'].apply(safely_convert_to_list)
x_graph['lws'] = x_graph['lws'].apply(safely_convert_to_list)

# Example: replace the lists with their lengths (or derive other features as needed)
x_graph['fws_length'] = x_graph['fws'].apply(len)
x_graph['lws_length'] = x_graph['lws'].apply(len)

# Drop the original `fws` and `lws` columns after feature extraction
x_graph = x_graph.drop(columns=['fws', 'lws'])

# Ensure all remaining columns are numeric
assert x_graph.select_dtypes(include=['object']).empty, "Non-numeric columns remain!"

# Proceed with scaling and t-SNE
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_graph_scaled = scaler.fit_transform(x_graph)

from sklearn.manifold import TSNE

tsne2d = TSNE(
    n_components=2,
    init='random',
    random_state=10,
    method="barnes_hut",
    n_iter=1000,
    verbose=2,
    angle=0.5
).fit_transform(x_graph_scaled)

[t-SNE] Computing 91 nearest neighbors...


[t-SNE] Indexed 3000 samples in 0.001s...
[t-SNE] Computed neighbors for 3000 samples in 0.281s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3000
[t-SNE] Computed conditional probabilities for sample 2000 / 3000
[t-SNE] Computed conditional probabilities for sample 3000 / 3000
[t-SNE] Mean sigma: 0.783743
[t-SNE] Computed conditional probabilities in 0.086s
[t-SNE] Iteration 50: error = 82.6931076, gradient norm = 0.0665216
(50 iterations in 0.727s)
[t-SNE] Iteration 100: error = 71.0447235, gradient norm = 0.0057538
(50 iterations in 0.678s)
[t-SNE] Iteration 150: error = 70.4523315, gradient norm = 0.0018530
(50 iterations in 0.684s)
[t-SNE] Iteration 200: error = 70.3037033, gradient norm = 0.0006665
(50 iterations in 0.704s)
[t-SNE] Iteration 250: error = 70.2524109, gradient norm = 0.0003141
(50 iterations in 0.718s)
[t-SNE] KL divergence after 250 iterations with early exaggeration:
70.252411
[t-SNE] Iteration 300: error = 1.8783119, gradient norm = 0.0203971
(50 iterations in 0.630s)
[t-SNE] Iteration 350: error = 1.4992595, gradient norm = 0.0166862
(50 iterations in 0.605s)
[t-SNE] Iteration 400: error = 1.3437542, gradient norm = 0.0140167
(50 iterations in 0.586s)
[t-SNE] Iteration 450: error = 1.2620921, gradient norm = 0.0123222
(50 iterations in 0.600s)
[t-SNE] Iteration 500: error = 1.2126354, gradient norm = 0.0108801
(50 iterations in 0.591s)
[t-SNE] Iteration 550: error = 1.1799737, gradient norm = 0.0098113
(50 iterations in 0.599s)
[t-SNE] Iteration 600: error = 1.1567816, gradient norm = 0.0087193
(50 iterations in 0.610s)
[t-SNE] Iteration 650: error = 1.1411700, gradient norm = 0.0077777
(50 iterations in 0.691s)
[t-SNE] Iteration 700: error = 1.1293473, gradient norm = 0.0069052
(50 iterations in 0.611s)
[t-SNE] Iteration 750: error = 1.1210830, gradient norm = 0.0058587
(50 iterations in 0.621s)
[t-SNE] Iteration 800: error = 1.1149733, gradient norm = 0.0045917
(50 iterations in 0.612s)
[t-SNE] Iteration 850: error = 1.1106164, gradient norm = 0.0041198
(50 iterations in 0.623s)
[t-SNE] Iteration 900: error = 1.1070890, gradient norm = 0.0038037
(50 iterations in 0.617s)
[t-SNE] Iteration 950: error = 1.1039897, gradient norm = 0.0032104
(50 iterations in 0.613s)
[t-SNE] Iteration 1000: error = 1.1012192, gradient norm = 0.0031819
(50 iterations in 0.619s)
[t-SNE] KL divergence after 1000 iterations: 1.101219

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Combine tsne2d and the labels (y) into a DataFrame for easier plotting
tsne_df = pd.DataFrame(tsne2d, columns=['Dimension 1', 'Dimension 2'])
tsne_df['is_duplicate'] = y.reset_index(drop=True)

# Set color palette (0 for non-duplicate, 1 for duplicate)
palette = {0: 'blue', 1: 'red'}

# Create the plot
plt.figure(figsize=(10, 8))
sns.scatterplot(
    x='Dimension 1',
    y='Dimension 2',
    hue='is_duplicate',
    data=tsne_df,
    palette=palette,
    alpha=0.7,
    edgecolor=None
)

# Add title and labels
plt.title('t-SNE Visualization of Text Data', fontsize=15)
plt.xlabel('Dimension 1', fontsize=12)
plt.ylabel('Dimension 2', fontsize=12)
plt.legend(title='Is Duplicate', loc='best')
plt.grid(True)

# Show the plot
plt.show()

Vectorizing the data

x_combined = [''.join(pair.astype(str)) for pair in train_preprocessed[['question1', 'question2']].values]
len(x_combined)

3000

#tfv = TfidfVectorizer(min_df=3, stop_words='english', use_idf=1, smooth_idf=1, sublinear_tf=1)
#tfv.fit(list(x_combined))
#train_tfv = tfv.transform(x_combined)

from sklearn.feature_extraction.text import TfidfVectorizer

# Correct usage with boolean True/False for use_idf; the vectorizer is kept under
# the name tfv so the same fitted object can transform the test questions later.
tfv = TfidfVectorizer(min_df=3, stop_words='english', use_idf=True, smooth_idf=True, sublinear_tf=True)
tfv.fit(list(x_combined))
train_tfv = tfv.transform(x_combined)
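As an optional check (not part of the original run), the width of the combined feature matrix built below can be anticipated from the fitted vectorizer: the TF-IDF vocabulary supplies one column per term and the 12 hand-crafted numeric features supply the rest, which accounts for the 1652 columns shown in the hstack output further down.

# Optional sanity check (assumed, not in the original notebook output)
print(len(tfv.vocabulary_))       # TF-IDF vocabulary size (expected 1640 for this run)
print(len(tfv.vocabulary_) + 12)  # plus 12 numeric features = 1652 columns below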

train_preprocessed.columns

Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate',
       'len_q1', 'len_q2', 'diff_len', 'len_word_q1', 'len_word_q2',
       'common_word', 'chr_cnt_q1', 'chr_cnt_q2', 'total_words', 'word_share',
       'fuzz_ratio', 'fuzz_partial', 'token_sort_ratio', 'token_set_ratio',
       'cts', 'ctl', 'css', 'csl', 'cws', 'cwl', 'fws', 'lws', 'avg_words',
       'com_sub'],
      dtype='object')

new_features = train_df_req[['len_word_q1', 'len_word_q2', 'len_q1', 'len_q2', 'diff_len',
                             'common_word', 'chr_cnt_q1', 'chr_cnt_q2', 'fuzz_ratio',
                             'fuzz_partial', 'token_sort_ratio', 'token_set_ratio']].values
new_features_sparse = csr_matrix(new_features)

X = hstack([train_tfv,new_features_sparse])
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
    with 52182 stored elements and shape (3000, 1652)>
  Coords        Values
(0, 1407) 0.6277962555725236
(0, 1319) 0.4406139030015651
(0, 872) 0.4318582783379746
(0, 756) 0.44544020947602825
(0, 722) 0.16374583104237736
(0, 1640) 14.0
(0, 1641) 12.0
(0, 1642) 108.0
(0, 1643) 93.0
(0, 1644) 15.0
(0, 1645) 10.0
(0, 1646) 108.0
(0, 1647) 93.0
(0, 1648) 92.0
(0, 1649) 99.0
(0, 1650) 93.0
(0, 1651) 100.0
(1, 1414) 0.46662060587488785
(1, 1406) 0.555343927824952
(1, 723) 0.38072024578347957
(1, 657) 0.383421272771835
(1, 631) 0.42649566707939707
(1, 1640) 8.0
(1, 1641) 13.0
(1, 1642) 71.0
: :
(2998, 1643) 99.0
(2998, 1645) 12.0
(2998, 1646) 99.0
(2998, 1647) 99.0
(2998, 1648) 96.0
(2998, 1649) 96.0
(2998, 1650) 90.0
(2998, 1651) 95.0
(2999, 1559) 0.2896838949855595
(2999, 1364) 0.2919619337820428
(2999, 676) 0.6318210832073798
(2999, 149) 0.37316370984265435
(2999, 55) 0.5407332380012957
(2999, 1640) 7.0
(2999, 1641) 13.0
(2999, 1642) 49.0
(2999, 1643) 106.0
(2999, 1644) 57.0
(2999, 1645) 3.0
(2999, 1646) 49.0
(2999, 1647) 106.0
(2999, 1648) 53.0
(2999, 1649) 69.0
(2999, 1650) 39.0
(2999, 1651) 71.0

xtrain, xvalid, ytrain, yvalid = train_test_split(X, y,
                                                  stratify=y,
                                                  random_state=42,
                                                  test_size=0.1,
                                                  shuffle=True)

Training the model

model_lr = LogisticRegression(penalty='l2', solver='liblinear')
model_lr.fit(xtrain, ytrain)
pred = model_lr.predict(xvalid)
print(accuracy_score(yvalid, pred))

0.74

pred_proba_lr = model_lr.predict_proba(xvalid)[:,1]
print(log_loss(yvalid, pred_proba_lr))

0.5029506557087635

from xgboost import XGBClassifier

model_xgb = XGBClassifier()
model_xgb.fit(xtrain,ytrain)
pred_prob_xgb = model_xgb.predict_proba(xvalid)[:,1]
print(log_loss(yvalid,pred_prob_xgb))

0.4913228531206017

print(accuracy_score(yvalid,model_xgb.predict(xvalid)))

0.7466666666666667
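pickle is imported at the top of the notebook but never used; if the intention was to persist the better-scoring model for later reuse, a minimal sketch would be (the file name is illustrative, not from the original notebook):

# Hypothetical persistence step using the pickle import from the top of the notebook.
with open('model_xgb.pkl', 'wb') as f:   # 'model_xgb.pkl' is an assumed file name
    pickle.dump(model_xgb, f)
# The model could later be restored with: pickle.load(open('model_xgb.pkl', 'rb'))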

Making Prediction

!pip install utils

Collecting utils
  Downloading utils-1.0.2.tar.gz (13 kB)
  Preparing metadata (setup.py) ... done
  Created wheel for utils: filename=utils-1.0.2-py2.py3-none-any.whl size=13906
  Stored in directory: /root/.cache/pip/wheels/b8/39/f5/9d0ca31dba85773ececf0a7f5469f18810e1c8a8ed9da28ca7
Successfully built utils
Installing collected packages: utils
Successfully installed utils-1.0.2

def preprocess_questions(df):
    # Example preprocessing steps; modify according to your requirements

    # Lowercase the text
    df['question1'] = df['question1'].str.lower()
    df['question2'] = df['question2'].str.lower()

    # Remove punctuation, numbers and other unwanted characters
    df['question1'] = df['question1'].str.replace(r'[^a-zA-Z\s]', '', regex=True)
    df['question2'] = df['question2'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

    # Tokenize (requires the NLTK 'punkt' tokenizer data)
    from nltk.tokenize import word_tokenize
    df['question1'] = df['question1'].apply(word_tokenize)
    df['question2'] = df['question2'].apply(word_tokenize)

    return df

test_preprocessed = preprocess_questions(test_df)
test_preprocessed.head()

                                           question1  \
0  [how, does, the, surface, pro, himself, compar...
1  [should, i, have, a, hair, transplant, at, age...
2  [what, but, is, the, best, way, to, send, mone...
3                    [which, food, not, emulsifiers]
4                 [how, aberystwyth, start, reading]

                                           question2  fuzz_ratio  fuzz_partial  \
0  [why, did, microsoft, choose, core, m, and, no...          47            45
1  [how, much, cost, does, hair, transplant, requ...          50            57
2                [what, you, send, money, to, china]          60            82
3                               [what, foods, fibre]          52            56
4               [how, their, can, i, start, reading]          69            69

   token_sort_ratio  token_set_ratio  len_q1  len_q2  diff_len  len_word_q1  \
0                55               57      55      64         9           10
1                60               82      62      42        20           13
2                55               92      59      28        31           14
3                52               52      26      16        10            4
4                66               74      29      29         0            4

   len_word_q2  common_word  chr_cnt_q1  chr_cnt_q2  total_words  word_share
0           13            2          55          64           23    0.086957
1            7            5          62          42           20    0.250000
2            6            5          59          28           20    0.250000
3            3            0          26          16            7    0.000000
4            6            3          29          29           10    0.300000

print(test_preprocessed.head())
print(test_preprocessed.dtypes)

                                           question1  \
0  [how, does, the, surface, pro, himself, compar...
1  [should, i, have, a, hair, transplant, at, age...
2  [what, but, is, the, best, way, to, send, mone...
3                    [which, food, not, emulsifiers]
4                 [how, aberystwyth, start, reading]

                                           question2  fuzz_ratio  fuzz_partial  \
0  [why, did, microsoft, choose, core, m, and, no...          47            45
1  [how, much, cost, does, hair, transplant, requ...          50            57
2                [what, you, send, money, to, china]          60            82
3                               [what, foods, fibre]          52            56
4               [how, their, can, i, start, reading]          69            69

   token_sort_ratio  token_set_ratio  len_q1  len_q2  diff_len  len_word_q1  \
0                55               57      55      64         9           10
1                60               82      62      42        20           13
2                55               92      59      28        31           14
3                52               52      26      16        10            4
4                66               74      29      29         0            4

   len_word_q2  common_word  chr_cnt_q1  chr_cnt_q2  total_words  word_share
0           13            2          55          64           23    0.086957
1            7            5          62          42           20    0.250000
2            6            5          59          28           20    0.250000
3            3            0          26          16            7    0.000000
4            6            3          29          29           10    0.300000
question1 object
question2 object
fuzz_ratio int64
fuzz_partial int64
token_sort_ratio int64
token_set_ratio int64
len_q1 int64
len_q2 int64
diff_len int64
len_word_q1 int64
len_word_q2 int64
common_word int64
chr_cnt_q1 int64
chr_cnt_q2 int64
total_words int64
word_share float64
dtype: object

from joblib import Parallel, delayed

def fetch_fuzzy_features(row):
    # Ensure that row['question1'] and row['question2'] are valid strings
    q1 = row.get('question1', '')  # Default to empty string if the value is missing
    q2 = row.get('question2', '')  # Default to empty string if the value is missing

    # Ensure both q1 and q2 are strings
    if isinstance(q1, (list, pd.Series, np.ndarray)):  # Check if it's an array or list
        q1 = str(q1[0]) if len(q1) > 0 else ''  # Convert the first element to a string
    elif not isinstance(q1, str):
        q1 = str(q1) if not pd.isna(q1) else ''

    if isinstance(q2, (list, pd.Series, np.ndarray)):  # Check if it's an array or list
        q2 = str(q2[0]) if len(q2) > 0 else ''  # Convert the first element to a string
    elif not isinstance(q2, str):
        q2 = str(q2) if not pd.isna(q2) else ''

    # Initialize fuzzy features as a list of zeros
    fuzzy_features = [0.0] * 4

    # Ensure the results are scalars by converting to float
    fuzzy_features[0] = float(fuzz.QRatio(q1, q2))
    fuzzy_features[1] = float(fuzz.partial_ratio(q1, q2))
    fuzzy_features[2] = float(fuzz.token_sort_ratio(q1, q2))
    fuzzy_features[3] = float(fuzz.token_set_ratio(q1, q2))
    return fuzzy_features

test_fuzz = Parallel(n_jobs=-1)(delayed(fetch_fuzzy_features)(row) for _, row in test_preprocessed.iterrows())

test_preprocessed['fuzz_ratio'] = list(map(lambda x: x[0], test_fuzz))
test_preprocessed['fuzz_partial'] = list(map(lambda x: x[1], test_fuzz))
test_preprocessed['token_sort_ratio'] = list(map(lambda x: x[2], test_fuzz))
test_preprocessed['token_set_ratio'] = list(map(lambda x: x[3], test_fuzz))
test_preprocessed.head(5)

                                           question1  \
0  [how, does, the, surface, pro, himself, compar...
1  [should, i, have, a, hair, transplant, at, age...
2  [what, but, is, the, best, way, to, send, mone...
3                    [which, food, not, emulsifiers]
4                 [how, aberystwyth, start, reading]

                                           question2  fuzz_ratio  fuzz_partial  \
0  [why, did, microsoft, choose, core, m, and, no...        33.0          40.0
1  [how, much, cost, does, hair, transplant, requ...        44.0          67.0
2                [what, you, send, money, to, china]       100.0         100.0
3                               [what, foods, fibre]        44.0          50.0
4               [how, their, can, i, start, reading]       100.0         100.0

   token_sort_ratio  token_set_ratio  len_q1  len_q2  diff_len  len_word_q1  \
0              33.0             33.0      55      64         9           10
1              44.0             44.0      62      42        20           13
2             100.0            100.0      59      28        31           14
3              44.0             44.0      26      16        10            4
4             100.0            100.0      29      29         0            4

   len_word_q2  common_word  chr_cnt_q1  chr_cnt_q2  total_words  word_share
0           13            2          55          64           23    0.086957
1            7            5          62          42           20    0.250000
2            6            5          59          28           20    0.250000
3            3            0          26          16            7    0.000000
4            6            3          29          29           10    0.300000

test_txt = test_preprocessed[['question1','question2']].values
test_txt

array([[list(['how', 'does', 'the', 'surface', 'pro', 'himself',


'compare', 'with', 'ipad', 'pro']),
list(['why', 'did', 'microsoft', 'choose', 'core', 'm', 'and',
'not', 'core', 'i', 'home', 'surface', 'pro'])],
[list(['should', 'i', 'have', 'a', 'hair', 'transplant', 'at',
'age', 'how', 'much', 'would', 'it', 'cost']),
list(['how', 'much', 'cost', 'does', 'hair', 'transplant',
'require'])],
[list(['what', 'but', 'is', 'the', 'best', 'way', 'to', 'send',
'money', 'from', 'china', 'to', 'the', 'us']),
list(['what', 'you', 'send', 'money', 'to', 'china'])],
...,
[list(['what', 'are', 'some', 'famous', 'romanian', 'drinks',
'alcoholic', 'nonalcoholic']),
list(['can', 'a', 'nonalcoholic', 'restaurant', 'be', 'a',
'huge', 'success'])],
[list(['what', 'were', 'the', 'best', 'and', 'worst', 'things',
'about', 'public', 'transit', 'in', 'proddatur', 'andhra', 'pradesh',
'india', 'how', 'could', 'it', 'be', 'improved']),
list(['what', 'are', 'the', 'best', 'and', 'worst', 'things',
'examination', 'public', 'transit', 'in', 'visakhapatnam', 'andhra',
'pradesh', 'india', 'how', 'could', 'it', 'be', 'improved'])],
[list(['what', 'is', 'the', 'best', 'medication', 'equation',
'erectile', 'dysfunction']),
list(['how', 'do', 'i', 'out', 'get', 'rid', 'of', 'erectile',
'dysfunction'])]],
dtype=object)

test_combined = [
''.join(map(str, pair)) if isinstance(pair, (list, np.ndarray))
else str(pair)
for pair in test_txt
]

test_tfv = tfv.transform(list(test_combined))
test_df = basic_features(test_df)

print(test_df.columns)

Index(['question1', 'question2', 'fuzz_ratio', 'fuzz_partial',
       'token_sort_ratio', 'token_set_ratio', 'len_q1', 'len_q2', 'diff_len',
       'len_word_q1', 'len_word_q2', 'common_word', 'chr_cnt_q1', 'chr_cnt_q2',
       'total_words', 'word_share'],
      dtype='object')

missing_columns = [col for col in ['len_word_q1', 'len_word_q2', 'len_q1', 'len_q2', 'diff_len',
                                   'common_word', 'chr_cnt_q1', 'chr_cnt_q2', 'fuzz_ratio',
                                   'fuzz_partial', 'token_sort_ratio', 'token_set_ratio']
                   if col not in test_df.columns]
print("Missing columns:", missing_columns)

Missing columns: []

new_features_test = test_df[['len_word_q1', 'len_word_q2', 'len_q1', 'len_q2', 'diff_len',
                             'common_word', 'chr_cnt_q1', 'chr_cnt_q2', 'fuzz_ratio',
                             'fuzz_partial', 'token_sort_ratio', 'token_set_ratio']].values
new_features_sparse_test = csr_matrix(new_features_test)

X_test = hstack([test_tfv,new_features_sparse_test])
print(X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
    with 34566883 stored elements and shape (2345796, 1652)>
  Coords        Values
(0, 1445) 0.5164534171421614
(0, 1113) 0.518586360973459
(0, 902) 0.26453386726631267
(0, 688) 0.2426570971470472
(0, 340) 0.47852689884413857
(0, 313) 0.21820394289418196
(0, 277) 0.2426570971470472
(0, 1640) 10.0
(0, 1641) 13.0
(0, 1642) 85.0
(0, 1643) 103.0
(0, 1644) 18.0
(0, 1645) 2.0
(0, 1646) 85.0
(0, 1647) 103.0
(0, 1648) 33.0
(0, 1649) 40.0
(0, 1650) 33.0
(0, 1651) 33.0
(1, 1228) 0.3802335504206669
(1, 653) 0.6363849818186401
(1, 343) 0.5636045185802993
(1, 68) 0.3643988867793811
(1, 1640) 13.0
(1, 1641) 7.0
: :
(2345794, 1641) 20.0
(2345794, 1642) 176.0
(2345794, 1643) 185.0
(2345794, 1644) 9.0
(2345794, 1645) 17.0
(2345794, 1646) 176.0
(2345794, 1647) 185.0
(2345794, 1648) 100.0
(2345794, 1649) 100.0
(2345794, 1650) 100.0
(2345794, 1651) 100.0
(2345795, 1246) 0.6885531799258099
(2345795, 499) 0.7251858509472282
(2345795, 1640) 8.0
(2345795, 1641) 9.0
(2345795, 1642) 82.0
(2345795, 1643) 72.0
(2345795, 1644) 10.0
(2345795, 1645) 2.0
(2345795, 1646) 82.0
(2345795, 1647) 72.0
(2345795, 1648) 29.0
(2345795, 1649) 33.0
(2345795, 1650) 29.0
(2345795, 1651) 29.0

pred_test_lr = model_lr.predict_proba(X_test)
pred_test_lr

array([[0.88149224, 0.11850776],
[0.92167437, 0.07832563],
[0.39366431, 0.60633569],
...,
[0.9511523 , 0.0488477 ],
[0.15301288, 0.84698712],
[0.89131706, 0.10868294]])
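The notebook stops at the raw probability array. For the Kaggle competition, the usual final step would be to write these probabilities to a submission file keyed by test_id. A minimal sketch, assuming the original test.csv (which carries a test_id column) is re-read for the ids, since test_df above no longer shows that column; the output file name is illustrative:

# Hypothetical submission step (not in the original notebook).
test_ids = pd.read_csv("/kaggle/input/quora-question-pairs/test.csv", usecols=['test_id'])
submission = pd.DataFrame({
    'test_id': test_ids['test_id'],
    'is_duplicate': pred_test_lr[:, 1],   # probability of the duplicate class
})
submission.to_csv('submission.csv', index=False)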
