SC Project Kaggle
Collecting distance
  Downloading Distance-0.1.3.tar.gz (180 kB)
  Preparing metadata (setup.py) ... done
  Created wheel for distance: filename=Distance-0.1.3-py3-none-any.whl size=16258 sha256=4242bc9c5e48479c0c4838743c3e3d445c13c35d607276776e841d6aeb693e96
  Stored in directory: /root/.cache/pip/wheels/e8/bb/de/f71bf63559ea9a921059a5405806f7ff6ed612a9231c4a9309
Successfully built distance
Installing collected packages: distance
Successfully installed distance-0.1.3
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import zipfile
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,accuracy_score
from scipy.sparse import hstack,csr_matrix
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
import distance
import nltk
import spacy
import string
import re
from textblob import TextBlob
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Note: you may need to restart the kernel to use updated packages.
zip_path = "/kaggle/input/quora-question-pairs/train.csv.zip"
# Extract the zipped training data into the working directory before reading it
with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall("/kaggle/working")
train_df = pd.read_csv("/kaggle/working/train.csv")
test_df = pd.read_csv("/kaggle/input/quora-question-pairs/test.csv")
train_df_req = train_df.iloc[:3000,:]
print(train_df_req.shape)
train_df_req.head()
(3000, 6)
   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...

                                           question2  is_duplicate
0  What is the step by step guide to invest in sh...             0
1  What would happen if the Indian government sto...             0
2  How can Internet speed be increased by hacking...             0
3  Find the remainder when [math]23^{24}[/math] i...             0
4            Which fish would survive in salt water?             0
train_df_req = train_df_req.copy()
train_df_req.dropna(inplace=True)
print(train_df_req.shape)
train_df_req.isnull().sum()
(3000, 6)
id 0
qid1 0
qid2 0
question1 0
question2 0
is_duplicate 0
dtype: int64
y = train_df_req['is_duplicate']
nlp = spacy.load("en_core_web_sm")  # spaCy pipeline, used below for lemmatization

def preprocess(text):
# Convert text to lowercase and strip whitespace
text = text.lower().strip()
# Expand contractions
contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he has",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has",
"I'd": "I had",
"I'd've": "I would have",
"I'll": "I shall",
"I'll've": "I shall have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had",
"it'd've": "it would have",
"it'll": "it shall",
"it'll've": "it shall have",
"it's": "it has",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that had",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall",
"what'll've": "what shall have",
"what're": "what are",
"what's": "what has",
"what've": "what have",
"when's": "when has",
"when've": "when have",
"where'd": "where did",
"where's": "where has",
"where've": "where have",
"who'll": "who shall",
"who'll've": "who shall have",
"who's": "who has",
"who've": "who have",
"why's": "why has",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had",
"you'd've": "you would have",
"you'll": "you shall",
"you'll've": "you shall have",
"you're": "you are",
"you've": "you have"
}
    # Replace contractions (the text is already lowercased, so match on
    # lowercased keys)
    for contraction, expansion in contractions.items():
        text = text.replace(contraction.lower(), expansion.lower())
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Lemmatize with spaCy and return the list of token lemmas
    tokens = [token.lemma_ for token in nlp(text)]
    return tokens
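As a quick sanity check, the function can be run on a single question; the output below is illustrative, since the exact lemmas depend on the spaCy model:

sample = "What's the step by step guide to invest in share market?"
print(preprocess(sample))
# e.g. ['what', 'have', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market']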
train_df_req['question1'] = train_df_req['question1'].apply(preprocess)
train_df_req['question2'] = train_df_req['question2'].apply(preprocess)
train_df_req.head()
   id  qid1  qid2                                          question1  \
0   0     1     2  [what, be, the, step, by, step, guide, to, inv...

                                           question2  is_duplicate
0  [what, be, the, step, by, step, guide, to, inv...             0
1  [what, would, happen, if, the, indian, governm...             0
2  [how, can, internet, speed, be, increase, by, ...             0
3  [find, the, remainder, when, , 2324math, be, ...              0
4     [which, fish, would, survive, in, salt, water]             0
def cnt_chr(text):
    # Count characters in a question string
    return len(text) if isinstance(text, str) else 0

def common_words(q1, q2):
    # Count the distinct words shared by the two questions
    # (this helper was referenced but not defined in the original cell)
    w1 = set(str(q1).lower().split())
    w2 = set(str(q2).lower().split())
    return len(w1 & w2)

def basic_features(df):
    df['question1'] = df['question1'].fillna('unknown').astype(str)
    df['question2'] = df['question2'].fillna('unknown').astype(str)
    df['len_q1'] = df['question1'].str.len()
    df['len_q2'] = df['question2'].str.len()
    df['diff_len'] = abs(df['len_q1'] - df['len_q2'])
    df['len_word_q1'] = df['question1'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
    df['len_word_q2'] = df['question2'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
    df['common_word'] = df.apply(lambda row: common_words(row['question1'], row['question2']), axis=1)
    df['chr_cnt_q1'] = df['question1'].apply(cnt_chr)
    df['chr_cnt_q2'] = df['question2'].apply(cnt_chr)
    df['total_words'] = df['len_word_q1'] + df['len_word_q2']
    df['word_share'] = df['common_word'] / df['total_words']
    return df
train_preprocessed = basic_features(train_df_req)
train_preprocessed.head()
   id  qid1  qid2                                          question1  \
0   0     1     2  ['what', 'be', 'the', 'step', 'by', 'step', 'g...

[5 rows x 16 columns]
def fetch_fuzzy_features(row):
q1 = row['question1']
q2 = row['question2']
fuzzy_features = [0.0]*4
fuzzy_features[0] = fuzz.QRatio(q1,q2)
fuzzy_features[1] = fuzz.partial_ratio(q1,q2)
fuzzy_features[2] = fuzz.token_sort_ratio(q1,q2)
fuzzy_features[3] = fuzz.token_set_ratio(q1,q2)
return fuzzy_features
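For intuition, the helper can be called directly on a hand-made row (scores are 0-100 similarity percentages; the pair below is illustrative):

print(fetch_fuzzy_features({'question1': 'how do i learn python',
                            'question2': 'how can i learn python'}))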
train_fuzz = train_preprocessed.apply(fetch_fuzzy_features, axis=1)
train_preprocessed['fuzz_ratio'] = list(map(lambda x: x[0], train_fuzz))
train_preprocessed['fuzz_partial'] = list(map(lambda x: x[1], train_fuzz))
train_preprocessed['token_sort_ratio'] = list(map(lambda x: x[2], train_fuzz))
train_preprocessed['token_set_ratio'] = list(map(lambda x: x[3], train_fuzz))
train_preprocessed.head(5)
   id  qid1  qid2                                          question1  \
0   0     1     2  ['what', 'be', 'the', 'step', 'by', 'step', 'g...

                                           question2  is_duplicate  len_q1
0  ['what', 'be', 'the', 'step', 'by', 'step', 'g...            0     108
1  ['what', 'would', 'happen', 'if', 'the', 'indi...            0      71
2  ['how', 'can', 'internet', 'speed', 'be', 'inc...            0     113
3  ['find', 'the', 'remainder', 'when', ' ', '232...            0      82
4  ['which', 'fish', 'would', 'survive', 'in', 's...            0     114

   token_sort_ratio  token_set_ratio
0                93              100
1                60               81
2                63               73
3                27               33
4                46               67

[5 rows x 20 columns]
def token_features(row):
    q1 = row['question1']
    q2 = row['question2']
    q1_token = q1.split()
    q2_token = q2.split()
    STOP_WORDS = set(stopwords.words('english'))
    STEP = 0.0001  # guard against division by zero
    new_features = [0.0]*8
    if len(q1_token) == 0 or len(q2_token) == 0:
        return new_features
    # Reconstructed helpers (the original cell referenced these names
    # without defining them): shared tokens, shared non-stopwords, and
    # shared distinct words between the two questions
    q1_non_stop = set(w for w in q1_token if w not in STOP_WORDS)
    q2_non_stop = set(w for w in q2_token if w not in STOP_WORDS)
    q1_word = set(q1_token)
    q2_word = set(q2_token)
    common_tokens = len(set(q1_token) & set(q2_token))
    common_non_stop = len(q1_non_stop & q2_non_stop)
    common_word = len(q1_word & q2_word)
    new_features[0] = common_tokens/(min(len(q1_token), len(q2_token))+STEP)
    new_features[1] = common_tokens/(max(len(q1_token), len(q2_token))+STEP)
    new_features[2] = common_non_stop/(min(len(q1_non_stop), len(q2_non_stop))+STEP)
    new_features[3] = common_non_stop/(max(len(q1_non_stop), len(q2_non_stop))+STEP)
    new_features[4] = common_word/(min(len(q1_word), len(q2_word))+STEP)
    new_features[5] = common_word/(max(len(q1_word), len(q2_word))+STEP)
    new_features[6] = q1_token[0] == q2_token[0]    # first tokens match
    new_features[7] = q1_token[-1] == q2_token[-1]  # last tokens match
    return new_features
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')
STOP_WORDS = set(stopwords.words('english'))

# Apply token_features row-wise and expand the 8 values into a DataFrame
# (the original cell used `token_features_df` without showing its
# construction; this is the natural reconstruction)
token_features_df = pd.DataFrame(
    train_preprocessed.apply(token_features, axis=1).tolist(),
    index=train_preprocessed.index,
)

# Assign columns
train_preprocessed[['cts', 'ctl', 'css', 'csl', 'cws', 'cwl', 'fws', 'lws']] = token_features_df
print(train_preprocessed.head())
   id  qid1  qid2                                          question1  \
0   0     1     2  ['what', 'be', 'the', 'step', 'by', 'step', 'g...

                                           question2  is_duplicate  len_q1
0  ['what', 'be', 'the', 'step', 'by', 'step', 'g...            0     108
1  ['what', 'would', 'happen', 'if', 'the', 'indi...            0      71
2  ['how', 'can', 'internet', 'speed', 'be', 'inc...            0     113
3  ['find', 'the', 'remainder', 'when', ' ', '232...            0      82
4  ['which', 'fish', 'would', 'survive', 'in', 's...            0     114

[5 rows x 28 columns]
def distance_features(row):
    q1 = row['question1']
    q2 = row['question2']
    q1_token = q1.split()
    q2_token = q2.split()
    length_features = [0.0]*2
    if len(q1_token) == 0 or len(q2_token) == 0:
        return length_features
    # Average word count of the pair
    length_features[0] = (len(q1_token) + len(q2_token)) / 2
    # Longest common substring length, normalised by the shorter question
    # (reconstructed body; the original cell was truncated after the guard)
    lc_substrings = list(distance.lcsubstrings(q1, q2))
    longest = len(lc_substrings[0]) if lc_substrings else 0
    length_features[1] = longest / (min(len(q1), len(q2)) + 1)
    return length_features
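The notebook does not show how these two features are attached or how `x_graph` is assembled; given the 25 columns printed below (the engineered columns plus `avg_words` and `com_sub`, minus the id and text columns), a plausible reconstruction is (the name `train_distance` is our choice):

train_distance = train_preprocessed.apply(distance_features, axis=1)
train_preprocessed['avg_words'] = list(map(lambda x: x[0], train_distance))
train_preprocessed['com_sub'] = list(map(lambda x: x[1], train_distance))

# Keep only the numeric/engineered columns for modelling and visualisation
x_graph = train_preprocessed.drop(columns=['id', 'qid1', 'qid2', 'question1', 'question2'])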
print(x_graph.dtypes)
is_duplicate int64
len_q1 int64
len_q2 int64
diff_len int64
len_word_q1 int64
len_word_q2 int64
common_word int64
chr_cnt_q1 int64
chr_cnt_q2 int64
total_words int64
word_share float64
fuzz_ratio int64
fuzz_partial int64
token_sort_ratio int64
token_set_ratio int64
cts int64
ctl int64
css int64
csl int64
cws int64
cwl int64
fws object
lws object
avg_words float64
com_sub float64
dtype: object
Visualisation
print(x_graph.info())
print(x_graph.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 25 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 is_duplicate 3000 non-null int64
1 len_q1 3000 non-null int64
2 len_q2 3000 non-null int64
3 diff_len 3000 non-null int64
4 len_word_q1 3000 non-null int64
5 len_word_q2 3000 non-null int64
6 common_word 3000 non-null int64
7 chr_cnt_q1 3000 non-null int64
8 chr_cnt_q2 3000 non-null int64
9 total_words 3000 non-null int64
10 word_share 3000 non-null float64
11 fuzz_ratio 3000 non-null int64
12 fuzz_partial 3000 non-null int64
13 token_sort_ratio 3000 non-null int64
14 token_set_ratio 3000 non-null int64
15 cts 3000 non-null int64
16 ctl 3000 non-null int64
17 css 3000 non-null int64
18 csl 3000 non-null int64
19 cws 3000 non-null int64
20 cwl 3000 non-null int64
21 fws 3000 non-null object
22 lws 3000 non-null object
23 avg_words 3000 non-null float64
24 com_sub 3000 non-null float64
dtypes: float64(3), int64(20), object(2)
memory usage: 586.1+ KB
None
is_duplicate len_q1 len_q2 diff_len len_word_q1 len_word_q2 \
0 0 108 93 15 14 12
1 0 71 123 52 8 13
2 0 113 85 28 14 10
3 0 82 83 1 11 11
4 0 114 60 54 13 7
[5 rows x 25 columns]
# The `fws` and `lws` flags are stored as Python bools (hence object dtype);
# encode them as integers before modelling/visualisation
x_graph['fws_int'] = x_graph['fws'].astype(int)
x_graph['lws_int'] = x_graph['lws'].astype(int)
# Drop the original `fws` and `lws` columns after feature extraction
x_graph = x_graph.drop(columns=['fws', 'lws'])
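The `tsne_df` frame used in the next cell is never constructed in the visible cells; a minimal sketch, assuming a 2-D scikit-learn t-SNE embedding of the engineered features (the column names `tsne_1`/`tsne_2` are our choice):

from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=42)
tsne_embedding = tsne.fit_transform(x_graph.drop(columns=['is_duplicate']).values)
tsne_df = pd.DataFrame(tsne_embedding, columns=['tsne_1', 'tsne_2'])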
tsne_df['is_duplicate'] = y.reset_index(drop=True)
print(len(tsne_df))
3000
#tfv = TfidfVectorizer(min_df=3, stop_words='english', use_idf=1, smooth_idf=1, sublinear_tf=1)
#tfv.fit(list(x_combined))
#train_tfv = tfv.transform(x_combined)
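Since `tfv` and `train_tfv` are used below, the commented-out cell must have been run in some form. A minimal working version, assuming the two questions are concatenated per row (mirroring how `test_combined` is built further down; `x_txt` and `x_combined` are assumed names):

x_txt = train_preprocessed[['question1', 'question2']].values
x_combined = [' '.join(map(str, pair)) for pair in x_txt]

tfv = TfidfVectorizer(min_df=3, stop_words='english', use_idf=1, smooth_idf=1, sublinear_tf=1)
tfv.fit(list(x_combined))
train_tfv = tfv.transform(x_combined)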
train_preprocessed.columns
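`new_features_sparse` is also not defined in any visible cell. For the final model to accept the test matrix built later, it plausibly holds the hand-crafted columns that exist on both the train and test frames, converted to a sparse matrix (an assumption):

feature_cols = ['fuzz_ratio', 'fuzz_partial', 'token_sort_ratio', 'token_set_ratio',
                'len_q1', 'len_q2', 'diff_len', 'len_word_q1', 'len_word_q2',
                'common_word', 'chr_cnt_q1', 'chr_cnt_q2', 'total_words', 'word_share']
new_features_sparse = csr_matrix(train_preprocessed[feature_cols].values)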
X = hstack([train_tfv,new_features_sparse])
print(X)
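The train/validation split and the logistic-regression fit are likewise not shown; a sketch consistent with the outputs below (the 0.74 figure reads as validation accuracy; the split parameters are assumptions):

xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.2, random_state=42)
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(xtrain, ytrain)
print(accuracy_score(yvalid, model_lr.predict(xvalid)))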
0.74
pred_proba_lr = model_lr.predict_proba(xvalid)[:,1]
print(log_loss(yvalid,pred_proba_lr))
0.5029506557087635
from xgboost import XGBClassifier

model_xgb = XGBClassifier()
model_xgb.fit(xtrain,ytrain)
pred_prob_xgb = model_xgb.predict_proba(xvalid)[:,1]
print(log_loss(yvalid,pred_prob_xgb))
0.4913228531206017
print(accuracy_score(yvalid,model_xgb.predict(xvalid)))
0.7466666666666667
Making Predictions
Collecting utils
  Downloading utils-1.0.2.tar.gz (13 kB)
  Preparing metadata (setup.py) ... done
  Created wheel for utils: filename=utils-1.0.2-py2.py3-none-any.whl size=13906 sha256=433af146c2d720029e75083949f4bfbbb204e72040d96946786732ad8b172c00
  Stored in directory: /root/.cache/pip/wheels/b8/39/f5/9d0ca31dba85773ececf0a7f5469f18810e1c8a8ed9da28ca7
Successfully built utils
Installing collected packages: utils
Successfully installed utils-1.0.2
def preprocess_questions(df):
    # Example preprocessing steps; adjust as required
    # Lowercase both question columns
    df['question1'] = df['question1'].str.lower()
    df['question2'] = df['question2'].str.lower()
    return df
test_preprocessed = preprocess_questions(test_df)
test_preprocessed.head()
                                           question1  \
0  [how, does, the, surface, pro, himself, compar...
1  [should, i, have, a, hair, transplant, at, age...
2  [what, but, is, the, best, way, to, send, mone...
3                    [which, food, not, emulsifiers]
4                 [how, aberystwyth, start, reading]

                                           question2  fuzz_ratio  word_share
0  [why, did, microsoft, choose, core, m, and, no...          47    0.086957
1  [how, much, cost, does, hair, transplant, requ...          50    0.250000
2                [what, you, send, money, to, china]          60    0.250000
3                               [what, foods, fibre]          52    0.000000
4               [how, their, can, i, start, reading]          69    0.300000
print(test_preprocessed.head())
print(test_preprocessed.dtypes)
                                           question1  \
0  [how, does, the, surface, pro, himself, compar...
1  [should, i, have, a, hair, transplant, at, age...
2  [what, but, is, the, best, way, to, send, mone...
3                    [which, food, not, emulsifiers]
4                 [how, aberystwyth, start, reading]

                                           question2  fuzz_ratio  word_share
0  [why, did, microsoft, choose, core, m, and, no...          47    0.086957
1  [how, much, cost, does, hair, transplant, requ...          50    0.250000
2                [what, you, send, money, to, china]          60    0.250000
3                               [what, foods, fibre]          52    0.000000
4               [how, their, can, i, start, reading]          69    0.300000
question1 object
question2 object
fuzz_ratio int64
fuzz_partial int64
token_sort_ratio int64
token_set_ratio int64
len_q1 int64
len_q2 int64
diff_len int64
len_word_q1 int64
len_word_q2 int64
common_word int64
chr_cnt_q1 int64
chr_cnt_q2 int64
total_words int64
word_share float64
dtype: object
def fetch_fuzzy_features(row):
    # Ensure that row['question1'] and row['question2'] are valid strings
    q1 = str(row.get('question1', ''))  # default to empty string if missing
    q2 = str(row.get('question2', ''))  # default to empty string if missing
    # Same four fuzzy scores as on the training side (the rest of this
    # cell was truncated in the export)
    fuzzy_features = [0.0]*4
    fuzzy_features[0] = fuzz.QRatio(q1, q2)
    fuzzy_features[1] = fuzz.partial_ratio(q1, q2)
    fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)
    fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)
    return fuzzy_features
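The cell that applies this to the test frame is missing; mirroring the training side (the name `test_fuzz` is assumed):

test_fuzz = test_preprocessed.apply(fetch_fuzzy_features, axis=1)
test_preprocessed['fuzz_ratio'] = list(map(lambda x: x[0], test_fuzz))
test_preprocessed['fuzz_partial'] = list(map(lambda x: x[1], test_fuzz))
test_preprocessed['token_sort_ratio'] = list(map(lambda x: x[2], test_fuzz))
test_preprocessed['token_set_ratio'] = list(map(lambda x: x[3], test_fuzz))
print(test_preprocessed.head())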
                                           question1  \
0  [how, does, the, surface, pro, himself, compar...
1  [should, i, have, a, hair, transplant, at, age...
2  [what, but, is, the, best, way, to, send, mone...
3                    [which, food, not, emulsifiers]
4                 [how, aberystwyth, start, reading]

                                           question2  fuzz_ratio  word_share
0  [why, did, microsoft, choose, core, m, and, no...        33.0    0.086957
1  [how, much, cost, does, hair, transplant, requ...        44.0    0.250000
2                [what, you, send, money, to, china]       100.0    0.250000
3                               [what, foods, fibre]        44.0    0.000000
4               [how, their, can, i, start, reading]       100.0    0.300000
test_txt = test_preprocessed[['question1','question2']].values
test_txt
test_combined = [
    # Join with a space so the two questions remain separate words
    ' '.join(map(str, pair)) if isinstance(pair, (list, np.ndarray))
    else str(pair)
    for pair in test_txt
]
test_tfv = tfv.transform(list(test_combined))
test_df = basic_features(test_df)
print(test_df.columns)
# Sanity check: every column in `feature_cols` is present on the test frame
print("Missing columns:", [c for c in feature_cols if c not in test_df.columns])
Missing columns: []
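`new_features_sparse_test` is undefined in the visible cells as well; presumably it is the same conversion as on the training side, over the shared feature columns:

new_features_sparse_test = csr_matrix(test_df[feature_cols].values)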
X_test = hstack([test_tfv,new_features_sparse_test])
print(X_test)
pred_test_lr = model_lr.predict_proba(X_test)
pred_test_lr
array([[0.88149224, 0.11850776],
[0.92167437, 0.07832563],
[0.39366431, 0.60633569],
...,
[0.9511523 , 0.0488477 ],
[0.15301288, 0.84698712],
[0.89131706, 0.10868294]])
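To turn these probabilities into a Kaggle submission file (a sketch; the competition's test set carries a `test_id` column and expects `test_id`/`is_duplicate` in the output):

submission = pd.DataFrame({
    'test_id': test_df['test_id'],
    'is_duplicate': pred_test_lr[:, 1],  # probability of the duplicate class
})
submission.to_csv('submission.csv', index=False)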