Analysis On Weight Capacity


# This Python 3 environment comes with many helpful analytics libraries installed

# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python


# For example, here are several helpful packages to load

import numpy as np # linear algebra


import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory


# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/student-bag-price-prediction-dataset/Noisy_Student_Bag_Price_Prediction_Dataset.csv
/kaggle/input/playground-series-s5e2/sample_submission.csv
/kaggle/input/playground-series-s5e2/train.csv
/kaggle/input/playground-series-s5e2/test.csv
/kaggle/input/playground-series-s5e2/training_extra.csv

import pandas as pd, numpy as np

train = pd.read_csv("/kaggle/input/playground-series-s5e2/train.csv")
print("Train shape",train.shape)
train_extra = pd.read_csv("/kaggle/input/playground-series-s5e2/training_extra.csv")
print("Extra Train shape",train_extra.shape)
train = pd.concat([train,train_extra],axis=0,ignore_index=True)
print("Combined Train shape",train.shape)

Train shape (300000, 11)


Extra Train shape (3694318, 11)
Combined Train shape (3994318, 11)

train.head(10)

   id         Brand   Material    Size  Compartments Laptop Compartment Waterproof      Style  Color  Weight Capacity (kg)      Price
0   0      Jansport    Leather  Medium           7.0                Yes         No       Tote  Black             11.611723  112.15875
1   1      Jansport     Canvas   Small          10.0                Yes        Yes  Messenger  Green             27.078537   68.88056
2   2  Under Armour    Leather   Small           2.0                Yes         No  Messenger    Red             16.643760   39.17320
3   3          Nike      Nylon   Small           8.0                Yes         No  Messenger  Green             12.937220   80.60793
4   4        Adidas     Canvas  Medium           1.0                Yes        Yes  Messenger  Green             17.749338   86.02312
5   5          Nike     Canvas  Medium          10.0                 No        Yes        NaN  Black              7.241812   20.01553
6   6          Nike        NaN   Large           3.0                 No         No   Backpack  Green              6.828123   84.80500
7   7          Puma     Canvas   Small           1.0                Yes        Yes   Backpack   Blue             21.488864   27.15815
8   8  Under Armour  Polyester  Medium           8.0                Yes         No       Tote   Gray             10.207780   25.98652
9   9  Under Armour      Nylon  Medium           2.0                Yes        Yes  Messenger   Pink             15.895100   38.48741

# Inspect the distinct Weight Capacity (kg) values in the combined train DataFrame


unique_values = train['Weight Capacity (kg)'].unique()
print("Unique Weight Capacity values:", unique_values)

Unique Weight Capacity values: [11.61172281 27.07853658 16.64375995 ... 12.79080004 22.95972519
16.64173875]

# Count how often each Weight Capacity (kg) value occurs in the combined train DataFrame


unique_values = train['Weight Capacity (kg)'].value_counts()
print("Unique Weight Capacity values:", unique_values)
Unique Weight Capacity values: Weight Capacity (kg)
5.000000 58087
30.000000 2588
11.898250 1571
14.908437 1559
22.898382 1417
...
20.923873 1
28.897699 1
20.485038 1
14.355831 1
21.643489 1
Name: count, Length: 1920345, dtype: int64

import matplotlib.pyplot as plt


import seaborn as sns

plt.figure(figsize=(10, 6))
sns.histplot(train['Weight Capacity (kg)'], bins=30, kde=False)
plt.title("Distribution of Weight Capacity")
plt.xlabel("Weight Capacity")
plt.ylabel("Count")
plt.show()

/usr/local/lib/python3.10/dist-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
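
The warning is harmless here, but it can be silenced by converting any infinite values to NaN before plotting, as the message suggests. A minimal sketch, assuming the column might contain inf values (which may not actually be the case for this dataset):

# Replace any +/-inf in the plotted column with NaN so pandas/seaborn no longer
# need the deprecated use_inf_as_na option
train['Weight Capacity (kg)'] = train['Weight Capacity (kg)'].replace([np.inf, -np.inf], np.nan)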

train_mean = train.Price.mean()
train['pred'] = train_mean
s = np.sqrt(np.mean( (train.Price-train.pred)**2.0 ) )
print(f"Validation RMSE using Train Mean = {s}")

Validation RMSE using Train Mean = 38.93867923358143
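
The same baseline RMSE can also be computed with scikit-learn's metric helper; a small sketch, assuming scikit-learn is available (it is in the Kaggle image):

from sklearn.metrics import mean_squared_error

# RMSE of the constant (train-mean) prediction, equivalent to the manual formula above
rmse_baseline = np.sqrt(mean_squared_error(train.Price, train.pred))
print(f"Validation RMSE using Train Mean = {rmse_baseline}")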

train.head()

   id         Brand Material    Size  Compartments Laptop Compartment Waterproof      Style  Color  Weight Capacity (kg)      Price       pred
0   0      Jansport  Leather  Medium           7.0                Yes         No       Tote  Black             11.611723  112.15875  81.362175
1   1      Jansport   Canvas   Small          10.0                Yes        Yes  Messenger  Green             27.078537   68.88056  81.362175
2   2  Under Armour  Leather   Small           2.0                Yes         No  Messenger    Red             16.643760   39.17320  81.362175
3   3          Nike    Nylon   Small           8.0                Yes         No  Messenger  Green             12.937220   80.60793  81.362175
4   4        Adidas   Canvas  Medium           1.0                Yes        Yes  Messenger  Green             17.749338   86.02312  81.362175

from cuml.preprocessing import TargetEncoder


TE = TargetEncoder(n_folds=30, smooth=20, split_method='random', stat='mean')
train['pred'] = TE.fit_transform(train['Weight Capacity (kg)'],train.Price)
s = np.sqrt(np.mean( (train.Price-train.pred)**2.0 ) )
print(f"Validation RSME using Target Encode Weight Capacity = {s}")

Validation RSME using Target Encode Weight Capacity = 38.71037277159231
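
For readers without access to cuML, the core idea of the encoder can be reproduced with plain pandas. The sketch below computes a smoothed per-category mean of Price for each Weight Capacity (kg) value, blending it with the global mean. Unlike cuML's TargetEncoder it does no out-of-fold splitting, so it is more prone to target leakage and is shown only to illustrate the computation; the function name smoothed_target_encode is made up for this example.

def smoothed_target_encode(df, col, target, smooth=20):
    # Blend each category's mean target with the global mean, weighted by category count
    global_mean = df[target].mean()
    stats = df.groupby(col)[target].agg(['mean', 'count'])
    encoding = (stats['count'] * stats['mean'] + smooth * global_mean) / (stats['count'] + smooth)
    return df[col].map(encoding)

# Example usage on the combined training frame:
train['pred_pandas'] = smoothed_target_encode(train, 'Weight Capacity (kg)', 'Price', smooth=20)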

from cuml.preprocessing import TargetEncoder


import numpy as np
import pandas as pd

# Variation 1: More smoothing


TE1 = TargetEncoder(n_folds=30, smooth=40, split_method='random', stat='mean')
train['pred_TE1'] = TE1.fit_transform(train['Weight Capacity (kg)'], train.Price)
rmse_TE1 = np.sqrt(np.mean((train.Price - train['pred_TE1'])**2.0))
print(f"Variation 1 (smooth=40) RMSE: {rmse_TE1}")

# Variation 2: Fewer folds and less smoothing


TE2 = TargetEncoder(n_folds=10, smooth=10, split_method='random', stat='mean')
train['pred_TE2'] = TE2.fit_transform(train['Weight Capacity (kg)'], train.Price)
rmse_TE2 = np.sqrt(np.mean((train.Price - train['pred_TE2'])**2.0))
print(f"Variation 2 (n_folds=10, smooth=10) RMSE: {rmse_TE2}")

Variation 1 (smooth=40) RMSE: 38.719630133491286


Variation 2 (n_folds=10, smooth=10) RMSE: 38.729537426961386

from sklearn.model_selection import KFold


import numpy as np

# Create a KFold instance


kf = KFold(n_splits=30, shuffle=True, random_state=42)

# Initialize an array to hold fold IDs for each sample


fold_ids = np.empty(len(train), dtype=int)

# Assign fold IDs


for fold, (_, val_idx) in enumerate(kf.split(train)):
fold_ids[val_idx] = fold

# Now use your custom fold_ids with the target encoder


TE3 = TargetEncoder(n_folds=30, smooth=20, split_method='customize', stat='mean')
train['pred_TE3'] = TE3.fit_transform(train['Weight Capacity (kg)'], train.Price, fold_ids=fold_ids)

rmse_TE3 = np.sqrt(np.mean((train.Price - train['pred_TE3'])**2.0))


print(f"Variation 2 (n_folds=30, smooth=20) RMSE: {rmse_TE3}")

Variation 2 (n_folds=30, smooth=20) RMSE: 38.71077699918216
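
As a quick sanity check on the custom split, the fold ids assigned above should cover every row with roughly equal fold sizes; a short sketch:

# Each of the 30 folds should contain about len(train) / 30 rows
counts = np.bincount(fold_ids, minlength=30)
print("Rows per fold: min", counts.min(), "max", counts.max())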

# Now you can use these features in a model, e.g., a simple linear regression:
from sklearn.linear_model import LinearRegression
features = ['pred_TE1', 'pred_TE2', 'pred_TE3']
lr_model = LinearRegression()
lr_model.fit(train[features], train.Price)

# Evaluate RMSE on the training set (or better, via cross-validation)


pred_lr = lr_model.predict(train[features])
rmse_lr = np.sqrt(np.mean((train.Price - pred_lr)**2.0))
print(f"Linear Regression using multiple target encoding features RMSE: {rmse_lr}")

Linear Regression using multiple target encoding features RMSE: 38.70900752709825
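
As the comment above notes, RMSE measured on the same rows the blend was fitted on is optimistic. A cross-validated estimate is a more honest check; a sketch that reuses the already-computed pred_TE* columns (so any leakage in those encodings carries over):

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated RMSE of the linear blend of the three encodings
cv_rmse = -cross_val_score(
    LinearRegression(),
    train[features],
    train.Price,
    scoring='neg_root_mean_squared_error',
    cv=5,
).mean()
print(f"5-fold CV RMSE of the blend: {cv_rmse}")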

test = pd.read_csv("/kaggle/input/playground-series-s5e2/test.csv")

# Step 1: Apply Target Encoding to Test Set


test['pred_TE1'] = TE1.transform(test['Weight Capacity (kg)'])
test['pred_TE2'] = TE2.transform(test['Weight Capacity (kg)'])
test['pred_TE3'] = TE3.transform(test['Weight Capacity (kg)'])

# Step 2: Predict on Test Set using Linear Regression Model


test_features = ['pred_TE1', 'pred_TE2', 'pred_TE3']
test['Price'] = lr_model.predict(test[test_features])

# Step 3: Prepare Submission File


sub = pd.read_csv("/kaggle/input/playground-series-s5e2/sample_submission.csv") # Load sample submission
sub['Price'] = test['Price']
sub.to_csv("submission_lr_target_encoding.csv", index=False) # Save as CSV

# Step 4: Verify Submission File


print(sub.head()) # Check the first few rows to ensure everything looks correct
       id      Price
0  300000  82.787024
1  300001  81.064362
2  300002  90.514901
3  300003  78.539448
4  300004  81.464757
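
Before uploading, a couple of quick checks (a sketch) help catch misaligned rows or missing predictions:

# The submission should have one row per test id and no missing prices
assert len(sub) == len(test), "Row count mismatch between submission and test"
assert sub['Price'].isna().sum() == 0, "Submission contains missing predictions"
print("Submission shape", sub.shape)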

# sub = pd.read_csv("/kaggle/input/playground-series-s5e2/sample_submission.csv")
# print('Submission shape', sub.shape)
# test = pd.read_csv("/kaggle/input/playground-series-s5e2/test.csv")
# sub['Price'] = TE.transform(test['Weight Capacity (kg)'])
# sub.to_csv("submission_TE_weight_capacity.csv",index=False)
# sub.head()
