The Role of Cultural Traditions: A Predictive Study On Husband's Age and Karwa Chauth
The Role of Cultural Traditions: A Predictive Study On Husband's Age and Karwa Chauth
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the survey dataset. The path is relative, so the CSV has to be in the
# current working directory.
df = pd.read_csv('karwa_chauth_dataset_new.csv')
# Preview the first rows; this bare expression only displays something when
# evaluated interactively (e.g. as the last statement of a notebook cell).
df.head()
[5 rows x 27 columns]
df.tail()
9996 2022 29 2 2
9997 2023 37 13 3
9998 2023 32 1 2
9999 2022 31 5 3
[5 rows x 27 columns]
df.shape
(10000, 27)
df.columns
df.duplicated().sum()
df.isnull().sum()
Year 0
Wife's Age 0
Marriage Duration (Years) 0
Number of Children 0
Husband's Age 0
Educational Qualification 0
Occupation 0
Cultural Background 0
City/Region 0
Economic Status 0
Husband's Health Status 0
Lifestyle Factors 0
Participation Frequency 0
Perceived Longevity Factors 0
Age Group 0
Marital Status 0
Traditions Observed 0
Cultural Activities 0
Gifts Exchanged 0
Health Impact 0
Social Media Trends 0
Public Celebrations 0
Food Prepared 0
Common Myths 0
Economic Impact 0
Emotional Impact 0
Longevity Influence of Karwa Chauth 0
dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 27 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Year 10000 non-null int64
1 Wife's Age 10000 non-null int64
2 Marriage Duration (Years) 10000 non-null int64
3 Number of Children 10000 non-null int64
4 Husband's Age 10000 non-null int64
5 Educational Qualification 10000 non-null object
6 Occupation 10000 non-null object
7 Cultural Background 10000 non-null object
8 City/Region 10000 non-null object
9 Economic Status 10000 non-null object
10 Husband's Health Status 10000 non-null object
11 Lifestyle Factors 10000 non-null object
12 Participation Frequency 10000 non-null object
13 Perceived Longevity Factors 10000 non-null object
14 Age Group 10000 non-null object
15 Marital Status 10000 non-null object
16 Traditions Observed 10000 non-null object
17 Cultural Activities 10000 non-null object
18 Gifts Exchanged 10000 non-null object
19 Health Impact 10000 non-null object
20 Social Media Trends 10000 non-null object
21 Public Celebrations 10000 non-null object
22 Food Prepared 10000 non-null object
23 Common Myths 10000 non-null object
24 Economic Impact 10000 non-null object
25 Emotional Impact 10000 non-null object
26 Longevity Influence of Karwa Chauth 10000 non-null object
dtypes: int64(5), object(22)
memory usage: 2.1+ MB
df.describe()
df.nunique()
Year 2
Wife's Age 16
Marriage Duration (Years) 20
Number of Children 5
Husband's Age 36
Educational Qualification 4
Occupation 5
Cultural Background 4
City/Region 5
Economic Status 3
Husband's Health Status 3
Lifestyle Factors 3
Participation Frequency 2
Perceived Longevity Factors 5
Age Group 3
Marital Status 2
Traditions Observed 4
Cultural Activities 5
Gifts Exchanged 5
Health Impact 3
Social Media Trends 3
Public Celebrations 3
Food Prepared 4
Common Myths 5
Economic Impact 3
Emotional Impact 5
Longevity Influence of Karwa Chauth 3
dtype: int64
# Split the column labels by dtype: object columns vs. int64/float64 columns.
object_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

print("Object type columns:")
print(object_columns)
print("\nNumerical type columns:")
print(numerical_columns)
def classify_features(df):
categorical_features = []
non_categorical_features = []
discrete_features = []
continuous_features = []
# One figure per continuous column: 20-bin histogram with a KDE curve on top.
for column in continuous:
    plt.figure(figsize=(15, 6))
    sns.histplot(df[column], bins=20, kde=True, palette='hls')
    plt.xticks(rotation=90)
    plt.show()
# One figure per continuous column: density-normalised histogram with a KDE
# curve on top.
#
# `sns.distplot` is deprecated and scheduled for removal, so this uses the
# documented `histplot` equivalent: `stat='density'` reproduces distplot's
# normalisation and `cut=3` extends the KDE past the data range as distplot did.
for i in continuous:
    plt.figure(figsize=(15, 6))
    sns.histplot(df[i], bins=20, kde=True, stat='density', kde_kws={'cut': 3})
    plt.xticks(rotation=90)
    plt.show()
# One horizontal box plot per continuous column, each in its own figure.
for column in continuous:
    plt.figure(figsize=(15, 6))
    sns.boxplot(x=column, data=df, palette='hls')
    plt.xticks(rotation=90)
    plt.show()
# Print each discrete column's name and its distinct values, followed by a
# blank separator line.
for column in discrete:
    print(column, df[column].unique(), sep='\n', end='\n\n')
Year
[2022 2023]
Number of Children
[4 3 0 1 2]
# Print each discrete column's name and the frequency of every value, followed
# by a blank separator line.
for column in discrete:
    print(column, df[column].value_counts(), sep='\n', end='\n\n')
Year
Year
2022 5013
2023 4987
Name: count, dtype: int64
Number of Children
Number of Children
4 2151
3 2026
2 2009
0 1918
1 1896
Name: count, dtype: int64
# Count plot per discrete column, with each bar's height written just above it.
for column in discrete:
    plt.figure(figsize=(15, 6))
    ax = sns.countplot(x=column, data=df, palette='hls')
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            f'{bar_height}',
            xy=(bar_center, bar_height),
            xytext=(0, 10),  # shift the label 10 points above the bar top
            textcoords='offset points',
            ha='center',
            va='center',
        )
    plt.show()
import plotly.express as px

# Interactive pie chart of the value frequencies of every discrete column.
for column in discrete:
    counts = df[column].value_counts()
    fig = px.pie(
        counts,
        values=counts.values,
        names=counts.index,
        title=f'Distribution of {column}',
    )
    fig.show()
# Print each categorical column's name and its distinct labels, followed by a
# blank separator line.
for column in categorical:
    print(column, df[column].unique(), sep='\n', end='\n\n')
Educational Qualification
["Master's" 'PhD' 'High School' "Bachelor's"]
Occupation
['Teacher' 'Engineer' 'Doctor' 'Laborer' 'Businessman']
Cultural Background
['South Indian' 'West Indian' 'North Indian' 'East Indian']
City/Region
['Kolkata' 'Chennai' 'Bangalore' 'Mumbai' 'Delhi']
Economic Status
['Low' 'High' 'Middle']
Lifestyle Factors
['Non-smoker, Non-drinker' 'Smoker, Non-drinker' 'Occasional drinker']
Participation Frequency
['Occasionally' 'Every year']
Marital Status
['Married' 'Divorced']
Traditions Observed
['Fasting, Offerings' 'Fasting, Evening Puja' 'Fasting, Prayer'
'Fasting, Rituals']
Cultural Activities
['Cultural dance' 'Local festivals' 'Traditional games'
'Family activities' 'Community prayers']
Gifts Exchanged
['Clothing' 'Sweets' 'Cash gifts' 'Jewelry' 'Fruits']
Health Impact
['Positive mood' 'Neutral mood' 'Negative mood']
Public Celebrations
['Public gatherings' 'Local fairs' 'Cultural parades']
Food Prepared
['Traditional dishes' 'Savory snacks' 'Sweets' 'Rice, Lentils']
Common Myths
['Fasting is harmful' 'Fasting improves relationships'
'Fasting leads to weight loss' 'Fasting causes stress'
'Fasting promotes health']
Economic Impact
['Low' 'High' 'Moderate']
Emotional Impact
['Mixed emotions' 'Satisfaction' 'Increased happiness' 'Frustration'
'Joyful celebrations']
# Print each categorical column's name and the frequency of every label,
# followed by a blank separator line.
for column in categorical:
    print(column, df[column].value_counts(), sep='\n', end='\n\n')
Educational Qualification
Educational Qualification
High School 2510
Bachelor's 2508
PhD 2506
Master's 2476
Name: count, dtype: int64
Occupation
Occupation
Businessman 2041
Teacher 2033
Engineer 2015
Doctor 1970
Laborer 1941
Name: count, dtype: int64
Cultural Background
Cultural Background
West Indian 2538
North Indian 2526
South Indian 2492
East Indian 2444
Name: count, dtype: int64
City/Region
City/Region
Mumbai 2050
Chennai 2021
Kolkata 1999
Delhi 1976
Bangalore 1954
Name: count, dtype: int64
Economic Status
Economic Status
Low 3391
Middle 3332
High 3277
Name: count, dtype: int64
Lifestyle Factors
Lifestyle Factors
Non-smoker, Non-drinker 3362
Smoker, Non-drinker 3357
Occasional drinker 3281
Name: count, dtype: int64
Participation Frequency
Participation Frequency
Occasionally 5027
Every year 4973
Name: count, dtype: int64
Age Group
Age Group
25-30 3427
36-40 3326
31-35 3247
Name: count, dtype: int64
Marital Status
Marital Status
Divorced 5029
Married 4971
Name: count, dtype: int64
Traditions Observed
Traditions Observed
Fasting, Rituals 2591
Fasting, Offerings 2511
Fasting, Evening Puja 2449
Fasting, Prayer 2449
Name: count, dtype: int64
Cultural Activities
Cultural Activities
Traditional games 2006
Community prayers 2006
Local festivals 2002
Family activities 1998
Cultural dance 1988
Name: count, dtype: int64
Gifts Exchanged
Gifts Exchanged
Cash gifts 2019
Clothing 2010
Sweets 2000
Jewelry 1988
Fruits 1983
Name: count, dtype: int64
Health Impact
Health Impact
Neutral mood 3356
Positive mood 3334
Negative mood 3310
Name: count, dtype: int64
Public Celebrations
Public Celebrations
Cultural parades 3343
Local fairs 3338
Public gatherings 3319
Name: count, dtype: int64
Food Prepared
Food Prepared
Savory snacks 2571
Rice, Lentils 2487
Sweets 2477
Traditional dishes 2465
Name: count, dtype: int64
Common Myths
Common Myths
Fasting causes stress 2066
Fasting improves relationships 2030
Fasting is harmful 1993
Fasting leads to weight loss 1984
Fasting promotes health 1927
Name: count, dtype: int64
Economic Impact
Economic Impact
Moderate 3350
High 3333
Low 3317
Name: count, dtype: int64
Emotional Impact
Emotional Impact
Joyful celebrations 2044
Mixed emotions 2017
Satisfaction 1997
Increased happiness 1979
Frustration 1963
Name: count, dtype: int64
# Count plot per categorical column, with each bar's height written just above
# it.
for column in categorical:
    plt.figure(figsize=(15, 6))
    ax = sns.countplot(x=column, data=df, palette='hls')
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            f'{bar_height}',
            xy=(bar_center, bar_height),
            xytext=(0, 10),  # shift the label 10 points above the bar top
            textcoords='offset points',
            ha='center',
            va='center',
        )
    plt.show()
# Interactive pie chart of the label frequencies of every categorical column.
for column in categorical:
    counts = df[column].value_counts()
    fig = px.pie(
        counts,
        values=counts.values,
        names=counts.index,
        title=f'Distribution of {column}',
    )
    fig.show()
# Convert every categorical column to pandas' `category` dtype in place.
for feature in categorical:
    df[feature] = df[feature].astype('category')

sns.set(style="whitegrid")
plt.figure(figsize=(20, 5 * rows))

# One box plot of `continuous_feature` per categorical column, laid out on a
# rows x cols subplot grid (`rows`, `cols` and `continuous_feature` are set
# outside this excerpt).
for position, feature in enumerate(categorical, start=1):
    plt.subplot(rows, cols, position)
    sns.boxplot(x=feature, y=continuous_feature, data=df)
    plt.title(f'{continuous_feature} by {feature}')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()
# One-way ANOVA of the column named by `continuous_features[2]` across the
# groups of each listed categorical column; results are keyed by column name.
#
# NOTE(review): this cell groups `data`, while the rest of the notebook works on
# `df`, and `stats` (presumably `scipy.stats`) is not imported in the visible
# code -- confirm both names are defined before this runs.
anova_results = {}
for feature in ['Educational Qualification', 'Occupation', 'Cultural Background']:
    groups = [group[continuous_features[2]] for name, group in data.groupby(feature)]
    anova_results[feature] = stats.f_oneway(*groups)
categories=participation_order,
ordered=True)
# Jittered strip plot: one point per row, husband's age against participation
# frequency.
plt.figure(figsize=(10, 6))
sns.stripplot(
    x='Participation Frequency',
    y="Husband's Age",
    data=df,
    jitter=True,
)
plt.title("Husband's Age vs Participation Frequency in Karwa Chauth")
plt.xlabel('Participation Frequency')
plt.ylabel("Husband's Age")
plt.xticks(rotation=45)
plt.show()
# Box plot of husband's age for each participation-frequency level.
plt.figure(figsize=(10, 6))
sns.boxplot(x='Participation Frequency', y="Husband's Age", data=df)
# The title literal was split across two lines (a syntax error); rejoined here.
plt.title("Distribution of Husband's Age by Participation Frequency in Karwa Chauth")
plt.xlabel('Participation Frequency')
plt.ylabel("Husband's Age")
plt.xticks(rotation=45)
plt.show()
# Violin plot of husband's age for each participation-frequency level.
plt.figure(figsize=(10, 6))
sns.violinplot(x='Participation Frequency', y="Husband's Age", data=df)
# The title literal was split across two lines (a syntax error); rejoined here.
plt.title("Distribution of Husband's Age by Participation Frequency in Karwa Chauth")
plt.xlabel('Participation Frequency')
plt.ylabel("Husband's Age")
plt.xticks(rotation=45)
plt.show()
# Point plot of the mean husband's age per participation-frequency level, with
# standard-deviation error bars.
plt.figure(figsize=(10, 6))
# `ci='sd'` is deprecated; `errorbar='sd'` is the replacement (seaborn >= 0.12).
sns.pointplot(x='Participation Frequency', y="Husband's Age", data=df, errorbar='sd')
# The title literal was split across two lines (a syntax error); rejoined here.
plt.title("Average Husband's Age by Participation Frequency in Karwa Chauth")
plt.xlabel('Participation Frequency')
plt.ylabel("Husband's Age")
plt.xticks(rotation=45)
plt.show()
# Bar plot of the mean husband's age per participation-frequency level, with
# standard-deviation error bars.
plt.figure(figsize=(10, 6))
# `ci='sd'` is deprecated; `errorbar='sd'` is the replacement (seaborn >= 0.12).
sns.barplot(x='Participation Frequency', y="Husband's Age", data=df, errorbar='sd')
# The title literal was split across two lines (a syntax error); rejoined here.
plt.title("Mean Husband's Age by Participation Frequency in Karwa Chauth")
plt.xlabel('Participation Frequency')
plt.ylabel("Husband's Age")
plt.xticks(rotation=45)
plt.show()
# Swarm plot of husband's age for each participation-frequency level.
# NOTE(review): the frame holds 10,000 rows; a swarm plot of that size is slow
# to lay out and may not be able to place every point -- consider plotting a
# sample if this cell becomes a bottleneck.
plt.figure(figsize=(10, 6))
sns.swarmplot(x='Participation Frequency', y="Husband's Age", data=df)
# The title literal was split across two lines (a syntax error); rejoined here.
plt.title("Husband's Age Distribution by Participation Frequency in Karwa Chauth")
plt.xlabel('Participation Frequency')
plt.ylabel("Husband's Age")
plt.xticks(rotation=45)
plt.show()
# Figure-level box plot of husband's age per participation-frequency level.
# `catplot` creates its own figure, so no `plt.figure` call precedes it.
sns.catplot(
    x='Participation Frequency',
    y="Husband's Age",
    data=df,
    kind='box',
    height=6,
    aspect=2,
)
# The title literal was split across two lines (a syntax error); rejoined here.
plt.title("Box Plot of Husband's Age by Participation Frequency in Karwa Chauth")
plt.xlabel('Participation Frequency')
plt.ylabel("Husband's Age")
plt.xticks(rotation=45)
plt.show()
# Box plot of husband's age per participation-frequency level, split by
# City/Region via `hue`.
plt.figure(figsize=(10, 6))
sns.boxplot(x='Participation Frequency', y="Husband's Age", hue='City/Region', data=df)
# The title literal was split across two lines (a syntax error); rejoined here.
plt.title("Husband's Age by Participation Frequency in Karwa Chauth (by Region)")
plt.xlabel('Participation Frequency')
plt.ylabel("Husband's Age")
plt.xticks(rotation=45)
# Anchor the legend at the axes' upper-right corner.
plt.legend(title='Region', bbox_to_anchor=(1, 1))
plt.show()
# Bar plot of the mean husband's age per participation-frequency level, split
# by City/Region via `hue`, with standard-deviation error bars.
plt.figure(figsize=(10, 6))
# `ci='sd'` is deprecated; `errorbar='sd'` is the replacement (seaborn >= 0.12).
sns.barplot(
    x='Participation Frequency',
    y="Husband's Age",
    hue='City/Region',
    data=df,
    errorbar='sd',
)
# The title literal was split across two lines (a syntax error); rejoined here.
plt.title("Mean Husband's Age by Participation Frequency in Karwa Chauth (by Region)")
plt.xlabel('Participation Frequency')
plt.ylabel("Husband's Age")
plt.xticks(rotation=45)
# Anchor the legend at the axes' upper-right corner.
plt.legend(title='Region', bbox_to_anchor=(1, 1))
plt.show()
# Mean husband's age for each participation-frequency level.
pivot_1 = df.pivot_table(
    index='Participation Frequency',
    values="Husband's Age",
    aggfunc='mean',
)
print(pivot_1)
Husband's Age
Participation Frequency
Occasionally 42.440024
Every year 42.406395
count
Husband's Age
Participation Frequency
Occasionally 5027
Every year 4973
Never 0
Rarely 0
Sometimes 0
Often 0
Always 0
# Feature matrix and target; `features`, `numerical_features` and
# `categorical_features` are defined outside this excerpt.
X = df[features]
y = df['Husband_Age_Category']

# Standardise the numeric columns and one-hot encode the categorical ones
# (first level dropped, categories unseen at fit time ignored).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        (
            'cat',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            categorical_features,
        ),
    ]
)

# Preprocessing followed by a random-forest classifier with a fixed seed.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
StandardScaler(),
["Wife's Age",
'Marriage Duration
(Years)',
'Number of
Children']),
('cat',
OneHotEncoder(drop='first',
handle_unknown='ignore'),
['Cultural
Background',
'Lifestyle
Factors',
'Economic Status',
'Participation
Frequency',
'Traditions
Observed',
'Health
Impact'])])),
('classifier',
RandomForestClassifier(random_state=42))])
# Predict on the held-out split, then print the confusion matrix followed by
# the per-class precision/recall/F1 report.
y_pred = pipeline.predict(X_test)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)
[[762 348]
[597 293]]
precision recall f1-score support
# Same preprocessing as the random-forest pipeline, followed by a
# logistic-regression classifier (iteration cap 1000, fixed seed).
lr_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
    ]
)
# Assemble a comparison table with one column per metric.
# NOTE(review): `models`, `precision`, `recall` and `f1_score` are defined
# outside this excerpt; presumably they are equal-length sequences with one
# entry per model -- confirm. If `f1_score` is also imported from
# sklearn.metrics, this name shadows that function.
metrics_df = pd.DataFrame({
'Model': models,
'Precision': precision,
'Recall': recall,
'F1-Score': f1_score
})
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
# Reshape to long form: one row per (Model, Metric) pair with its Score.
metrics_df_melted = metrics_df.melt(id_vars='Model',
var_name='Metric', value_name='Score')
# NOTE(review): nothing is drawn between plt.figure() and plt.show() in the
# visible code, so this displays an empty figure; the plotting call that
# consumes `metrics_df_melted` appears to be missing -- check the original
# notebook.
plt.show()
Thanks !!!