Data preprocessing example programs

1. Write a Python program and include Data Cleaning steps
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Sample dataset with missing values and a duplicate row
# (only the 'Location' values survive from the original; the other columns' values are illustrative)
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None, 'Frank', 'Alice', 'Helen'],
    'Age': [25, 30, np.nan, 40, 28, 35, 25, np.nan],
    'Income': [50000, 60000, 55000, np.nan, 52000, np.nan, 50000, 58000],
    'Gender': ['F', 'M', 'M', None, 'F', 'M', 'F', 'F'],
    'Location': ['New York', 'San Francisco', 'Chicago', None, 'New York', 'Chicago', 'New York', 'Chicago']
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
# Impute missing values: median for numeric columns, mode/'Unknown' for categorical ones
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Income'] = df['Income'].fillna(df['Income'].median())
df['Name'] = df['Name'].fillna('Unknown')
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Location'] = df['Location'].fillna('Unknown')
df = df.drop_duplicates()
# Remove rows where numerical features have Z-scores > 3 (extreme outliers)
z_scores = np.abs(stats.zscore(df[['Age', 'Income']]))
df = df[(z_scores < 3).all(axis=1)]
# Drop the identifier column, then encode categoricals and scale numeric features
df = df.drop(columns=['Name'])
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])
df['Location'] = label_encoder.fit_transform(df['Location'])
scaler = StandardScaler()
df[['Age', 'Income']] = scaler.fit_transform(df[['Age', 'Income']])
print("\nCleaned DataFrame:")
print(df)
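As a quick sanity check (a minimal sketch using the df produced above), you can confirm that no missing values or duplicate rows survive the cleaning steps:

# Both counts should print 0 after cleaning
print(df.isnull().sum().sum())   # remaining missing values
print(df.duplicated().sum())     # remaining duplicate rows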
2. Write a Python program and include Data Integration steps and Data Cleaning steps
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Sample Datasets sharing an 'ID' key (values are illustrative; the originals were not preserved)
data1 = {
    'ID': [1, 2, 3, 4, 5],
    'Age': [25, 30, np.nan, 40, 28],
    'Gender': ['F', 'M', 'M', None, 'F']
}
data2 = {
    'ID': [1, 2, 3, 4, 5],
    'Income': [50000, 60000, np.nan, 58000, 52000],
    'Location': ['New York', 'Chicago', None, 'San Francisco', 'New York']
}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print("Dataset 1:")
print(df1)
print("\nDataset 2:")
print(df2)
print("\nIntegrated Dataset:")
print(df)
print("\nIntegrated Dataset:")
print(df10)
# Data Cleaning: impute missing values, then remove duplicate rows
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Income'] = df['Income'].fillna(df['Income'].median())
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Location'] = df['Location'].fillna('Unknown')
df = df.drop_duplicates()
# Encode categorical features as integers
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])
df['Location'] = label_encoder.fit_transform(df['Location'])
scaler = StandardScaler()
df[['Age', 'Income']] = scaler.fit_transform(df[['Age', 'Income']])
print("\nProcessed Dataset:")
print(df)
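The merge above uses an inner join, which keeps only IDs present in both datasets. A minimal sketch of the alternatives, assuming the same df1 and df2:

# 'outer' keeps all IDs from both sides, filling gaps with NaN;
# 'left' keeps every row of df1 whether or not df2 has a match
outer_df = pd.merge(df1, df2, on='ID', how='outer')
left_df = pd.merge(df1, df2, on='ID', how='left')
print(outer_df)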
3. Write a Python program and include Data Transformation steps
import pandas as pd
import numpy as np
from scipy.stats import boxcox
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder

# Sample Dataset (the 'Category' values are illustrative; the originals were not preserved)
data = {
    'Feature1': [2.5, 3.0, 2.8, 3.2, 2.7, 3.5, 2.9],
    'Feature2': [1.2, 1.5, 1.3, 1.7, 1.4, 1.8, 1.6],
    'Feature3': [0.8, 0.9, 1.0, 0.7, 1.1, 0.6, 1.2],
    'Feature4': [10, 20, 15, 10, 20, 15, 10],
    'Category': ['A', 'B', 'A', 'C', 'B', 'A', 'C'],
    'Target': [0, 1, 1, 0, 1, 0, 1]
}
df = pd.DataFrame(data)
print("Original Dataset:")
print(df)
# Step 1: Normalization
minmax_scaler = MinMaxScaler()
df['Feature2_Normalized'] = minmax_scaler.fit_transform(df[['Feature2']])
# Step 2: Standardization
std_scaler = StandardScaler()
df['Feature1_Standardized'] = std_scaler.fit_transform(df[['Feature1']])
# Step 3: Log transformation (requires strictly positive values)
df['Feature3_Log'] = np.log(df['Feature3'])
# Step 4: Box-Cox transformation (also requires strictly positive values)
df['Feature2_BoxCox'], _ = boxcox(df['Feature2'])
# Step 5: Encode the categorical feature
label_encoder = LabelEncoder()
df['Category_LabelEncoded'] = label_encoder.fit_transform(df['Category'])
onehot_encoder = OneHotEncoder(sparse_output=False)  # use sparse=False on scikit-learn < 1.2
onehot_encoded = onehot_encoder.fit_transform(df[['Category']])
onehot_encoded_df = pd.DataFrame(onehot_encoded,
                                 columns=onehot_encoder.get_feature_names_out(['Category']))
df = pd.concat([df, onehot_encoded_df], axis=1)
print("\nTransformed Dataset:")
print(df)
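Min-max normalization maps each value to (x - min) / (max - min); standardization maps it to (x - mean) / std, where StandardScaler uses the population standard deviation (ddof=0). A minimal sketch verifying both transforms against the formulas, using the df from above:

# Recompute Feature2's normalization by hand and compare with scikit-learn's result
x2 = df['Feature2']
manual_norm = (x2 - x2.min()) / (x2.max() - x2.min())
print(np.allclose(manual_norm, df['Feature2_Normalized']))   # expected: True

# Recompute Feature1's standardization with population std (ddof=0)
x1 = df['Feature1']
manual_std = (x1 - x1.mean()) / x1.std(ddof=0)
print(np.allclose(manual_std, df['Feature1_Standardized']))  # expected: True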
4. Write a Python program to preprocess the Titanic dataset
import pandas as pd
from sklearn.impute import SimpleImputer

def preprocess(df):  # 'preprocess' is an assumed name; the original wrapper was not preserved
    # Fill missing 'Embarked' values with the most frequent port
    imputer_embarked = SimpleImputer(strategy='most_frequent')
    df['Embarked'] = imputer_embarked.fit_transform(df[['Embarked']]).ravel()
    return df

# Load Titanic dataset (replace 'train.csv' with the path to your dataset)
data = preprocess(pd.read_csv('train.csv'))
print(data.head())
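The Titanic data also has missing 'Age' values; a natural companion step (a minimal sketch, not part of the original program) imputes them with the median, which is robust to outliers:

# Median imputation for 'Age', reusing the 'data' frame loaded above
imputer_age = SimpleImputer(strategy='median')
data['Age'] = imputer_age.fit_transform(data[['Age']]).ravel()
print(data['Age'].isnull().sum())  # expected: 0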