# Edp 3
# Edp 3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 1. Creating a dataset with NaN values (every column has at least one gap)
data = {
    'A': [10, 20, np.nan, 40, 50],
    'B': [5, np.nan, 15, 20, 25],
    'C': [np.nan, 30, 35, np.nan, 45],
}
df = pd.DataFrame(data)

# 2. Displaying the initial dataset
print("Initial Dataset:")
print(df)

# 3. Basic EDA operations
print("\nSummary Statistics:")
print(df.describe())  # Shows basic statistics, ignoring NaN values by default
print("\nChecking for NaN values in the dataset:")
# isna() returns a boolean DataFrame showing True where NaN values are located.
# (This comment used to wrap onto its own line as the bare word `located`,
# which raised NameError and aborted the script at this point.)
print(df.isna())
print("\nTotal number of NaN values in each column:")
print(df.isna().sum())  # Shows the number of NaN values in each column

# 4. Mathematical Operations on NaN
# Example: Adding columns A and B -- a NaN in either operand propagates
# to the result for that row.
df['A_plus_B'] = df['A'] + df['B']
print("\nResult of Adding Columns A and B:")
print(df[['A', 'B', 'A_plus_B']])

# Example: Handling NaN using fillna()
# fillna() returns a new DataFrame; `df` itself keeps its NaN values.
df_filled = df.fillna(0)  # Replace NaN values with 0
print("\nDataset with NaN values replaced by 0:")
print(df_filled)

# Example: Calculating mean ignoring NaN
mean_a = df['A'].mean()  # Series.mean() skips NaN values by default
print("\nMean of Column A (ignoring NaN):", mean_a)

# Example: Perform operation with NaN values
# As with addition, any NaN operand yields NaN in the product.
df['A_times_B'] = df['A'] * df['B']
print("\nResult of Multiplying Columns A and B:")
print(df[['A', 'B', 'A_times_B']])
# 5. Bar chart of how many NaN values each column of `df` contains
fig, ax = plt.subplots(figsize=(10, 6))
# Draw onto the axes created above so the chart uses the 10x6 figure.
df.isna().sum().plot(kind='bar', color='red', alpha=0.7, ax=ax)
ax.set_title('NaN Values Distribution by Column')
ax.set_xlabel('Columns')
ax.set_ylabel('Number of NaNs')
plt.show()
# 6. Mean imputation: replace each NaN with the mean of its own column.
# fillna() matches the Series of column means to columns by label and
# returns a new DataFrame, leaving `df` untouched.
column_means = df.mean()
df_filled_mean = df.fillna(column_means)
print(
    "\nDataset with NaN values filled with column mean:",
    df_filled_mean,
    sep="\n",
)

# 7. Final summary: the mean-filled frame is the cleaned dataset to use
# for further operations, so it is shown once more under its final heading.
print(
    "\nFinal Cleaned Dataset (after filling NaNs with mean):",
    df_filled_mean,
    sep="\n",
)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Step 1: Create a sample dataset with continuous values
data = {
    'Age': [23, 45, 18, 35, 60, 27, 50, 30, 22, 40],
    'Income': [30000, 50000, 15000, 35000, 80000, 28000, 52000, 33000, 24000,
               47000],
}
# Create DataFrame
df = pd.DataFrame(data)

# Bin the 'Income' column into specific intervals (custom bin edges).
# pd.cut builds right-closed intervals by default, so a value equal to the
# first edge (the 15000 income here) would fall outside every bin and come
# back as NaN. include_lowest=True makes the first interval include its left
# edge so every row gets a bin.
income_bins = pd.cut(
    df['Income'],
    bins=[15000, 30000, 50000, 70000, 100000],
    include_lowest=True,
)
print("\nBinned 'Income' into custom bins:")
print(income_bins)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Step 1: Build the demo frame (ten people, with age and income)
data = {
    'Age': [23, 45, 18, 35, 60, 27, 50, 30, 22, 40],
    'Income': [30000, 50000, 15000, 35000, 80000, 28000, 52000, 33000, 24000, 47000],
}
df = pd.DataFrame(data)

# Show the untouched data first
print("Initial Dataset:", df, sep="\n")

# Step 2: Random sampling
# Five rows drawn without replacement -- each row can appear at most once.
random_sample = df.sample(n=5, random_state=42)
print("\nRandom Sample (5 rows without replacement):", random_sample, sep="\n")

# Five rows drawn with replacement -- the same row may be picked repeatedly.
random_sample_with_replacement = df.sample(n=5, replace=True, random_state=42)
print(
    "\nRandom Sample (5 rows with replacement):",
    random_sample_with_replacement,
    sep="\n",
)

# Step 3: Permutation (shuffling)
# Sampling the full fraction (frac=1) reorders all values; the index is then
# rebuilt so the shuffled result is numbered 0..n-1 again.
permuted_age = (
    df['Age']
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print("\nPermuted Age Column:", permuted_age, sep="\n")

# Same technique applied to whole rows of the DataFrame.
permuted_df = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print("\nPermuted DataFrame (Rows shuffled):", permuted_df, sep="\n")
# Step 4: Side-by-side histograms of Age before and after permutation
fig, (ax_original, ax_permuted) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: Age values in their original order
ax_original.hist(df['Age'], bins=5, alpha=0.7, color='skyblue', edgecolor='black')
ax_original.set_title('Original Age Distribution')

# Right panel: the shuffled Age values
ax_permuted.hist(permuted_age, bins=5, alpha=0.7, color='salmon', edgecolor='black')
ax_permuted.set_title('Permuted Age Distribution')

plt.tight_layout()
plt.show()
# Steps 5 and 6: compute every summary table first, then print them.
# Step 5 compares the full dataset with the permuted Age column.
original_stats = df.describe()
permuted_stats = permuted_age.describe()
# Step 6 compares the two random samples (without / with replacement).
sampled_stats_without_replacement = random_sample.describe()
sampled_stats_with_replacement = random_sample_with_replacement.describe()

# Step 5 output
print("\nOriginal Summary Statistics:", original_stats, sep="\n")
print("\nPermuted Age Summary Statistics:", permuted_stats, sep="\n")

# Step 6 output
print(
    "\nSummary Statistics for Random Sample (without replacement):",
    sampled_stats_without_replacement,
    sep="\n",
)
print(
    "\nSummary Statistics for Random Sample (with replacement):",
    sampled_stats_with_replacement,
    sep="\n",
)