0% found this document useful (0 votes)
4 views16 pages

Edp 3

The document covers various data manipulation techniques using Python, including handling NaN values, random sampling, discretization, outlier detection, and data transformation. It demonstrates how to create datasets, perform basic exploratory data analysis, and visualize results using libraries like pandas and matplotlib. Additionally, it discusses the benefits and challenges of data transformation, providing practical examples throughout.

Uploaded by

ys304123
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
4 views16 pages

Edp 3

The document covers various data manipulation techniques using Python, including handling NaN values, random sampling, discretization, outlier detection, and data transformation. It demonstrates how to create datasets, perform basic exploratory data analysis, and visualize results using libraries like pandas and matplotlib. Additionally, it discusses the benefits and challenges of data transformation, providing practical examples throughout.

Uploaded by

ys304123
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 16

MATHEMATICAL OPERATIONS

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 1. Creating a dataset with NaN values
# Every column carries at least one NaN so each operation below has gaps to handle.
data = {
    'A': [10, 20, np.nan, 40, 50],
    'B': [5, np.nan, 15, 20, 25],
    'C': [np.nan, 30, 35, np.nan, 45],
}
df = pd.DataFrame(data)

# 2. Displaying the initial dataset
print("Initial Dataset:")
print(df)

# 3. Basic EDA operations
print("\nSummary Statistics:")
print(df.describe())  # Shows basic statistics, ignoring NaN values by default
print("\nChecking for NaN values in the dataset:")
# isna() returns a boolean DataFrame that is True wherever a value is NaN.
print(df.isna())
print("\nTotal number of NaN values in each column:")
print(df.isna().sum())  # Shows the number of NaN values in each column

# 4. Mathematical Operations on NaN
# Element-wise addition: a NaN in either operand propagates into the result.
df['A_plus_B'] = df['A'] + df['B']
print("\nResult of Adding Columns A and B:")
print(df[['A', 'B', 'A_plus_B']])

# fillna(0) returns a new DataFrame; df itself keeps its NaN values.
df_filled = df.fillna(0)
print("\nDataset with NaN values replaced by 0:")
print(df_filled)

# Series.mean() skips NaN by default, so only the present values are averaged.
mean_a = df['A'].mean()
print("\nMean of Column A (ignoring NaN):", mean_a)

# Element-wise multiplication: NaN propagates exactly as it does for addition.
df['A_times_B'] = df['A'] * df['B']
print("\nResult of Multiplying Columns A and B:")
print(df[['A', 'B', 'A_times_B']])
# 5. Bar chart of the NaN count per column
nan_counts = df.isna().sum()
plt.figure(figsize=(10, 6))
nan_counts.plot(kind='bar', color='red', alpha=0.7)
plt.title('NaN Values Distribution by Column')
plt.xlabel('Columns')
plt.ylabel('Number of NaNs')
plt.show()

# 6. Mean imputation: each NaN is replaced by the mean of its own column
column_means = df.mean()
df_filled_mean = df.fillna(column_means)
print("\nDataset with NaN values filled with column mean:", df_filled_mean, sep="\n")

# 7. Final summary: the mean-imputed frame is the cleaned dataset
print("\nFinal Cleaned Dataset (after filling NaNs with mean):", df_filled_mean, sep="\n")

Filling and random sampling


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Step 1: Create a sample dataset with NaN values
data = {
    'A': [10, 20, np.nan, 40, 50],
    'B': [5, np.nan, 15, 20, 25],
    'C': [np.nan, 30, 35, np.nan, 45],
}
# Create DataFrame
df = pd.DataFrame(data)
# Display initial dataset
print("Initial Dataset with NaN values:")
print(df)

# Step 2: Filling missing values with different strategies
# Each fillna() call returns a new DataFrame; df keeps its NaN values.

# Fill missing values with the column mean
df_filled_mean = df.fillna(df.mean())
print("\nDataset with NaN values filled with column mean:")
print(df_filled_mean)

# Fill missing values with the column median
df_filled_median = df.fillna(df.median())
print("\nDataset with NaN values filled with column median:")
print(df_filled_median)

# Fill missing values with the column mode.
# DataFrame.mode() returns a DataFrame with one row per tied mode, sorted
# ascending, so .iloc[0] picks the first mode of every column. All values in
# this sample are unique, which makes that first mode the column minimum.
df_filled_mode = df.fillna(df.mode().iloc[0])
print("\nDataset with NaN values filled with column mode:")
print(df_filled_mode)
# Step 3: Bar chart showing how many values are missing in each column
missing_per_column = df.isna().sum()
plt.figure(figsize=(10, 6))
missing_per_column.plot(kind='bar', color='red', alpha=0.7)
plt.title('NaN Values Distribution by Column')
plt.xlabel('Columns')
plt.ylabel('Number of NaNs')
plt.show()
# Step 4: Random Sampling
# One fixed seed is shared by every draw so the output is reproducible.
seed = 42

# 4.1 Three rows, each picked at most once
sampled_df = df.sample(n=3, random_state=seed)
print("\nRandomly sampled 3 rows (without replacement):", sampled_df, sep="\n")

# 4.2 Three rows where the same row may be drawn more than once
sampled_with_replacement = df.sample(n=3, replace=True, random_state=seed)
print("\nRandomly sampled 3 rows (with replacement):", sampled_with_replacement, sep="\n")

# 4.3 Half of the dataset, again allowing repeated rows
sampled_fraction = df.sample(frac=0.5, replace=True, random_state=seed)
print("\nRandomly sampled 50% of the dataset:", sampled_fraction, sep="\n")
# Step 5: Visualizing the sampled data (just to show the process)
# Overlay the histogram of the sampled rows on the histogram of the full
# column; alpha < 1 keeps both visible where they overlap.
plt.figure(figsize=(10, 6))
df['A'].plot(kind='hist', bins=5, alpha=0.7, color='blue',
             label='Original A', legend=True)
# The label must stay on a single line: a string literal split across lines
# is a syntax error.
sampled_df['A'].plot(kind='hist', bins=5, alpha=0.7, color='orange',
                     label='Sampled A', legend=True)
plt.title('Distribution of A: Original vs. Sampled')
plt.xlabel('A')
plt.ylabel('Frequency')
plt.legend()
plt.show()

Discretization and binning

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Step 1: Create a sample dataset with continuous values
data = {
    'Age': [23, 45, 18, 35, 60, 27, 50, 30, 22, 40],
    'Income': [30000, 50000, 15000, 35000, 80000, 28000, 52000, 33000, 24000,
               47000],
}

# Create DataFrame
df = pd.DataFrame(data)

# Display initial dataset
print("Initial Dataset:")
print(df)

# Step 2: Binning using pandas' cut() function (Discretization)

# Bin the 'Age' column into 3 equal-width bins
age_bins = pd.cut(df['Age'], bins=3)
print("\nBinned 'Age' into 3 equal-width bins:")
print(age_bins)

# Bin the 'Income' column into specific intervals (custom bin edges).
# With explicit edges pd.cut builds right-closed intervals, so a value equal
# to the first edge (the minimum income, 15000) would fall outside every bin
# and become NaN. include_lowest=True closes the first interval on the left.
income_bins = pd.cut(
    df['Income'],
    bins=[15000, 30000, 50000, 70000, 100000],
    include_lowest=True,
)
print("\nBinned 'Income' into custom bins:")
print(income_bins)

# Step 3: Binning using pandas' qcut() function (Quantile-based Discretization)

# Quantile-based binning of 'Age' into 4 equal-size bins
age_qbins = pd.qcut(df['Age'], q=4)
print("\nQuantile-based binning of 'Age' into 4 bins (equal frequency):")
print(age_qbins)

# Quantile-based binning of 'Income' into 3 quantiles
income_qbins = pd.qcut(df['Income'], q=3)
print("\nQuantile-based binning of 'Income' into 3 quantiles:")
print(income_qbins)

# Step 4: Visualizing the binning results

# Histogram of 'Age' split into three equal-width bins
plt.figure(figsize=(12, 6))
plt.hist(df['Age'], bins=3, color='skyblue', edgecolor='black', alpha=0.7)
plt.title('Binned Age Distribution (3 Equal-Width Bins)')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

# Histogram of 'Income' over the custom bin edges.
# NOTE: plt.hist bins are left-closed while pd.cut defaults to right-closed,
# so a value sitting exactly on an edge can be counted in the neighbouring bin.
income_edges = [15000, 30000, 50000, 70000, 100000]
plt.figure(figsize=(12, 6))
plt.hist(df['Income'], bins=income_edges, color='salmon', edgecolor='black',
         alpha=0.7)
plt.title('Binned Income Distribution (Custom Bins)')
plt.xlabel('Income')
plt.ylabel('Frequency')
plt.show()
# Step 5: Attach every binning result to the DataFrame as its own column
binned_columns = {
    'Age_Bins': age_bins,
    'Income_Bins': income_bins,
    'Age_Quantile_Bins': age_qbins,
    'Income_Quantile_Bins': income_qbins,
}
for column_name, binned_values in binned_columns.items():
    df[column_name] = binned_values

# Show the original columns side by side with their bins
print("\nDataset with Binned Columns:", df, sep="\n")

Outlier detection and filtering


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# Step 1: Create a sample dataset with some outliers
data = {
    # Age has outliers (100, 110, 120)
    'Age': [23, 45, 18, 35, 60, 27, 50, 30, 22, 40, 100, 110, 120],
    # Income has outliers (200000, 150000, 250000)
    'Income': [30000, 50000, 15000, 35000, 80000, 28000, 52000, 33000, 24000,
               47000, 200000, 150000, 250000],
}
# Create DataFrame
df = pd.DataFrame(data)

# Display the initial dataset
print("Initial Dataset:")
print(df)
# Step 2: Visualizing the data before outlier filtering
# One box plot per column, side by side in a single figure
plt.figure(figsize=(12, 6))
for position, column in enumerate(['Age', 'Income'], start=1):
    plt.subplot(1, 2, position)
    plt.boxplot(df[column])
    plt.title(f'{column} Distribution (Before Outlier Filtering)')
plt.show()
# Step 3: Outlier Detection Using Z-score
# stats.zscore standardises each column; np.abs keeps only the distance from
# the column mean, in standard deviations.
z_scores = np.abs(stats.zscore(df[['Age', 'Income']]))
# Threshold above which a value counts as an outlier (3 is a common choice).
z_threshold = 3
# A row is an outlier when ANY of its columns exceeds the threshold, matching
# the IQR-based detection. Requiring all columns at once would miss rows that
# are extreme in only one of them.
# NOTE: on a sample this small the extreme values inflate the standard
# deviation themselves, so a threshold of 3 may flag few or no rows.
outliers_zscore = (z_scores > z_threshold).any(axis=1)
print("\nOutliers based on Z-score:")
print(df[outliers_zscore])
# Step 4: Outlier Detection Using IQR (Interquartile Range)
numeric = df[['Age', 'Income']]

# Quartiles and their spread, computed per column
Q1 = numeric.quantile(0.25)
Q3 = numeric.quantile(0.75)
IQR = Q3 - Q1

# Fences: anything outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] is an outlier
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers_iqr = (numeric < lower_fence) | (numeric > upper_fence)

# Keep the rows where at least one of the two columns is outside its fences
outliers_iqr_rows = df[outliers_iqr.any(axis=1)]
print("\nOutliers based on IQR:", outliers_iqr_rows, sep="\n")
# Step 5: Filtering Outliers
# Invert each outlier mask to obtain the rows worth keeping.
keep_by_zscore = ~outliers_zscore
keep_by_iqr = ~outliers_iqr.any(axis=1)

filtered_df_zscore = df[keep_by_zscore]
filtered_df_iqr = df[keep_by_iqr]

# Show what each method leaves behind
print("\nDataset after filtering outliers based on Z-score:", filtered_df_zscore, sep="\n")
print("\nDataset after filtering outliers based on IQR:", filtered_df_iqr, sep="\n")
# Step 6: Visualizing the data after outlier filtering
# Same side-by-side box plots as before, drawn from the Z-score-filtered rows
plt.figure(figsize=(12, 6))
for position, column in enumerate(['Age', 'Income'], start=1):
    plt.subplot(1, 2, position)
    plt.boxplot(filtered_df_zscore[column])
    plt.title(f'{column} Distribution (After Z-score Filtering)')
plt.show()

Permutation and random sampling

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Step 1: Build the sample dataset and show it
data = {
    'Age': [23, 45, 18, 35, 60, 27, 50, 30, 22, 40],
    'Income': [30000, 50000, 15000, 35000, 80000, 28000, 52000, 33000, 24000, 47000],
}
df = pd.DataFrame(data)
print("Initial Dataset:", df, sep="\n")

# One fixed seed is shared by every draw so the output is reproducible.
seed = 42

# Step 2: Random Sampling
# Five rows, each picked at most once
random_sample = df.sample(n=5, random_state=seed)
print("\nRandom Sample (5 rows without replacement):", random_sample, sep="\n")

# Five rows where the same row may be drawn more than once
random_sample_with_replacement = df.sample(n=5, replace=True, random_state=seed)
print("\nRandom Sample (5 rows with replacement):", random_sample_with_replacement, sep="\n")

# Step 3: Permutation (Shuffling the data)
# Sampling 100% of the rows without replacement is a shuffle; reset_index
# then discards the original labels so the result is numbered 0..n-1 again.
shuffled_age = df['Age'].sample(frac=1, random_state=seed)
permuted_age = shuffled_age.reset_index(drop=True)
print("\nPermuted Age Column:", permuted_age, sep="\n")

# Same shuffle applied to whole rows
shuffled_rows = df.sample(frac=1, random_state=seed)
permuted_df = shuffled_rows.reset_index(drop=True)
print("\nPermuted DataFrame (Rows shuffled):", permuted_df, sep="\n")
# Step 4: Visualizing the original and permuted data
# Side-by-side histograms of 'Age' before and after shuffling
panels = [
    (df['Age'], 'skyblue', 'Original Age Distribution'),
    (permuted_age, 'salmon', 'Permuted Age Distribution'),
]
plt.figure(figsize=(12, 6))
for slot, (values, colour, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.hist(values, bins=5, alpha=0.7, color=colour, edgecolor='black')
    plt.title(heading)
plt.tight_layout()
plt.show()
# Step 5: Summary statistics before and after permutation
# Statistics of the untouched DataFrame
original_stats = df.describe()
print("\nOriginal Summary Statistics:", original_stats, sep="\n")

# Statistics of the shuffled 'Age' column
permuted_stats = permuted_age.describe()
print("\nPermuted Age Summary Statistics:", permuted_stats, sep="\n")

# Step 6: Summary statistics of the two random samples
# Sample drawn without replacement
sampled_stats_without_replacement = random_sample.describe()
print("\nSummary Statistics for Random Sample (without replacement):",
      sampled_stats_without_replacement, sep="\n")

# Sample drawn with replacement
sampled_stats_with_replacement = random_sample_with_replacement.describe()
print("\nSummary Statistics for Random Sample (with replacement):",
      sampled_stats_with_replacement, sep="\n")

Benefits and challenges of data transformation


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import boxcox
# Step 1: Build the sample dataset, then skew one of its columns
data = {
    'Age': [23, 45, 18, 35, 60, 27, 50, 30, 22, 40],
    'Income': [30000, 50000, 15000, 35000, 80000, 28000, 52000, 33000, 24000, 47000],
    'Experience': [1, 5, 0, 3, 8, 2, 10, 6, 1, 4],
    'Height': [5.5, 6.0, 5.8, 5.4, 6.2, 5.7, 6.1, 5.9, 5.6, 5.5],
}
df = pd.DataFrame(data)

# Exponentiate the income (in units of 10000) and overwrite the column with
# the result, which is the skewed data used by the steps that follow.
income_in_ten_thousands = df['Income'] / 10000
df['Income'] = np.exp(income_in_ten_thousands)

# Show the dataset as it stands before any transformation
print("Initial Dataset:", df, sep="\n")
# Step 2: Visualize data distributions before transformations
# A 2x2 grid with one histogram per variable
panels = [
    ('Age', 'skyblue', 'Age Distribution'),
    ('Income', 'salmon', 'Income Distribution (Skewed)'),
    ('Experience', 'green', 'Experience Distribution'),
    ('Height', 'orange', 'Height Distribution'),
]
plt.figure(figsize=(12, 8))
for slot, (column, colour, heading) in enumerate(panels, start=1):
    plt.subplot(2, 2, slot)
    plt.hist(df[column], bins=5, alpha=0.7, color=colour, edgecolor='black')
    plt.title(heading)
plt.tight_layout()
plt.show()
# Step 3: Log-transform 'Income' to reduce its skewness
income_log = np.log(df['Income'])
df['Income_Log'] = income_log

# Compare the column before and after the log transform, side by side
panels = [
    ('Income', 'salmon', 'Original Income Distribution'),
    ('Income_Log', 'lightgreen', 'Log Transformed Income Distribution'),
]
plt.figure(figsize=(12, 6))
for slot, (column, colour, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.hist(df[column], bins=5, alpha=0.7, color=colour, edgecolor='black')
    plt.title(heading)
plt.tight_layout()
plt.show()
# Step 4: Standardize 'Age' and the log-transformed income
# The same scaler object is refitted for each column by fit_transform.
scaler = StandardScaler()
age_standardized = scaler.fit_transform(df[['Age']])
df['Age_Scaled'] = age_standardized
income_standardized = scaler.fit_transform(df[['Income_Log']])
df['Income_Scaled'] = income_standardized

# Step 5: Min-max scale 'Experience'
min_max_scaler = MinMaxScaler()
experience_rescaled = min_max_scaler.fit_transform(df[['Experience']])
df['Experience_Scaled'] = experience_rescaled

# Step 6: Box-Cox transform 'Height' to stabilize variance and make it normal
# Box-Cox requires positive values, hence the +1 shift on the input.
# boxcox returns the transformed values and the fitted lambda; only the
# former is kept.
height_boxcox, _ = boxcox(df['Height'] + 1)
df['Height_BoxCox'] = height_boxcox
# Visualizing the transformed data
# A 2x2 grid with one histogram per transformed column
panels = [
    ('Age_Scaled', 'blue', 'Scaled Age Distribution'),
    ('Income_Scaled', 'purple', 'Scaled Income Distribution'),
    ('Experience_Scaled', 'red', 'Scaled Experience Distribution'),
    ('Height_BoxCox', 'green', 'Box-Cox Transformed Height Distribution'),
]
plt.figure(figsize=(12, 8))
for slot, (column, colour, heading) in enumerate(panels, start=1):
    plt.subplot(2, 2, slot)
    plt.hist(df[column], bins=5, alpha=0.7, color=colour, edgecolor='black')
    plt.title(heading)
plt.tight_layout()
plt.show()
# Step 7: Handling Missing Values as a Challenge
# Introduce some NaN values to simulate missing data.
# Work on a copy so the transformed df stays intact.
df_with_na = df.copy()
# Write through a single .loc call. Chained indexing (df[col].iloc[i] = ...)
# assigns to a temporary object and can leave the DataFrame unchanged.
# index[i] keeps the original positional meaning of "row i".
df_with_na.loc[df_with_na.index[2], 'Income_Scaled'] = np.nan
df_with_na.loc[df_with_na.index[5], 'Height_BoxCox'] = np.nan

# Fill missing values with the mean of the respective columns
df_with_na_filled = df_with_na.fillna(df_with_na.mean())
print("\nDataset with Missing Values:")
print(df_with_na)
print("\nDataset after filling missing values:")
print(df_with_na_filled)

You might also like