0% found this document useful (0 votes)
2 views9 pages

Data Wrangling Notebook Summary

This document outlines various data wrangling techniques using Python, including library imports, sample data generation, and functions for data manipulation. It covers methods for cleaning data, handling missing values, encoding categorical variables, and summarizing data through descriptive statistics and group operations. The notebook provides practical code examples for each technique, demonstrating how to apply them to a sample dataset.

Uploaded by

imsubscribeyou
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
2 views9 pages

Data Wrangling Notebook Summary

This document outlines various data wrangling techniques using Python, including library imports, sample data generation, and functions for data manipulation. It covers methods for cleaning data, handling missing values, encoding categorical variables, and summarizing data through descriptive statistics and group operations. The notebook provides practical code examples for each technique, demonstrating how to apply them to a sample dataset.

Uploaded by

imsubscribeyou
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 9

Data Wrangling Notebook Summary

1. Introduction
# Data Wrangling

This notebook presents different data wrangling techniques used commonly

2. Library Imports
# import required libraries
import random
import datetime
import numpy as np
import pandas as pd
from random import randrange
from sklearn import preprocessing

from IPython.display import display

pd.options.mode.chained_assignment = None

3. Utilities
## Utilities

def _random_date(start, date_count):
    """Yield ``date_count`` pseudo-random dates derived from a base date.

    Each yielded value is ``start`` plus a uniformly random offset of
    0-41 days (offsets are independent; dates are not ordered).

    Args:
        start (datetime.datetime): the base date
        date_count (int): number of dates to generate

    Yields:
        datetime.datetime: one randomly offset date per iteration
    """
    base = start
    for _ in range(date_count):
        yield base + datetime.timedelta(days=randrange(42))

def generate_sample_data(row_count=100):
    """Generate a random transaction dataset with injected dirty values.

    Builds ``row_count`` rows of serial numbers, dates, user/product ids,
    quantities, prices and user-type codes, then deliberately introduces
    ~sqrt(row_count) missing/sentinel values per column so the dataset can
    be used to demonstrate cleaning techniques.

    Args:
        row_count (int): number of rows for the dataframe. Must be a
            positive multiple of 10 (user/product ids are generated in
            blocks of row_count // 10 and tiled 10x).

    Returns:
        pandas.DataFrame: the generated sample dataframe

    Raises:
        ValueError: if ``row_count`` is not a positive multiple of 10.
    """
    if row_count <= 0 or row_count % 10 != 0:
        # The id columns are built as (row_count // 10) values tiled 10x;
        # any other row_count would silently yield wrong-length columns.
        raise ValueError("row_count must be a positive multiple of 10")

    # sentinels (base offsets for the generated identifiers/prices)
    startDate = datetime.datetime(2016, 1, 1, 13)
    serial_number_sentinel = 1000
    user_id_sentinel = 5001
    product_id_sentinel = 101
    price_sentinel = 2000

    # base attributes
    # NOTE: 'Date' and 'User Type' are object arrays so that np.nan can be
    # stored as a genuine missing value (a '<U1' string array would coerce
    # np.nan to the literal string "nan").
    data_dict = {
        'Serial No': np.arange(row_count) + serial_number_sentinel,
        'Date': np.random.permutation(
            pd.to_datetime([x.strftime("%d-%m-%Y")
                            for x in _random_date(startDate, row_count)]).date),
        # ids are numpy arrays (not lists) so the elementwise '==' below works
        'User ID': np.asarray(np.random.permutation(
            np.random.randint(0, row_count, size=row_count // 10)
            + user_id_sentinel).tolist() * 10),
        'Product ID': np.asarray(np.random.permutation(
            np.random.randint(0, row_count, size=row_count // 10)
            + product_id_sentinel).tolist() * 10),
        'Quantity Purchased': np.random.permutation(
            np.random.randint(1, 42, size=row_count)),
        'Price': np.round(np.abs(np.random.randn(row_count) + 1) * price_sentinel,
                          decimals=2),
        'User Type': np.array([chr(random.randrange(97, 97 + 3 + 1))
                               for i in range(row_count)],
                              dtype=object)[np.random.permutation(row_count)],
    }

    # introduce missing/sentinel values: pick a random existing value and
    # blank out the first row holding it (np.argmax on the boolean mask).
    for index in range(int(np.sqrt(row_count))):
        data_dict['Price'][np.argmax(
            data_dict['Price'] == random.choice(data_dict['Price']))] = np.nan
        data_dict['User Type'][np.argmax(
            data_dict['User Type'] == random.choice(data_dict['User Type']))] = np.nan
        data_dict['Date'][np.argmax(
            data_dict['Date'] == random.choice(data_dict['Date']))] = np.nan
        data_dict['Product ID'][np.argmax(
            data_dict['Product ID'] == random.choice(data_dict['Product ID']))] = 0
        data_dict['Serial No'][np.argmax(
            data_dict['Serial No'] == random.choice(data_dict['Serial No']))] = -1
        data_dict['User ID'][np.argmax(
            data_dict['User ID'] == random.choice(data_dict['User ID']))] = -101

    # create data frame
    df = pd.DataFrame(data_dict)

    return df

def describe_dataframe(df=pd.DataFrame()):
    """Print a descriptive overview of a dataframe.

    Reports shape, column names and dtypes, missing-value columns/rows,
    ``df.info()`` / ``df.describe()`` output, and displays a 5-row sample.

    Args:
        df (pandas.DataFrame): the dataframe to be analyzed

    Returns:
        None
    """
    print("\n\n")
    print("*" * 30)
    print("About the Data")
    print("*" * 30)

    print("Number of rows::", df.shape[0])
    print("Number of columns::", df.shape[1])
    print("\n")

    print("Column Names::", df.columns.values.tolist())
    print("\n")

    print("Column Data Types::\n", df.dtypes)
    print("\n")

    print("Columns with Missing Values::", df.columns[df.isnull().any()].tolist())
    print("\n")

    # Series.nonzero() was removed in pandas 1.0 and DataFrame.any() no longer
    # accepts a positional axis, so locate rows containing NaNs via numpy once
    # and reuse the index list for both reports below.
    missing_row_indices = np.nonzero(df.isnull().any(axis=1).to_numpy())[0].tolist()

    print("Number of rows with Missing Values::", len(missing_row_indices))
    print("\n")

    print("Sample Indices with missing data::", missing_row_indices[0:5])
    print("\n")

    print("General Stats::")
    print(df.info())
    print("\n")

    print("Summary Stats::")
    print(df.describe())
    print("\n")

    print("Dataframe Sample Rows::")
    display(df.head(5))

def cleanup_column_names(df, rename_dict=None, do_inplace=True):
    """Rename the columns of a pandas dataframe.

    Converts column names to snake_case (lowercase, spaces -> underscores)
    when no explicit mapping is given.

    Args:
        df (pandas.DataFrame): the dataframe whose columns are renamed
        rename_dict (dict | None): keys are old column names, values the new
            ones; ``None`` (or an empty dict) triggers snake_case conversion
        do_inplace (bool): flag to update existing dataframe or return a new one

    Returns:
        pandas dataframe if do_inplace is set to False, None otherwise
    """
    # None sentinel instead of a mutable {} default; 'if not' keeps the old
    # behavior for callers that explicitly pass an empty dict.
    if not rename_dict:
        rename_dict = {col: col.lower().replace(' ', '_')
                       for col in df.columns.values.tolist()}
    return df.rename(columns=rename_dict, inplace=do_inplace)

def expand_user_type(u_type):
    """Map a raw user-type code to a descriptive user class.

    Args:
        u_type (str): user type value ('a'-'d')

    Returns:
        (str) user_class value; 'error' for any unrecognized input
    """
    type_to_class = {
        'a': 'new',
        'b': 'new',
        'c': 'existing',
        'd': 'loyal_existing',
    }
    try:
        return type_to_class[u_type]
    except (KeyError, TypeError):
        # unknown codes, NaN, and unhashable inputs all map to 'error'
        return 'error'

4. Sample Dataset Generation


## Generate a Sample Dataset

# Build the 1000-row synthetic transaction dataset used by all examples below.
df = generate_sample_data(row_count=1000)

5. Additional Explanations
### Describe the Dataset

### Rename Columns

### Sort Rows on defined attributes

### Rearrange Columns in a Dataframe

### Filtering Columns

Using Column Index

Using Column Name

Using Column Datatype

### Filtering Rows

Select specific rows

Exclude Specific Row indices

Conditional Filtering

Offset from top of the dataframe

Offset from bottom of the dataframe

### TypeCasting/Data Type Conversion

### Apply/Map Usage


Map : Create a derived attribute using map

Apply: Using apply to get attribute ranges

Applymap: Extract week from date

### Missing Values

Drop Rows with missing dates

Fill Missing Price values with mean price

Fill Missing user_type values with value from previous row (forward fill)

Fill Missing user_type values with value from next row (backward fill)

### Duplicates

Drop Duplicate serial_no rows

Remove rows which have less than 3 attributes with non-missing data

### Encode Categoricals

One Hot Encoding using get_dummies()

Label Mapping

### Random Sampling data from DataFrame

### Normalizing Numeric Values

Normalize price values using **Min-Max Scaler**

Normalize quantity purchased values using **Robust Scaler**

### Data Summarization

Condition based aggregation

Condition based counts

### Group By

Group By certain attributes

Group By with different aggregate functions

Group by specific aggregate functions for each attribute

Group by with multiple agg for each attribute


### Pivot Tables

### Stack a Dataframe

6. Data Wrangling Code


# --- Describe the dataset ---
describe_dataframe(df)

# --- Rename columns to snake_case ---
print("Dataframe columns:\n{}".format(df.columns.tolist()))
cleanup_column_names(df)
print("Dataframe columns:\n{}".format(df.columns.tolist()))

# --- Sort rows on defined attributes ---
display(df.sort_values(['serial_no', 'price'],
                       ascending=[True, False]).head())

# --- Rearrange columns in a dataframe ---
display(df[['serial_no', 'date', 'user_id', 'user_type',
            'product_id', 'quantity_purchased', 'price']].head())

# --- Filtering columns ---
# print 10 values from column at index 3
print(df.iloc[:, 3].values[0:10])

# print 10 values of quantity purchased
print(df.quantity_purchased.values[0:10])

# print 10 values of columns with data type float
print(df.select_dtypes(include=['float64']).values[:10, 0])

# --- Filtering rows ---
# select specific row indices
display(df.iloc[[10, 501, 20]])

# exclude specific row indices
display(df.drop([0, 24, 51], axis=0).head())

# conditional filtering
display(df[df.quantity_purchased > 25].head())

# offset from top / bottom of the dataframe
display(df[100:].head())
display(df[-10:].head())

# --- Typecasting / data type conversion ---
df['date'] = pd.to_datetime(df.date)
# compare dtypes of the original df with this one
print(df.dtypes)

# --- Apply/Map usage ---
# map: create a derived attribute
df['user_class'] = df['user_type'].map(expand_user_type)
display(df.tail())

# apply: get attribute ranges of numeric columns
display(df.select_dtypes(include=[np.number]).apply(lambda x: x.max() - x.min()))

# applymap: extract week from date (0 where the date is missing)
df['purchase_week'] = df[['date']].applymap(
    lambda dt: dt.week if not pd.isnull(dt.week) else 0)
display(df.head())

# --- Missing values ---
# drop rows with missing dates
df_dropped = df.dropna(subset=['date'])
display(df_dropped.head())

# fill missing price values with the mean price
df_dropped['price'].fillna(value=np.round(df.price.mean(), decimals=2),
                           inplace=True)

# NOTE: fillna(method=...) is deprecated in pandas 2.x; prefer .ffill()/.bfill()
# fill missing user_type from previous row (forward fill)
df_dropped['user_type'].fillna(method='ffill', inplace=True)
# fill missing user_type from next row (backward fill)
df_dropped['user_type'].fillna(method='bfill', inplace=True)

# --- Duplicates ---
# sample duplicates
display(df_dropped[df_dropped.duplicated(subset=['serial_no'])].head())
print("Shape of df={}".format(df_dropped.shape))

df_dropped.drop_duplicates(subset=['serial_no'], inplace=True)

# updated dataframe
display(df_dropped.head())
print("Shape of df={}".format(df_dropped.shape))

# remove rows which have fewer than 3 non-missing attributes
display(df.dropna(thresh=3).head())
print("Shape of df={}".format(df.dropna(thresh=3).shape))

# --- Encode categoricals ---
# one hot encoding using get_dummies()
display(pd.get_dummies(df, columns=['user_type']).head())

# label mapping
type_map = {'a': 0, 'b': 1, 'c': 2, 'd': 3, np.nan: -1}
df['encoded_user_type'] = df.user_type.map(type_map)
display(df.tail())

# --- Random sampling from the dataframe ---
display(df.sample(frac=0.2, replace=True, random_state=42).head())

# --- Normalizing numeric values ---
# normalize price values using Min-Max scaler
df_normalized = df.dropna().copy()
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(
    df_normalized['price'].values.reshape(-1, 1))
df_normalized['price'] = np_scaled.reshape(-1, 1)
display(df_normalized.head())

# normalize quantity purchased values using Robust scaler
df_normalized = df.dropna().copy()
robust_scaler = preprocessing.RobustScaler()
rs_scaled = robust_scaler.fit_transform(
    df_normalized['quantity_purchased'].values.reshape(-1, 1))
df_normalized['quantity_purchased'] = rs_scaled.reshape(-1, 1)
display(df_normalized.head())

# --- Data summarization ---
# condition based aggregation
print("Mean price of items purchased by user_type=a :: {}".format(
    df['price'][df['user_type'] == 'a'].mean()))

# condition based counts
print(df['purchase_week'].value_counts())

# --- Group by ---
# group by certain attributes
print(df.groupby(['user_class'])['quantity_purchased'].sum())

# group by with different aggregate functions
display(df.groupby(['user_class'])['quantity_purchased'].agg(
    [np.sum, np.mean, np.count_nonzero]))

# group by with a specific aggregate function per attribute
display(df.groupby(['user_class', 'user_type']).agg(
    {'price': np.mean,
     'quantity_purchased': np.max}))

# group by with multiple aggregates per attribute
# NOTE: dict-of-dicts renaming in .agg() was removed in pandas 1.0; on modern
# pandas use named aggregation, e.g. .agg(total_price=('price', 'sum'), ...)
display(df.groupby(['user_class', 'user_type']).agg(
    {'price': {'total_price': np.sum,
               'mean_price': np.mean,
               'variance_price': np.std,
               'count': np.count_nonzero},
     'quantity_purchased': np.sum}))

# --- Pivot tables ---
display(df.pivot_table(index='date', columns='user_type',
                       values='price', aggfunc=np.mean))

# --- Stack a dataframe ---
print(df.stack())

You might also like