Week 2
Week 2
# Two small frames that share only the 'Name' column, so they can be merged on it.
# df1 describes people (country and role); df2 assigns numeric IDs to names.
data1 = dict(
    Name=['Pankaj', 'Meghna', 'Lisa'],
    Country=['India', 'India', 'USA'],
    Role=['CEO', 'CTO', 'CTO'],
)
data2 = dict(
    ID=[1, 2, 3],
    Name=['Pankaj', 'Anupam', 'Amit'],
)
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Show each frame under its heading (heading and frame on separate lines).
print("DataFrame 1:", df1, sep="\n")
print("\nDataFrame 2:", df2, sep="\n")
DataFrame 1:
Name Country Role
0 Pankaj India CEO
1 Meghna India CTO
2 Lisa USA CTO
DataFrame 2:
ID Name
0 1 Pankaj
1 2 Anupam
2 3 Amit
# Join demo on the shared 'Name' column of df1 and df2 (both built in an
# earlier cell). The three variants differ only in which side's rows are kept
# when a name has no partner in the other frame; missing fields become NaN.

# Left Join: keep every row of df1, attach df2 columns where the name matches
result_left = pd.merge(df1, df2, on='Name', how='left')
print("\nResult Left Join:")
print(result_left)

# Right Join: keep every row of df2, attach df1 columns where the name matches
result_right = pd.merge(df1, df2, on='Name', how='right')
print("\nResult Right Join:")
print(result_right)

# Outer Join: keep the rows of both frames (union of the names)
result_outer = pd.merge(df1, df2, on='Name', how='outer')
print("\nResult Outer Join:")
print(result_outer)
Sales DataFrame:
ID Amount
0 1 100
1 2 200
2 3 300
3 4 400
Region DataFrame:
ID Region
0 1 East
1 2 West
2 3 North
3 5 South
Left Join:
ID Amount Region
0 1 100 East
1 2 200 West
2 3 300 North
3 4 400 NaN
Right Join:
ID Amount Region
0 1 100.0 East
1 2 200.0 West
2 3 300.0 North
3 5 NaN South
Outer Join:
ID Amount Region
0 1 100.0 East
1 2 200.0 West
2 3 300.0 North
3 4 400.0 NaN
4 5 NaN South
import numpy as np
import pandas as pd
# Sample frame with NaN gaps in every column; all demos below derive from it.
data = {
    'A': [1, np.nan, 3, 4],
    'B': [5, 6, np.nan, 8],
    'C': [np.nan, np.nan, 9, 10],
}
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# Each entry pairs the heading to print with a frame derived from df.
# Every call below returns a new object, so df itself is left as constructed.
demos = [
    # 1. Rows containing at least one NaN are removed
    ("\nDrop rows with any missing values:", df.dropna()),
    # 2. Columns containing at least one NaN are removed (axis=1)
    ("\nDrop columns with at least one missing value:", df.dropna(axis=1)),
    # 3. Only rows that are NaN in every column are removed (default axis is rows)
    ("\nDrop rows/columns with all missing values:", df.dropna(how='all')),
    # 4. Rows are kept only if they hold at least 2 non-NaN values
    ("\nDrop rows/columns based on threshold:", df.dropna(thresh=2)),
    # 5. Forward fill: each NaN takes the last valid value above it
    ("\nReplace NaN with the previous value:", df.ffill()),
    # 6. Forward fill, but at most 1 consecutive NaN is filled per gap
    ("\nReplace NaN with the previous value, limit=1:", df.ffill(limit=1)),
    # 7. Backward fill: each NaN takes the next valid value below it
    ("\nReplace NaN with the forward value:", df.bfill()),
]
for heading, frame in demos:
    print(heading)
    print(frame)
Original DataFrame:
A B C
0 1.0 5.0 NaN
1 NaN 6.0 NaN
2 3.0 NaN 9.0
3 4.0 8.0 10.0
import pandas as pd
0 3 0 7
1 2 3 14
2 0 7 6
3 1 2 15
0 13 10 20.0
1 12 13 23.0
2 10 17 27.0
3 2 2 4.0
4 55 9 NaN
5 98 76 NaN
# Look up the index label found at position 2, drop the row(s) carrying that
# label, and rebind df2 to the result; the trailing expression displays it.
row_label = df2.index[2]
df2 = df2.drop(index=row_label)
df2
grapes mango banana
0 13 10 20.0
1 12 13 23.0
3 2 2 4.0
4 55 9 NaN
5 98 76 NaN
# Bare expression: in a notebook cell this displays the current contents of df1
df1
0 3 0 7
1 2 3 14
2 0 7 6
3 1 2 15
%%time
# Row-by-row growth, kept as a loop on purpose: this cell exists to time the
# slow pattern (each concat copies the frame built so far).
# The seed is an empty but int64-typed column 'A', so concatenating it with the
# integer rows neither changes dtype nor depends on how pandas treats empty
# frames during concat.
df = pd.DataFrame({'A': pd.Series(dtype='int64')})
for i in range(30):
    # Instead of append, use concat to add rows
    df = pd.concat([df, pd.DataFrame([{'A': i*2}])], ignore_index=True)
%%time
# Build all thirty single-row frames first, then stitch them together with one
# concat call; ignore_index renumbers the rows 0..29.
frames = [pd.DataFrame({'A': [2 * k]}) for k in range(30)]
df = pd.concat(frames, ignore_index=True)
CPU times: user 11.4 ms, sys: 1.04 ms, total: 12.5 ms
Wall time: 39.6 ms