Pandas - Basics - Practice - Assignment 2 - PDF
Pandas - Basics - Practice - Assignment 2 - PDF
Consider the following Python dictionary data and Python list labels:
data = {'birds': ['Cranes', 'Cranes', 'plovers', 'spoonbills', 'spoonbills', 'Cranes', 'plovers', 'Cranes',
'spoonbills', 'spoonbills'], 'age': [3.5, 4, 1.5, np.nan, 6, 3, 5.5, np.nan, 8, 4], 'visits': [2, 4, 3, 4, 3, 4, 2,
2, 3, 2], 'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
1. Create a DataFrame birds from this dictionary data which has the index labels.
In [ ]:
import numpy as np
import pandas as pd

# Ten bird observations; np.nan marks the two records whose age is unknown.
# (The exported cell was cut off mid-literal; the full data is restored here.)
data = {
    'birds': ['Cranes', 'Cranes', 'plovers', 'spoonbills', 'spoonbills',
              'Cranes', 'plovers', 'Cranes', 'spoonbills', 'spoonbills'],
    'age': [3.5, 4, 1.5, np.nan, 6, 3, 5.5, np.nan, 8, 4],
    'visits': [2, 4, 3, 4, 3, 4, 2, 2, 3, 2],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

# Use the letters as the row index instead of the default 0..9 range.
df = pd.DataFrame(data, index=labels)
print(df)
In [ ]:
# DataFrame.info() writes its summary directly to stdout and returns None,
# so printing the return value adds a trailing "None" line after the summary.
info_result = df.info()
print(info_result)
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 birds 10 non-null object
1 age 8 non-null float64
2 visits 10 non-null int64
3 priority 10 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes
None
3. Print the first 2 rows of the birds dataframe
In [ ]:
# First two rows of the frame, in their current order.
df.head(2)
4. Print all the rows with only 'birds' and 'age' columns from the dataframe
In [ ]:
import pandas as pd
import numpy as np

# Same bird observations as above; np.nan marks an unknown age.
# (The exported cell was cut off mid-literal; the full data is restored here.)
data = {
    'birds': ['Cranes', 'Cranes', 'plovers', 'spoonbills', 'spoonbills',
              'Cranes', 'plovers', 'Cranes', 'spoonbills', 'spoonbills'],
    'age': [3.5, 4, 1.5, np.nan, 6, 3, 5.5, np.nan, 8, 4],
    'visits': [2, 4, 3, 4, 3, 4, 2, 2, 3, 2],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data, index=labels)

# Keep every row but only the two requested columns; printing the whole
# frame would not answer the question.
birds_age = df[['birds', 'age']]
print(birds_age)
In [ ]:
# Label-based form of the same selection: rows 'c', 'd', 'h' sit at
# positions 2, 3, 7 and the three columns at positions 0, 1, 2.
df.loc[['c', 'd', 'h'], ['birds', 'age', 'visits']]
c plovers 1.5 3
d spoonbills NaN 4
h Cranes NaN 2
In [ ]:
# Boolean mask: True for birds seen fewer than four times.
few_visits = df['visits'] < 4
df[few_visits]
c plovers 1.5 3 no
e spoonbills 6.0 3 no
g plovers 5.5 2 no
i spoonbills 8.0 3 no
j spoonbills 4.0 2 no
7. Select the rows with columns ['birds', 'visits'] where the age is missing, i.e. NaN.
In [ ]:
import numpy as np
import pandas as pd

# Same bird observations as above; np.nan marks an unknown age.
# (The exported cell was cut off mid-literal; the full data is restored here.)
data = {
    'birds': ['Cranes', 'Cranes', 'plovers', 'spoonbills', 'spoonbills',
              'Cranes', 'plovers', 'Cranes', 'spoonbills', 'spoonbills'],
    'age': [3.5, 4, 1.5, np.nan, 6, 3, 5.5, np.nan, 8, 4],
    'visits': [2, 4, 3, 4, 3, 4, 2, 2, 3, 2],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data, index=labels)
print(df)

# Row filter (age is NaN) and column pick done in a single .loc call.
missing_age = df.loc[df['age'].isna(), ['birds', 'visits']]
missing_age
d spoonbills 4
h Cranes 2
8. Select the rows where the bird is 'Cranes' and the age is less than 4.
In [ ]:
# Combine the two conditions element-wise; rows with a NaN age compare
# False against 4 and are therefore excluded.
is_crane = df['birds'] == 'Cranes'
is_under_four = df['age'] < 4
df.loc[is_crane & is_under_four]
f Cranes 3.0 4 no
In [ ]:
# Rows whose age lies in the half-open interval (2, 4]: strictly above 2,
# up to and including 4.
age = df['age']
df.loc[(age > 2) & (age <= 4)]
f Cranes 3.0 4 no
j spoonbills 4.0 2 no
In [ ]:
# One-column frame holding the visit counts of the 'Cranes' rows only.
crane_rows = df['birds'] == 'Cranes'
bc = df.loc[crane_rows, ['visits']]
print(bc)
# Column-wise sum (the default axis), i.e. the total visits for Cranes.
print(bc.sum())
visits
a 2
b 4
f 4
h 2
visits 12
dtype: int64
11. Calculate the mean age for each different birds in dataframe.
file:///C:/Users/HP/OneDrive/Applied ai/Module 1/assignment/pandas_basics_practice_assignment 2.html 3/7
9/14/22, 11:02 PM pandas_basics_practice_assignment 2
In [ ]:
import numpy as np
import pandas as pd

# Same bird observations as above; np.nan marks an unknown age.
# (The exported cell was cut off mid-literal; the full data is restored here.)
data = {
    'birds': ['Cranes', 'Cranes', 'plovers', 'spoonbills', 'spoonbills',
              'Cranes', 'plovers', 'Cranes', 'spoonbills', 'spoonbills'],
    'age': [3.5, 4, 1.5, np.nan, 6, 3, 5.5, np.nan, 8, 4],
    'visits': [2, 4, 3, 4, 3, 4, 2, 2, 3, 2],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data, index=labels)

gb = df.groupby('birds')
# numeric_only=True skips the string 'priority' column; without it,
# pandas >= 2.0 raises TypeError when it tries to average 'yes'/'no'.
# NaN ages are ignored by mean(), so each group averages its known ages.
mean_by_bird = gb.mean(numeric_only=True)
print(mean_by_bird)
age visits
birds
Cranes 3.5 3.0
plovers 3.5 2.5
spoonbills 6.0 3.0
12. Append a new row 'k' to dataframe with your choice of values for each column. Then
delete that row to return the original DataFrame.
In [ ]:
import numpy as np
import pandas as pd

# Same bird observations as above; np.nan marks an unknown age.
# (The exported cell was cut off mid-literal; the full data is restored here.)
data = {
    'birds': ['Cranes', 'Cranes', 'plovers', 'spoonbills', 'spoonbills',
              'Cranes', 'plovers', 'Cranes', 'spoonbills', 'spoonbills'],
    'age': [3.5, 4, 1.5, np.nan, 6, 3, 5.5, np.nan, 8, 4],
    'visits': [2, 4, 3, 4, 3, 4, 2, 2, 3, 2],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data, index=labels)

# Build the extra row as its own one-row frame labelled 'k' so it can be
# stacked under the original with concat.
data1 = {'birds': ['seagul'], 'age': [5], 'visits': [5], 'priority': ['yes']}
labels1 = ['k']
df1 = pd.DataFrame(data1, index=labels1)

df2 = pd.concat([df, df1])
print(df2)
print("\n***********************\n")

# drop() returns a new frame without row 'k'; df2 itself is left unchanged.
df3 = df2.drop(index='k')
print(df3)
***********************
e spoonbills 6.0 3 no
f Cranes 3.0 4 no
g plovers 5.5 2 no
h Cranes NaN 2 yes
i spoonbills 8.0 3 no
j spoonbills 4.0 2 no
13. Find the number of each type of birds in dataframe (Counts)
In [ ]:
import numpy as np
import pandas as pd

# Same bird observations as above; np.nan marks an unknown age.
# (The exported cell was cut off mid-literal; the full data is restored here.)
data = {
    'birds': ['Cranes', 'Cranes', 'plovers', 'spoonbills', 'spoonbills',
              'Cranes', 'plovers', 'Cranes', 'spoonbills', 'spoonbills'],
    'age': [3.5, 4, 1.5, np.nan, 6, 3, 5.5, np.nan, 8, 4],
    'visits': [2, 4, 3, 4, 3, 4, 2, 2, 3, 2],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data, index=labels)

# size() counts rows per group (NaN ages included), giving one integer per
# bird type, indexed by the bird name.
gb = df.groupby('birds').size()
print(gb)
birds
Cranes 4
plovers 2
spoonbills 4
dtype: int64
14. Sort the dataframe (birds) first by the values in the 'age' column in descending order, then by the
values in the 'visits' column in ascending order.
In [ ]:
import numpy as np
import pandas as pd

# Same bird observations as above; np.nan marks an unknown age.
# (The exported cell was cut off mid-literal; the full data is restored here.)
data = {
    'birds': ['Cranes', 'Cranes', 'plovers', 'spoonbills', 'spoonbills',
              'Cranes', 'plovers', 'Cranes', 'spoonbills', 'spoonbills'],
    'age': [3.5, 4, 1.5, np.nan, 6, 3, 5.5, np.nan, 8, 4],
    'visits': [2, 4, 3, 4, 3, 4, 2, 2, 3, 2],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data, index=labels)

# Primary key 'age' descending, ties broken by 'visits' ascending.
# Rows with a NaN age are placed last (the default na_position).
df.sort_values(by=['age', 'visits'], ascending=[False, True], inplace=True)
df
i spoonbills 8.0 3 no
e spoonbills 6.0 3 no
g plovers 5.5 2 no
j spoonbills 4.0 2 no
f Cranes 3.0 4 no
c plovers 1.5 3 no
15. Replace the priority column values so that 'yes' becomes 1 and 'no' becomes 0.
In [ ]:
import numpy as np
import pandas as pd

# Same bird observations as above; np.nan marks an unknown age.
# (The exported cell was cut off mid-literal; the full data is restored here.)
data = {
    'birds': ['Cranes', 'Cranes', 'plovers', 'spoonbills', 'spoonbills',
              'Cranes', 'plovers', 'Cranes', 'spoonbills', 'spoonbills'],
    'age': [3.5, 4, 1.5, np.nan, 6, 3, 5.5, np.nan, 8, 4],
    'visits': [2, 4, 3, 4, 3, 4, 2, 2, 3, 2],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data, index=labels)

# Recode only the 'priority' column. A frame-wide replace would also rewrite
# any 'yes'/'no' that happened to appear in another column.
df['priority'] = df['priority'].map({'yes': 1, 'no': 0})
df
a Cranes 3.5 2 1
b Cranes 4.0 4 1
c plovers 1.5 3 0
d spoonbills NaN 4 1
e spoonbills 6.0 3 0
f Cranes 3.0 4 0
g plovers 5.5 2 0
h Cranes NaN 2 1
i spoonbills 8.0 3 0
j spoonbills 4.0 2 0
In [ ]:
# Swap every 'Cranes' value in the frame for 'trumpeters', modifying df
# in place, then display the result.
df.replace('Cranes', 'trumpeters', inplace=True)
df
a trumpeters 3.5 2 1
b trumpeters 4.0 4 1
c plovers 1.5 3 0
d spoonbills NaN 4 1
e spoonbills 6.0 3 0
f trumpeters 3.0 4 0
g plovers 5.5 2 0
h trumpeters NaN 2 1
i spoonbills 8.0 3 0
j spoonbills 4.0 2 0