002 Python Pandas
002 Python Pandas
Installation of Pandas
If you are using Conda, you can install this library by typing the following command in the Anaconda prompt: `conda install pandas`
If you are using pip, you can install this library by typing the following command in the Anaconda prompt (or any terminal): `pip install pandas`
Series
Syntax:
In [248]: #Example1
# Build a Series from a plain list; with no index given, pandas labels the
# items with the default positions 0..n-1.
data1 = list('abcde')
s1 = pd.Series(data=data1)
s1
Out[248]: 0 a
1 b
2 c
3 d
4 e
dtype: object
In [249]: #Example2
# Build a Series over the same data, this time labelled by an explicit list
# of index values instead of the default positions.
Index = ['x' + str(n) for n in range(1, 6)]
s2 = pd.Series(data=data1, index=Index)
s2
Out[249]: x1 a
x2 b
x3 c
x4 d
x5 e
dtype: object
DataFrame
Syntax:
Out[250]:
0
0 1
1 2
2 3
3 4
4 5
Out[251]:
Fruit Count
0 Apple 10
1 Orange 12
2 Pear 13
In [252]: #Creating a dataframe from a dictionary of lists
# Each key becomes a column name and each list becomes that column's values.
data4 = dict(
    Name=['Rick', 'Megan', 'John', 'Jill'],
    Age=[28, 25, 29, 27],
)
df3 = pd.DataFrame(data=data4)
df3
Out[252]:
Name Age
0 Rick 28
1 Megan 25
2 John 29
3 Jill 27
In [253]: #Creating a dataframe from a list of dictionaries; keys missing from a dictionary show up as missing values ("NaN")
# Each dictionary is one row. A key that a row does not define is left as
# NaN in that column of the resulting dataframe.
data5 = [
    dict(a=41, b=4),
    dict(a=3, b=8, c=0),
    dict(b=22, c=16),
    dict(a=15, b=4, c=5),
]
df4 = pd.DataFrame(data=data5, index=['first', 'second', 'third', 'fourth'])
df4
Out[253]:
a b c
Out[254]:
one two
a 1.0 1
b 2.0 2
c 3.0 3
d NaN 4
In [255]: # Adding new column to df5 dataframe
# The new column is a Series labelled 'a'..'d'; assignment lines its values
# up with df5's row labels.
df5['three'] = pd.Series(list('xyzw'), index=list('abcd'))
df5
Out[255]:
one two three
a 1.0 1 x
b 2.0 2 y
c 3.0 3 z
d NaN 4 w
Out[256]:
one two three four five
a 1.0 1 x 2.0 5
b 2.0 2 y 4.0 5
c 3.0 3 z 6.0 5
d NaN 4 w NaN 5
Out[257]:
one two three four
a 1.0 1 x 2.0
b 2.0 2 y 4.0
c 3.0 3 z 6.0
d NaN 4 w NaN
Out[258]:
one two three four five
b 2.0 2 y 4.0 5
d NaN 4 w NaN 5
In [259]: #Row selection using the .loc[] indexer
# .loc is used with square brackets and a row label. Selecting the single
# label 'b' gives that row back as a Series named 'b', indexed by the column
# names (see the recorded output below).
df5.loc['b']
Out[259]: one 2
two 2
three y
four 4
five 5
Name: b, dtype: object
Out[260]: one 3
two 3
three z
four 6
five 5
Name: c, dtype: object
Out[261]:
one two three four five
c 3.0 3 z 6.0 5
d NaN 4 w NaN 5
Out[262]:
Fruit Count
0 Apple 10
1 Orange 12
2 Pear 13
In [263]: #Appending the rows of df7 to df6
# Second fruit table with the same columns as df6.
# `dtype=int` is intentionally not passed: it cannot apply to the string
# 'Fruit' column (older pandas silently ignored it, newer pandas raises),
# and 'Count' is inferred as an integer column without it.
df7 = pd.DataFrame(
    [['Strawberry', 5], ['Pineapple', 3], ['Grapes', 20]],
    columns=['Fruit', 'Count'],
)
# pd.concat is the supported replacement for DataFrame.append, which is
# deprecated and no longer exists in pandas 2.0+. Both frames keep their own
# row labels, so the result is labelled 0, 1, 2, 0, 1, 2.
# NOTE: this rebinds df6, so re-running the cell adds df7's rows again.
df6 = pd.concat([df6, df7])
df6
Out[263]:
Fruit Count
0 Apple 10
1 Orange 12
2 Pear 13
0 Strawberry 5
1 Pineapple 3
2 Grapes 20
Out[264]: '0.25.1'
Out[267]:
id name host_id host_name neighbourhood_group neighbourhood latitude
Skylit Midtown
1 2595 2845 Jennifer Manhattan Midtown 40.75362
Castle
THE VILLAGE
OF
2 3647 4632 Elisabeth Manhattan Harlem 40.80902
HARLEM....NEW
YORK !
Cozy Entire
3 3831 Floor of 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514
Brownstone
Entire Apt:
Spacious
4 5022 7192 Laura Manhattan East Harlem 40.79851
Studio/Loft by
central park
Out[268]:
id name host_id host_name neighbourhood_group neighbourhood la
Charming one
bedroom -
Bedford-
48890 36484665 newly 8232441 Sabrina Brooklyn 40
Stuyvesant
renovated
rowhouse
Affordable
room in
48891 36485057 6570630 Marisol Brooklyn Bushwick 40
Bushwick/East
Williamsburg
Sunny Studio
Ilgar &
48892 36485431 at Historical 23492952 Manhattan Harlem 40
Aysel
Neighborhood
Trendy duplex
in the very
48894 36487245 68119814 Christophe Manhattan Hell's Kitchen 40
heart of Hell's
Kitchen
In [269]: #To see the list of the row axis labels and column axis labels of the dataframe.
# `axes` is an attribute (no call parentheses). The recorded output is a
# two-item list: the row index first, then the column labels.
dfx.axes
Out[269]: [RangeIndex(start=0, stop=48895, step=1),
Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
'minimum_nights', 'number_of_reviews', 'last_review',
'reviews_per_month', 'calculated_host_listings_count',
'availability_365'],
dtype='object')]
Out[270]: id int64
name object
host_id int64
host_name object
neighbourhood_group object
neighbourhood object
latitude float64
longitude float64
room_type object
price int64
minimum_nights int64
number_of_reviews int64
last_review object
reviews_per_month float64
calculated_host_listings_count int64
availability_365 int64
dtype: object
In [271]: #Returns a Boolean value saying whether the object is empty or not. True = empty
# `empty` is an attribute, not a method call; the recorded output for this
# dataframe is False.
dfx.empty
Out[271]: False
Out[272]: 2
Out[274]: 782320
In [275]: #To display the actual data in the dataframe in array format
# `values` is an attribute; the recorded output is a single 2-D array with
# dtype=object that holds every column together.
dfx.values
Out[275]: array([[2539, 'Clean & quiet apt home by the park', 2787, ..., 0.21, 6,
365],
[2595, 'Skylit Midtown Castle', 2845, ..., 0.38, 2, 355],
[3647, 'THE VILLAGE OF HARLEM....NEW YORK !', 4632, ..., nan, 1,
365],
...,
[36485431, 'Sunny Studio at Historical Neighborhood', 23492952,
..., nan, 1, 27],
[36485609, '43rd St. Time Square-cozy single bed', 30985759, ...,
nan, 6, 2],
[36487245, "Trendy duplex in the very heart of Hell's Kitchen",
68119814, ..., nan, 1, 23]], dtype=object)
In [276]: #Returns the sum of the values for the requested axis.
# Total of the minimum_nights column (column picked by name with brackets).
dfx['minimum_nights'].sum()
Out[276]: 343730
Out[277]: 7.029962163820431
Out[278]: 20.51054953317987
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
id 48895 non-null int64
name 48879 non-null object
host_id 48895 non-null int64
host_name 48874 non-null object
neighbourhood_group 48895 non-null object
neighbourhood 48895 non-null object
latitude 48895 non-null float64
longitude 48895 non-null float64
room_type 48895 non-null object
price 48895 non-null int64
minimum_nights 48895 non-null int64
number_of_reviews 48895 non-null int64
last_review 38843 non-null object
reviews_per_month 38843 non-null float64
calculated_host_listings_count 48895 non-null int64
availability_365 48895 non-null int64
dtypes: float64(3), int64(7), object(6)
memory usage: 6.0+ MB
In [280]: #Displays how many times each distinct value occurs in the column or series, from most frequent to least frequent
# dropna=False asks value_counts to include missing values in the tally
# instead of leaving them out.
dfx['minimum_nights'].value_counts(dropna=False)
Out[280]: 1 12720
2 11696
3 7999
30 3760
4 3303
...
42 1
186 1
265 1
1000 1
364 1
Name: minimum_nights, Length: 109, dtype: int64
Out[281]:
id host_id latitude longitude price minimum_nights
Out[282]: 1
Out[283]: 1250
Out[284]: 3.0
In [285]: #Mode of values
# mode() returns a Series rather than a scalar; the recorded output below
# has one entry.
dfx['minimum_nights'].mode()
Out[285]: 0 1
dtype: int64
Out[286]: 0
Out[287]: 0 1
1 1
2 3
3 1
4 10
..
48890 2
48891 4
48892 10
48893 1
48894 7
Name: minimum_nights, Length: 48895, dtype: int64
Out[288]: 0 1
1 2
2 5
3 6
4 16
...
48890 343708
48891 343712
48892 343722
48893 343723
48894 343730
Name: minimum_nights, Length: 48895, dtype: int64
In [289]: #Cumulative product
# Running product of minimum_nights down the rows.
# NOTE(review): the recorded int64 output ends in zeros for the last rows,
# which is consistent with the product wrapping around the int64 range --
# verify before relying on these numbers.
dfx['minimum_nights'].cumprod()
Out[289]: 0 1
1 1
2 3
3 3
4 30
..
48890 0
48891 0
48892 0
48893 0
48894 0
Name: minimum_nights, Length: 48895, dtype: int64
Out[290]:
id host_id latitude longitude price minimum_night
Out[291]:
id host_id latitude longitude
Out[292]: id 48895
name 48879
host_id 48895
host_name 48874
neighbourhood_group 48895
neighbourhood 48895
latitude 48895
longitude 48895
room_type 48895
price 48895
minimum_nights 48895
number_of_reviews 48895
last_review 38843
reviews_per_month 38843
calculated_host_listings_count 48895
availability_365 48895
dtype: int64
Out[293]:
id name host_id host_name neighbourhood_group neighbourhood latitude
CHELSEA
1 Bdrm
700 258690 Plus 1359611 Andrea Manhattan Chelsea 40.74618
Sleeping
Loft!!
Easy,
comfortable
754 271694 1387370 James Manhattan Midtown 40.75282
studio in
Midtown
Cozy Room
in Sunny
970 387324 Apartment 1828506 Yogi Manhattan Kips Bay 40.74238
(Long/Short
Term)
800sqft
apartment
1305 568684 2798644 Alessandra Brooklyn Bushwick 40.70202
with huge
terrace
Manhattan
1449 649561 Sky Crib (1 3260084 David Manhattan Chelsea 40.75164
year sublet)
Out[294]:
id name host_id host_name neighbourhood_group neighbourhood latitude
Clean &
quiet
apt
0 2539 2787 John Brooklyn Kensington 40.64749
home
by the
park
Cozy
Room
with
Window
16078 12990578 49447536 Wing Yan Manhattan Hell's Kitchen 40.75484
View
near
Times
Square
Share
space
16090 13001082 4112409 Rick Manhattan East Harlem 40.79193
in E
Harlem
In [295]: # To sort the rows by a column in descending order
# Largest minimum_nights first; keep only the top three rows.
(
    dfx
    .sort_values(by='minimum_nights', ascending=False)
    .head(3)
)
Out[295]:
id name host_id host_name neighbourhood_group neighbourhood latitud
Prime
W.
Greenwich
5767 4204302 Village 17550546 Genevieve Manhattan 40.7329
Village
location
1 bdrm
Battery Park
2854 1615764 NaN 6676776 Peter Manhattan 40.7123
City
Shared
Studio Greenwich
38664 30378211 200401254 Meg Manhattan 40.7309
(females Village
only)
In [296]: #Sorts the rows by minimum_nights in ascending order and, for equal minimum_nights, by price in descending order.
# Two sort keys with one direction each: minimum_nights ascending, then
# price descending. Only the first three rows are shown.
(
    dfx
    .sort_values(by=['minimum_nights', 'price'], ascending=[True, False])
    .head(3)
)
Out[296]:
id name host_id host_name neighbourhood_group neighbourhood lati
Film
4377 2953058 1177497 Jessica Brooklyn Clinton Hill 40.6
Location
East 72nd
Townhouse
29662 22779726 156158778 Sally Manhattan Upper East Side 40.7
by (Hidden
by Airbnb)
70' Luxury
MotorYacht Battery Park
42523 33007610 7407743 Jack Manhattan 40.7
on the City
Hudson
In [297]: #Displays the different categories in the given column. You can group data using the groupby function on one or more columns
# Group the rows on room_type and show the `groups` attribute of the result.
dfx.groupby(by=['room_type']).groups
Data Cleaning
Out[298]: id 0
name 16
host_id 0
host_name 21
neighbourhood_group 0
neighbourhood 0
latitude 0
longitude 0
room_type 0
price 0
minimum_nights 0
number_of_reviews 0
last_review 10052
reviews_per_month 10052
calculated_host_listings_count 0
availability_365 0
dtype: int64
In [299]: #Displays number of non null cells in each column
# notna() marks each non-missing cell as True; summing the booleans per
# column counts them.
dfx.notna().sum()
Out[299]: id 48895
name 48879
host_id 48895
host_name 48874
neighbourhood_group 48895
neighbourhood 48895
latitude 48895
longitude 48895
room_type 48895
price 48895
minimum_nights 48895
number_of_reviews 48895
last_review 38843
reviews_per_month 38843
calculated_host_listings_count 48895
availability_365 48895
dtype: int64
Out[300]:
id name host_id host_name neighbourhood_group neighbourhood latitude
Skylit Midtown
1 2595 2845 Jennifer Manhattan Midtown 40.75362
Castle
THE VILLAGE
OF
2 3647 4632 Elisabeth Manhattan Harlem 40.80902
HARLEM....NEW
YORK !
In [301]: #It is used to fill empty values with a constant value in a column or dataframe
# The filled Series is only displayed here; it is not assigned back to dfx1.
dfx1['last_review'].fillna('2018-01-01')
Out[301]: 0 2018-10-19
1 2019-05-21
2 2018-01-01
3 2019-07-05
4 2018-11-19
...
48890 2018-01-01
48891 2018-01-01
48892 2018-01-01
48893 2018-01-01
48894 2018-01-01
Name: last_review, Length: 48879, dtype: object
In [305]: #Replacing null values from reviews_per_month with mean
# Fill the gaps in reviews_per_month with that column's own mean. The result
# is only displayed; it is not assigned back to dfx1.
dfx1['reviews_per_month'].fillna(dfx1['reviews_per_month'].mean())
Out[305]: 0 0.21000
1 0.38000
2 1.37341
3 4.64000
4 0.10000
...
48890 1.37341
48891 1.37341
48892 1.37341
48893 1.37341
48894 1.37341
Name: reviews_per_month, Length: 48879, dtype: float64
Out[302]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude
Clean
&
quiet
apt
0 2539 2787 John Brooklyn Kensington 40.64749 -73.97237
home
by
the
park
Out[303]:
id name host_id host_name neighbourhood_group neighbourhood latitude
Skylit Midtown
1 2595 2845 Jennifer Manhattan Midtown 40.75362
Castle
THE VILLAGE
OF
2 3647 4632 Elisabeth Manhattan Harlem 40.80902
HARLEM....NEW
YORK !
Cozy Entire
3 3831 Floor of 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514
Brownstone