Pandas Notes: """ Useful Data Analysis Tool """
July 2, 2017
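These notes assume the usual imports, which are not shown in the transcript:
import numpy as np
import pandas as pd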
In [145]: s = pd.Series([1,3,5,np.nan,6,8])
s
Out[145]: 0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
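The input cell for Out[148] is missing; given the index values below and the later use of the name dates, it was presumably:
dates = pd.date_range('20130101', periods=6)
dates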
Out[148]: DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
Initialization
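The construction of df is not shown; judging from the random values, the dates index, and the A-D columns, it was presumably:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df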
Out[149]: A B C D
2013-01-01 -0.376828 0.904175 -0.716216 1.232942
2013-01-02 -0.280776 1.233227 1.878106 0.010433
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
2013-01-04 -0.832713 0.094939 -1.514394 0.024210
2013-01-05 -2.173683 0.121913 1.157189 0.115356
2013-01-06 1.330829 0.845027 1.826736 1.175270
Pass in a dict
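The dict-based construction itself is missing; matching the values in Out[150] and the dtypes in Out[151], it was presumably:
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(['test', 'train', 'test', 'train']),
                    'F': 'foo'})
df2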
Out[150]: A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
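Out[151] lists the per-column dtypes, presumably from:
df2.dtypes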
Out[151]: A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
View Data
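Out[152] shows the first three rows, presumably from:
df.head(3)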
Out[152]: A B C D
2013-01-01 -0.376828 0.904175 -0.716216 1.232942
2013-01-02 -0.280776 1.233227 1.878106 0.010433
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
In [153]: df.tail(4)
Out[153]: A B C D
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
2013-01-04 -0.832713 0.094939 -1.514394 0.024210
2013-01-05 -2.173683 0.121913 1.157189 0.115356
2013-01-06 1.330829 0.845027 1.826736 1.175270
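Out[157] is the summary-statistics table, presumably produced by:
df.describe()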
Out[157]: A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -0.598145 0.392122 0.089488 0.461638
std 1.169646 0.757682 1.752058 0.579919
min -2.173683 -0.846547 -2.094493 0.010433
25% -1.149952 0.101682 -1.314849 0.046997
50% -0.604770 0.483470 0.220487 0.163485
75% -0.304789 0.889388 1.659349 0.934356
max 1.330829 1.233227 1.878106 1.232942
In [159]: df.sort_index(axis=0, ascending=False) # axis=0 sorts by the row index, axis=1 by the column names
Out[159]: A B C D
2013-01-06 1.330829 0.845027 1.826736 1.175270
2013-01-05 -2.173683 0.121913 1.157189 0.115356
2013-01-04 -0.832713 0.094939 -1.514394 0.024210
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
2013-01-02 -0.280776 1.233227 1.878106 0.010433
2013-01-01 -0.376828 0.904175 -0.716216 1.232942
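Out[160] is sorted by the values in column B (ascending), presumably via:
df.sort_values(by='B')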
Out[160]: A B C D
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
2013-01-04 -0.832713 0.094939 -1.514394 0.024210
2013-01-05 -2.173683 0.121913 1.157189 0.115356
2013-01-06 1.330829 0.845027 1.826736 1.175270
2013-01-01 -0.376828 0.904175 -0.716216 1.232942
2013-01-02 -0.280776 1.233227 1.878106 0.010433
Selection
In [162]: # select a single column as a Series (also written df.A)
df["A"]
In [163]: # same as
df.get("A")
Out[163]: 2013-01-01 -0.376828
2013-01-02 -0.280776
2013-01-03 -1.255698
2013-01-04 -0.832713
2013-01-05 -2.173683
2013-01-06 1.330829
Freq: D, Name: A, dtype: float64
In [164]: df[0:3]
Out[164]: A B C D
2013-01-01 -0.376828 0.904175 -0.716216 1.232942
2013-01-02 -0.280776 1.233227 1.878106 0.010433
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
In [165]: # slice rows with index labels; both endpoints are included
df['20130102':'20130104']
Out[165]: A B C D
2013-01-02 -0.280776 1.233227 1.878106 0.010433
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
2013-01-04 -0.832713 0.094939 -1.514394 0.024210
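Out[166] selects a single row by label, presumably:
df.loc[dates[0]]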
Out[166]: A -0.376828
B 0.904175
C -0.716216
D 1.232942
Name: 2013-01-01 00:00:00, dtype: float64
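Out[167] selects all rows for two columns, presumably:
df.loc[:, ['A', 'B']]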
Out[167]: A B
2013-01-01 -0.376828 0.904175
2013-01-02 -0.280776 1.233227
2013-01-03 -1.255698 -0.846547
2013-01-04 -0.832713 0.094939
2013-01-05 -2.173683 0.121913
2013-01-06 1.330829 0.845027
In [168]: df.loc["20130101":'20130103',["C","B"]]
Out[168]: C B
2013-01-01 -0.716216 0.904175
2013-01-02 1.878106 1.233227
2013-01-03 -2.094493 -0.846547
In [169]: # same as
df.loc[dates[:-3],["C","B"]]
Out[169]: C B
2013-01-01 -0.716216 0.904175
2013-01-02 1.878106 1.233227
2013-01-03 -2.094493 -0.846547
In [170]: df.loc['20130103',["C","B"]] # selecting a single row reduces the dimension: returns a Series
Out[170]: C -2.094493
B -0.846547
Name: 2013-01-03 00:00:00, dtype: float64
In [171]: df.loc[dates[0],'A']
Out[171]: -0.37682797203913287
In [172]: # same as
df.at[dates[0],'A']
Out[172]: -0.37682797203913287
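Out[173] selects a row by integer position, presumably:
df.iloc[3]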
Out[173]: A -0.832713
B 0.094939
C -1.514394
D 0.024210
Name: 2013-01-04 00:00:00, dtype: float64
In [174]: df.iloc[3:5,0:2]
Out[174]: A B
2013-01-04 -0.832713 0.094939
2013-01-05 -2.173683 0.121913
In [175]: df.iloc[1,1]
Out[175]: 1.2332273329710621
In [176]: # same as
df.iat[1,1]
Out[176]: 1.2332273329710621
Boolean Indexing
In [177]: # similar to R: df.A > 0 gives a boolean Series, used here to filter rows
df[df.A > 0]
Out[177]: A B C D
2013-01-06 1.330829 0.845027 1.826736 1.17527
In [178]: df[df>0] # the shape is kept; entries that fail the condition become NaN
Out[178]: A B C D
2013-01-01 NaN 0.904175 NaN 1.232942
2013-01-02 NaN 1.233227 1.878106 0.010433
2013-01-03 NaN NaN NaN 0.211614
2013-01-04 NaN 0.094939 NaN 0.024210
2013-01-05 NaN 0.121913 1.157189 0.115356
2013-01-06 1.330829 0.845027 1.826736 1.175270
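Out[180] shows a copy of df with a new text column E; the missing input was presumably:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2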
Out[180]: A B C D E
2013-01-01 -0.376828 0.904175 -0.716216 1.232942 one
2013-01-02 -0.280776 1.233227 1.878106 0.010433 one
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614 two
2013-01-04 -0.832713 0.094939 -1.514394 0.024210 three
2013-01-05 -2.173683 0.121913 1.157189 0.115356 four
2013-01-06 1.330829 0.845027 1.826736 1.175270 three
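Out[181] keeps only the rows whose E value is 'two' or 'four', presumably via isin:
df2[df2['E'].isin(['two', 'four'])]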
Out[181]: A B C D E
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614 two
2013-01-05 -2.173683 0.121913 1.157189 0.115356 four
Setting
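Out[182] is a new Series whose index starts one day later than df's, presumably:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
s1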
Out[182]: 2013-01-02 1
2013-01-03 2
2013-01-04 3
2013-01-05 4
2013-01-06 5
2013-01-07 6
Freq: D, dtype: int64
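Out[183] shows s1 assigned as a new column F; assignment aligns on the index, so 2013-01-01 gets NaN and 2013-01-07 is dropped. The input was presumably:
df['F'] = s1
df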
Out[183]: A B C D F
2013-01-01 -0.376828 0.904175 -0.716216 1.232942 NaN
2013-01-02 -0.280776 1.233227 1.878106 0.010433 1.0
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614 2.0
2013-01-04 -0.832713 0.094939 -1.514394 0.024210 3.0
2013-01-05 -2.173683 0.121913 1.157189 0.115356 4.0
2013-01-06 1.330829 0.845027 1.826736 1.175270 5.0
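In Out[185] the first value of column A has been set to 0, presumably by label:
df.at[dates[0], 'A'] = 0  # equivalently: df.loc[dates[0], 'A'] = 0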
Out[185]: A B C D F
2013-01-01 0.000000 0.904175 -0.716216 1.232942 NaN
2013-01-02 -0.280776 1.233227 1.878106 0.010433 1.0
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614 2.0
2013-01-04 -0.832713 0.094939 -1.514394 0.024210 3.0
2013-01-05 -2.173683 0.121913 1.157189 0.115356 4.0
2013-01-06 1.330829 0.845027 1.826736 1.175270 5.0
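The prompt numbers drop to Out[104] here, so the remaining outputs apparently come from a re-run of the notebook, which is why the random values differ. In Out[104] the first entries of both A and B are 0, presumably set by label and by position:
df.at[dates[0], 'A'] = 0
df.iat[0, 1] = 0
df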
Out[104]: A B C D F
2013-01-01 0.000000 0.000000 0.411997 -0.591356 NaN
2013-01-02 0.353742 0.271770 0.489375 -2.474150 1.0
2013-01-03 -0.438979 -1.275026 1.044026 -0.356895 2.0
2013-01-04 -1.811731 0.719650 0.080504 -0.242658 3.0
2013-01-05 -0.239879 0.374453 -0.116906 -0.493383 4.0
2013-01-06 0.139179 1.299168 -1.235911 0.998853 5.0
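In Out[107] the whole D column has been overwritten with 5, presumably by assigning a NumPy array:
df.loc[:, 'D'] = np.array([5] * len(df))
df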
Out[107]: A B C D F
2013-01-01 0.000000 0.000000 0.411997 5 NaN
2013-01-02 0.353742 0.271770 0.489375 5 1.0
2013-01-03 -0.438979 -1.275026 1.044026 5 2.0
2013-01-04 -1.811731 0.719650 0.080504 5 3.0
2013-01-05 -0.239879 0.374453 -0.116906 5 4.0
2013-01-06 0.139179 1.299168 -1.235911 5 5.0
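Out[108] shows a copy in which every positive entry has been flipped negative, presumably via a boolean mask assignment:
df2 = df.copy()
df2[df2 > 0] = -df2
df2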
Out[108]: A B C D F
2013-01-01 0.000000 0.000000 -0.411997 -5 NaN
2013-01-02 -0.353742 -0.271770 -0.489375 -5 -1.0
2013-01-03 -0.438979 -1.275026 -1.044026 -5 -2.0
2013-01-04 -1.811731 -0.719650 -0.080504 -5 -3.0
2013-01-05 -0.239879 -0.374453 -0.116906 -5 -4.0
2013-01-06 -0.139179 -1.299168 -1.235911 -5 -5.0
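Out[113] is a reindexed copy with only the first four dates and an extra, empty column E, presumably:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1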
Out[113]: A B C D F E
2013-01-01 0.000000 0.000000 0.411997 5 NaN NaN
2013-01-02 0.353742 0.271770 0.489375 5 1.0 NaN
2013-01-03 -0.438979 -1.275026 1.044026 5 2.0 NaN
2013-01-04 -1.811731 0.719650 0.080504 5 3.0 NaN
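In Out[115] the first two entries of E have been set to 1, presumably by label slicing:
df1.loc[dates[0]:dates[1], 'E'] = 1
df1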
Out[115]: A B C D F E
2013-01-01 0.000000 0.000000 0.411997 5 NaN 1.0
2013-01-02 0.353742 0.271770 0.489375 5 1.0 1.0
2013-01-03 -0.438979 -1.275026 1.044026 5 2.0 NaN
2013-01-04 -1.811731 0.719650 0.080504 5 3.0 NaN
Out[118]: A B C D F E
2013-01-02 0.353742 0.27177 0.489375 5 1.0 1.0
In [120]: # fill na
df1.fillna(value=5) # fill na's w/ 5
Out[120]: A B C D F E
2013-01-01 0.000000 0.000000 0.411997 5 5.0 1.0
2013-01-02 0.353742 0.271770 0.489375 5 1.0 1.0
2013-01-03 -0.438979 -1.275026 1.044026 5 2.0 5.0
2013-01-04 -1.811731 0.719650 0.080504 5 3.0 5.0
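Out[121] is a boolean mask marking the missing entries, presumably:
pd.isnull(df1)  # same as df1.isnull()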
Out[121]: A B C D F E
2013-01-01 False False False False True False
2013-01-02 False False False False False False
2013-01-03 False False False False False True
2013-01-04 False False False False False True
0.0.3 Operations
Stats
In [122]: df.mean() # mean of each column
Out[122]: A -0.332945
B 0.231669
C 0.112181
D 5.000000
F 3.000000
dtype: float64
In [123]: df.mean(0) # same
Out[123]: A -0.332945
B 0.231669
C 0.112181
D 5.000000
F 3.000000
dtype: float64
In [125]: df.mean(1) # mean of each row
Out[125]: 2013-01-01 1.352999
2013-01-02 1.422977
2013-01-03 1.266004
2013-01-04 1.397685
2013-01-05 1.803534
2013-01-06 2.040487
Freq: D, dtype: float64
Apply
In [126]: df.apply(np.cumsum) # cumulative sum down each column
Out[126]: A B C D F
2013-01-01 0.000000 0.000000 0.411997 5 NaN
2013-01-02 0.353742 0.271770 0.901372 10 1.0
2013-01-03 -0.085237 -1.003256 1.945398 15 3.0
2013-01-04 -1.896968 -0.283605 2.025902 20 6.0
2013-01-05 -2.136847 0.090848 1.908996 25 10.0
2013-01-06 -1.997667 1.390015 0.673085 30 15.0
In [132]: df.apply(lambda x: x.max() - x.min(), axis=0) # axis=0 applies to each column, axis=1 to each row
# a lambda is an anonymous function, like R's function(x) { x.max() - x.min() }; here x is each column
Out[132]: A 2.165474
B 2.574193
C 2.279937
D 0.000000
F 4.000000
dtype: float64