Pandas 2
Pandas 2
[2]: pwd # current working directory ( the folder where code is present )
[2]: 'C:\\Users\\admin\\2802'
origin name
0 1 chevrolet chevelle malibu
1 1 buick skylark 320
2 1 plymouth satellite
3 1 amc rebel sst
4 1 ford torino
[ ]: # these steps only work on a local machine ( not in Google Colab )
[10]: 'C\\Users'
1
1 15.0 8.0 350.0 165 3693 11.5 70
2 NaN 8.0 318.0 150 3436 11.0 70
3 NaN 8.0 NaN 150 3433 12.0 70
4 NaN 8.0 NaN 140 3449 10.5 70
origin name
0 1 chevrolet chevelle malibu
1 1 buick skylark 320
2 1 plymouth satellite
3 1 amc rebel sst
4 1 ford torino
origin name
0 1 chevrolet chevelle malibu
1 1 buick skylark 320
2 1 plymouth satellite
3 1 amc rebel sst
4 1 ford torino
5 1 ford galaxie 500
6 1 chevrolet impala
origin name
392 1 ford mustang gl
393 2 vw pickup
394 1 dodge rampage
395 1 ford ranger
396 1 chevy s-10
2
[7]: df1.tail(2)
origin name
395 1 ford ranger
396 1 chevy s-10
[11]: (397, 9)
[12]: df1.head(2)
origin name
0 1 chevrolet chevelle malibu
1 1 buick skylark 320
[ ]: # Attribute Error
# Case sensitive
# spelling mistake
# the function doesn't exist
[13]: df1.dtypes # returns the data type of each column # numbers : int / float␣
↪, character/string : object
[14]: df1.head()
3
1 15.0 8.0 350.0 165 3693 11.5 70
2 NaN 8.0 318.0 150 3436 11.0 70
3 NaN 8.0 NaN 150 3433 12.0 70
4 NaN 8.0 NaN 140 3449 10.5 70
origin name
0 1 chevrolet chevelle malibu
1 1 buick skylark 320
2 1 plymouth satellite
3 1 amc rebel sst
4 1 ford torino
[16]: df1.isnull().sum()
# returns the number of times True appears in each column, i.e. the count of null values per column
[16]: mpg 5
cylinders 3
displacement 5
Horse Power 0
weight 0
acceleration 0
year 0
origin 0
name 0
dtype: int64
[17]: df1.columns
[18]: df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mpg 392 non-null float64
1 cylinders 394 non-null float64
2 displacement 392 non-null float64
3 Horse Power 397 non-null object
4 weight 397 non-null int64
5 acceleration 397 non-null float64
6 year 397 non-null int64
7 origin 397 non-null int64
4
8 name 397 non-null object
dtypes: float64(4), int64(3), object(2)
memory usage: 28.0+ KB
year origin
count 397.000000 397.000000
mean 75.994962 1.574307
std 3.690005 0.802549
min 70.000000 1.000000
25% 73.000000 1.000000
50% 76.000000 1.000000
75% 79.000000 2.000000
max 82.000000 3.000000
[20]: df1['mpg']
[20]: 0 18.0
1 15.0
2 NaN
3 NaN
4 NaN
…
392 27.0
393 44.0
394 32.0
395 28.0
396 31.0
Name: mpg, Length: 397, dtype: float64
5
3 NaN 3433 12.0
4 NaN 3449 10.5
.. … … …
392 27.0 2790 15.6
393 44.0 2130 24.6
394 32.0 2295 11.6
395 28.0 2625 18.6
396 31.0 2720 19.4
[ ]: # min,max
[24]: df1.min()
[25]: df1['mpg'].min()
[25]: 9.0
[23]: df1.describe()
6
25% 17.500000 4.000000 100.750000 2223.000000 13.800000
50% 23.000000 4.000000 145.500000 2800.000000 15.500000
75% 29.000000 8.000000 260.000000 3609.000000 17.100000
max 46.600000 8.000000 455.000000 5140.000000 24.800000
year origin
count 397.000000 397.000000
mean 75.994962 1.574307
std 3.690005 0.802549
min 70.000000 1.000000
25% 73.000000 1.000000
50% 76.000000 1.000000
75% 79.000000 2.000000
max 82.000000 3.000000
[ ]:
origin name
0 1 chevrolet chevelle malibu
1 1 buick skylark 320
[ ]: # to create a new column in which every row holds the value 2
[27]: df1['col1'] = 2
[30]: df1.head(2)
[32]: df1.head(2)
7
[32]: mpg cylinders displacement Horse Power weight acceleration year \
0 18.0 8.0 307.0 130 3504 12.0 70
1 15.0 8.0 350.0 165 3693 11.5 70
[ ]: # describe results will change when you perform any operation on that column (␣
↪adding new rows / deleting rows )
[37]: df1.head(5)
[33]: 1+2+3+4+5/5
[33]: 11.0
[34]: 1+2+3+4/4
[34]: 7.0
[ ]:
Data Cleaning
[ ]: # soft conversion
# int --> float , float --> int
[38]: df1.head(2)
8
[38]: mpg cylinders displacement Horse Power weight acceleration year \
0 18.0 8.0 307.0 130 3504 12.0 70
1 15.0 8.0 350.0 165 3693 11.5 70
[41]: df1.head()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[42], line 1
----> 1 df1['Horse Power'].astype(int)
6637 results = [
6638 ser.astype(dtype, copy=copy, errors=errors) for _, ser in self.
↪items()
6639 ]
6641 else:
6642 # else, only a single dtype is given
-> 6643 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
6644 res = self._constructor_from_mgr(new_data, axes=new_data.axes)
6645 return res.__finalize__(self, method="astype")
9
427 elif using_copy_on_write():
428 copy = False
--> 430 return self.apply(
431 "astype",
432 dtype=dtype,
433 copy=copy,
434 errors=errors,
435 using_cow=using_copy_on_write(),
436 )
755 raise ValueError("Can not squeeze with more than one column.")
756 values = values[0, :] # type: ignore[call-overload]
--> 758 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
760 new_values = maybe_coerce_values(new_values)
762 refs = None
10
132 # Explicit copy, or required since NumPy can't view from / to object.
--> 133 return arr.astype(dtype, copy=True)
135 return arr.astype(dtype, copy=copy)
[43]: int('?')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[43], line 1
----> 1 int('?')
[44]: int('a')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[44], line 1
----> 1 int('a')
[46]: 0
[51]: df1.head(2)
[52]: int('130')
[52]: 130
11
[ ]: # int('?') --> converting these strings into null
[49]: 5
[50]: dtype('float64')
[ ]:
[53]: df1.isnull().sum()
[53]: mpg 5
cylinders 3
displacement 5
Horse Power 5
weight 0
acceleration 0
year 0
origin 0
name 0
col1 0
col2 0
col3 0
dtype: int64
[ ]: # 397 rows
[54]: df1.dropna()
12
393 44.0 4.0 97.0 52.0 2130 24.6 82.0
394 32.0 4.0 135.0 84.0 2295 11.6 82.0
395 28.0 4.0 120.0 79.0 2625 18.6 82.0
396 31.0 4.0 119.0 82.0 2720 19.4 82.0
[55]: df1['mpg'].mean() # ideal way of dealing with null values is to replace them␣
↪with the average
[55]: 23.611734693877548
[57]: df1.isnull().sum()
[57]: mpg 0
cylinders 3
displacement 5
Horse Power 5
weight 0
acceleration 0
year 0
origin 0
name 0
col1 0
col2 0
col3 0
dtype: int64
[ ]:
13
[58]: df1.rename({'Horse Power': 'horse_power'}, axis = 1,inplace = True)
# axis = 1--> columns --> look for Horse Power in column name
# inplace = True --> commit your changes --> make the change permanent
[59]: df1.head()
[61]: df1.head()
origin name
0 1 chevrolet chevelle malibu
1 1 buick skylark 320
2 1 plymouth satellite
3 1 amc rebel sst
4 1 ford torino
[ ]:
[62]: df1.head(10)
14
2 23.61 8.0 318.0 150.0 3436 11.0 70.0
3 23.61 8.0 NaN 150.0 3433 12.0 70.0
4 23.61 8.0 NaN 140.0 3449 10.5 70.0
5 23.61 8.0 NaN 198.0 4341 10.0 70.0
6 23.61 8.0 NaN 220.0 4354 9.0 70.0
7 14.00 8.0 NaN 215.0 4312 8.5 70.0
8 14.00 8.0 455.0 225.0 4425 10.0 70.0
9 15.00 8.0 390.0 190.0 3850 8.5 70.0
origin name
0 1 chevrolet chevelle malibu
1 1 buick skylark 320
2 1 plymouth satellite
3 1 amc rebel sst
4 1 ford torino
5 1 ford galaxie 500
6 1 chevrolet impala
7 1 plymouth fury iii
8 1 pontiac catalina
9 1 amc ambassador dpl
[66]: df1.head(5)
origin name
0 1 chevrolet chevelle malibu
15
1 1 buick skylark 320
2 1 plymouth satellite
3 1 amc rebel sst
4 1 ford torino
[68]: df1.loc[1:4,'mpg']
[68]: 1 15.00
2 23.61
3 23.61
4 23.61
Name: mpg, dtype: float64
[69]: df1.loc[1:4,['mpg','weight']]
[ ]:
[71]: sqr_num(4)
[71]: 16
[72]: df1.head(2)
16
origin name
0 1 chevrolet chevelle malibu
1 1 buick skylark 320
[76]: 0
df1.head(4)
17
3 1 amc rebel sst 144.00 144.00 Even
[81]: df1.head(5)
18