vertopal.com_pandas32
vertopal.com_pandas32
import pandas as pd
import matplotlib.pyplot as plt
var1['abtest'].info()
<class 'pandas.core.series.Series'>
RangeIndex: 50000 entries, 0 to 49999
Series name: abtest
Non-Null Count Dtype
-------------- -----
50000 non-null object
dtypes: object(1)
memory usage: 390.8+ KB
# Strip the currency symbol and the thousands separators from the price
# strings, then convert the result to a numeric dtype.
# regex=False is passed explicitly because "$" is a regex metacharacter and
# the default value of `regex` is not the same across pandas versions; a
# literal match is what is wanted here.
var1['price'] = (
    var1['price']
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
)
var1['price'] = pd.to_numeric(var1['price'])
type(var1['price'][0])
numpy.int64
int(var1['price_in_usd'].mean()) # average
9840
99999999
var1['vehicleType'].value_counts()
vehicleType
limousine 12859
kleinwagen 10822
kombi 9127
bus 4093
cabrio 3061
coupe 2537
suv 1986
andere 420
Name: count, dtype: int64
var1['gearbox'].value_counts()
gearbox
manuell 36993
automatik 10327
Name: count, dtype: int64
# Relabel the two gearbox values in one whole-value replacement; entries that
# match neither key (including missing values) are left as they are.
var1['gearbox'] = var1['gearbox'].replace(
    {"manuell": "manual", "automatik": "automatic"}
)
var1['gearbox'].head()
0 manual
1 automatic
2 manual
3 automatic
4 manual
Name: gearbox, dtype: object
# Rename the 'odometer' column to 'km' in place, then remove the "km" text
# and the comma separators from its string values.
var1.rename(columns={'odometer': 'km'}, inplace=True)
var1['km'] = var1['km'].str.replace("km", "").str.replace(",", "")
var1['km'].head()
0 150000
1 150000
2 70000
3 70000
4 150000
Name: km, dtype: object
# Convert the cleaned 'km' strings to a numeric dtype.
var1['km'] = pd.to_numeric(var1['km'])
var1['km'].dtype
dtype('int64')
var1['fuelType'].value_counts()
fuelType
benzin 30107
diesel 14567
lpg 691
cng 75
hybrid 37
andere 22
elektro 19
Name: count, dtype: int64
var1['gearbox'].head()
0 manual
1 automatic
2 manual
3 automatic
4 manual
Name: gearbox, dtype: object
# Relabel the damage flag: 'nein' -> 'no', 'ja' -> 'yes', then fold any
# upper-case 'NO' to 'no'. The replacements run in this order on the string
# values. The chain is wrapped in parentheses so the assignment is a single
# valid statement (the original text broke each line right after the '=').
var1['notRepairedDamage'] = (
    var1['notRepairedDamage']
    .str.replace('nein', 'no')
    .str.replace('ja', 'yes')
    .str.replace('NO', 'no')
)
var1.head(0)
Empty DataFrame
Columns: [dateCrawled, name, seller, offerType, price_in_usd, abtest,
vehicleType, yearOfRegistration, gearbox, powerPS, model, km,
monthOfRegistration, fuelType, brand, notRepairedDamage, dateCreated,
nrOfPictures, postalCode, lastSeen]
Index: []
var1.head()
dateCrawled
name \
0 2016-03-26 17:47:46
Peugeot_807_160_NAVTECH_ON_BOARD
1 2016-04-04 13:38:56
BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik
2 2016-03-26 18:57:24
Volkswagen_Golf_1.6_United
3 2016-03-12 16:58:10
Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...
4 2016-04-01 14:38:50
Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...
taruncode lastSeen
0 79588 2016-04-06 06:45:54
1 71034 2016-04-06 14:45:08
2 35394 2016-04-06 20:15:37
3 33729 2016-03-15 03:16:28
4 39218 2016-04-01 14:38:50
# Replace the substring "privat" with "Private" in the seller column.
var1['seller'] = var1['seller'].str.replace("privat","Private")
var1['yearOfRegistration'].value_counts()
yearOfRegistration
2000 3354
2005 3015
1999 3000
2004 2737
2003 2727
...
2800 1
1500 1
1953 1
4800 1
1001 1
Name: count, Length: 97, dtype: int64
var1.describe(include="all")
lastSeen
count 50000
unique 39481
top 2016-04-07 06:17:27
freq 8
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
# Remove the two columns that are not carried into the rest of the analysis.
var1 = var1.drop(columns=["No_of_pictures", "notRepairedDamage"])
var1.shape
(50000, 18)
# Count missing values per column, express them as a percentage of the row
# count, and print the percentages (2 decimals) from highest to lowest.
var1_null_sum = var1.isna().sum()
var1_nperc = var1_null_sum.mul(100).div(len(var1))
print(var1_nperc.round(2).sort_values(ascending=False))
fuelType 7.76
vehicleType 6.52
model 5.12
gearbox 5.04
name 0.00
Date_Crawled 0.00
abtest 0.00
price_in_usd 0.00
seller 0.00
offerType 0.00
powerPS 0.00
yearOfRegistration 0.00
km 0.00
monthOfRegistration 0.00
brand 0.00
dateCreated 0.00
taruncode 0.00
lastSeen 0.00
dtype: float64
var1.head()
Date_Crawled
name \
0 2016-03-26 17:47:46
Peugeot_807_160_NAVTECH_ON_BOARD
1 2016-04-04 13:38:56
BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik
2 2016-03-26 18:57:24
Volkswagen_Golf_1.6_United
3 2016-03-12 16:58:10
Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...
4 2016-04-01 14:38:50
Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...
var1['price_in_usd'].describe(include = "all")
count 5.000000e+04
mean 9.840044e+03
std 4.811044e+05
min 0.000000e+00
25% 1.100000e+03
50% 2.950000e+03
75% 7.200000e+03
max 1.000000e+08
Name: price_in_usd, dtype: float64
var1['price_in_usd'].value_counts()
price_in_usd
0 1421
500 781
1500 734
2500 643
1000 639
...
35222 1
34490 1
910 1
3890000 1
48600 1
Name: count, Length: 2357, dtype: int64
# Inter-quartile range and outlier fences.
# NOTE(review): q1 and q3 are not defined anywhere in the visible part of this
# export — presumably the 25% / 75% quantiles of a numeric column computed in
# a cell that was lost; confirm before re-running.
iqr = q3-q1
# The fences sit 10 * IQR beyond the quartiles.
lower_bound = q1-(10*iqr)
upper_bound = q3+(10*iqr)
var1['km'].describe(include="all")
count 49892.000000
mean 125883.107512
std 39889.981761
min 5000.000000
25% 125000.000000
50% 150000.000000
75% 150000.000000
max 150000.000000
Name: km, dtype: float64
var1['km'].value_counts()
km
150000 32409
125000 5164
100000 2164
90000 1755
80000 1434
70000 1230
60000 1155
50000 1019
5000 952
40000 809
30000 774
20000 768
10000 259
Name: count, dtype: int64
var1[['yearOfRegistration','monthOfRegistration']].describe()
yearOfRegistration monthOfRegistration
count 49892.000000 49892.000000
mean 2005.076906 5.723403
std 105.824742 3.713133
min 1000.000000 0.000000
25% 1999.000000 3.000000
50% 2003.000000 6.000000
75% 2008.000000 9.000000
max 9999.000000 12.000000
var1[['Date_Crawled','dateCreated','lastSeen']][0:5]
# Share of rows per crawl day: keep the first 10 characters of each timestamp
# string, count them as proportions (missing values included), sort by day.
print(
    var1['Date_Crawled']
    .str[:10]
    .value_counts(normalize=True, dropna=False)
    .sort_index()
)
Date_Crawled
2016-03-05 0.025375
2016-03-06 0.013950
2016-03-07 0.035958
2016-03-08 0.033252
2016-03-09 0.033232
2016-03-10 0.032169
2016-03-11 0.032550
2016-03-12 0.036799
2016-03-13 0.015594
2016-03-14 0.036639
2016-03-15 0.033993
2016-03-16 0.029484
2016-03-17 0.031508
2016-03-18 0.013028
2016-03-19 0.034956
2016-03-20 0.037842
2016-03-21 0.037461
2016-03-22 0.032871
2016-03-23 0.032410
2016-03-24 0.029103
2016-03-25 0.031749
2016-03-26 0.032490
2016-03-27 0.030987
2016-03-28 0.034835
2016-03-29 0.034194
2016-03-30 0.033613
2016-03-31 0.031929
2016-04-01 0.033753
2016-04-02 0.035416
2016-04-03 0.038643
2016-04-04 0.036499
2016-04-05 0.013108
2016-04-06 0.003187
2016-04-07 0.001423
Name: proportion, dtype: float64
# Share of rows per creation day: keep the first 10 characters of each
# timestamp string, count them as proportions (missing values included),
# sort by day.
print(
    var1['dateCreated']
    .str[:10]
    .value_counts(normalize=True, dropna=False)
    .sort_index()
)
dateCreated
2015-06-11 0.000020
2015-08-10 0.000020
2015-09-09 0.000020
2015-11-10 0.000020
2015-12-05 0.000020
...
2016-04-03 0.038884
2016-04-04 0.036860
2016-04-05 0.011846
2016-04-06 0.003267
2016-04-07 0.001283
Name: proportion, Length: 76, dtype: float64
C:\Users\DELL\AppData\Local\Temp\ipykernel_5072\1530092152.py:1:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
# Share of rows per last-seen day: keep the first 10 characters of each
# timestamp string, count them as proportions (missing values included),
# sort by day.
print(
    var1['lastSeen']
    .str[:10]
    .value_counts(normalize=True, dropna=False)
    .sort_index()
)
lastSeen
2016-03-05 0.001082
2016-03-06 0.004430
2016-03-07 0.005352
2016-03-08 0.007596
2016-03-09 0.009821
2016-03-10 0.010783
2016-03-11 0.012547
2016-03-12 0.023852
2016-03-13 0.008999
2016-03-14 0.012828
2016-03-15 0.015894
2016-03-16 0.016456
2016-03-17 0.027980
2016-03-18 0.007416
2016-03-19 0.015774
2016-03-20 0.020725
2016-03-21 0.020745
2016-03-22 0.021607
2016-03-23 0.018580
2016-03-24 0.019542
2016-03-25 0.019242
2016-03-26 0.016937
2016-03-27 0.015995
2016-03-28 0.020845
2016-03-29 0.022368
2016-03-30 0.024874
2016-03-31 0.023831
2016-04-01 0.023130
2016-04-02 0.024894
2016-04-03 0.025335
2016-04-04 0.024673
2016-04-05 0.124068
2016-04-06 0.220997
2016-04-07 0.130803
Name: proportion, dtype: float64
Date_Crawled
name \
0 2016-03-26 17:47:46
Peugeot_807_160_NAVTECH_ON_BOARD
1 2016-04-04 13:38:56
BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik
2 2016-03-26 18:57:24
Volkswagen_Golf_1.6_United
3 2016-03-12 16:58:10
Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...
4 2016-04-01 14:38:50
Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...
# Parse the three timestamp columns and keep only the calendar date of each
# value (the columns end up holding datetime.date objects).
# NOTE(review): the SettingWithCopyWarning printed around these cells suggests
# var1 is a slice of another frame — presumably from a filter cell that is not
# visible here; confirm and take an explicit .copy() at that point.
for date_col in ('Date_Crawled', 'dateCreated', 'lastSeen'):
    var1[date_col] = pd.to_datetime(var1[date_col]).dt.date
C:\Users\DELL\AppData\Local\Temp\ipykernel_5072\3435252794.py:1:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
print(pd.__version__)
2.2.2
C:\Users\DELL\AppData\Local\Temp\ipykernel_5072\2774122297.py:1:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
Date_Crawled name
seller \
0 2016-03-26 Peugeot 807 160 NAVTECH ON BOARD
Private
1 2016-04-04 BMW 740i 4 4 Liter HAMANN UMBAU Mega Optik
Private
2 2016-03-26 Volkswagen Golf 1.6 United
Private
3 2016-03-12 Smart smart fortwo coupe softouch/F1/Klima/Pan...
Private
4 2016-04-01 Ford Focus 1 6 Benzin TÜV neu ist sehr gepfleg...
Private
var1['Date_Crawled'].dtype
dtype('O')
type(var1['Date_Crawled'][0])
datetime.date
# Replace the substring "gewerblich" with "commercial" in the seller column.
var1['seller'] = var1['seller'].str.replace("gewerblich","commercial")
C:\Users\DELL\AppData\Local\Temp\ipykernel_5072\1842077218.py:1:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
Date_Crawled name
seller \
0 2016-03-26 Peugeot 807 160 NAVTECH ON BOARD
Private
1 2016-04-04 BMW 740i 4 4 Liter HAMANN UMBAU Mega Optik
Private
2 2016-03-26 Volkswagen Golf 1.6 United
Private
3 2016-03-12 Smart smart fortwo coupe softouch/F1/Klima/Pan...
Private
4 2016-04-01 Ford Focus 1 6 Benzin TÜV neu ist sehr gepfleg...
Private
var1['seller'].unique()
# Relabel the offer types: "Angebot" -> "offer", then "Gesuch" -> "request".
var1['offerType'] = (
    var1['offerType']
    .str.replace("Angebot", "offer")
    .str.replace("Gesuch", "request")
)
C:\Users\DELL\AppData\Local\Temp\ipykernel_5072\3117879292.py:1:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
var1.head()
Date_Crawled name
seller \
0 2016-03-26 Peugeot 807 160 NAVTECH ON BOARD
Private
1 2016-04-04 BMW 740i 4 4 Liter HAMANN UMBAU Mega Optik
Private
2 2016-03-26 Volkswagen Golf 1.6 United
Private
3 2016-03-12 Smart smart fortwo coupe softouch/F1/Klima/Pan...
Private
4 2016-04-01 Ford Focus 1 6 Benzin TÜV neu ist sehr gepfleg...
Private
var1['gearbox'].unique()
var1['fuelType'].unique()
# Relabel the fuel types that have a different English name; the
# replacements are applied in the order listed.
var1['fuelType'] = (
    var1['fuelType']
    .str.replace("benzin", "petrol")
    .str.replace("andere", "other")
    .str.replace("elektro", "electric")
)
C:\Users\DELL\AppData\Local\Temp\ipykernel_5072\2091247764.py:1:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
var1['vehicleType'].unique()
# Relabel the vehicle types that have a different English name; the
# replacements are applied in the order listed.
# The original text broke each statement after the '=' and split the
# "station wagon" literal across two lines, neither of which parses; the
# chain below is a single valid statement with the literal restored.
var1['vehicleType'] = (
    var1['vehicleType']
    .str.replace("kleinwagen", "small car")
    .str.replace("cabrio", "convertible")
    .str.replace("kombi", "station wagon")
    .str.replace("andere", "other")
)
C:\Users\DELL\AppData\Local\Temp\ipykernel_5072\541384445.py:1:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
var1.head()
Date_Crawled name
seller \
0 2016-03-26 Peugeot 807 160 NAVTECH ON BOARD
Private
1 2016-04-04 BMW 740i 4 4 Liter HAMANN UMBAU Mega Optik
Private
2 2016-03-26 Volkswagen Golf 1.6 United
Private
3 2016-03-12 Smart smart fortwo coupe softouch/F1/Klima/Pan...
Private
4 2016-04-01 Ford Focus 1 6 Benzin TÜV neu ist sehr gepfleg...
Private