# BDDA Notes
# NOTE: these snippets were recovered from a multi-column page whose columns
# had been interleaved line by line; each snippet is now contiguous again.

# 9.0 Filtering with regular expressions
# 9.1 Filter with regular expressions:
airports_df.select('name'). \
    where(" name rlike 'pal$' "). \
    show(3, truncate=False)

# 9.2 where is an alias for filter
airports_df.select('name'). \
    filter(" name rlike 'pal$' "). \
    show(3, truncate=False)

# 9.3
airports_df.filter(" name rlike 'pal$' "). \
    show(3, truncate=False)

## Use of Column.API
######################
# 8.7 Use of isin() function.
# It is difficult to use isin() within
# a string-condition because of list-object
# Syntax: Column.isin(*cols)
airports_df.select("name"). \
    where(airports_df.name.isin(['Lansdowne Airport',
                                 'Randall Airport']))

# 8.8 Use of %like%
# Syntax: Column.like(other)
# other: SQL like expression
airports_df.select(airports_df.columns[:2]). \
    where("name like '%La%'"). \
    show(3)

# 8.9 Note like() function
airports_df.select(airports_df.columns[:2]). \
    where(airports_df.name.like('%La%')). \
    show(3)

# 10. Combining verbs: select, filter and distinct
airports_df.select('dst', 'tz'). \
    filter(airports_df.tz == -5). \
    show(3)

# 10.1
airports_df.select('dst', 'tz'). \
    filter(airports_df.tz == -5). \
    distinct(). \
    show(3)

# 10.3
airports_df.join(
    weather_df,
    airports_df.faa == weather_df.origin,
    how='left'
).count()  # Could also use 'left_outer', 'right', 'full'

# 10.4
weather_df.join(
    airports_df,
    airports_df.faa == weather_df.origin,
    how='left'
).count()  # Could also use 'left_outer', 'right', 'full'

# 11 Filtering logical operators: &, '|', ==
# 11.1 We can filter our data based on multiple conditions (AND or OR)
# Logical Operators: & == and, | == or, ~ == not
# Python's `&` binds tighter than `|`, so this reads as
# (tz AND dst) OR name-like; each comparison needs its own parentheses.
airports_df.filter(
    (airports_df.tz == -5) &
    (airports_df.dst == "A") |
    (airports_df.name.like('%Lans%'))
).show(3)

# 11.2 Conditions within strings:
# Adjacent string literals are concatenated into one SQL condition.
airports_df.filter(
    "(tz == -5) AND "
    "(dst== 'A') OR "
    "(name like '%Lans%' )"
).show(3)

# 12. groupby. Can apply sum, min, max, count
airports_df.groupby('tz'). \
    count(). \
    show(3)
# 12.1 Mean of `lat` for each distinct `tz` value; display the first 3 rows.
(
    airports_df
    .groupby('tz')
    .agg({'lat': 'mean'})
    .show(3)
)
# 12.2 Same aggregation, grouped on the (`tz`, `dst`) pair instead of `tz` alone.
(
    airports_df
    .groupby(['tz', 'dst'])
    .agg({'lat': 'mean'})
    .show(3)
)
# 10. Joins
# 10.1
# NOTE(review): the opening `airports_df.join(` / `weather_df,` lines were
# missing from the notes; restored by analogy with 10.2 below — confirm.
airports_df.join(
    weather_df,
    airports_df.faa == weather_df.origin  # Join on
).show(3)

# 10.2
# NOTE(review): the closing of this call was missing from the notes;
# `.show(3)` is assumed to mirror 10.1 — confirm.
airports_df.join(
    weather_df,
    airports_df.faa == weather_df.origin,
    how='inner'
).show(3)
# 9.0 Getting mode of a feature--step-by-step
# For every column of `data`, count the rows where the value is NaN or null
# and show the counts as a single row, one output column per input column.
# (count/when/isnan/col are presumably pyspark.sql.functions — verify imports.)
data.select(
    [
        count(
            when(isnan(column_name) | col(column_name).isNull(), column_name)
        ).alias(column_name)
        for column_name in data.columns
    ]
).show()
# 7.1 Use the function:
null_values(df)
# 7.2 Use where filter
# Count the rows whose `income` value is null.
df.select('*').where(df.income.isNull()).count()