PySpark coding questions from the StrataScratch platform
Easy
1. Salaries Differences
# Window over each bike's trips, latest end_time first — presumably to pick
# each bike's most recent ride; the rest of this solution isn't visible here.
win = Window.partitionBy('bike_number').orderBy(desc('end_time'))
# To validate the solution, convert the final pySpark df to a pandas df.
dc_bikeshare_q1_2012.toPandas()
5. Count the number of movies for which Abigail Breslin was nominated for an Oscar
# Import your libraries
from pyspark.sql.functions import col, count,lower,countDistinct
7. Popularity of Hack
# Import your libraries
import pyspark
11. Find libraries that haven't provided an email address in circulation year 2016 but whose notice preference definition is set to email
18. Find the most profitable company in the financial sector of the entire world, along with its continent
# Import your libraries
from pyspark.sql.functions import *
# Top-5 products by total revenue. dense_rank() lets revenue ties share a
# rank, so rnk < 6 returns every product in the top five ranks.
from pyspark.sql.window import Window  # only pyspark.sql.functions was imported for this snippet

# Total revenue per product.
online_orders = online_orders.groupBy('product_id').agg(sum('Revenue').alias('Total'))
# Rank by total revenue, highest first, then keep the top five ranks.
win_spec = Window.orderBy(desc('Total'))
online_orders = (
    online_orders
    .withColumn('rnk', dense_rank().over(win_spec))
    .filter(col('rnk') < 6)
    .drop('rnk')
)
# To validate your solution, convert your final pySpark df to a pandas df
online_orders.toPandas()
22. Number of Shipments Per Month
# Import your libraries
from pyspark.sql.functions import *
Medium
# Per-user window ordered by creation time — presumably for row_number/lag
# over each user's events; the statements that use it are not shown here.
wins = Window.partitionBy('user_id').orderBy('created_at')
4. New Products
# Import your libraries
from pyspark.sql.functions import *
# Rank rows within each state by fraud_score, highest first — TODO confirm
# usage; the consuming statements are outside this visible chunk.
wins = Window.partitionBy('state').orderBy(desc('fraud_score'))
# Unordered per-state window, typically for state-level aggregates via .over().
w2 = Window.partitionBy('state')
# For each user, the share of all users they are friends with, as a percent.
# (Rejoined a statement the extraction split across two lines; dropped an
# unused Window and a no-op intermediate select the original carried.)
# union() is positional, so stacking (user1, user2) with (user2, user1) makes
# every friendship appear in both directions; toDF renames the pair.
df_u = df.select('user1', 'user2').union(df.select('user2', 'user1')).toDF('user', 'frnd')
# Denominator: distinct users appearing anywhere in the edge list.
total = df_u.select('user').distinct().count()
# Distinct friends per user, then friend count as a % of all users (3 dp).
df2 = df_u.groupBy('user').agg(countDistinct('frnd').alias('frnd'))
df2 = df2.withColumn('pct', round((100 * col('frnd') / total), 3)).sort('user').drop('frnd')
# Rank guests by total messages sent, highest first; dense_rank so ties share
# a rank. Any filtering on 'rnk' happens outside this visible chunk.
wins = Window.orderBy(desc('n_messages'))
# Start writing code
df = airbnb_contacts.groupBy('id_guest').agg(sum('n_messages').alias('n_messages'))
df = df.withColumn('rnk', dense_rank().over(wins))
9. Spam Posts
# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# Share of viewed posts per day whose keywords contain 'spam'.
# (Rejoined three statements split mid-line by the extraction; removed an
# unused Window definition from the original draft.)
# Start writing code
# Inner join keeps only posts that were actually viewed.
facebook_posts = facebook_posts.join(facebook_post_views, ['post_id'], 'inner')
# Flag spam posts: 1 when 'spam' appears in post_keywords, else 0.
facebook_posts = facebook_posts.withColumn(
    'spam', when(col('post_keywords').contains('spam'), 1).otherwise(0))
# Per day: number of spam posts and total posts.
facebook_posts = facebook_posts.groupBy('post_date').agg(
    sum('spam').alias('spam_cnt'), count('spam').alias('total'))
# Spam share as a percentage of that day's posts.
facebook_posts = facebook_posts.withColumn(
    'spam_share', 100 * col('spam_cnt') / col('total')).select('post_date', 'spam_share')
# Percentage of orders that have a shipping address; `tot` (presumably the
# total order count) is defined earlier, outside this chunk — TODO confirm.
o2 = orders.filter(col('address').isNotNull()).count()
res = 100*o2/tot
# Bare expression — in a notebook this displays the computed percentage.
res
# To validate your solution, convert your final pySpark df to a pandas df
# orders.toPandas()
# Date(s) with the highest total consumption: sum per date, then keep rows
# whose total ranks first (dense_rank returns every date tied for the max).
df = df.groupBy('date').agg(sum('consumption').alias('consumption'))
wins = Window.orderBy(desc('consumption'))
df = df.withColumn('rnk', dense_rank().over(wins)).filter(col('rnk')==1).drop('rnk')
15. Find all wineries which produce wines with aromas of plum, cherry, rose, or hazelnut
# Import your libraries
from pyspark.sql.functions import *
# Keep only top-ranked rows; 'rnk=1' is a SQL-expression filter, equivalent to
# col('rnk')==1. The `wins` window it ranks over is defined in context not
# shown in this chunk — TODO confirm which window was intended.
df = df.withColumn('rnk',dense_rank().over(wins)).filter('rnk=1').drop('rnk')
# Employee(s) of manager 13 with the highest target; rank() keeps all ties.
by_target_desc = Window.orderBy(desc('target'))
# Start writing code
salesforce_employees = (
    salesforce_employees
    .where(col('manager_id') == 13)
    .withColumn('rnk', rank().over(by_target_desc))
    .where(col('rnk') == 1)
    .select('first_name', 'target')
)
# To validate your solution, convert your final pySpark df to a pandas df
salesforce_employees.toPandas()
# Distinct (business_name, Business_type) pairs; the mixed-case column name
# presumably matches the source table's casing — TODO confirm.
df = df.select('business_name','Business_type').distinct()
# Students whose sat_writing equals the value in row 68 of the ascending
# score order (68 is dataset-specific — presumably the middle row; TODO
# confirm against the problem statement).
win = Window.orderBy('sat_writing')
# row_number() yields integers, so compare against the int 68, not the
# string '68' (the original relied on Spark's implicit string cast).
pivot_score = (sat_scores
               .withColumn('rn', row_number().over(win))
               .filter(col('rn') == 68)
               .select('sat_writing')
               .first()[0])
# Start writing code
sat_scores = sat_scores.filter(col('sat_writing').isin(pivot_score)).select(col('student_id'))
print(sat_scores.count())
# To validate your solution, convert your final pySpark df to a pandas df
sat_scores.toPandas()
# Distinct user count per video; `df2` is built earlier, outside this chunk.
df2 = df2.groupBy('video_id').agg(countDistinct('name').alias('n_users'))
# Bare expression — in a notebook this just displays the user_flags dataframe.
user_flags
# Winning candidate when each voter's vote is split equally across all the
# ballots they cast. (Rejoined a statement the extraction broke across
# three lines into valid Python.)
# Number each voter's rows; ordering by the partition key itself is arbitrary
# but harmless — only the per-voter row count (max rnk) is used below.
win = Window.partitionBy('voter').orderBy('voter')
# Start writing code
voting_results = voting_results.filter(col('candidate').isNotNull())
df = voting_results.withColumn('rnk', row_number().over(win)).select('voter', 'rnk')
# Each voter's vote weight = 1 / (number of ballots they cast).
df = df.groupBy('voter').agg(max('rnk').alias('n_vote')).withColumn('votes', 1.0*1/col('n_vote'))
# Total weighted votes per candidate, highest first.
df2 = (df.join(voting_results, ['voter'], 'right')
         .groupBy('candidate')
         .agg(sum('votes').alias('total_votes'))
         .orderBy(desc('total_votes')))
# Keep the top-ranked candidate ('rnk<2' is a SQL-string filter, i.e. rnk == 1).
win2 = Window.orderBy(desc('total_votes'))
df2 = df2.withColumn('rnk', dense_rank().over(win2)).filter('rnk<2').select('candidate')
# Month-over-month revenue change as a percentage, rounded to 2 dp.
# (Rejoined a statement the extraction had split across two lines.)
wins = Window.orderBy('month_col')
# Previous month's revenue; NULL for the first month, making its pct NULL too.
df = df.withColumn('prev_rev', lag('revenue', 1).over(wins))
df = df.withColumn(
    'pct', round((100 * (col('revenue') - col('prev_rev')) / col('prev_rev')), 2)
).drop('revenue', 'prev_rev')
2. Premium vs Freemium
# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# Pivot downloads into per-date columns for paying ('yes') vs free ('no')
# customers, then keep dates where free downloads exceed paid downloads.
df = df.groupBy('date').pivot('paying_customer').sum('downloads')
df = df.filter(col('no')>col('yes'))
# Price statistics (min/avg/max) per host-review-popularity bucket.
# (The extraction split two statements mid-line, leaving half of a
# commented-out line uncommented; rejoined and restored here.)
df2 = df.groupBy('host_review_pop').agg(
    min('price').alias('min_price'),
    avg('price').alias('avg_price'),
    max('price').alias('max_price'))
# Leftover debugging line from the draft, kept commented out:
# df = df.filter(col('review_scores_rating').between(6,15)).select('review_scores_rating').distinct()
# To validate your solution, convert your final pySpark df to a pandas df
df2.toPandas()
5. Retention Rate
6. The Most Popular Client_Id Among Users Using Video and Voice Calls
# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# Most popular client among "call-heavy" users: a user qualifies when video /
# voice call events make up at least half of everything they logged.
call_types = ['video call received', 'video call sent', 'voice call received', 'voice call sent']
# Start writing code
# 1 when the event is a video/voice call, else 0.
flagged = fact_events.withColumn(
    'flag', when(col('event_type').isin(call_types), 1).otherwise(0))
# Per user: total events (cnt_us) and call events (sum_us); keep users at >= 50% calls.
per_user = flagged.groupBy('user_id').agg(
    count('flag').alias('cnt_us'), sum('flag').alias('sum_us'))
per_user = per_user.withColumn(
    'pct', 100 * (1.0 * col('sum_us') / col('cnt_us'))).filter('pct>=50')
# Count events per client, restricted to the qualifying users.
by_client = fact_events.join(per_user, ['user_id'], 'inner').select(per_user['*'], fact_events.client_id)
by_client = by_client.groupBy('client_id').agg(count('*').alias('cnt'))
# Client(s) tied for the highest event count.
top_cnt = by_client.select(max('cnt')).collect()
finaldf = by_client.filter(col('cnt') == top_cnt[0][0]).select('client_id')
THANK YOU