Data Engineering - Solutions
Data Engineering - Solutions
PROBLEM 1 - SOLUTION
// Problem 1: replace nulls with "NIL" in the columns named by col_names, then
// write the result as Snappy-compressed Avro, replacing any existing output.
df_q1.na.
  fill("NIL", col_names).
  write.
  format("avro").
  option("compression", "snappy").
  mode("overwrite").
  save("/user/vb_student/problems/section07/problem01/")
// Spot-check: print three untruncated rows, then the row count.
// NOTE(review): check_1 is not defined in this excerpt - presumably the saved
// output read back into a DataFrame; confirm.
check_1.show(3, truncate = false)
check_1.count()
// Problem 2: keep rows whose nbr_of_patients is strictly between 3000 and 4000,
// project practice_code and nbr_of_patients, and write Deflate-compressed JSON,
// replacing any existing output.
df_q2.
  where("nbr_of_patients > 3000 AND nbr_of_patients < 4000").
  select("practice_code", "nbr_of_patients").
  write.
  format("json").
  option("compression", "deflate").
  mode("overwrite").
  save("/user/vb_student/problems/section07/problem02")
// Spot-check: print three untruncated rows, then the row count.
// NOTE(review): check_2 is not defined in this excerpt - presumably the saved
// output read back into a DataFrame; confirm.
check_2.show(3, truncate = false)
check_2.count()
// Problem 3: project card_holder_name, issuing_bank and issue_date, then write
// them as zlib-compressed ORC, replacing any existing output.
df_q3.
  select("card_holder_name", "issuing_bank", "issue_date").
  write.
  format("orc").
  option("compression", "zlib").
  mode("overwrite").
  save("/user/vb_student/problems/section07/problem03")
// Spot-check: print three untruncated rows, then the row count.
// NOTE(review): check_3 is not defined in this excerpt - presumably the saved
// output read back into a DataFrame; confirm.
check_3.show(3, truncate = false)
check_3.count()
Should be: 2000000
// Problem 4: join the ten listed columns into one tab-delimited string column
// named "results", then write it as LZ4-compressed text, replacing any existing
// output. The text writer accepts only a single string column, hence concat_ws.
// The SQL expression is built from two literals because an ordinary Scala
// string literal cannot contain a line break.
df_q4.
  selectExpr(
    "concat_ws('\t', sha, pct, practice_code, bnf_code, bnf_name, items, nic, " +
      "act_cost, quantity, period) as results").
  write.
  format("text").
  option("compression", "lz4").
  mode("overwrite").
  save("/user/vb_student/problems/section07/problem04")
// Spot-check: print three untruncated rows, then the row count.
// NOTE(review): check_4 is not defined in this excerpt - presumably the saved
// output read back into a DataFrame; confirm.
check_4.show(3, truncate = false)
check_4.count()
// Problem 5: keep rows whose pickup_datetime falls in March, join every column
// into one pipe-delimited string, and write it as gzip-compressed text,
// replacing any existing output.
// month() yields an integer, so it is compared with the Int literal 3 rather
// than the string "03", which only matched through implicit string-to-int
// coercion.
df_q5.
  where(month($"pickup_datetime") === 3).
  selectExpr("concat_ws('|',*)").
  write.
  format("text").
  option("compression", "gzip").
  mode("overwrite").
  save("/user/vb_student/problems/section07/problem05")
// Spot-check: print three untruncated rows, then the row count.
// NOTE(review): check_5 is not defined in this excerpt - presumably the saved
// output read back into a DataFrame; confirm.
check_5.show(3, truncate = false)
check_5.count()
// Problem 6: compute |nic - act_cost| as a temporary "difference" column, keep
// rows where it exceeds 2, drop the helper column, and append the result as
// gzip-compressed Parquet to table gp_db.q6_soln stored at the given path.
df_q6.
  selectExpr(
    "practice_code", "bnf_code", "bnf_name", "items", "nic", "act_cost",
    "abs(nic-act_cost) as difference").
  filter($"difference" > 2).
  drop("difference").
  // Collapse to one partition before writing.
  coalesce(1).
  write.
  format("parquet").
  option("compression", "gzip").
  option("path", "/user/vb_student/problems/section07/problem06/").
  mode("append").
  saveAsTable("gp_db.q6_soln")
// Spot-check: print three untruncated rows, then the row count.
// NOTE(review): check_6 is not defined in this excerpt - presumably the saved
// table or path read back into a DataFrame; confirm.
check_6.show(3, truncate = false)
check_6.count()
CCA175 Exam Prep Questions Part A ETL Focus (With Spark 2.4 Hadoop Cluster VM)