Spark Mini Project
Spark Mini Project
option",
"some-value").getOrCreate()
# PySpark pipeline: load the H-1B dataset, keep certified cases, normalise the
# column names/types, drop employers ending in "LLC", and export the result as CSV.
# Relies on a SparkSession named `spark` created earlier in the script.

# Load the dataset and keep only the columns used by the rest of the pipeline.
parquet1DF = spark.read.parquet("h1_b_dataset.parquet")
parquetDF11 = parquet1DF.select(
    "CASE_STATUS", "VISA_CLASS", "EMPLOYER_NAME",
    "JOB_TITLE", "PREVAILING_WAGE", "PW_SOURCE_YEAR", "WORKSITE_STATE")

# Rename the wage and year columns to the names used by the later steps.
p1 = (parquetDF11
      .withColumnRenamed("PREVAILING_WAGE", "SALARY")
      .withColumnRenamed("PW_SOURCE_YEAR", "FINANCIAL_YEAR"))

# Keep certified cases only, then drop every row that contains a null.
p2 = p1.where(p1.CASE_STATUS == "CERTIFIED")
p3 = p2.na.drop()

# Give every column an explicit type: SALARY becomes double, FINANCIAL_YEAR
# becomes integer, everything else is a string.
# NOTE(review): the null drop above runs before these casts, so a value that
# fails to cast would presumably surface as a null afterwards - confirm whether
# a second na.drop() is wanted.
p4 = p3.selectExpr(
    "cast(CASE_STATUS as string) CASE_STATUS",
    "cast(VISA_CLASS as string) VISA_CLASS",
    "cast(EMPLOYER_NAME as string) EMPLOYER_NAME",
    "cast(JOB_TITLE as string) JOB_TITLE",
    "cast(SALARY as double) SALARY",
    "cast(FINANCIAL_YEAR as integer) FINANCIAL_YEAR",
    "cast(WORKSITE_STATE as string) WORKSITE_STATE")

# Exclude employers whose name ends with "LLC". The predicate is built from
# p4's own column so it refers to the frame that is actually being filtered.
p5 = p4.filter(~p4.EMPLOYER_NAME.endswith("LLC"))

# Export the filtered frame as comma-separated CSV with a header row.
p5.write.format('csv').option('header', True).option('sep', ',').save('c.csv')
// Scala version of the first pipeline steps: load the H-1B dataset, rename the
// wage/year columns, and keep certified cases only.
// Relies on a SparkSession named `spark` being in scope.

// The chains are parenthesised so a multi-line expression is read as a single
// statement wherever the snippet is pasted.
// NOTE(review): "header" and "InferSchema" look like CSV reader options; they are
// presumably ignored when reading Parquet - confirm and drop them if so.
val data = (spark.read
  .option("header", "true")
  .option("InferSchema", "true")
  .parquet("h1_b_dataset.parquet"))

// Rename the wage and year columns. This now reads from `data`, the frame loaded
// above; the earlier text referred to `raw`, which is never defined here.
val info = (data
  .withColumnRenamed("PREVAILING_WAGE", "SALARY")
  .withColumnRenamed("PW_SOURCE_YEAR", "FINANCIAL_YEAR"))

// Keep only the rows whose CASE_STATUS is exactly "CERTIFIED".
val value = info.filter(info("CASE_STATUS") === "CERTIFIED")
This study source was downloaded by 100000839058166 from CourseHero.com on 06-07-2022 22:05:40 GMT -05:00
https://www.coursehero.com/file/101518475/spark-mini-projecttxt/
Powered by TCPDF (www.tcpdf.org)