0% found this document useful (0 votes)
76 views · 1 page

Spark Mini Project

Uploaded by

Sai Gopi
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
76 views · 1 page

Spark Mini Project

Uploaded by

Sai Gopi
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 1

# H1B mini project (PySpark).
# Loads the H1B Parquet dataset, keeps CERTIFIED applications whose employer
# name does not end with "LLC", and writes the result out as CSV.
# NOTE(review): `SparkSession` must already be in scope
# (from pyspark.sql import SparkSession); this snippet does not import it.
spark = (
    SparkSession.builder.appName("H1B")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

parquet1DF = spark.read.parquet("h1_b_dataset.parquet")

# Keep only the columns used below. The name matches the one the rest of the
# script reads (it was previously assigned under a different spelling).
parquetDF11 = parquet1DF.select(
    "CASE_STATUS", "VISA_CLASS", "EMPLOYER_NAME",
    "JOB_TITLE", "PREVAILING_WAGE", "PW_SOURCE_YEAR", "WORKSITE_STATE",
)

p1 = (
    parquetDF11
    .withColumnRenamed("PREVAILING_WAGE", "SALARY")
    .withColumnRenamed("PW_SOURCE_YEAR", "FINANCIAL_YEAR")
)

# Certified applications only, then drop every row containing a null.
p2 = p1.where(p1.CASE_STATUS == "CERTIFIED")
p3 = p2.na.drop()

# Give every column an explicit type.
p4 = p3.selectExpr(
    "cast(CASE_STATUS as string) CASE_STATUS",
    "cast(VISA_CLASS as string) VISA_CLASS",
    "cast(EMPLOYER_NAME as string) EMPLOYER_NAME",
    "cast(JOB_TITLE as string) JOB_TITLE",
    "cast(SALARY as double) SALARY",
    "cast(FINANCIAL_YEAR as integer) FINANCIAL_YEAR",
    "cast(WORKSITE_STATE as string) WORKSITE_STATE",
)

# Exclude employers whose name ends with "LLC". The predicate is built from
# p4's own column so it refers to the frame actually being filtered.
p5 = p4.filter(~p4.EMPLOYER_NAME.endswith("LLC"))

# Write the filtered frame (p5) as comma-separated CSV with a header row.
p5.write.format('csv').option('header', True).option('sep', ',').save('c.csv')

// H1B mini project (Spark Scala API).
// Assumes a SparkSession named `spark` is already in scope (as in spark-shell).

// Parquet files carry their own schema, so the CSV-only reader options
// ("header", "InferSchema") were dropped; they have no effect on a Parquet read.
val data = spark.read.parquet("h1_b_dataset.parquet")

// Keep only the columns used below.
val raw = data.select(
  "CASE_STATUS", "VISA_CLASS", "EMPLOYER_NAME", "JOB_TITLE",
  "PREVAILING_WAGE", "PW_SOURCE_YEAR", "WORKSITE_STATE"
)

val info = raw
  .withColumnRenamed("PREVAILING_WAGE", "SALARY")
  .withColumnRenamed("PW_SOURCE_YEAR", "FINANCIAL_YEAR")

// Certified applications only.
val value = info.filter(info("CASE_STATUS") === "CERTIFIED")

// Rows whose employer name does NOT end with "LLC".
// In the Scala API, Column negation is `!` and the method is `endsWith`.
val raws = value.filter(!value("EMPLOYER_NAME").endsWith("LLC"))

// Rows where ANY column's value ends with "LLC": one predicate per column,
// OR-ed together.
val conditions = value.columns.map(value(_).endsWith("LLC")).reduce(_ or _)

// Filtering on the predicate directly keeps the same rows as materialising it
// into a temporary column, comparing to true and dropping the column, and it
// does not need the `$"..."` syntax from spark.implicits._.
val output = value.filter(conditions)

This study source was downloaded by 100000839058166 from CourseHero.com on 06-07-2022 22:05:40 GMT -05:00

https://www.coursehero.com/file/101518475/spark-mini-projecttxt/
Powered by TCPDF (www.tcpdf.org)

You might also like