1) Spark SQL (DataFrame):
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.desc

// Create (or reuse) a local SparkSession for this exercise.
// NOTE: the original bound this to `sparkSession` but then used an
// undefined `spark` below — the names must match.
val spark = SparkSession.builder.master("local").appName("Spark session in Fresco").getOrCreate()

// DataFrame of (language, percent) tuples; columns default to _1/_2.
val langPercentDF = spark.createDataFrame(List(("Scala", 35), ("Python", 30), ("R", 15), ("Java", 20)))
langPercentDF.show()

// Rename the tuple columns to meaningful names.
val lpDF = langPercentDF
  .withColumnRenamed("_1", "language")
  .withColumnRenamed("_2", "percent")

// Show all rows ordered by percent descending, without truncating values.
lpDF.orderBy(desc("percent")).show(false)
2) DATASET:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.desc

// Local SparkSession for this exercise (appName was split across two
// lines in the original, breaking the string literal).
val spark = SparkSession.builder.master("local").appName("Spark session in Fresco").getOrCreate()

// Dataset[Long] of 5, 10, ..., 95 (start inclusive, end exclusive, step 5).
val numDS = spark.range(5, 100, 5)
numDS.show()

// Top 5 values, descending by the implicit "id" column.
numDS.orderBy(desc("id")).show(5)

// Summary statistics (count, mean, stddev, min, max).
numDS.describe().show()
3) JSON (contents of /projects/People.json):
{"name":"Rahul","age":"35"}
{"name":"Sachin","age":"46"}
import org.apache.spark.sql.SparkSession

// Local SparkSession (the appName literal was wrapped mid-string in the
// original, which does not compile).
val spark = SparkSession.builder.master("local").appName("Spark session in Fresco").getOrCreate()

// Read the JSON-lines sample above into a DataFrame and display it.
val peopleDS = spark.read.json("/projects/People.json")
peopleDS.show()
// Simple record type; age is kept as String to match the JSON sample above.
case class Person(name: String, age: String)

object Main {
  def main(args: Array[String]): Unit = {
    // Constructor arguments are (name, age) — the original passed them
    // reversed (Person("35", "Rahul")), so `age` printed the name and
    // vice versa. Also: immutable vals, not vars.
    val person1 = Person("Rahul", "35")
    val person2 = Person("Sachin", "46")
    println("Age of the Person1 is " + person1.age)
    println("Name of the Person1 is " + person1.name)
    println("Age of the Person2 is " + person2.age)
    println("Name of the Person2 is " + person2.name)
  }
}
4) PARQUET
{"name":"Rahul","age":"35"}
{"name":"Sachin","age":"46"}
import org.apache.spark.sql.SparkSession

// Local SparkSession (original string literal was wrapped mid-line).
val spark = SparkSession.builder.master("local").appName("Spark session in Fresco").getOrCreate()

// Read the JSON sample and display it.
val peopleDS = spark.read.json("/projects/People.json")
peopleDS.show()

// Write out as Parquet. DataFrameWriter.parquet returns Unit, so binding
// it to a val (as the original did) is meaningless.
peopleDS.write.parquet("/projects/challenge/data.parquet")

// Read the Parquet file back through the session. `new SQLContext(sc)`
// is deprecated since Spark 2.x — use spark.read directly.
val data = spark.read.parquet("/projects/challenge/data.parquet")
data.show()
5) CSV Files
git clone https://github.com/frescoplaylab/Census.git
import org.apache.spark.sql.SparkSession

// Local SparkSession (original appName literal was wrapped mid-string).
val spark = SparkSession.builder().master("local[1]").appName("Spark Session in Frescoplay").getOrCreate()

// Load the census CSV: header row supplies column names, schema is
// inferred, malformed rows are dropped. The option key must be
// "inferSchema" — the original "Inferschema" is silently ignored by
// Spark, leaving every column typed as string.
val dfs = spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .option("mode", "DROPMALFORMED")
  .load("/projects/challenge/Census/demography.csv")

// NOTE(review): `TotalPopulation` is not defined anywhere in this
// snippet — presumably another DataFrame sharing the "Total Population"
// column. TODO: define/load TotalPopulation before this join will run.
val joined = dfs.join(TotalPopulation, "Total Population")