Journal
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
# Base RDD of text lines (assumed; the journal omits it)
rdd = spark.sparkContext.parallelize(["Project Gutenberg’s",
    "Alice’s Adventures in Wonderland",
    "Project Gutenberg’s",
    "Adventures in Wonderland",
    "Project Gutenberg’s"])
#Flatmap
rdd2=rdd.flatMap(lambda x: x.split(" "))
for element in rdd2.collect():
    print(element)
#map
rdd3=rdd2.map(lambda x: (x,1))
for element in rdd3.collect():
    print(element)
#reduceByKey
rdd4=rdd3.reduceByKey(lambda a,b: a+b)
for element in rdd4.collect():
    print(element)
#map
rdd5 = rdd4.map(lambda x: (x[1],x[0])).sortByKey()
for element in rdd5.collect():
    print(element)
#filter
rdd6 = rdd5.filter(lambda x : 'a' in x[1])
for element in rdd6.collect():
    print(element)
listRdd = spark.sparkContext.parallelize([1,2,3,4,5,3,2])
#aggregate
seqOp = (lambda x, y: x + y)
combOp = (lambda x, y: x + y)
agg=listRdd.aggregate(0, seqOp, combOp)
print(agg) # output 20
#aggregate 2
seqOp2 = (lambda x, y: (x[0] + y, x[1] + 1))
combOp2 = (lambda x, y: (x[0] + y[0], x[1] + y[1]))
agg2=listRdd.aggregate((0, 0), seqOp2, combOp2)
print(agg2) # output (20,7)
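Since the second aggregate returns a (sum, count) pair, the mean of the RDD follows directly:
# mean from the (sum, count) accumulator
print(agg2[0] / agg2[1]) # output 2.857142857142857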
agg2=listRdd.treeAggregate(0,seqOp, combOp)
print(agg2) # output 20
#fold
from operator import add
foldRes=listRdd.fold(0, add)
print(foldRes) # output 20
#reduce
redRes=listRdd.reduce(add)
print(redRes) # output 20
#Collect
data = listRdd.collect()
print(data)
#first
print("first : "+str(listRdd.first()))
#Output: first : 1
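The remaining action examples use a key/value pair RDD; a definition consistent with the outputs shown below (assumed, since the journal never shows it):
inputRDD = spark.sparkContext.parallelize([("Z",1),("A",20),("B",30),("C",40),("B",30),("B",60)])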
print("first : "+str(inputRDD.first()))
#Output: first : (Z,1)
#top
print("top : "+str(listRdd.top(2)))
print("top : "+str(inputRDD.top(2)))
#min
print("min : "+str(listRdd.min()))
#Output: min : 1
print("min : "+str(inputRDD.min()))
#Output: min : (A,20)
#max
print("max : "+str(listRdd.max()))
#Output: max : 5
print("max : "+str(inputRDD.max()))
#Output: max : (Z,1)
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.functions import *
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
dataDF = [(('James','','Smith'),'1991-04-01','M',3000),
(('Michael','Rose',''),'2000-05-19','M',4000),
(('Robert','','Williams'),'1978-09-05','M',4000),
(('Maria','Anne','Jones'),'1967-12-01','F',4000),
(('Jen','Mary','Brown'),'1980-02-17','F',-1)
]
schema = StructType([
StructField('name', StructType([
StructField('firstname', StringType(), True),
StructField('middlename', StringType(), True),
StructField('lastname', StringType(), True)
])),
StructField('dob', StringType(), True),
StructField('gender', StringType(), True),
StructField('salary', IntegerType(), True)
])
df = spark.createDataFrame(data=dataDF, schema=schema)
# Example 1
df.withColumnRenamed("dob","DateOfBirth").printSchema()
# Example 2
df2 = df.withColumnRenamed("dob","DateOfBirth") \
.withColumnRenamed("salary","salary_amount")
df2.printSchema()
# Example 3
schema2 = StructType([
StructField("fname",StringType()),
StructField("middlename",StringType()),
StructField("lname",StringType())])
df.select(col("name").cast(schema2),
col("dob"),
col("gender"),
col("salary")) \
.printSchema()
# Example 4
df.select(col("name.firstname").alias("fname"),
col("name.middlename").alias("mname"),
col("name.lastname").alias("lname"),
col("dob"),col("gender"),col("salary")) \
.printSchema()
# Example 5
df4 = df.withColumn("fname",col("name.firstname")) \
.withColumn("mname",col("name.middlename")) \
.withColumn("lname",col("name.lastname")) \
.drop("name")
df4.printSchema()
#Example 7
newColumns = ["newCol1","newCol2","newCol3","newCol4"]
df.toDF(*newColumns).printSchema()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType,IntegerType
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
data = [('James','','Smith','1991-04-01','M',3000),
('Michael','Rose','','2000-05-19','M',4000),
('Robert','','Williams','1978-09-05','M',4000),
('Maria','Anne','Jones','1967-12-01','F',4000),
('Jen','Mary','Brown','1980-02-17','F',-1)
]
columns = ["firstname","middlename","lastname","dob","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
df.printSchema()
df.show(truncate=False)
df2 = df.withColumn("salary",col("salary").cast("Integer"))
df2.printSchema()
df2.show(truncate=False)
df3 = df.withColumn("salary",col("salary")*100)
df3.printSchema()
df3.show(truncate=False)
df.withColumnRenamed("gender","sex") \
.show(truncate=False)
# df4 adds a helper column so drop() has something to remove
df4 = df.withColumn("CopiedColumn",col("salary")* -1)
df4.drop("CopiedColumn") \
.show(truncate=False)
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
# Sample (Product, Amount, Country) rows (assumed; the journal omits them)
data = [("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"),
    ("Orange",2000,"USA"), ("Orange",2000,"USA"), ("Banana",400,"China"),
    ("Carrots",1200,"China"), ("Beans",1500,"China"), ("Orange",4000,"China"),
    ("Banana",2000,"Canada"), ("Carrots",2000,"Canada"), ("Beans",2000,"Mexico")]
columns= ["Product","Amount","Country"]
df = spark.createDataFrame(data = data, schema = columns)
df.printSchema()
df.show(truncate=False)
pivotDF = df.groupBy("Product").pivot("Country").sum("Amount")
pivotDF.printSchema()
pivotDF.show(truncate=False)
pivotDF = df.groupBy("Product","Country") \
.sum("Amount") \
.groupBy("Product") \
.pivot("Country") \
.sum("sum(Amount)")
pivotDF.printSchema()
pivotDF.show(truncate=False)
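The reverse of pivot has no dedicated DataFrame function; a sketch using stack() inside a SQL expression (the country column names follow from the sample data assumed above):
from pyspark.sql.functions import expr
unpivotExpr = "stack(3, 'Canada', Canada, 'China', China, 'Mexico', Mexico) as (Country,Total)"
unPivotDF = pivotDF.select("Product", expr(unpivotExpr)) \
    .where("Total is not null")
unPivotDF.show(truncate=False)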
# Imports
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, asc,desc
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
simpleData = [("James","Sales","NY",90000,34,10000), \
("Michael","Sales","NY",86000,56,20000), \
("Robert","Sales","CA",81000,30,23000), \
("Maria","Finance","CA",90000,24,23000), \
("Raman","Finance","CA",99000,40,24000), \
("Scott","Finance","NY",83000,36,19000), \
("Jen","Finance","NY",79000,53,15000), \
("Jeff","Marketing","CA",80000,25,18000), \
("Kumar","Marketing","NY",91000,50,21000) \
]
columns= ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data=simpleData, schema=columns)
df.printSchema()
df.show(truncate=False)
df.sort("department","state").show(truncate=False)
df.sort(col("department"),col("state")).show(truncate=False)
df.orderBy("department","state").show(truncate=False)
df.orderBy(col("department"),col("state")).show(truncate=False)
df.sort(df.department.asc(),df.state.asc()).show(truncate=False)
df.sort(col("department").asc(),col("state").asc()).show(truncate=False)
df.orderBy(col("department").asc(),col("state").asc()).show(truncate=False)
df.sort(df.department.asc(),df.state.desc()).show(truncate=False)
df.sort(col("department").asc(),col("state").desc()).show(truncate=False)
df.orderBy(col("department").asc(),col("state").desc()).show(truncate=False)
df.createOrReplaceTempView("EMP")
spark.sql("select employee_name,department,state,salary,age,bonus from EMP ORDER BY department asc") \
    .show(truncate=False)
b) Write a programme to demonstrate Drop rows with NULL Values.
filePath="resources/small_zipcode.csv"
df = spark.read.options(header='true', inferSchema='true') \
.csv(filePath)
df.printSchema()
df.show(truncate=False)
df.na.drop().show(truncate=False)
df.na.drop(how="any").show(truncate=False)
df.na.drop(subset=["population","type"]) \
.show(truncate=False)
df.dropna().show(truncate=False)
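drop() also accepts a thresh option, keeping only rows with at least that many non-null values (the threshold here is illustrative):
df.na.drop(thresh=2).show(truncate=False)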
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, substring, regexp_replace, concat, concat_ws
spark=SparkSession.builder.appName("sparkbyexamples").getOrCreate()
data = [('James','','Smith','1991-04-01'),
('Michael','Rose','','2000-05-19'),
('Robert','','Williams','1978-09-05'),
('Maria','Anne','Jones','1967-12-01'),
('Jen','Mary','Brown','1980-02-17')
]
columns=["firstname","middlename","lastname","dob"]
df=spark.createDataFrame(data,columns)
df.printSchema()
df.show(truncate=False)
df1 = df.withColumn('year', split(df['dob'], '-').getItem(0)) \
.withColumn('month', split(df['dob'], '-').getItem(1)) \
.withColumn('day', split(df['dob'], '-').getItem(2))
df1.printSchema()
df1.show(truncate=False)
# df4 with a 'date' string column (e.g. '2021-12-01-15') is assumed here
df4.select(split(df4.date,'^([\d]+-[\d]+-[\d])').alias('date'),
    regexp_replace(split(df4.date,'^([\d]+-[\d]+-[\d]+)').getItem(1),'-','').alias('day')).show()
df4 = spark.createDataFrame([('oneAtwoBthree',)], ['str',])
df4.select(split(df4.str, '[AB]').alias('str')).show()
df4.select(split(df4.str, '[AB]',2).alias('str')).show()
df4.select(split(df4.str, '[AB]',3).alias('str')).show()
data = [('James','','Smith','1991-04-01','M',3000),
('Michael','Rose','','2000-05-19','M',4000),
('Robert','','Williams','1978-09-05','M',4000),
('Maria','Anne','Jones','1967-12-01','F',4000),
('Jen','Mary','Brown','1980-02-17','F',-1)
]
columns = ["firstname","middlename","lastname","dob","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
df2=df.select(concat(df.firstname,df.middlename,df.lastname)
.alias("FullName"),"dob","gender","salary")
df2.show(truncate=False)
df3=df.select(concat_ws('_',df.firstname,df.middlename,df.lastname)
.alias("FullName"),"dob","gender","salary")
df3.show(truncate=False)
e) Write a programme to demonstrate PySpark fillna() & fill() & replacing NULL/None
values.
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.master("local[1]") \
.appName("TusharsExamples") \
.getOrCreate()
filePath="resources/small_zipcode.csv"
df = spark.read.options(header='true', inferSchema='true') \
.csv(filePath)
df.printSchema()
df.show(truncate=False)
df.fillna(value=0).show()
df.fillna(value=0,subset=["population"]).show()
df.na.fill(value=0).show()
df.na.fill(value=0,subset=["population"]).show()
df.fillna(value="").show()
df.na.fill(value="").show()
df.fillna("unknown",["city"]) \
.fillna("",["type"]).show()
df.na.fill("unknown",["city"]) \
.na.fill("",["type"]).show()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
# Sample data with duplicate rows (assumed; the journal omits it)
data = [("James","Sales",3000), ("Michael","Sales",4600),
    ("Robert","Sales",4100), ("Maria","Finance",3000),
    ("James","Sales",3000), ("Saif","Sales",4100)]
columns = ["employee_name","department","salary"]
df = spark.createDataFrame(data=data, schema=columns)
#Distinct
distinctDF = df.distinct()
print("Distinct count: "+str(distinctDF.count()))
distinctDF.show(truncate=False)
#Drop duplicates
df2 = df.dropDuplicates()
print("Distinct count: "+str(df2.count()))
df2.show(truncate=False)
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
arrayCol = ArrayType(StringType(), False)
data = [
("James,,Smith",["Java","Scala","C++"],["Spark","Java"],"OH","CA"),
("Michael,Rose,",["Spark","Java","C++"],["Spark","Java"],"NY","NJ"),
("Robert,,Williams",["CSharp","VB"],["Spark","Python"],"UT","NV")
]
schema = StructType([
StructField("name",StringType(),True),
StructField("languagesAtSchool",ArrayType(StringType()),True),
StructField("languagesAtWork",ArrayType(StringType()),True),
StructField("currentState", StringType(), True),
StructField("previousState", StringType(), True)
])
df = spark.createDataFrame(data=data,schema=schema)
df.printSchema()
df.show()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
columns = ["name","languagesAtSchool","currentState"]
data = [("James,,Smith",["Java","Scala","C++"],"CA"), \
("Michael,Rose,",["Spark","Java","C++"],"NJ"), \
("Robert,,Williams",["CSharp","VB"],"NV")]
df = spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show(truncate=False)
df.createOrReplaceTempView("ARRAY_STRING")
spark.sql("select name, concat_ws(',',languagesAtSchool) as languagesAtSchool," + \
" currentState from ARRAY_STRING") \
.show(truncate=False)
d) Write a programme to demonstrate converting Map to column.
dataDictionary = [
('James',{'hair':'black','eye':'brown'}),
('Michael',{'hair':'brown','eye':None}),
('Robert',{'hair':'red','eye':'black'}),
('Washington',{'hair':'grey','eye':'grey'}),
('Jefferson',{'hair':'brown','eye':''})
]
df = spark.createDataFrame(data=dataDictionary, schema=['name','properties'])
df.printSchema()
df.show(truncate=False)
df3=df.rdd.map(lambda x: \
(x.name,x.properties["hair"],x.properties["eye"])) \
.toDF(["name","hair","eye"])
df3.printSchema()
df3.show()
df.withColumn("hair",df.properties.getItem("hair")) \
.withColumn("eye",df.properties.getItem("eye")) \
.drop("properties") \
.show()
df.withColumn("hair",df.properties["hair"]) \
.withColumn("eye",df.properties["eye"]) \
.drop("properties") \
.show()
# Functions
from pyspark.sql.functions import explode,map_keys,col
keysDF = df.select(explode(map_keys(df.properties))).distinct()
keysList = keysDF.rdd.map(lambda x:x[0]).collect()
keyCols = list(map(lambda x: col("properties").getItem(x).alias(str(x)), keysList))
df.select(df.name, *keyCols).show()
e) Write a programme to demonstrate use of explode on array & map.
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('pyspark-by-examples').getOrCreate()
arrayData = [
('James',['Java','Scala'],{'hair':'black','eye':'brown'}),
('Michael',['Spark','Java',None],{'hair':'brown','eye':None}),
('Robert',['CSharp',''],{'hair':'red','eye':''}),
('Washington',None,None),
('Jefferson',['1','2'],{})
]
df = spark.createDataFrame(data=arrayData, schema =
['name','knownLanguages','properties'])
df.printSchema()
df.show()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, flatten
spark = SparkSession.builder.appName('pyspark-by-examples').getOrCreate()
arrayArrayData = [
("James",[["Java","Scala","C++"],["Spark","Java"]]),
("Michael",[["Spark","Java","C++"],["Spark","Java"]]),
("Robert",[["CSharp","VB"],["Spark","Python"]])
]
""" """
df.select(df.name,explode(df.subjects)).show(truncate=False)
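flatten is imported above but never exercised; it merges the nested subject arrays into a single array:
df.select(df.name, flatten(df.subjects)).show(truncate=False)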
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import approx_count_distinct,collect_list
from pyspark.sql.functions import collect_set,sum,avg,max,countDistinct,count
from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
from pyspark.sql.functions import variance,var_samp, var_pop
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
# Sample salary data (assumed; the journal omits it)
simpleData = [("James","Sales",3000), ("Michael","Sales",4600),
    ("Robert","Sales",4100), ("Maria","Finance",3000),
    ("James","Sales",3000), ("Scott","Finance",3300),
    ("Jen","Finance",3900), ("Saif","Sales",4100)]
schema = ["employee_name","department","salary"]
df = spark.createDataFrame(data=simpleData, schema=schema)
print("approx_count_distinct: " + \
str(df.select(approx_count_distinct("salary")).collect()[0][0]))
df.select(collect_list("salary")).show(truncate=False)
df.select(collect_set("salary")).show(truncate=False)
print("count: "+str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"), \
stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"),var_samp("salary"),var_pop("salary")) \
.show(truncate=False)
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
simpleData = [("James","Sales","NY",90000,34,10000),
("Michael","Sales","NY",86000,56,20000),
("Robert","Sales","CA",81000,30,23000),
("Maria","Finance","CA",90000,24,23000),
("Raman","Finance","CA",99000,40,24000),
("Scott","Finance","NY",83000,36,19000),
("Jen","Finance","NY",79000,53,15000),
("Jeff","Marketing","CA",80000,25,18000),
("Kumar","Marketing","NY",91000,50,21000)
]
schema = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)
df.groupBy("department").sum("salary").show(truncate=False)
df.groupBy("department").count().show(truncate=False)
df.groupBy("department","state") \
.sum("salary","bonus") \
.show(truncate=False)
df.groupBy("department") \
.agg(sum("salary").alias("sum_salary"), \
avg("salary").alias("avg_salary"), \
sum("bonus").alias("sum_bonus"), \
max("bonus").alias("max_bonus") \
)\
.show(truncate=False)
df.groupBy("department") \
.agg(sum("salary").alias("sum_salary"), \
avg("salary").alias("avg_salary"), \
sum("bonus").alias("sum_bonus"), \
max("bonus").alias("max_bonus")) \
.where(col("sum_bonus") >= 50000) \
.show(truncate=False)
# Using countDistinct()
from pyspark.sql.functions import countDistinct
df2=df.select(countDistinct("department","salary"))
df2.show()
print("Distinct Count of Department & Salary: "+ str(df2.collect()[0][0]))
d) Write a programme to demonstrate Select First Row of Each Group with options.
data = [("James","Sales",3000),("Michael","Sales",4600),
("Robert","Sales",4100),("Maria","Finance",3000),
("Raman","Finance",3000),("Scott","Finance",3300),
("Jen","Finance",3900),("Jeff","Marketing",3000),
("Kumar","Marketing",2000)
]
df = spark.createDataFrame(data,["Name","Department","Salary"])
df.show()
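The journal stops at the data; a minimal sketch of the intended technique, using row_number() over a window partitioned by Department and ordered by Salary descending:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number
w2 = Window.partitionBy("Department").orderBy(col("Salary").desc())
df.withColumn("row", row_number().over(w2)) \
    .filter(col("row") == 1).drop("row") \
    .show()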
# df with a date-string 'input' column is assumed below; the journal omits it
from pyspark.sql.functions import col, lit, round, current_date, date_format, to_date, \
    datediff, months_between, trunc, year, month, next_day, weekofyear, dayofweek, \
    dayofmonth, dayofyear, current_timestamp, to_timestamp, unix_timestamp, hour, minute, second
data = [["1","2020-02-01"], ["2","2019-03-01"], ["3","2021-03-01"]]
df = spark.createDataFrame(data, ["id","input"])
#current_date()
df.select(current_date().alias("current_date")
).show(1)
#date_format()
df.select(col("input"),
date_format(col("input"), "MM-dd-yyyy").alias("date_format")
).show()
#to_date()
df.select(col("input"),
to_date(col("input"), "yyyy-MM-dd").alias("to_date")
).show()
#datediff()
df.select(col("input"),
datediff(current_date(),col("input")).alias("datediff")
).show()
#months_between()
df.select(col("input"),
months_between(current_date(),col("input")).alias("months_between")
).show()
#trunc()
df.select(col("input"),
    trunc(col("input"),"Month").alias("Month_Trunc"),
    trunc(col("input"),"Year").alias("Year_Trunc")
    ).show()
df.select(col("input"),
year(col("input")).alias("year"),
month(col("input")).alias("month"),
next_day(col("input"),"Sunday").alias("next_day"),
weekofyear(col("input")).alias("weekofyear")
).show()
df.select(col("input"),
dayofweek(col("input")).alias("dayofweek"),
dayofmonth(col("input")).alias("dayofmonth"),
dayofyear(col("input")).alias("dayofyear"),
).show()
# df2 with an 'MM-dd-yyyy HH mm ss SSS' formatted input column (assumed; the journal omits it)
data = [["1","02-01-2020 11 01 19 06"], ["2","03-01-2019 12 01 19 406"]]
df2 = spark.createDataFrame(data, ["id","input"])
#current_timestamp()
df2.select(current_timestamp().alias("current_timestamp")
).show(1,truncate=False)
#to_timestamp()
df2.select(col("input"),
to_timestamp(col("input"), "MM-dd-yyyy HH mm ss SSS").alias("to_timestamp")
).show(truncate=False)
#hour, minute,second
data=[["1","2020-02-01 11:01:19.06"],["2","2019-03-01 12:01:19.406"],["3","2021-03-01 12:01:19.406"]]
df3=spark.createDataFrame(data,["id","input"])
df3.select(col("input"),
hour(col("input")).alias("hour"),
minute(col("input")).alias("minute"),
second(col("input")).alias("second")
).show(truncate=False)
# Create SparkSession
spark = SparkSession.builder \
.appName('TusharsExamples') \
.getOrCreate()
data = [("1","2019-07-01"),("2","2019-06-24"),("3","2019-08-24")]
df=spark.createDataFrame(data=data,schema=["id","date"])
df.select(
col("date"),
current_date().alias("current_date"),
datediff(current_date(),col("date")).alias("datediff")
).show()
df.withColumn("datesDiff", datediff(current_date(),col("date"))) \
.withColumn("montsDiff", months_between(current_date(),col("date"))) \
.withColumn("montsDiff_round",round(months_between(current_date(),col("date")),2)) \
.withColumn("yearsDiff",months_between(current_date(),col("date"))/lit(12)) \
.withColumn("yearsDiff_round",round(months_between(current_date(),col("date"))/lit(12)
,2)) \
.show()
data2 = [("1","07-01-2019"),("2","06-24-2019"),("3","08-24-2019")]
df2=spark.createDataFrame(data=data2,schema=["id","date"])
df2.select(
to_date(col("date"),"MM-dd-yyyy").alias("date"),
current_date().alias("endDate")
).show()
#SQL
spark.sql("select round(months_between('2019-07-01',current_date())/12,2) as years_diff").show()
c) Write a programme to demonstrate PySpark timestamp & date utilities.
# Create SparkSession
spark = SparkSession.builder \
.appName('TusharsExamples') \
.getOrCreate()
df=spark.createDataFrame(
data = [ ("1","2019-06-24 12:01:19.000")],
schema=["id","input_timestamp"])
df.printSchema()
df.withColumn("ts",to_timestamp(col("input_timestamp"))) \
.withColumn("datetype",to_date(col("ts"))) \
.show(truncate=False)
# Create SparkSession
spark = SparkSession.builder \
.appName('TusharsExamples') \
.getOrCreate()
# Timestamps to diff against the current time (assumed; the journal omits them)
dates = [("1","2019-07-01 12:01:19.111"), ("2","2019-06-24 12:01:19.222")]
df = spark.createDataFrame(data=dates, schema=["id","from_timestamp"])
df2 = df.withColumn('from_timestamp',to_timestamp(col('from_timestamp')))\
    .withColumn('end_timestamp', current_timestamp())\
    .withColumn('DiffInSeconds',unix_timestamp("end_timestamp") -
        unix_timestamp('from_timestamp'))
df2.show(truncate=False)
df2.withColumn('DiffInMinutes',round(col('DiffInSeconds')/60))\
.show(truncate=False)
df2.withColumn('DiffInHours',round(col('DiffInSeconds')/3600))\
.show(truncate=False)
data= [("12:01:19.000","13:01:19.000"),
("12:01:19.000","12:02:19.000"),
("16:44:55.406","17:44:55.406"),
("16:50:59.406","16:44:59.406")]
df3 = spark.createDataFrame(data=data, schema=["from_timestamp","to_timestamp"])
df3.withColumn("from_timestamp",to_timestamp(col("from_timestamp"),"HH:mm:ss.SSS")) \
.withColumn("to_timestamp",to_timestamp(col("to_timestamp"),"HH:mm:ss.SSS")) \
.withColumn("DiffInSeconds", col("from_timestamp").cast("long") -
col("to_timestamp").cast("long")) \
.withColumn("DiffInMinutes",round(col("DiffInSeconds")/60)) \
.withColumn("DiffInHours",round(col("DiffInSeconds")/3600)) \
.show(truncate=False)
df3 = spark.createDataFrame(
data=[("1","07-01-2019 12:01:19.406")],
schema=["id","input_timestamp"]
)
df3.withColumn("input_timestamp",to_timestamp(col("input_timestamp"),"MM-dd-yyyy HH:mm:ss.SSS")) \
    .withColumn("current_timestamp",current_timestamp().alias("current_timestamp")) \
.withColumn("DiffInSeconds",current_timestamp().cast("long") -
col("input_timestamp").cast("long")) \
.withColumn("DiffInMinutes",round(col("DiffInSeconds")/60)) \
.withColumn("DiffInHours",round(col("DiffInSeconds")/3600)) \
.withColumn("DiffInDays",round(col("DiffInSeconds")/(24*3600))) \
.show(truncate=False)
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
emp = [(1,"Smith",-1,"2018","10","M",3000), \
(2,"Rose",1,"2010","20","M",4000), \
(3,"Williams",1,"2010","10","M",1000), \
(4,"Jones",2,"2005","10","F",2000), \
(5,"Brown",2,"2010","40","",-1), \
(6,"Brown",2,"2010","50","",-1) \
]
empColumns = ["emp_id","name","superior_emp_id","year_joined", \
"emp_dept_id","gender","salary"]
dept = [("Finance",10), \
("Marketing",20), \
("Sales",30), \
("IT",40) \
]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"inner") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"outer") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"full") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"fullouter") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"left") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"leftouter") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"right") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"rightouter") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"leftsemi") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"leftanti") \
.show(truncate=False)
empDF.alias("emp1").join(empDF.alias("emp2"), \
col("emp1.superior_emp_id") == col("emp2.emp_id"),"inner") \
.select(col("emp1.emp_id"),col("emp1.name"), \
col("emp2.emp_id").alias("superior_emp_id"), \
col("emp2.name").alias("superior_emp_name")) \
.show(truncate=False)
empDF.createOrReplaceTempView("EMP")
deptDF.createOrReplaceTempView("DEPT")
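With the views registered, the same joins can be run in SQL, for example:
joinDF = spark.sql("select * from EMP e, DEPT d where e.emp_dept_id == d.dept_id")
joinDF.show(truncate=False)
joinDF2 = spark.sql("select * from EMP e INNER JOIN DEPT d ON e.emp_dept_id == d.dept_id")
joinDF2.show(truncate=False)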
# Create SparkSession
spark = SparkSession.builder \
.appName('TusharsExamples') \
.getOrCreate()
#EMP DataFrame
empData = [(1,"Smith",10), (2,"Rose",20),
(3,"Williams",10), (4,"Jones",30)
]
empColumns = ["emp_id","name","emp_dept_id"]
empDF = spark.createDataFrame(empData,empColumns)
empDF.show()
#DEPT DataFrame
deptData = [("Finance",10), ("Marketing",20),
("Sales",30),("IT",40)
]
deptColumns = ["dept_name","dept_id"]
deptDF=spark.createDataFrame(deptData,deptColumns)
deptDF.show()
#Address DataFrame
addData=[(1,"1523 Main St","SFO","CA"),
(2,"3453 Orange St","SFO","NY"),
(3,"34 Warner St","Jersey","NJ"),
(4,"221 Cavalier St","Newark","DE"),
(5,"789 Walnut St","Sandiago","CA")
]
addColumns = ["emp_id","addline1","city","state"]
addDF = spark.createDataFrame(addData,addColumns)
addDF.show()
#SQL
empDF.createOrReplaceTempView("EMP")
deptDF.createOrReplaceTempView("DEPT")
addDF.createOrReplaceTempView("ADD")
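With all three views registered, the multi-DataFrame join can be expressed in SQL (a sketch):
spark.sql("select * from EMP e, DEPT d, ADD a " +
    "where e.emp_dept_id == d.dept_id and e.emp_id == a.emp_id") \
    .show()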
# Import pyspark
from pyspark.sql import SparkSession
# Create SparkSession
spark = SparkSession.builder \
.appName('TusharsExamples') \
.getOrCreate()
#EMP DataFrame
empData = [(1,"Smith","2018",10,"M",3000),
(2,"Rose","2010",20,"M",4000),
(3,"Williams","2010",10,"M",1000),
(4,"Jones","2005",10,"F",2000),
(5,"Brown","2010",30,"",-1),
(6,"Brown","2010",50,"",-1)
]
empColumns = ["emp_id","name","branch_id","dept_id",
"gender","salary"]
empDF = spark.createDataFrame(empData,empColumns)
empDF.show()
#DEPT DataFrame
deptData = [("Finance",10,"2018"),
("Marketing",20,"2010"),
("Marketing",20,"2018"),
("Sales",30,"2005"),
("Sales",30,"2010"),
("IT",50,"2010")
]
deptColumns = ["dept_name","dept_id","branch_id"]
deptDF=spark.createDataFrame(deptData,deptColumns)
deptDF.show()
# Spark SQL: register the DataFrames as views first
empDF.createOrReplaceTempView("EMP")
deptDF.createOrReplaceTempView("DEPT")
spark.sql("SELECT * FROM EMP e, DEPT d where e.dept_id == d.dept_id"
" and e.branch_id == d.branch_id").show()
7 Spark SQL StructType & SQL Functions
a) Write a programme to demonstrate PySpark map() with options.
data = ["Project",
"Gutenberg’s",
"Alice’s",
"Adventures",
"in",
"Wonderland",
"Project",
"Gutenberg’s",
"Adventures",
"in",
"Wonderland",
"Project",
"Gutenberg’s"]
rdd=spark.sparkContext.parallelize(data)
rdd2=rdd.map(lambda x: (x,1))
for element in rdd2.collect():
    print(element)
data = [('James','Smith','M',30),
('Anna','Rose','F',41),
('Robert','Williams','M',62),
]
columns = ["firstname","lastname","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
df.show()
rdd2=df.rdd.map(lambda x:
(x[0]+","+x[1],x[2],x[3]*2)
)
df2=rdd2.toDF(["name","gender","new_salary"] )
df2.show()
def func1(x):
    firstName=x.firstname
    lastName=x.lastname
    name=firstName+","+lastName
    gender=x.gender.lower()
    salary=x.salary*2
    return (name,gender,salary)
rdd2=df.rdd.map(lambda x: func1(x))
b) Write a programme to demonstrate PySpark Window Functions.
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
from pyspark.sql.window import Window
from pyspark.sql.functions import col,avg,sum,min,max,row_number
# df with (employee_name, department, salary) columns is assumed, e.g. the
# simpleData DataFrame built in the sorting example earlier in this journal
df.printSchema()
df.show(truncate=False)
windowSpec = Window.partitionBy("department").orderBy("salary")
windowSpecAgg = Window.partitionBy("department")
df.withColumn("row",row_number().over(windowSpec)) \
.withColumn("avg", avg(col("salary")).over(windowSpecAgg)) \
.withColumn("sum", sum(col("salary")).over(windowSpecAgg)) \
.withColumn("min", min(col("salary")).over(windowSpecAgg)) \
.withColumn("max", max(col("salary")).over(windowSpecAgg)) \
.where(col("row")==1).select("department","avg","sum","min","max") \
.show()
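The same windowSpec also drives the ranking functions, for example:
from pyspark.sql.functions import rank, dense_rank
df.withColumn("rank", rank().over(windowSpec)) \
    .withColumn("dense_rank", dense_rank().over(windowSpec)) \
    .show()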
c) Write a programme to demonstrate PySpark JSON Functions with options.
jsonString="""{"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}"""
df=spark.createDataFrame([(1, jsonString)],["id","value"])
df.show(truncate=False)
# schema_of_json infers a DDL schema string from a sample document
from pyspark.sql.functions import schema_of_json, lit
schemaStr = spark.range(1) \
    .select(schema_of_json(lit(jsonString))) \
    .collect()[0][0]
print(schemaStr)
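Beyond schema_of_json, the usual companions are from_json and to_json; a sketch using the df built above:
from pyspark.sql.types import MapType, StringType
from pyspark.sql.functions import from_json, to_json, col
# parse the JSON string column into a MapType column, then back to a string
df2 = df.withColumn("value", from_json(df.value, MapType(StringType(), StringType())))
df2.printSchema()
df2.show(truncate=False)
df2.withColumn("value", to_json(col("value"))).show(truncate=False)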
8 Spark SQL
a) Write a programme to demonstrate PySpark SQL examples.
# Create SparkSession
spark = SparkSession.builder.appName('TusharsExamples') \
.getOrCreate()
# Zipcodes CSV with country/city/zipcode/state columns (path assumed)
df = spark.read.options(header='true', inferSchema='true') \
    .csv("/content/zipcodes.csv")
# Select query
df.select("country","city","zipcode","state") \
.show(5)
# where
df.select("country","city","zipcode","state") \
.where("state == 'AZ'") \
.show(5)
# sorting
df.select("country","city","zipcode","state") \
.where("state in ('PR','AZ','FL')") \
.orderBy("state") \
.show(10)
# grouping
df.groupBy("state").count() \
.show()
# Add
# df with (date, increment) columns is assumed; the journal omits it
from pyspark.sql.functions import expr
data=[("2019-01-23",1),("2019-06-24",2),("2019-09-20",3)]
df=spark.createDataFrame(data).toDF("date","increment")
df.select(df.date,df.increment,
expr("increment + 5 as new_increment")
).show()
# Using cast to convert data types
df.select("increment",expr("cast(increment as string) as str_increment")) \
.printSchema()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
data = [("James","Smith","USA","CA"),
("Michael","Rose","USA","NY"),
("Robert","Williams","USA","CA"),
("Maria","Jones","USA","FL")
]
columns = ["firstname","lastname","country","state"]
df = spark.createDataFrame(data = data, schema = columns)
df.show(truncate=False)
df.select("firstname").show()
df.select("firstname","lastname").show()
data = [(("James",None,"Smith"),"OH","M"),
(("Anna","Rose",""),"NY","F"),
(("Julia","","Williams"),"OH","F"),
(("Maria","Anne","Jones"),"NY","M"),
(("Jen","Mary","Brown"),"NY","M"),
(("Mike","Mary","Williams"),"OH","M")
]
from pyspark.sql.types import StructType,StructField, StringType
schema = StructType([
StructField('name', StructType([
StructField('firstname', StringType(), True),
StructField('middlename', StringType(), True),
StructField('lastname', StringType(), True)
])),
StructField('state', StringType(), True),
StructField('gender', StringType(), True)
])
df2 = spark.createDataFrame(data=data, schema=schema)
df2.select("name").show(truncate=False)
df2.select("name.firstname","name.lastname").show(truncate=False)
df2.select("name.*").show(truncate=False)
# Sample rows for mapPartitions (assumed; the journal omits them)
data = [('James','Smith','M',3000), ('Anna','Rose','F',4100),
    ('Robert','Williams','M',6200)]
columns = ["firstname","lastname","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
df.show()
#Example 1 mapPartitions()
def reformat(partitionData):
    for row in partitionData:
        yield [row.firstname+","+row.lastname, row.salary*10/100]
df2=df.rdd.mapPartitions(reformat).toDF(["name","bonus"])
df2.show()
#Example 2 mapPartitions()
def reformat2(partitionData):
    updatedData = []
    for row in partitionData:
        name=row.firstname+","+row.lastname
        bonus=row.salary*10/100
        updatedData.append([name,bonus])
    return iter(updatedData)
df2=df.rdd.mapPartitions(reformat2).toDF(["name","bonus"])
df2.show()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
dept = [("Finance",10), \
("Marketing",20), \
("Sales",30), \
("IT",40) \
]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)
dataCollect = deptDF.collect()
print(dataCollect)
dataCollect2 = deptDF.select("dept_name").collect()
print(dataCollect2)
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType
from pyspark.sql.functions import col,array_contains
spark = SparkSession.builder.appName('TusharsExamples').getOrCreate()
df = spark.read.csv("/content/zipcodes.csv")
df.printSchema()
df2 = spark.read.option("header",True) \
.csv("/content/zipcodes.csv")
df2.printSchema()
schema = StructType() \
.add("RecordNumber",IntegerType(),True) \
.add("Zipcode",IntegerType(),True) \
.add("ZipCodeType",StringType(),True) \
.add("City",StringType(),True) \
.add("State",StringType(),True) \
.add("LocationType",StringType(),True) \
.add("Lat",DoubleType(),True) \
.add("Long",DoubleType(),True) \
.add("Xaxis",IntegerType(),True) \
.add("Yaxis",DoubleType(),True) \
.add("Zaxis",DoubleType(),True) \
.add("WorldRegion",StringType(),True) \
.add("Country",StringType(),True) \
.add("LocationText",StringType(),True) \
.add("Location",StringType(),True) \
.add("Decommisioned",BooleanType(),True) \
.add("TaxReturnsFiled",StringType(),True) \
.add("EstimatedPopulation",IntegerType(),True) \
.add("TotalWages",IntegerType(),True) \
.add("Notes",StringType(),True)
df_with_schema = spark.read.format("csv") \
.option("header", True) \
.schema(schema) \
.load("/content/zipcodes.csv")
df_with_schema.printSchema()
df2.write.option("header",True) \
.csv("/zipcodes123")
# Imports
import pyspark
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("parquetFile").getOrCreate()
data =[("James ","","Smith","36636","M",3000),
("Michael ","Rose","","40288","M",4000),
("Robert ","","Williams","42114","M",4000),
("Maria ","Anne","Jones","39192","F",4000),
("Jen","Mary","Brown","","F",-1)]
columns=["firstname","middlename","lastname","dob","gender","salary"]
df=spark.createDataFrame(data,columns)
df.write.mode("overwrite").parquet("/tmp/output/people.parquet")
parDF1=spark.read.parquet("/tmp/output/people.parquet")
parDF1.createOrReplaceTempView("parquetTable")
parDF1.printSchema()
parDF1.show(truncate=False)
df.write.partitionBy("gender","salary").mode("overwrite").parquet("/tmp/output/people2.parquet")
parDF2=spark.read.parquet("/tmp/output/people2.parquet/gender=M")
parDF2.show(truncate=False)
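Because people.parquet was registered as a temp view above, it can also be queried with SQL:
parkSQL = spark.sql("select * from parquetTable where salary >= 4000")
parkSQL.show(truncate=False)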
# Imports
from os.path import abspath
from pyspark.sql import SparkSession
# warehouse_location points to the default location for managed databases and tables
warehouse_location = abspath('spark-warehouse')
# Hive support is needed for reading/writing managed tables
spark = SparkSession.builder \
    .appName('TusharsExamples') \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .enableHiveSupport() \
    .getOrCreate()
# Create DataFrame
data = [(1, "James",30,"M"), (2, "Ann",40,"F"),
(3, "Jeff",41,"M"),(4, "Jennifer",20,"F")]
columns = ["id","name","age","gender"]
sampleDF = spark.sparkContext.parallelize(data).toDF(columns)
# Save as a managed Hive table, then read it back
sampleDF.write.mode('overwrite').saveAsTable("employee")
df = spark.read.table("employee")
df.show()
b) Write a programme to demonstrate PySpark Save Hive Table From Temp view.
# warehouse_location points to the default location for managed databases and tables
warehouse_location = abspath('spark-warehouse')
# Create DataFrame
data = [(1, "James",30,"M"), (2, "Ann",40,"F"),
(3, "Jeff",41,"M"),(4, "Jennifer",20,"F")]
sampleDF = spark.sparkContext.parallelize(data).toDF(columns)
# Register a temp view and materialize it as a Hive table (table name assumed)
sampleDF.createOrReplaceTempView("sampleView")
spark.sql("CREATE TABLE IF NOT EXISTS employee2 AS SELECT * FROM sampleView")
spark.read.table("employee2").show()
c) Write a programme to demonstrate PySpark Read Hive Table from Remote Hive.
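The journal ends before this exercise; a minimal sketch, assuming a remote Hive metastore reachable at thrift://remote-host:9083 (host and port are placeholders):
from os.path import abspath
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName('TusharsExamples') \
    .config("spark.sql.warehouse.dir", abspath('spark-warehouse')) \
    .config("hive.metastore.uris", "thrift://remote-host:9083") \
    .enableHiveSupport() \
    .getOrCreate()
spark.read.table("employee").show()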