
first-pyspark

October 13, 2024

[1]: from pyspark.sql import SparkSession
     from pyspark.sql.types import *
     from datetime import datetime

     # Create Spark session with Hive support
     spark = SparkSession.builder \
         .appName('Spark with Hive') \
         .enableHiveSupport() \
         .getOrCreate()

     # Hardcoded data
     data = [
         ["Product A", 1001, datetime.strptime("2023-07-20", "%Y-%m-%d"),
          datetime.strptime("2023-07-20 10:15:30", "%Y-%m-%d %H:%M:%S"), 29.99],
         ["Product B", 1002, datetime.strptime("2023-07-19", "%Y-%m-%d"),
          datetime.strptime("2023-07-19 14:20:45", "%Y-%m-%d %H:%M:%S"), 49.99],
         ["Product C", 1003, datetime.strptime("2023-07-18", "%Y-%m-%d"),
          datetime.strptime("2023-07-18 09:30:15", "%Y-%m-%d %H:%M:%S"), 39.99],
         ["Product D", 1004, datetime.strptime("2023-07-17", "%Y-%m-%d"),
          datetime.strptime("2023-07-17 16:45:00", "%Y-%m-%d %H:%M:%S"), 19.99]
     ]

     # Define schema
     # StructType describes a row with heterogeneous data types
     # StructField('Field_Name', DataType, nullable (accepts a boolean value))
     schema = StructType([
         StructField("Product", StringType(), True),
         StructField("ID", IntegerType(), True),
         StructField("Date", DateType(), True),
         StructField("Timestamp", TimestampType(), True),
         StructField("Price", FloatType(), True)
     ])

     # Create DataFrame by passing the data and schema to the Spark session
     df = spark.createDataFrame(data, schema)

     # Print schema
     df.printSchema()

     # Action: show a sample of the data
     df.show()

Setting default log level to "WARN".


To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use
setLogLevel(newLevel).
24/10/13 05:34:28 INFO SparkEnv: Registering MapOutputTracker
24/10/13 05:34:28 INFO SparkEnv: Registering BlockManagerMaster
24/10/13 05:34:28 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/10/13 05:34:28 INFO SparkEnv: Registering OutputCommitCoordinator
root
|-- Product: string (nullable = true)
|-- ID: integer (nullable = true)
|-- Date: date (nullable = true)
|-- Timestamp: timestamp (nullable = true)
|-- Price: float (nullable = true)

+---------+----+----------+-------------------+-----+
| Product| ID| Date| Timestamp|Price|
+---------+----+----------+-------------------+-----+
|Product A|1001|2023-07-20|2023-07-20 10:15:30|29.99|
|Product B|1002|2023-07-19|2023-07-19 14:20:45|49.99|
|Product C|1003|2023-07-18|2023-07-18 09:30:15|39.99|
|Product D|1004|2023-07-17|2023-07-17 16:45:00|19.99|
+---------+----+----------+-------------------+-----+
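Aside (not part of the original notebook): in recent PySpark versions, createDataFrame also accepts a DDL-formatted schema string in place of a StructType, which expresses the same schema more compactly. A minimal sketch:

# Sketch: same DataFrame built from a DDL-style schema string (assumes a recent Spark version)
ddl_schema = "Product string, ID int, Date date, Timestamp timestamp, Price float"
df_ddl = spark.createDataFrame(data, ddl_schema)
df_ddl.printSchema()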

[3]: # First read example: do not infer the schema, treat the first row as a header,
     # and provide explicit column names and data types

     # Define schema
     schema = StructType([
         StructField("order_id", StringType(), True),
         StructField("order_item_id", IntegerType(), True),
         StructField("product_id", StringType(), True),
         StructField("seller_id", StringType(), True),
         StructField("shipping_limit_date", TimestampType(), True),
         StructField("price", DoubleType(), True),
         StructField("freight_value", DoubleType(), True)
     ])

     # Spark runs on the same cluster as Hadoop, so the full hdfs:// prefix
     # is not needed in the path
     hdfs_path = '/tmp/input_data/order_items_dataset.csv'

     # format(file_format): the input format
     # option(): properties of the input files
     #   header='true'       -> the first row is a header, not data
     #   inferSchema='false' -> we supply the schema explicitly ('true' lets Spark infer it)
     # load(path): the location to read from
     df = spark.read.format('csv') \
         .option('header', 'true') \
         .option('inferSchema', 'false') \
         .schema(schema) \
         .load(hdfs_path)

     df.printSchema()

     df.show(5)

root
|-- order_id: string (nullable = true)
|-- order_item_id: integer (nullable = true)
|-- product_id: string (nullable = true)
|-- seller_id: string (nullable = true)
|-- shipping_limit_date: timestamp (nullable = true)
|-- price: double (nullable = true)
|-- freight_value: double (nullable = true)

[Stage 2:> (0 + 1) / 1]
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|00010242fe8c5a6d1…|            1|4244733e06e7ecb49…|48436dade18ac8b2b…|2017-09-19 09:45:35| 58.9|        13.29|
|00018f77f2f0320c5…|            1|e5f2d52b802189ee6…|dd7ddc04e1b6c2c61…|2017-05-03 11:05:13|239.9|        19.93|
|000229ec398224ef6…|            1|c777355d18b72b67a…|5b51032eddd242adc…|2018-01-18 14:48:30|199.0|        17.87|
|00024acbcdf0a6daa…|            1|7634da152a4610f15…|9d7a1d34a50524090…|2018-08-15 10:10:18|12.99|        12.79|
|00042b26cf59d7ce6…|            1|ac6c3623068f30de0…|df560393f3a51e745…|2017-02-13 13:57:51|199.9|        18.14|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
only showing top 5 rows
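When a schema is supplied explicitly, rows that do not match it can be handled through the CSV reader's mode option. A sketch of the three standard modes (not run in the original notebook):

# PERMISSIVE (default): malformed fields become nulls
# DROPMALFORMED: malformed rows are discarded
# FAILFAST: the read fails on the first malformed row
df_strict = spark.read.format('csv') \
    .option('header', 'true') \
    .option('mode', 'FAILFAST') \
    .schema(schema) \
    .load(hdfs_path)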

[4]: # If we want Spark to infer the schema

     hdfs_path = '/tmp/input_data/order_items_dataset.csv'

     df2 = spark.read.format('csv') \
         .option('header', 'true') \
         .option('inferSchema', 'true') \
         .load(hdfs_path)

     df2.printSchema()

     df2.show(5)

root
|-- order_id: string (nullable = true)
|-- order_item_id: integer (nullable = true)
|-- product_id: string (nullable = true)
|-- seller_id: string (nullable = true)
|-- shipping_limit_date: timestamp (nullable = true)
|-- price: double (nullable = true)
|-- freight_value: double (nullable = true)

+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|00010242fe8c5a6d1…|            1|4244733e06e7ecb49…|48436dade18ac8b2b…|2017-09-19 09:45:35| 58.9|        13.29|
|00018f77f2f0320c5…|            1|e5f2d52b802189ee6…|dd7ddc04e1b6c2c61…|2017-05-03 11:05:13|239.9|        19.93|
|000229ec398224ef6…|            1|c777355d18b72b67a…|5b51032eddd242adc…|2018-01-18 14:48:30|199.0|        17.87|
|00024acbcdf0a6daa…|            1|7634da152a4610f15…|9d7a1d34a50524090…|2018-08-15 10:10:18|12.99|        12.79|
|00042b26cf59d7ce6…|            1|ac6c3623068f30de0…|df560393f3a51e745…|2017-02-13 13:57:51|199.9|        18.14|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
only showing top 5 rows
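inferSchema makes Spark scan the file an extra time to work out the types. One common pattern (a sketch, not part of the original run) is to infer once and reuse the resulting StructType for later reads of the same layout:

# Infer once, then reuse the schema so subsequent reads skip the inference pass
inferred_schema = df2.schema
df_reuse = spark.read.format('csv') \
    .option('header', 'true') \
    .schema(inferred_schema) \
    .load(hdfs_path)
df_reuse.printSchema()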

[5]: # If we don't set the header option

     hdfs_path = '/tmp/input_data/order_items_dataset.csv'

     # option('header', 'false') means Spark assigns default column names:
     # _c0, _c1, _c2, _c3, _c4, _c5, _c6
     df3 = spark.read.format('csv') \
         .option('header', 'false') \
         .option('inferSchema', 'true') \
         .load(hdfs_path)

     df3.printSchema()

     df3.show(5)

root
|-- _c0: string (nullable = true)
|-- _c1: string (nullable = true)
|-- _c2: string (nullable = true)
|-- _c3: string (nullable = true)
|-- _c4: string (nullable = true)
|-- _c5: string (nullable = true)
|-- _c6: string (nullable = true)

+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|                 _c0|          _c1|                 _c2|                 _c3|                _c4|   _c5|          _c6|
+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date| price|freight_value|
|00010242fe8c5a6d1…|            1|4244733e06e7ecb49…|48436dade18ac8b2b…|2017-09-19 09:45:35| 58.90|        13.29|
|00018f77f2f0320c5…|            1|e5f2d52b802189ee6…|dd7ddc04e1b6c2c61…|2017-05-03 11:05:13|239.90|        19.93|
|000229ec398224ef6…|            1|c777355d18b72b67a…|5b51032eddd242adc…|2018-01-18 14:48:30|199.00|        17.87|
|00024acbcdf0a6daa…|            1|7634da152a4610f15…|9d7a1d34a50524090…|2018-08-15 10:10:18| 12.99|        12.79|
+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
only showing top 5 rows
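If the file genuinely has no header row, the default _c0…_c6 names can be replaced in one call with toDF. A minimal sketch using the known column names:

# Replace the default _c0.._c6 names with meaningful ones
df3_named = df3.toDF('order_id', 'order_item_id', 'product_id', 'seller_id',
                     'shipping_limit_date', 'price', 'freight_value')
df3_named.printSchema()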

[6]: #number of partitions after reading from hdfs

print(f'Number of partitions:{df2.rdd.getNumPartitions()}')

df4=df2.repartition(10)

print(f'Number of partitions:{df4.rdd.getNumPartitions()}')

Number of partitions:2
[Stage 9:=============================> (1 + 1) / 2]
Number of partitions:10
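repartition(10) triggers a full shuffle. To reduce the partition count without shuffling, coalesce is the cheaper option; a sketch, not part of the original run:

# repartition(n) shuffles data and can increase or decrease partitions;
# coalesce(n) only merges existing partitions, avoiding a full shuffle
df_coalesced = df4.coalesce(2)
print(f'Number of partitions:{df_coalesced.rdd.getNumPartitions()}')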

[7]: # Get details of partition-related defaults

     spark = SparkSession.builder.getOrCreate()
     sc = spark.sparkContext

     # Get default partition properties
     # Note: spark.sql.shuffle.partitions is the number of partitions used after a
     # DataFrame/SQL shuffle, and spark.default.parallelism is the default parallelism
     # for RDD operations; they are not strict minimum/maximum partition counts.
     min_partitions = sc.getConf().get("spark.sql.shuffle.partitions", "not set")  # default shuffle partitions
     max_partitions = sc.getConf().get("spark.default.parallelism", "not set")     # default parallelism

     print("Minimum partitions (shuffle partitions):", min_partitions)
     print("Maximum partitions (default parallelism):", max_partitions)

Minimum partitions (shuffle partitions): 1000


Maximum partitions (default parallelism): not set
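spark.sql.shuffle.partitions can also be read and adjusted at runtime through spark.conf, which affects subsequent DataFrame/SQL shuffles. A minimal sketch:

# Read and adjust the shuffle partition count at runtime (DataFrame/SQL shuffles only)
print(spark.conf.get("spark.sql.shuffle.partitions"))
spark.conf.set("spark.sql.shuffle.partitions", "200")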

[12]: from pyspark.sql.functions import *

      # Select a single column
      df4.select('order_id').show(5)

      # Select multiple columns
      df4.select('order_id', 'shipping_limit_date').show(5)

      # Alias column names
      df4.select(col('order_id').alias('o_id'),
                 col('shipping_limit_date').alias('Limit_date')).show(5)

+--------------------+
| order_id|
+--------------------+
|6299bb8e855289b41…|
|71fbb9971d84bf97a…|
|74322a01b770c2ea3…|
|a23fc2b3af4f1a48e…|
|747af114bbea56ac1…|
+--------------------+
only showing top 5 rows

+--------------------+-------------------+
| order_id|shipping_limit_date|
+--------------------+-------------------+
|3bbf8f927f288e4a1…|2017-11-09 14:25:38|
|50c40cfcbb6ce3fca…|2018-06-14 09:52:04|
|51c3d73e0e9052253…|2018-02-22 19:15:27|
|183ee0e3ebd4c1c99…|2018-02-07 20:14:08|
|3a1400b5d4dd3082a…|2018-03-27 17:28:20|
+--------------------+-------------------+
only showing top 5 rows

[Stage 34:=============================> (1 + 1) / 2]
+--------------------+-------------------+
| o_id| Limit_date|
+--------------------+-------------------+
|3bbf8f927f288e4a1…|2017-11-09 14:25:38|
|50c40cfcbb6ce3fca…|2018-06-14 09:52:04|
|51c3d73e0e9052253…|2018-02-22 19:15:27|
|183ee0e3ebd4c1c99…|2018-02-07 20:14:08|
|3a1400b5d4dd3082a…|2018-03-27 17:28:20|
+--------------------+-------------------+
only showing top 5 rows
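The same projection and aliasing can also be written with SQL expression strings via selectExpr. A minimal sketch:

# selectExpr accepts SQL expression strings, including aliases
df4.selectExpr('order_id as o_id', 'shipping_limit_date as Limit_date').show(5)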

[14]: # Derive new columns using withColumn
      # year() extracts the year from the timestamp and stores it in a 'year' column;
      # month() does the same for the month
      # df5 has all columns of df4 plus the newly derived ones
      df5 = df4.withColumn("year", year(col("shipping_limit_date"))) \
               .withColumn("month", month(col("shipping_limit_date")))

      df5.select('order_id', 'shipping_limit_date', 'year', 'month').show(5)

[Stage 37:=============================> (1 + 1) / 2]
+--------------------+-------------------+----+-----+
| order_id|shipping_limit_date|year|month|
+--------------------+-------------------+----+-----+
|3bbf8f927f288e4a1…|2017-11-09 14:25:38|2017| 11|
|50c40cfcbb6ce3fca…|2018-06-14 09:52:04|2018| 6|
|51c3d73e0e9052253…|2018-02-22 19:15:27|2018| 2|
|183ee0e3ebd4c1c99…|2018-02-07 20:14:08|2018| 2|
|3a1400b5d4dd3082a…|2018-03-27 17:28:20|2018| 3|
+--------------------+-------------------+----+-----+
only showing top 5 rows
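Other datetime helpers in pyspark.sql.functions follow the same pattern as year() and month(). A short sketch of a few of them:

# dayofmonth, hour and date_format all operate on the same timestamp column
df4.select('shipping_limit_date',
           dayofmonth(col('shipping_limit_date')).alias('day'),
           hour(col('shipping_limit_date')).alias('hour'),
           date_format(col('shipping_limit_date'), 'yyyy-MM').alias('year_month')).show(5)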

[20]: # Renaming existing columns
      df6 = df5.withColumnRenamed('shipping_limit_date', 'shipping_limit_datetime')

      df6.select('order_id', 'shipping_limit_datetime', 'year').show(4)

+--------------------+-----------------------+----+
| order_id|shipping_limit_datetime|year|
+--------------------+-----------------------+----+
|3bbf8f927f288e4a1…| 2017-11-09 14:25:38|2017|
|50c40cfcbb6ce3fca…| 2018-06-14 09:52:04|2018|
|51c3d73e0e9052253…| 2018-02-22 19:15:27|2018|
|183ee0e3ebd4c1c99…| 2018-02-07 20:14:08|2018|
+--------------------+-----------------------+----+
only showing top 4 rows

[33]: # Filter condition
      df6.filter(col('order_id') == '00010242fe8c5a6d1ba2dd792cb16214').show(5)

      # isin: similar to SQL IN, for checking against multiple values
      order_li = ["00042b26cf59d7ce69dfabb4e55b4fd9", "0008288aa423d2a3f00fcb17cd7d8719"]
      df6.filter(col("order_id").isin(order_li)).show(5)

      # Multiple filter conditions combined with &
      df6.filter((col('price') < 50) & (col('freight_value') < 10)).show(5)

      # SQL-style expression
      df6.filter("price in (179.0,35.0)").show(5)

      # Counting rows
      df6.count()

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|00010242fe8c5a6d1…|            1|4244733e06e7ecb49…|48436dade18ac8b2b…|    2017-09-19 09:45:35| 58.9|        13.29|2017|    9|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|00042b26cf59d7ce6…|            1|ac6c3623068f30de0…|df560393f3a51e745…|    2017-02-13 13:57:51|199.9|        18.14|2017|    2|
|0008288aa423d2a3f…|            1|368c6c730842d7801…|1f50f920176fa81da…|    2018-02-21 02:55:52| 49.9|        13.37|2018|    2|
|0008288aa423d2a3f…|            2|368c6c730842d7801…|1f50f920176fa81da…|    2018-02-21 02:55:52| 49.9|        13.37|2018|    2|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|363524b17966c3a64…|            2|43ee88561093499d9…|23613d49c3ac2bd30…|    2018-05-24 22:35:14| 10.9|          3.8|2018|    5|
|1d9609dad08db33f3…|            1|7cc67695a7648efc5…|95e03ca3d4146e401…|    2017-12-11 18:10:31|29.99|          8.9|2017|   12|
|50aff4b82439e01c5…|            1|ec1faa2edc27ce323…|cc419e0650a3c5ba7…|    2017-11-23 21:53:21|29.99|         7.78|2017|   11|
|37ee401157a3a0b28…|            9|d34c07a2d817ac73f…|e7d5b006eb624f130…|    2018-04-19 02:30:52|29.99|         7.39|2018|    4|
|8f5fac100b291e3c7…|            1|0e996644bf2835621…|b4ffb71f0cb1b1c3d…|    2017-12-08 09:13:27| 6.84|         7.78|2017|   12|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
only showing top 5 rows

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|9edba4db56f479798…|            1|25d4f8db663ac141a…|74a9b9bddf14ece02…|    2017-12-01 12:34:33| 35.0|         4.36|2017|   12|
|9c4aacf06d8fee894…|            1|42f33073be6531e8f…|50c361bcf670d16f6…|    2018-02-19 21:07:46| 35.0|         16.6|2018|    2|
|9daeefebc1067e23b…|            1|c1cf541d5b33a4b04…|01fd077212124329b…|    2018-05-04 13:30:35| 35.0|        18.23|2018|    5|
|227fc3d5fef215496…|            1|0f3f3612d3a594da3…|93dc87703c046b603…|    2018-06-05 19:15:17|179.0|         8.44|2018|    6|
|9fa420e862b14f1b9…|            2|eb8c629f70275fd1c…|1025f0e2d44d7041d…|    2018-04-16 04:10:32| 35.0|        12.75|2018|    4|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
only showing top 5 rows

[33]: 112650
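A couple of related filter patterns (a sketch, not executed in the original notebook): negation with ~ and pattern matching with like:

# Rows whose order_id is NOT in the list
df6.filter(~col('order_id').isin(order_li)).show(5)

# Rows whose order_id starts with '0001'
df6.filter(col('order_id').like('0001%')).show(5)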

[36]: # Dropping columns
      df7 = df6.drop('month', 'year')
      df7.show(5)

      # dropDuplicates: pass the columns on which duplicates should be identified
      df7.dropDuplicates(['order_id', 'order_item_id']).show(5)

      # Counting the records in df7
      df7.count()

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+
|823e71d0dc92309fa…|            1|67bf0dde94ca85e84…|4a1f694197d05fe70…|    2018-05-17 22:13:44|34.99|        17.93|
|bde70300820015633…|            1|99a4788cb24856965…|4a3ca9315b744ce9f…|    2017-10-20 19:09:40| 89.9|        16.26|
|71c0d1686c9b55563…|            2|eb6c2ecde53034fc9…|1025f0e2d44d7041d…|    2017-12-01 19:31:45|32.99|        16.11|
|8bdc559b124e47eb7…|            1|42a2c92a0979a949c…|813348c996469b40f…|    2017-11-30 15:56:29| 58.9|        17.12|
|85ff272111b8ca343…|            1|a01c3a8e3ccddf440…|fe2032dab1a61af87…|    2017-07-05 20:25:11|179.0|         9.45|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+
only showing top 5 rows

+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime| price|freight_value|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+
|00018f77f2f0320c5…|            1|e5f2d52b802189ee6…|dd7ddc04e1b6c2c61…|    2017-05-03 11:05:13| 239.9|        19.93|
|000229ec398224ef6…|            1|c777355d18b72b67a…|5b51032eddd242adc…|    2018-01-18 14:48:30| 199.0|        17.87|
|00048cc3ae777c65d…|            1|ef92defde845ab845…|6426d21aca402a131…|    2017-05-23 03:55:27|  21.9|        12.69|
|0005a1a1728c9d785…|            1|310ae3c140ff94b03…|a416b6a846a117243…|    2018-03-26 18:31:29|145.95|        11.65|
|0005f50442cb953dc…|            1|4535b0e1091c278df…|ba143b05f0110f0dc…|    2018-07-06 14:10:56| 53.99|         11.4|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+
only showing top 5 rows

[36]: 112650

[43]: # Get distinct rows
      df7.distinct().show(5)

      # dropDuplicates with no columns specified drops duplicates based on all columns
      df7.dropDuplicates().show(5)

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+
|8533497ec8830f25e…|            1|38273c03eb0f88327…|8b321bb669392f516…|    2018-04-24 04:51:11| 19.9|         7.39|
|71f344dcca43baaa0…|            1|22f80c8069aff9c90…|525e75a6fb1454a23…|    2017-07-04 11:30:17|31.76|         3.26|
|255a21c87a4a96bae…|            1|764292b2b0f73f77a…|bd23da73548133471…|    2017-11-28 22:07:25| 89.9|        11.83|
|4a0b592d4d6082de8…|            3|609c35bf8122d5ab8…|7d456afc660226829…|    2018-07-25 17:31:25|  7.5|         4.79|
|2cd938176e6aaa529…|            1|060e9bdedfae37724…|6560211a19b47992c…|    2018-08-07 20:31:16| 45.0|         7.58|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+
only showing top 5 rows

[Stage 262:============================================> (8 + 2) / 10]

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+
|fb5e4008a881e833a…|            1|c9c6fde711572c1ad…|562fc2f2c2863ab7e…|    2018-02-14 07:35:37|29.99|         14.1|
|b9cb4e1c0f63ec744…|            1|7f7da198305ad6209…|c0563dd588b775f2e…|    2018-03-21 22:15:47|119.9|        16.81|
|ca7dc685e1c6b20f8…|            1|aae1c80508d794a69…|77530e9772f57a62c…|    2017-10-02 11:28:30|129.0|        38.45|
|8533497ec8830f25e…|            1|38273c03eb0f88327…|8b321bb669392f516…|    2018-04-24 04:51:11| 19.9|         7.39|
|71f344dcca43baaa0…|            1|22f80c8069aff9c90…|525e75a6fb1454a23…|    2017-07-04 11:30:17|31.76|         3.26|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+
only showing top 5 rows
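To see which keys dropDuplicates would collapse, a groupBy count works. A minimal sketch:

# Keys that appear more than once on (order_id, order_item_id)
df7.groupBy('order_id', 'order_item_id').count().filter(col('count') > 1).show(5)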

[50]: hdfs_path1 = '/tmp/input_data/Emp_data.csv'

      df8 = spark.read.format('csv') \
          .option('header', 'true') \
          .option('inferSchema', 'true') \
          .load(hdfs_path1)

      df8.printSchema()

      df8.show()

      df8.count()

      df8.distinct().show()

root
|-- 'Emp_Name': string (nullable = true)
|-- 'Dept_Name': string (nullable = true)
|-- 'Role': string (nullable = true)
|-- 'Company': string (nullable = true)
|-- 'Years_Expereince': integer (nullable = true)

+--------------------+-----------+------------------+--------------+------------------+
|          'Emp_Name'|'Dept_Name'|            'Role'|     'Company'|'Years_Expereince'|
+--------------------+-----------+------------------+--------------+------------------+
|'Navinkumar Valla…|'Insurance'|   'ETL Developer'|   'Cognizant'|                 3|
|         'Jay Parab'|   'Claims'|'Python Developer'|'TechMahindra'|                 4|
|     'Suresh Shinde'|'Insurance'|          'Devops'|   'Cognizant'|                12|
|'Navinkumar Valla…|'Insurance'|   'ETL Developer'|   'Cognizant'|                 3|
| 'Mahesh Manjrekar'|      'QEA'|    'Sr.Developer'|     'Mphasis'|                10|
|    'Shubham Racha'|  'Finance'|'Accounts Manager'|      'Quarto'|                 5|
+--------------------+-----------+------------------+--------------+------------------+

+--------------------+-----------+------------------+--------------+------------------+
|          'Emp_Name'|'Dept_Name'|            'Role'|     'Company'|'Years_Expereince'|
+--------------------+-----------+------------------+--------------+------------------+
|    'Shubham Racha'|  'Finance'|'Accounts Manager'|      'Quarto'|                 5|
|     'Suresh Shinde'|'Insurance'|          'Devops'|   'Cognizant'|                12|
|'Navinkumar Valla…|'Insurance'|   'ETL Developer'|   'Cognizant'|                 3|
| 'Mahesh Manjrekar'|      'QEA'|    'Sr.Developer'|     'Mphasis'|                10|
|         'Jay Parab'|   'Claims'|'Python Developer'|'TechMahindra'|                 4|
+--------------------+-----------+------------------+--------------+------------------+
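The stray single quotes appear in both the column names and the values because the file apparently uses ' as its quoting character. The CSV reader can be told that explicitly; a hedged sketch, assuming the file is quoted with single quotes:

# Treat ' as the quote character so it is stripped from names and values
df8_clean = spark.read.format('csv') \
    .option('header', 'true') \
    .option('inferSchema', 'true') \
    .option('quote', "'") \
    .load(hdfs_path1)
df8_clean.printSchema()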

[61]: Emp_df = df8.withColumnRenamed("'Emp_Name'", "Emp_Name") \
          .withColumnRenamed("'Dept_Name'", "Dept_Name") \
          .withColumnRenamed("'Role'", "Role") \
          .withColumnRenamed("'Company'", "Company") \
          .withColumnRenamed("'Years_Expereince'", "Years_Experience")

      Emp_df.show()

+--------------------+-----------+------------------+--------------+----------------+
|            Emp_Name|  Dept_Name|              Role|       Company|Years_Experience|
+--------------------+-----------+------------------+--------------+----------------+
|'Navinkumar Valla…|'Insurance'|   'ETL Developer'|   'Cognizant'|               3|
|         'Jay Parab'|   'Claims'|'Python Developer'|'TechMahindra'|               4|
|     'Suresh Shinde'|'Insurance'|          'Devops'|   'Cognizant'|              12|
|'Navinkumar Valla…|'Insurance'|   'ETL Developer'|   'Cognizant'|               3|
| 'Mahesh Manjrekar'|      'QEA'|    'Sr.Developer'|     'Mphasis'|              10|
|    'Shubham Racha'|  'Finance'|'Accounts Manager'|      'Quarto'|               5|
+--------------------+-----------+------------------+--------------+----------------+
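Renaming fixes only the column names; the values still carry the quotes. One way to strip them (a sketch, not run in the original notebook) is regexp_replace over the string columns:

# Strip the single quotes from the string columns
from pyspark.sql.functions import regexp_replace
Emp_clean = Emp_df
for c in ['Emp_Name', 'Dept_Name', 'Role', 'Company']:
    Emp_clean = Emp_clean.withColumn(c, regexp_replace(col(c), "'", ""))
Emp_clean.show()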

[64]: #order by
Emp_df.orderBy(col('Years_Experience').desc()).show()

+--------------------+-----------+------------------+--------------+----------------+
|            Emp_Name|  Dept_Name|              Role|       Company|Years_Experience|
+--------------------+-----------+------------------+--------------+----------------+
|     'Suresh Shinde'|'Insurance'|          'Devops'|   'Cognizant'|              12|
| 'Mahesh Manjrekar'|      'QEA'|    'Sr.Developer'|     'Mphasis'|              10|
|    'Shubham Racha'|  'Finance'|'Accounts Manager'|      'Quarto'|               5|
|         'Jay Parab'|   'Claims'|'Python Developer'|'TechMahindra'|               4|
|'Navinkumar Valla…|'Insurance'|   'ETL Developer'|   'Cognizant'|               3|
|'Navinkumar Valla…|'Insurance'|   'ETL Developer'|   'Cognizant'|               3|
+--------------------+-----------+------------------+--------------+----------------+

[65]: Emp_df.orderBy(col('Dept_Name').desc(),col('Years_Experience').desc()).show()

+--------------------+-----------+------------------+--------------+----------------+
|            Emp_Name|  Dept_Name|              Role|       Company|Years_Experience|
+--------------------+-----------+------------------+--------------+----------------+
| 'Mahesh Manjrekar'|      'QEA'|    'Sr.Developer'|     'Mphasis'|              10|
|     'Suresh Shinde'|'Insurance'|          'Devops'|   'Cognizant'|              12|
|'Navinkumar Valla…|'Insurance'|   'ETL Developer'|   'Cognizant'|               3|
|'Navinkumar Valla…|'Insurance'|   'ETL Developer'|   'Cognizant'|               3|
|    'Shubham Racha'|  'Finance'|'Accounts Manager'|      'Quarto'|               5|
|         'Jay Parab'|   'Claims'|'Python Developer'|'TechMahindra'|               4|
+--------------------+-----------+------------------+--------------+----------------+
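orderBy also accepts column names together with an ascending flag, which some find easier to read. A minimal sketch equivalent to the call above:

# Same ordering expressed with the ascending parameter
Emp_df.orderBy(['Dept_Name', 'Years_Experience'], ascending=[False, False]).show()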

[74]: # GroupBy operation
      # .agg allows multiple aggregations in a single expression
      df6.groupBy('year', 'month').agg(count('*').alias('total_count'),
                                       avg('price').alias('avg_price'),
                                       sum('price').alias('sum_price'),
                                       min('price').alias('min_price'),
                                       max('price').alias('max_price')) \
         .orderBy(col('year'), col('month').desc()).show(6)

      Year_li = [2017, 2018]

      df6.filter(col('year').isin(Year_li)).groupBy('year') \
         .agg(count('*').alias('total_count'),
              avg('price').alias('avg_price'),
              sum('price').alias('sum_price'),
              min('price').alias('min_price'),
              max('price').alias('max_price')) \
         .orderBy(col('year')).show(6)

+----+-----+-----------+------------------+------------------+---------+---------+
|year|month|total_count|         avg_price|         sum_price|min_price|max_price|
+----+-----+-----------+------------------+------------------+---------+---------+
|2016|   12|          1|              10.9|              10.9|     10.9|     10.9|
|2016|   10|        365|135.83712328767123|49580.549999999996|      6.0|   1399.0|
|2016|    9|          4| 48.61750000000001|194.47000000000003|    44.99|     59.5|
|2017|   12|       7726|116.35011390111308| 898920.9799999996|      4.4|   3124.0|
|2017|   11|       7355|120.10219306594144| 883351.6299999993|     3.85|   2990.0|
|2017|   10|       5189|126.81060512622881| 658020.2300000013|      4.5|  2999.99|
+----+-----+-----------+------------------+------------------+---------+---------+
only showing top 6 rows

[Stage 340:============================> (1 + 1) / 2]
+----+-----------+------------------+-----------------+---------+---------+
|year|total_count|         avg_price|        sum_price|min_price|max_price|
+----+-----------+------------------+-----------------+---------+---------+
|2017|      49765|121.26732804179923|6034868.580000139|      1.2|   6735.0|
|2018|      62511|120.08515685239732|7506643.240000209|     0.85|   6729.0|
+----+-----------+------------------+-----------------+---------+---------+
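Closely related to groupBy().agg() is pivot(), which turns the values of one grouping column into output columns. A sketch, not run in the original notebook:

# Average price per year, with one column per month
df6.groupBy('year').pivot('month').agg(avg('price')).orderBy('year').show()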

[82]: df6.agg(sum('price').alias('sum_price')).show()

[Stage 366:==================================================> (9 + 1) / 10]


+-------------------+
| sum_price|
+-------------------+
|1.359164369999942E7|
+-------------------+

[80]: # Accumulators
      accum = spark.sparkContext.accumulator(0)

      # foreach is an action; the accumulator is updated on the executors
      df6.foreach(lambda row: accum.add(row['price']))

      print(accum.value)

[Stage 363:==================================================> (9 + 1) / 10]


13591643.699999437
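The accumulator total matches the agg() sum above, but accumulator updates can be double-counted if a task or stage is retried, so for an exact figure the DataFrame aggregation is the safer pattern; accumulators are better suited to side-channel counters. A sketch comparing the two:

# Prefer an aggregation for exact results; use the accumulator only as a rough counter
total_from_agg = df6.agg(sum('price')).collect()[0][0]
print(total_from_agg, accum.value)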

[85]: # CASE WHEN statements
      df6.withColumn("price_category", when(col('price') >= 100, "High")
                                       .when((col('price') < 100) & (col('price') >= 50), 'Medium')
                                       .otherwise("Low")).show(5)

[Stage 370:============================> (1 + 1) / 2]
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+--------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime| price|freight_value|year|month|price_category|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+--------------+
|1e1bb536916a99649…|            2|0288f8dd74b931b4e…|1da3aeb70d7989d1e…|    2017-09-05 12:10:11| 49.99|        21.15|2017|    9|           Low|
|62a0e822dd605871a…|            1|31dbb0d1815bdc83c…|6da1992f915d77be9…|    2017-06-08 11:50:18|  29.0|        15.79|2017|    6|           Low|
|025c72e88fbf2358b…|            2|bef21943bc2335188…|e49c26c3edfa46d22…|    2017-03-21 21:24:27|  19.9|         20.8|2017|    3|           Low|
|23d16dddab46fd3d0…|            1|cca8e09ba6f2d35e4…|43f8c9950d11ecd03…|    2018-01-31 22:17:51|109.99|        14.52|2018|    1|          High|
|71c0d1686c9b55563…|            2|eb6c2ecde53034fc9…|1025f0e2d44d7041d…|    2017-12-01 19:31:45| 32.99|        16.11|2017|   12|           Low|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+--------------+
only showing top 5 rows
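The same logic can be written as a SQL CASE expression with expr(), which some teams find easier to review. A minimal sketch:

# Equivalent CASE WHEN written as a SQL expression
df6.withColumn("price_category",
               expr("CASE WHEN price >= 100 THEN 'High' "
                    "WHEN price >= 50 THEN 'Medium' "
                    "ELSE 'Low' END")).show(5)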
