First PySpark
# Hardcoded sample rows: [product name, id, date, timestamp, price].
# NOTE(review): the original listing was truncated by PDF extraction
# (unterminated list, one visible row); all four rows are reconstructed
# from the df.show() output that follows.
data = [
    ["Product A", 1001, datetime.strptime("2023-07-20", "%Y-%m-%d"),
     datetime.strptime("2023-07-20 10:15:30", "%Y-%m-%d %H:%M:%S"), 29.99],
    ["Product B", 1002, datetime.strptime("2023-07-19", "%Y-%m-%d"),
     datetime.strptime("2023-07-19 14:20:45", "%Y-%m-%d %H:%M:%S"), 49.99],
    ["Product C", 1003, datetime.strptime("2023-07-18", "%Y-%m-%d"),
     datetime.strptime("2023-07-18 09:30:15", "%Y-%m-%d %H:%M:%S"), 39.99],
    ["Product D", 1004, datetime.strptime("2023-07-17", "%Y-%m-%d"),
     datetime.strptime("2023-07-17 16:45:00", "%Y-%m-%d %H:%M:%S"), 19.99],
]
# Define Schema
# StructType describes a row of heterogeneous data types — it defines the schema
# StructField('Field_Name', DataType, nullable (accepts a boolean value))
# Explicit schema for the hardcoded product rows: every column nullable.
_product_columns = [
    ("Product", StringType()),
    ("ID", IntegerType()),
    ("Date", DateType()),
    ("Timestamp", TimestampType()),
    ("Price", FloatType()),
]
schema = StructType(
    [StructField(name, dtype, True) for name, dtype in _product_columns]
)
# Create the DataFrame from the hardcoded rows and the explicit schema.
# `spark` is the active SparkSession, created in an earlier (not shown) cell.
df = spark.createDataFrame(data,schema)
1
# Print the column names, data types and nullability of the DataFrame.
df.printSchema()
+---------+----+----------+-------------------+-----+
| Product| ID| Date| Timestamp|Price|
+---------+----+----------+-------------------+-----+
|Product A|1001|2023-07-20|2023-07-20 10:15:30|29.99|
|Product B|1002|2023-07-19|2023-07-19 14:20:45|49.99|
|Product C|1003|2023-07-18|2023-07-18 09:30:15|39.99|
|Product D|1004|2023-07-17|2023-07-17 16:45:00|19.99|
+---------+----+----------+-------------------+-----+
# First read example: do not infer the schema, ignore the header row, and
# provide explicit column names and data types
#Define Schema
# Explicit schema for the order-items CSV: every column nullable.
_order_item_columns = [
    ("order_id", StringType()),
    ("order_item_id", IntegerType()),
    ("product_id", StringType()),
    ("seller_id", StringType()),
    ("shipping_limit_date", TimestampType()),
    ("price", DoubleType()),
    ("freight_value", DoubleType()),
]
schema = StructType(
    [StructField(name, dtype, True) for name, dtype in _order_item_columns]
)
2
# Spark runs on the same cluster as HDFS, so a bare path is resolved against
# hdfs:// automatically -- no need to spell out the full URI.
hdfs_path = '/tmp/input_data/order_items_dataset.csv'
# header='true' keeps the header row out of the data; the explicit schema
# defined above is applied instead of inferring types.
# NOTE(review): the original read statement was lost in PDF extraction; this
# line is reconstructed from the surrounding comment and the printed schema.
df = (spark.read.format('csv')
      .option('header', 'true')
      .schema(schema)
      .load(hdfs_path))
df.printSchema()
df.show(5)
root
|-- order_id: string (nullable = true)
|-- order_item_id: integer (nullable = true)
|-- product_id: string (nullable = true)
|-- seller_id: string (nullable = true)
|-- shipping_limit_date: timestamp (nullable = true)
|-- price: double (nullable = true)
|-- freight_value: double (nullable = true)
[Stage 2:> (0 + 1) / 1]
+--------------------+-------------+--------------------+--------------------+--
-----------------+-----+-------------+
| order_id|order_item_id| product_id|
seller_id|shipping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+--
-----------------+-----+-------------+
|00010242fe8c5a6d1…|
1|4244733e06e7ecb49…|48436dade18ac8b2b…|2017-09-19 09:45:35| 58.9|
13.29|
|00018f77f2f0320c5…|
1|e5f2d52b802189ee6…|dd7ddc04e1b6c2c61…|2017-05-03 11:05:13|239.9|
19.93|
|000229ec398224ef6…|
1|c777355d18b72b67a…|5b51032eddd242adc…|2018-01-18 14:48:30|199.0|
17.87|
|00024acbcdf0a6daa…|
1|7634da152a4610f15…|9d7a1d34a50524090…|2018-08-15 10:10:18|12.99|
12.79|
|00042b26cf59d7ce6…|
1|ac6c3623068f30de0…|df560393f3a51e745…|2017-02-13 13:57:51|199.9|
3
18.14|
+--------------------+-------------+--------------------+--------------------+--
-----------------+-----+-------------+
only showing top 5 rows
hdfs_path = '/tmp/input_data/order_items_dataset.csv'
# Second read: skip the header row and let Spark infer the column types by
# sampling the file (inferSchema='true').
# (Original line was split by a PDF line-wrap -- a trailing '.' followed by a
# continuation marker -- which is a syntax error; rejoined here.)
df2 = (spark.read.format('csv')
       .option('header', 'true')
       .option('inferSchema', 'true')
       .load(hdfs_path))
df2.printSchema()
df2.show(5)
root
|-- order_id: string (nullable = true)
|-- order_item_id: integer (nullable = true)
|-- product_id: string (nullable = true)
|-- seller_id: string (nullable = true)
|-- shipping_limit_date: timestamp (nullable = true)
|-- price: double (nullable = true)
|-- freight_value: double (nullable = true)
+--------------------+-------------+--------------------+--------------------+--
-----------------+-----+-------------+
| order_id|order_item_id| product_id|
seller_id|shipping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+--
-----------------+-----+-------------+
|00010242fe8c5a6d1…|
1|4244733e06e7ecb49…|48436dade18ac8b2b…|2017-09-19 09:45:35| 58.9|
13.29|
|00018f77f2f0320c5…|
1|e5f2d52b802189ee6…|dd7ddc04e1b6c2c61…|2017-05-03 11:05:13|239.9|
19.93|
|000229ec398224ef6…|
1|c777355d18b72b67a…|5b51032eddd242adc…|2018-01-18 14:48:30|199.0|
17.87|
|00024acbcdf0a6daa…|
1|7634da152a4610f15…|9d7a1d34a50524090…|2018-08-15 10:10:18|12.99|
12.79|
|00042b26cf59d7ce6…|
4
1|ac6c3623068f30de0…|df560393f3a51e745…|2017-02-13 13:57:51|199.9|
18.14|
+--------------------+-------------+--------------------+--------------------+--
-----------------+-----+-------------+
only showing top 5 rows
hdfs_path = '/tmp/input_data/order_items_dataset.csv'
# Third read: header='false' treats the header line as data, so every column
# comes back as string (_c0.._c6) -- see the printSchema output below.
# (Original line was split by a PDF line-wrap; rejoined here.)
df3 = (spark.read.format('csv')
       .option('header', 'false')
       .option('inferSchema', 'true')
       .load(hdfs_path))
df3.printSchema()
df3.show(5)
root
|-- _c0: string (nullable = true)
|-- _c1: string (nullable = true)
|-- _c2: string (nullable = true)
|-- _c3: string (nullable = true)
|-- _c4: string (nullable = true)
|-- _c5: string (nullable = true)
|-- _c6: string (nullable = true)
+--------------------+-------------+--------------------+--------------------+--
-----------------+------+-------------+
| _c0| _c1| _c2| _c3|
_c4| _c5| _c6|
+--------------------+-------------+--------------------+--------------------+--
-----------------+------+-------------+
| order_id|order_item_id| product_id|
seller_id|shipping_limit_date| price|freight_value|
|00010242fe8c5a6d1…|
1|4244733e06e7ecb49…|48436dade18ac8b2b…|2017-09-19 09:45:35| 58.90|
13.29|
|00018f77f2f0320c5…|
1|e5f2d52b802189ee6…|dd7ddc04e1b6c2c61…|2017-05-03 11:05:13|239.90|
19.93|
|000229ec398224ef6…|
1|c777355d18b72b67a…|5b51032eddd242adc…|2018-01-18 14:48:30|199.00|
17.87|
5
|00024acbcdf0a6daa…|
1|7634da152a4610f15…|9d7a1d34a50524090…|2018-08-15 10:10:18| 12.99|
12.79|
+--------------------+-------------+--------------------+--------------------+--
-----------------+------+-------------+
only showing top 5 rows
# Inspect the partition count before and after an explicit repartition(10),
# which performs a full shuffle to redistribute the data.
partitions_before = df2.rdd.getNumPartitions()
print(f'Number of partitions:{partitions_before}')
df4 = df2.repartition(10)
partitions_after = df4.rdd.getNumPartitions()
print(f'Number of partitions:{partitions_after}')
Number of partitions:2
[Stage 9:=============================> (1 + 1) / 2]
Number of partitions:10
6
+--------------------+
| order_id|
+--------------------+
|6299bb8e855289b41…|
|71fbb9971d84bf97a…|
|74322a01b770c2ea3…|
|a23fc2b3af4f1a48e…|
|747af114bbea56ac1…|
+--------------------+
only showing top 5 rows
+--------------------+-------------------+
| order_id|shipping_limit_date|
+--------------------+-------------------+
|3bbf8f927f288e4a1…|2017-11-09 14:25:38|
|50c40cfcbb6ce3fca…|2018-06-14 09:52:04|
|51c3d73e0e9052253…|2018-02-22 19:15:27|
|183ee0e3ebd4c1c99…|2018-02-07 20:14:08|
|3a1400b5d4dd3082a…|2018-03-27 17:28:20|
+--------------------+-------------------+
only showing top 5 rows
[Stage 34:=============================> (1 + 1) / 2]
+--------------------+-------------------+
| o_id| Limit_date|
+--------------------+-------------------+
|3bbf8f927f288e4a1…|2017-11-09 14:25:38|
|50c40cfcbb6ce3fca…|2018-06-14 09:52:04|
|51c3d73e0e9052253…|2018-02-22 19:15:27|
|183ee0e3ebd4c1c99…|2018-02-07 20:14:08|
|3a1400b5d4dd3082a…|2018-03-27 17:28:20|
+--------------------+-------------------+
only showing top 5 rows
# df5 has all columns of df4 plus the newly derived ones
# Derive year and month columns from the shipping_limit_date timestamp.
# (Original line was split by a PDF line-wrap -- trailing '.' plus a
# continuation marker -- which is a syntax error; rejoined here.)
df5 = (df4
       .withColumn("year", year(col("shipping_limit_date")))
       .withColumn("month", month(col("shipping_limit_date"))))
df5.select('order_id', 'shipping_limit_date', 'year', 'month').show(5)
7
[Stage 37:=============================> (1 + 1) / 2]
+--------------------+-------------------+----+-----+
| order_id|shipping_limit_date|year|month|
+--------------------+-------------------+----+-----+
|3bbf8f927f288e4a1…|2017-11-09 14:25:38|2017| 11|
|50c40cfcbb6ce3fca…|2018-06-14 09:52:04|2018| 6|
|51c3d73e0e9052253…|2018-02-22 19:15:27|2018| 2|
|183ee0e3ebd4c1c99…|2018-02-07 20:14:08|2018| 2|
|3a1400b5d4dd3082a…|2018-03-27 17:28:20|2018| 3|
+--------------------+-------------------+----+-----+
only showing top 5 rows
df6.select('order_id','shipping_limit_datetime','year').show(4)
+--------------------+-----------------------+----+
| order_id|shipping_limit_datetime|year|
+--------------------+-----------------------+----+
|3bbf8f927f288e4a1…| 2017-11-09 14:25:38|2017|
|50c40cfcbb6ce3fca…| 2018-06-14 09:52:04|2018|
|51c3d73e0e9052253…| 2018-02-22 19:15:27|2018|
|183ee0e3ebd4c1c99…| 2018-02-07 20:14:08|2018|
+--------------------+-----------------------+----+
only showing top 4 rows
# Filter on an exact order_id match using a Column equality expression.
df6.filter(col('order_id')=='00010242fe8c5a6d1ba2dd792cb16214').show(5)
# Filter rows whose order_id appears in the list `order_li`
# (order_li is defined in a cell not visible here).
df6.filter(col("order_id").isin(order_li)).show(5)
8
# Count the number of ROWS in df6 (an action: triggers a Spark job).
# NOTE(review): the original comment said "counting_columns", but count()
# returns the row count (112650 per the output below).
df6.count()
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+----+-----+
| order_id|order_item_id| product_id|
seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+----+-----+
|00010242fe8c5a6d1…| 1|4244733e06e7ecb49…|48436dade18ac8b2b…|
2017-09-19 09:45:35| 58.9| 13.29|2017| 9|
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+----+-----+
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+----+-----+
| order_id|order_item_id| product_id|
seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+----+-----+
|00042b26cf59d7ce6…| 1|ac6c3623068f30de0…|df560393f3a51e745…|
2017-02-13 13:57:51|199.9| 18.14|2017| 2|
|0008288aa423d2a3f…| 1|368c6c730842d7801…|1f50f920176fa81da…|
2018-02-21 02:55:52| 49.9| 13.37|2018| 2|
|0008288aa423d2a3f…| 2|368c6c730842d7801…|1f50f920176fa81da…|
2018-02-21 02:55:52| 49.9| 13.37|2018| 2|
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+----+-----+
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+----+-----+
| order_id|order_item_id| product_id|
seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+----+-----+
|363524b17966c3a64…| 2|43ee88561093499d9…|23613d49c3ac2bd30…|
2018-05-24 22:35:14| 10.9| 3.8|2018| 5|
|1d9609dad08db33f3…| 1|7cc67695a7648efc5…|95e03ca3d4146e401…|
2017-12-11 18:10:31|29.99| 8.9|2017| 12|
|50aff4b82439e01c5…| 1|ec1faa2edc27ce323…|cc419e0650a3c5ba7…|
2017-11-23 21:53:21|29.99| 7.78|2017| 11|
|37ee401157a3a0b28…| 9|d34c07a2d817ac73f…|e7d5b006eb624f130…|
9
2018-04-19 02:30:52|29.99| 7.39|2018| 4|
|8f5fac100b291e3c7…| 1|0e996644bf2835621…|b4ffb71f0cb1b1c3d…|
2017-12-08 09:13:27| 6.84| 7.78|2017| 12|
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+----+-----+
only showing top 5 rows
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+----+-----+
| order_id|order_item_id| product_id|
seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+----+-----+
|9edba4db56f479798…| 1|25d4f8db663ac141a…|74a9b9bddf14ece02…|
2017-12-01 12:34:33| 35.0| 4.36|2017| 12|
|9c4aacf06d8fee894…| 1|42f33073be6531e8f…|50c361bcf670d16f6…|
2018-02-19 21:07:46| 35.0| 16.6|2018| 2|
|9daeefebc1067e23b…| 1|c1cf541d5b33a4b04…|01fd077212124329b…|
2018-05-04 13:30:35| 35.0| 18.23|2018| 5|
|227fc3d5fef215496…| 1|0f3f3612d3a594da3…|93dc87703c046b603…|
2018-06-05 19:15:17|179.0| 8.44|2018| 6|
|9fa420e862b14f1b9…| 2|eb8c629f70275fd1c…|1025f0e2d44d7041d…|
2018-04-16 04:10:32| 35.0| 12.75|2018| 4|
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+----+-----+
only showing top 5 rows
[33]: 112650
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+
| order_id|order_item_id| product_id|
10
seller_id|shipping_limit_datetime|price|freight_value|
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+
|823e71d0dc92309fa…| 1|67bf0dde94ca85e84…|4a1f694197d05fe70…|
2018-05-17 22:13:44|34.99| 17.93|
|bde70300820015633…| 1|99a4788cb24856965…|4a3ca9315b744ce9f…|
2017-10-20 19:09:40| 89.9| 16.26|
|71c0d1686c9b55563…| 2|eb6c2ecde53034fc9…|1025f0e2d44d7041d…|
2017-12-01 19:31:45|32.99| 16.11|
|8bdc559b124e47eb7…| 1|42a2c92a0979a949c…|813348c996469b40f…|
2017-11-30 15:56:29| 58.9| 17.12|
|85ff272111b8ca343…| 1|a01c3a8e3ccddf440…|fe2032dab1a61af87…|
2017-07-05 20:25:11|179.0| 9.45|
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+
only showing top 5 rows
+--------------------+-------------+--------------------+--------------------+--
---------------------+------+-------------+
| order_id|order_item_id| product_id|
seller_id|shipping_limit_datetime| price|freight_value|
+--------------------+-------------+--------------------+--------------------+--
---------------------+------+-------------+
|00018f77f2f0320c5…| 1|e5f2d52b802189ee6…|dd7ddc04e1b6c2c61…|
2017-05-03 11:05:13| 239.9| 19.93|
|000229ec398224ef6…| 1|c777355d18b72b67a…|5b51032eddd242adc…|
2018-01-18 14:48:30| 199.0| 17.87|
|00048cc3ae777c65d…| 1|ef92defde845ab845…|6426d21aca402a131…|
2017-05-23 03:55:27| 21.9| 12.69|
|0005a1a1728c9d785…| 1|310ae3c140ff94b03…|a416b6a846a117243…|
2018-03-26 18:31:29|145.95| 11.65|
|0005f50442cb953dc…| 1|4535b0e1091c278df…|ba143b05f0110f0dc…|
2018-07-06 14:10:56| 53.99| 11.4|
+--------------------+-------------+--------------------+--------------------+--
---------------------+------+-------------+
only showing top 5 rows
[36]: 112650
# Return only distinct rows (duplicates across all columns removed);
# df7 is built in a cell not visible here.
df7.distinct().show(5)
11
# dropDuplicates() with no subset argument considers all columns,
# so it behaves the same as distinct().
df7.dropDuplicates().show(5)
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+
| order_id|order_item_id| product_id|
seller_id|shipping_limit_datetime|price|freight_value|
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+
|8533497ec8830f25e…| 1|38273c03eb0f88327…|8b321bb669392f516…|
2018-04-24 04:51:11| 19.9| 7.39|
|71f344dcca43baaa0…| 1|22f80c8069aff9c90…|525e75a6fb1454a23…|
2017-07-04 11:30:17|31.76| 3.26|
|255a21c87a4a96bae…| 1|764292b2b0f73f77a…|bd23da73548133471…|
2017-11-28 22:07:25| 89.9| 11.83|
|4a0b592d4d6082de8…| 3|609c35bf8122d5ab8…|7d456afc660226829…|
2018-07-25 17:31:25| 7.5| 4.79|
|2cd938176e6aaa529…| 1|060e9bdedfae37724…|6560211a19b47992c…|
2018-08-07 20:31:16| 45.0| 7.58|
+--------------------+-------------+--------------------+--------------------+--
---------------------+-----+-------------+
only showing top 5 rows
12
hdfs_path1 = '/tmp/input_data/Emp_data.csv'
# Read the employee CSV with an inferred schema.
# NOTE(review): the original wrote option('InferSchema','True'); Spark option
# keys and boolean values are case-insensitive, so behavior is unchanged --
# the canonical spelling is used here. The data itself carries stray quotes
# in the header and values (e.g. 'Emp_Name'), which is why every column name
# below is quoted; those are cleaned up by the rename cell further down.
df8 = (spark.read.format('csv')
       .option('header', 'true')
       .option('inferSchema', 'true')
       .load(hdfs_path1))
df8.printSchema()
df8.show()
df8.count()
df8.distinct().show()
root
|-- 'Emp_Name': string (nullable = true)
|-- 'Dept_Name': string (nullable = true)
|-- 'Role': string (nullable = true)
|-- 'Company': string (nullable = true)
|-- 'Years_Expereince': integer (nullable = true)
+--------------------+-----------+------------------+--------------+------------
------+
| 'Emp_Name'|'Dept_Name'| 'Role'|
'Company'|'Years_Expereince'|
+--------------------+-----------+------------------+--------------+------------
------+
|'Navinkumar Valla…|'Insurance'| 'ETL Developer'| 'Cognizant'|
3|
| 'Jay Parab'| 'Claims'|'Python Developer'|'TechMahindra'|
4|
| 'Suresh Shinde'|'Insurance'| 'Devops'| 'Cognizant'|
12|
|'Navinkumar Valla…|'Insurance'| 'ETL Developer'| 'Cognizant'|
3|
| 'Mahesh Manjrekar'| 'QEA'| 'Sr.Developer'| 'Mphasis'|
10|
| 'Shubham Racha'| 'Finance'|'Accounts Manager'| 'Quarto'|
5|
+--------------------+-----------+------------------+--------------+------------
------+
+--------------------+-----------+------------------+--------------+------------
------+
| 'Emp_Name'|'Dept_Name'| 'Role'|
'Company'|'Years_Expereince'|
+--------------------+-----------+------------------+--------------+------------
13
------+
| 'Shubham Racha'| 'Finance'|'Accounts Manager'| 'Quarto'|
5|
| 'Suresh Shinde'|'Insurance'| 'Devops'| 'Cognizant'|
12|
|'Navinkumar Valla…|'Insurance'| 'ETL Developer'| 'Cognizant'|
3|
| 'Mahesh Manjrekar'| 'QEA'| 'Sr.Developer'| 'Mphasis'|
10|
| 'Jay Parab'| 'Claims'|'Python Developer'|'TechMahindra'|
4|
+--------------------+-----------+------------------+--------------+------------
------+
# Strip the stray quotes from the header-derived column names and fix the
# 'Years_Expereince' typo while renaming.
# (Original chain was split across PDF line-wraps -- trailing '.' plus
# continuation markers -- which is a syntax error; rejoined here.)
Emp_df = (df8
          .withColumnRenamed("'Emp_Name'", "Emp_Name")
          .withColumnRenamed("'Dept_Name'", "Dept_Name")
          .withColumnRenamed("'Role'", "Role")
          .withColumnRenamed("'Company'", "Company")
          .withColumnRenamed("'Years_Expereince'", "Years_Experience"))
Emp_df.show()
+--------------------+-----------+------------------+--------------+------------
----+
| Emp_Name| Dept_Name| Role|
Company|Years_Experience|
+--------------------+-----------+------------------+--------------+------------
----+
|'Navinkumar Valla…|'Insurance'| 'ETL Developer'| 'Cognizant'|
3|
| 'Jay Parab'| 'Claims'|'Python Developer'|'TechMahindra'|
4|
| 'Suresh Shinde'|'Insurance'| 'Devops'| 'Cognizant'|
12|
|'Navinkumar Valla…|'Insurance'| 'ETL Developer'| 'Cognizant'|
3|
| 'Mahesh Manjrekar'| 'QEA'| 'Sr.Developer'| 'Mphasis'|
10|
| 'Shubham Racha'| 'Finance'|'Accounts Manager'| 'Quarto'|
5|
+--------------------+-----------+------------------+--------------+------------
----+
# Order employees by experience, most senior first.
# (sort() is an exact alias of orderBy() in PySpark.)
Emp_df.sort(col('Years_Experience').desc()).show()
+--------------------+-----------+------------------+--------------+------------
14
----+
| Emp_Name| Dept_Name| Role|
Company|Years_Experience|
+--------------------+-----------+------------------+--------------+------------
----+
| 'Suresh Shinde'|'Insurance'| 'Devops'| 'Cognizant'|
12|
| 'Mahesh Manjrekar'| 'QEA'| 'Sr.Developer'| 'Mphasis'|
10|
| 'Shubham Racha'| 'Finance'|'Accounts Manager'| 'Quarto'|
5|
| 'Jay Parab'| 'Claims'|'Python Developer'|'TechMahindra'|
4|
|'Navinkumar Valla…|'Insurance'| 'ETL Developer'| 'Cognizant'|
3|
|'Navinkumar Valla…|'Insurance'| 'ETL Developer'| 'Cognizant'|
3|
+--------------------+-----------+------------------+--------------+------------
----+
[65]: Emp_df.orderBy(col('Dept_Name').desc(),col('Years_Experience').desc()).show()
+--------------------+-----------+------------------+--------------+------------
----+
| Emp_Name| Dept_Name| Role|
Company|Years_Experience|
+--------------------+-----------+------------------+--------------+------------
----+
| 'Mahesh Manjrekar'| 'QEA'| 'Sr.Developer'| 'Mphasis'|
10|
| 'Suresh Shinde'|'Insurance'| 'Devops'| 'Cognizant'|
12|
|'Navinkumar Valla…|'Insurance'| 'ETL Developer'| 'Cognizant'|
3|
|'Navinkumar Valla…|'Insurance'| 'ETL Developer'| 'Cognizant'|
3|
| 'Shubham Racha'| 'Finance'|'Accounts Manager'| 'Quarto'|
5|
| 'Jay Parab'| 'Claims'|'Python Developer'|'TechMahindra'|
4|
+--------------------+-----------+------------------+--------------+------------
----+
15
# Per-(year, month) price statistics, newest month first within each year.
# (Original chain was split by PDF line-wraps -- trailing '.' plus
# continuation markers -- which is a syntax error; rejoined here. The
# 'Year'/'Month' spellings in orderBy are normalized to the actual lowercase
# column names; Spark resolves columns case-insensitively by default, so
# behavior is unchanged.)
df6.groupBy('year', 'month').agg(
    count('*').alias('total_count'),
    avg('price').alias('avg_price'),
    sum('price').alias('sum_price'),
    min('price').alias('min_price'),
    max('price').alias('max_price')
).orderBy(col('year'), col('month').desc()).show(6)

# Yearly price statistics restricted to 2017/2018 via isin().
Year_li = [2017, 2018]
df6.filter(col('year').isin(Year_li)).groupBy('year').agg(
    count('*').alias('total_count'),
    avg('price').alias('avg_price'),
    sum('price').alias('sum_price'),
    min('price').alias('min_price'),
    max('price').alias('max_price')
).orderBy(col('year')).show(6)
+----+-----+-----------+------------------+------------------+---------+--------
-+
|year|month|total_count| avg_price|
sum_price|min_price|max_price|
+----+-----+-----------+------------------+------------------+---------+--------
-+
|2016| 12| 1| 10.9| 10.9| 10.9|
10.9|
|2016| 10| 365|135.83712328767123|49580.549999999996| 6.0|
1399.0|
|2016| 9| 4| 48.61750000000001|194.47000000000003| 44.99|
59.5|
|2017| 12| 7726|116.35011390111308| 898920.9799999996| 4.4|
3124.0|
|2017| 11| 7355|120.10219306594144| 883351.6299999993| 3.85|
2990.0|
|2017| 10| 5189|126.81060512622881| 658020.2300000013| 4.5|
2999.99|
+----+-----+-----------+------------------+------------------+---------+--------
-+
only showing top 6 rows
[Stage 340:============================> (1 + 1) / 2]
+----+-----------+------------------+-----------------+---------+---------+
|year|total_count| avg_price| sum_price|min_price|max_price|
+----+-----------+------------------+-----------------+---------+---------+
|2017| 49765|121.26732804179923|6034868.580000139| 1.2| 6735.0|
|2018| 62511|120.08515685239732|7506643.240000209| 0.85| 6729.0|
16
+----+-----------+------------------+-----------------+---------+---------+
[82]: df6.agg(sum('price').alias('sum_price')).show()
# Accumulator demo: sum the price column on the executors.
# foreach() is an action, so each row contributes exactly once and the
# driver-side value is reliable (unlike accumulators updated inside
# transformations, which may be re-evaluated).
accum = spark.sparkContext.accumulator(0)

def _add_price(row):
    accum.add(row['price'])

df6.foreach(_add_price)
print(accum.value)
.otherwise("Low")).show(5)
[Stage 370:============================> (1 + 1) / 2]
+--------------------+-------------+--------------------+--------------------+--
---------------------+------+-------------+----+-----+--------------+
| order_id|order_item_id| product_id|
seller_id|shipping_limit_datetime|
price|freight_value|year|month|price_category|
+--------------------+-------------+--------------------+--------------------+--
---------------------+------+-------------+----+-----+--------------+
|1e1bb536916a99649…| 2|0288f8dd74b931b4e…|1da3aeb70d7989d1e…|
2017-09-05 12:10:11| 49.99| 21.15|2017| 9| Low|
|62a0e822dd605871a…| 1|31dbb0d1815bdc83c…|6da1992f915d77be9…|
17
2017-06-08 11:50:18| 29.0| 15.79|2017| 6| Low|
|025c72e88fbf2358b…| 2|bef21943bc2335188…|e49c26c3edfa46d22…|
2017-03-21 21:24:27| 19.9| 20.8|2017| 3| Low|
|23d16dddab46fd3d0…| 1|cca8e09ba6f2d35e4…|43f8c9950d11ecd03…|
2018-01-31 22:17:51|109.99| 14.52|2018| 1| High|
|71c0d1686c9b55563…| 2|eb6c2ecde53034fc9…|1025f0e2d44d7041d…|
2017-12-01 19:31:45| 32.99| 16.11|2017| 12| Low|
+--------------------+-------------+--------------------+--------------------+--
---------------------+------+-------------+----+-----+--------------+
only showing top 5 rows
18