Apache Spark
The notebook below shows how to use Apache Spark in Microsoft Fabric
In [2]:
df = spark.read.format("csv").option("header","false").load("Files/orders/2019.csv")
# df now is a Spark DataFrame containing CSV data from "Files/orders/2019.csv".
display(df)
In [3]:
# You can define an explicit schema for the dataframe instead of relying on a header row
from pyspark.sql.types import *
orderSchema = StructType([
    StructField("SalesOrderNumber", StringType()),
    StructField("SalesOrderLineNumber", IntegerType()),
    StructField("OrderDate", DateType()),
    StructField("CustomerName", StringType()),
    StructField("Email", StringType()),
    StructField("Item", StringType()),
    StructField("Quantity", IntegerType()),
    StructField("UnitPrice", FloatType()),
    StructField("Tax", FloatType())
])
df = spark.read.format("csv").schema(orderSchema).load("Files/orders/2019.csv")
display(df)
In [ ]:
Modify the code so that the file path uses a * wildcard to read the sales order data from all of the
files in the orders folder:
In [4]:
from pyspark.sql.types import *
orderSchema = StructType([
    StructField("SalesOrderNumber", StringType()),
    StructField("SalesOrderLineNumber", IntegerType()),
    StructField("OrderDate", DateType()),
    StructField("CustomerName", StringType()),
    StructField("Email", StringType()),
    StructField("Item", StringType()),
    StructField("Quantity", IntegerType()),
    StructField("UnitPrice", FloatType()),
    StructField("Tax", FloatType())
])
df = spark.read.format("csv").schema(orderSchema).load("Files/orders/*.csv")
display(df)
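As an alternative to declaring the schema by hand, you can let Spark infer the column types from the data, at the cost of an extra pass over the files. A minimal sketch, assuming the same headerless CSV files (df_inferred is just an illustrative name):
In [ ]:
# Let Spark infer column types instead of supplying an explicit schema
# (costs an extra read pass); without a header row the columns are
# named _c0, _c1, ... automatically.
df_inferred = spark.read.format("csv") \
    .option("header", "false") \
    .option("inferSchema", "true") \
    .load("Files/orders/*.csv")
display(df_inferred)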
1. Filter
In [5]:
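# Indexing a dataframe with a tuple of column names is shorthand for select()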
customers = df['CustomerName', 'Email']
print(customers.count())
print(customers.distinct().count())
display(customers.distinct())
In [6]:
# Use the where clause
customers = df.select("CustomerName", "Email").where(df['Item']=='Road-250 Red, 52')
print(customers.count())
print(customers.distinct().count())
display(customers.distinct())
In [7]:
productSales = df.select("Item", "Quantity").groupBy("Item").sum()
display(productSales)
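If you prefer a friendlier column name than sum(Quantity), one option (not part of the original notebook; itemSales is an illustrative name) is to aggregate explicitly with agg and an alias:
In [ ]:
# Same aggregation, but with an explicit alias for the summed column
from pyspark.sql.functions import sum as sum_
itemSales = df.groupBy("Item").agg(sum_("Quantity").alias("TotalQuantity"))
display(itemSales)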
In [9]:
# Aggregate order counts by year
from pyspark.sql.functions import *
yearlySales = df.select(year("OrderDate").alias("Year")).groupBy("Year").count().orderBy("Year")
display(yearlySales)
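The next cell writes a dataframe called transformed_df, which is not defined anywhere in this excerpt. Below is a minimal sketch of a transformation that could produce it, assuming only that Year and Month columns are needed for the partitioned write further down; the actual transformation used in the notebook is not shown.
In [ ]:
# Assumed sketch: derive Year and Month from OrderDate so the data can be
# partitioned by those columns later; the exact transformation is not shown
# in this notebook excerpt.
from pyspark.sql.functions import year, month, col
transformed_df = df.withColumn("Year", year(col("OrderDate"))) \
                   .withColumn("Month", month(col("OrderDate")))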
In [13]:
transformed_df.write.mode("overwrite").parquet('Files/transformed_data/orders')
print ("Transformed data has now been saved! Thanks")
In [14]:
# Load a new dataframe from the parquet files in the transformed_data/orders folder
orders_df = spark.read.format("parquet").load("Files/transformed_data/orders")
display(orders_df)
In [16]:
# Partition the data by Year and Month when writing it out
orders_df.write.partitionBy("Year","Month").mode("overwrite").parquet("Files/partitioned_data")
print("Transformed and partitioned data has been saved!")
In [17]:
# You can load a new dataframe from a specific partition folder, e.g. the 2020 data
orders_2020_df = spark.read.format("parquet").load("Files/partitioned_data/Year=2020/Month=*")
display(orders_2020_df)
In [2]:
# Create a new table
df.write.format("delta").saveAsTable("salesorder")
In [30]:
df = spark.sql("SELECT * FROM Sales_LakeHouse.sales LIMIT 1000")
display(df)
In [25]:
%%sql
SELECT YEAR(OrderDate) AS OrderYear,
SUM((UnitPrice * Quantity) + TaxAmount) AS GrossRevenue
FROM sales
GROUP BY YEAR(OrderDate)
ORDER BY OrderYear
In [40]:
sqlQuery = "SELECT CAST(YEAR(OrderDate) AS CHAR(4)) AS OrderYear, \
SUM((UnitPrice * Quantity) + TaxAmount) AS GrossRevenue \
FROM Sales_LakeHouse.sales \
GROUP BY CAST(YEAR(OrderDate) AS CHAR(4)) \
ORDER BY OrderYear"
In [42]:
df_spark = spark.sql(sqlQuery)
df_spark.show()
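If you want to work with the small aggregated result outside Spark, for example to chart it with a Python plotting library, one option is to convert it to a pandas dataframe; this is a sketch rather than part of the original notebook:
In [ ]:
# Convert the small aggregated result to pandas for local inspection or plotting
df_pandas = df_spark.toPandas()
print(df_pandas.head())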