RDD Actions
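Actions are the RDD operations that trigger execution of the lineage and either return a result to the driver (reduce, fold, aggregate, takeOrdered) or run purely for their side effects on the executors (foreach, foreachPartition). The examples below walk through each in turn.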
foreach() example
from pyspark import SparkContext

sc = SparkContext("local", "ForEachExample")
rdd = sc.parallelize([1, 2, 3, 4, 5])

def my_function(x):
    print(x)

# Apply my_function to every element; foreach runs for its side effects and returns nothing
rdd.foreach(my_function)
sc.stop()
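Note that foreach runs on the executors, not the driver: in local mode the printed values appear in the console (in no guaranteed order), but on a cluster they end up in the executor logs.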
foreachPartition() example
from pyspark import SparkContext

sc = SparkContext("local", "ForEachPartitionExample")
rdd = sc.parallelize([1, 2, 3, 4, 5], 2)  # Create 2 partitions

def my_partition_function(iterator):
    # Receives an iterator over all elements of one partition
    for x in iterator:
        print(x)

rdd.foreachPartition(my_partition_function)
sc.stop()
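The advantage of foreachPartition over foreach is that per-partition setup cost is paid once per partition rather than once per element. A minimal sketch of the usual pattern, where get_connection() and insert() are hypothetical stand-ins for a real database client:

def save_partition(iterator):
    conn = get_connection()   # hypothetical helper: open one connection per partition
    for record in iterator:
        conn.insert(record)   # hypothetical method: write a single record
    conn.close()              # close once, after the whole partition is processed

rdd.foreachPartition(save_partition)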
fold() example
from pyspark.sql import SparkSession
# Create a Spark session
spark = SparkSession.builder.appName("FoldExample").getOrCreate()
# Create an RDD of numbers
numbers_rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5])
# Define the binary function for multiplication
def multiply(x, y):
    return x * y

# Use the fold action; 1 is the zero value (the identity for multiplication)
product_result = numbers_rdd.fold(1, multiply)
# Print the result
print("Product using fold:", product_result)
# Stop the Spark session
spark.stop()
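The zero value must be the identity for the operation (1 for multiplication, 0 for addition), because fold applies it once inside every partition and once more when merging the partial results. A small sketch of what goes wrong otherwise, assuming sc is an active SparkContext:

rdd = sc.parallelize([1, 2, 3, 4, 5], 2)  # 2 partitions

# Identity zero value: the expected sum
print(rdd.fold(0, lambda x, y: x + y))    # 15

# Non-identity zero value: 10 is folded in once per partition and once in the final merge
print(rdd.fold(10, lambda x, y: x + y))   # 15 + 10 * (2 + 1) = 45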
reduce() example
from pyspark.sql import SparkSession
# Create a Spark session
spark = SparkSession.builder.appName("ReduceExample").getOrCreate()
# Create an RDD of numbers
numbers_rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5])
# Define the binary function for addition
def add(x, y):
    return x + y

# Use the reduce action; the function must be commutative and associative,
# since partial results from different partitions are combined in arbitrary order
sum_result = numbers_rdd.reduce(add)
# Print the result
print("Sum using reduce:", sum_result)
# Stop the Spark session
spark.stop()
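Unlike fold, reduce takes no zero value, so it raises a ValueError on an empty RDD. The function is also often written inline; inside the example above (before spark.stop()), this is equivalent:

sum_result = numbers_rdd.reduce(lambda x, y: x + y)  # inline form of add; returns 15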
aggregate() example
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("AggregateExample").getOrCreate()

# Create an RDD of numbers with 2 partitions
numbers_rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 2)

# Define the zero value: an accumulator for (sum, product)
zero_value = (0, 1)

def seq_op(accumulator, element):
    # Update the accumulator by adding the element to the sum
    # and multiplying it into the product
    return (accumulator[0] + element, accumulator[1] * element)

def comb_op(acc1, acc2):
    # Combine two accumulators by adding their sums and multiplying their products
    return (acc1[0] + acc2[0], acc1[1] * acc2[1])

# Use the aggregate action
(sum_result, product_result) = numbers_rdd.aggregate(zero_value, seq_op, comb_op)

# Print the results
print("Sum:", sum_result)
print("Product:", product_result)

# Stop the Spark session
spark.stop()
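aggregate generalizes fold: seq_op folds elements into the accumulator inside each partition, comb_op merges the per-partition accumulators, and the accumulator type may differ from the element type, which is what lets a single pass return both a sum and a product here.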
takeOrdered() example
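The notes do not include a body for this example; a minimal sketch in the style of the examples above, assuming the same local setup:

from pyspark import SparkContext

sc = SparkContext("local", "TakeOrderedExample")
rdd = sc.parallelize([5, 1, 4, 2, 3])

# The three smallest elements, in ascending order
print(rdd.takeOrdered(3))                     # [1, 2, 3]

# The three largest elements, obtained by negating the sort key
print(rdd.takeOrdered(3, key=lambda x: -x))   # [5, 4, 3]

sc.stop()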
Persistence storage levels
• MEMORY_AND_DISK: cache the RDD in memory and spill partitions to disk if memory is insufficient.
• MEMORY_AND_DISK_SER: cache the RDD in memory as serialized Java objects and spill to disk if memory is insufficient.
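These levels are passed to persist(). A minimal sketch, assuming sc is an active SparkContext; note that PySpark always pickles RDD data, so the _SER variants are mainly a distinction in the JVM (Scala/Java) API:

from pyspark import StorageLevel

rdd = sc.parallelize(range(1000))
rdd.persist(StorageLevel.MEMORY_AND_DISK)  # keep in memory, spill to disk when needed

rdd.count()      # the first action materializes and caches the RDD
rdd.unpersist()  # release the cached data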