Big Data Analytics: Snapshot of Class Lab and DataCamp Course
Assignment
Snapshot of Class Lab and DataCamp Course
A = sc.parallelize(range(3))
print(A)
A = sc.parallelize(range(4))
print(A)
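Printing an RDD only shows its description; the elements stay distributed until an action such as collect() brings them to the driver. Roughly what the print calls above are expected to show (assuming the usual PySpark shell where sc is already defined):
# print(A) -> an RDD description such as "PythonRDD[...] at RDD at PythonRDD.scala:..."
# A.collect() -> the actual elements, e.g. [0, 1, 2, 3] for range(4)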
Lines = sc.parallelize(['you are my sunshine','my only sunshine','you make me happy'])
print(Lines)
A = sc.parallelize(range(4))
print(A)
L = A.collect()
print(type(L))
print(L)
sc.parallelize(range(4)).collect()
sc.parallelize(range(4)).count()
A = sc.parallelize(range(4))
A.reduce(lambda x,y: x+y)
A = sc.parallelize(range(4)).map(lambda x: (x,x*x))
A.collect()
words = ['this','is','the','best','mac','ever']
wordRDD = sc.parallelize(words)
wordRDD.reduce(lambda w, v: w if len(w) < len(v) else v)
B = sc.parallelize([1,3,5,2])
B.reduce(lambda x,y: x-y)
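Note that reduce() assumes the supplied function is commutative and associative; subtraction is neither, so the value of the expression above depends on how the elements are partitioned. A minimal illustrative sketch (B1 and B2 are names introduced here, and a local SparkContext sc is assumed):
B1 = sc.parallelize([1, 3, 5, 2], 1)     # force a single partition
print(B1.reduce(lambda x, y: x - y))     # ((1-3)-5)-2 = -9
B2 = sc.parallelize([1, 3, 5, 2], 2)     # two partitions, e.g. [1,3] and [5,2]
print(B2.reduce(lambda x, y: x - y))     # partial results -2 and 3 are combined, typically giving -5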
A = sc.parallelize([(1,3),(4,100),(1,-5),(3,2)])
A.reduceByKey(lambda x,y: x*y).collect()
A = sc.parallelize([(1,3),(4,100),(1,-5),(3,2)])
A.countByKey()
A = sc.parallelize([(1,3),(4,100),(1,-5),(3,2)])
A.lookup(3)
A = sc.parallelize([(1,3),(4,100),(1,-5),(3,2)])
A.collectAsMap()
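Because collectAsMap() returns an ordinary Python dict, duplicate keys collapse: key 1 occurs twice in A, so only one of its two values survives. A quick check, assuming the same A as above (m is an illustrative name introduced here):
m = A.collectAsMap()
print(len(m))    # 3 entries rather than 4 pairs
print(m[1])      # either 3 or -5; which value wins should not be relied upon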
words = ['this','is','the','best','mac','ever']
wordRDD = sc.parallelize(words)
def LargerThan(x, y):
    # Return the longer of two words; break ties by lexicographic order
    if len(x) > len(y):
        return x
    elif len(y) > len(x):
        return y
    else:
        return x if x > y else y
wordRDD.reduce(LargerThan)
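Since LargerThan prefers the longer word and breaks ties lexicographically, the reduce above should return 'this'; unlike the subtraction example, this function is associative and commutative, so the result does not depend on partitioning. A quick sanity check without Spark, using the plain Python list defined above:
from functools import reduce
print(reduce(LargerThan, words))    # expected: 'this'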
# Print the version of SparkContext
print("The version of Spark Context in the PySpark shell is", sc.versi
on )
# Print the Python version of SparkContext
Big Data Exercises Snapshots
print("The Python version of Spark Context in the PySpark shell is", s
c.pythonVer )
# Print the master of SparkContext
print("The master of Spark Context in the PySpark shell is", sc.master
)
# Create a Python range of numbers from 1 to 99 (range() excludes the stop value)
numb = range(1, 100)
# Load the list into PySpark
spark_data = sc.parallelize(numb)
# Load a local file into PySpark shell
lines = sc.textFile(file_path)
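file_path here is a variable supplied by the DataCamp exercise environment. To reproduce the step locally, any readable text file works; the path below is only a hypothetical placeholder:
# file_path = "file:///tmp/sample.txt"    # hypothetical local path, adjust as needed
# lines = sc.textFile(file_path)
# print(lines.take(2))                    # peek at the first two lines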
# Print my_list in the console
print("Input list is", my_list)
# Square all numbers in my_list
squared_list_lambda = list(map(lambda x: x ** 2, my_list))
# Print the result of the map function
print("The squared numbers are", squared_list_lambda)
# Print my_list2 in the console
print("Input list is:", my_list2)
# Filter numbers divisible by 10
filtered_list = list(filter(lambda x: (x%10 == 0), my_list2))
# Print the numbers divisible by 10
print("Numbers divisible by 10 are:", filtered_list)
# Create an RDD from a list of words
RDD = sc.parallelize(["Spark", "is", "a", "framework", "for", "Big Dat
a processing"])
# Print out the type of the created object
print("The type of RDD is", type(RDD))
# Print the file_path
print("The file_path is", file_path)
# Create a fileRDD from file_path
fileRDD = sc.textFile(file_path)
# Check the type of fileRDD
print("The file type of fileRDD is", type(fileRDD))
# Check the number of partitions in fileRDD
print("Number of partitions in fileRDD is", fileRDD.getNumPartitions()
)
# Create a fileRDD_part from file_path with 5 partitions
fileRDD_part = sc.textFile(file_path, minPartitions = 5)
# Check the number of partitions in fileRDD_part
print("Number of partitions in fileRDD_part is", fileRDD_part.getNumPa
rtitions())
# Create map() transformation to cube numbers
cubedRDD = numbRDD.map(lambda x: x ** 3)
# Collect the results
numbers_all = cubedRDD.collect()
# Print the numbers from numbers_all
for numb in numbers_all:
    print(numb)
# Filter the fileRDD to select lines with Spark keyword
fileRDD_filter = fileRDD.filter(lambda line: 'Spark' in line)
# How many lines are there in fileRDD?
print("The total number of lines with the keyword Spark is", fileRDD_f
ilter.count() )
# Print the first four lines of fileRDD
for line in fileRDD_filter.take(4):
    print(line)
# Create PairRDD Rdd with key value pairs
Rdd = sc.parallelize([(1,2),(3,4),(3,6),(4,5)])
# Apply reduceByKey() operation on Rdd
Rdd_Reduced = Rdd.reduceByKey(lambda x, y: x + y)
# Iterate over the result and print the output
for num in Rdd_Reduced.collect():
    print("Key {} has {} Counts".format(num[0], num[1]))
# Sort the reduced RDD with the key by descending order
Rdd_Reduced_Sort = Rdd_Reduced.sortByKey(ascending=False)
# Iterate over the result and print the output
for num in Rdd_Reduced_Sort.collect():
print("Key {} has {} Counts".format(num[0], num[1]))
# Transform the rdd with countByKey()
total = Rdd.countByKey()
# What is the type of total?
print("The type of total is", type(total))
# Iterate over the total and print the output
for k, v in total.items():
    print("key", k, "has", v, "counts")
# Create a baseRDD from the file path
baseRDD = sc.textFile(file_path)
# Split the lines of baseRDD into words
splitRDD = baseRDD.flatMap(lambda x: x.split(" "))
# Count the total number of words
print("Total number of words in splitRDD:", splitRDD.count())
# Convert the words to lower case and remove the stop words listed in stop_words
splitRDD_no_stop = splitRDD.filter(lambda x: x.lower() not in stop_words)
# Create a tuple of the word and 1
splitRDD_no_stop_words = splitRDD_no_stop.map(lambda w: (w, 1))
# Count the number of occurrences of each word
resultRDD = splitRDD_no_stop_words.reduceByKey(lambda x, y: x + y)
# Display the first 10 words and their frequencies
for word in resultRDD.take(10):
    print(word)
# Swap the keys and values
resultRDD_swap = resultRDD.map(lambda x: (x[1], x[0]))
# Sort the keys in descending order
resultRDD_swap_sort = resultRDD_swap.sortByKey(ascending=False)
# Show the top 10 most frequent words and their frequencies
for word in resultRDD_swap_sort.take(10):
    print("{} has {} counts".format(word[1], word[0]))
# Create a list of tuples
sample_list = [('Mona',20), ('Jennifer',34),('John',20), ('Jim',26)]
# Create a RDD from the list
rdd = sc.parallelize(sample_list)
# Create a PySpark DataFrame
names_df = spark.createDataFrame(rdd, schema=['Name', 'Age'])
# Check the type of names_df
print("The type of names_df is", type(names_df))
# Create a DataFrame from file_path
people_df = spark.read.csv(file_path, header=True, inferSchema=True)
# Check the type of people_df
print("The type of people_df is", type(people_df))
# Print the first 10 observations
people_df.show(10)
# Count the number of rows
print("There are {} rows in the people_df DataFrame.".format(people_df
.count()))
# Count the number of columns and their names
print("There are {} columns in the people_df DataFrame and their names
are {}".format(len(people_df.columns ), people_df.columns ))
# Select name, sex and date of birth columns
people_df_sub = people_df.select('name', 'sex', 'date of birth')
# Print the first 10 observations from people_df_sub
people_df_sub.show(10)
# Remove duplicate entries from people_df_sub
people_df_sub_nodup = people_df_sub.dropDuplicates()
# Count the number of rows
print("There were {} rows before removing duplicates, and {} rows afte
r removing duplicates".format(people_df_sub.count(), people_df_sub_nod
up.count()))
# Filter people_df to select females
people_df_female = people_df.filter(people_df.sex == "female")
# Filter people_df to select males
people_df_male = people_df.filter(people_df.sex == "male")
# Count the number of rows
print("There are {} rows in the people_df_female DataFrame and {} rows
in the people_df_male DataFrame".format(people_df_female.count(), peop
le_df_male.count()))
# Create a temporary table "people"
people_df.createOrReplaceTempView("people")
# Construct a query to select the names of the people from the temporary table "people"
query = '''SELECT name FROM people'''
# Assign the result of Spark's query to people_df_names
people_df_names = spark.sql(query)
# Print the top 10 names of the people
people_df_names.show(10)
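The same names can be obtained with the DataFrame API instead of SQL; a one-line sketch equivalent to the query above:
people_df.select('name').show(10)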
# Filter the people table to select female sex
people_female_df = spark.sql('SELECT * FROM people WHERE sex=="female"')
# Filter the people table DataFrame to select male sex
people_male_df = spark.sql('SELECT * FROM people WHERE sex=="male"')
# Count the number of rows in both DataFrames
print("There are {} rows in the people_female_df and {} rows in the pe
ople_male_df DataFrames".format(people_female_df.count(), people_male_
df.count()))
# Check the column names of names_df
print("The column names of names_df are", names_df.columns )
# Convert to Pandas DataFrame
df_pandas = names_df.toPandas()
# Create a horizontal bar plot
df_pandas.plot(kind='barh', x='Name', y='Age', colormap='winter_r')
plt.show()
# Load the DataFrame
fifa_df = spark.read.csv(file_path, header=True, inferSchema=True)
# Check the schema of columns
fifa_df.printSchema()
# Show the first 10 observations
fifa_df.show(10)
# Print the total number of rows
print("There are {} rows in the fifa_df DataFrame".format(fifa_df.coun
t()))
# Create a temporary view of fifa_df
fifa_df.createOrReplaceTempView('fifa_df_table')
# Construct the "query"
query = '''SELECT Age FROM fifa_df_table WHERE Nationality == "Germany"'''
# Apply the SQL "query"
fifa_df_germany_age = spark.sql(query)
# Generate basic statistics
fifa_df_germany_age.describe().show()
# Convert fifa_df_germany_age to a Pandas DataFrame
fifa_df_germany_age_pandas = fifa_df_germany_age.toPandas()
# Plot the 'Age' density of Germany Players
fifa_df_germany_age_pandas.plot(kind='density')
plt.show()
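toPandas() pulls every row of the Spark DataFrame onto the driver, so for a large table it is safer to aggregate or sample in Spark first; a hedged sketch (the 0.1 fraction is an arbitrary choice introduced here):
sampled_pandas = fifa_df_germany_age.sample(False, 0.1).toPandas()
sampled_pandas.plot(kind='density')
plt.show()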