Bigdata - Ipynb - Colab
Bigdata - Ipynb - Colab
ipynb - Colab
Collecting pyspark
Downloading pyspark-3.5.2.tar.gz (317.3 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 3.7 MB/s eta 0:00:00
Preparing metadata (setup.py) ... done
Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-p
Building wheels for collected packages: pyspark
Building wheel for pyspark (setup.py) ... done
Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317
Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2
df = spark.read.csv("/content/sample_data/california_housing_test.csv", header=True, in
df.write.csv("/content/sample_data/file.csv", header=True)
+---------+--------+------------------+-----------+--------------+----------+----
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|hous
+---------+--------+------------------+-----------+--------------+----------+----
| -122.05| 37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3| 34.26| 43.0| 1510.0| 310.0| 809.0|
| -117.81| 33.78| 27.0| 3589.0| 507.0| 1484.0|
| -118.36| 33.82| 28.0| 67.0| 15.0| 49.0|
| -119.67| 36.33| 19.0| 1241.0| 244.0| 850.0|
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sha… 1/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
| -119.56| 36.51| 37.0| 1018.0| 213.0| 663.0|
| -121.43| 38.63| 43.0| 1009.0| 225.0| 604.0|
| -120.65| 35.48| 19.0| 2310.0| 471.0| 1341.0|
| -122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
| -118.02| 34.08| 31.0| 2402.0| 632.0| 2830.0|
| -118.24| 33.98| 45.0| 972.0| 249.0| 1288.0|
| -119.12| 35.85| 37.0| 736.0| 166.0| 564.0|
| -121.93| 37.25| 36.0| 1089.0| 182.0| 535.0|
| -117.03| 32.97| 16.0| 3936.0| 694.0| 1935.0|
| -117.97| 33.73| 27.0| 2097.0| 325.0| 1217.0|
| -117.99| 33.81| 42.0| 161.0| 40.0| 157.0|
| -120.81| 37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2| 38.69| 26.0| 3077.0| 607.0| 1603.0|
| -118.88| 34.21| 26.0| 1590.0| 196.0| 654.0|
| -122.59| 38.01| 35.0| 8814.0| 1307.0| 3450.0|
+---------+--------+------------------+-----------+--------------+----------+----
only showing top 20 rows
df.show()
+---------+--------+------------------+-----------+--------------+----------+----
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|hous
+---------+--------+------------------+-----------+--------------+----------+----
| -122.05| 37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3| 34.26| 43.0| 1510.0| 310.0| 809.0|
| -117.81| 33.78| 27.0| 3589.0| 507.0| 1484.0|
| -118.36| 33.82| 28.0| 67.0| 15.0| 49.0|
| -119.67| 36.33| 19.0| 1241.0| 244.0| 850.0|
| -119.56| 36.51| 37.0| 1018.0| 213.0| 663.0|
| -121.43| 38.63| 43.0| 1009.0| 225.0| 604.0|
| -120.65| 35.48| 19.0| 2310.0| 471.0| 1341.0|
| -122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
| -118.02| 34.08| 31.0| 2402.0| 632.0| 2830.0|
| -118.24| 33.98| 45.0| 972.0| 249.0| 1288.0|
| -119.12| 35.85| 37.0| 736.0| 166.0| 564.0|
| -121.93| 37.25| 36.0| 1089.0| 182.0| 535.0|
| -117.03| 32.97| 16.0| 3936.0| 694.0| 1935.0|
| -117.97| 33.73| 27.0| 2097.0| 325.0| 1217.0|
| -117.99| 33.81| 42.0| 161.0| 40.0| 157.0|
| -120.81| 37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2| 38.69| 26.0| 3077.0| 607.0| 1603.0|
| -118.88| 34.21| 26.0| 1590.0| 196.0| 654.0|
| -122.59| 38.01| 35.0| 8814.0| 1307.0| 3450.0|
+---------+--------+------------------+-----------+--------------+----------+----
only showing top 20 rows
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sha… 2/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
df.describe().show()
+-------+-------------------+------------------+------------------+--------------
|summary| longitude| latitude|housing_median_age| total_ro
+-------+-------------------+------------------+------------------+--------------
| count| 3000| 3000| 3000| 3
| mean|-119.58920000000029| 35.63538999999999|28.845333333333333|2599.578666666
| stddev| 1.9949362939550166|2.1296695233438334|12.555395554955757|2155.593331625
| min| -124.18| 32.56| 1.0|
| max| -114.49| 41.92| 52.0| 3045
+-------+-------------------+------------------+------------------+--------------
print(df.dtypes)
df.printSchema()
root
|-- longitude: double (nullable = true)
|-- latitude: double (nullable = true)
|-- housing_median_age: double (nullable = true)
|-- total_rooms: double (nullable = true)
|-- total_bedrooms: double (nullable = true)
|-- population: double (nullable = true)
|-- households: double (nullable = true)
|-- median_income: double (nullable = true)
|-- median_house_value: double (nullable = true)
df.show(5)
+---------+--------+------------------+-----------+--------------+----------+----
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|hous
+---------+--------+------------------+-----------+--------------+----------+----
| -122.05| 37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3| 34.26| 43.0| 1510.0| 310.0| 809.0|
| -117.81| 33.78| 27.0| 3589.0| 507.0| 1484.0|
| -118.36| 33.82| 28.0| 67.0| 15.0| 49.0|
| -119.67| 36.33| 19.0| 1241.0| 244.0| 850.0|
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sha… 3/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
+---------+--------+------------------+-----------+--------------+----------+----
only showing top 5 rows
# Column names of the DataFrame, in schema order.
print(df.columns)
# select() only returns a new DataFrame; call .show() on the result to print rows.
df.select("longitude", "latitude")
# count is a method: it has to be called to get the number of rows.
# Printing the bare attribute would only show the bound-method repr.
print(df.count())
# "value" is a placeholder literal; the recorded output for this filter is an
# empty table, so substitute a real latitude to get matching rows.
df.filter(df["latitude"] == "value").show()
+---------+--------+------------------+-----------+--------------+----------+----
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|hous
+---------+--------+------------------+-----------+--------------+----------+----
+---------+--------+------------------+-----------+--------------+----------+----
df = df.withColumn("train", lit("constant_value"))
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sha… 4/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
# Rename and drop in a single chained expression.
# NOTE(review): "old_column_name"/"new_column_name" look like placeholders; per
# the PySpark docs the rename is a no-op when the source column does not exist.
df = df.withColumnRenamed("old_column_name", "new_column_name").drop("households")
df.show()
+---------+--------+------------------+-----------+--------------+----------+----
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|medi
+---------+--------+------------------+-----------+--------------+----------+----
| -122.05| 37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3| 34.26| 43.0| 1510.0| 310.0| 809.0|
| -117.81| 33.78| 27.0| 3589.0| 507.0| 1484.0|
| -118.36| 33.82| 28.0| 67.0| 15.0| 49.0|
| -119.67| 36.33| 19.0| 1241.0| 244.0| 850.0|
| -119.56| 36.51| 37.0| 1018.0| 213.0| 663.0|
| -121.43| 38.63| 43.0| 1009.0| 225.0| 604.0|
| -120.65| 35.48| 19.0| 2310.0| 471.0| 1341.0|
| -122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
| -118.02| 34.08| 31.0| 2402.0| 632.0| 2830.0|
| -118.24| 33.98| 45.0| 972.0| 249.0| 1288.0|
| -119.12| 35.85| 37.0| 736.0| 166.0| 564.0|
| -121.93| 37.25| 36.0| 1089.0| 182.0| 535.0|
| -117.03| 32.97| 16.0| 3936.0| 694.0| 1935.0|
| -117.97| 33.73| 27.0| 2097.0| 325.0| 1217.0|
| -117.99| 33.81| 42.0| 161.0| 40.0| 157.0|
| -120.81| 37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2| 38.69| 26.0| 3077.0| 607.0| 1603.0|
| -118.88| 34.21| 26.0| 1590.0| 196.0| 654.0|
| -122.59| 38.01| 35.0| 8814.0| 1307.0| 3450.0|
+---------+--------+------------------+-----------+--------------+----------+----
only showing top 20 rows
Q.16 How do you calculate the skewness and kurtosis of a particular column in a
DataFrame?
# Computes both statistics in one pass and prints them as a single-row table.
# NOTE(review): skewness is taken over "longitude" but kurtosis over "latitude";
# pass the same column name to both functions to describe one particular column.
df.select(skewness("longitude"), kurtosis("latitude")).show()
+-------------------+------------------+
|skewness(longitude)|kurtosis(latitude)|
+-------------------+------------------+
|-0.2977086831220861|-1.124498724190226|
+-------------------+------------------+
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sha… 5/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
df = df.withColumnRenamed("long","lang").withColumnRenamed("llat","star")
df.show()
+-------+-----+------------------+-----------+--------------+----------+---------
| lang| star|housing_median_age|total_rooms|total_bedrooms|population|median_in
+-------+-----+------------------+-----------+--------------+----------+---------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0| 6.
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0| 3
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0| 5.
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0| 6.
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0| 2.
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0| 1.
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0| 1.
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0| 3
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0| 3.
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0| 2.
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0| 2.
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0| 2.4
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0| 4
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4.
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0| 5.
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0| 1
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0| 2.
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0| 6.
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0| 6.
+-------+-----+------------------+-----------+--------------+----------+---------
only showing top 20 rows
# One extra record, given positionally in the same order as df.columns.
new_rows = [
    (225, 45, 88, 7945, 2346, 4564, 3.4645, 153742, 100),
]
# Reuse the existing column names; the column types are inferred from the values.
new_df = spark.createDataFrame(data=new_rows, schema=df.columns)
# union() matches columns by position, not by name.
df = df.union(new_df)
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sha… 6/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
only showing top 20 rows
+------------+
|(long = 225)|
+------------+
| false|
| false|
| false|
| false|
| false|
| false|
| false|
| false|
| false|
| false|
| false|
| false|
| false|
| false|
| false|
| false|
| false|
| false|
| false|
| false|
+------------+
only showing top 20 rows
new_rows = [
Row(long=None, llat=34.05, housing_median_age=30.0, total_rooms=2500.0, total_bedr
Row(long=-120.00, llat=None, housing_median_age=20.0, total_rooms=None, total_bedr
]
# Append the Row objects built above; union() pairs columns by position, so the
# Row fields are presumably listed in df's column order (verify against new_rows).
new_df = spark.createDataFrame(new_rows)
df = df.union(new_df)

# Append one record whose every field is null, reusing df's own schema so no
# type inference is needed for the all-None tuple.
schema = df.schema
new_row = [(None,) * len(df.columns)]
new_df = spark.createDataFrame(new_row, schema=schema)
df = df.union(new_df)
df.show()
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sha… 7/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
+-------+-----+------------------+-----------+--------------+----------+---------
| lang| star|housing_median_age|total_rooms|total_bedrooms|population|median_in
+-------+-----+------------------+-----------+--------------+----------+---------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0| 6.
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0| 3
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0| 5.
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0| 6.
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0| 2.
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0| 1.
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0| 1.
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0| 3
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0| 3.
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0| 2.
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0| 2.
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0| 2.4
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0| 4
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4.
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0| 5.
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0| 1
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0| 2.
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0| 6.
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0| 6.
+-------+-----+------------------+-----------+--------------+----------+---------
only showing top 20 rows
# Collect the two columns to the driver as a pandas DataFrame, then draw one
# histogram per column.
income_value_pdf = df.select("median_income", "median_house_value").toPandas()
income_value_pdf.hist()
plt.show()
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sha… 8/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
# Collect the column to the driver as a pandas DataFrame and draw its box plot.
income_pdf = df.select("median_income").toPandas()
income_pdf.boxplot()
plt.show()
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sha… 9/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
# Collect the column to the driver as a pandas DataFrame and hand it to seaborn.
income_pdf = df.select("median_income").toPandas()
sns.violinplot(income_pdf)
plt.show()
data = [
(-122.05, 37.37, 27.0, 3885.0, 661.0, 1537.0, 6.6085, 344700.0, 100),
(-118.3, 34.26, 43.0, 1510.0, 310.0, 809.0, 3.599, 176500.0, 100),
(-117.81, 33.78, 27.0, 3589.0, 507.0, 1484.0, 5.7934, 270500.0, 100),
(-118.36, 33.82, 28.0, 67.0, 15.0, 49.0, 6.1359, 330000.0, 100),
(-119.67, 36.33, 19.0, 1241.0, 244.0, 850.0, 2.9375, 81700.0, 100),
(-119.56, 36.51, 37.0, 1018.0, 213.0, 663.0, 1.6635, 67000.0, 100),
(-121.43, 38.63, 43.0, 1009.0, 225.0, 604.0, 1.6641, 67000.0, 100),
(-120.65, 35.48, 19.0, 2310.0, 471.0, 1341.0, 3.225, 166900.0, 100),
(-122.84, 38.4, 15.0, 3080.0, 617.0, 1446.0, 3.6696, 194400.0, 100),
(-118.02, 34.08, 31.0, 2402.0, 632.0, 2830.0, 2.3333, 164200.0, 100),
(-118.24, 33.98, 45.0, 972.0, 249.0, 1288.0, 2.2054, 125000.0, 100),
(-119.12, 35.85, 37.0, 736.0, 166.0, 564.0, 2.4167, 58300.0, 100),
(-121.93, 37.25, 36.0, 1089.0, 182.0, 535.0, 4.69, 252600.0, 100),
(-117.03, 32.97, 16.0, 3936.0, 694.0, 1935.0, 4.5625, 231200.0, 100),
(-117.97, 33.73, 27.0, 2097.0, 325.0, 1217.0, 5.7121, 222500.0, 100),
(-117.99, 33.81, 42.0, 161.0, 40.0, 157.0, 2.2, 153100.0, 100),
(-120.81, 37.53, 15.0, 570.0, 123.0, 189.0, 1.875, 181300.0, 100),
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 10/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
(-121.2, 38.69, 26.0, 3077.0, 607.0, 1603.0, 2.7174, 137500.0, 100),
(-118.88, 34.21, 26.0, 1590.0, 196.0, 654.0, 6.5851, 300000.0, 100),
(-122.59, 38.01, 35.0, 8814.0, 1307.0, 3450.0, 6.1724, 414300.0, 100)
]
data1 = [
(-122.05, 37.37, 27.0, 3885.0, 661.0, 1537.0, 6.6085, 344700.0, 100),
(-118.3, 34.26, 43.0, 1510.0, 310.0, 809.0, 3.599, 176500.0, 100)
]
df1 = spark.createDataFrame(data1, ["long", "llat", "housing_median_age", "total_rooms
data2 = [
(-117.81, 33.78, 27.0, 3589.0, 507.0, 1484.0, 5.7934, 270500.0, 100),
(-118.36, 33.82, 28.0, 67.0, 15.0, 49.0, 6.1359, 330000.0, 100)
]
df2 = spark.createDataFrame(data2, ["long", "llat", "housing_median_age", "total_rooms
df_combined = df1.union(df2)
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 11/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
df.filter(df["total_rooms"].isNull()).show()
+----+----+------------------+-----------+--------------+------------+-----------
|long|llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_inco
+----+----+------------------+-----------+--------------+------------+-----------
+----+----+------------------+-----------+--------------+------------+-----------
# Drop every row that contains at least one null value
# (DataFrame.dropna is the documented alias of DataFrame.na.drop).
df = df.dropna()
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
# Keep only rows with at least two non-null values
# (DataFrame.dropna is the documented alias of DataFrame.na.drop).
df = df.dropna(thresh=2)
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 12/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
# Per-column replacement values for nulls: -999.0 for the two coordinate
# columns, zero for everything else.
fill_values = {
    "long": -999.0,
    "llat": -999.0,
    "housing_median_age": 0.0,
    "total_rooms": 0.0,
    "total_bedrooms": 0.0,
    "population_2": 0.0,
    "median_income": 0.0,
    "median_house_value": 0.0,
    "value": 0,
}
# DataFrame.fillna is the documented alias of DataFrame.na.fill.
df = df.fillna(fill_values)
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 13/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
# Print the rows in ascending median_income order (sort is an alias of orderBy).
# The sorted result is not assigned back, so df itself keeps its order.
df.sort("median_income").show()
# Print df again for comparison.
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
+-------+-----+------------------+-----------+--------------+------------+-------
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 14/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
# Sort ascending by median_income, breaking ties on median_house_value.
# The sorted result is not assigned back, so df itself keeps its order.
df.orderBy("median_income", "median_house_value").show()
# Print df again for comparison.
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
+-------+-----+------------------+-----------+--------------+------------+-------
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 15/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
# Remove rows that are identical across all columns
# (drop_duplicates is the documented alias of dropDuplicates).
df = df.drop_duplicates()
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
+-------+-----+------------------+-----------+--------------+------------+-------
data = [
(-122.05, 37.37, 27.0, 3885.0, 661.0, 1537.0, 6.6085, 344700.0, 100),
(-118.3, 34.26, 43.0, 1510.0, 310.0, 809.0, 3.599, 176500.0, 100),
(-117.81, 33.78, 27.0, 3589.0, 507.0, 1484.0, 5.7934, 270500.0, 100),
(-118.36, 33.82, 28.0, 67.0, 15.0, 49.0, 6.1359, 330000.0, 100),
(-119.67, 36.33, 19.0, 1241.0, 244.0, 850.0, 2.9375, 81700.0, 100),
(-119.56, 36.51, 37.0, 1018.0, 213.0, 663.0, 1.6635, 67000.0, 100),
(-121.43, 38.63, 43.0, 1009.0, 225.0, 604.0, 1.6641, 67000.0, 100),
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 16/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
(-120.65, 35.48, 19.0, 2310.0, 471.0, 1341.0, 3.225, 166900.0, 100),
(-122.84, 38.4, 15.0, 3080.0, 617.0, 1446.0, 3.6696, 194400.0, 100),
(-118.02, 34.08, 31.0, 2402.0, 632.0, 2830.0, 2.3333, 164200.0, 100),
(-118.24, 33.98, 45.0, 972.0, 249.0, 1288.0, 2.2054, 125000.0, 100),
(-119.12, 35.85, 37.0, 736.0, 166.0, 564.0, 2.4167, 58300.0, 100),
(-121.93, 37.25, 36.0, 1089.0, 182.0, 535.0, 4.69, 252600.0, 100),
(-117.03, 32.97, 16.0, 3936.0, 694.0, 1935.0, 4.5625, 231200.0, 100),
(-117.97, 33.73, 27.0, 2097.0, 325.0, 1217.0, 5.7121, 222500.0, 100),
(-117.99, 33.81, 42.0, 161.0, 40.0, 157.0, 2.2, 153100.0, 100),
(-120.81, 37.53, 15.0, 570.0, 123.0, 189.0, 1.875, 181300.0, 100),
(-121.2, 38.69, 26.0, 3077.0, 607.0, 1603.0, 2.7174, 137500.0, 100),
(-118.88, 34.21, 26.0, 1590.0, 196.0, 654.0, 6.5851, 300000.0, 100),
(-122.59, 38.01, 35.0, 8814.0, 1307.0, 3450.0, 6.1724, 414300.0, 100)
]
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
df.show(5)
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 17/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
+-------+-----+------------------+-----------+--------------+------------+-------
only showing top 5 rows
text_file = spark.read.text("/content/sample_data/file.csv/Sample.txt")
word_counts = text_file.rdd.flatMap(lambda line: line[0].split()).map(lambda word: (wo
word_counts.toDF(["word", "count"]).show()
+--------------------+-----+
| word|count|
+--------------------+-----+
| this| 2|
| is| 1|
| a| 1|
| sample| 1|
| file| 1|
| does| 1|
| not| 1|
| any| 1|
| info| 1|
|xxxxxxxxxxxxxxxxx...| 1|
+--------------------+-----+
df.createOrReplaceTempView("sql")
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 18/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
only showing top 20 rows
df.show(10)
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
+-------+-----+------------------+-----------+--------------+------------+-------
only showing top 10 rows
df.groupBy().sum("value").show()
+----------+
|sum(value)|
+----------+
| 2200|
+----------+
df.select(df.columns[0], df.columns[1]).show()
+-------+-----+
| long| llat|
+-------+-----+
|-122.05|37.37|
| -118.3|34.26|
|-117.81|33.78|
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 19/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
|-118.36|33.82|
|-119.67|36.33|
|-119.56|36.51|
|-121.43|38.63|
|-120.65|35.48|
|-122.84| 38.4|
|-118.02|34.08|
|-118.24|33.98|
|-119.12|35.85|
|-121.93|37.25|
|-117.03|32.97|
|-117.97|33.73|
|-117.99|33.81|
|-120.81|37.53|
| -121.2|38.69|
|-118.88|34.21|
|-122.59|38.01|
+-------+-----+
only showing top 20 rows
df = df.withColumn("new_column", df["value"] * 2)
df = df.drop("new_column")
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
only showing top 20 rows
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 20/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
df = df.replace("", None)
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
only showing top 20 rows
df = df.na.drop()
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 21/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
only showing top 20 rows
df = df.filter(df["median_house_value"].isNotNull())
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
only showing top 20 rows
df = df.filter(df["median_house_value"].isNotNull())
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 22/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
only showing top 20 rows
df = df.na.fill("value")
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
only showing top 20 rows
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 23/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
df = df.na.fill({"median_income": 0})
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
only showing top 20 rows
Q.46 Fill Null Value of a Particular Column with the Mean of That Column:
mean_value = df.select(mean(df["median_house_value"])).collect()[0][0]
df = df.na.fill({"median_house_value": mean_value})
df.show()
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 24/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
only showing top 20 rows
Q.47 Filter Rows Based on a Particular Condition and Count the Number of Rows:
8
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-118.36|33.82| 28.0| 67.0| 15.0| 49.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-121.93|37.25| 36.0| 1089.0| 182.0| 535.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-118.88|34.21| 26.0| 1590.0| 196.0| 654.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
+-------+-----+------------------+-----------+--------------+------------+-------
only showing top 20 rows
Q.48 Filter Rows Based on a Particular Condition and Display Only a Subset of Features:
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 25/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
+-------+------------------+
| long|median_house_value|
+-------+------------------+
|-122.05| 344700.0|
|-117.81| 270500.0|
|-118.36| 330000.0|
|-121.93| 252600.0|
|-117.03| 231200.0|
|-117.97| 222500.0|
|-118.88| 300000.0|
|-122.59| 414300.0|
+-------+------------------+
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
| -118.3|34.26| 43.0| 1510.0| 310.0| 809.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
| 225.0| 45.0| 88.0| 7945.0| 2346.0| 4564.0|
| 225.0| 45.0| 88.0| 7945.0| 2346.0| 4564.0|
+-------+-----+------------------+-----------+--------------+------------+-------
+-------+-----+------------------+-----------+--------------+------------+-------
| long| llat|housing_median_age|total_rooms|total_bedrooms|population_2|median_
+-------+-----+------------------+-----------+--------------+------------+-------
|-122.05|37.37| 27.0| 3885.0| 661.0| 1537.0|
|-117.81|33.78| 27.0| 3589.0| 507.0| 1484.0|
|-119.67|36.33| 19.0| 1241.0| 244.0| 850.0|
|-119.56|36.51| 37.0| 1018.0| 213.0| 663.0|
|-121.43|38.63| 43.0| 1009.0| 225.0| 604.0|
|-120.65|35.48| 19.0| 2310.0| 471.0| 1341.0|
|-122.84| 38.4| 15.0| 3080.0| 617.0| 1446.0|
|-118.02|34.08| 31.0| 2402.0| 632.0| 2830.0|
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 26/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
|-118.24|33.98| 45.0| 972.0| 249.0| 1288.0|
|-119.12|35.85| 37.0| 736.0| 166.0| 564.0|
|-117.03|32.97| 16.0| 3936.0| 694.0| 1935.0| 4
|-117.97|33.73| 27.0| 2097.0| 325.0| 1217.0|
|-117.99|33.81| 42.0| 161.0| 40.0| 157.0|
|-120.81|37.53| 15.0| 570.0| 123.0| 189.0|
| -121.2|38.69| 26.0| 3077.0| 607.0| 1603.0|
|-122.59|38.01| 35.0| 8814.0| 1307.0| 3450.0|
| 225.0| 45.0| 88.0| 7945.0| 2346.0| 4564.0|
| 225.0| 45.0| 88.0| 7945.0| 2346.0| 4564.0|
+-------+-----+------------------+-----------+--------------+------------+-------
data = [
("Joey", "IT", 5000),
("Sheldon", "HR", 6000),
("Lenord", "HR", 5200),
("Penny", "IT", 6200),
("Howard", "Finance", 7000)
]
column=["Name","Department","Salary"]
df=spark.createDataFrame(data,column)
df.show()
+-------+----------+------+
| Name|Department|Salary|
+-------+----------+------+
| Joey| IT| 5000|
|Sheldon| HR| 6000|
| Lenord| HR| 5200|
| Penny| IT| 6200|
| Howard| Finance| 7000|
+-------+----------+------+
+----------+------------+
|Department|Total Salary|
+----------+------------+
| HR| 11200|
| IT| 11200|
| Finance| 7000|
+----------+------------+
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 27/28
11/25/24, 10:32 AM 232010012_bigdata.ipynb - Colab
Q.53 Find Salary of People in a Department with Salary Between t1 and t2:
https://colab.research.google.com/drive/144WgHqwNx7kJBgi37TJeUDzMqDpD5gx3?usp=sh… 28/28