0% found this document useful (0 votes)
3 views12 pages

Structured API Overview - Ipynb

The document contains a series of code cells that demonstrate the use of PySpark to read and manipulate flight data from a JSON file. It includes defining a schema, selecting specific columns, and performing various transformations on the data. The output showcases the structure of the data and examples of how to query and display it.

Uploaded by

padmanabhapb96
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF or read online on Scribd
0% found this document useful (0 votes)
3 views12 pages

Structured API Overview - Ipynb

The document contains a series of code cells that demonstrate the use of PySpark to read and manipulate flight data from a JSON file. It includes defining a schema, selecting specific columns, and performing various transformations on the data. The output showcases the structure of the data and examples of how to query and display it.

Uploaded by

padmanabhapb96
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF or read online on Scribd
You are on page 1/ 12

{

"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "9dc7e587-66f5-418b-a6b1-6ec1edf630ab",
"metadata": {},
"outputs": [],
"source": [
"df = spark.read.format(\"json\").load(\"data/flight-data/json/2015-summary.json\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "99ba3b83-5363-46bb-941e-7b80070fac29",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- DEST_COUNTRY_NAME: string (nullable = true)\n",
" |-- ORIGIN_COUNTRY_NAME: string (nullable = true)\n",
" |-- count: long (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "30e1c524-b488-4aeb-a325-6f9145a5930d",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.types import StructField, StructType, StringType, LongType, IntegerType\n",
"\n",
"\n",
"myManualSchema = StructType([\n",
" StructField(\"DEST_COUNTRY_NAME\",StringType(), True),\n",
" StructField(\"ORIGIN_COUNTRY_NAME\",StringType(), True),\n",
" StructField(\"count\",IntegerType(), False, metadata={\"name\":\"darshil\"}),\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6963aa4b-6174-44c6-b44b-c0b3c95e2f5f",
"metadata": {},
"outputs": [],
"source": [
"df = spark.read.format(\"json\").schema(myManualSchema).load(\"data/flight-data/json/2015-summary.json\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "555f0a19-7601-45c7-bf25-31e7a4f25427",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- DEST_COUNTRY_NAME: string (nullable = true)\n",
" |-- ORIGIN_COUNTRY_NAME: string (nullable = true)\n",
" |-- count: integer (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "91cc54b7-525d-42b6-b8a9-0f62a2a56ab2",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import col, column\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "464ad9f6-75e3-4bdf-a094-f91abc1ab06b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "401686d6-6b02-4c17-ae20-68592718e06a",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import Row\n",
"myRow = Row(\"Hello\", None, 1, False)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "7c708d65-b5af-4e2c-8b7a-d50ec6246ab2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"myRow[2]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "aaaa62ba-9c41-4aae-8dda-0cadcabd936d",
"metadata": {},
"outputs": [],
"source": [
"df.createOrReplaceTempView(\"dfTable\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "0b430478-b840-4fe0-9529-8543c51830a9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+\n",
"|DEST_COUNTRY_NAME|\n",
"+-----------------+\n",
"| United States|\n",
"| United States|\n",
"+-----------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.select(\"DEST_COUNTRY_NAME\").show(2)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "e862ac2c-6639-4c3c-8819-a844e25b91af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+-------------------+\n",
"|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|\n",
"+-----------------+-------------------+\n",
"| United States| Romania|\n",
"| United States| Croatia|\n",
"+-----------------+-------------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.select(\"DEST_COUNTRY_NAME\", \"ORIGIN_COUNTRY_NAME\").show(2)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "e3355131-e60c-4fe4-b27c-c755c0d9c224",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+-----------------+-----------------+\n",
"|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|\n",
"+-----------------+-----------------+-----------------+\n",
"| United States| United States| United States|\n",
"| United States| United States| United States|\n",
"+-----------------+-----------------+-----------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.sql.functions import expr, col, column\n",
"df.select(\n",
"\t\t expr(\"DEST_COUNTRY_NAME\"),\n",
"\t\t col(\"DEST_COUNTRY_NAME\"),\n",
"\t\t column(\"DEST_COUNTRY_NAME\")).show(2)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "4ded3839-c8b1-466f-8251-8965c80368cb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+\n",
"| destination|\n",
"+-------------+\n",
"|United States|\n",
"|United States|\n",
"+-------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.select(expr(\"DEST_COUNTRY_NAME AS destination\")).show(2)\n"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "db838014-31ec-41b1-8a06-df13a10f461a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+\n",
"|DEST_COUNTRY_NAME|\n",
"+-----------------+\n",
"| United States|\n",
"| United States|\n",
"+-----------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"#or you can use alias on top the expr \n",
"df.select(expr(\"DEST_COUNTRY_NAME as
destination\").alias(\"DEST_COUNTRY_NAME\")).show(2)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "e1a775c4-9560-4302-a83f-cf262741d5d9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+-----------------+\n",
"|newColumnName|DEST_COUNTRY_NAME|\n",
"+-------------+-----------------+\n",
"|United States| United States|\n",
"|United States| United States|\n",
"+-------------+-----------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.selectExpr(\"DEST_COUNTRY_NAME as newColumnName\", \"DEST_COUNTRY_NAME\").show(2)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "aef4ddca-0d08-44f7-976f-0a0b5e95a1e1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+-------------------+-----+-------------+\n",
"|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|\n",
"+-----------------+-------------------+-----+-------------+\n",
"| United States| Romania| 15| false|\n",
"| United States| Croatia| 1| false|\n",
"+-----------------+-------------------+-----+-------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.selectExpr(\n",
" \"*\", # all original columns\n",
" \"(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry\")\\\n",
" .show(2)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "2839342c-216a-4ea3-bd59-4b0a18c252b9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-------------------+-----+------------+\n",
"| DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|negative_999|\n",
"+--------------------+-------------------+-----+------------+\n",
"| United States| Romania| 15| -999|\n",
"| United States| Croatia| 1| -999|\n",
"| United States| Ireland| 344| -999|\n",
"| Egypt| United States| 15| -999|\n",
"| United States| India| 62| -999|\n",
"| United States| Singapore| 1| -999|\n",
"| United States| Grenada| 62| -999|\n",
"| Costa Rica| United States| 588| -999|\n",
"| Senegal| United States| 40| -999|\n",
"| Moldova| United States| 1| -999|\n",
"| United States| Sint Maarten| 325| -999|\n",
"| United States| Marshall Islands| 39| -999|\n",
"| Guyana| United States| 64| -999|\n",
"| Malta| United States| 1| -999|\n",
"| Anguilla| United States| 41| -999|\n",
"| Bolivia| United States| 30| -999|\n",
"| United States| Paraguay| 6| -999|\n",
"| Algeria| United States| 4| -999|\n",
"|Turks and Caicos ...| United States| 230| -999|\n",
"| United States| Gibraltar| 1| -999|\n",
"+--------------------+-------------------+-----+------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.sql.functions import lit\n",
"df.select(expr(\"*\"), lit(-999).alias(\"negative_999\")).show(20)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "a186f652-3a3b-4d50-82e6-e81d19f2de89",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+-------------------+-----+---------+\n",
"|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|\n",
"+-----------------+-------------------+-----+---------+\n",
"| United States| Romania| 15| 1|\n",
"| United States| Croatia| 1| 1|\n",
"+-----------------+-------------------+-----+---------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.withColumn(\"numberOne\", lit(1)).show(2)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "2a0ba538-bac6-43af-9ac4-ff8775123d42",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-------------------+-----+\n",
"| dest|ORIGIN_COUNTRY_NAME|count|\n",
"+--------------------+-------------------+-----+\n",
"| United States| Romania| 15|\n",
"| United States| Croatia| 1|\n",
"| United States| Ireland| 344|\n",
"| Egypt| United States| 15|\n",
"| United States| India| 62|\n",
"| United States| Singapore| 1|\n",
"| United States| Grenada| 62|\n",
"| Costa Rica| United States| 588|\n",
"| Senegal| United States| 40|\n",
"| Moldova| United States| 1|\n",
"| United States| Sint Maarten| 325|\n",
"| United States| Marshall Islands| 39|\n",
"| Guyana| United States| 64|\n",
"| Malta| United States| 1|\n",
"| Anguilla| United States| 41|\n",
"| Bolivia| United States| 30|\n",
"| United States| Paraguay| 6|\n",
"| Algeria| United States| 4|\n",
"|Turks and Caicos ...| United States| 230|\n",
"| United States| Gibraltar| 1|\n",
"+--------------------+-------------------+-----+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df.withColumnRenamed(\"DEST_COUNTRY_NAME\", \"dest\").show()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "d4aa0c33-7f9c-414e-a5d1-5ca4e04c0dfb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-------------------+-----+------+\n",
"| DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|count2|\n",
"+--------------------+-------------------+-----+------+\n",
"| United States| Romania| 15| 65|\n",
"| United States| Croatia| 1| 51|\n",
"| United States| Ireland| 344| 394|\n",
"| Egypt| United States| 15| 65|\n",
"| United States| India| 62| 112|\n",
"| United States| Singapore| 1| 51|\n",
"| United States| Grenada| 62| 112|\n",
"| Costa Rica| United States| 588| 638|\n",
"| Senegal| United States| 40| 90|\n",
"| Moldova| United States| 1| 51|\n",
"| United States| Sint Maarten| 325| 375|\n",
"| United States| Marshall Islands| 39| 89|\n",
"| Guyana| United States| 64| 114|\n",
"| Malta| United States| 1| 51|\n",
"| Anguilla| United States| 41| 91|\n",
"| Bolivia| United States| 30| 80|\n",
"| United States| Paraguay| 6| 56|\n",
"| Algeria| United States| 4| 54|\n",
"|Turks and Caicos ...| United States| 230| 280|\n",
"| United States| Gibraltar| 1| 51|\n",
"+--------------------+-------------------+-----+------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df.withColumn(\"count2\", col(\"count\") + 50).show()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "8ac97134-afdd-4ad3-b7bb-da8d12b0f837",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-------------------+-----+\n",
"| DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|\n",
"+--------------------+-------------------+-----+\n",
"| United States| Croatia| 1|\n",
"| United States| Singapore| 1|\n",
"| Moldova| United States| 1|\n",
"| Malta| United States| 1|\n",
"| United States| Gibraltar| 1|\n",
"|Saint Vincent and...| United States| 1|\n",
"| Suriname| United States| 1|\n",
"| United States| Cyprus| 1|\n",
"| Burkina Faso| United States| 1|\n",
"| Djibouti| United States| 1|\n",
"| United States| Estonia| 1|\n",
"| Zambia| United States| 1|\n",
"| Cyprus| United States| 1|\n",
"| United States| Lithuania| 1|\n",
"| United States| Bulgaria| 1|\n",
"| United States| Georgia| 1|\n",
"| United States| Bahrain| 1|\n",
"| Cote d'Ivoire| United States| 1|\n",
"| United States| Papua New Guinea| 1|\n",
"| Kosovo| United States| 1|\n",
"+--------------------+-------------------+-----+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df.filter(col(\"count\") < 2).show()\n"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "04b6d2c8-6f4a-43b7-b75c-483891931290",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+-------------------+-----+\n",
"|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|\n",
"+-----------------+-------------------+-----+\n",
"| United States| Croatia| 1|\n",
"| United States| Singapore| 1|\n",
"+-----------------+-------------------+-----+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.where(\"count < 2\").show(2)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "cf38ceb6-8cbc-4cd2-bab8-d78f8adea923",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rdd.getNumPartitions() # 1"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "2cb74aa9-9afe-4edb-9914-b06794f95d70",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: int]"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.repartition(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa9edc7d-9f71-401e-aa7e-a35a9cb24610",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "bab3e5d0-eb46-4167-aa4a-2ef77e10f856",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

You might also like