{
"cells": [
{
"cell_type": "markdown",
"id": "structured-api-overview-title",
"metadata": {},
"source": [
"# Structured API Overview"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9dc7e587-66f5-418b-a6b1-6ec1edf630ab",
"metadata": {},
"outputs": [],
"source": [
"df = spark.read.format(\"json\").load(\"data/flight-data/json/2015-summary.json\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "99ba3b83-5363-46bb-941e-7b80070fac29",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- DEST_COUNTRY_NAME: string (nullable = true)\n",
" |-- ORIGIN_COUNTRY_NAME: string (nullable = true)\n",
" |-- count: long (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "30e1c524-b488-4aeb-a325-6f9145a5930d",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.types import StructField, StructType, StringType, LongType, IntegerType\n",
"\n",
"\n",
"myManualSchema = StructType([\n",
" StructField(\"DEST_COUNTRY_NAME\",StringType(), True),\n",
" StructField(\"ORIGIN_COUNTRY_NAME\",StringType(), True),\n",
" StructField(\"count\",IntegerType(), False, metadata={\"name\":\"darshil\"}),\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6963aa4b-6174-44c6-b44b-c0b3c95e2f5f",
"metadata": {},
"outputs": [],
"source": [
"df = spark.read.format(\"json\").schema(myManualSchema).load(\"data/flight-data/json/2015-summary.json\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "555f0a19-7601-45c7-bf25-31e7a4f25427",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- DEST_COUNTRY_NAME: string (nullable = true)\n",
" |-- ORIGIN_COUNTRY_NAME: string (nullable = true)\n",
" |-- count: integer (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "91cc54b7-525d-42b6-b8a9-0f62a2a56ab2",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import col, column\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "464ad9f6-75e3-4bdf-a094-f91abc1ab06b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "401686d6-6b02-4c17-ae20-68592718e06a",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import Row\n",
"myRow = Row(\"Hello\", None, 1, False)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "7c708d65-b5af-4e2c-8b7a-d50ec6246ab2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"myRow[2]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "aaaa62ba-9c41-4aae-8dda-0cadcabd936d",
"metadata": {},
"outputs": [],
"source": [
"df.createOrReplaceTempView(\"dfTable\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "0b430478-b840-4fe0-9529-8543c51830a9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+\n",
"|DEST_COUNTRY_NAME|\n",
"+-----------------+\n",
"| United States|\n",
"| United States|\n",
"+-----------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.select(\"DEST_COUNTRY_NAME\").show(2)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "e862ac2c-6639-4c3c-8819-a844e25b91af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+-------------------+\n",
"|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|\n",
"+-----------------+-------------------+\n",
"| United States| Romania|\n",
"| United States| Croatia|\n",
"+-----------------+-------------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.select(\"DEST_COUNTRY_NAME\", \"ORIGIN_COUNTRY_NAME\").show(2)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "e3355131-e60c-4fe4-b27c-c755c0d9c224",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+-----------------+-----------------+\n",
"|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|\n",
"+-----------------+-----------------+-----------------+\n",
"| United States| United States| United States|\n",
"| United States| United States| United States|\n",
"+-----------------+-----------------+-----------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.sql.functions import expr, col, column\n",
"df.select(\n",
"\t\t expr(\"DEST_COUNTRY_NAME\"),\n",
"\t\t col(\"DEST_COUNTRY_NAME\"),\n",
"\t\t column(\"DEST_COUNTRY_NAME\")).show(2)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "4ded3839-c8b1-466f-8251-8965c80368cb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+\n",
"| destination|\n",
"+-------------+\n",
"|United States|\n",
"|United States|\n",
"+-------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.select(expr(\"DEST_COUNTRY_NAME AS destination\")).show(2)\n"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "db838014-31ec-41b1-8a06-df13a10f461a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+\n",
"|DEST_COUNTRY_NAME|\n",
"+-----------------+\n",
"| United States|\n",
"| United States|\n",
"+-----------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"# or you can use alias on top of the expr\n",
"df.select(expr(\"DEST_COUNTRY_NAME as destination\").alias(\"DEST_COUNTRY_NAME\")).show(2)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "e1a775c4-9560-4302-a83f-cf262741d5d9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+-----------------+\n",
"|newColumnName|DEST_COUNTRY_NAME|\n",
"+-------------+-----------------+\n",
"|United States| United States|\n",
"|United States| United States|\n",
"+-------------+-----------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.selectExpr(\"DEST_COUNTRY_NAME as newColumnName\", \"DEST_COUNTRY_NAME\").show(2)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "aef4ddca-0d08-44f7-976f-0a0b5e95a1e1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+-------------------+-----+-------------+\n",
"|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|\n",
"+-----------------+-------------------+-----+-------------+\n",
"| United States| Romania| 15| false|\n",
"| United States| Croatia| 1| false|\n",
"+-----------------+-------------------+-----+-------------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.selectExpr(\n",
" \"*\", # all original columns\n",
" \"(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry\")\\\n",
" .show(2)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "2839342c-216a-4ea3-bd59-4b0a18c252b9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-------------------+-----+------------+\n",
"| DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|negative_999|\n",
"+--------------------+-------------------+-----+------------+\n",
"| United States| Romania| 15| -999|\n",
"| United States| Croatia| 1| -999|\n",
"| United States| Ireland| 344| -999|\n",
"| Egypt| United States| 15| -999|\n",
"| United States| India| 62| -999|\n",
"| United States| Singapore| 1| -999|\n",
"| United States| Grenada| 62| -999|\n",
"| Costa Rica| United States| 588| -999|\n",
"| Senegal| United States| 40| -999|\n",
"| Moldova| United States| 1| -999|\n",
"| United States| Sint Maarten| 325| -999|\n",
"| United States| Marshall Islands| 39| -999|\n",
"| Guyana| United States| 64| -999|\n",
"| Malta| United States| 1| -999|\n",
"| Anguilla| United States| 41| -999|\n",
"| Bolivia| United States| 30| -999|\n",
"| United States| Paraguay| 6| -999|\n",
"| Algeria| United States| 4| -999|\n",
"|Turks and Caicos ...| United States| 230| -999|\n",
"| United States| Gibraltar| 1| -999|\n",
"+--------------------+-------------------+-----+------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.sql.functions import lit\n",
"df.select(expr(\"*\"), lit(-999).alias(\"negative_999\")).show(20)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "a186f652-3a3b-4d50-82e6-e81d19f2de89",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+-------------------+-----+---------+\n",
"|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|\n",
"+-----------------+-------------------+-----+---------+\n",
"| United States| Romania| 15| 1|\n",
"| United States| Croatia| 1| 1|\n",
"+-----------------+-------------------+-----+---------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.withColumn(\"numberOne\", lit(1)).show(2)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "2a0ba538-bac6-43af-9ac4-ff8775123d42",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-------------------+-----+\n",
"| dest|ORIGIN_COUNTRY_NAME|count|\n",
"+--------------------+-------------------+-----+\n",
"| United States| Romania| 15|\n",
"| United States| Croatia| 1|\n",
"| United States| Ireland| 344|\n",
"| Egypt| United States| 15|\n",
"| United States| India| 62|\n",
"| United States| Singapore| 1|\n",
"| United States| Grenada| 62|\n",
"| Costa Rica| United States| 588|\n",
"| Senegal| United States| 40|\n",
"| Moldova| United States| 1|\n",
"| United States| Sint Maarten| 325|\n",
"| United States| Marshall Islands| 39|\n",
"| Guyana| United States| 64|\n",
"| Malta| United States| 1|\n",
"| Anguilla| United States| 41|\n",
"| Bolivia| United States| 30|\n",
"| United States| Paraguay| 6|\n",
"| Algeria| United States| 4|\n",
"|Turks and Caicos ...| United States| 230|\n",
"| United States| Gibraltar| 1|\n",
"+--------------------+-------------------+-----+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df.withColumnRenamed(\"DEST_COUNTRY_NAME\", \"dest\").show()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "d4aa0c33-7f9c-414e-a5d1-5ca4e04c0dfb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-------------------+-----+------+\n",
"| DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|count2|\n",
"+--------------------+-------------------+-----+------+\n",
"| United States| Romania| 15| 65|\n",
"| United States| Croatia| 1| 51|\n",
"| United States| Ireland| 344| 394|\n",
"| Egypt| United States| 15| 65|\n",
"| United States| India| 62| 112|\n",
"| United States| Singapore| 1| 51|\n",
"| United States| Grenada| 62| 112|\n",
"| Costa Rica| United States| 588| 638|\n",
"| Senegal| United States| 40| 90|\n",
"| Moldova| United States| 1| 51|\n",
"| United States| Sint Maarten| 325| 375|\n",
"| United States| Marshall Islands| 39| 89|\n",
"| Guyana| United States| 64| 114|\n",
"| Malta| United States| 1| 51|\n",
"| Anguilla| United States| 41| 91|\n",
"| Bolivia| United States| 30| 80|\n",
"| United States| Paraguay| 6| 56|\n",
"| Algeria| United States| 4| 54|\n",
"|Turks and Caicos ...| United States| 230| 280|\n",
"| United States| Gibraltar| 1| 51|\n",
"+--------------------+-------------------+-----+------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df.withColumn(\"count2\", col(\"count\") + 50).show()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "8ac97134-afdd-4ad3-b7bb-da8d12b0f837",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-------------------+-----+\n",
"| DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|\n",
"+--------------------+-------------------+-----+\n",
"| United States| Croatia| 1|\n",
"| United States| Singapore| 1|\n",
"| Moldova| United States| 1|\n",
"| Malta| United States| 1|\n",
"| United States| Gibraltar| 1|\n",
"|Saint Vincent and...| United States| 1|\n",
"| Suriname| United States| 1|\n",
"| United States| Cyprus| 1|\n",
"| Burkina Faso| United States| 1|\n",
"| Djibouti| United States| 1|\n",
"| United States| Estonia| 1|\n",
"| Zambia| United States| 1|\n",
"| Cyprus| United States| 1|\n",
"| United States| Lithuania| 1|\n",
"| United States| Bulgaria| 1|\n",
"| United States| Georgia| 1|\n",
"| United States| Bahrain| 1|\n",
"| Cote d'Ivoire| United States| 1|\n",
"| United States| Papua New Guinea| 1|\n",
"| Kosovo| United States| 1|\n",
"+--------------------+-------------------+-----+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df.filter(col(\"count\") < 2).show()\n"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "04b6d2c8-6f4a-43b7-b75c-483891931290",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------+-------------------+-----+\n",
"|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|\n",
"+-----------------+-------------------+-----+\n",
"| United States| Croatia| 1|\n",
"| United States| Singapore| 1|\n",
"+-----------------+-------------------+-----+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df.where(\"count < 2\").show(2)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "cf38ceb6-8cbc-4cd2-bab8-d78f8adea923",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rdd.getNumPartitions() # 1"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "2cb74aa9-9afe-4edb-9914-b06794f95d70",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: int]"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.repartition(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa9edc7d-9f71-401e-aa7e-a35a9cb24610",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "bab3e5d0-eb46-4167-aa4a-2ef77e10f856",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}