Json To Dataframe
Json To Dataframe
{"ID":1,"NAME":"Jourdan","GENDER":"Female","DOB":"2012-01-01","SALARY":82445.63,"NRI":null}
{"ID":2,"NAME":"Alvera","GENDER":"Female","DOB":"2023-08-08","SALARY":75985.14,"NRI":true}
{"ID":3,"NAME":"Chauncey","GENDER":"Male","DOB":"2010-09-17","SALARY":81600.32,"NRI":null}
{"ID":4,"NAME":"Karrie","GENDER":"Female","DOB":"2024-02-28","SALARY":93889.24,"NRI":null}
{"ID":5,"NAME":"Phil","GENDER":"Female","DOB":"2022-06-06","SALARY":99743.67,"NRI":true}
// Print a heading, then render the DataFrame with cell truncation switched off.
println("inputDF:")
inputDF.show(truncate = false)
Output:
inputDF:
+----------+------+---+--------+----+--------+
|DOB |GENDER|ID |NAME |NRI |SALARY |
+----------+------+---+--------+----+--------+
|2012-01-01|Female|1 |Jourdan |null|82445.63|
|2023-08-08|Female|2 |Alvera |true|75985.14|
|2010-09-17|Male |3 |Chauncey|null|81600.32|
|2024-02-28|Female|4 |Karrie |null|93889.24|
|2022-06-06|Female|5 |Phil |true|99743.67|
+----------+------+---+--------+----+--------+
// Label the output, then show the rows with full (untruncated) cell contents.
println("inputDF:")
inputDF.show(truncate = false)
inputDF:
+---+--------+------+----------+--------+----+
|ID |NAME |GENDER|DOB |SALARY |NRI |
+---+--------+------+----------+--------+----+
|1 |Jourdan |Female|2012-01-01|82445.63|null|
|2 |Alvera |Female|2023-08-08|75985.14|true|
|3 |Chauncey|Male |2010-09-17|81600.32|null|
|4 |Karrie |Female|2024-02-28|93889.24|null|
|5 |Phil |Female|2022-06-06|99743.67|true|
+---+--------+------+----------+--------+----+
Sample Data:
[{"ID":1,
"NAME":"Jourdan",
"GENDER":"Female",
"DOB":"2012-01-01",
"SALARY":82445.3232323,
"NRI":null
},
{"ID":2,
"NAME":"Alvera",
"GENDER":"Female",
"DOB":"2023-08-08",
"SALARY":75985.14,
"NRI":true
},
{"ID":3,
"NAME":"Chauncey",
"GENDER":"Male",
"DOB":"2010-09-17",
"SALARY":81600.32,
"NRI":null
},
{"ID":4,
"NAME":"Karrie",
"GENDER":"Female",
"DOB":"2024-02-28",
"SALARY":93889.24,
"NRI":null
},
{"ID":5,
"NAME":"Phil",
"GENDER":"Female",
"DOB":"2022-06-06",
"SALARY":99743.67,
"NRI":true
}]
Code:
// Heading first, then the DataFrame contents; truncate = false keeps long values intact.
println("inputDF:")
inputDF.show(truncate = false)
Output:
inputDF:
+---+--------+------+----------+-------------+----+
|ID |NAME |GENDER|DOB |SALARY |NRI |
+---+--------+------+----------+-------------+----+
|1 |Jourdan |Female|2012-01-01|82445.3232323|null|
|2 |Alvera |Female|2023-08-08|75985.14 |true|
|3 |Chauncey|Male |2010-09-17|81600.32 |null|
|4 |Karrie |Female|2024-02-28|93889.24 |null|
|5 |Phil |Female|2022-06-06|99743.67 |true|
+---+--------+------+----------+-------------+----+
[
{
"ID": 2,
"NAME": "Jane Smith",
"AGE": 35,
"HEIGHT": 5.6,
"WEIGHT": 155.0,
"IS_STUDENT": false,
"DOB": "1987-09-20",
"ADDRESS": {
"STREET": "456 Oak St",
"CITY": "Othertown",
"STATE": "CA",
"ZIPCODE": "54321"
},
"GRADES": [75, 85, 90],
"SALARY": 85000.75,
"IS_MANAGER": true
},
{
"ID": 3,
"NAME": "Alice Johnson",
"AGE": 28,
"HEIGHT": 5.4,
"WEIGHT": 140.0,
"IS_STUDENT": true,
"DOB": "1993-03-10",
"ADDRESS": {
"STREET": "789 Pine St",
"CITY": "Smalltown",
"STATE": "TX",
"ZIPCODE": "67890"
},
"GRADES": [90, 95, 100],
"SALARY": 65000.25,
"IS_MANAGER": false
},
{
"ID": 4,
"NAME": "Robert Brown",
"AGE": 40,
"HEIGHT": 6.0,
"WEIGHT": 180.0,
"IS_STUDENT": false,
"DOB": "1982-12-05",
"ADDRESS": {
"STREET": "101 Elm St",
"CITY": "Villagetown",
"STATE": "IL",
"ZIPCODE": "98765"
},
"GRADES": [80, 85, 90],
"SALARY": 90000.00,
"IS_MANAGER": true
},
{
"ID": 5,
"NAME": "Emily Lee",
"AGE": 25,
"HEIGHT": 5.8,
"WEIGHT": 160.0,
"IS_STUDENT": true,
"DOB": "1996-07-08",
"ADDRESS": {
"STREET": "321 Maple St",
"CITY": "Hometown",
"STATE": "FL",
"ZIPCODE": "54321"
},
"GRADES": [95, 95, 95],
"SALARY": 60000.50,
"IS_MANAGER": false
},
{
"ID": 6,
"NAME": "Michael Davis",
"AGE": 45,
"HEIGHT": 6.2,
"WEIGHT": 190.0,
"IS_STUDENT": false,
"DOB": "1977-11-15",
"ADDRESS": {
"STREET": "567 Cedar St",
"CITY": "Mountainview",
"STATE": "CA",
"ZIPCODE": "12345"
},
"GRADES": [70, 75, 80],
"SALARY": 100000.00,
"IS_MANAGER": true
}
]
Code:
// Print a heading, then select every column and display it without truncating cell values.
println("inputDF:")
inputDF.select("*").show(truncate = false)
Output:
+---+-------------+----------+---+--------+------+------+----------+-------------+------------+-----+------------+-------+
|ID |NAME         |DOB       |AGE|SALARY  |HEIGHT|WEIGHT|IS_MANAGER|GRADES       |CITY        |STATE|STREET      |ZIPCODE|
+---+-------------+----------+---+--------+------+------+----------+-------------+------------+-----+------------+-------+
|2  |Jane Smith   |1987-09-20|35 |85000.75|5.6   |155.0 |true      |[75, 85, 90] |Othertown   |CA   |456 Oak St  |54321  |
|3  |Alice Johnson|1993-03-10|28 |65000.25|5.4   |140.0 |false     |[90, 95, 100]|Smalltown   |TX   |789 Pine St |67890  |
|4  |Robert Brown |1982-12-05|40 |90000.0 |6.0   |180.0 |true      |[80, 85, 90] |Villagetown |IL   |101 Elm St  |98765  |
|5  |Emily Lee    |1996-07-08|25 |60000.5 |5.8   |160.0 |false     |[95, 95, 95] |Hometown    |FL   |321 Maple St|54321  |
|6  |Michael Davis|1977-11-15|45 |100000.0|6.2   |190.0 |true      |[70, 75, 80] |Mountainview|CA   |567 Cedar St|12345  |
+---+-------------+----------+---+--------+------+------+----------+-------------+------------+-----+------------+-------+
{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}
Code:
// Show the DataFrame as loaded; the `false` argument disables truncation of long cell values.
println("inputDF:")
inputDF.show(false)

// The message is built from two single-line literals joined with `+`, because a plain
// double-quoted Scala string cannot contain a raw line break. The embedded \n produces
// the two-line message at runtime.
println("creating a separate row for each element of “batter” array by exploding “batter” column and \n" +
  "Extract the individual elements from the “new_batter” struct")

// NOTE(review): `sampleDF` and its "key" column are not defined in this snippet —
// presumably derived from inputDF with "id" exposed as "key"; confirm against the full program.
// explode emits one row per element of the batters.batter array (aliased new_batter);
// "new_batter.*" then expands that struct's fields (id, type) into top-level columns,
// which are renamed to bat_id / bat_type.
val finalBatDF = sampleDF
  .select(col("key"), explode(col("batters.batter")).alias("new_batter"))
  .select("key", "new_batter.*")
  .withColumnRenamed("id", "bat_id")
  .withColumnRenamed("type", "bat_type")
finalBatDF.show(false)

// NOTE(review): `completeDF` is not defined in the visible snippet — confirm where it is built.
// Shows up to 100 rows without truncating cell values.
completeDF.show(100, false)
Output:
inputDF:
+----+-----+----+----+---------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
|id  |type |name|ppu |batters                                                  |topping                                                                                                                                  |
+----+-----+----+----+---------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
|0001|donut|Cake|0.55|{[{1001, Regular}, {1002, Chocolate}, {1003, Blueberry}]}|[{5001, None}, {5002, Glazed}, {5005, Sugar}, {5007, Powdered Sugar}, {5006, Chocolate with Sprinkles}, {5003, Chocolate}, {5004, Maple}]|
+----+-----+----+----+---------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
creating a separate row for each element of “batter” array by exploding “batter” column and
Extract the individual elements from the “new_batter” struct
+----+------+---------+
|key |bat_id|bat_type |
+----+------+---------+
|0001|1001 |Regular |
|0001|1002 |Chocolate|
|0001|1003 |Blueberry|
+----+------+---------+