Data Cleaning Python code:
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
glueContext = GlueContext(SparkContext.getOrCreate())
# Data Catalog: database and table name
db_name = "Redshift"
tbl_name = "Sales_report"
# S3 location for output
output_dir = "s3://glue-sample-target/output-dir/sales_report"
# Read data into a DynamicFrame using the Data Catalog metadata
sales_dyf = glueContext.create_dynamic_frame.from_catalog(database = db_name, table_name = tbl_name)
# The `provider id` field is a choice type: long in some records, string in others
# Cast the choices to long; values that cannot be cast become null
sales_res = sales_dyf.resolveChoice(specs = [('provider id','cast:long')])
# Remove erroneous records
sales_df = sales_res.toDF()
sales_df = sales_df.where("`provider id` is NOT NULL")
# Apply a lambda to remove the '$'
chop_f = udf(lambda x: x[1:], StringType())
sales_df = sales_df.withColumn("ACC", chop_f(sales_df["average covered charges"])).withColumn("ATP",
chop_f(sales_df["average total payments"])).withColumn("AMP", chop_f(sales_df["average sales payments"]))
# Turn it back to a dynamic frame
sales_tmp = DynamicFrame.fromDF(sales_df, glueContext, "nested")
# Rename, cast, and nest with apply_mapping
sales_nest = sales_tmp.apply_mapping([('drg definition', 'string', 'drg', 'string'),
    ('id', 'long', 'provider.id', 'long'),
    ('name', 'string', 'provider.name', 'string'),
    ('city', 'string', 'provider.city', 'string'),
    ('state', 'string', 'provider.state', 'string'),
    ('zip code', 'long', 'provider.zip', 'long'),
    ('sales referral region description', 'string', 'rr', 'string'),
    ('ACC', 'string', 'charges.covered', 'double'),
    ('ATP', 'string', 'charges.total_pay', 'double'),
    ('AMP', 'string', 'charges.sales_pay', 'double')])
# Write it out in Parquet
glueContext.write_dynamic_frame.from_options(frame = sales_nest, connection_type = "s3",
    connection_options = {"path": output_dir}, format = "parquet")
Join and relationalize data:
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
import sys
from awsglue.transforms import Join
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
glueContext = GlueContext(SparkContext.getOrCreate())
# catalog: database and table names
db_name = "Redshift"
tbl_persons = "persons_json"
tbl_membership = "memberships_json"
tbl_organization = "organizations_json"
# output S3 and temp directories
output_history_dir = "s3://glue-sample-target/output-dir/sales_history"
output_lg_partitioned_dir = "s3://glue-sample-target/output-dir/legislator_part"
redshift_temp_dir = "s3://glue-sample-target/temp-dir/"
# Create dynamic frames from the source tables
persons = glueContext.create_dynamic_frame.from_catalog(database=db_name, table_name=tbl_persons)
memberships = glueContext.create_dynamic_frame.from_catalog(database=db_name, table_name=tbl_membership)
orgs = glueContext.create_dynamic_frame.from_catalog(database=db_name, table_name=tbl_organization)
# Keep the fields we need and rename some.
orgs = orgs.drop_fields(['other_names', 'identifiers']).rename_field('id', 'org_id').rename_field('name', 'org_name')
# Join the frames to create history
l_history = Join.apply(orgs, Join.apply(persons, memberships, 'id', 'person_id'),
    'org_id', 'organization_id').drop_fields(['person_id', 'org_id'])
# ---- Write out the history ----
# Write out the dynamic frame as Parquet into the "sales_history" directory
print("Writing to /sales_history ...")
glueContext.write_dynamic_frame.from_options(frame = l_history, connection_type = "s3",
    connection_options = {"path": output_history_dir}, format = "parquet")
# Convert to data frame, write to directory "legislator_part", partitioned by (separate) Senate and House.
print("Writing to /legislator_part, partitioned by Senate and House ...")
l_history.toDF().write.parquet(output_lg_partitioned_dir, partitionBy=['org_name'])
# ---- Write out to relational databases ----
# Convert the data to flat tables
print("Converting to flat tables ...")
dfc = l_history.relationalize("hist_root", redshift_temp_dir)
# Cycle through and write to Redshift.
for df_name in dfc.keys():
    m_df = dfc.select(df_name)
    print("Writing to Redshift table: ", df_name, " ...")
    glueContext.write_dynamic_frame.from_jdbc_conf(frame = m_df, catalog_connection = "redshift3",
        connection_options = {"dbtable": df_name, "database": "testdb"}, redshift_tmp_dir = redshift_temp_dir)
Data visualization using Python scripts:
1) Histogram:
import pandas as pd
import matplotlib.pyplot as plt
data = [['E001', 'M', 34, 123, 'Normal', 350],
['E002', 'F', 40, 114, 'Overweight', 450],
['E003', 'F', 37, 135, 'Obesity', 169],
['E004', 'M', 30, 139, 'Underweight', 189],
['E005', 'F', 44, 117, 'Underweight', 183],
['E006', 'M', 36, 121, 'Normal', 80],
['E007', 'M', 32, 133, 'Obesity', 166],
['E008', 'F', 26, 140, 'Normal', 120],
['E009', 'M', 32, 133, 'Normal', 75],
['E010', 'M', 36, 133, 'Underweight', 40] ]
df = pd.DataFrame(data, columns = ['EMPID', 'Gender', 'Age', 'Sales', 'BMI', 'Income'])
df.hist()
plt.show()
Output:
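By default df.hist() draws one histogram per numeric column with 10 bins. To focus on a single column and control the binning (the column choice and bin count here are just illustrative):
# Sketch: histogram of one column with explicit bins
df.hist(column='Income', bins=5)
plt.show()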
2) Column Chart:
data = [['E001', 'M', 34, 123, 'Normal', 350],
['E002', 'F', 40, 114, 'Overweight', 450],
['E003', 'F', 37, 135, 'Obesity', 169],
['E004', 'M', 30, 139, 'Underweight', 189],
['E005', 'F', 44, 117, 'Underweight', 183],
['E006', 'M', 36, 121, 'Normal', 80],
['E007', 'M', 32, 133, 'Obesity', 166],
['E008', 'F', 26, 140, 'Normal', 120],
['E009', 'M', 32, 133, 'Normal', 75],
['E010', 'M', 36, 133, 'Underweight', 40] ]
df = pd.DataFrame(data, columns = ['EMPID', 'Gender', 'Age', 'Sales', 'BMI', 'Income'])
plt.figure()
plt.bar(df['Age'], df['Sales'])
plt.xlabel("Age")
plt.ylabel("Sales")
plt.show()
Output:
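Note that Age values repeat in this data (32 and 36 each appear twice), so plt.bar draws overlapping bars at those positions. Aggregating first avoids this; a sketch using a groupby mean:
# Sketch: average Sales per Age so each bar position is unique
avg_sales = df.groupby('Age')['Sales'].mean()
plt.bar(avg_sales.index, avg_sales.values)
plt.xlabel("Age")
plt.ylabel("Average Sales")
plt.show()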
3) Scatter Plot:
import matplotlib.pyplot as plt
x_axis = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y_axis = [5, 16, 34, 56, 32, 56, 32, 12, 76, 89]
[Link]("Prices over 10 years")
[Link](x_axis, y_axis, color='darkblue', marker='x', label="item 1")
[Link]("Time (years)")
[Link]("Price (dollars)")
[Link](True)
[Link]()
[Link]()
Output:
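To keep a chart as an image file rather than only displaying it, call plt.savefig before plt.show, since the figure may be cleared once the display window is closed (the filename and DPI here are illustrative):
# Sketch: save the scatter plot to disk before displaying it
plt.savefig("prices_scatter.png", dpi=150, bbox_inches="tight")
plt.show()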