Import Import As Import As: #Default To CSV

This Python script cleans several CSV files of voter registration and demographic data from 2012 and 2016. It strips the thousands-separator commas from numeric fields, labels each row with a gender group inferred from the order in which the age or education rows appear, and adds derived columns such as the census year and a numeric education level. The cleaned CSVs are written to a separate output folder for later use, and every input file is processed through the same shared routine.

Uploaded by

Nekomancer Davion

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

29 views6 pages

Import Import As Import As: #Default To CSV

Uploaded by

Nekomancer Davion

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 6

C:\Users\SAU1.162\Documents\python\amy_project\cleaner.py
Page 1 of 6 12/17/2020 6:52:32 PM
1
2 import os
3
4 import pandas as pd
5 import numpy as np
6
# Input settings: folder holding the raw CSV files.
folder_in = os.path.join("resources", 'registervoterstats')

# Output settings: cleaned files go into a sub-folder of the input folder.
folder_out = os.path.join(folder_in, "cleaned")
overwrite = True
field_sep = ','  # Default to CSV

# Make sure we have the output folder.  makedirs also creates any missing
# parent folders and, with exist_ok=True, is a no-op when the folder is
# already there.
os.makedirs(folder_out, exist_ok=True)

# Lazy overwrite protection: 'x' makes the write fail if the file exists.
fout_mode = 'w' if overwrite else 'x'
30
class DataExpander(object):
    '''
    Class used to expand tiered indexes to make a proper dataframe.

    Each call to data_expander() receives one cell.  A non-empty cell
    starts a new group; an empty cell repeats the label of the current
    group, so every row ends up carrying a label.

    expanding_list is an optional sequence of replacement labels.  While
    any remain, a non-empty cell takes the next replacement label instead
    of its own title-cased text.
    '''
    # Class-level defaults so both attributes exist on every instance.
    expanding_list = None
    _current_data_expanding = None

    def __init__(self, expanding_list=None):
        # Copy the labels: data_expander() consumes them with pop(0) and
        # must not empty the list object the caller passed in.
        if expanding_list is not None:
            expanding_list = list(expanding_list)
        self.expanding_list = expanding_list

    def data_expander(self, data):
        '''
        Return the label for one cell.

        data is the raw cell text.  Empty text returns the most recent
        label, or None if no label has been seen yet.
        '''
        if data:
            if self.expanding_list:
                self._current_data_expanding = self.expanding_list.pop(0)
            else:
                self._current_data_expanding = data.title()
        return self._current_data_expanding
48
def do_csv(filename, column_dict, rows_set, cleaner_dict, header=5,
           func_list=None, year=None):
    '''
    Function which combines the operations on the file into one function.

    Reads `filename` from the input folder, keeps only the wanted rows and
    columns, renames the columns, runs the extra processing functions and
    writes the result under the same name to the output folder.

    filename     -- name of the CSV file inside folder_in
    column_dict  -- maps source column name -> output column name; only
                    these source columns are read
    rows_set     -- file row indices to keep; every other row is skipped
    cleaner_dict -- maps source column name -> converter callable, passed
                    to pandas as `converters`
    header       -- passed to pandas.read_csv as `header`
    func_list    -- optional callables, each called with the dataframe
                    after the rename
    year         -- if given, add_year() is called with this value
    '''
    f_in = os.path.join(folder_in, filename)

    def skiprow(index):
        # pandas skips a row when this returns True.
        return index not in rows_set

    df = pd.read_csv(f_in, skipinitialspace=True, header=header,
                     usecols=list(column_dict), skiprows=skiprow,
                     converters=cleaner_dict)
    df.rename(columns=column_dict, inplace=True)
    for fun in func_list or ():
        fun(df)
    # Compare against None so that a falsy year value is still recorded.
    if year is not None:
        add_year(df, year)
    csv_out(df, filename)
    print(f'CSV for {filename} has been created.')
68
def csv_out(data_frame, filename):
    '''
    Saves the dataframe as `filename` inside the output folder, without
    the index column, using the module's separator and file-open mode.
    '''
    destination = os.path.join(folder_out, filename)
    data_frame.to_csv(
        destination,
        index=False,
        sep=field_sep,
        mode=fout_mode,
    )
75
def add_genders(data_frame, inplace=True):
    '''
    Infers the gender of each line in a data group from one of the age
    grouping files.

    The "Age" column is read top to bottom.  Whenever an age is lower
    than the one before it, a new group has started and the next gender
    label is used.

    data_frame -- dataframe with an "Age" column
    inplace    -- when true, the labels are stored in a "Gender" column

    Returns the list of labels, one per row.
    Raises IndexError if there are more groups than labels.
    '''
    # "Wookie" is the last label available before the labels run out.
    labels = ("Both", "Male", "Female", "Wookie")
    position = 0
    last_age = 0
    index_list = []
    for _age in data_frame["Age"]:
        if _age < last_age:
            position += 1
            if position >= len(labels):
                raise IndexError(
                    "add_genders: more age groups than gender labels")
        last_age = _age
        index_list.append(labels[position])
    if inplace:
        data_frame["Gender"] = index_list
    return index_list
93
def add_genders_edu(df, inplace=True):
    '''
    Infers the gender of each line in the data group of one of the edu
    style files.

    The "Education Level" column is read top to bottom.  Each
    "Less than 9th grade" row starts a new group and takes the next
    gender label.  Rows before the first such row get None.

    df      -- dataframe with an "Education Level" column
    inplace -- when true, stores the labels in "Gender" and the numeric
               education rank in "Education Num"

    Returns the list of gender labels, one per row.
    Raises KeyError for an education level that is not in the table and
    IndexError if there are more groups than labels.
    '''
    # Rank of each education level; both spellings of the "some college"
    # label map to the same rank.
    edu_level_dict = {
        "Less than 9th grade": 1,
        "9th to 12th grade, no diploma": 2,
        "High school graduate": 3,
        "Some college or associate degree": 4,
        "Some college or associate's degree": 4,
        "Bachelor's degree": 5,
        "Advanced degree": 6,
    }
    labels = ("Both", "Male", "Female", "Wookie")
    next_label = 0
    _current_gender = None
    index_list = []
    edu_level_list = []
    for _level in df["Education Level"]:
        if _level == "Less than 9th grade":
            if next_label >= len(labels):
                raise IndexError(
                    "add_genders_edu: more education groups than "
                    "gender labels")
            _current_gender = labels[next_label]
            next_label += 1
        index_list.append(_current_gender)
        edu_level_list.append(edu_level_dict[_level])
    if inplace:
        df["Gender"] = index_list
        df["Education Num"] = edu_level_list
    return index_list
122
def add_year(data_frame, year):
    '''
    Adds a "Census Year" column holding `year` on every row.

    The row count comes from the dataframe itself, so the frame does not
    need to have any particular column.
    '''
    data_frame["Census Year"] = [year] * len(data_frame)
127
def clean_num(value):
    '''
    Attempts to turn a bit of data into an int, mostly by removing the
    evil ',' from the number in the file.

    value -- the raw cell, normally a string such as "1,234"

    Returns the int, or numpy.nan when the value is not a string or does
    not parse as an integer once the commas are removed.
    '''
    try:
        return int(value.replace(',', ''))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: not a string; ValueError: not an integer.
        return np.nan
140
141
# ---------------------------------------------------------------------
# Data for Age and Gender.
# ---------------------------------------------------------------------

# 2012 age data


def clean_age_12(value):
    # The age is read from characters 2-3 of the raw cell.
    # NOTE(review): presumably the cell has a two-character prefix
    # before the age - confirm against the CSV.
    return int(value[2:4])


age_12_filename = '2012specificage.csv'

# Source column -> output column.
age_12_age_column_names = {
    "Unnamed: 0": "Age",
    "Unnamed: 2": "Citizen Population",
    'Number': 'Registered Voters',
    'Number.3': 'Voted',
}

# Source column -> converter applied while reading.
age_12_cleaner_dict = {
    'Unnamed: 0': clean_age_12,
    "Unnamed: 2": clean_num,
    'Number': clean_num,
    'Number.3': clean_num,
}

age_12_header = 5
age_12_rows_to_use = (
    set(range(0, 6))          # Header
    | set(range(18, 82))      # Both Genders
    | set(range(94, 158))     # Male
    | set(range(170, 234))    # Female
)

do_csv(
    age_12_filename,
    age_12_age_column_names,
    age_12_rows_to_use,
    age_12_cleaner_dict,
    age_12_header,
    [add_genders],
    year=2012,
)
171
- 3 -
C:\Users\SAU1.162\Documents\python\amy_project\cleaner.py
Page 4 of 6 12/17/2020 6:52:32 PM
172
# 2016 age data


def clean_age_16(value):
    # The age is read from the first two characters of the raw cell.
    return int(value[:2])


age_16_filename = '2016specificage.csv'
age_16_csv_path = os.path.join(folder_in, age_16_filename)

# Source column -> output column.
age_16_age_column_names = {
    "Unnamed: 1": "Age",
    "Unnamed: 3": "Citizen Population",
    'Number': 'Registered Voters',
    'Number.3': 'Voted',
}

# Source column -> converter applied while reading.
age_16_cleaner_dict = {
    "Unnamed: 1": clean_age_16,
    "Unnamed: 3": clean_num,
    'Number': clean_num,
    'Number.3': clean_num,
}

age_16_header = 6
age_16_rows_to_use = (
    set(range(0, 7))          # Header
    | set(range(15, 79))      # Both Genders
    | set(range(87, 151))     # Male
    | set(range(159, 223))    # Female
)

do_csv(
    age_16_filename,
    age_16_age_column_names,
    age_16_rows_to_use,
    age_16_cleaner_dict,
    age_16_header,
    [add_genders],
    year=2016,
)
202
# ---------------------------------------------------------------------
# Data for cleaning up the education sheets.
# ---------------------------------------------------------------------

# 2012 edu data

edu_12_filename = '2012genderandeducation.csv'

# The three count columns all go through clean_num.
edu_12_cleaner_dict = dict.fromkeys(
    ("Unnamed: 3", "Number", "Number.3"),
    clean_num,
)

# Source column -> output column.
edu_12_column_names = {
    "Unnamed: 1": "Education Level",
    "Unnamed: 3": "Citizen Population",
    "Number": 'Registered Voters',
    "Number.3": "Voted",
}

edu_12_header = 5
# Rows 0-26 of the file, minus rows 6, 13 and 20.
edu_12_rows_to_use = set(range(27)).difference((6, 13, 20))

do_csv(
    edu_12_filename,
    edu_12_column_names,
    edu_12_rows_to_use,
    edu_12_cleaner_dict,
    edu_12_header,
    [add_genders_edu],
    year=2012,
)
224
# 2016 edu data

edu_16_filename = '2016genderandeducation.csv'

# The three count columns all go through clean_num.
edu_16_cleaner_dict = dict.fromkeys(
    ("Unnamed: 3", "Number", "Number.3"),
    clean_num,
)

# Source column -> output column.
edu_16_column_names = {
    "Unnamed: 1": "Education Level",
    "Unnamed: 3": "Citizen Population",
    "Number": 'Registered Voters',
    "Number.3": "Voted",
}

edu_16_header = 6
# Rows 0-27 of the file, minus rows 7, 14 and 21.
edu_16_rows_to_use = set(range(28)).difference((7, 14, 21))

do_csv(
    edu_16_filename,
    edu_16_column_names,
    edu_16_rows_to_use,
    edu_16_cleaner_dict,
    edu_16_header,
    [add_genders_edu],
    year=2016,
)
244
# ---------------------------------------------------------------------
# Data for Ethnicity and States.
# ---------------------------------------------------------------------

# 2012 data

es_12_filename = "2012ethnicityandstates.csv"

# The first non-empty "State" cell is relabelled "US"; later ones are
# title-cased, and empty cells repeat the current state.
es_12_state_expander = DataExpander(["US"])

# Source column -> output column.
es_12_column_names = {
    "State": "State",
    "Race and Hispanic origin": "Ethnicity",
    "Total Citizen Population": "Citizen Population",
    "Total registered": 'Registered Voters',
    "Total voted": "Voted",
}

# Source column -> converter.  "Race and Hispanic origin" has no
# converter and is read as-is.
es_12_cleaner_dict = {
    "State": es_12_state_expander.data_expander,
    "Total Citizen Population": clean_num,
    "Total registered": clean_num,
    "Total voted": clean_num,
}

es_12_header = 3
es_12_rows_to_use = set(range(576))

do_csv(
    es_12_filename,
    es_12_column_names,
    es_12_rows_to_use,
    es_12_cleaner_dict,
    es_12_header,
    year=2012,
)
269
# 2016 ethnicity and state data

es_16_filename = "2016ethnicityandstates.csv"

# The first non-empty "STATE" cell is relabelled "US"; later ones are
# title-cased, and empty cells repeat the current state.
es_16_state_expander = DataExpander(["US", ])

# Source column -> output column.
es_16_column_names = {
    "STATE": "State",
    "Sex, Race and Hispanic-Origin": "Ethnicity",
    "Total Citizen Population": "Citizen Population",
    "Registered": 'Registered Voters',
    "Voted": "Voted",
}

# Source column -> converter.
# The columns read from this file are "Registered" and "Voted" (see
# es_16_column_names), so those names need converters for their commas
# to be stripped.  The "Total registered" / "Total voted" entries are
# kept from the original table.
# NOTE(review): the 2016 CSV headers are not visible here - confirm
# which pair of names the file really uses and drop the other.
es_16_cleaner_dict = {
    "STATE": es_16_state_expander.data_expander,
    "Total Citizen Population": clean_num,
    "Total registered": clean_num,
    "Total voted": clean_num,
    "Registered": clean_num,
    "Voted": clean_num,
}

es_16_header = 3
# Rows 0-576 of the file, minus row 4.
es_16_rows_to_use = set(range(0, 577)) - {4, }

do_csv(es_16_filename, es_16_column_names, es_16_rows_to_use,
       es_16_cleaner_dict, es_16_header, year=2016)
292

- 6 -

12 Information Practices Text Book Preeti Arora
No ratings yet
12 Information Practices Text Book Preeti Arora
45 pages
SLDG Book - Full
No ratings yet
SLDG Book - Full
2,149 pages
WM-GL-HAL-PSL-503 - Maintenance Procedures For A Lo Torc Plug Valve
100% (2)
WM-GL-HAL-PSL-503 - Maintenance Procedures For A Lo Torc Plug Valve
29 pages
Classroom and Lab Area - Job Roles Wise
No ratings yet
Classroom and Lab Area - Job Roles Wise
115 pages
List of Practical Ip065 Xii Session 2025 CKC Academy
No ratings yet
List of Practical Ip065 Xii Session 2025 CKC Academy
19 pages
ML Lab Manual Final
No ratings yet
ML Lab Manual Final
36 pages
PW2 DataCleaning
No ratings yet
PW2 DataCleaning
6 pages
DS100-1 WS 2.5 Enrico, DM
No ratings yet
DS100-1 WS 2.5 Enrico, DM
5 pages
Jashan ML
No ratings yet
Jashan ML
20 pages
DSBDA Lab Manual24-25
No ratings yet
DSBDA Lab Manual24-25
58 pages
List of Practical Ip065 Xii Session 2025 CKC Academy
No ratings yet
List of Practical Ip065 Xii Session 2025 CKC Academy
19 pages
I037 - Manas Patel Experiment09
No ratings yet
I037 - Manas Patel Experiment09
9 pages
Chapter 2
No ratings yet
Chapter 2
36 pages
168 Python
No ratings yet
168 Python
11 pages
20 Pandas Codes To Master Data Analysis
No ratings yet
20 Pandas Codes To Master Data Analysis
3 pages
Data Cleaning
No ratings yet
Data Cleaning
13 pages
Data Mining Lab 03
No ratings yet
Data Mining Lab 03
10 pages
Python Data Science 101
100% (1)
Python Data Science 101
41 pages
Exp3 Python
No ratings yet
Exp3 Python
15 pages
Membership Constraints: Adel Nehme
No ratings yet
Membership Constraints: Adel Nehme
36 pages
DAwHPC L03 Data Cleaning Practical
No ratings yet
DAwHPC L03 Data Cleaning Practical
43 pages
Data Exploration Preparation
No ratings yet
Data Exploration Preparation
12 pages
Overview of Data Cleaning
No ratings yet
Overview of Data Cleaning
17 pages
ICT2103 Full Book-Part-3
No ratings yet
ICT2103 Full Book-Part-3
14 pages
S08 Slides
No ratings yet
S08 Slides
14 pages
Unit6 - Working With Data
No ratings yet
Unit6 - Working With Data
29 pages
Advanced Python Programming Data Science: The University of Sheffield
No ratings yet
Advanced Python Programming Data Science: The University of Sheffield
55 pages
Do - File - Quan Ly Va Lam Sach Du Lieu
No ratings yet
Do - File - Quan Ly Va Lam Sach Du Lieu
6 pages
Python ClassXII AI
No ratings yet
Python ClassXII AI
4 pages
Pandas Library Documentation
No ratings yet
Pandas Library Documentation
16 pages
Ssce-2025 Practical Test Solution
No ratings yet
Ssce-2025 Practical Test Solution
7 pages
Exp - 1 - Introduction To Data Analytics and Python Fundamentals - SDK - Ok
No ratings yet
Exp - 1 - Introduction To Data Analytics and Python Fundamentals - SDK - Ok
9 pages
Practical File Class Xii
No ratings yet
Practical File Class Xii
25 pages
Hduud
No ratings yet
Hduud
55 pages
Lesson 2 - Data Preprocessing
100% (1)
Lesson 2 - Data Preprocessing
72 pages
EDA - Session-1 - Basic Dataframe Opertaions-1
No ratings yet
EDA - Session-1 - Basic Dataframe Opertaions-1
7 pages
Data Preprocessing - 241024 - 215531
No ratings yet
Data Preprocessing - 241024 - 215531
40 pages
Lesson 07 Data Manipulation With Pandas
No ratings yet
Lesson 07 Data Manipulation With Pandas
82 pages
Practical File Infomatics Practices 2024-25
No ratings yet
Practical File Infomatics Practices 2024-25
39 pages
L32, 33 Pandas
No ratings yet
L32, 33 Pandas
7 pages
Notebook PYTHON DATA SCIENCE
No ratings yet
Notebook PYTHON DATA SCIENCE
16 pages
Cheat Sheet: The Pandas Dataframe Object I: Preliminaries Get Your Data Into A Dataframe
No ratings yet
Cheat Sheet: The Pandas Dataframe Object I: Preliminaries Get Your Data Into A Dataframe
12 pages
Python For Exploratory Data Analysis
No ratings yet
Python For Exploratory Data Analysis
12 pages
AI Practical 2025
No ratings yet
AI Practical 2025
14 pages
Creation of Series Using List, Dictionary & Ndarray
No ratings yet
Creation of Series Using List, Dictionary & Ndarray
65 pages
Working With Panda
No ratings yet
Working With Panda
13 pages
IP 12th Chapter 3
No ratings yet
IP 12th Chapter 3
9 pages
AMLW Assignment 3
No ratings yet
AMLW Assignment 3
2 pages
Experiment 1 Solution
No ratings yet
Experiment 1 Solution
5 pages
Data Analysis by Using Python
No ratings yet
Data Analysis by Using Python
15 pages
Data Analysis Tools
No ratings yet
Data Analysis Tools
26 pages
Introduction To Pandas
No ratings yet
Introduction To Pandas
27 pages
1
No ratings yet
1
3 pages
Practical 3
No ratings yet
Practical 3
2 pages
DAP Writeups - Merged
No ratings yet
DAP Writeups - Merged
33 pages
Enhanced Student Data Processing System
No ratings yet
Enhanced Student Data Processing System
4 pages
Practical File Question 28.09.2022
No ratings yet
Practical File Question 28.09.2022
15 pages
Xii Record (Dataframe & CSV)
No ratings yet
Xii Record (Dataframe & CSV)
11 pages
CH-6 Data Loading, Storage, and File Formats
No ratings yet
CH-6 Data Loading, Storage, and File Formats
163 pages
The Essential R Reference
From Everand
The Essential R Reference
Mark Gardener
No ratings yet
Blazor and API Example: Classroom Quiz Application
From Everand
Blazor and API Example: Classroom Quiz Application
Taurius Litvinavicius
No ratings yet
C Language Programming Codes
From Everand
C Language Programming Codes
Durgesh
No ratings yet
Computer Engineering Laboratory Solution Primer
From Everand
Computer Engineering Laboratory Solution Primer
Karan Bhandari
No ratings yet
Dummy 4 PDF
No ratings yet
Dummy 4 PDF
1 page
Dummy 2
No ratings yet
Dummy 2
1 page
Dummy 1
No ratings yet
Dummy 1
1 page
Dummy 3
No ratings yet
Dummy 3
1 page
1 Essay3
No ratings yet
1 Essay3
2 pages
HW SW Codesign
No ratings yet
HW SW Codesign
514 pages
Day 9: Primary Health Care (PHC) : CHN Lec Term 2 Exam
No ratings yet
Day 9: Primary Health Care (PHC) : CHN Lec Term 2 Exam
46 pages
TP5088 PDF
No ratings yet
TP5088 PDF
6 pages
Internal Analysis of FedEx V
100% (1)
Internal Analysis of FedEx V
3 pages
Lab Experiment 1 - Friction Pipe
No ratings yet
Lab Experiment 1 - Friction Pipe
7 pages
Fin
No ratings yet
Fin
2 pages
Operation Mannual Water Treatment System
No ratings yet
Operation Mannual Water Treatment System
488 pages
Power Press
100% (1)
Power Press
7 pages
Breccia Types: Hydrothermal, Fault, Volcanic, ETC: June 2016
No ratings yet
Breccia Types: Hydrothermal, Fault, Volcanic, ETC: June 2016
40 pages
Beyond C: Team Emertxe
100% (1)
Beyond C: Team Emertxe
135 pages
DLP Cot2
No ratings yet
DLP Cot2
3 pages
Motherboard Labeling Designed by Fujitsu
No ratings yet
Motherboard Labeling Designed by Fujitsu
3 pages
F.M.L. Thompson - The Cambridge Social History of Britain, 1750-1950, Vol. 01. Regions and Communities
No ratings yet
F.M.L. Thompson - The Cambridge Social History of Britain, 1750-1950, Vol. 01. Regions and Communities
592 pages
A Study On Customer Preference Towards Sports Shoes: Bachelor of Business Administration
No ratings yet
A Study On Customer Preference Towards Sports Shoes: Bachelor of Business Administration
8 pages
Accounts Project Bcom 1year
No ratings yet
Accounts Project Bcom 1year
6 pages
0510 s16 Ms 23 PDF
No ratings yet
0510 s16 Ms 23 PDF
11 pages
Delhi Industrial Policy 2010-2021
No ratings yet
Delhi Industrial Policy 2010-2021
42 pages
Aviation Ni-Cd BMT - Battery Maintenance Training
No ratings yet
Aviation Ni-Cd BMT - Battery Maintenance Training
2 pages
Construction Management
No ratings yet
Construction Management
13 pages
Human Values: DR - Sunil Ms Ob LPU
No ratings yet
Human Values: DR - Sunil Ms Ob LPU
11 pages
Preliminar Não Fabricar: Plan View From Above Showing Foundation Hole Drilling
No ratings yet
Preliminar Não Fabricar: Plan View From Above Showing Foundation Hole Drilling
1 page
Embr 1 PDF
No ratings yet
Embr 1 PDF
32 pages
04 CTTC Detailed Syllabus 2016
No ratings yet
04 CTTC Detailed Syllabus 2016
9 pages
(Utkarsh Pandey WTLF)
No ratings yet
(Utkarsh Pandey WTLF)
28 pages
System-On-Chip Design Book 2019 200dpi Aw
No ratings yet
System-On-Chip Design Book 2019 200dpi Aw
334 pages
Laptop Issue Form Sample
100% (1)
Laptop Issue Form Sample
3 pages

Import Import As Import As: #Default To CSV

Uploaded by

Import Import As Import As: #Default To CSV

Uploaded by

C:\Users\SAU1.162\Documents\python\amy_project\cleaner.

You might also like