0% found this document useful (0 votes)
29 views6 pages

cleaner.py: Voter Registration CSV Cleaning Script

This Python script cleans several CSV files containing voter registration and demographic data from 2012 and 2016. For each raw file it selects the relevant rows and columns, strips the thousands-separator commas from numeric fields, and adds derived columns: a gender label inferred from where each row falls in the repeated age or education-level sequences, a numeric education level, and the census year. It also fills in the state name on every row of the ethnicity-by-state files. The cleaned CSVs are written to a separate output folder for later use.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
29 views6 pages

cleaner.py: Voter Registration CSV Cleaning Script

This Python script cleans several CSV files containing voter registration and demographic data from 2012 and 2016. For each raw file it selects the relevant rows and columns, strips the thousands-separator commas from numeric fields, and adds derived columns: a gender label inferred from where each row falls in the repeated age or education-level sequences, a numeric education level, and the census year. It also fills in the state name on every row of the ethnicity-by-state files. The cleaned CSVs are written to a separate output folder for later use.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 6

C:\Users\SAU1.162\Documents\python\amy_project\cleaner.py
Page 1 of 6 12/17/2020 6:52:32 PM
1
2 import os
3
4 import pandas as pd
5 import numpy as np
6
# Input settings: folder that holds the raw CSV files.
folder_in = os.path.join("resources", 'registervoterstats')

# Output settings: cleaned files go into a sub-folder of the input folder.
folder_out = os.path.join(folder_in, "cleaned")
overwrite = True
field_sep = ','  # Default to CSV

# Make sure we have the output folder. makedirs also creates any missing
# parent folders and is a no-op when the folder already exists.
os.makedirs(folder_out, exist_ok=True)

# Lazy overwrite protection setup: 'w' replaces an existing file, while
# 'x' (exclusive creation) refuses to write over one.
fout_mode = 'w' if overwrite else 'x'
30
class DataExpander(object):
    '''
    Class used to expand tiered indexes to make a proper dataframe.

    In a tiered layout the group label is only present on the first row of
    each group and the following rows are blank. ``data_expander`` remembers
    the most recent label and returns it for the blank rows.

    ``expanding_list`` optionally holds replacement labels: while it is
    non-empty, each non-blank value consumes the next entry of the list
    instead of being title-cased.
    '''
    expanding_list = None
    _current_data_expanding = None

    def __init__(self, expanding_list=None):
        # Work on a private copy: data_expander() pops from this list, and
        # the caller's list must not be emptied as a side effect.
        if expanding_list is None:
            self.expanding_list = None
        else:
            self.expanding_list = list(expanding_list)

    def data_expander(self, data):
        '''
        Return the label that applies to the current row.

        data -- raw cell value; a falsy value (e.g. an empty string) means
                "same label as the previous row".

        Returns the current label, or None when no non-blank value has been
        seen yet.
        '''
        if data:
            if self.expanding_list:
                # A replacement label is queued up: use it instead of data.
                self._current_data_expanding = self.expanding_list.pop(0)
            else:
                self._current_data_expanding = data.title()
        return self._current_data_expanding
48
def do_csv(filename, column_dict, rows_set, cleaner_dict, header=5,
           func_list=None, year=None):
    '''
    Function which combines the operations on the file into one function.
    This saved some typing! yay for DRY!

    filename     -- name of the CSV inside folder_in; the cleaned copy is
                    written under the same name via csv_out().
    column_dict  -- maps the raw column names to read onto the names they
                    get in the output.
    rows_set     -- file row indexes to keep; every other row is skipped.
    cleaner_dict -- per-column converter functions handed to pd.read_csv.
    header       -- passed to pd.read_csv as ``header``.
    func_list    -- optional callables, each invoked as fun(df) after the
                    columns have been renamed.
    year         -- when not None, add_year(df, year) is applied.
    '''
    f_in = os.path.join(folder_in, filename)

    def skiprow(index):
        # read_csv skips a row when this returns True.
        return index not in rows_set

    df = pd.read_csv(f_in, skipinitialspace=True, header=header,
                     usecols=list(column_dict), skiprows=skiprow,
                     converters=cleaner_dict)
    df.rename(columns=column_dict, inplace=True)
    for fun in func_list or ():
        fun(df)
    # Compare against None so a falsy year is not silently dropped.
    if year is not None:
        add_year(df, year)
    csv_out(df, filename)
    print(f'CSV for {filename} has been created.')
68
def csv_out(data_frame, filename):
    '''
    Save the dataframe as ``filename`` inside the output folder so it can
    be loaded again later. The row index is not written.
    '''
    destination = os.path.join(folder_out, filename)
    write_options = {"sep": field_sep, "index": False, "mode": fout_mode}
    data_frame.to_csv(destination, **write_options)
75
def add_genders(data_frame, inplace=True):
    '''
    Infers the gender of each line in a data group from one of the age
    grouping files.

    The "Age" column is walked top to bottom; every time the age drops
    below the previous row's age a new group starts. Groups are labelled
    "Both", "Male", "Female" in that order, and a fourth group gets the
    placeholder "Wookie". A fifth group raises IndexError.

    data_frame -- dataframe with an "Age" column.
    inplace    -- when true, the labels are stored in a "Gender" column.

    Returns the list of labels, one per row.
    '''
    labels = ("Both", "Male", "Female", "Wookie")
    group = 0
    last_age = 0
    index_list = []
    for age in data_frame["Age"]:
        if age < last_age:
            # Ages restarted: the next gender block begins here.
            group += 1
        last_age = age
        index_list.append(labels[group])
    if inplace:
        data_frame["Gender"] = index_list
    return index_list
93
def add_genders_edu(df, inplace=True):
    '''
    Infers the gender of each line in the data group of one of the edu
    style files.

    Every "Less than 9th grade" row starts a new group. Groups are labelled
    "Both", "Male", "Female" in that order, and a fourth group gets the
    placeholder "Wookie". A fifth group raises IndexError. Rows seen before
    the first "Less than 9th grade" row are labelled None.

    df      -- dataframe with an "Education Level" column.
    inplace -- when true, the labels are stored in a "Gender" column and
               the numeric education rank in an "Education Num" column.

    Returns the list of gender labels, one per row. Raises KeyError for an
    education level that is not in the lookup table.
    '''
    # Numeric rank per education level; both spellings of the
    # "some college" level map to the same rank.
    edu_level_dict = {
        "Less than 9th grade": 1,
        "9th to 12th grade, no diploma": 2,
        "High school graduate": 3,
        "Some college or associate degree": 4,
        "Some college or associate's degree": 4,
        "Bachelor's degree": 5,
        "Advanced degree": 6,
    }
    labels = ("Both", "Male", "Female", "Wookie")
    next_label = 0
    current_gender = None
    index_list = []
    edu_level_list = []
    for level in df["Education Level"]:
        if level == "Less than 9th grade":
            # Lowest level again: the next gender block begins here.
            current_gender = labels[next_label]
            next_label += 1
        index_list.append(current_gender)
        edu_level_list.append(edu_level_dict[level])
    if inplace:
        df["Gender"] = index_list
        df["Education Num"] = edu_level_list
    return index_list
122
def add_year(data_frame, year):
    '''
    Adds a "Census Year" column holding ``year`` on every row.

    The row count is taken from the dataframe itself, so no particular
    column (such as "Voted") has to be present.
    '''
    data_frame["Census Year"] = [year] * len(data_frame)
127
def clean_num(value):
    '''
    Attempts to turn a bit of data into an int, mostly by removing the
    evil ',' from the number in the file. Anything that cannot be read as
    an integer (non-numeric text, an empty string, a non-string value) is
    treated as "not a number".

    Returns the int, or np.nan when the value cannot be converted.
    '''
    try:
        return int(value.replace(',', ''))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a str.
        # ValueError: the text is not an integer.
        return np.nan
140
141
# Data for Age and Gender.

# --- 2012 age data ---


def clean_age_12(value):
    '''Read the characters at positions 2-3 of the raw cell as an int.'''
    return int(value[2:4])


age_12_filename = '2012specificage.csv'
age_12_header = 5

# Raw column name -> converter applied while reading.
age_12_cleaner_dict = {
    "Unnamed: 2": clean_num,
    'Number': clean_num,
    'Number.3': clean_num,
    'Unnamed: 0': clean_age_12,
}

# Raw column name -> column name in the cleaned file.
age_12_age_column_names = {
    "Unnamed: 0": "Age",
    "Unnamed: 2": "Citizen Population",
    'Number': 'Registered Voters',
    'Number.3': 'Voted',
}

# File rows to keep: the header block plus one run of rows per gender.
age_12_rows_to_use = set().union(
    range(0, 6),      # Header
    range(18, 82),    # Both Genders
    range(94, 158),   # Male
    range(170, 234),  # Female
)

do_csv(
    age_12_filename,
    age_12_age_column_names,
    age_12_rows_to_use,
    age_12_cleaner_dict,
    age_12_header,
    [add_genders],
    year=2012,
)
171
- 3 -
C:\Users\SAU1.162\Documents\python\amy_project\cleaner.py
Page 4 of 6 12/17/2020 6:52:32 PM
172
# --- 2016 age data ---


def clean_age_16(value):
    '''Read the first two characters of the raw cell as an int.'''
    return int(value[:2])


age_16_filename = '2016specificage.csv'
age_16_csv_path = os.path.join(folder_in, age_16_filename)
age_16_header = 6

# Raw column name -> converter applied while reading.
age_16_cleaner_dict = {
    "Unnamed: 1": clean_age_16,
    "Unnamed: 3": clean_num,
    'Number': clean_num,
    'Number.3': clean_num,
}

# Raw column name -> column name in the cleaned file.
age_16_age_column_names = {
    "Unnamed: 1": "Age",
    "Unnamed: 3": "Citizen Population",
    'Number': 'Registered Voters',
    'Number.3': 'Voted',
}

# File rows to keep: the header block plus one run of rows per gender.
age_16_rows_to_use = set().union(
    range(0, 7),      # Header
    range(15, 79),    # Both Genders
    range(87, 151),   # Male
    range(159, 223),  # Female
)

do_csv(
    age_16_filename,
    age_16_age_column_names,
    age_16_rows_to_use,
    age_16_cleaner_dict,
    age_16_header,
    [add_genders],
    year=2016,
)
202
# Data for cleaning up the education sheets.

# --- 2012 education data ---
edu_12_filename = '2012genderandeducation.csv'
edu_12_header = 5

# Raw column name -> converter applied while reading.
edu_12_cleaner_dict = {
    "Unnamed: 3": clean_num,
    "Number": clean_num,
    "Number.3": clean_num,
}

# Raw column name -> column name in the cleaned file.
edu_12_column_names = {
    "Unnamed: 1": "Education Level",
    "Unnamed: 3": "Citizen Population",
    "Number": 'Registered Voters',
    "Number.3": "Voted",
}

# Keep file rows 0-26 apart from rows 6, 13 and 20.
edu_12_rows_to_use = {row for row in range(27) if row not in (6, 13, 20)}

do_csv(
    edu_12_filename,
    edu_12_column_names,
    edu_12_rows_to_use,
    edu_12_cleaner_dict,
    edu_12_header,
    [add_genders_edu],
    year=2012,
)
224
# --- 2016 education data ---
edu_16_filename = '2016genderandeducation.csv'
edu_16_header = 6

# Raw column name -> converter applied while reading.
edu_16_cleaner_dict = {
    "Unnamed: 3": clean_num,
    "Number": clean_num,
    "Number.3": clean_num,
}

# Raw column name -> column name in the cleaned file.
edu_16_column_names = {
    "Unnamed: 1": "Education Level",
    "Unnamed: 3": "Citizen Population",
    "Number": 'Registered Voters',
    "Number.3": "Voted",
}

# Keep file rows 0-27 apart from rows 7, 14 and 21.
edu_16_rows_to_use = {row for row in range(28) if row not in (7, 14, 21)}

do_csv(
    edu_16_filename,
    edu_16_column_names,
    edu_16_rows_to_use,
    edu_16_cleaner_dict,
    edu_16_header,
    [add_genders_edu],
    year=2016,
)
244
# Data for Ethnicity and States.

# --- 2012 ethnicity / state data ---
es_12_filename = "2012ethnicityandstates.csv"
es_12_header = 3

# The first non-blank State value is replaced by "US"; later ones are
# title-cased, and blank cells repeat the most recent value.
es_12_state_expander = DataExpander(["US"])

# Raw column name -> converter applied while reading.
# "Race and Hispanic origin" is read as-is, so it has no converter.
es_12_cleaner_dict = {
    "State": es_12_state_expander.data_expander,
    "Total Citizen Population": clean_num,
    "Total registered": clean_num,
    "Total voted": clean_num,
}

# Raw column name -> column name in the cleaned file.
es_12_column_names = {
    "State": "State",
    "Race and Hispanic origin": "Ethnicity",
    "Total Citizen Population": "Citizen Population",
    "Total registered": 'Registered Voters',
    "Total voted": "Voted",
}

# Keep file rows 0-575.
es_12_rows_to_use = set(range(576))

do_csv(
    es_12_filename,
    es_12_column_names,
    es_12_rows_to_use,
    es_12_cleaner_dict,
    es_12_header,
    year=2012,
)
269
# --- 2016 ethnicity / state data ---
es_16_filename = "2016ethnicityandstates.csv"
es_16_header = 3

# The first non-blank STATE value is replaced by "US"; later ones are
# title-cased, and blank cells repeat the most recent value.
es_16_state_expander = DataExpander(["US", ])

# Raw column name -> converter applied while reading.
# NOTE(review): the keys for the registered/voted counts now match the
# columns actually selected through es_16_column_names ("Registered",
# "Voted"). The earlier keys "Total registered"/"Total voted" were not
# among the selected columns, so clean_num was presumably never applied
# to those two columns -- confirm against the 2016 CSV header.
es_16_cleaner_dict = {
    "STATE": es_16_state_expander.data_expander,
    "Total Citizen Population": clean_num,
    "Registered": clean_num,
    "Voted": clean_num,
}

# Raw column name -> column name in the cleaned file.
es_16_column_names = {
    "STATE": "State",
    "Sex, Race and Hispanic-Origin": "Ethnicity",
    "Total Citizen Population": "Citizen Population",
    "Registered": 'Registered Voters',
    "Voted": "Voted",
}

# Keep file rows 0-576 apart from row 4.
es_16_rows_to_use = set(range(0, 577)) - {4, }

do_csv(es_16_filename, es_16_column_names, es_16_rows_to_use,
       es_16_cleaner_dict, es_16_header, year=2016)
292

- 6 -

You might also like