0% found this document useful (0 votes)
27 views2 pages

Code With Dates HARDCODED

This document contains code that processes missing data in Excel sheets. It imports the necessary libraries and reads an XML file to obtain the sheet names and the missing-data rules. It then reads each sheet, cleans the data, generates a datetime index, and iterates through the columns, applying the missing-data rules to fill in missing values. The cleaned sheets are then written to a new Excel file.

Uploaded by

Bhanu Suravarapu
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
27 views2 pages

Code With Dates HARDCODED

This document contains code that processes missing data in Excel sheets. It imports the necessary libraries and reads an XML file to obtain the sheet names and the missing-data rules. It then reads each sheet, cleans the data, generates a datetime index, and iterates through the columns, applying the missing-data rules to fill in missing values. The cleaned sheets are then written to a new Excel file.

Uploaded by

Bhanu Suravarapu
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 2

import pandas as pd

import random
import math
import numpy as np

import xml.etree.ElementTree as ET
# Missing-data filler: for every <Sheet> in the XML config, expand the hourly
# readings of the matching Excel sheet to a per-minute table, fill the gaps
# column by column, and write all sheets to a single output workbook.

XML_PATH = 'DM SHORT.xml'
EXCEL_PATH = '/content/dummy.xlsx'
OUTPUT_PATH = 'RES1.xlsx'

# Hard-coded output window.
# NOTE(review): with pandas' default month-first parsing '01.08.2022' reads as
# 8 January 2022, while sheet dates are matched as day.month.year -- confirm
# which end date is actually intended.
RANGE_START = '2021.12.31'
RANGE_END = '01.08.2022'

# (sheet_name, result DataFrame) pairs collected by main().
result_list = []


def _is_missing(value):
    """Return True when *value* is None or a float NaN."""
    return value is None or (isinstance(value, float) and math.isnan(value))


def _carry_forward(previous):
    """Return *previous* as a float, or None when it cannot be carried forward.

    None, NaN and non-numeric values all yield None, which leaves the cell
    empty.
    """
    if previous is None:
        return None
    try:
        number = float(previous)
    except (TypeError, ValueError):
        return None
    return None if math.isnan(number) else number


def _read_limit(column):
    """Return the <MissingData>/<Limit> of a <Column> element as a float.

    Returns None when the column has no <MissingData> or no <Limit>, so a
    limit can never leak over from a previously processed column.
    <LastKnownValue> is only reported, exactly as before; it does not
    influence the fill.
    """
    missing_data = column.find('MissingData')
    if missing_data is None:
        return None
    limit = missing_data.find('Limit')
    if limit is not None:
        print("Missing Data Limit for Column:", limit.text)
    last_known_value = missing_data.find('LastKnownValue')
    if last_known_value is not None:
        print("Last Known Value for Column:", last_known_value.text)
    return None if limit is None else float(limit.text)


def fill_column(df, date_time, col_name, limit=None):
    """Build the filled values of one column, one value per timestamp.

    Args:
        df: Sheet data with 'DATE' ('%d.%m.%Y' strings), 'Time' ('%H:00'
            strings) and, normally, a *col_name* column.
        date_time: Iterable of pandas Timestamps defining the output rows.
        col_name: Name of the column to read from *df*.
        limit: When None, a missing value is replaced by the previous
            non-missing value of the column (if it is numeric). Otherwise a
            missing value is replaced by ``random.uniform(0, limit)``.

    Returns:
        A list with one entry per timestamp. Entries are None where no value
        was observed and nothing could be carried forward.
    """
    has_column = col_name in df.columns
    if not has_column:
        # Best effort, as before: the column is then filled by the
        # missing-data rule alone instead of aborting the whole run.
        print('Column not found in sheet data:', col_name)

    # Every minute of an hour maps to the same (date, hour) row, so the
    # DataFrame filter is evaluated once per hour instead of once per minute.
    observed_by_hour = {}
    values = []
    previous = None  # last non-missing value written for this column

    for stamp in date_time:
        value = None
        if has_column:
            key = (stamp.strftime('%d.%m.%Y'), stamp.strftime('%H:00'))
            if key not in observed_by_hour:
                matches = df[(df['DATE'] == key[0]) & (df['Time'] == key[1])]
                observed_by_hour[key] = (
                    None if matches.empty else matches[col_name].iloc[0]
                )
            observed = observed_by_hour[key]
            if not _is_missing(observed):
                value = observed

        if value is None:
            if limit is None:
                value = _carry_forward(previous)
            else:
                value = random.uniform(0, limit)

        if value is not None:
            previous = value
        values.append(value)

    return values


def build_sheet_result(df, sheet, date_time):
    """Return the per-minute result table for one <Sheet> element.

    Args:
        df: Cleaned sheet data (see fill_column).
        sheet: XML <Sheet> element with Columns/IncludeColumns/Column entries.
        date_time: pandas DatetimeIndex defining the output rows.

    Returns:
        DataFrame indexed by '%d.%m.%Y %H:%M:%S' strings with one object
        column per included column.
    """
    result = pd.DataFrame({'DateTime': date_time})
    include_columns = sheet.find('Columns').find('IncludeColumns')
    for column in include_columns.findall('Column'):
        col_name = column.find('Name').text
        print(col_name)
        limit = _read_limit(column)
        # dtype=object keeps unfilled cells as None rather than NaN.
        result[col_name] = pd.Series(
            fill_column(df, date_time, col_name, limit),
            index=result.index,
            dtype=object,
        )
    result.set_index('DateTime', inplace=True)
    result.index = result.index.strftime('%d.%m.%Y %H:%M:%S')
    return result


def main():
    """Process every sheet named in the XML config and write the workbook."""
    root = ET.parse(XML_PATH).getroot()
    # 'min' is the current spelling of the deprecated 'T' minute alias.
    date_time = pd.date_range(start=RANGE_START, end=RANGE_END, freq='min')

    for sheet in root.findall('Sheet'):
        sheet_name = sheet.find('Name').text
        print('Sheet:', sheet_name)
        # Read the excel file
        df = pd.read_excel(EXCEL_PATH, sheet_name=sheet_name, header=2)
        df = df.replace(["NW", "Nw", "-"], np.nan)
        # Assign the result back: an in-place fillna on a column selection is
        # not guaranteed to modify df.
        df['DATE'] = df['DATE'].ffill()
        result_list.append(
            (sheet_name, build_sheet_result(df, sheet, date_time))
        )

    print(result_list)
    # Write the results to an Excel file
    with pd.ExcelWriter(OUTPUT_PATH) as writer:
        for sheet_name, result in result_list:
            result.to_excel(writer, sheet_name=sheet_name)


if __name__ == "__main__":
    main()

You might also like