etl_and_stats_code

This document outlines an ETL (Extract, Transform, Load) process in Python using pandas and numpy. It extracts data from a CSV-like structure, transforms it by adding a bonus column (10% of salary) and min-max normalizing the age column, and loads the result into a CSV file. It then performs a basic statistical analysis of the salary data (mean, median, and a normal-distribution density via scipy) and fits a simple scikit-learn linear regression model to predict salary from age.


import pandas as pd
import numpy as np
from scipy.stats import norm

# Step 1: Extract
def extract_data():
    # Example data as a CSV-like structure
    data = {
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Age': [25, 30, 35, 40, 29],
        'Salary': [50000, 60000, 70000, 80000, 55000]
    }
    df = pd.DataFrame(data)
    print("Data Extracted:")
    print(df)
    return df
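
# Illustrative sketch (not called from main): in a real pipeline the Extract step
# would usually read from an actual file rather than an in-memory dict.
# "source_data.csv" is a hypothetical path used only for this sketch.
def extract_from_csv_example(path="source_data.csv"):
    # pd.read_csv returns a DataFrame with the same shape of columns as above,
    # assuming the file has Name, Age and Salary columns.
    return pd.read_csv(path)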

# Step 2: Transform
def transform_data(df):
    # Adding a column for Bonus (10% of Salary)
    df['Bonus'] = df['Salary'] * 0.1

    # Normalizing Age column (min-max scaling)
    df['Age_Normalized'] = (df['Age'] - df['Age'].min()) / (df['Age'].max() - df['Age'].min())

    print("\nData Transformed:")
    print(df)
    return df
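
# Illustrative sketch (not called from main): the min-max scaling above, written
# out for the example ages. For Age = 30 it gives (30 - 25) / (40 - 25) = 1/3.
def minmax_scale_example():
    ages = np.array([25, 30, 35, 40, 29])
    scaled = (ages - ages.min()) / (ages.max() - ages.min())
    print("Min-max scaled ages:", scaled)
    return scaled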

# Step 3: Load
def load_data(df):
    # Save transformed data to a CSV file
    output_file = "transformed_data.csv"
    df.to_csv(output_file, index=False)
    print(f"\nData Loaded to {output_file}")

# Statistical Functions
def statistical_functions(df):
    # Mean and Median
    mean_salary = np.mean(df['Salary'])
    median_salary = np.median(df['Salary'])

    # Normal Distribution Example
    mu, sigma = mean_salary, np.std(df['Salary'])
    normal_dist = norm.pdf(df['Salary'], mu, sigma)
    df['Normal_Distribution'] = normal_dist

    print("\nStatistical Analysis:")
    print(f"Mean Salary: {mean_salary}")
    print(f"Median Salary: {median_salary}")
    print("\nNormal Distribution (Probability Density Function):")
    print(df[['Salary', 'Normal_Distribution']])
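
# Illustrative sketch (not called from main): norm.pdf(x, mu, sigma) above is the
# Gaussian density f(x) = exp(-(x - mu)^2 / (2 * sigma^2)) / (sigma * sqrt(2 * pi)).
# The same values can be computed directly with numpy, using the mu and sigma
# chosen in statistical_functions.
def normal_pdf_example(x, mu, sigma):
    return np.exp(-((x - mu) ** 2) / (2 * sigma ** 2)) / (sigma * np.sqrt(2 * np.pi))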

# Modeling (Linear Regression Example)
def simple_model(df):
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error

    # Independent variable: Age, Dependent variable: Salary
    X = df[['Age']]
    y = df['Salary']

    # Splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Model Training
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Prediction and Evaluation
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    print("\nSimple Linear Regression Model:")
    print(f"Coefficient: {model.coef_[0]}")
    print(f"Intercept: {model.intercept_}")
    print(f"Mean Squared Error: {mse}")

# Main Function to Execute the Steps
def main():
    # ETL Process
    df = extract_data()
    df = transform_data(df)
    load_data(df)

    # Statistical Analysis
    statistical_functions(df)

    # Simple Modeling
    simple_model(df)

# Run the main function
if __name__ == "__main__":
    main()
