# etl_and_stats_code: example extract-transform-load pipeline with basic salary statistics.
import numpy as np
import pandas as pd
from scipy.stats import norm
# Step 1: Extract
def extract_data():
    """Build the example employee dataset and return it as a DataFrame.

    The records are defined inline (no file or network access): five rows
    with the columns ``Name``, ``Age`` and ``Salary``. The frame is echoed
    to stdout before being returned.

    Returns:
        pandas.DataFrame: the example records, one row per person.
    """
    # Inline example records standing in for an external data source.
    data = {
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Age': [25, 30, 35, 40, 29],
        'Salary': [50000, 60000, 70000, 80000, 55000],
    }
    df = pd.DataFrame(data)
    print("Data Extracted:")
    print(df)
    return df
# Step 2: Transform
def transform_data(df, bonus_rate=0.1):
    """Add a ``Bonus`` column computed as a fraction of ``Salary``.

    The column is written onto ``df`` itself (the input is modified in
    place) and the same object is returned, so both ``transform_data(df)``
    and ``df = transform_data(df)`` leave the caller with the new column.
    The transformed frame is echoed to stdout.

    Args:
        df: DataFrame with a ``Salary`` column that can be multiplied by a
            number.
        bonus_rate: Fraction of the salary paid as bonus. Defaults to 0.1
            (10%), the rate that was previously hard-coded.

    Returns:
        The same DataFrame object, now including the ``Bonus`` column.

    Raises:
        KeyError: If ``df`` has no ``Salary`` column.
    """
    df['Bonus'] = df['Salary'] * bonus_rate
    print("\nData Transformed:")
    print(df)
    return df
# Step 3: Load
def load_data(df, output_file="transformed_data.csv"):
    """Write ``df`` to a CSV file and report where it was written.

    The row index is not written (``index=False``), so the file contains
    only the DataFrame's columns.

    Args:
        df: DataFrame to persist.
        output_file: Destination path. Defaults to ``"transformed_data.csv"``
            in the current working directory, the path that was previously
            hard-coded.
    """
    df.to_csv(output_file, index=False)
    print(f"\nData Loaded to {output_file}")
# Statistical Functions
def statistical_functions(df):
    """Print summary statistics and a normal-density column for ``Salary``.

    Prints the mean and median salary, then evaluates the probability
    density of a normal distribution (parameterised by the salary mean and
    population standard deviation) at each salary. The densities are stored
    on ``df`` as a ``Normal_Distribution`` column, so the input is modified
    in place, and printed next to ``Salary``.

    Args:
        df: DataFrame with a numeric ``Salary`` column.

    Raises:
        KeyError: If ``df`` has no ``Salary`` column.
    """
    salaries = df['Salary'].to_numpy(dtype=float)

    # Mean and Median
    mean_salary = np.mean(salaries)
    median_salary = np.median(salaries)
    print("\nStatistical Analysis:")
    print(f"Mean Salary: {mean_salary}")
    print(f"Median Salary: {median_salary}")

    # The column printed below was never created, which raised KeyError.
    # Build it from the fitted normal; np.std defaults to the population
    # standard deviation (ddof=0). With zero spread the density is undefined.
    std_salary = np.std(salaries)
    df['Normal_Distribution'] = norm.pdf(
        salaries, loc=mean_salary, scale=std_salary
    )
    print("\nNormal Distribution (Probability Density Function):")
    print(df[['Salary', 'Normal_Distribution']])
# NOTE(review): these statements read like the body of `simple_model` (which
# is called at the end of the file), but no `def simple_model(...)` header is
# visible here — confirm against the original script.
# NOTE(review): `X`, `y`, `train_test_split` and `LinearRegression` are not
# defined or imported anywhere in the visible file — presumably scikit-learn
# plus a feature/target selection step that is missing; confirm.
# Splitting data
# Presumably holds out 20% of the rows for testing with a fixed seed
# (test_size=0.2, random_state=42) — verify once the import is restored.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)
# Model Training
# Fits the model on the training split only; X_test / y_test are not used in
# the lines visible here.
model = LinearRegression()
model.fit(X_train, y_train)
# NOTE(review): `df` is not assigned at module level in the visible file, and
# extract_data / transform_data / load_data are never called — the start of
# the driver code appears to be missing; confirm against the original script.
# Statistical Analysis
statistical_functions(df)
# Simple Modeling
# NOTE(review): no `def simple_model` is visible in this file — confirm it
# exists before relying on this call.
simple_model(df)