0% found this document useful (0 votes)
13 views4 pages

Lab Test 2

Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
13 views4 pages

Lab Test 2

Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 4

Q1

# Q1: load the insurance data set and encode its categorical columns.

# dplyr is needed for glimpse()
library(dplyr)

# Import the insurance data.
# The address must be one unbroken string: a line break or stray host name
# inside the literal becomes part of the URL and the download fails, so the
# URL is assembled from short pieces with paste0() instead of being wrapped.
insurance_url <- paste0(
  "https://raw.githubusercontent.com/stedy/",
  "Machine-Learning-with-R-datasets/master/insurance.csv"
)
insurance_data <- read.csv(insurance_url)

# Take a peek at the data
head(insurance_data)
glimpse(insurance_data)

# Convert sex, smoker, and region into nominal categorical variables (factor)
nominal_cols <- c("sex", "smoker", "region")
insurance_data[nominal_cols] <- lapply(insurance_data[nominal_cols], factor)

# Convert children into an ordinal categorical variable
# (factor with ordered levels)
insurance_data$children <- factor(insurance_data$children, ordered = TRUE)

# Check the structure of the updated data
str(insurance_data)
glimpse(insurance_data)

Q2

# Q2: screen the data for missing values, outliers and influential rows.

# Count missing values in each column of the dataset
colSums(is.na(insurance_data))

# Summary statistics for every variable, used to eyeball possible outliers
summary(insurance_data)

# Fit a basic linear model for medical charges based on all other variables
linear_model <- lm(charges ~ age + sex + bmi + children + smoker + region, data =
insurance_data)

# Calculate Cook's Distance to identify influential observations
cooksd <- cooks.distance(linear_model)

# Common rule-of-thumb cutoff for Cook's Distance: 4 / n.
# Computed once so the reference line and the row filter cannot drift apart.
cooks_threshold <- 4 / nrow(insurance_data)

# Plot Cook's Distance. The axis label is kept on a single line: a string
# literal wrapped across source lines would put a newline into the label.
plot(
  cooksd,
  main = "Cook's Distance for Influential Observations",
  ylab = "Cook's Distance"
)
abline(h = cooks_threshold, col = "red")

# Display the rows with Cook's Distance greater than the threshold.
# NOTE(review): which() returns positions within cooksd; using them as row
# numbers assumes lm() kept every row (i.e. no rows were dropped for NAs).
influential_obs <- which(cooksd > cooks_threshold)
insurance_data[influential_obs, ]
Q3

# Q3: Pearson correlations of bmi with age and with charges.

# Load necessary library - correlation with significance tests
library(Hmisc)

# Correlation coefficients (bmi vs. age, bmi vs. charges)
cor_bmi_age <- with(insurance_data, cor(bmi, age, method = "pearson"))
cor_bmi_charges <- with(insurance_data, cor(bmi, charges, method = "pearson"))

# Significance tests for the same two pairs.
# The argument expressions are written out in full on purpose: cor.test()
# echoes them in the "data:" line of its printed result.
cor_test_bmi_age <- cor.test(
  insurance_data$bmi, insurance_data$age,
  method = "pearson"
)
cor_test_bmi_charges <- cor.test(
  insurance_data$bmi, insurance_data$charges,
  method = "pearson"
)

# Report: coefficient first, then the full test, for each pair
cat("Correlation between BMI and Age:\n")
print(cor_bmi_age)
print(cor_test_bmi_age)

cat("\nCorrelation between BMI and Charges:\n")
print(cor_bmi_charges)
print(cor_test_bmi_charges)

Q4

# Q4: scatterplot matrix of the continuous variables.

# GGally supplies ggpairs()
library(GGally)

# Keep only the continuous variables of interest
continuous_vars <- c("bmi", "age", "charges")
data_subset <- insurance_data[, continuous_vars]

# Draw the matrix:
#   upper panels    - pairwise correlation
#   lower panels    - scatterplot with a smooth line
#   diagonal panels - density of each variable
ggpairs(
  data_subset,
  title = "Scatterplot Matrix of BMI, Age, and Charges",
  upper = list(continuous = "cor"),
  lower = list(continuous = "smooth"),
  diag = list(continuous = "density")
)

Q5

# Build the multiple linear regression model


# Regress bmi on age, sex, children and charges, using the rows of
# insurance_data. The fitted object is stored in `model`.
# NOTE(review): if `children` is still an ordered factor at this point, lm()
# presumably expands it into several contrast terms rather than a single
# slope - confirm this matches the equation reported for this question.
model <- lm(bmi ~ age + sex + children + charges, data = insurance_data)

# Print out the results of the model
# (the summary output echoes the lm() call above in its "Call:" section)


summary(model)

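- Equation: bmi = 28.72 + 0.02472 × age + 0.4544 × sex − 0.1558 × children
  (the fitted model also includes a charges term, whose coefficient is not reported here)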


- For every 1-unit increase in age, bmi increases by 0.02472; for a 1-unit change in the
sex indicator (moving from the reference category to the other), bmi increases by 0.4544;
and for every 1-unit increase in children, bmi decreases by 0.1558 - in each case holding
the other predictors constant. Where a predictor's p-value is less than 0.05, its
relationship with bmi is statistically significant; otherwise the predictor is not
statistically significant in explaining 'bmi'.

Q6

# Q6: compare observed bmi with the values predicted by `model`.

library(ggplot2)

# Create predicted values based on the model
insurance_data$predicted_bmi <- predict(model)

# Create residuals (difference between observed and predicted values)
insurance_data$residuals <- insurance_data$bmi - insurance_data$predicted_bmi

# Scatterplot of observed vs predicted BMI values.
# Comments sit on their own lines inside the chain: a trailing comment that
# wraps onto the next line turns its tail into code and breaks parsing.
ggplot(insurance_data, aes(x = predicted_bmi, y = bmi)) +
  # Scatterplot points
  geom_point(color = "blue", alpha = 0.5) +
  # Line of perfect fit (y = x): points on it are predicted exactly
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Observed vs. Predicted BMI",
    x = "Predicted BMI",
    y = "Observed BMI"
  )
- The scatterplot shows the points scattered widely around the dashed line of perfect fit
(observed = predicted); therefore the model does not capture much of the variability in
the data, and there may be other factors that can better explain 'bmi'.

You might also like