0% found this document useful (0 votes)
6 views4 pages

RStudio

The document contains R code for analyzing birth weight data, school performance data, and wage data. It includes calculations for descriptive statistics, correlations, and visualizations using ggplot2. The analysis covers topics such as the impact of smoking on birth weight, average pass rates in schools, and the relationship between education and wages.

Uploaded by

eshaan arora
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
6 views4 pages

RStudio

The document contains R code for analyzing birth weight data, school performance data, and wage data. It includes calculations for descriptive statistics, correlations, and visualizations using ggplot2. The analysis covers topics such as the impact of smoking on birth weight, average pass rates in schools, and the relationship between education and wages.

Uploaded by

eshaan arora
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/4

# Install each required package only when it is missing, so re-running the
# script does not download packages that are already available.
if (!requireNamespace("wooldridge", quietly = TRUE)) {
  install.packages("wooldridge")
}
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

# Attach the packages used throughout the script.
library(wooldridge)
library(ggplot2)

# Load the birth-weight data set analysed in Question 1.
data(bwght)

#QUESTION 1

# 1a. Descriptive statistics for birth weight (bwght$bwght); NAs are ignored.

# Mean (na.rm spelled as TRUE: the shorthand `T` is an ordinary variable that
# can be reassigned).
mean_bwght <- mean(bwght$bwght, na.rm = TRUE)

# Median
median_bwght <- median(bwght$bwght, na.rm = TRUE)

# Mode (since R does not have a built-in mode function, we'll use a custom function)
# Return the most frequent value of a vector.
#
# v:     an atomic vector (numeric, character, ...).
# na.rm: if TRUE, drop NA values before counting. The default (FALSE) keeps
#        the previous behaviour, in which NA is counted like any other value
#        and can itself be returned as the mode.
#
# Ties are broken in favour of the value that appears first in `v`.
# An empty input (or an all-NA input with na.rm = TRUE) yields NA.
getmode <- function(v, na.rm = FALSE) {
  if (na.rm) {
    v <- v[!is.na(v)]
  }
  uniqv <- unique(v)
  # match() maps every element to the index of its unique value; tabulate()
  # counts those indices, and which.max() picks the first largest count.
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
mode_bwght <- getmode(bwght[["bwght"]])

# Quartiles: the 25th, 50th and 75th percentiles, ignoring NAs.
quartiles_bwght <- quantile(
  bwght[["bwght"]],
  probs = c(0.25, 0.5, 0.75),
  na.rm = TRUE
)

# Show the four summaries computed for birth weight.
mean_bwght
median_bwght
mode_bwght
quartiles_bwght

# 2a. Average number of cigarettes smoked per day (mean of cigs, NAs ignored).
avg_smoke <- with(bwght, mean(cigs, na.rm = TRUE))

avg_smoke

#2b. Association between smoking and birth weight:


# ggplot2 is already installed and attached at the top of the script, so it is
# not reinstalled here (reinstalling an attached package mid-session can fail).
library(ggplot2)

# Scatterplot of birth weight against cigarettes smoked. Points are coloured
# by their cigs value, and a linear fit (method = "lm") is drawn on top.
ggplot(bwght, aes(x = cigs, y = bwght)) +
  geom_point(aes(color = factor(cigs))) +
  geom_smooth(method = "lm") +
  labs(
    title = "Birth Weight vs. Number of Cigarettes Smoked",
    x = "Number of Cigarettes Smoked",
    y = "Birth Weight"
  )

# 3a. Distribution of family income: mean and standard deviation of faminc,
# both computed with NAs removed.
mean_income <- mean(bwght[["faminc"]], na.rm = TRUE)
sd_income <- sd(bwght[["faminc"]], na.rm = TRUE)

mean_income
sd_income

# 3b. Correlation between birth weight and log of family income, computed
# separately for smokers and non-smokers.

# Natural log of family income, stored on the data frame so that both subsets
# below carry the column.
bwght$log_income <- log(bwght$faminc)

# Smokers: any positive number of cigarettes. The earlier `cigs > 1` filter
# left mothers with cigs == 1 in neither the smoker nor the non-smoker group.
smoke_data <- bwght[bwght$cigs > 0, ]
cor_smoke <- cor(
  smoke_data$bwght,
  smoke_data$log_income,
  use = "complete.obs"
)

# Non-smokers: exactly zero cigarettes.
non_smoke_data <- bwght[bwght$cigs == 0, ]
cor_non_smoke <- cor(
  non_smoke_data$bwght,
  non_smoke_data$log_income,
  use = "complete.obs"
)

cor_smoke
cor_non_smoke

# 4a. Mean of fatheduc (NAs ignored) and the number of non-missing values
# that mean is based on.
mean_fatheduc <- mean(bwght[["fatheduc"]], na.rm = TRUE)
count_fatheduc <- sum(!is.na(bwght[["fatheduc"]]))

mean_fatheduc
count_fatheduc

# 4b. Proportion of mothers who are high school graduates, taken here as
# motheduc of 12 or more. The mean of the logical vector is the share of TRUE.
prop_highschool <- mean(bwght[["motheduc"]] >= 12, na.rm = TRUE)

prop_highschool

# 4c. Average birth weight by the mother's high school graduation status.

# TRUE when motheduc is 12 or more. The full column name is spelled out:
# `bwght$mothed` only worked through `$` partial matching, which returns NULL
# on tibbles and becomes ambiguous if another column starting with "mothed"
# is ever added.
bwght$highschool <- bwght$motheduc >= 12

# Mean birth weight within each highschool group.
avg_bwght_highschool <- aggregate(bwght ~ highschool, data = bwght, FUN = mean)

# Boxplot of birth weight for each graduation status.
ggplot(bwght, aes(x = factor(highschool), y = bwght)) +
  geom_boxplot() +
  labs(
    title = "Birth Weight by High School Graduation Status",
    x = "High School Graduate",
    y = "Birth Weight"
  )

avg_bwght_highschool

#QUESTION 2
data(meap01)

# 1. Smallest and largest read4 values (NAs ignored) and the gap between them.
min_read4 <- min(meap01[["read4"]], na.rm = TRUE)
max_read4 <- max(meap01[["read4"]], na.rm = TRUE)
difference_read4 <- max_read4 - min_read4

min_read4
max_read4
difference_read4

# 2. Number and percentage of schools with a perfect pass rate (read4 == 100),
# plus the number of schools with read4 of exactly 50.
perfect_pass_rate_count <- sum(meap01[["read4"]] == 100, na.rm = TRUE)

# The denominator is every row of meap01, including any row whose read4 is NA.
total_schools <- nrow(meap01)
percentage_perfect <- perfect_pass_rate_count / total_schools * 100

pass_rate_50_count <- sum(meap01[["read4"]] == 50, na.rm = TRUE)

perfect_pass_rate_count
percentage_perfect
pass_rate_50_count

# 3. Average math4 and read4 pass rates (NAs ignored), then a bar chart that
# compares the two averages.
avg_math4 <- mean(meap01[["math4"]], na.rm = TRUE)
avg_read4 <- mean(meap01[["read4"]], na.rm = TRUE)

avg_math4
avg_read4

library(ggplot2)

# One row per subject so that each average is drawn as its own bar.
avg_rates <- data.frame(
  Subject = c("Math", "Reading"),
  Average_Pass_Rate = c(avg_math4, avg_read4)
)

ggplot(avg_rates, aes(x = Subject, y = Average_Pass_Rate, fill = Subject)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Average Pass Rates for Math and Reading",
    x = "Subject",
    y = "Average Pass Rate"
  )

# 4a/4b. Correlation between math4 and read4 (cor() defaults to Pearson),
# using only rows where both are present, followed by a scatterplot.
correlation <- with(meap01, cor(math4, read4, use = "complete.obs"))

correlation

# Points are coloured by enroll on the "Spectral" palette; the straight line
# is a linear fit drawn without its standard-error band (se = FALSE).
ggplot(meap01, aes(x = math4, y = read4)) +
  geom_point(aes(color = enroll)) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Scatterplot of Math vs. Reading Pass Rates",
    x = "Math Pass Rate",
    y = "Reading Pass Rate"
  ) +
  scale_color_distiller(palette = "Spectral")

# 5. Bucket schools by enroll and average the pass rates within each bucket.

# The "Small (<300)" label is written on one line: the previous literal was
# split across lines, which embedded newline characters in the factor label.
# cut() uses right-closed intervals by default, so the buckets are
# (-Inf, 300], (300, 600] and (600, Inf); an enroll value of exactly 300
# therefore falls in the "Small" bucket.
meap01$size_cat <- cut(
  meap01$enroll,
  breaks = c(-Inf, 300, 600, Inf),
  labels = c("Small (<300)", "Medium (300-600)", "Large (>600)")
)

# Mean math4 and read4 per size category; na.rm = TRUE is forwarded to mean().
avg_pass_rates_by_size <- aggregate(
  cbind(math4, read4) ~ size_cat,
  data = meap01,
  FUN = mean,
  na.rm = TRUE
)

# Reshape the per-category averages to long form: one row per size category
# and subject. Dodging only separates bars that belong to the same layer, so
# the previous two-layer version drew the Reading bars directly on top of the
# Math bars instead of side by side.
pass_rates_long <- data.frame(
  size_cat = rep(avg_pass_rates_by_size$size_cat, times = 2),
  Subject = rep(c("Math", "Reading"), each = nrow(avg_pass_rates_by_size)),
  rate = c(avg_pass_rates_by_size$math4, avg_pass_rates_by_size$read4)
)

# Grouped bar chart: one Math bar and one Reading bar per size category.
ggplot(pass_rates_long, aes(x = size_cat, y = rate, fill = Subject)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Average Pass Rates by School Size",
    x = "School Size Category",
    y = "Average Pass Rate"
  ) +
  scale_fill_manual(
    name = "Subject",
    values = c("Math" = "blue", "Reading" = "red")
  )

#QUESTION 3

data(wage1)

# 1. Average, lowest and highest value of educ (NAs ignored).
avg_education <- mean(wage1[["educ"]], na.rm = TRUE)
min_education <- min(wage1[["educ"]], na.rm = TRUE)
max_education <- max(wage1[["educ"]], na.rm = TRUE)

avg_education
min_education
max_education

# 2. Average per-hour wage (mean of wage, NAs ignored).
avg_wage <- mean(wage1[["wage"]], na.rm = TRUE)

avg_wage

# 3. Proportion of women and men in the sample. The mean of female is used as
# the share of women (this presumes female is a 0/1 indicator); men are the
# remainder.
prop_women <- mean(wage1[["female"]], na.rm = TRUE)
prop_men <- 1 - prop_women

prop_women
prop_men

# 4. Probability that a randomly chosen woman is married: the mean of married
# among the rows where female equals 1.
women_data <- wage1[which(wage1[["female"]] == 1), ]
prop_married_women <- mean(women_data[["married"]], na.rm = TRUE)

prop_married_women

# 5a. Hourly wage against years of education, with a linear fit.
library(ggplot2)

ggplot(wage1, aes(x = educ, y = wage)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Hourly Wage vs. Education",
    x = "Years of Education",
    y = "Hourly Wage"
  )

# 5b. The same relationship split by marital status: points and a separate
# linear fit (no standard-error band) for each level of married.
ggplot(wage1, aes(x = educ, y = wage, color = factor(married))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Hourly Wage vs. Education by Marital Status",
    x = "Years of Education",
    y = "Hourly Wage",
    color = "Marital Status"
  ) +
  scale_color_manual(
    values = c("blue", "red"),
    labels = c("Unmarried", "Married")
  )

# 5c. Hourly wage against education, split by marital status (colour) and
# gender (point shape).
#
# Every `+` ends the line it continues. Previously the `+` sat on its own line
# after the complete ggplot(...) call, so R evaluated the bare ggplot() first
# and then treated `+ geom_point() ...` as a separate statement; the layers
# were never added to the plot.
ggplot(
  wage1,
  aes(x = educ, y = wage, color = factor(married), shape = factor(female))
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Hourly Wage vs. Education by Gender and Marital Status",
    x = "Years of Education",
    y = "Hourly Wage",
    color = "Marital Status",
    shape = "Gender"
  ) +
  scale_color_manual(
    values = c("blue", "red"),
    labels = c("Unmarried", "Married")
  ) +
  scale_shape_manual(values = c(16, 17), labels = c("Male", "Female"))

You might also like