"Cps - TXT" "Education" "South" "SEX" "Experience" "Union" "WAGE" "AGE" "RACE" "Occupat Ion" "Sector" "MARR"
"Cps - TXT" "Education" "South" "SEX" "Experience" "Union" "WAGE" "AGE" "RACE" "Occupat Ion" "Sector" "MARR"
library(dplyr)
##
## Attaching package: 'dplyr'
#QUESTION 1
#PART A
# Build cps1 from the five columns used in the correlation step below.
# The previous empty data.frame() initialisation was overwritten immediately,
# so the result is assigned directly.
cps1 <- cps %>%
  select(SEX, EXPERIENCE, WAGE, AGE, MARR)
#PART B
# Pairwise correlation matrix of the columns of cps1.
# Piping cps1 into cor(cps1) supplied the same data frame as both `x` and `y`;
# a single argument already yields the column-by-column correlation matrix.
cor(cps1)
#PART C
# Drop SOUTH, UNION and MARR, then keep the rows where AGE lies between
# 30 and 50 (between() includes both end points) and SECTOR equals 2.
# The result is assigned directly; no empty data frame needs to exist first.
cps2 <- cps %>%
  select(-c(SOUTH, UNION, MARR)) %>%
  filter(between(AGE, 30, 50), SECTOR == 2)
I used the select function to drop the columns and the filter function to subset the
data frame.
#PART D
# Add the WAGE / AGE ratio as a new column, keep the rows where that ratio is
# above 0.25, and count how many rows remain.
cps %>%
  mutate(wage_per_age = WAGE / AGE) %>%
  filter(wage_per_age > 0.25) %>%
  count()
## n
## 1 216
The mutate function combines two variables to generate a new variable. I could not
work out how to print the output of count.
#PART E
# Mean WAGE for each OCCUPATION code, restricted to the rows where SEX is 0.
x <- filter(cps, SEX == 0)
with(x, tapply(WAGE, OCCUPATION, mean))
## 1 2 3 4 5 6
## 13.721765 9.495714 7.489048 7.226471 12.773962 9.068175
#or
# The same table with dplyr: keep the SEX == 0 rows, group them by OCCUPATION
# and average WAGE within each group.
cps %>%
  filter(SEX == 0) %>%
  group_by(OCCUPATION) %>%
  summarise(mean(WAGE))
## # A tibble: 6 x 2
## OCCUPATION `mean(WAGE)`
## <int> <dbl>
## 1 1 13.7
## 2 2 9.50
## 3 3 7.49
## 4 4 7.23
## 5 5 12.8
## 6 6 9.07
There are two solutions to this question. The difference is that my first answer uses the tapply
function, which is well suited to applying one function to one variable grouped by another.
#PART F
# Contingency table of MARR (rows) by SEX (columns).
# The piped call kept below for reference fails: `%>%` inserts the whole data
# frame as the first argument of table(), so table() receives the data frame
# (whose length is its number of columns) together with two column vectors
# (whose length is the number of rows) and stops with
# "all arguments must have the same length".
#cps %>% table(cps$MARR,cps$SEX)
# Calling table() directly on the two columns avoids the extra argument.
table(MARR = cps$MARR, SEX = cps$SEX)
In this question, the code was working at the beginning, but I could not understand why it is
not working now. It says the lengths must be the same. I checked their lengths, but that is not the
problem. In addition, the table function generates a contingency table for 2 variables.
#QUESTION 2
#PART A
library(ggplot2)
# Scatter plot of WAGE against AGE; points are coloured by SEX, which is
# wrapped in factor() so the two codes get discrete palette colours.
ggplot(cps, aes(x = AGE, y = WAGE, color = factor(SEX))) +
  geom_point() +
  scale_color_brewer(palette = "Dark2") +
  labs(title = "Distribution of age and wage by sex")
It represents the distribution of age and wage according to the sex variable. I used the
factor function to convert continuous values to discrete values.
# Density curve of WAGE, drawn as one thick dashed line per RACE code.
# NOTE(review): ggplot2 3.4.0 and later use `linewidth` for line thickness and
# warn about `size` on lines; confirm the installed version before changing it.
ggplot(cps, aes(x = WAGE, color = factor(RACE))) +
  geom_density(size = 2, linetype = "dashed") +
  labs(title = "The density plot of the wage by race")
The graph shows the density plot of the wage by race.
# Replace the codes in `values` with descriptive labels and return a factor
# whose level order follows `labels`.
# `labels` is a named character vector: each name is a code (as a string) and
# each value is the label that replaces it. Codes are replaced one at a time,
# in the order given.
recode_as_factor <- function(values, labels) {
  for (code in names(labels)) {
    values[values == code] <- labels[[code]]
  }
  factor(values, levels = unname(labels))
}

cps$RACE <- recode_as_factor(
  cps$RACE,
  c("1" = "Other", "2" = "Hispanic", "3" = "White")
)
cps$OCCUPATION <- recode_as_factor(
  cps$OCCUPATION,
  c(
    "1" = "Management",
    "2" = "Sales",
    "3" = "Clerical",
    "4" = "Service",
    "5" = "Professional",
    "6" = "Other"
  )
)
# Mosaic plot with RACE on the x axis and tiles filled by OCCUPATION.
# NOTE(review): geom_mosaic() and product() are not part of ggplot2 (they look
# like the ggmosaic package) and no library() call for it is visible in this
# file; confirm the package is attached before this line runs.
ggplot(cps) +
  geom_mosaic(aes(x = product(RACE), fill = OCCUPATION)) +
  labs(title = "Relationship between occupation and race")
Mosaic plot of the relationship between the occupation and race variables.
#QUESTION 3
#PART A
# Fix the random seed so the three samples below are reproducible; the draws
# must stay in this order (X, then Y, then Z) to give the same values.
set.seed(124)
X <- rnorm(n = 1000)
Y <- rnorm(n = 50, mean = 10, sd = 2)
Z <- runif(n = 200, min = -5, max = 20)
# Collect the samples in a list and compute the mean of each element.
obj1 <- list(X, Y, Z)
lapply(X = obj1, FUN = mean)
## [[1]]
## [1] -0.0653552
##
## [[2]]
## [1] 10.58706
##
## [[3]]
## [1] 6.639656
#PART C
# Label the values of Z with the letters A-D, repeated in that order 50 times,
# and report the mean of Z within each letter group.
h <- LETTERS[1:4]
g <- rep(h, 50)
obj3 <- data.frame(Z, g)
# One tapply() call computes every group mean, replacing four hand-copied
# subset expressions that differed only in the letter.
tapply(obj3$Z, obj3$g, mean)
## A B C D
## 6.839135 6.986212 5.560432 7.172845
# Split Y into 5 consecutive groups of 10 values each, then report the maximum
# and the minimum of every group as a one-column matrix.
j <- split(Y, gl(5, 10))
# vapply() returns a plain numeric vector (one number per group), so the
# result is a numeric matrix; matrix(lapply(...)) produced a matrix of list
# cells that cannot be used in arithmetic directly.
matrix(vapply(j, max, numeric(1)))
## [,1]
## [1,] 12.63274
## [2,] 13.20395
## [3,] 14.82519
## [4,] 15.30867
## [5,] 14.68656
matrix(vapply(j, min, numeric(1)))
## [,1]
## [1,] 5.565061
## [2,] 9.320027
## [3,] 7.280937
## [4,] 4.971210
## [5,] 7.332290