Imputacion
Imputacion
2024-11-16
R Markdown
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF,
and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the
output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
Including Plots
1
800
600
pressure
400
200
0
temperature
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that
generated the plot.
##
## The downloaded binary packages are in
## C:\Users\kelly\AppData\Local\Temp\RtmpCu35v5\downloaded_packages
2
library(ggplot2)
library(dplyr)
##
## Adjuntando el paquete: ’dplyr’
library(titanic)
library(cowplot)
library(tidyr)
df<-titanic_train
3
# Explore missingness on the working copy: blank out one extra cell so that
# Age is no longer the only place an NA can appear.
df[1, 1] <- NA
nadf1 <- df[!complete.cases(df), ] # rows with an NA in any column
nadf2 <- df[is.na(df$Age), ]       # rows whose Age is missing
ndf <- df[!is.na(df$Age), ]        # rows whose Age is present

# Go back to the original, unmodified dataset.
df <- titanic_train

# Two ways of dropping every row that contains an NA (tidyr, then base R);
# the na.omit() result is the one that ends up in `ndf`.
ndf <- drop_na(df)
ndf <- na.omit(df)

# Histogram of Age over the complete rows only.
# NOTE(review): the title has no space between the two words; possibly an
# artifact of how this document was extracted. Confirm before changing it.
h1 <- ggplot(data = ndf, mapping = aes(x = Age)) +
  geom_histogram(
    bins = 30,
    position = "identity",
    fill = "#ad1538",
    color = "#000000"
  ) +
  ggtitle("Originaldistribution") +
  theme_classic()
plot_grid(h1, nrow = 1, ncol = 1)
Originaldistribution
60
40
count
20
0 20 40 60 80
Age
4
library(simputation)
## [1] 4
# Impute Age from Parch with two simputation methods: impute_pmm() using a
# linear-model predictor, and impute_knn() with k = 5.
PMM <- df %>% impute_pmm(Age ~ Parch, predictor = impute_lm)
KNN <- df %>% impute_knn(Age ~ Parch, k = 5)

# Layers shared by every panel below: a 30-bin histogram with a black outline
# in the requested fill colour, a title, and the classic theme.
hist_layers <- function(fill, title) {
  list(
    geom_histogram(
      fill = fill,
      color = "#000000",
      position = "identity",
      bins = 30
    ),
    ggtitle(title),
    theme_classic()
  )
}

# One Age histogram per dataset.
# NOTE(review): RHD and SHD are not created in the code visible here;
# presumably they are hot-deck imputations built in a hidden chunk. Confirm.
h1 <- ggplot(ndf, aes(x = Age)) +
  hist_layers("#ad1538", "Originaldistribution")
h2 <- ggplot(RHD, aes(x = Age)) +
  hist_layers("#15ad4f", "RHD")
h3 <- ggplot(SHD, aes(x = Age)) +
  hist_layers("#1543ad", "SHD")
h4 <- ggplot(PMM, aes(x = Age)) +
  hist_layers("#ad8415", "PMM")
h5 <- ggplot(KNN, aes(x = Age)) +
  hist_layers("#4515ad", "KNN")

# Five panels laid out on a 3 x 2 grid (the last cell stays empty).
plot_grid(h1, h2, h3, h4, h5, nrow = 3, ncol = 2)
5
Originaldistribution RHD
60 75
count
count
40 50
20 25
0 0
0 20 40 60 80 0 20 40 60 80
Age Age
SHD PMM
75 200
count
count
150
50
100
25 50
0 0
0 20 40 60 80 0 20 40 60 80
Age Age
KNN
100
count
50
0
0 20 40 60 80
Age
# Compare the observed values with three single-value imputations, drawing one
# histogram per column of `value_imputed`.
# NOTE(review): `value_imputed` is not built in the code visible here; it is
# assumed to hold the columns original, imputed_zero, imputed_mean and
# imputed_median. Confirm against the chunk that creates it.

# Observed values. The rendered output shows a warning that 177 non-finite
# rows are removed when this grid is drawn.
h1 <- ggplot(data = value_imputed, mapping = aes(x = original)) +
  geom_histogram(
    bins = 30,
    position = "identity",
    fill = "#ad1538",
    color = "#000000"
  ) +
  ggtitle("Originaldistribution") +
  theme_classic()

# Zero imputation.
h2 <- ggplot(data = value_imputed, mapping = aes(x = imputed_zero)) +
  geom_histogram(
    bins = 30,
    position = "identity",
    fill = "#15ad4f",
    color = "#000000"
  ) +
  ggtitle("Zero-imputeddistribution") +
  theme_classic()

# Mean imputation.
h3 <- ggplot(data = value_imputed, mapping = aes(x = imputed_mean)) +
  geom_histogram(
    bins = 30,
    position = "identity",
    fill = "#1543ad",
    color = "#000000"
  ) +
  ggtitle("Mean-imputeddistribution") +
  theme_classic()

# Median imputation.
h4 <- ggplot(data = value_imputed, mapping = aes(x = imputed_median)) +
  geom_histogram(
    bins = 30,
    position = "identity",
    fill = "#ad8415",
    color = "#000000"
  ) +
  ggtitle("Median-imputeddistribution") +
  theme_classic()

# All four panels on a 2 x 2 grid.
plot_grid(h1, h2, h3, h4, nrow = 2, ncol = 2)
## Warning: Removed 177 rows containing non-finite outside the scale range
6
## (‘stat_bin()‘).
Originaldistribution Zero−imputeddistribution
200
60 150
count
count
40 100
20 50
0 0
0 20 40 60 80 0 20 40 60 80
original imputed_zero
Mean−imputeddistribution Median−imputeddistribution
250
200
200
150
150
count
count
100 100
50 50
0 0
0 20 40 60 80 0 20 40 60 80
imputed_mean imputed_median
library(mice)
##
## Adjuntando el paquete: ’mice’
7
## The following objects are masked from ’package:base’:
##
## cbind, rbind
714 0
177 1
0 0 0 0 177 177
##
## iter imp variable
## 1 1 Age
## 1 2 Age
8
## 1 3 Age
## 1 4 Age
## 1 5 Age
## 2 1 Age
## 2 2 Age
## 2 3 Age
## 2 4 Age
## 2 5 Age
## 3 1 Age
## 3 2 Age
## 3 3 Age
## 3 4 Age
## 3 5 Age
## 4 1 Age
## 4 2 Age
## 4 3 Age
## 4 4 Age
## 4 5 Age
## 5 1 Age
## 5 2 Age
## 5 3 Age
## 5 4 Age
## 5 5 Age
##
## iter imp variable
## 1 1 Age
## 1 2 Age
## 1 3 Age
## 1 4 Age
## 1 5 Age
## 2 1 Age
## 2 2 Age
## 2 3 Age
## 2 4 Age
## 2 5 Age
## 3 1 Age
## 3 2 Age
## 3 3 Age
## 3 4 Age
## 3 5 Age
## 4 1 Age
## 4 2 Age
## 4 3 Age
## 4 4 Age
## 4 5 Age
## 5 1 Age
## 5 2 Age
## 5 3 Age
## 5 4 Age
## 5 5 Age
##
## iter imp variable
## 1 1 Age
## 1 2 Age
9
## 1 3 Age
## 1 4 Age
## 1 5 Age
## 2 1 Age
## 2 2 Age
## 2 3 Age
## 2 4 Age
## 2 5 Age
## 3 1 Age
## 3 2 Age
## 3 3 Age
## 3 4 Age
## 3 5 Age
## 4 1 Age
## 4 2 Age
## 4 3 Age
## 4 4 Age
## 4 5 Age
## 5 1 Age
## 5 2 Age
## 5 3 Age
## 5 4 Age
## 5 5 Age
head(mice_imputed)
## Warning: Removed 177 rows containing non-finite outside the scale range
## (‘stat_bin()‘).
10
Original distribution PMM−imputed distribution
60 90
count
count
40 60
20 30
0 0
0 20 40 60 80 0 20 40 60 80
original imputed_pmm
75 75
count
count
50 50
25 25
0 0
0 20 40 60 80 0 20 40 60 80
imputed_cart imputed_lasso
library(missForest)
# Put the observed Age next to the Age column of the table completed by
# missForest() (taken from the `ximp` element of its result).
# NOTE(review): `titanic_numeric` is not defined in the code visible here;
# presumably a numeric-only version of the Titanic data. Confirm.
missForest_imputed <- data.frame(original = titanic_numeric$Age)
missForest_imputed$imputed_missForest <- missForest(titanic_numeric)$ximp$Age

# Peek at the first rows to compare observed and imputed values.
head(missForest_imputed)
## original imputed_missForest
## 1 22 22.00000
## 2 38 38.00000
11
## 3 26 26.00000
## 4 35 35.00000
## 5 35 35.00000
## 6 NA 29.25033
# Observed Age. This column still contains NAs (see the head() output), and
# the rendered output shows a warning that 177 non-finite rows are removed.
h1 <- ggplot(data = missForest_imputed, mapping = aes(x = original)) +
  geom_histogram(
    bins = 30,
    position = "identity",
    fill = "#ad1538",
    color = "#000000"
  ) +
  ggtitle("Originaldistribution") +
  theme_classic()

# Age after missForest imputation.
h2 <- ggplot(
  data = missForest_imputed,
  mapping = aes(x = imputed_missForest)
) +
  geom_histogram(
    bins = 30,
    position = "identity",
    fill = "#15ad4f",
    color = "#000000"
  ) +
  ggtitle("MissForest-imputeddistribution") +
  theme_classic()

# Both panels side by side.
plot_grid(h1, h2, nrow = 1, ncol = 2)
## Warning: Removed 177 rows containing non-finite outside the scale range
## (‘stat_bin()‘).
Originaldistribution MissForest−imputeddistribution
150
60
100
40
count
count
50
20
0 0
0 20 40 60 80 0 20 40 60 80
original imputed_missForest
# Build the four-column working table used by the class-balancing examples:
# Survived recoded from 0/other to "No"/"Yes", and PClass and Sex as factors.
Titanic <- data.frame(
  Survived = factor(ifelse(titanic_train$Survived == 0, "No", "Yes")),
  PClass = as.factor(titanic_train$Pclass),
  Sex = as.factor(titanic_train$Sex),
  Age = titanic_train$Age
)

# Frequency table for each categorical column; the explicit dimension names
# keep the variable name as the header line of each printed table.
print(table(Sex = Titanic$Sex))
print(table(PClass = Titanic$PClass))
print(table(Survived = Titanic$Survived))
## Sex
## female male
## 314 577
## PClass
## 1 2 3
## 216 184 491
## Survived
## No Yes
## 549 342
13
## package ’caret’ successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\kelly\AppData\Local\Temp\RtmpCu35v5\downloaded_packages
library(caret)
table(Titanic$Sex)
##
## female male
## 314 577
##
## female male
## 314 314
table(Titanic$Sex)
##
## female male
## 314 577
##
## female male
## 577 577
14
# Naive random oversampling: draw 36 rows at random from the female
# passengers and append them to the full table.
NF <- Titanic %>%
  filter(Sex == "female") %>%
  sample_n(36)
OverT <- rbind(Titanic, NF)

# Class counts after adding the extra female rows.
table(OverT$Sex)
##
## female male
## 350 577
##
## female male
## 314 500
library(scutr)
# Keep columns 1-3, 5-8 and 10 of the raw data and drop every row that has
# an NA.
# NOTE(review): presumably these are PassengerId, Survived, Pclass, Sex, Age,
# SibSp, Parch and Fare; confirm against names(titanic_train).
Titanic <- na.omit(titanic_train[, c(1:3, 5:8, 10)])

# Rebalance the Sex classes with SCUT, using SMOTE as the oversampling step.
# The rendered output shows 357 rows per class afterwards.
Tsmote <- SCUT(Titanic, "Sex", oversample = oversample_smote)
table(Tsmote$Sex)
##
## female male
## 357 357
15