0% found this document useful (0 votes)
10 views15 pages

Imputacion

Uploaded by

J J
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
10 views15 pages

Imputacion

Uploaded by

J J
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 15

Datos

Kelly Johanna Bojaca

2024-11-16

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF,
and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the
output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)

## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00

Including Plots

You can also embed plots, for example:

1
800
600
pressure

400
200
0

0 50 100 150 200 250 300 350

temperature

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that
generated the plot.

# Install dplyr from CRAN only when it is missing. Reinstalling a package that
# is already present is slow and, on Windows, can fail with "Permission denied"
# while its DLL is in use.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr", repos = "https://cran.rstudio.com/")
}

## Installing package into ’C:/Users/kelly/AppData/Local/R/win-library/4.4’


## (as ’lib’ is unspecified)

## package ’dplyr’ successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package ’dplyr’

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problema al copiar


## C:\Users\kelly\AppData\Local\R\win-library\4.4\00LOCK\dplyr\libs\x64\dplyr.dll
## a C:\Users\kelly\AppData\Local\R\win-library\4.4\dplyr\libs\x64\dplyr.dll:
## Permission denied

## Warning: restored ’dplyr’

##
## The downloaded binary packages are in
## C:\Users\kelly\AppData\Local\Temp\RtmpCu35v5\downloaded_packages

2
library(ggplot2)
library(dplyr)

## Warning: package ’dplyr’ was built under R version 4.4.2

##
## Adjuntando el paquete: ’dplyr’

## The following objects are masked from ’package:stats’:


##
## filter, lag

## The following objects are masked from ’package:base’:


##
## intersect, setdiff, setequal, union

# Install titanic from CRAN only when it is missing, so re-running the
# document does not download the package again.
if (!requireNamespace("titanic", quietly = TRUE)) {
  install.packages("titanic", repos = "https://cran.rstudio.com/")
}

## Installing package into ’C:/Users/kelly/AppData/Local/R/win-library/4.4’


## (as ’lib’ is unspecified)

## package ’titanic’ successfully unpacked and MD5 sums checked


##
## The downloaded binary packages are in
## C:\Users\kelly\AppData\Local\Temp\RtmpCu35v5\downloaded_packages

# Install cowplot from CRAN only when it is missing, so re-running the
# document does not download the package again.
if (!requireNamespace("cowplot", quietly = TRUE)) {
  install.packages("cowplot", repos = "https://cran.rstudio.com/")
}

## Installing package into ’C:/Users/kelly/AppData/Local/R/win-library/4.4’


## (as ’lib’ is unspecified)

## package ’cowplot’ successfully unpacked and MD5 sums checked


##
## The downloaded binary packages are in
## C:\Users\kelly\AppData\Local\Temp\RtmpCu35v5\downloaded_packages

library(titanic)

## Warning: package ’titanic’ was built under R version 4.4.2

library(cowplot)

## Warning: package ’cowplot’ was built under R version 4.4.2

library(tidyr)
df<-titanic_train

3
# Inject one artificial NA (row 1, column 1) to demonstrate row filtering.
df[1, 1] <- NA
# Rows with at least one missing value in any column.
nadf1 <- df[rowSums(is.na(df)) > 0, ]
# Rows whose Age is missing, and the complementary rows whose Age is present.
nadf2 <- df[is.na(df$Age), ]
ndf <- df[!is.na(df$Age), ]
# Go back to the original, unmodified dataset.
df <- titanic_train

# Two ways of dropping every row that contains an NA; the second assignment
# overwrites the first, so `ndf` ends up as the na.omit() result.
ndf <- drop_na(df)
ndf <- na.omit(df)

# Histogram of Age in `ndf` (30 bins), drawn as a single-panel grid.
# The title is spelled with a space, matching the later "Original distribution"
# panels in this document.
h1 <- ggplot(ndf, aes(x = Age)) +
  geom_histogram(
    fill = "#ad1538", color = "#000000", position = "identity", bins = 30
  ) +
  ggtitle("Original distribution") +
  theme_classic()
plot_grid(h1, nrow = 1, ncol = 1)

Originaldistribution

60

40
count

20

0 20 40 60 80
Age

# Install simputation from CRAN only when it is missing, so re-running the
# document does not download the package again.
if (!requireNamespace("simputation", quietly = TRUE)) {
  install.packages("simputation", repos = "https://cran.rstudio.com/")
}

## Installing package into ’C:/Users/kelly/AppData/Local/R/win-library/4.4’


## (as ’lib’ is unspecified)

## package ’simputation’ successfully unpacked and MD5 sums checked


##
## The downloaded binary packages are in
## C:\Users\kelly\AppData\Local\Temp\RtmpCu35v5\downloaded_packages

4
library(simputation)

## Warning: package ’simputation’ was built under R version 4.4.2

# Fix the RNG state so the random hot-deck draw below is reproducible.
set.seed(123)

# Random hot deck: missing Age values are filled from donors, grouped by Sex.
RHD <- impute_rhd(df, Age ~ Sex)

# Sequential hot deck on Age, ordered by Parch. order = "locf" is used here;
# the alternative value for `order` is "nocb".
SHD <- impute_shd(df, Age ~ Parch, order = "locf")

# Predictive mean matching, using impute_lm as the predictor.
PMM <- impute_pmm(df, Age ~ Parch, predictor = impute_lm)
# k-nearest-neighbour imputation with k = 5.
KNN <- impute_knn(df, Age ~ Parch, k = 5)

# Build one Age histogram with the styling shared by every panel.
#   data  - data frame containing an `Age` column
#   fill  - bar fill colour
#   title - panel title
age_histogram <- function(data, fill, title) {
  ggplot(data, aes(x = Age)) +
    geom_histogram(
      fill = fill, color = "#000000", position = "identity", bins = 30
    ) +
    ggtitle(title) +
    theme_classic()
}

# One panel per data set: NA-free original plus the four simputation results.
h1 <- age_histogram(ndf, "#ad1538", "Original distribution")
h2 <- age_histogram(RHD, "#15ad4f", "RHD")
h3 <- age_histogram(SHD, "#1543ad", "SHD")
h4 <- age_histogram(PMM, "#ad8415", "PMM")
h5 <- age_histogram(KNN, "#4515ad", "KNN")
plot_grid(h1, h2, h3, h4, h5, nrow = 3, ncol = 2)

5
Originaldistribution RHD
60 75
count

count
40 50
20 25
0 0
0 20 40 60 80 0 20 40 60 80
Age Age

SHD PMM
75 200
count

count
150
50
100
25 50
0 0
0 20 40 60 80 0 20 40 60 80
Age Age

KNN

100
count

50
0
0 20 40 60 80
Age

# Compare three single-value imputations of Age against the raw column:
# missing ages replaced by 0, by the observed mean, and by the observed median.
value_imputed <- data.frame(
  original = titanic_train$Age,
  imputed_zero = replace(titanic_train$Age, is.na(titanic_train$Age), 0),
  imputed_mean = replace(
    titanic_train$Age, is.na(titanic_train$Age),
    mean(titanic_train$Age, na.rm = TRUE)
  ),
  imputed_median = replace(
    titanic_train$Age, is.na(titanic_train$Age),
    median(titanic_train$Age, na.rm = TRUE)
  )
)

# Histograms of the raw Age column and its zero/mean/median imputations.
# The raw column still contains NAs; na.rm = TRUE drops them silently instead
# of emitting a "Removed ... rows" warning.
h1 <- ggplot(value_imputed, aes(x = original)) +
  geom_histogram(
    fill = "#ad1538", color = "#000000", position = "identity", bins = 30,
    na.rm = TRUE
  ) +
  ggtitle("Original distribution") +
  theme_classic()
h2 <- ggplot(value_imputed, aes(x = imputed_zero)) +
  geom_histogram(
    fill = "#15ad4f", color = "#000000", position = "identity", bins = 30
  ) +
  ggtitle("Zero-imputed distribution") +
  theme_classic()
h3 <- ggplot(value_imputed, aes(x = imputed_mean)) +
  geom_histogram(
    fill = "#1543ad", color = "#000000", position = "identity", bins = 30
  ) +
  ggtitle("Mean-imputed distribution") +
  theme_classic()
h4 <- ggplot(value_imputed, aes(x = imputed_median)) +
  geom_histogram(
    fill = "#ad8415", color = "#000000", position = "identity", bins = 30
  ) +
  ggtitle("Median-imputed distribution") +
  theme_classic()
plot_grid(h1, h2, h3, h4, nrow = 2, ncol = 2)

## Warning: Removed 177 rows containing non-finite outside the scale range

6
## (‘stat_bin()‘).

Originaldistribution Zero−imputeddistribution
200

60 150
count

count
40 100

20 50

0 0
0 20 40 60 80 0 20 40 60 80
original imputed_zero

Mean−imputeddistribution Median−imputeddistribution
250
200
200
150
150
count

count

100 100

50 50

0 0
0 20 40 60 80 0 20 40 60 80
imputed_mean imputed_median

# Install mice from CRAN only when it is missing, so re-running the document
# does not download the package again.
if (!requireNamespace("mice", quietly = TRUE)) {
  install.packages("mice", repos = "https://cran.rstudio.com/")
}

## Installing package into ’C:/Users/kelly/AppData/Local/R/win-library/4.4’


## (as ’lib’ is unspecified)

## package ’mice’ successfully unpacked and MD5 sums checked


##
## The downloaded binary packages are in
## C:\Users\kelly\AppData\Local\Temp\RtmpCu35v5\downloaded_packages

library(mice)

## Warning: package ’mice’ was built under R version 4.4.2

##
## Adjuntando el paquete: ’mice’

## The following object is masked from ’package:stats’:


##
## filter

7
## The following objects are masked from ’package:base’:
##
## cbind, rbind

# Subset of five columns used by the model-based imputation examples.
titanic_numeric <- select(df, Survived, Pclass, SibSp, Parch, Age)
# Show the missing-data pattern of that subset.
md.pattern(titanic_numeric)

Survived Pclass SibSp Parch Age

714 0

177 1

0 0 0 0 177 177

## Survived Pclass SibSp Parch Age


## 714 1 1 1 1 1 0
## 177 1 1 1 1 0 1
## 0 0 0 0 177 177

# Impute Age with three mice methods (pmm, cart, lasso.norm) and put the Age
# column of each completed data set next to the raw column.
# seed makes each stochastic run reproducible; printFlag = FALSE suppresses
# the per-iteration log that otherwise fills the rendered document.
mice_imputed <- data.frame(
  original = titanic_train$Age,
  imputed_pmm = complete(
    mice(titanic_numeric, method = "pmm", seed = 123, printFlag = FALSE)
  )$Age,
  imputed_cart = complete(
    mice(titanic_numeric, method = "cart", seed = 123, printFlag = FALSE)
  )$Age,
  imputed_lasso = complete(
    mice(titanic_numeric, method = "lasso.norm", seed = 123, printFlag = FALSE)
  )$Age
)

##
## iter imp variable
## 1 1 Age
## 1 2 Age

8
## 1 3 Age
## 1 4 Age
## 1 5 Age
## 2 1 Age
## 2 2 Age
## 2 3 Age
## 2 4 Age
## 2 5 Age
## 3 1 Age
## 3 2 Age
## 3 3 Age
## 3 4 Age
## 3 5 Age
## 4 1 Age
## 4 2 Age
## 4 3 Age
## 4 4 Age
## 4 5 Age
## 5 1 Age
## 5 2 Age
## 5 3 Age
## 5 4 Age
## 5 5 Age
##
## iter imp variable
## 1 1 Age
## 1 2 Age
## 1 3 Age
## 1 4 Age
## 1 5 Age
## 2 1 Age
## 2 2 Age
## 2 3 Age
## 2 4 Age
## 2 5 Age
## 3 1 Age
## 3 2 Age
## 3 3 Age
## 3 4 Age
## 3 5 Age
## 4 1 Age
## 4 2 Age
## 4 3 Age
## 4 4 Age
## 4 5 Age
## 5 1 Age
## 5 2 Age
## 5 3 Age
## 5 4 Age
## 5 5 Age
##
## iter imp variable
## 1 1 Age
## 1 2 Age

9
## 1 3 Age
## 1 4 Age
## 1 5 Age
## 2 1 Age
## 2 2 Age
## 2 3 Age
## 2 4 Age
## 2 5 Age
## 3 1 Age
## 3 2 Age
## 3 3 Age
## 3 4 Age
## 3 5 Age
## 4 1 Age
## 4 2 Age
## 4 3 Age
## 4 4 Age
## 4 5 Age
## 5 1 Age
## 5 2 Age
## 5 3 Age
## 5 4 Age
## 5 5 Age

head(mice_imputed)

## original imputed_pmm imputed_cart imputed_lasso


## 1 22 22 22.0 22.00000
## 2 38 38 38.0 38.00000
## 3 26 26 26.0 26.00000
## 4 35 35 35.0 35.00000
## 5 35 35 35.0 35.00000
## 6 NA 53 24.5 19.90163

# Histograms of the raw Age column and its PMM, CART and lasso imputations.
# The raw column still contains NAs; na.rm = TRUE drops them silently instead
# of emitting a "Removed ... rows" warning.
h1 <- ggplot(mice_imputed, aes(x = original)) +
  geom_histogram(
    fill = "#ad1538", color = "#000000", position = "identity", bins = 30,
    na.rm = TRUE
  ) +
  ggtitle("Original distribution") +
  theme_classic()
h2 <- ggplot(mice_imputed, aes(x = imputed_pmm)) +
  geom_histogram(
    fill = "#15ad4f", color = "#000000", position = "identity", bins = 30
  ) +
  ggtitle("PMM-imputed distribution") +
  theme_classic()
h3 <- ggplot(mice_imputed, aes(x = imputed_cart)) +
  geom_histogram(
    fill = "#1543ad", color = "#000000", position = "identity", bins = 30
  ) +
  ggtitle("CART-imputed distribution") +
  theme_classic()
h4 <- ggplot(mice_imputed, aes(x = imputed_lasso)) +
  geom_histogram(
    fill = "#ad8415", color = "#000000", position = "identity", bins = 30
  ) +
  ggtitle("Lasso-imputed distribution") +
  theme_classic()
plot_grid(h1, h2, h3, h4, nrow = 2, ncol = 2)

## Warning: Removed 177 rows containing non-finite outside the scale range
## (‘stat_bin()‘).

10
Original distribution PMM−imputed distribution

60 90
count

count
40 60

20 30

0 0
0 20 40 60 80 0 20 40 60 80
original imputed_pmm

CART−imputed distribution Lasso−imputed distribution

75 75
count

count
50 50

25 25

0 0
0 20 40 60 80 0 20 40 60 80
imputed_cart imputed_lasso

# Install missForest from CRAN only when it is missing, so re-running the
# document does not download the package again.
if (!requireNamespace("missForest", quietly = TRUE)) {
  install.packages("missForest", repos = "https://cran.rstudio.com/")
}

## Installing package into ’C:/Users/kelly/AppData/Local/R/win-library/4.4’


## (as ’lib’ is unspecified)

## package ’missForest’ successfully unpacked and MD5 sums checked


##
## The downloaded binary packages are in
## C:\Users\kelly\AppData\Local\Temp\RtmpCu35v5\downloaded_packages

library(missForest)

## Warning: package ’missForest’ was built under R version 4.4.2

# Raw Age next to the Age column of the missForest-imputed data (`ximp`).
missForest_imputed <- data.frame(
  original = titanic_numeric[["Age"]],
  imputed_missForest = missForest(titanic_numeric)[["ximp"]][["Age"]]
)
# Preview the first rows of the comparison table.
head(missForest_imputed, n = 6L)

## original imputed_missForest
## 1 22 22.00000
## 2 38 38.00000

11
## 3 26 26.00000
## 4 35 35.00000
## 5 35 35.00000
## 6 NA 29.25033

# Side-by-side histograms of raw Age and its missForest imputation.
# The raw column still contains NAs; na.rm = TRUE drops them silently instead
# of emitting a "Removed ... rows" warning.
h1 <- ggplot(missForest_imputed, aes(x = original)) +
  geom_histogram(
    fill = "#ad1538", color = "#000000", position = "identity", bins = 30,
    na.rm = TRUE
  ) +
  ggtitle("Original distribution") +
  theme_classic()
h2 <- ggplot(missForest_imputed, aes(x = imputed_missForest)) +
  geom_histogram(
    fill = "#15ad4f", color = "#000000", position = "identity", bins = 30
  ) +
  ggtitle("MissForest-imputed distribution") +
  theme_classic()
plot_grid(h1, h2, nrow = 1, ncol = 2)

## Warning: Removed 177 rows containing non-finite outside the scale range
## (‘stat_bin()‘).

Originaldistribution MissForest−imputeddistribution
150

60

100

40
count

count

50

20

0 0

0 20 40 60 80 0 20 40 60 80
original imputed_missForest

# Build a reduced Titanic table whose categorical columns are factors.
Titanic <- titanic_train
# Label the outcome; compare against the number 0 rather than the string "0"
# so the test does not rely on implicit numeric-to-character coercion.
Titanic$Survived <- as.factor(ifelse(Titanic$Survived == 0, "No", "Yes"))
Titanic$Sex <- as.factor(Titanic$Sex)
Titanic$PClass <- as.factor(Titanic$Pclass)
Titanic <- Titanic[, c("Survived", "PClass", "Sex", "Age")]
# Print the class counts of each categorical column.
with(Titanic, {
  print(table(Sex))
  print(table(PClass))
  print(table(Survived))
})

## Sex
## female male
## 314 577
## PClass
## 1 2 3
## 216 184 491
## Survived
## No Yes
## 549 342

cat ("Proporción Female:", table(Titanic$Sex)[1]/length(Titanic$Sex))

## Proporción Female: 0.352413

cat ("Proporción Male:", table(Titanic$Sex)[2]/length(Titanic$Sex))

## Proporción Male: 0.647587

cat ("Proporción Clase 1:", table(Titanic$PClass)[1]/length(Titanic$PClass))

## Proporción Clase 1: 0.2424242

cat ("Proporción Clase 2:", table(Titanic$PClass)[2]/length(Titanic$PClass))

## Proporción Clase 2: 0.2065095

cat ("Proporción Clase 3:", table(Titanic$PClass)[3]/length(Titanic$PClass))

## Proporción Clase 3: 0.5510662

# Survived has levels "No" and "Yes" (in that order), so positional indexing
# with [1]/[2] attached the labels to the wrong class. Select by level name.
cat("Proporción Sobrevivió:", mean(Titanic$Survived == "Yes"))

## Proporción Sobrevivió: 0.3838384

cat("Proporción Murió:", mean(Titanic$Survived == "No"))

## Proporción Murió: 0.6161616

# Install caret from CRAN only when it is missing. Reinstalling a package that
# is already present is slow and, on Windows, can fail with "Permission denied"
# while its DLL is in use.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret", repos = "https://cran.rstudio.com/")
}

## Installing package into ’C:/Users/kelly/AppData/Local/R/win-library/4.4’


## (as ’lib’ is unspecified)

13
## package ’caret’ successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package ’caret’

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problema al copiar


## C:\Users\kelly\AppData\Local\R\win-library\4.4\00LOCK\caret\libs\x64\caret.dll
## a C:\Users\kelly\AppData\Local\R\win-library\4.4\caret\libs\x64\caret.dll:
## Permission denied

## Warning: restored ’caret’

##
## The downloaded binary packages are in
## C:\Users\kelly\AppData\Local\Temp\RtmpCu35v5\downloaded_packages

library(caret)

## Warning: package ’caret’ was built under R version 4.4.2

## Cargando paquete requerido: lattice

table(Titanic$Sex)

##
## female male
## 314 577

# Down-sample the majority Sex class to the size of the minority class.
# The predictors passed as `x` must exclude the class column itself: dropping
# column 1 (Survived) instead would lose the outcome and leave two "Sex"
# columns in the result, because downSample() appends `y` under `yname`.
Titanic.down <- downSample(
  Titanic[, names(Titanic) != "Sex", drop = FALSE],
  Titanic$Sex,
  yname = "Sex"
)


table(Titanic.down$Sex)

##
## female male
## 314 314

table(Titanic$Sex)

##
## female male
## 314 577

# Up-sample the minority Sex class to the size of the majority class.
# The predictors passed as `x` must exclude the class column itself: dropping
# column 1 (Survived) instead would lose the outcome and leave two "Sex"
# columns in the result, because upSample() appends `y` under `yname`.
Titanic.up <- upSample(
  Titanic[, names(Titanic) != "Sex", drop = FALSE],
  Titanic$Sex,
  yname = "Sex"
)


table(Titanic.up$Sex)

##
## female male
## 577 577

14
# Manual oversampling: draw 36 rows from the female passengers and append
# them to the full table, then show the resulting class counts.
NF <- Titanic %>%
  filter(Sex == "female") %>%
  sample_n(36)
OverT <- rbind(Titanic, NF)
table(OverT$Sex)

##
## female male
## 350 577

# Manual undersampling: pick 77 row positions of male passengers at random,
# keep every row that was not picked, then show the resulting class counts.
remover <- sample(which(Titanic$Sex == "male"), size = 77)
UnderT <- Titanic[!(seq_len(nrow(Titanic)) %in% remover), ]
table(UnderT$Sex)

##
## female male
## 314 500

# Install scutr from CRAN only when it is missing, so re-running the document
# does not download the package again.
if (!requireNamespace("scutr", quietly = TRUE)) {
  install.packages("scutr", repos = "https://cran.rstudio.com/")
}

## Installing package into ’C:/Users/kelly/AppData/Local/R/win-library/4.4’


## (as ’lib’ is unspecified)

## package ’scutr’ successfully unpacked and MD5 sums checked


##
## The downloaded binary packages are in
## C:\Users\kelly\AppData\Local\Temp\RtmpCu35v5\downloaded_packages

library(scutr)

## Warning: package ’scutr’ was built under R version 4.4.2

# Keep columns 1-3, 5-8 and 10 of the raw data and drop incomplete rows.
# NOTE(review): presumably this removes the free-text columns so only
# SMOTE-compatible variables remain - confirm against names(titanic_train).
Titanic <- na.omit(titanic_train[, c(1:3, 5:8, 10)])
# Balance the Sex classes with SCUT, using SMOTE for the oversampling step.
Tsmote <- SCUT(Titanic, "Sex", oversample = oversample_smote)
table(Tsmote$Sex)

##
## female male
## 357 357

15

You might also like