DATA WRANGLING, CLEANING AND CLEANSING
FIDELIS MUTUNGA NDUNGE
2023-10-08
#Data cleaning
library(readxl)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.2 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.5.0
## v ggplot2 3.4.2 v tibble 3.2.1
## v lubridate 1.9.2 v tidyr 1.3.0
## v purrr 1.0.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the raw spreadsheet into a tibble and open it in the data viewer.
unclean <- read_excel(path = "unclean.xlsx")
View(x = unclean)
# Print per-column summary statistics for a first look at the data set.
summary(object = unclean)
## IdNumber A1Age Sex CaseStatus
## Min. : 1.0 Min. :16.00 Length:227 Min. :1.000
## 1st Qu.: 56.5 1st Qu.:23.00 Class :character 1st Qu.:1.000
## Median :113.0 Median :26.00 Mode :character Median :2.000
## Mean :113.2 Mean :28.04 Mean :1.515
## 3rd Qu.:169.5 3rd Qu.:32.00 3rd Qu.:2.000
## Max. :226.0 Max. :63.00 Max. :3.000
##
## LotFrontage Alley Height Weight
## Length:227 Length:227 Min. : 10.0 Mode:logical
## Class :character Class :character 1st Qu.:156.0 NA's:227
## Mode :character Mode :character Median :161.0
## Mean :161.7
## 3rd Qu.:166.8
1
## Max. :400.0
## NA's :1
## STI D1 D2 D3
## Min. :0.0000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00
## Median :0.0000 Median :2.000 Median :1.000 Median :2.00
## Mean :0.1534 Mean :1.661 Mean :1.449 Mean :1.73
## 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.00
## Max. :1.0000 Max. :2.000 Max. :2.000 Max. :2.00
## NA's :64 NA's :1
## D4
## Min. :1.000
## 1st Qu.:2.000
## Median :2.000
## Mean :1.947
## 3rd Qu.:2.000
## Max. :2.000
##
# Show the dimensions, column types and leading values of the data set.
str(object = unclean)
## tibble [227 x 13] (S3: tbl_df/tbl/data.frame)
## $ IdNumber : num [1:227] 32 33 34 35 10 11 12 13 14 15 ...
## $ A1Age : num [1:227] 23 24 24 33 63 22 22 19 22 29 ...
## $ Sex : chr [1:227] "Female" "Female" "Female" "Female" ...
## $ CaseStatus : num [1:227] 2 1 2 1 2 1 2 1 2 1 ...
## $ LotFrontage: chr [1:227] "65" "80" "68" "60" ...
## $ Alley : chr [1:227] "NA" "NA" "NA" "NA" ...
## $ Height : num [1:227] 182 300 166 166 156 10 170 158 168 160 ...
## $ Weight : logi [1:227] NA NA NA NA NA NA ...
## $ STI : num [1:227] NA 0 0 NA 0 0 NA NA 0 1 ...
## $ D1 : num [1:227] 2 2 2 1 1 2 2 1 1 1 ...
## $ D2 : num [1:227] 1 2 1 2 1 2 2 1 1 2 ...
## $ D3 : num [1:227] 2 2 2 2 2 2 2 2 1 1 ...
## $ D4 : num [1:227] 2 2 2 2 2 2 2 2 2 2 ...
# Inspect the Sex column on its own.
str(object = unclean[["Sex"]])
## chr [1:227] "Female" "Female" "Female" "Female" "Male" "Male" "Male" ...
#therefore we can conclude that the dataset has 227 observations and 13 variables.
# Check the storage class of the CaseStatus and Sex columns.
class(x = unclean[["CaseStatus"]])
## [1] "numeric"
class(x = unclean[["Sex"]])
## [1] "character"
# Frequency table: number of records in each Sex category.
table(unclean[["Sex"]])
##
## Female Male
## 107 118
2
# Proportion table: share of each Sex category among the non-missing values.
proportions(table(unclean[["Sex"]]))
##
## Female Male
## 0.4755556 0.5244444
# Percentage table: the same shares scaled by 100.
100 * proportions(table(unclean[["Sex"]]))
##
## Female Male
## 47.55556 52.44444
# Examine and remove duplicated records.
# Flag every row whose IdNumber has already appeared earlier in the data.
duplct <- duplicated(unclean$IdNumber)
table(duplct)
## duplct
## FALSE TRUE
## 226 1
# List the IdNumber values that occur more than once (reuses the flags above
# instead of calling duplicated() a second time).
unclean$IdNumber[duplct]
## [1] 51
# unclean <- unclean[order(unclean$IdNumber), ]
# Inspect the records sharing IdNumber 51 before deciding which one to drop.
View(unclean[unclean$IdNumber %in% 51, ])
# Drop the IdNumber 51 record whose A1Age is 23. `%in%` never returns NA, so a
# missing IdNumber or A1Age cannot turn into an all-NA row in the result.
unclean <- unclean[!(unclean$IdNumber %in% 51 & unclean$A1Age %in% 23), ]
# Re-open the IdNumber 51 records to check the result of the removal.
View(unclean[unclean$IdNumber %in% 51, ])
##TRANSFORMING CONTINUOUS VARIABLES TO CATEGORICAL VARIABLES A categorical variable in a data set is a ty
# Build SexCategory as a factor with levels "Female" then "Male", taken
# directly from the Sex column. Any Sex value that is missing or is neither
# of the two labels becomes NA.
unclean$SexCategory <- factor(unclean$Sex, levels = c("Female", "Male"))
class(unclean$SexCategory)
## [1] "factor"
# Count the records in each level of the new factor.
table(unclean$SexCategory)
##
## Female Male
## 107 117
View(unclean)
#EXAMINING INCONSISTENCIES IN A DATASET
# To check for inconsistencies, tabulate the values of CaseStatus.
table(unclean$CaseStatus)
##
## 1 2 3
## 111 113 2
# Inspect the records carrying CaseStatus code 3, the code being corrected.
# `%in%` is used so a missing CaseStatus cannot produce an all-NA row.
View(unclean[unclean$CaseStatus %in% 3, ])
# Recode CaseStatus to 1 for the records with IdNumber 1 and 31.
# `%in%` states the set of ids once instead of chaining `==` with `|`.
unclean$CaseStatus[unclean$IdNumber %in% c(1, 31)] <- 1
# Re-tabulate to check the result of the correction.
table(unclean$CaseStatus)
3
##
## 1 2
## 113 113
# HANDLING MISSING VALUES AND REMOVING NA VALUES
# Report the class of every column.
sapply(X = unclean, FUN = class)
## IdNumber A1Age Sex CaseStatus LotFrontage Alley
## "numeric" "numeric" "character" "numeric" "character" "character"
## Height Weight STI D1 D2 D3
## "numeric" "logical" "numeric" "numeric" "numeric" "numeric"
## D4 SexCategory
## "numeric" "factor"
# LotFrontage stores missing entries as the literal text "NA"; convert those
# to real missing values.
# NOTE(review): LotFrontage stays a character column after this step --
# confirm whether it should be converted to numeric.
unclean$LotFrontage <- na_if(unclean$LotFrontage, "NA")
# unclean$LotFrontage
# Keep only the rows that have a LotFrontage value.
unclean2 <- drop_na(unclean, LotFrontage)
view(unclean2)
# Keep only the columns that hold at least one non-missing value.
unclean3 <- unclean2[, colSums(is.na(unclean2)) < nrow(unclean2)]
view(unclean3)
# Alley is left untouched: its values are assumed to be what was recorded.
# OUTLIERS IN A DATASET
# Plot Height from unclean3 -- the table the outliers are identified in and
# removed from -- so that this plot and the one drawn after removal show the
# same set of rows.
plot(unclean3$Height)
YES_files/figure-latex/unnamed-chunk-2-1.pdf
# Identify outliers in the Height column with boxplot.stats().
# The outlying values are computed once, so the values reported here are
# exactly the values removed below.
height_outliers <- boxplot.stats(unclean3$Height)$out
unclean3$Height[unclean3$Height %in% height_outliers]
## [1] 300 10 191 350 138 20 30
# Remove the rows whose Height is one of the outlying values. `%in%` never
# returns NA, so a row with a missing Height is kept rather than becoming an
# all-NA row.
unclean3 <- unclean3[!unclean3$Height %in% height_outliers, ]
# Re-plot Height to check the result of the removal.
plot(unclean3$Height)
YES_files/figure-latex/unnamed-chunk-3-1.pdf