0% found this document useful (0 votes)
11 views4 pages

Data Cleaning

Data cleaning in R
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
11 views4 pages

Data Cleaning

Data cleaning in R
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 4

DATA WRANGLING, CLEANING AND CLEANSING

FIDELIS MUTUNGA NDUNGE

2023-10-08

#Data cleaning
library(readxl)
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.2.3


## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.2 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.5.0
## v ggplot2 3.4.2 v tibble 3.2.1
## v lubridate 1.9.2 v tidyr 1.3.0
## v purrr 1.0.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the raw workbook into `unclean`.
unclean <- read_excel(path = "unclean.xlsx")
# Open the data viewer for a quick visual inspection.
View(unclean)
# Per-column summary statistics of the data set.
summary(object = unclean)

## IdNumber A1Age Sex CaseStatus


## Min. : 1.0 Min. :16.00 Length:227 Min. :1.000
## 1st Qu.: 56.5 1st Qu.:23.00 Class :character 1st Qu.:1.000
## Median :113.0 Median :26.00 Mode :character Median :2.000
## Mean :113.2 Mean :28.04 Mean :1.515
## 3rd Qu.:169.5 3rd Qu.:32.00 3rd Qu.:2.000
## Max. :226.0 Max. :63.00 Max. :3.000
##
## LotFrontage Alley Height Weight
## Length:227 Length:227 Min. : 10.0 Mode:logical
## Class :character Class :character 1st Qu.:156.0 NA's:227
## Mode :character Mode :character Median :161.0
## Mean :161.7
## 3rd Qu.:166.8

1
## Max. :400.0
## NA's :1
## STI D1 D2 D3
## Min. :0.0000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00
## Median :0.0000 Median :2.000 Median :1.000 Median :2.00
## Mean :0.1534 Mean :1.661 Mean :1.449 Mean :1.73
## 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.00
## Max. :1.0000 Max. :2.000 Max. :2.000 Max. :2.00
## NA's :64 NA's :1
## D4
## Min. :1.000
## 1st Qu.:2.000
## Median :2.000
## Mean :1.947
## 3rd Qu.:2.000
## Max. :2.000
##
# Compact overview of the data set: dimensions, column types, first values.
str(object = unclean)

## tibble [227 x 13] (S3: tbl_df/tbl/data.frame)


## $ IdNumber : num [1:227] 32 33 34 35 10 11 12 13 14 15 ...
## $ A1Age : num [1:227] 23 24 24 33 63 22 22 19 22 29 ...
## $ Sex : chr [1:227] "Female" "Female" "Female" "Female" ...
## $ CaseStatus : num [1:227] 2 1 2 1 2 1 2 1 2 1 ...
## $ LotFrontage: chr [1:227] "65" "80" "68" "60" ...
## $ Alley : chr [1:227] "NA" "NA" "NA" "NA" ...
## $ Height : num [1:227] 182 300 166 166 156 10 170 158 168 160 ...
## $ Weight : logi [1:227] NA NA NA NA NA NA ...
## $ STI : num [1:227] NA 0 0 NA 0 0 NA NA 0 1 ...
## $ D1 : num [1:227] 2 2 2 1 1 2 2 1 1 1 ...
## $ D2 : num [1:227] 1 2 1 2 1 2 2 1 1 2 ...
## $ D3 : num [1:227] 2 2 2 2 2 2 2 2 1 1 ...
## $ D4 : num [1:227] 2 2 2 2 2 2 2 2 2 2 ...
# Structure of the Sex column on its own.
str(unclean[["Sex"]])

## chr [1:227] "Female" "Female" "Female" "Female" "Male" "Male" "Male" ...
#therefore we can make the conclusion that the dataset has 227 observations and 13 variables.
# Check the storage class of individual columns, starting with CaseStatus.
class(unclean[["CaseStatus"]])

## [1] "numeric"
# Storage class of the Sex column.
class(unclean[["Sex"]])

## [1] "character"
# Frequency table: number of records per Sex value (missing values excluded).
table(unclean[["Sex"]])

##
## Female Male
## 107 118

2
# Share of each Sex value as a proportion of the tabulated records.
proportions(table(unclean[["Sex"]]))

##
## Female Male
## 0.4755556 0.5244444
# Same proportions, scaled to percentages.
100 * prop.table(table(unclean[["Sex"]]))

##
## Female Male
## 47.55556 52.44444
# Examining duplicates: flag every IdNumber that repeats an earlier one,
# then count how many records are flagged.
duplct <- duplicated(unclean[["IdNumber"]])
table(duplct)

## duplct
## FALSE TRUE
## 226 1
# List the IdNumber values that occur more than once.
with(unclean, IdNumber[duplicated(IdNumber)])

## [1] 51
# unclean <- unclean[order(unclean$IdNumber), ]
# Inspect the records that share the duplicated IdNumber 51.
View(unclean[unclean$IdNumber %in% 51, ])
# Drop the duplicate copy: the IdNumber 51 record whose A1Age is 23.
# `%in%` never returns NA, so a missing IdNumber or A1Age cannot turn the row
# mask into NA (which would insert all-NA rows or lose records).
unclean <- unclean[!(unclean$IdNumber %in% 51 & unclean$A1Age %in% 23), ]
# Confirm that a single record with IdNumber 51 remains.
View(unclean[unclean$IdNumber %in% 51, ])
##TRANSFORMING CONTINUOUS VARIABLES TO CATEGORICAL VARIABLES A categorical variable in a data set is a ty
# Encode Sex as a factor with "Female" as the first level and "Male" as the
# second; any value that is neither (including missing) becomes NA.
unclean$SexCategory <- factor(unclean$Sex, levels = c("Female", "Male"))
# Confirm that the new column is a factor.
class(unclean$SexCategory)

## [1] "factor"
# Frequency of each SexCategory level.
table(unclean[["SexCategory"]])

##
## Female Male
## 107 117
View(unclean)
# EXAMINING INCONSISTENCIES IN A DATASET
# A frequency table of a coded variable exposes unexpected codes.
table(unclean[["CaseStatus"]])

##
## 1 2 3
## 111 113 2
# Making corrections: inspect the records carrying the unexpected code 3.
# `%in%` is used instead of `==` so that a missing CaseStatus cannot add
# all-NA rows to the result.
View(unclean[unclean$CaseStatus %in% 3, ])
# Making appropriate changes: recode the records with IdNumber 1 and 31 to
# CaseStatus 1.
unclean$CaseStatus[unclean$IdNumber %in% c(1, 31)] <- 1
# Re-tabulate to verify the result.
table(unclean$CaseStatus)

3
##
## 1 2
## 113 113
# HANDLING MISSING VALUES and REMOVING NA VALUES
# Report the class of every column.
sapply(X = unclean, FUN = class)

## IdNumber A1Age Sex CaseStatus LotFrontage Alley


## "numeric" "numeric" "character" "numeric" "character" "character"
## Height Weight STI D1 D2 D3
## "numeric" "logical" "numeric" "numeric" "numeric" "numeric"
## D4 SexCategory
## "numeric" "factor"
# LotFrontage stores missing entries as the literal text "NA"; turn those
# into real missing values.
unclean$LotFrontage[unclean$LotFrontage %in% "NA"] <- NA
# unclean$LotFrontage
# Keep only the records that have a LotFrontage value.
unclean2 <- drop_na(unclean, LotFrontage)
view(unclean2)
# Drop every column that is missing in all remaining records.
all_missing <- vapply(unclean2, function(col) all(is.na(col)), logical(1))
unclean3 <- unclean2[, !all_missing]
view(unclean3)
# For Alley, the recorded text is kept as it is (assumed to be what was
# actually captured).
# OUTLIERS IN A DATASET
# NOTE(review): this plots Height from `unclean`, not from `unclean3`.
plot(unclean$Height)

YES_files/figure-latex/unnamed-chunk-2-1.pdf

# Identify outliers in the Height column: the values that boxplot.stats()
# reports in its `out` component.
with(unclean3, Height[Height %in% boxplot.stats(Height)$out])

## [1] 300 10 191 350 138 20 30


# Remove the rows whose Height is among the boxplot.stats() outliers.
# Rows with a missing Height are kept, because `%in%` is FALSE for them.
unclean3 <- unclean3 %>%
  filter(!(Height %in% boxplot.stats(Height)$out))
plot(unclean3$Height)

YES_files/figure-latex/unnamed-chunk-3-1.pdf

You might also like