0% found this document useful (0 votes)

11 views4 pages

Data Cleaning

Data cleaning in R

Uploaded by

otienokevin260290

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

11 views4 pages

Data Cleaning

Data cleaning in R

Uploaded by

otienokevin260290

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 4

DATA WRANGLING, CLEANING AND CLEANSING

FIDELIS MUTUNGA NDUNGE

2023-10-08

#Data cleaning
library(readxl)
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.2.3

## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.2 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.5.0
## v ggplot2 3.4.2 v tibble 3.2.1
## v lubridate 1.9.2 v tidyr 1.3.0
## v purrr 1.0.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become err
# Load the raw workbook and open it in the data viewer.
unclean <- read_excel(path = "unclean.xlsx")
View(unclean)
# Per-column summary of the freshly loaded data set.
summary(unclean)

## IdNumber A1Age Sex CaseStatus

## Min. : 1.0 Min. :16.00 Length:227 Min. :1.000
## 1st Qu.: 56.5 1st Qu.:23.00 Class :character 1st Qu.:1.000
## Median :113.0 Median :26.00 Mode :character Median :2.000
## Mean :113.2 Mean :28.04 Mean :1.515
## 3rd Qu.:169.5 3rd Qu.:32.00 3rd Qu.:2.000
## Max. :226.0 Max. :63.00 Max. :3.000
##
## LotFrontage Alley Height Weight
## Length:227 Length:227 Min. : 10.0 Mode:logical
## Class :character Class :character 1st Qu.:156.0 NA's:227
## Mode :character Mode :character Median :161.0
## Mean :161.7
## 3rd Qu.:166.8

1
## Max. :400.0
## NA's :1
## STI D1 D2 D3
## Min. :0.0000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00
## Median :0.0000 Median :2.000 Median :1.000 Median :2.00
## Mean :0.1534 Mean :1.661 Mean :1.449 Mean :1.73
## 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.00
## Max. :1.0000 Max. :2.000 Max. :2.000 Max. :2.00
## NA's :64 NA's :1
## D4
## Min. :1.000
## 1st Qu.:2.000
## Median :2.000
## Mean :1.947
## 3rd Qu.:2.000
## Max. :2.000
##
#Viewing the structure of dataset
str(unclean)

## tibble [227 x 13] (S3: tbl_df/tbl/data.frame)

## $ IdNumber : num [1:227] 32 33 34 35 10 11 12 13 14 15 ...
## $ A1Age : num [1:227] 23 24 24 33 63 22 22 19 22 29 ...
## $ Sex : chr [1:227] "Female" "Female" "Female" "Female" ...
## $ CaseStatus : num [1:227] 2 1 2 1 2 1 2 1 2 1 ...
## $ LotFrontage: chr [1:227] "65" "80" "68" "60" ...
## $ Alley : chr [1:227] "NA" "NA" "NA" "NA" ...
## $ Height : num [1:227] 182 300 166 166 156 10 170 158 168 160 ...
## $ Weight : logi [1:227] NA NA NA NA NA NA ...
## $ STI : num [1:227] NA 0 0 NA 0 0 NA NA 0 1 ...
## $ D1 : num [1:227] 2 2 2 1 1 2 2 1 1 1 ...
## $ D2 : num [1:227] 1 2 1 2 1 2 2 1 1 2 ...
## $ D3 : num [1:227] 2 2 2 2 2 2 2 2 1 1 ...
## $ D4 : num [1:227] 2 2 2 2 2 2 2 2 2 2 ...
str(unclean$Sex)

## chr [1:227] "Female" "Female" "Female" "Female" "Male" "Male" "Male" ...
#Therefore we can conclude that the dataset has 227 observations and 13 variables.
#viewing data class and structure
class(unclean$CaseStatus)

## [1] "numeric"
class(unclean$Sex)

## [1] "character"
#table of frequencies
table(unclean$Sex)

##
## Female Male
## 107 118

2
#table of proportions
prop.table(table(unclean$Sex))

##
## Female Male
## 0.4755556 0.5244444
#table of percentages
prop.table(table(unclean$Sex))*100

##
## Female Male
## 47.55556 52.44444
#examining and removing of duplicates from the dataset
duplct<-duplicated(unclean$IdNumber)
table(duplct)

## duplct
## FALSE TRUE
## 226 1
unclean$IdNumber[duplicated(unclean$IdNumber)]

## [1] 51
# Optional: sort by IdNumber first (left disabled).
# unclean <- unclean[order(unclean$IdNumber), ]
# Inspect every row carrying the duplicated id 51.
View(unclean[unclean$IdNumber == 51, ])
# Keep every row except the one that has both id 51 and age 23.
unclean <- unclean[unclean$IdNumber != 51 | unclean$A1Age != 23, ]
# Re-inspect id 51 to check the result of the removal.
View(unclean[unclean$IdNumber == 51, ])
##TRANSFORMING CONTINUOUS VARIABLES TO CATEGORICAL VARIABLES A categorical variable in a data set is a ty
# Build a categorical copy of Sex with "Female" as the first level and "Male"
# as the second; any other value (including a missing Sex) becomes NA.
unclean$SexCategory <- factor(unclean$Sex, levels = c("Female", "Male"))
class(unclean$SexCategory)

## [1] "factor"
table(unclean$SexCategory)

##
## Female Male
## 107 117
View(unclean)
#EXAMINING INCONSISTENCIES IN A DATASET
#to check for inconsistencies we use the table command
table(unclean$CaseStatus)

##
## 1 2 3
## 111 113 2
# Inspect the rows whose CaseStatus is coded 3.
View(unclean[unclean$CaseStatus == 3, ])
# Recode CaseStatus to 1 for ids 31 and 1
# (presumably the rows flagged above -- verify in the viewer).
unclean$CaseStatus[unclean$IdNumber %in% c(31, 1)] <- 1
# Re-tabulate to check the recoding.
table(unclean$CaseStatus)

3
##
## 1 2
## 113 113
#HANDLING MISSING VALUES and REMOVING NA VALUES
sapply(unclean,class)

## IdNumber A1Age Sex CaseStatus LotFrontage Alley

## "numeric" "numeric" "character" "numeric" "character" "character"
## Height Weight STI D1 D2 D3
## "numeric" "logical" "numeric" "numeric" "numeric" "numeric"
## D4 SexCategory
## "numeric" "factor"
# Turn the literal text "NA" in LotFrontage into a real missing value.
unclean$LotFrontage[unclean$LotFrontage %in% "NA"] <- NA
# unclean$LotFrontage
# Keep only the rows where LotFrontage is present.
unclean2 <- drop_na(unclean, LotFrontage)
view(unclean2)
# Keep only the columns that hold at least one non-missing value.
unclean3 <- unclean2[, colSums(!is.na(unclean2)) > 0]
view(unclean3)
#In the case of Alley, we will simply assume that the values are what was recorded.
#OUTLIERS IN A DATASET
plot(unclean$Height)

YES_files/figure-latex/unnamed-chunk-2-1.pdf

#Identify outliers in column

unclean3$Height[unclean3$Height %in% boxplot.stats(unclean3$Height)$out]

## [1] 300 10 191 350 138 20 30

# Drop every row whose Height is one of the values boxplot.stats() reports
# as an outlier, then re-plot the remaining heights.
unclean3 <- filter(unclean3, !Height %in% boxplot.stats(Height)$out)
plot(unclean3$Height)

YES_files/figure-latex/unnamed-chunk-3-1.pdf

Hell Energy Drink
No ratings yet
Hell Energy Drink
2 pages
04 Data Cleaning in R
No ratings yet
04 Data Cleaning in R
36 pages
Statistic and R Programming Lab Exercise
No ratings yet
Statistic and R Programming Lab Exercise
8 pages
Data Science
No ratings yet
Data Science
20 pages
4.18 Data Wrangling Slides Part1
No ratings yet
4.18 Data Wrangling Slides Part1
54 pages
BAN5
No ratings yet
BAN5
2 pages
UL2
No ratings yet
UL2
2 pages
Dba Midterm Cheatsheet
No ratings yet
Dba Midterm Cheatsheet
2 pages
Collapse Cheat Sheet
No ratings yet
Collapse Cheat Sheet
2 pages
DV Lab
No ratings yet
DV Lab
52 pages
R Studio Notes
No ratings yet
R Studio Notes
10 pages
Do - File - Quan Ly Va Lam Sach Du Lieu
No ratings yet
Do - File - Quan Ly Va Lam Sach Du Lieu
6 pages
Data Cleaning Using Dataset
No ratings yet
Data Cleaning Using Dataset
12 pages
Shahun Term Workr1
No ratings yet
Shahun Term Workr1
34 pages
1 - Tidying Data - R - Primary
No ratings yet
1 - Tidying Data - R - Primary
13 pages
Analysis Using Statistical: Introduction & Data Exploration
No ratings yet
Analysis Using Statistical: Introduction & Data Exploration
23 pages
R Cheatsheet ABC
No ratings yet
R Cheatsheet ABC
3 pages
R Cheatsheet ABC
No ratings yet
R Cheatsheet ABC
3 pages
Data Tidying With Tidyr::: Cheat Sheet
No ratings yet
Data Tidying With Tidyr::: Cheat Sheet
2 pages
Day2
No ratings yet
Day2
5 pages
R Cheatsheet ABCD
No ratings yet
R Cheatsheet ABCD
3 pages
Unit - 2: Data Manipulation With R & Data Visualization in Watson Studio
No ratings yet
Unit - 2: Data Manipulation With R & Data Visualization in Watson Studio
58 pages
DARecord
No ratings yet
DARecord
21 pages
Advanced R Programming Tidyverse Packages Notes
No ratings yet
Advanced R Programming Tidyverse Packages Notes
12 pages
Data Cleansing Using R
0% (1)
Data Cleansing Using R
10 pages
R Code Snippets
No ratings yet
R Code Snippets
10 pages
R Intro STAT5000
No ratings yet
R Intro STAT5000
17 pages
R Basic and Advanced
No ratings yet
R Basic and Advanced
9 pages
R Cheatsheet ABCD
No ratings yet
R Cheatsheet ABCD
3 pages
Section 03
No ratings yet
Section 03
20 pages
BIO259 Note
No ratings yet
BIO259 Note
55 pages
Programming With R - Subsets of Data
No ratings yet
Programming With R - Subsets of Data
7 pages
Intro To Data Science Lecture 4
No ratings yet
Intro To Data Science Lecture 4
13 pages
R - Tutorial: Matrices Are Vectors
No ratings yet
R - Tutorial: Matrices Are Vectors
13 pages
R Module 8 - Data Cleaning
No ratings yet
R Module 8 - Data Cleaning
48 pages
Manipulating Data in R
No ratings yet
Manipulating Data in R
32 pages
R Sharing
No ratings yet
R Sharing
16 pages
Data Cleaning Using R
No ratings yet
Data Cleaning Using R
5 pages
DSR LAB MANUAL - 10 Programs
No ratings yet
DSR LAB MANUAL - 10 Programs
34 pages
Experiment 5
No ratings yet
Experiment 5
13 pages
(Practical) Programming With R
No ratings yet
(Practical) Programming With R
5 pages
Categorical and Text Data
No ratings yet
Categorical and Text Data
39 pages
MTH 4407 - Group 2 (Dr. Farid Zamani) - Lecture 6
No ratings yet
MTH 4407 - Group 2 (Dr. Farid Zamani) - Lecture 6
22 pages
Cleaning Data2
No ratings yet
Cleaning Data2
39 pages
Big Data - Lab 3
No ratings yet
Big Data - Lab 3
25 pages
How To Work With List Columns
No ratings yet
How To Work With List Columns
104 pages
Statistic and R Programming Lab Exercise
No ratings yet
Statistic and R Programming Lab Exercise
24 pages
Workshop Activity: X Seq y Length
No ratings yet
Workshop Activity: X Seq y Length
3 pages
Week6 Slides Updated
No ratings yet
Week6 Slides Updated
57 pages
14 Clean The Mess
No ratings yet
14 Clean The Mess
77 pages
Datatable
No ratings yet
Datatable
2 pages
2208 351 351m 451 LE4 NAME
No ratings yet
2208 351 351m 451 LE4 NAME
11 pages
Applied Statistics MAT1011
No ratings yet
Applied Statistics MAT1011
22 pages
R Syntax Examples 1
No ratings yet
R Syntax Examples 1
6 pages
Solutions 04
No ratings yet
Solutions 04
25 pages
R Course Own English HS
No ratings yet
R Course Own English HS
70 pages
Curso Básico de Iniciación A La Programación Con R Álvaro Mauricio Bustamante Lozano
No ratings yet
Curso Básico de Iniciación A La Programación Con R Álvaro Mauricio Bustamante Lozano
9 pages
Indexing Exercises
No ratings yet
Indexing Exercises
6 pages
Data Table
No ratings yet
Data Table
2 pages
Practical 1 EDA
No ratings yet
Practical 1 EDA
14 pages
Learn Python through Nursery Rhymes and Fairy Tales: Classic Stories Translated into Python Programs (Coding for Kids and Beginners)
From Everand
Learn Python through Nursery Rhymes and Fairy Tales: Classic Stories Translated into Python Programs (Coding for Kids and Beginners)
Shari Eskenas
5/5 (1)
13 Date
No ratings yet
13 Date
22 pages
14 Date
No ratings yet
14 Date
13 pages
TSLSTM
No ratings yet
TSLSTM
4 pages
SMS 3401 Non Parametric Methods 1 1
No ratings yet
SMS 3401 Non Parametric Methods 1 1
4 pages
EASFA-Strict and Adhered Rules and Regulations For EASFA WhatsApp Group Members
No ratings yet
EASFA-Strict and Adhered Rules and Regulations For EASFA WhatsApp Group Members
13 pages
SMS 3404 Time Series Analysis Ii
No ratings yet
SMS 3404 Time Series Analysis Ii
3 pages
SMS 3404 Time Series Analysis Ii
No ratings yet
SMS 3404 Time Series Analysis Ii
3 pages
SMS 3451 Demographic Techniques
No ratings yet
SMS 3451 Demographic Techniques
4 pages
Chickpea Ceasars Salad Wrap
No ratings yet
Chickpea Ceasars Salad Wrap
3 pages
Itp Foundation For Steel Structure Emerald Project - Approved
No ratings yet
Itp Foundation For Steel Structure Emerald Project - Approved
5 pages
Prof Ed 8 Reviewer
No ratings yet
Prof Ed 8 Reviewer
5 pages
SH1500SB Repair Guide
No ratings yet
SH1500SB Repair Guide
16 pages
DR Prakash Nag MD Medicine NGMC
No ratings yet
DR Prakash Nag MD Medicine NGMC
61 pages
1 s2.0 S2211285523008376 Main
No ratings yet
1 s2.0 S2211285523008376 Main
9 pages
Geography P2 September 2024 Grade 12 MG
No ratings yet
Geography P2 September 2024 Grade 12 MG
11 pages
Esports Team Business Plan by Slidesgo
No ratings yet
Esports Team Business Plan by Slidesgo
9 pages
Msceit
No ratings yet
Msceit
11 pages
Pisa2025 Posttest Qs Science
No ratings yet
Pisa2025 Posttest Qs Science
12 pages
Therapeutic Communication: (Document Subtitle)
No ratings yet
Therapeutic Communication: (Document Subtitle)
9 pages
Foundations of Developmental Psychology Notes
No ratings yet
Foundations of Developmental Psychology Notes
28 pages
3 - Bricks
No ratings yet
3 - Bricks
19 pages
Nexus 117 1 PDF
No ratings yet
Nexus 117 1 PDF
3 pages
GNT Series-Product Brochure
No ratings yet
GNT Series-Product Brochure
36 pages
Types of Building Materials Used in Construction
No ratings yet
Types of Building Materials Used in Construction
4 pages
Report 2402410522 1
No ratings yet
Report 2402410522 1
4 pages
Value Sheet - PreciControl ClinChem Multi 1.05117208922.Lot-410119.Exp-2023-01-31.V288.en
100% (2)
Value Sheet - PreciControl ClinChem Multi 1.05117208922.Lot-410119.Exp-2023-01-31.V288.en
13 pages
C11 MOLECULAR STRUCTURE OF DNA AND RNA Concepts of Genetics 12ed (Brooker)
No ratings yet
C11 MOLECULAR STRUCTURE OF DNA AND RNA Concepts of Genetics 12ed (Brooker)
21 pages
Miconazole Nitrate Clarck
No ratings yet
Miconazole Nitrate Clarck
3 pages
1995 & 1996 Paper On A Multilevel Voltage-Source Inverter With Separate DC Sources For Static VAr Generation
No ratings yet
1995 & 1996 Paper On A Multilevel Voltage-Source Inverter With Separate DC Sources For Static VAr Generation
9 pages
TroCCAP Canine Endo Guidelines V1 2017
No ratings yet
TroCCAP Canine Endo Guidelines V1 2017
68 pages
Nerolac20Sheen20PDS Min Merged
No ratings yet
Nerolac20Sheen20PDS Min Merged
6 pages
Manual Del Kaliburn
No ratings yet
Manual Del Kaliburn
150 pages
Alzheimer S Dementia - 2023 - Zubillaga - Synergistic Interaction Between Cu Overload and A Peptide Promotes
No ratings yet
Alzheimer S Dementia - 2023 - Zubillaga - Synergistic Interaction Between Cu Overload and A Peptide Promotes
2 pages
Infectious Control Measures (Dental Clinic)
No ratings yet
Infectious Control Measures (Dental Clinic)
7 pages
CMM SIDE WINDOWS
No ratings yet
CMM SIDE WINDOWS
11 pages
V2F, V2V Models SV2 Series Safety Shut-Off Valves: Cyber Security Notice
No ratings yet
V2F, V2V Models SV2 Series Safety Shut-Off Valves: Cyber Security Notice
60 pages
Salceda - Experiment 3
No ratings yet
Salceda - Experiment 3
6 pages

Data Cleaning

Uploaded by

Data Cleaning

Uploaded by

DATA WRANGLING, CLEANING AND CLEANSING

FIDELIS MUTUNGA NDUNGE

## Warning: package 'tidyverse' was built under R version 4.2.3

## IdNumber A1Age Sex CaseStatus

## tibble [227 x 13] (S3: tbl_df/tbl/data.frame)

## IdNumber A1Age Sex CaseStatus LotFrontage Alley

#Identify outliers in column

## [1] 300 10 191 350 138 20 30

You might also like