0% found this document useful (0 votes)
24 views · 4 pages

Ex 3

The document outlines a series of tasks involving data manipulation and analysis using R programming. It includes reading and cleaning GCSE data, comparing modified CSV files, working with a nations dataset to analyze life expectancy and GDP, and analyzing the Titanic dataset for missing values and surname extraction. Each task utilizes libraries such as dplyr and tidyr for data processing.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
24 views · 4 pages

Ex 3

The document outlines a series of tasks involving data manipulation and analysis using R programming. It includes reading and cleaning GCSE data, comparing modified CSV files, working with a nations dataset to analyze life expectancy and GDP, and analyzing the Titanic dataset for missing values and surname extraction. Each task utilizes libraries such as dplyr and tidyr for data processing.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/4

Task 1: Reading & Cleaning GCSE Data

CopyEdit

# Task 1: read the GCSE file, drop incomplete rows, and label Gender.

# Load necessary library
library(dplyr)

# Read the data, assuming tab-separated values and no header row.
# The string "-1" is read as NA.
gcse_data <- read.table(
  "Gcsemv.txt",
  header = FALSE,
  sep = "\t",
  na.strings = "-1"
)

# Assign column names (positional, because the file is read without a header)
colnames(gcse_data) <- c(
  "School_ID", "Student_ID", "Gender", "Written_Paper", "Coursework"
)

# Summary of the data before any rows are removed
summary(gcse_data)

# Capture the gender codes from the raw column BEFORE dropping rows, so the
# code -> label mapping does not depend on which rows survive na.omit().
# sort() drops NA, so a missing Gender never becomes a level.
gender_codes <- sort(unique(gcse_data$Gender))
if (length(gender_codes) != 2L) {
  stop(
    "Expected exactly 2 distinct Gender codes, found ",
    length(gender_codes),
    call. = FALSE
  )
}

# Handling missing values: remove every row with at least one NA
gcse_data <- gcse_data %>% na.omit()

# Converting categorical variables.
# The smaller code is labelled "Boy" and the larger "Girl", as in the original
# script. NOTE(review): confirm this matches the dataset's codebook.
gcse_data$Gender <- factor(
  gcse_data$Gender,
  levels = gender_codes,
  labels = c("Boy", "Girl")
)

# Display cleaned data
head(gcse_data)

Task 2: Comparing Modified myiris.csv File

CopyEdit

# Task 2: compare the original and the modified iris CSV in several ways.

# Load required packages
library(dplyr)

library(compareDF)

# Read original and modified CSV
original_iris <- read.csv("myiris.csv")
modified_iris <- read.csv("myiris_modified.csv")

# Compare using different methods

# comparedf() is exported by the arsenal package (compareDF's own function is
# compare_df()), so the call is namespace-qualified to make it resolve.
comp_result <- arsenal::comparedf(original_iris, modified_iris)
print(comp_result)

# diffdf() lives in the diffdf package, which is not attached above.
diff_result <- diffdf::diffdf(original_iris, modified_iris)
print(diff_result)

# dplyr::all_equal() is deprecated as of dplyr 1.1.0; base all.equal() returns
# TRUE or a character description of the differences.
all.equal(original_iris, modified_iris)

identical(original_iris, modified_iris) # If FALSE, files differ

Task 3: Working with nations Dataset

CopyEdit

# Task 3 setup: attach dplyr and load the nations table from disk.
library(dplyr)

nations <- read.csv("nations.csv")

# (a) Keep only the 2016 rows, then narrow to the four columns used by the
# life-expectancy questions below.
longevity <- select(
  filter(nations, Year == 2016),
  Country, LifeExpectancy, IncomeGroup, Region
)

# (b) Among "High income" rows, sort ascending by life expectancy and show
# the first 10, i.e. the shortest-lived high-income countries.
head(
  arrange(
    filter(longevity, IncomeGroup == "High income"),
    LifeExpectancy
  ),
  10
)

# (c) Rows in either of the two regions whose life expectancy lies in the
# closed interval [75, 80].
longevity %>%
  filter(
    Region %in% c("North America", "Europe & Central Asia") &
      between(LifeExpectancy, 75, 80)
  )

# (d) 20 longest life expectancies plus US ranking
#
# Rank every country first (1 = longest life expectancy; ties share a rank,
# rows with missing life expectancy get NA) so the United States row carries
# its actual position instead of being appended without one.
ranked <- longevity %>%
  arrange(desc(LifeExpectancy)) %>%
  mutate(Rank = min_rank(desc(LifeExpectancy)))

top20 <- ranked %>% head(20)

us_rank <- ranked %>% filter(Country == "United States")

# distinct() prevents the United States from appearing twice when it is
# already one of the top 20 rows.
bind_rows(top20, us_rank) %>% distinct()

# (e) Total GDP by income group and year
#
# GDP is summed per (Year, IncomeGroup) with missing values ignored.
# .groups = "drop" returns an ungrouped result, so later verbs are not
# silently applied per Year, and it suppresses the regrouping message.
features <- nations %>%
  group_by(Year, IncomeGroup) %>%
  summarise(Total_GDP = sum(GDP, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(Year), desc(Total_GDP))

# Show the result, as the other sub-tasks do
features

# (f) Per-year spread of life expectancy: the yearly maximum and minimum
# (missing values ignored) and the difference between them.
nations %>%
  group_by(Year) %>%
  summarise(
    Max_LifeExp = max(LifeExpectancy, na.rm = TRUE),
    Min_LifeExp = min(LifeExpectancy, na.rm = TRUE)
  ) %>%
  mutate(LifeExp_Range = Max_LifeExp - Min_LifeExp)

# (g) Total GDP by region over time
#
# GDP is summed per (Year, Region) with missing values ignored and divided by
# 1e12 to express it in trillions. .groups = "drop" returns an ungrouped
# result instead of one still grouped by Year.
nations %>%
  group_by(Year, Region) %>%
  summarise(
    Total_GDP_Trillion = sum(GDP, na.rm = TRUE) / 1e12,
    .groups = "drop"
  )

# (h) Join `nations` to `nations2` and calculate total CO2 emissions
nations2 <- read.csv("nations2.csv")

# Keep every row of `nations`; rows without a match in `nations2` get NA in
# the added columns.
# NOTE(review): if `nations2` also has a Region column, the join will suffix
# both copies (.x / .y) and the group_by() below will fail — confirm schema.
merged_data <- left_join(nations, nations2, by = c("Country", "Year"))

# Sum CarbonDioxide per (Year, Region), ignoring NA, and scale by 1e9.
# .groups = "drop" returns an ungrouped result instead of one still grouped
# by Year.
merged_data %>%
  group_by(Year, Region) %>%
  summarise(
    Total_CO2_Gigatonnes = sum(CarbonDioxide, na.rm = TRUE) / 1e9,
    .groups = "drop"
  )


Task 4: Titanic Dataset Analysis

CopyEdit

# Task 4: missing-value handling and surname extraction on the Titanic data.

# Load required packages
library(dplyr)

library(tidyr)

# Load Titanic dataset
titanic <- read.csv("titanic.csv")

# (a) Number of samples missing age values
sum(is.na(titanic$Age))

# (b) Replace missing Fare with median Fare of class & embarkment
#
# The median is computed within each (Pclass, Embarked) group, ignoring NA,
# and used only where Fare is missing. ungroup() is required so `titanic`
# does not stay grouped for the steps that follow.
titanic <- titanic %>%
  group_by(Pclass, Embarked) %>%
  mutate(Fare = coalesce(Fare, median(Fare, na.rm = TRUE))) %>%
  ungroup()

# (c) Extract surnames from passenger names
#
# Everything from the first comma onward is removed; names without a comma
# are left unchanged.
titanic$Surname <- sub(",.*", "", titanic$Name)

# Display the dataset
head(titanic[c("Name", "Surname")])

You might also like