0% found this document useful (0 votes)
2 views

Preprocessing code

The document outlines a comprehensive data cleaning and exploratory data analysis (EDA) process for a dataset read from an RDS file. It involves removing duplicates, cleaning column names, handling missing values, extracting phases from titles, and creating new categorical variables based on study size and duration. The final dataset is visualized and analyzed to summarize key insights regarding enrollment and overall study status.

Uploaded by

nafees677
Copyright
© All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
2 views

Preprocessing code

The document outlines a comprehensive data cleaning and exploratory data analysis (EDA) process for a dataset read from an RDS file. It involves removing duplicates, cleaning column names, handling missing values, extracting phases from titles, and creating new categorical variables based on study size and duration. The final dataset is visualized and analyzed to summarize key insights regarding enrollment and overall study status.

Uploaded by

nafees677
Copyright
© All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 11

# Install the helper packages only when they are missing, so re-running the
# script does not reinstall them (and hit the network) every time.
for (pkg in c("janitor", "DataExplorer")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Load required libraries

library(tidyverse)

library(lubridate)

library(stringr)

library(janitor)

library(DataExplorer)

# Import and structure

# Load the raw query results from an RDS file in the working directory.
query_results_raw <- readRDS("query_results_raw.rds")

# Initial data exploration: print the dimensions and a per-column summary

print(dim(query_results_raw))

print(summary(query_results_raw))

# Check for duplicate rows (rows that repeat an earlier row in every column)

duplicates <- query_results_raw[duplicated(query_results_raw), ]

print(paste("Number of duplicate rows:", nrow(duplicates)))

# Remove duplicates

query_results_raw <- distinct(query_results_raw)

# Make column names unique (some duplicates due to database keys and vague naming)
# make.unique() appends ".1", ".2", ... to repeated names.

names(query_results_raw) <- make.unique(names(query_results_raw))


# Clean column names
# NOTE(review): the result of clean_names() is not assigned, so the column
# names of query_results_raw are left unchanged by this call. Later steps
# refer to `name.1` / `name.2`; confirm the intent before assigning it.

clean_names(query_results_raw)

# EDA
# NOTE(review): presumably a quick DataExplorer overview of the raw table.

introduce(query_results_raw)

# Now drop columns where EVERY entry is NA (no data)

# Keep only the columns that hold at least one non-missing value.
query_results_clean <- query_results_raw %>%
  select(where(function(column) !all(is.na(column))))

# Keep the analysis columns, chosen by position; the remaining (metadata)
# columns are dropped.
query_results_clean_2 <- query_results_clean %>%
  select(
    1, 20, 25, 29:35, 37:44, 46:47, 66, 70:75, 78, 82:83, 85:93, 96:97
  )

# Remove three further columns by name.
query_results_clean_3 <- query_results_clean_2 %>%
  select(-target_duration, -status, -number_of_groups)

# Renaming, factoring

# Drop the first `name` column, give the de-duplicated `name.1` / `name.2`
# columns descriptive names, and convert the categorical columns to factors.
# (The original text never closed the mutate() call.)
query_results_clean_4 <- query_results_clean_3 %>%
  select(-name) %>%
  rename(
    site = name.1,
    condition_name = name.2,
    condition_name_lower_case = downcase_name
  ) %>%
  mutate(
    across(
      c(
        nct_id, study_type, overall_status, phase, enrollment_type, source,
        number_of_arms, site, city, state, zip, country,
        minimum_age_unit, maximum_age_unit,
        condition_name, condition_name_lower_case
      ),
      factor
    )
  )

# Split `enrollment` into two columns according to `enrollment_type`; each
# row has a value in at most one of them and NA in the other.
# (The original text never closed the mutate() call.)
query_results_clean_4 <- query_results_clean_4 %>%
  mutate(
    enrollment_actual = if_else(
      enrollment_type == "Actual", enrollment, NA_integer_
    ),
    enrollment_anticipated = if_else(
      enrollment_type == "Anticipated", enrollment, NA_integer_
    )
  )

# EVALUATE: phase - lots of missing data, which shouldn't be the case

# phase has 2,605 unlabeled studies; sometimes the title/abstract contains this

# info but it was not entered into CT.gov; could be extracted for fuller dataset

# Distinct (phase, official_title) pairs for studies whose phase is recorded
# as "Not Applicable"; used to inspect which titles mention a phase.
# The former group_by(official_title) did not change the selected rows and
# only left the result grouped, so it is omitted.
phase_analysis <- query_results_clean_4 %>%
  filter(phase == "Not Applicable") %>%
  select(phase, official_title) %>%
  distinct()

### placeholder for phase mining new variable `phase_extracted` ###

# Implement phase extraction from title

#' Extract a trial phase from a study title
#'
#' Finds the first "phase <n>" mention in each title, where <n> is an arabic
#' digit 1-4 or a roman numeral i-iv (case-insensitive, optionally followed
#' by an "a"/"b" suffix such as "Phase IIa"), and normalises it to
#' "Phase 1" ... "Phase 4". Only the first phase token is used, so a title
#' such as "Phase I/II" yields "Phase 1".
#'
#' @param title Character vector (or factor) of titles; may contain `NA`.
#' @return Character vector the same length as `title`; `NA` where no phase
#'   is mentioned.
extract_phase <- function(title) {
  title <- tolower(as.character(title))

  # Longest roman numerals first so "iii" is not read as "i"; the trailing
  # \\b stops words such as "phase info" from matching "phase i".
  pattern <- "\\bphase\\s+(iv|iii|ii|i|[1-4])[ab]?\\b"
  match_pos <- regexpr(pattern, title, perl = TRUE)
  has_match <- !is.na(match_pos) & match_pos > 0L

  # regmatches() returns only the matched elements; reduce each match to
  # its numeral.
  token <- sub(pattern, "\\1", regmatches(title, match_pos), perl = TRUE)

  roman_to_arabic <- c(i = "1", ii = "2", iii = "3", iv = "4")
  is_roman <- token %in% names(roman_to_arabic)
  token[is_roman] <- unname(roman_to_arabic[token[is_roman]])

  phase <- rep(NA_character_, length(title))
  phase[has_match] <- sprintf("Phase %s", token)
  phase
}

# Add `phase_extracted`: for studies recorded as "Not Applicable", use the
# phase mined from the title; otherwise keep the recorded phase.
# (The original text never closed the mutate() call.)
query_results_clean_4 <- query_results_clean_4 %>%
  mutate(
    phase_extracted = ifelse(
      phase == "Not Applicable",
      # Applied one title at a time so the helper is only ever handed a
      # single string.
      vapply(
        as.character(official_title),
        extract_phase,
        character(1),
        USE.NAMES = FALSE
      ),
      as.character(phase)
    ),
    phase_extracted = factor(phase_extracted)
  )

# View() needs an interactive session; skip it when run as a batch script.
if (interactive()) {
  View(query_results_clean_4)
}

# Value replacement

# Replace missing values with defaults: 0 for the secondary/other outcome
# counts, 1 for the primary outcome count, "1" for the number of arms and
# FALSE for the logical flags.
# (The original text never closed the mutate() call.)
query_results_clean_5 <- query_results_clean_4 %>%
  mutate(
    number_of_other_outcomes_to_measure =
      replace_na(number_of_other_outcomes_to_measure, 0),
    is_fda_regulated_device = replace_na(is_fda_regulated_device, FALSE),
    is_fda_regulated_drug = replace_na(is_fda_regulated_drug, FALSE),
    number_of_secondary_outcomes_to_measure =
      replace_na(number_of_secondary_outcomes_to_measure, 0),
    # number_of_arms was converted to a factor earlier; make sure "1" is a
    # level before using it as the replacement value.
    number_of_arms = replace_na(fct_expand(number_of_arms, "1"), "1"),
    has_expanded_access = replace_na(has_expanded_access, FALSE),
    number_of_primary_outcomes_to_measure =
      replace_na(number_of_primary_outcomes_to_measure, 1)
  )

# Making a comprehensive study duration variable (in months and years)

# Study duration from start_date to completion_date, in months and years.
# The lubridate interval is only a temporary helper column and is dropped
# at the end of the same pipeline.
# (The original text never closed the mutate() call.)
query_results_clean_final <- query_results_clean_5 %>%
  mutate(
    duration_interval = interval(start_date, completion_date),
    duration_months = time_length(duration_interval, unit = "month"),
    duration_years = time_length(duration_interval, unit = "year")
  ) %>%
  select(-duration_interval)

# EVALUATE: actual_duration - some missing and used primary endpoint as end


# not usually the case

# Distinct per-study duration fields, for comparing `actual_duration` with
# the computed durations.
# The former group_by(official_title) did not change the selected rows and
# only left the result grouped, so it is omitted.
duration_analysis <- query_results_clean_final %>%
  select(
    nct_id, official_title, actual_duration, duration_months, duration_years
  ) %>%
  distinct()

# Step 1: Create a new variable 'overall_status_binary'

# Collapse overall_status into "Completed" / "Not Completed" / "In Progress".
# The "Not Completed" literal was previously wrapped across lines, which put
# newlines inside the string so it never equalled "Not Completed".
query_results_clean_final_transformed <- query_results_clean_final %>%
  mutate(
    overall_status_binary = case_when(
      overall_status %in% c("Completed", "Approved for marketing") ~
        "Completed",
      overall_status %in% c(
        "Withdrawn", "Terminated", "Suspended", "No longer available"
      ) ~ "Not Completed",
      # Assuming any other status (including a missing one) is "In Progress"
      TRUE ~ "In Progress"
    )
  )

# Distinct (nct_id, official_title, overall_status_binary) rows, for
# checking how each study was classified.
# The former group_by(official_title) did not change the selected rows and
# only left the result grouped, so it is omitted.
status_analysis <- query_results_clean_final_transformed %>%
  select(nct_id, official_title, overall_status_binary) %>%
  distinct()

# Filter out "in progress" to make binary

# Keep only the two terminal outcomes so the status is binary, then store it
# as a factor.
query_results_clean_final_transformed <-
  query_results_clean_final_transformed %>%
  filter(overall_status_binary != "In Progress") %>%
  mutate(overall_status_binary = as.factor(overall_status_binary))

# Confirm the remaining levels
levels(query_results_clean_final_transformed$overall_status_binary)

# Remove the descriptive / location / raw columns that are no longer needed,
# de-duplicate within each nct_id, and convert to a base data.frame.
query_results_clean_final_transformed <-
  query_results_clean_final_transformed %>%
  select(
    -c(
      acronym, baseline_population, brief_title, official_title,
      source, site, zip,
      condition_name, condition_name_lower_case,
      state, country,
      enrollment, enrollment_type,
      minimum_age_num, minimum_age_unit,
      maximum_age_num, maximum_age_unit,
      overall_status, why_stopped
    )
  ) %>%
  group_by(nct_id) %>%
  distinct() %>%
  data.frame() %>%
  # Drops whichever column is first at this point.
  # NOTE(review): presumably nct_id; confirm, since this is positional.
  select(-1)

# 1. Advanced Missing Value Analysis

# Identify columns with a high percentage of missing values

# Percentage of missing values in each column.
missing_percentage <- colSums(is.na(query_results_clean_final_transformed)) /
  nrow(query_results_clean_final_transformed) * 100

# Columns with more than 50% missing data. (The tail of this comment used to
# sit on its own line as bare text, which does not parse.)
high_missing_cols <- names(missing_percentage[missing_percentage > 50])

print(high_missing_cols)

# 2. Detect and Handle Outliers

# Detect outliers in numeric columns using the IQR method and remove them

# Names of all numeric columns in the data set.
numeric_cols <- query_results_clean_final_transformed %>%
  select(where(is.numeric)) %>%
  colnames()

# Drop rows holding an IQR-rule outlier (per boxplot.stats()) in any numeric
# column. if_all() keeps a row only when every numeric column is in range;
# the previous if_any() kept a row as soon as one column was in range, so
# almost no outliers were removed. NA values are never flagged and are kept.
query_results_clean_final_transformed <-
  query_results_clean_final_transformed %>%
  filter(if_all(all_of(numeric_cols), ~ !(. %in% boxplot.stats(.)$out)))

# Handle outliers in enrollment and duration

# Cap enrollment and duration at their 99th percentile: values above the
# percentile are replaced by it, everything else (including NA) is left
# unchanged. pmin() against the percentile replaces the three copy-pasted
# if_else() expressions. (The original text never closed the mutate() call.)
query_results_clean_final_transformed <-
  query_results_clean_final_transformed %>%
  mutate(
    across(
      c(enrollment_actual, enrollment_anticipated, duration_months),
      ~ pmin(.x, quantile(.x, 0.99, na.rm = TRUE, names = FALSE))
    )
  )

# 3. Further Feature Engineering

# Create a feature for study size category

# Classify each study by enrollment: <= 50 "Small", <= 200 "Medium",
# > 200 "Large". Whichever of the actual / anticipated columns is populated
# decides; a comparison against the NA column never satisfies a branch on its
# own, and rows with no enrollment at all get NA.
# (The original text never closed the case_when() and mutate() calls.)
query_results_clean_final_transformed <-
  query_results_clean_final_transformed %>%
  mutate(
    study_size_category = case_when(
      enrollment_actual <= 50 | enrollment_anticipated <= 50 ~ "Small",
      enrollment_actual <= 200 | enrollment_anticipated <= 200 ~ "Medium",
      enrollment_actual > 200 | enrollment_anticipated > 200 ~ "Large",
      TRUE ~ NA_character_
    )
  )

# Create a feature for study duration category

# Classify each study by duration in months: <= 12 "Short-term",
# <= 36 "Medium-term", > 36 "Long-term"; NA when the duration is missing.
# (The original text never closed the case_when() and mutate() calls.)
query_results_clean_final_transformed <-
  query_results_clean_final_transformed %>%
  mutate(
    study_duration_category = case_when(
      duration_months <= 12 ~ "Short-term",
      duration_months <= 36 ~ "Medium-term",
      duration_months > 36 ~ "Long-term",
      TRUE ~ NA_character_
    )
  )

# 4. Address Potential Data Inconsistencies

# Ensure binary or logical columns are consistently formatted (TRUE/FALSE, Yes/No, etc.)

# Make logical columns complete and put the outcome levels in a fixed order.
# (The tail of a wrapped comment used to sit here as bare text, and the
# mutate() call was never closed.)
query_results_clean_final_transformed <-
  query_results_clean_final_transformed %>%
  mutate(
    # Replace NA with FALSE for logical columns
    across(where(is.logical), ~ replace_na(., FALSE)),
    # Ensure consistent factor levels
    overall_status_binary = fct_relevel(
      overall_status_binary, c("Completed", "Not Completed")
    )
  )

# 5. Group and Summarize Data for Better Insights

# Mean enrollment and duration, plus the number of rows, for each study size
# category. (The original text never closed the summarise() call.)
summary_by_size <- query_results_clean_final_transformed %>%
  group_by(study_size_category) %>%
  summarise(
    mean_enrollment_actual = mean(enrollment_actual, na.rm = TRUE),
    mean_enrollment_anticipated = mean(enrollment_anticipated, na.rm = TRUE),
    mean_duration_months = mean(duration_months, na.rm = TRUE),
    count = n()
  )

print(summary_by_size)

# 6. Further Quality Checks and Transformation

# Detect and handle duplicate entries


# Remove any remaining duplicate rows and order the data by completion date.
# The sorted result is assigned back; previously the arrange() output was
# discarded, so the stored data was never sorted.
query_results_clean_final_transformed <-
  query_results_clean_final_transformed %>%
  distinct() %>%
  arrange(completion_date)

# View() needs an interactive session; skip it when run as a batch script.
if (interactive()) {
  View(query_results_clean_final_transformed)
}

# 7. Visualization for Data Exploration

# Box plot of actual enrollment for each binary study outcome, filled by
# outcome.
ggplot(
  data = query_results_clean_final_transformed,
  mapping = aes(
    x = overall_status_binary,
    y = enrollment_actual,
    fill = overall_status_binary
  )
) +
  geom_boxplot() +
  scale_fill_manual(
    values = c("Completed" = "lightblue", "Not Completed" = "salmon")
  ) +
  labs(
    title = "Enrollment Actual by Overall Status",
    x = "Overall Status",
    y = "Enrollment Actual"
  ) +
  theme_minimal()

# Final EDA

# Show the column names and the structure of the final data set.
colnames(query_results_clean_final_transformed)

str(query_results_clean_final_transformed)

# EDA
# The calls below run exploratory helpers on the final data set, one after
# another. NOTE(review): presumably all from DataExplorer, each producing a
# summary table or plot; confirm against the package documentation.

introduce(query_results_clean_final_transformed)

plot_intro(query_results_clean_final_transformed)

plot_missing(query_results_clean_final_transformed)

plot_bar(query_results_clean_final_transformed)

plot_histogram(query_results_clean_final_transformed)

plot_qq(query_results_clean_final_transformed)
plot_correlation(query_results_clean_final_transformed)

# Box plots broken down by the binary study outcome.
plot_boxplot(query_results_clean_final_transformed, by = "overall_status_binary")

You might also like