# Preprocessing code
# Install the helper packages used below, but only when they are missing.
# The first call originally read `packages("janitor")`, which is not a base R
# function; `install.packages()` is the intended call (as on the next line).
# Guarding with requireNamespace() avoids re-downloading on every run.
for (pkg in c("janitor", "DataExplorer")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(tidyverse)
library(lubridate)
library(stringr)
library(janitor)
library(DataExplorer)
# Print the raw table's dimensions first, then its per-column summary.
invisible(lapply(
  list(dim, summary),
  function(describe) print(describe(query_results_raw))
))
# Remove duplicates
# NOTE(review): no de-duplication call is visible directly under this heading
# in this excerpt -- confirm the step was not lost.
# Make column names unique (some duplicates due to database keys and vague naming)
# NOTE(review): the result of clean_names() is not assigned here, so as written
# query_results_raw itself is left unchanged -- confirm an assignment or pipe
# head is not missing.
clean_names(query_results_raw)
# EDA
# introduce() is called on the raw table (presumably DataExplorer::introduce,
# since DataExplorer is loaded above).
introduce(query_results_raw)
# Keep only columns that are not entirely NA.
# NOTE(review): no data argument / pipe source is visible for the two select()
# calls below -- presumably they belong to a pipeline whose head is outside
# this excerpt.
select(where(~ !all(is.na(.))))
# Keep a hand-picked subset of columns by position; the indices depend on the
# column order of the incoming data.
select(c(1, 20, 25, 29, 30, 31, 32, 33, 34, 35, 37:44, 46:47, 66, 70:75, 78, 82:83, 85:93, 96:97))
# Renaming, factoring
# NOTE(review): the data frame feeding this pipeline is not visible in this
# excerpt. The steps below drop `name`, then rename name.1 -> site,
# name.2 -> condition_name and downcase_name -> condition_name_lower_case.
# NOTE(review): janitor::clean_names() would normally rewrite dotted names such
# as name.1 -- confirm these column names match the actual input.
select(-name) %>%
rename(
site = name.1,
condition_name = name.2,
condition_name_lower_case = downcase_name
) %>%
# Convert each listed column to a factor, in place under its existing name.
mutate(
nct_id = factor(nct_id),
study_type = factor(study_type),
overall_status = factor(overall_status),
phase = factor(phase),
enrollment_type = factor(enrollment_type),
source = factor(source),
number_of_arms = factor(number_of_arms),
site = factor(site),
city = factor(city),
state = factor(state),
zip = factor(zip),
country = factor(country),
minimum_age_unit = factor(minimum_age_unit),
maximum_age_unit = factor(maximum_age_unit),
condition_name = factor(condition_name),
condition_name_lower_case = factor(condition_name_lower_case)
# NOTE(review): the closing parenthesis of this mutate() is not visible in
# this excerpt.
# NOTE(review): the lines below are fragments of several statements; their
# opening/closing lines are not visible in this excerpt.
mutate(
# phase has 2,605 unlabeled studies; sometimes the title/abstract contains this
# info but it was not entered into CT.gov; could be extracted for fuller dataset
# Group by official_title and keep distinct rows.
# NOTE(review): no ungroup() is visible after this group_by() -- confirm the
# grouping is not meant to persist.
group_by(official_title) %>%
distinct()
# NOTE(review): this `if`/`return` looks like the interior of a helper function
# (possibly extract_phase) that returns `phase` unchanged when it is not NA;
# the enclosing definition is not visible. `if` expects a scalar condition, so
# confirm it is applied one value at a time.
if (!is.na(phase)) {
return(phase)
# NOTE(review): presumably builds phase_extracted from extract_phase() on the
# title with as.character(phase) as the alternative, then converts it to a
# factor -- the surrounding expression is not visible; confirm.
mutate(
extract_phase(official_title),
as.character(phase)),
phase_extracted = factor(phase_extracted)
# Open the intermediate table in the data viewer (typically only useful in an
# interactive session). query_results_clean_4 is defined outside this excerpt.
View(query_results_clean_4)
# Value replacement
# Fill missing outcome counts: other and secondary outcomes default to 0,
# primary outcomes default to 1.
# NOTE(review): presumably the default of 1 reflects an assumption that every
# study has at least one primary outcome -- confirm.
mutate(
number_of_other_outcomes_to_measure = replace_na(number_of_other_outcomes_to_measure, 0),
number_of_secondary_outcomes_to_measure =
replace_na(number_of_secondary_outcomes_to_measure, 0),
number_of_primary_outcomes_to_measure =
replace_na(number_of_primary_outcomes_to_measure, 1)
# NOTE(review): the pipe source and the closing parenthesis of this mutate()
# are not visible in this excerpt.
# NOTE(review): the next lines are fragments; the body of this mutate() and the
# data feeding select()/group_by() are not visible in this excerpt.
mutate(
# Drop the duration_interval column.
select(-duration_interval)
# Group by official_title and keep distinct rows (no ungroup() visible).
group_by(official_title) %>%
distinct()
# Derive overall_status_binary via case_when().
# NOTE(review): the case_when() conditions are not visible here -- as shown it
# has no branches.
mutate(overall_status_binary = case_when(
))
group_by(official_title) %>%
distinct()
# Store overall_status_binary as a factor on the final table.
query_results_clean_final_transformed$overall_status_binary <-
as.factor(query_results_clean_final_transformed$overall_status_binary)
# Confirm
levels(query_results_clean_final_transformed$overall_status_binary)
# Group by nct_id, keep distinct rows, convert to a base data.frame, then drop
# the first column by position.
# NOTE(review): the pipe source is not visible, and select(-1) depends on
# column order -- confirm which column is being removed.
group_by(nct_id) %>%
distinct() %>%
data.frame() %>%
select(-1)
# high_missing_cols is defined outside this excerpt.
print(high_missing_cols)
# Detect outliers in numeric columns using the IQR method and remove them
# NOTE(review): only the tails of the expressions involving
# enrollment_anticipated and duration_months are visible; the IQR logic itself
# is outside this excerpt.
mutate(
enrollment_anticipated),
duration_months)
# Derive study_size_category with case_when(); values that match no rule
# become NA_character_.
# NOTE(review): the size thresholds are not visible in this excerpt.
mutate(
study_size_category = case_when(
TRUE ~ NA_character_
# Derive study_duration_category with case_when(): duration_months <= 12 maps
# to "Short-term"; values that match no rule become NA_character_.
# NOTE(review): any further duration rules and the closing parentheses are not
# visible in this excerpt.
mutate(
study_duration_category = case_when(
duration_months <= 12 ~ "Short-term",
TRUE ~ NA_character_
# Ensure binary or logical columns are consistently formatted (TRUE/FALSE, Yes/No, etc.)
# NOTE(review): the body of this mutate() is not visible in this excerpt.
mutate(
# Count rows per study_size_category.
# NOTE(review): the pipe source, the assignment target (presumably
# summary_by_size, printed below) and the closing parenthesis of summarise()
# are not visible.
group_by(study_size_category) %>%
summarise(
count = n()
print(summary_by_size)
distinct()
# Open the final table in the data viewer (typically only useful in an
# interactive session).
View(query_results_clean_final_transformed)
# Tail of a ggplot2 boxplot.
# NOTE(review): the ggplot()/aes() head is not visible, and the trailing `+`
# after theme_minimal() expects a further layer that is not shown.
geom_boxplot() +
theme_minimal() +
# Final EDA
# List the column names and the structure of the final transformed table.
colnames(query_results_clean_final_transformed)
str(query_results_clean_final_transformed)
# EDA
# Run the exploratory helpers on the final table (presumably all from
# DataExplorer, loaded above): an overview via introduce()/plot_intro(), a
# missing-value plot, bar charts, histograms, QQ plots and a correlation plot.
introduce(query_results_clean_final_transformed)
plot_intro(query_results_clean_final_transformed)
plot_missing(query_results_clean_final_transformed)
plot_bar(query_results_clean_final_transformed)
plot_histogram(query_results_clean_final_transformed)
plot_qq(query_results_clean_final_transformed)
plot_correlation(query_results_clean_final_transformed)
# Boxplots split by the binary status outcome.
plot_boxplot(query_results_clean_final_transformed, by = "overall_status_binary")