RStudio Course
RStudio Course
Kamal ZEHRAOUI
Step 1: Creating an Initial Data Frame
# Build the initial data frame: three people described by an ID, a name, an
# age and a gender. Gender is converted to a factor because it is categorical.
data <- data.frame(
  ID = 1:3,
  Name = c("Alice", "Bob", "Charlie"),
  Age = c(24, 30, 28),
  Gender = as.factor(c("Female", "Male", "Male"))
)
print(data)
Step 2: Adding a New Row
# Build a one-row data frame with the same columns as `data`. The Gender
# value is created as a factor that reuses the levels of data$Gender, so the
# new row and the existing column share one level set.
new_row <- data.frame(
  ID = 4,
  Name = "Diana",
  Age = 27,
  Gender = factor("Female", levels = levels(data$Gender))
)
# Append the new row below the existing rows.
data <- rbind(data, new_row)
print(data)
Step 3: Adding a New Column
# Add an Occupation column; the vector supplies the values in row order.
data[["Occupation"]] <- c("Engineer", "Doctor", "Artist", "Lawyer")
print(data)
• A character data type in R represents plain text or string data. It
stores data as literal text, which means each entry is treated
independently. Characters are typically used for variables where the
values are unique and don’t necessarily repeat (e.g., names, unique
identifiers).
• A factor is a categorical variable used to store a fixed set of unique
values (called "levels"). Factors are ideal for data that belongs to a
limited number of categories, like gender, status, or education level. R
treats factors differently from characters because they are stored as
integer values with corresponding labels, which helps in data analysis,
especially with categorical data.
Hierarchical Clustering Without Packages
# Hierarchical clustering of data_kamal using base R functions only.

# Step 1: Standardize the quantitative variables with scale()
# (optional, but recommended for clustering).
data_scaled <- scale(x = data_kamal[c("Age", "Income", "Weight")])

# Step 2: Compute the pairwise distance matrix.
# Use method = "manhattan" for Manhattan distance.
distance_matrix <- dist(x = data_scaled, method = "euclidean")

# Step 3: Run the hierarchical clustering with hclust().
# Linkage options include "ward.D2", "single", "complete", "average".
hc_result <- hclust(d = distance_matrix, method = "ward.D2")

# Step 4: Draw the dendrogram; the x-axis label and subtitle are blanked.
plot(
  x = hc_result,
  main = "Dendrogram of Hierarchical Clustering",
  sub = "",
  xlab = ""
)

# Step 5: Cut the dendrogram into k clusters (k is the number of clusters you
# want) and store each observation's cluster number in data_kamal.
data_kamal[["Cluster"]] <- cutree(tree = hc_result, k = 3)

# View the number of observations in each cluster.
table(data_kamal[["Cluster"]])
Hierarchical Clustering With Packages
# Install the clustering packages only when they are not already available,
# so re-running this snippet does not reinstall them every time.
# Each statement is on its own line: R cannot parse several calls written on
# one line unless they are separated by ";".
if (!requireNamespace("cluster", quietly = TRUE)) {
  install.packages("cluster")
}
if (!requireNamespace("factoextra", quietly = TRUE)) {
  install.packages("factoextra")
}

# Attach the packages for the calls that follow.
library(cluster)
library(factoextra)
# Visualize clusters in a 2D scatter plot of principal components.
# The call is on its own lines: when it shared a line with the comment above,
# the leading "#" turned the whole call into a comment and nothing was drawn.
# NOTE(review): `clusters` is not defined in the visible part of this file;
# presumably it holds the cluster number of each row of data_scaled (for
# example the result of cutree()) -- confirm before running.
fviz_cluster(
  list(data = data_scaled, cluster = clusters),
  geom = "point",
  ellipse.type = "convex",
  palette = "jco",
  main = "Cluster Plot",
  ggtheme = theme_minimal()
)
MCA Without Packages
# Convert categorical variables into dummy variables (one-hot encoding)
Use singular value decomposition (SVD) on the indicator matrix to perform MCA.
# Perform SVD on the indicator matrix. scale(..., scale = FALSE) only
# subtracts the column means; it does not rescale the columns.
# NOTE(review): `indicator_matrix` is not defined in the visible part of this
# file; presumably it is the one-hot (dummy) encoding of the categorical
# variables -- confirm.
# NOTE(review): an SVD of the centered indicator matrix is a PCA of the dummy
# variables; standard MCA additionally weights by row/column masses -- confirm
# that this simplification is intended.
svd_result <- svd(scale(indicator_matrix, scale = FALSE))

# Extract the principal components (MCA dimensions): the left singular
# vectors multiplied by their singular values. nrow is passed explicitly
# because diag() applied to a single number builds an identity matrix of that
# size rather than a 1 x 1 matrix holding the value.
mca_dimensions <- svd_result$u %*%
  diag(svd_result$d, nrow = length(svd_result$d))

# Scatter plot of the observations on the first two dimensions.
plot(
  mca_dimensions[, 1], mca_dimensions[, 2],
  xlab = "Dimension 1", ylab = "Dimension 2",
  main = "MCA Scatter Plot"
)
Ensure all categorical variables are coded as factors. Convert them if necessary.
# MCA visualizations with factoextra. Every argument sits on its own line so
# that an inline "#" comment only comments out its own remark; when a call is
# written on one line, the first "#" also swallows the remaining arguments
# and the closing parenthesis.
# NOTE(review): `mca_result` is not defined in the visible part of this file;
# presumably it is a fitted MCA object accepted by factoextra -- confirm.

# Scree plot of the MCA dimensions.
fviz_screeplot(
  mca_result,
  addlabels = TRUE,
  ylim = c(0, 50),
  title = "MCA Scree Plot"
)

# Variable categories.
fviz_mca_var(
  mca_result,
  repel = TRUE, # Avoid overlapping labels
  col.var = "contrib", # Color by contributions to the dimensions
  gradient.cols = c("blue", "red"),
  title = "MCA Variable Categories Plot"
)

# Individuals.
fviz_mca_ind(
  mca_result,
  col.ind = "cos2", # Color by quality of representation (cos2)
  gradient.cols = c("blue", "green", "red"),
  repel = TRUE,
  title = "MCA Individuals Plot"
)

# Biplot of individuals and variable categories.
fviz_mca_biplot(
  mca_result,
  repel = TRUE, # Avoid overlapping labels
  geom.ind = "point",
  col.var = "blue",
  col.ind = "red",
  title = "MCA Biplot"
)

print(mca_result)
Basic Linear Regression Without Packages
Simple Linear Regression
# Simple linear regression: Income is the response and Age the only
# predictor; both are read from data_kamal.
model_simple <- lm(
  formula = Income ~ Age,
  data = data_kamal
)

# Display the summary of the fitted model to see its details.
summary(object = model_simple)
Multiple Linear Regression
Convert Categorical Variables
# Make sure Gender is a factor so lm() handles it as a categorical predictor.
data_kamal[["Gender"]] <- as.factor(data_kamal[["Gender"]])

# Multiple linear regression: Income explained by Age, Weight and Gender.
model_multiple <- lm(
  formula = Income ~ Age + Weight + Gender,
  data = data_kamal
)

# Plot the fitted model.
plot(x = model_multiple)