RStudio Course

Uploaded by

fatichourak48
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PPTX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
36 views11 pages

Rstudio Cours

Uploaded by

fatichourak48
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PPTX, PDF, TXT or read online on Scribd
You are on page 1/ 11

R studio_Com

Kamal ZEHRAOUI
Step 1: Creating an Initial Data Frame
data <- data.frame(
  ID = 1:3,
  Name = c("Alice", "Bob", "Charlie"),
  Age = c(24, 30, 28),
  Gender = as.factor(c("Female", "Male", "Male"))
)
print(data)
Step 2: Adding a New Row
new_row <- data.frame(
  ID = 4,
  Name = "Diana",
  Age = 27,
  Gender = factor("Female", levels = levels(data$Gender))
)
data <- rbind(data, new_row)
print(data)
Step 3: Adding a New Column
data$Occupation <- c("Engineer", "Doctor", "Artist", "Lawyer")
print(data)
• A character data type in R represents plain text (string) data. Each entry is stored as literal text and treated independently. Characters are typically used for variables whose values are free-form rather than drawn from a fixed set (e.g., names, unique identifiers).
• A factor is a categorical variable used to store a fixed set of unique
values (called "levels"). Factors are ideal for data that belongs to a
limited number of categories, like gender, status, or education level. R
treats factors differently from characters because they are stored as
integer values with corresponding labels, which helps in data analysis,
especially with categorical data.
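The slides that follow explore a dataset called data_kamal, which is never constructed here. Below is a minimal, hypothetical stand-in (the Age, Income, Weight, Gender and Department columns are inferred from the later code) so the examples can be run; it also illustrates the character/factor distinction, since factors are stored as integer codes with labels.

# Hypothetical data_kamal data frame (assumed structure, for illustration only)
set.seed(123)
n <- 50
data_kamal <- data.frame(
  Name       = paste0("Person_", 1:n),                                  # character: free-form text
  Age        = sample(20:60, n, replace = TRUE),
  Income     = round(rnorm(n, mean = 3000, sd = 800)),
  Weight     = round(rnorm(n, mean = 70, sd = 12), 1),
  Gender     = factor(sample(c("Female", "Male"), n, replace = TRUE)),  # factor: fixed levels
  Department = factor(sample(c("HR", "IT", "Sales"), n, replace = TRUE)),
  stringsAsFactors = FALSE  # keep Name as character (default in R >= 4.0)
)
class(data_kamal$Name)               # "character"
class(data_kamal$Gender)             # "factor"
levels(data_kamal$Gender)            # the fixed set of categories
head(as.integer(data_kamal$Gender))  # the underlying integer codes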
Basic Data Exploration: Without Packages

# View the structure of the dataset
str(data_kamal)
# Summary statistics for quantitative variables
summary(data_kamal)
# Frequency table for categorical variables
table(data_kamal$Gender)
table(data_kamal$Department)
# Basic descriptive statistics for quantitative variables
mean(data_kamal$Age)
median(data_kamal$Income)
sd(data_kamal$Weight)
Basic Data Exploration: With Packages

# Install and load the dplyr package
install.packages("dplyr")
library(dplyr)
# Summary statistics for quantitative variables
data_kamal %>%
  summarise(across(c(Age, Income, Weight),
                   list(mean = mean, sd = sd, median = median)))
# Frequency tables for categorical variables
data_kamal %>% count(Gender)
data_kamal %>% count(Department)
Univariate Analysis: Without Packages
# Histogram for quantitative variables
hist(data_kamal$Age, main = "Histogram of Age", xlab = "Age")
# Bar plot for categorical variables
barplot(table(data_kamal$Gender), main = "Gender Distribution", xlab = "Gender", ylab = "Count")
Univariate Analysis: With Packages
# Install and load the ggplot2 package
install.packages("ggplot2")
library(ggplot2)
# Histogram for quantitative variables
ggplot(data_kamal, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
  ggtitle("Histogram of Age")
# Bar plot for categorical variables
ggplot(data_kamal, aes(x = Gender)) +
  geom_bar(fill = "lightgreen") +
  ggtitle("Gender Distribution")
Bivariate Analysis: Without Packages
# Scatter plot for quantitative variables
plot(data_kamal$Age, data_kamal$Income, main = "Age vs Income", xlab = "Age", ylab = "Income")
# Boxplot for categorical vs quantitative
boxplot(data_kamal$Income ~ data_kamal$Gender, main = "Income by Gender",
        xlab = "Gender", ylab = "Income")
# Correlation between quantitative variables
cor(data_kamal$Age, data_kamal$Income)
Bivariate Analysis: With Packages
# Scatter plot for quantitative variables
ggplot(data_kamal, aes(x = Age, y = Income)) + geom_point(color = "blue") + ggtitle("Age vs Income")
# Boxplot for categorical vs quantitative
ggplot(data_kamal, aes(x = Gender, y = Income)) + geom_boxplot(fill = "lightgreen") + ggtitle("Income by Gender")
# Correlation matrix for quantitative variables
install.packages("GGally") library(GGally)
ggpairs(data_kamal %>% select(Age, Income, Weight), title = "Correlation Matrix")
Multivariate Analysis
Principal Component Analysis (PCA): Without Packages
# Standardize the quantitative data
pca_data <- scale(data_kamal[, c("Age", "Income", "Weight")])
# Perform PCA
pca_result <- prcomp(pca_data)
summary(pca_result)
# Scree plot
plot(pca_result, type = "l", main = "Scree Plot")
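The variable loadings and the share of variance carried by each component can also be inspected without extra packages; a minimal sketch using the pca_result object created above:

# Loadings: how much each variable contributes to each principal component
pca_result$rotation
# Proportion of variance explained by each component
pca_result$sdev^2 / sum(pca_result$sdev^2)
# Base-R biplot of individuals and variables on the first two components
biplot(pca_result, main = "PCA Biplot (base R)")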
Principal Component Analysis (PCA): With Packages
install.packages("FactoMineR") install.packages("factoextra") library(FactoMineR) library(factoextra)
# PCA
pca_result <- PCA(data_kamal %>% select(Age, Income, Weight), graph = FALSE)
fviz_pca_var(pca_result)
fviz_pca_biplot(pca_result, geom.ind = "point", geom.var = "arrow", col.var = "black", title = "PCA Biplot")
# Scree plot of PCA with FactoMineR
fviz_eig(pca_result, addlabels = TRUE, ylim = c(0, 50), title = "Scree Plot")
# Correlation circle (variables) plot
fviz_pca_var(pca_result,
             col.var = "contrib",                        # Color by contribution of variables
             gradient.cols = c("blue", "red"),
             title = "PCA Variables Contribution")
# PCA individuals (scores) plot
fviz_pca_ind(pca_result,
             col.ind = "cos2",                           # Color by cos2 (quality of representation)
             gradient.cols = c("blue", "green", "red"),
             title = "PCA Individuals (Scores)")
# Contributions of individuals to the first principal component
fviz_contrib(pca_result, choice = "ind", axes = 1, top = 10,
             title = "Top 10 Contributions of Individuals to PC1")
# Contributions of variables to the first principal component
fviz_contrib(pca_result, choice = "var", axes = 1,
             title = "Contributions of Variables to PC1")
Hierarchical Clustering: Without Packages

# Step 1: Standardize the data (optional, but recommended for clustering)
# Using scale() to standardize quantitative variables
data_scaled <- scale(data_kamal[, c("Age", "Income", "Weight")])
# Step 2: Compute the distance matrix
distance_matrix <- dist(data_scaled, method = "euclidean") # Use "manhattan" for Manhattan distance
# Step 3: Perform hierarchical clustering using hclust
hc_result <- hclust(distance_matrix, method = "ward.D2")  # Options: "ward.D2", "single", "complete", "average"
# Step 4: Plot the dendrogram
plot(hc_result, main = "Dendrogram of Hierarchical Clustering", xlab = "", sub = "")
# Step 5: Cut the dendrogram into clusters
data_kamal$Cluster <- cutree(hc_result, k = 3) # k is the number of clusters you want
table(data_kamal$Cluster) # View the number of observations in each cluster
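The cluster boundaries can also be outlined directly on the base-R dendrogram (the analogue of the rectangles drawn by fviz_dend in the next section); a short sketch using the hc_result object above:

# Redraw the dendrogram and draw boxes around the k = 3 clusters
plot(hc_result, main = "Dendrogram of Hierarchical Clustering", xlab = "", sub = "")
rect.hclust(hc_result, k = 3, border = "red")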
Hierarchical Clustering: With Packages
install.packages("cluster")
install.packages("factoextra")
library(cluster)
library(factoextra)

data_scaled <- scale(data_kamal[, c("Age", "Income", "Weight")])


# Euclidean distance matrix
distance_matrix <- dist(data_scaled, method = "euclidean")
hc_agnes <- agnes(data_scaled, method = "ward")  # Options: "ward", "single", "complete", "average"
print(hc_agnes$ac)  # Agglomerative coefficient
# Basic dendrogram
fviz_dend(hc_agnes,
          rect = TRUE, k = 3,       # Adds rectangles around clusters; set k for the number of clusters
          main = "Dendrogram of Hierarchical Clustering",
          palette = "jco",          # Colour palette for clusters
          cex = 0.5)                # Adjust label size
# Cut the dendrogram into clusters
clusters <- cutree(hc_agnes, k = 3)  # Specify the number of clusters
# Add the cluster assignment to your data
data_kamal$Cluster <- as.factor(clusters)

# Visualize clusters in a 2D scatter plot of principal components
fviz_cluster(list(data = data_scaled, cluster = clusters),
             geom = "point", ellipse.type = "convex",
             palette = "jco", main = "Cluster Plot",
             ggtheme = theme_minimal())
MCA: Without Packages
# Convert categorical variables into dummy variables (one-hot encoding)

indicator_matrix <- model.matrix(~ Gender + Department - 1, data = data_kamal)

Use singular value decomposition (SVD) on the indicator matrix to perform MCA.

# Perform SVD on the indicator matrix
svd_result <- svd(scale(indicator_matrix, scale = FALSE))

# Extract the principal components (MCA dimensions)
mca_dimensions <- svd_result$u %*% diag(svd_result$d)

# Basic scatter plot of the first two dimensions

plot(mca_dimensions[, 1], mca_dimensions[, 2], xlab = "Dimension 1", ylab = "Dimension 2", main = "MCA Scatter Plot")
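The importance of each dimension can be gauged from the singular values, mirroring the scree plot produced with factoextra below. This is only a rough sketch under the simplified indicator-matrix SVD used above, not the exact inertia decomposition of textbook MCA:

# Approximate proportion of inertia carried by each dimension
inertia <- svd_result$d^2 / sum(svd_result$d^2)
round(inertia, 3)
# Simple scree plot of the approximate inertia
barplot(inertia, names.arg = paste0("Dim", seq_along(inertia)),
        ylab = "Proportion of inertia", main = "MCA Scree Plot (manual)")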

MCA: With Packages


install.packages("FactoMineR") install.packages("factoextra") library(FactoMineR) library(factoextra)

Ensure all categorical variables are coded as factors. Convert them if necessary.

data_kamal$Gender <- as.factor(data_kamal$Gender)

data_kamal$Department <- as.factor(data_kamal$Department)

# Perform MCA on the categorical variables

mca_result <- MCA(data_kamal[, c("Gender", "Department")], graph = FALSE)

fviz_screeplot(mca_result, addlabels = TRUE, ylim = c(0, 50), title = "MCA Scree Plot")

fviz_mca_var(mca_result,
             repel = TRUE,                               # Avoid overlapping labels
             col.var = "contrib",                        # Color by contributions to the dimensions
             gradient.cols = c("blue", "red"),
             title = "MCA Variable Categories Plot")

fviz_mca_ind(mca_result,
             col.ind = "cos2",                           # Color by quality of representation (cos2)
             gradient.cols = c("blue", "green", "red"),
             repel = TRUE,
             title = "MCA Individuals Plot")

fviz_mca_biplot(mca_result,
                repel = TRUE,                            # Avoid overlapping labels
                geom.ind = "point",
                col.var = "blue", col.ind = "red",
                title = "MCA Biplot")

print(mca_result)
Basic Linear Regression: Without Packages
Simple Linear Regression
# Simple linear regression: Predict Income based on Age
model_simple <- lm(Income ~ Age, data = data_kamal)
# View the summary of the model to see details
summary(model_simple)
Multiple Linear Regression
Convert Categorical Variables
data_kamal$Gender <- as.factor(data_kamal$Gender)
# Multiple linear regression: Predict Income based on Age, Weight, and Gender
model_multiple <- lm(Income ~ Age + Weight + Gender, data = data_kamal)
# Diagnostic plots (residuals, QQ, scale-location, leverage) for the multiple regression model
plot(model_multiple)
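Because Gender is a factor, lm() encodes it automatically as a 0/1 dummy variable (here GenderMale, assuming "Female" is the reference level). A brief sketch, using the model_multiple object fitted above, to inspect the coefficients and the design matrix:

# Coefficient table; the Gender factor appears as a single dummy term
summary(model_multiple)
# Design matrix showing the 0/1 coding generated for Gender
head(model.matrix(model_multiple))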

Linear Regression: With Packages


install.packages("ggplot2") install.packages("broom") library(ggplot2) library(broom)
Simple Linear Regression with Visualization
# Plot Income vs Age with a linear regression line
ggplot(data_kamal, aes(x = Age, y = Income)) +
  geom_point() +                                  # Scatter plot
  geom_smooth(method = "lm", color = "blue") +    # Linear regression line
  labs(title = "Simple Linear Regression of Income on Age", x = "Age", y = "Income")
Multiple Linear Regression with Broom
# Tidy summary of the multiple regression model
model_multiple_tidy <- tidy(model_multiple)
print(model_multiple_tidy)
# Residuals vs Fitted plot
ggplot(data.frame(fitted = fitted(model_multiple), residuals = resid(model_multiple)),
       aes(x = fitted, y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red") +
  labs(title = "Residuals vs Fitted Values", x = "Fitted Values", y = "Residuals")
