Data Science
Practical No. 1
Aim: Perform basic data pre-processing tasks such as handling missing values and outliers.
Code:-
import pandas as pd
# Load the dataset (read_csv already returns a DataFrame)
df = pd.read_csv("/content/employees.csv")
# Checking for missing values using isnull()
missing_values = df.isnull()
print(missing_values)
# Rows where the Gender column is missing
bool_series = pd.isnull(df["Gender"])
missing_gender_data = df[bool_series]
print(missing_gender_data)
For non-missing values:-
non_missing_values = df.notnull()
print(non_missing_values)
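The aim also mentions outliers, which the code above does not handle. Below is a minimal sketch of outlier detection and treatment using the IQR rule; the numeric column name "Salary" is an assumption about the employees.csv schema and should be replaced with an actual numeric column.
import pandas as pd
df = pd.read_csv("/content/employees.csv")
# IQR rule: flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# "Salary" is a hypothetical column name; substitute a real one
q1 = df["Salary"].quantile(0.25)
q3 = df["Salary"].quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
outliers = df[(df["Salary"] < lower) | (df["Salary"] > upper)]
print(outliers)
# One common treatment: clip outlying values to the IQR fences
df["Salary"] = df["Salary"].clip(lower, upper)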
Practical No. 2
Aim: Apply feature scaling techniques like standardization and normalization to numerical features.
Code:-
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
# Data
df = pd.DataFrame({'Age': [25, 45, 35, 50, 23], 'Income': [50000, 120000, 80000, 110000, 75000]})
# Standardization
df_standardized = df.copy()
df_standardized[['Age', 'Income']] = StandardScaler().fit_transform(df[['Age', 'Income']])
# Normalization
df_normalized = df.copy()
df_normalized[['Age', 'Income']] = MinMaxScaler().fit_transform(df[['Age', 'Income']])
print("Standardized:\n", df_standardized, "\n")
print("Normalized:\n", df_normalized)
Output:-
Practical No. 3
Aim: Perform one-sample and two-sample (Welch) t-tests.
Code:-
x <- c(6.2, 6.6, 7.1, 7.4, 7.6, 7.9, 8, 8.3, 8.4, 8.5, 8.6, 8.8, 8.8, 9.1, 9.2, 9.4, 9.4, 9.7, 9.9, 10.2, 10.4, 10.8, 11.3, 11.9)
# One-sample t-test of H0: mean = 9 (testing x - 9 against 0)
t.test(x - 9, alternative = "two.sided", conf.level = 0.95)
x <- c(418, 421, 421, 422, 425, 427, 431, 434, 437, 439, 446, 447, 448, 453, 454, 463, 465)
y <- c(429, 430, 430, 431, 436, 437, 440, 441, 445, 446, 447)
# Welch two-sample t-test (unequal variances)
test2 <- t.test(x, y, alternative = "two.sided", mu = 0, var.equal = FALSE, conf.level = 0.95)
test2
Output:-
Practical No. 4
Aim: Perform one-way ANOVA and compute critical values and a confidence interval for the error variance.
Code:-
# Observations for three treatment groups
y1 <- c(18.2, 20.1, 17.6, 16.8, 18.8, 19.7, 19.1)
y2 <- c(17.4, 18.7, 19.1, 16.4, 15.9, 18.4, 17.7)
y3 <- c(15.2, 18.8, 17.7, 16.5, 15.9, 17.1, 16.7)
y <- c(y1, y2, y3)
group <- factor(rep(1:3, each = length(y1)))
# Stem-and-leaf display per group
tapply(y, group, stem)
# Per-group summary statistics
tmpfn <- function(x) {
  list(sum = sum(x), mean = mean(x), var = var(x), n = length(x))
}
tapply(y, group, tmpfn)
# One-way ANOVA via a linear model
data <- data.frame(y = y, group = group)
fit <- lm(y ~ group, data)
anova_fit <- anova(fit)
# Treatment and error degrees of freedom
df <- anova_fit[, "Df"]
names(df) <- c("trt", "err")
df
# Critical F values at the 5% and 1% levels
alpha <- c(0.05, 0.01)
qf(alpha, df["trt"], df["err"], lower.tail = FALSE)
# Error sum of squares and a 95% CI for the error variance
anova_fit["Residuals", "Sum Sq"]
anova_fit["Residuals", "Sum Sq"] / qchisq(c(0.025, 0.975), df["err"], lower.tail = FALSE)
Output:-
Practical No. 5
Aim: Fit a simple linear regression model and obtain predictions with confidence intervals.
Code:-
height <- c(102, 117, 105, 141, 135, 115, 138, 114, 137, 100, 131, 119, 115, 121, 113)
weight <- c(61, 46, 62, 54, 60, 69, 51, 50, 46, 64, 48, 56, 64, 48, 59)
# Fit weight on height
student <- lm(weight ~ height)
student
# Predict at height = 199 with a confidence interval
# (note: 199 lies well outside the observed range 100-141, so this is extrapolation)
predict(student, data.frame(height = 199), interval = "confidence")
# Diagnostic plots for the fitted model
plot(student)
Output:-
Practical No. 6
Aim: Perform logistic regression on the iris dataset.
Code:-
# Load dataset
library(datasets)
library(ggplot2)  # provides qplot()
ir_data <- iris
head(ir_data)
str(ir_data)
levels(ir_data$Species)
# The plot below expects a 'prediction' data frame from a fitted logistic
# model, which the original code omitted. A binary subset (versicolor vs.
# virginica) is assumed here so that glm(family = binomial) applies.
ir_sub <- ir_data[ir_data$Species %in% c("versicolor", "virginica"), ]
ir_sub$Species <- factor(ir_sub$Species)
model <- glm(Species ~ Sepal.Length, data = ir_sub, family = binomial)
summary(model)
prediction <- data.frame(
  Sepal.Length = ir_sub$Sepal.Length,
  Predicted.Probability = predict(model, type = "response"),
  Actual.Species = ir_sub$Species
)
# Plot predictions
qplot(
  prediction$Sepal.Length,
  round(prediction$Predicted.Probability),
  col = prediction$Actual.Species,
  xlab = "Sepal Length",
  ylab = "Prediction using Logistic Regression"
)
Output:-
Practical No. 7
Aim: Apply k-means and agglomerative (hierarchical) clustering to the iris dataset.
Code:-
data(iris)
names(iris)
# Drop the Species label for unsupervised clustering
new_data <- subset(iris, select = c(-Species))
new_data
# k-means with 3 clusters
cl <- kmeans(new_data, 3)
cl
# Elbow method: total within-cluster sum of squares for k = 1..15
data <- new_data
wss <- sapply(1:15, function(k) { kmeans(data, k)$tot.withinss })
wss
plot(1:15, wss, type = "b", pch = 19, frame = FALSE, xlab = "Number of clusters K", ylab = "Total within-cluster sum of squares")
# Cluster plot
library(cluster)
clusplot(new_data, cl$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0)
cl$cluster
cl$centers
# Agglomerative (hierarchical) clustering on petal length and width
clusters <- hclust(dist(iris[, 3:4]))
plot(clusters)
clusterCut <- cutree(clusters, 3)
table(clusterCut, iris$Species)
Output:-
Practical No. 8
Aim: Perform PCA on the iris dataset and compare Naive Bayes classification on the original features versus the first principal component.
Code:-
data_iris <- iris[1:4]
# Covariance matrix and its eigen decomposition
cov_data <- cov(data_iris)
print(cov_data)
Eigen_data <- eigen(cov_data)
print(Eigen_data$values)
# PCA on the covariance matrix
PCA_data <- princomp(data_iris, cor = FALSE)
summary(PCA_data)
# Loadings of the first principal component
model2 <- PCA_data$loadings[, 1]
print(model2)
# Scores on the first component (center the data first, as princomp does)
model2_scores <- scale(as.matrix(data_iris), center = TRUE, scale = FALSE) %*% model2
print(head(model2_scores))
# Naive Bayes on the original features vs. the first PC scores
if (!require(e1071)) install.packages("e1071", dependencies = TRUE)
library(e1071)
mod1 <- naiveBayes(iris[, 1:4], iris[, 5])
mod2 <- naiveBayes(model2_scores, iris[, 5])
table(predict(mod1, iris[, 1:4]), iris[, 5])
table(predict(mod2, model2_scores), iris[, 5])
Output:-
Practical No. 9
Code:-
Output:-