TYCS Practical
TYCS Practical
Sr No | Title | Date | Sign
1 Introduction to Excel
4 Hypothesis Testing
8 K-Means Clustering
Steps
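Step 1: Select the cells to check, then go to Home > Conditional Formatting > Highlight Cells Rules > Greater Than.
Step 2: Enter the threshold value (for example, 2000); cells with a value greater than it are highlighted.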
C. Use VLOOKUP function to retrieve information from a different worksheet or table. Steps:
Step 1: Click on an empty cell and type the following formula.
=VLOOKUP(B3, B3:D3,1, TRUE)
Step 2: Fill in the information in the window accordingly and click OK.
# Group the rows by the "city" column and aggregate each group with the mean
# of its "age" column (e.g., finding the average age per city).
# NOTE(review): `df` is not defined in this fragment; presumably a DataFrame
# with "city" and "age" columns loaded earlier — confirm.
grouped_df = df.groupby("city").agg({"age": "mean"})
Code:
# Standardization and normalization
import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

# NOTE(review): `df` is not defined in this fragment; presumably an
# all-numeric DataFrame loaded earlier — confirm.

# Largest absolute value found in the data, then the data shifted and divided
# by that value.
print("Max values")
max_vals = np.max(np.abs(df))
print(max_vals)
# NOTE(review): classic max-abs scaling is `df / max_vals`; the subtraction
# here shifts every value by the maximum first — confirm this is intended.
print((df - max_vals) / max_vals)

# Normalizer rescales each row (sample) independently to unit norm.
print("Normalization")
scaler = Normalizer()
scaled_data = scaler.fit_transform(df)
# fit_transform returns a plain array, so the column labels are re-attached.
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
print(scaled_df.head())

# StandardScaler rescales each column (feature) to zero mean and unit variance.
print("Standardization")
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
print(scaled_df.head())
import pandas as pd
# One-hot encode every object-dtype column of data32.csv and write the
# resulting table to Output.csv.
data = pd.read_csv("data32.csv")

# Columns stored with the generic "object" dtype are treated as categorical.
categorical_features = data.select_dtypes(include="object")
dummies = pd.get_dummies(categorical_features)

# Append the indicator columns, then remove the original categorical columns.
data = pd.concat([data, dummies], axis=1).drop(
    columns=categorical_features.columns
)

data.to_csv("Output.csv")
# Independent two-sample t-test on two simulated groups of exam scores.
import numpy as np
import scipy.stats as stats

# Fixed seed so the simulated scores (and therefore the result) are repeatable.
np.random.seed(42)
scoreA = np.random.normal(loc=70, scale=10, size=30)
scoreB = np.random.normal(loc=75, scale=10, size=30)

t_stat, pvalue = stats.ttest_ind(scoreA, scoreB)
print(f"T-Statistics: {t_stat}\nP-Value: {pvalue}")

# Significance level for the decision.
alpha = 0.05
if pvalue < alpha:
    print("Reject the null hypothesis. There is a significant difference in exam scores.")
else:
    print("Fail to reject the null hypothesis. There is no significant difference in exam scores.")
Output:
Chi-test
import numpy as np
import scipy.stats as stats
# Chi-square test of independence on a 2x2 contingency table of observed
# counts (the printed conclusions refer to gender vs. job satisfaction).
observed_data = np.array([[25, 15], [20, 40]])

# Returns the test statistic, its p-value, the degrees of freedom and the
# table of expected frequencies under independence.
chi2, pvalue, dof, expected = stats.chi2_contingency(observed_data)
print(
    f"Chi-Square Statistic: {chi2}\nPvalue: {pvalue}\n"
    f"Degrees of Freedom: {dof}\nExpected frequency:\n{expected}"
)

# Significance level for the decision.
alpha = 0.05
if pvalue < alpha:
    print("Reject the null hypothesis. There is a significant association between gender and job satisfaction.")
else:
    print("Fail to reject the null hypothesis. Gender and job satisfaction are independent.")
Output:
# Report the F-test result and state the decision at the chosen significance
# level.
# NOTE(review): f_statistic and p_value are computed outside this fragment —
# presumably by a one-way ANOVA such as scipy.stats.f_oneway; confirm.
print("F-statistic:", f_statistic)
print("P-value:", p_value)

# Significance level for the decision.
alpha = 0.05
if p_value < alpha:
    print(
        "Reject null hypothesis: There are significant differences between the means of the groups."
    )
else:
    print(
        "Fail to reject null hypothesis: There are no significant differences between the means of the groups."
    )
Output:-
# Fitted parameters of the model.
# NOTE(review): the [0] and [0][0] indexing presumably means the model was fit
# on a 2-D target with a single feature — confirm against the fitting code.
print(f"Intercept: {model.intercept_[0]}")
print(f"Coefficient: {model.coef_[0][0]}")

# Predict targets for X_test.
y_pred = model.predict(X_test)

# Compare the predictions with y_test: mean squared error and R-squared.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
Output:
# Load the dataset from "wholesale.csv" into `data`. The path is relative, so
# the file is looked up in the current working directory.
data = pd.read_csv("wholesale.csv")
Output:
# Perform PCA: fit on X and project it onto two components, stored in X_r.
# NOTE(review): `PCA` and `X` are defined outside this fragment — presumably
# sklearn.decomposition.PCA and the numeric feature matrix; confirm.
pca = PCA(n_components=2) # Number of components (output dimensions) to keep
X_r = pca.fit_transform(X)
Output:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Output: