
20

pip install pandas numpy scikit-learn matplotlib seaborn


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the Boston housing dataset
# (load_boston was removed in scikit-learn 1.2; see the note after this experiment)
boston = load_boston()
data = pd.DataFrame(boston.data, columns=boston.feature_names)
data['PRICE'] = boston.target

# Display the first few rows of the dataset


print(data.head())

# Exploratory Data Analysis (EDA)


plt.figure(figsize=(10, 6))
sns.heatmap(data.corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Prepare the data for linear regression


X = data.drop('PRICE', axis=1) # Features
y = data['PRICE'] # Target variable
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model


model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")


print(f"R² Score: {r2:.2f}")

# Visualize the results


plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs Predicted Prices')
plt.show()
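Note: load_boston() was removed in scikit-learn 1.2. If the import above fails, a minimal sketch of an alternative is to pull the same data from OpenML (this assumes internet access and that the "boston" dataset is still hosted there); the rest of the experiment is unchanged.

# Alternative loader (sketch): fetch the Boston data from OpenML
from sklearn.datasets import fetch_openml

boston = fetch_openml(name="boston", version=1, as_frame=True)
data = boston.frame.astype(float)                # some columns arrive as categorical
data = data.rename(columns={"MEDV": "PRICE"})    # the OpenML target column is MEDV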

21 A
import pandas as pd
import numpy as np
from scipy import stats

# Load the datasets


uci_diabetes = pd.read_csv('uci_diabetes.csv') # replace with the correct file path
pima_diabetes = pd.read_csv('pima_diabetes.csv') # replace with the correct file path

def univariate_analysis(dataset):
    univariate_stats = {}
    for column in dataset.columns:
        univariate_stats[column] = {
            'Mean': dataset[column].mean(),
            'Median': dataset[column].median(),
            'Mode': dataset[column].mode()[0],
            'Variance': dataset[column].var(),
            'Standard Deviation': dataset[column].std(),
            'Skewness': dataset[column].skew(),
            'Kurtosis': dataset[column].kurt()
        }
    return pd.DataFrame(univariate_stats)

# Univariate analysis for both datasets


uci_univariate = univariate_analysis(uci_diabetes)
pima_univariate = univariate_analysis(pima_diabetes)

print("UCI Diabetes Univariate Analysis:\n", uci_univariate)


print("\nPima Diabetes Univariate Analysis:\n", pima_univariate)

21 B
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Let's assume "Outcome" is the target variable for logistic regression


# Define features and target
X_uci = uci_diabetes.drop(columns='Outcome')
y_uci = uci_diabetes['Outcome']

X_pima = pima_diabetes.drop(columns='Outcome')
y_pima = pima_diabetes['Outcome']

# Bivariate: Linear Regression


def linear_regression(X, y):
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    return model.summary()

# Bivariate: Logistic Regression


def logistic_regression(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    log_model = LogisticRegression(max_iter=1000)
    log_model.fit(X_train, y_train)
    y_pred = log_model.predict(X_test)

    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Accuracy Score: ", accuracy_score(y_test, y_pred))

# Linear Regression for UCI and Pima datasets


print("UCI Diabetes Linear Regression:\n", linear_regression(X_uci, y_uci))
print("Pima Diabetes Linear Regression:\n", linear_regression(X_pima, y_pima))

# Logistic Regression for UCI and Pima datasets


print("UCI Diabetes Logistic Regression:")
logistic_regression(X_uci, y_uci)

print("\nPima Diabetes Logistic Regression:")


logistic_regression(X_pima, y_pima)

21 C
# Multiple Regression
def multiple_regression(X, y):
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    return model.summary()

# Perform multiple regression analysis for both datasets


print("UCI Diabetes Multiple Regression:\n", multiple_regression(X_uci, y_uci))
print("Pima Diabetes Multiple Regression:\n", multiple_regression(X_pima, y_pima))

21 D
# Compare Univariate results
print("Comparing Univariate Results:\n")
print("UCI Diabetes Stats:\n", uci_univariate)
print("\nPima Diabetes Stats:\n", pima_univariate)

# Compare Bivariate and Multiple Regression outputs by analyzing the summaries printed earlier
EXPERIMENT 5
27
import numpy as np
from collections import Counter

# Define the Euclidean distance function


def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((np.array(point1) - np.array(point2)) ** 2))

# KNN function
def knn_classify(data, labels, query_point, k=3):
    # Calculate distances between the query point and all points in the dataset
    distances = []
    for i, point in enumerate(data):
        distance = euclidean_distance(point, query_point)
        distances.append((distance, labels[i]))

    # Sort distances and select the k nearest neighbors
    distances.sort(key=lambda x: x[0])
    k_nearest_labels = [label for _, label in distances[:k]]

    # Get the most common label among the nearest neighbors
    most_common = Counter(k_nearest_labels).most_common(1)
    return most_common[0][0]

# Example usage
if __name__ == "__main__":
    # Example dataset (points and their labels)
    data = [
        [2, 3],
        [1, 1],
        [4, 4],
        [6, 6],
        [8, 8]
    ]
    labels = ["A", "A", "B", "B", "A"]

    # Query point
    query_point = [3, 3]

    # Number of neighbors
    k = 3

    # Classify the query point
    predicted_label = knn_classify(data, labels, query_point, k)
    print(f"The predicted label for the query point {query_point} is: {predicted_label}")

28
pip install scikit-learn
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load a sample dataset (e.g., Iris dataset)


data = datasets.load_iris()
X = data.data # Features
y = data.target # Target labels

# Split the dataset into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train the Decision Tree Classifier


clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict the test set results


y_pred = clf.predict(X_test)

# Calculate and print the accuracy


accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Decision Tree Classifier: {accuracy * 100:.2f}%")

# Predict a new query point (example point from the Iris dataset)
query_point = [5.1, 3.5, 1.4, 0.2] # You can change this to any sample
predicted_class = clf.predict([query_point])
print(f"The predicted class for the query point {query_point} is:
{data.target_names[predicted_class][0]}")
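As a small optional extension (a sketch, assuming matplotlib is installed alongside scikit-learn), the fitted tree can be drawn with sklearn's plot_tree, using the feature and class names from the loaded Iris dataset.

# Sketch: visualize the trained decision tree
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(12, 8))
plot_tree(clf, feature_names=data.feature_names, class_names=data.target_names, filled=True)
plt.show()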

29
import numpy as np

# Calculate the Euclidean distance between two points


def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((np.array(point1) - np.array(point2)) ** 2))

# Function to find the closest clusters


def find_closest_clusters(clusters):
    min_distance = float('inf')
    closest_pair = (0, 1)
    for i in range(len(clusters)):
        for j in range(i + 1, len(clusters)):
            # Calculate the minimum distance between clusters[i] and clusters[j]
            for point1 in clusters[i]:
                for point2 in clusters[j]:
                    distance = euclidean_distance(point1, point2)
                    if distance < min_distance:
                        min_distance = distance
                        closest_pair = (i, j)
    return closest_pair

# Hierarchical clustering function


def hierarchical_clustering(data, num_clusters=2):
    # Start with each point as its own cluster
    clusters = [[point] for point in data]

    # Loop until we reach the desired number of clusters
    while len(clusters) > num_clusters:
        # Find the closest clusters
        i, j = find_closest_clusters(clusters)

        # Merge clusters i and j
        clusters[i].extend(clusters[j])
        del clusters[j]

    return clusters

# Sample data points


data = [
[1, 2],
[2, 3],
[3, 4],
[8, 7],
[8, 8],
[25, 80]
]

# Set the desired number of clusters


num_clusters = 2
# Perform hierarchical clustering
clusters = hierarchical_clustering(data, num_clusters)

# Output the clusters


for idx, cluster in enumerate(clusters):
    print(f"Cluster {idx + 1}: {cluster}")

30
import numpy as np

# Calculate Euclidean distance between two points


def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((np.array(point1) - np.array(point2)) ** 2))

# Find all neighbors within epsilon distance of a point


def find_neighbors(data, point_idx, epsilon):
    neighbors = []
    for idx, point in enumerate(data):
        if euclidean_distance(data[point_idx], point) < epsilon:
            neighbors.append(idx)
    return neighbors

# DBSCAN function
def dbscan(data, epsilon, min_points):
    # Initialize all points as unvisited
    visited = [False] * len(data)
    clusters = [-1] * len(data)  # -1 means noise
    cluster_id = 0

    # Iterate through each point
    for point_idx in range(len(data)):
        if visited[point_idx]:
            continue

        # Mark the point as visited
        visited[point_idx] = True

        # Find neighbors
        neighbors = find_neighbors(data, point_idx, epsilon)

        # Mark as noise if it has fewer neighbors than min_points
        if len(neighbors) < min_points:
            clusters[point_idx] = -1
        else:
            # Otherwise, create a new cluster and expand it
            cluster_id += 1
            clusters[point_idx] = cluster_id
            i = 0
            while i < len(neighbors):
                neighbor_idx = neighbors[i]

                if not visited[neighbor_idx]:
                    visited[neighbor_idx] = True
                    new_neighbors = find_neighbors(data, neighbor_idx, epsilon)
                    if len(new_neighbors) >= min_points:
                        neighbors.extend(new_neighbors)

                # Add the point to the cluster if it's not already in one
                if clusters[neighbor_idx] == -1:
                    clusters[neighbor_idx] = cluster_id
                i += 1

    return clusters

# Example data points


data = [
[1, 2],
[2, 2],
[2, 3],
[8, 7],
[8, 8],
[25, 80]
]

# Parameters
epsilon = 2
min_points = 2

# Perform DBSCAN clustering


clusters = dbscan(data, epsilon, min_points)

# Output the clusters


for idx, cluster_id in enumerate(clusters):
    print(f"Point {data[idx]} -> Cluster {cluster_id}")

31
import numpy as np

# Function to perform PCA


def pca(data, num_components):
    # Step 1: Standardize the dataset by centering it around the mean
    mean = np.mean(data, axis=0)
    centered_data = data - mean

    # Step 2: Calculate the covariance matrix
    covariance_matrix = np.cov(centered_data, rowvar=False)

    # Step 3: Compute eigenvalues and eigenvectors
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

    # Step 4: Sort eigenvalues and eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    sorted_eigenvalues = eigenvalues[sorted_indices]
    sorted_eigenvectors = eigenvectors[:, sorted_indices]

    # Step 5: Select the top 'num_components' eigenvectors
    selected_eigenvectors = sorted_eigenvectors[:, :num_components]

    # Step 6: Transform the data to the new subspace
    reduced_data = np.dot(centered_data, selected_eigenvectors)

    return reduced_data, sorted_eigenvalues[:num_components], selected_eigenvectors

# Example dataset (5 samples, 3 features)


data = np.array([
[4.9, 3.0, 1.4],
[4.7, 3.2, 1.3],
[4.6, 3.1, 1.5],
[5.0, 3.6, 1.4],
[5.4, 3.9, 1.7]
])

# Set the number of components to reduce to (e.g., 2)


num_components = 2

# Perform PCA
reduced_data, eigenvalues, eigenvectors = pca(data, num_components)

# Output the results


print("Reduced Data:")
print(reduced_data)
print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors (Principal Components):")
print(eigenvectors)
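An optional sanity check (a sketch, assuming scikit-learn is installed): sklearn's PCA on the same array should report the same eigenvalues as explained_variance_, and the projected coordinates should match up to the sign of each component, which is an expected ambiguity in PCA.

# Sketch: compare against scikit-learn's PCA
from sklearn.decomposition import PCA

sk_pca = PCA(n_components=num_components)
print(sk_pca.fit_transform(data))   # may differ from reduced_data by a sign per column
print(sk_pca.explained_variance_)   # should match the eigenvalues above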
