
Laboratory 2: Introduction to Python library packages to support numerical computations

1. Create ndarrays and perform element-wise operations, indexing and slicing, NumPy arithmetic and statistical functions, and NumPy linear algebra.

import numpy as np
def main():
    # Create NumPy arrays
    array1 = np.array([[1, 2, 3], [4, 5, 6]])
    array2 = np.array([[7, 8, 9], [10, 11, 12]])

    print("Array 1:")
    print(array1)
    print("\nArray 2:")
    print(array2)

    # Element-wise operations
    print("\nElement-wise operations:")
    print("Addition:")
    print(array1 + array2)
    print("Subtraction:")
    print(array1 - array2)
    print("Multiplication:")
    print(array1 * array2)
    print("Division:")
    print(array1 / array2)

    # Indexing and slicing
    print("\nIndexing and slicing:")
    print("Element at (0, 1):", array1[0, 1])
    print("First row:", array1[0, :])
    print("Second column:", array1[:, 1])
    print("Sliced subarray:\n", array1[:, 1:])

    # NumPy arithmetic and statistical functions
    print("\nNumPy arithmetic and statistical functions:")
    print("Sum of array1:", np.sum(array1))
    print("Mean of array2:", np.mean(array2))
    print("Standard deviation of array1:", np.std(array1))
    print("Max value in array2:", np.max(array2))

    # NumPy linear algebra
    print("\nNumPy linear algebra:")
    A = np.array([[1, 2], [3, 4]])
    B = np.array([[5, 6], [7, 8]])

    print("Matrix A:")
    print(A)
    print("\nMatrix B:")
    print(B)
    print("\nMatrix multiplication AB:")
    print(np.dot(A, B))
    print("\nDeterminant of A:")
    print(np.linalg.det(A))
    print("\nInverse of A:")
    print(np.linalg.inv(A))

if __name__ == "__main__":
    main()
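NumPy's linear algebra module can also solve a linear system Ax = b directly, which is generally faster and more numerically stable than multiplying by the inverse. A minimal sketch reusing the 2x2 matrix A defined above (it assumes A is in scope, e.g. the lines are added inside main(); the right-hand side b = [1, 2] is an illustrative choice, not part of the exercise):

    b = np.array([1, 2])
    x = np.linalg.solve(A, b)  # Solve Ax = b without explicitly forming inv(A)
    print("Solution of Ax = b:", x)
    print("Check A @ x:", A @ x)  # Should reproduce b up to floating-point error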

2. Create a DataFrame object (e.g., from a dictionary, a list of tuples, or even NumPy ndarrays) and plot Series and DataFrames.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Create a DataFrame from a dictionary
data = {'Name': ['John', 'Anna', 'Peter', 'Linda'],
        'Age': [28, 24, 35, 32],
        'Score': [85, 90, 78, 92]}
df = pd.DataFrame(data)

print("DataFrame:")
print(df)

# Create a DataFrame from a list of tuples
data = [('John', 28, 85), ('Anna', 24, 90), ('Peter', 35, 78), ('Linda', 32, 92)]
df = pd.DataFrame(data, columns=['Name', 'Age', 'Score'])

print("\nDataFrame from list of tuples: ")


print(df)

# Create a DataFrame from NumPy ndarrays
names = np.array(['John', 'Anna', 'Peter', 'Linda'])
ages = np.array([28, 24, 35, 32])
scores = np.array([85, 90, 78, 92])
df = pd.DataFrame({'Name': names, 'Age': ages, 'Score': scores})

print("\nDataFrame from numpy's ndarrays: ")


print(df)

# Plotting a Series
plt.plot(df['Age'])
plt.xlabel('Index')
plt.ylabel('Age')
plt.title('Age Distribution')
plt.show()

# Plotting a DataFrame
df.plot(kind='bar', x='Name', y='Score')
plt.xlabel('Name')
plt.ylabel('Score')
plt.title('Score Distribution')
plt.show()
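pandas can also plot several columns of the same DataFrame at once. A minimal sketch reusing df from above (the grouped-bar layout is an illustrative choice, not part of the prescribed exercise):

# Plot Age and Score as grouped bars, one group per name
df.plot(kind='bar', x='Name', y=['Age', 'Score'])
plt.title('Age and Score by Name')
plt.show()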

Laboratory 3
Title of the Laboratory Exercise: Data Exploration

a. Write a Python program to compute various summary statistics from a DataFrame.
Use the Iris sample data, which contains information on 150 Iris flowers, 50 each from
three Iris species: Setosa, Versicolour, and Virginica. Each flower is characterized by
five attributes:
• sepal length in centimeters
• sepal width in centimeters
• petal length in centimeters
• petal width in centimeters
• class (Setosa, Versicolour, Virginica)

import pandas as pd
from sklearn.datasets import load_iris
# Load the Iris dataset
iris = load_iris()
# iris = pd.read_csv("iris.csv")  # alternative: load from a local CSV file
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['class'] = iris.target
print("Data overview")
print(df.head())
print("Summary stats")
print(df.describe())
print("counts of each column")
print(df.groupby('class').size())
print("correlation matrix")
print(df.corr())

print(df.tail())
print(df.info())
print(df.dtypes)
print("/n Shape:",df.shape)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a K-Nearest Neighbors classifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Predict on the test set
y_pred = knn.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Classification report
print(classification_report(y_test, y_pred, target_names=iris.target_names))
# Visualizing the Iris dataset
plt.figure(figsize=(8, 6))

# Plot sepal length vs. sepal width, colored by class
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolor='k')
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Sepal Width (cm)')
plt.title('Iris Dataset - Sepal Length vs. Sepal Width')
plt.colorbar(label='Class') # Specify the colorbar label directly
plt.show()
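The choice n_neighbors=3 above is arbitrary. A quick sweep shows how accuracy varies with k; a minimal sketch reusing the train/test split from above (the candidate k values are illustrative):

for k in [1, 3, 5, 7, 9]:
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    print(f"k={k}: accuracy={accuracy_score(y_test, knn_k.predict(X_test)):.2f}")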

Laboratory 4
Title of the Laboratory Exercise: Data Preprocessing

a. Write a Python program to implement different data preprocessing approaches: handling missing values, outliers, duplicate data, aggregation, sampling, discretization, etc.

import pandas as pd
import numpy as np

# Create a sample DataFrame with missing values, outliers, and duplicate data
data = {
'A': [1, 2, np.nan, 4, 5, 100], # Missing value and outlier (100)
'B': [10, np.nan, 30, 40, 50, 10], # Missing value and duplicate (10)
'C': ['x', 'Y', 'Z', np.nan, 'w', 'x'],
'D': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
}

df = pd.DataFrame(data)

# Display the original DataFrame
print("Original DataFrame:")
print(df)
print()
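Before choosing a strategy, it helps to quantify how much data is missing in each column; a minimal check (not part of the prescribed steps):

# Count missing values per column
print(df.isna().sum())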

# Handling Missing Values
print("Handling Missing Values:")

# Drop rows with any missing values
df_dropped = df.dropna()
print("1. Dropping rows with any missing values:")
print(df_dropped)
print()

# Fill missing values with the mean (numeric columns) or mode (categorical columns).
# Note: column 'D' is entirely NaN, so its mean is NaN and it remains unfilled.
df_filled = df.copy()

for col in df_filled.columns:
    if df_filled[col].dtype == 'float64':
        # Fill with the column mean for numeric columns
        df_filled[col] = df_filled[col].fillna(df_filled[col].mean())
    elif df_filled[col].dtype == 'object':
        # Fill with the column mode for categorical columns
        df_filled[col] = df_filled[col].fillna(df_filled[col].mode()[0])

print("2. Filling missing values:")


print(df_filled)
print()

# Handling Outliers
print("Handling Outliers:")

# Cap outliers by winsorization using the 1.5 * IQR fences
def winsorize_series(s):
    q1, q3 = np.percentile(s, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # clip() replaces values below lower_bound with lower_bound
    # and values above upper_bound with upper_bound
    return s.clip(lower=lower_bound, upper=upper_bound)

df_outliers_handled = df_filled.copy()
# Apply winsorization to every float column
for col in df_outliers_handled.select_dtypes(include=['float64']).columns:
    df_outliers_handled[col] = winsorize_series(df_outliers_handled[col])

print("3. Handling outliers by winsorization:")


print(df_outliers_handled)
print()
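Winsorization caps extreme values; an alternative is to drop rows that lie far from the mean. A minimal sketch using a z-score cutoff of 3 on column 'A' (the cutoff and the use of scipy.stats are illustrative choices; on this tiny sample the value 100 may fall inside the cutoff and survive):

from scipy import stats

# Keep only rows of column 'A' within 3 standard deviations of the mean
z = np.abs(stats.zscore(df_filled['A']))
df_z_filtered = df_filled[z < 3]
print(df_z_filtered)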

# Handling Duplicate Data
print("Handling Duplicate Data:")

# Drop duplicates
df_no_duplicates = df_outliers_handled.drop_duplicates()
print("4. Dropping duplicates:")
print(df_no_duplicates)
print()

# Aggregation (example: sum the numeric columns grouped by 'C')
print("Aggregation:")
df_aggregated = df_no_duplicates.groupby('C').agg({'A': 'sum', 'B': 'sum'}).reset_index()
print("5. Aggregated DataFrame:")
print(df_aggregated)
print()

# Sampling (example of random sampling)
print("Sampling:")
# Randomly sample 3 rows
df_sampled = df_no_duplicates.sample(n=3, random_state=42)
print("6. Randomly sampled DataFrame:")
print(df_sampled)
print()

# Discretization (example of binning numeric data)
print("Discretization:")
# Discretize column 'A' into 3 bins using cut
bins = pd.cut(df_aggregated['A'], bins=3, labels=['Low', 'Medium', 'High'])
df_discretized = pd.concat([df_aggregated, bins.rename('A_bin')], axis=1)
print("7. Discretized DataFrame:")
print(df_discretized)
print()
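pd.cut above creates equal-width bins; pd.qcut instead bins by quantiles, so each bin holds roughly the same number of rows. A minimal sketch on an illustrative series (on heavily tied data qcut may need duplicates='drop'):

s = pd.Series([1, 2, 4, 5, 40, 100])
print(pd.qcut(s, q=3, labels=['Low', 'Medium', 'High']))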

b. Write a Python program to implement Principal Component Analysis (PCA) for image data.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_lfw_people

# Load a sample dataset (Labeled Faces in the Wild)
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# Extract the image data
faces = lfw_people.images
n_samples, h, w = faces.shape  # Shape of the image stack: (count, height, width)
X = faces.reshape((n_samples, h * w))  # Flatten each h x w image into one row of pixels

# Compute PCA
n_components = 150 # Number of principal components
pca = PCA(n_components=n_components, svd_solver='randomized', whiten=True).fit(X)

# Project the data into the PCA space
X_pca = pca.transform(X)

# Reconstruct the images with the inverse PCA transformation
X_inverse = pca.inverse_transform(X_pca)  # Map the reduced representation back to pixel space
faces_inverse = X_inverse.reshape((n_samples, h, w))  # Reshape the rows back into images

# Plot original and reconstructed images side by side
plt.figure(figsize=(12, 6))
n_images = 10  # Number of images to display
for i in range(n_images):
    # Original images on the top row
    plt.subplot(2, n_images, i + 1)
    plt.imshow(faces[i], cmap=plt.cm.gray)
    plt.title('Original')
    plt.axis('off')

    # PCA reconstructions on the bottom row
    plt.subplot(2, n_images, i + 1 + n_images)
    plt.imshow(faces_inverse[i], cmap=plt.cm.gray)
    plt.title('PCA Reconstructed')
    plt.axis('off')

plt.suptitle(f"\nPCA Reconstruction of {n_images} Images") Adds a


centered title above the plots.
plt.show()
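How faithful the reconstructions look depends on how much variance the 150 retained components capture, which the fitted PCA object exposes directly. A minimal sketch reusing pca from above:

# Fraction of the total variance captured by the retained components
print("Variance captured:", pca.explained_variance_ratio_.sum())
# Contribution of the first five components individually
print(pca.explained_variance_ratio_[:5])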

Laboratory 5
Title of the Laboratory Exercise: Regression methods to solve problems

a. Write a Python program to implement fitting linear regression models to a dataset.

import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

np.random.seed(0)
X = 2 * np.random.rand(100, 1)  # 100 random values in a (100, 1) array, scaled to the range [0, 2)
y = 4 + 3 * X + np.random.randn(100, 1)  # Linear relationship y = 4 + 3x plus Gaussian noise

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")

print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}") #difference


between the predicted values and the actual values in a regression problem.

print(f"Coefficient of determination (R^2): {r2_score(y_test, y_pred)}")


#R-squared indicates how well the regression model fits the observed data.

plt.scatter(X_test, y_test, color='black', label='Actual data')
plt.plot(X_test, y_pred, color='blue', linewidth=3, label='Fitted line')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Linear Regression Fit')
plt.legend()

plt.show()
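Because the data was generated as y = 4 + 3x, the fitted intercept and coefficient should come out near 4 and 3. The same estimate can be computed in closed form with the normal equation beta = (X^T X)^(-1) X^T y; a minimal sketch reusing X_train and y_train from above:

# Prepend a column of ones so the intercept is estimated as beta[0]
X_b = np.c_[np.ones((len(X_train), 1)), X_train]
beta = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y_train
print("Normal-equation estimate [intercept, slope]:", beta.ravel())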

Laboratory 6
Title of the Laboratory Exercise: Classification
a. Write a Python program to apply a decision tree classifier to the vertebral column dataset.

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Load the vertebral column dataset
vertebral_data = pd.read_csv("/content/drive/MyDrive/column_3C_weka.csv")

# Preprocess the data (assuming the last column is the target variable)
X = vertebral_data.iloc[:, :-1] # Features
y = vertebral_data.iloc[:, -1] # Target variable

# Encode the categorical target variable as numeric labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Create a decision tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Train the classifier on the training data
dt_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = dt_classifier.predict(X_test)

# Inverse-transform the predicted labels back to their original categorical form
y_pred_original = label_encoder.inverse_transform(y_pred)

# Evaluate the classifier's performance
accuracy = accuracy_score(label_encoder.inverse_transform(y_test), y_pred_original)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(label_encoder.inverse_transform(y_test),
                            y_pred_original, target_names=label_encoder.classes_))
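Beyond a single accuracy number, the learned tree can be inspected as text. A minimal sketch reusing dt_classifier and X from above (export_text is a standard sklearn utility):

from sklearn.tree import export_text

# Print the decision rules learned by the tree
print(export_text(dt_classifier, feature_names=list(X.columns)))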

Laboratory 7
Title of the Laboratory Exercise: Cluster Analysis

a. Write a Python program to implement any two clustering algorithms using Python.

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs


from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage

# Generate synthetic data with four well-separated clusters
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# K-Means clustering
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)  # Assign each point in X to one of the four clusters

kmeans_labels = kmeans.labels_
kmeans_centers = kmeans.cluster_centers_

Z = linkage(X, method="ward") #reduces the variance

plt.figure(figsize=(12, 6))

plt.subplot(121)
plt.scatter(X[:, 0], X[:, 1], c=kmeans_labels, cmap='viridis')
plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 1], marker='x', color='red',
s=200, label='Centers')
plt.title('K-Means Clustering')
plt.legend()

plt.subplot(122)
plt.title('Hierarchical Clustering Dendrogram')
dendrogram(Z)
plt.xlabel("Sample Index")

plt.tight_layout()
plt.show()
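The dendrogram only visualizes the hierarchy; to get flat cluster labels comparable to the K-Means ones, the tree can be cut with scipy's fcluster. A minimal sketch reusing Z from above (cutting into 4 clusters to match the generated data):

from scipy.cluster.hierarchy import fcluster

# Cut the dendrogram so that exactly 4 flat clusters remain
hier_labels = fcluster(Z, t=4, criterion='maxclust')
print(np.unique(hier_labels, return_counts=True))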

Laboratory 8
Title of the Laboratory Exercise: Anomaly Detection

a. Write a Python program to apply an anomaly detection approach to multivariate time series data.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Sample multivariate time series data (replace with your own data)
data = pd.DataFrame({
    "timestamp": pd.date_range(start="2023-01-01", periods=100, freq="D"),
    "value1": np.random.randn(100),
    "value2": np.random.randn(100)
})

# Normalize the data
scaler = StandardScaler()
data[["value1", "value2"]] = scaler.fit_transform(data[["value1", "value2"]])

# Train the Isolation Forest model; adjust contamination to the expected anomaly rate
model = IsolationForest(contamination=0.05)
model.fit(data[["value1", "value2"]])

# Predict anomalies (model.predict returns -1 for anomalies and 1 for normal points)
predictions = model.predict(data[["value1", "value2"]])
data['anomaly'] = pd.Series(predictions).apply(lambda x: 1 if x == -1 else 0)
# Print anomalies
anomalies = data[data['anomaly'] == 1]

print("Detected anomalies:")
print(anomalies)

# Plotting (optional)
plt.figure(figsize=(12, 6))
plt.plot(data['timestamp'], data['value1'], label='Value 1')
plt.plot(data['timestamp'], data['value2'], label='Value 2')
plt.scatter(anomalies['timestamp'], anomalies['value1'], color='red',
label='Anomaly')
plt.scatter(anomalies['timestamp'], anomalies['value2'], color='red')
plt.title('Anomaly Detection in Multivariate Time Series')
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend()
plt.show()
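Beyond the binary -1/1 labels, IsolationForest exposes a continuous anomaly score through decision_function (lower means more anomalous), which is useful for ranking points instead of hard-thresholding. A minimal sketch reusing the fitted model and data from above:

# Continuous anomaly scores; the most negative scores are the strongest anomalies
data['score'] = model.decision_function(data[["value1", "value2"]])
print(data.nsmallest(5, 'score')[['timestamp', 'value1', 'value2', 'score']])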
