DATA SCIENCE LAB MANUAL

MODULE-3

1. Train a regularized logistic regression classifier on the Iris dataset (https://archive.ics.uci.edu/ml/machine-learning-databases/iris/ or the inbuilt iris dataset) using sklearn. Train the model with the hyperparameter C = 1e4 and report the best classification accuracy.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with StandardScaler and LogisticRegression with regularization
pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=1e4, max_iter=1000))

# Train the model
pipeline.fit(X_train, y_train)

# Calculate the accuracy on the testing set
accuracy = pipeline.score(X_test, y_test)
print("Classification accuracy:", accuracy)
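In LogisticRegression, C is the inverse of the regularization strength, so C = 1e4 means very weak regularization. As a quick illustration (the alternative C values below are chosen arbitrarily for demonstration, not taken from the exercise), a small cross-validation sweep shows how C affects accuracy:

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

X, y = load_iris(return_X_y=True)

# Smaller C means stronger L2 regularization; these values are illustrative
for C in [0.01, 1.0, 1e4]:
    model = make_pipeline(StandardScaler(), LogisticRegression(C=C, max_iter=1000))
    scores = cross_val_score(model, X, y, cv=5)
    print(f"C={C}: mean 5-fold CV accuracy = {scores.mean():.3f}")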


2. Train an SVM classifier on the Iris dataset using sklearn. Try different kernels and the associated hyperparameters. Train the model with the following set of hyperparameters: RBF kernel, gamma = 0.5, one-vs-rest classifier, no feature normalization. Also try C = 0.01, 1, 10. For this set of hyperparameters, find the best classification accuracy along with the total number of support vectors on the test data.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set of hyperparameters to try
hyperparameters = [
    {'kernel': 'rbf', 'gamma': 0.5, 'C': 0.01},
    {'kernel': 'rbf', 'gamma': 0.5, 'C': 1},
    {'kernel': 'rbf', 'gamma': 0.5, 'C': 10},
]

best_accuracy = 0
best_model = None
best_support_vectors = None

# Train SVM models with different hyperparameters and find the best accuracy
for params in hyperparameters:
    model = SVC(kernel=params['kernel'], gamma=params['gamma'], C=params['C'],
                decision_function_shape='ovr')
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    # n_support_ holds the number of support vectors per class; sum for the total
    support_vectors = model.n_support_.sum()
    print(f"For hyperparameters: {params}, Accuracy: {accuracy}, "
          f"Total Support Vectors: {support_vectors}")
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_support_vectors = support_vectors

print("\nBest accuracy:", best_accuracy)
print("Total support vectors on test data:", best_support_vectors)


MODULE-4

Consider the following dataset. Write a program to demonstrate the working of the decision-tree-based ID3 algorithm. (sklearn's DecisionTreeClassifier implements an optimized CART algorithm; setting criterion='entropy' gives the information-gain splitting used by ID3.)
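The dataset, as encoded in the program below:

Price  Maintenance  Capacity  Airbag  Profitable
Low    Low          2         No      1
Low    Med          4         Yes     1
Low    Low          4         No      1
Low    Med          4         No      0
Low    High         4         No      0
Med    Med          4         No      0
Med    Med          4         Yes     1
Med    High         2         Yes     0
Med    High         5         No      1
High   Med          4         Yes     1
High   Med          2         Yes     1
High   High         2         Yes     0
High   High         5         Yes     1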
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from io import StringIO
from IPython.display import Image
import pydotplus

# Define the dataset
data = {
    'Price': ['Low', 'Low', 'Low', 'Low', 'Low', 'Med', 'Med', 'Med', 'Med',
              'High', 'High', 'High', 'High'],
    'Maintenance': ['Low', 'Med', 'Low', 'Med', 'High', 'Med', 'Med', 'High',
                    'High', 'Med', 'Med', 'High', 'High'],
    'Capacity': ['2', '4', '4', '4', '4', '4', '4', '2', '5', '4', '2', '2', '5'],
    'Airbag': ['No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes',
               'Yes', 'Yes', 'Yes'],
    'Profitable': [1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1]
}
df = pd.DataFrame(data)

# Convert categorical variables into numerical ones
df = pd.get_dummies(df, columns=['Price', 'Maintenance', 'Airbag'])
# Capacity is stored as strings; convert it to integers so sklearn can use it
df['Capacity'] = df['Capacity'].astype(int)

# Separate features and target variable
X = df.drop('Profitable', axis=1)
y = df['Profitable']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier; criterion='entropy' uses information gain, as in ID3
clf = DecisionTreeClassifier(criterion='entropy')

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Predict on the testing data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Visualize the decision tree
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data, filled=True, rounded=True,
                special_characters=True, feature_names=X.columns)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
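If Graphviz and pydotplus are not available, sklearn's built-in export_text renders the same trained tree as plain text; a minimal sketch:

from sklearn.tree import export_text

# Plain-text rendering of the trained tree; no Graphviz dependency required
print(export_text(clf, feature_names=list(X.columns)))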

Consider the dataset spiral.txt (https://bit.ly/2Lm75Ly). The first two columns in the dataset correspond to the coordinates of each data point. The third column corresponds to the actual cluster label. Compute the Rand index for the following methods:

 K-means clustering

 Single-link hierarchical clustering

 Complete-link hierarchical clustering

Also visualize the dataset and determine which algorithm will be able to recover the true clusters.

import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt

# Load the dataset
data = np.loadtxt("Spiral.txt", delimiter=",", skiprows=1)
X = data[:, :2]      # Features (the point coordinates)
y_true = data[:, 2]  # Actual cluster labels

# Visualize the dataset
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis')
plt.title('True Clusters')
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()

# K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans_clusters = kmeans.fit_predict(X)

# Single-link hierarchical clustering
single_link = AgglomerativeClustering(n_clusters=3, linkage='single')
single_link_clusters = single_link.fit_predict(X)

# Complete-link hierarchical clustering
complete_link = AgglomerativeClustering(n_clusters=3, linkage='complete')
complete_link_clusters = complete_link.fit_predict(X)

# Compute the (adjusted) Rand index for each method
rand_index_kmeans = adjusted_rand_score(y_true, kmeans_clusters)
rand_index_single_link = adjusted_rand_score(y_true, single_link_clusters)
rand_index_complete_link = adjusted_rand_score(y_true, complete_link_clusters)

print("Rand Index for K-means Clustering:", rand_index_kmeans)
print("Rand Index for Single-link Hierarchical Clustering:", rand_index_single_link)
print("Rand Index for Complete-link Hierarchical Clustering:", rand_index_complete_link)

# This code computes the Adjusted Rand Index for each clustering method and
# visualizes the true clusters. The Adjusted Rand Index ranges from -1 to 1,
# where 1 indicates perfect agreement with the true clusters; the method with
# the higher score is better at recovering them. Because the spiral clusters
# are intertwined and non-convex, single-link hierarchical clustering (which
# chains nearby points together) typically recovers them, while K-means and
# complete-link clustering do not.
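Strictly speaking, adjusted_rand_score computes the chance-adjusted Rand index. If the plain Rand index asked for in the question is required, sklearn (version 0.24 or later) also provides rand_score; a minimal sketch, reusing the cluster labels computed above:

from sklearn.metrics import rand_score

# Plain (unadjusted) Rand index, which ranges from 0 to 1
print("Plain Rand Index for K-means:", rand_score(y_true, kmeans_clusters))
print("Plain Rand Index for Single-link:", rand_score(y_true, single_link_clusters))
print("Plain Rand Index for Complete-link:", rand_score(y_true, complete_link_clusters))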

MODULE-5

Mini Project – Simple web scraping of social media

import requests
from bs4 import BeautifulSoup

# URL of the Instagram profile you want to scrape
url = 'https://www.instagram.com/openai/'

# Send a GET request to the URL
response = requests.get(url)
print(response.status_code)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all post elements (class names like 'v1Nh3' are Instagram-internal
    # and change often, so this selector may return nothing)
    posts = soup.find_all('div', class_='v1Nh3')

    # Extract data from each post
    for post in posts:
        # Extract post link
        post_link = post.find('a')['href']
        # Extract post image URL
        image_url = post.find('img')['src']
        print(f"Post Link: {post_link}")
        print(f"Image URL: {image_url}")
        print("------")
else:
    print("Failed to retrieve data from Instagram")
