0% found this document useful (0 votes)
18 views8 pages

Da 012307

The document consists of two assignments involving data analysis using Python libraries such as pandas, scikit-learn, and mlxtend. It covers tasks including linear regression, logistic regression, and association rule mining with datasets related to sales, purchases, and transactions. The assignments demonstrate model training, evaluation, and visualization techniques for predictive analytics.

Uploaded by

psb18039
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
18 views8 pages

Da 012307

The document consists of two assignments involving data analysis using Python libraries such as pandas, scikit-learn, and mlxtend. It covers tasks including linear regression, logistic regression, and association rule mining with datasets related to sales, purchases, and transactions. The assignments demonstrate model training, evaluation, and visualization techniques for predictive analytics.

Uploaded by

psb18039
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 8

ASSIGNMENT 1

SET A

# SET A #1 — simple linear regression (TV ad spend -> Sales) on synthetic data.
import pandas as pd  # fixed: original read "Import pandas as pd", a SyntaxError
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Create dataset: 500 rows of uniform random values, reproducible via the seed.
np.random.seed(42)
data_size = 500
df = pd.DataFrame({'ID': np.arange(1, data_size + 1),
                   'TV': np.random.uniform(0, 300, data_size),
                   'Radio': np.random.uniform(0, 100, data_size),
                   'Newspaper': np.random.uniform(0, 50, data_size),
                   'Sales': np.random.uniform(5, 25, data_size)})
df

# Split into training and testing data (70/30).
x = df[['TV']]  # kept as a one-column DataFrame: sklearn expects 2-D features
y = df['Sales']
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Train linear regression model.
model = LinearRegression()
model.fit(X_train, Y_train)

# Make predictions on the held-out split.
Y_pred = model.predict(X_test)

# Evaluate model performance.
mse = mean_squared_error(Y_test, Y_pred)
r2_score = model.score(X_test, Y_test)  # NOTE: name shadows sklearn.metrics.r2_score if ever imported

print("Model Coefficient:", model.coef_)
print("Intercept:", model.intercept_)
print("Mean Squared error:", mse)
print("R2 score:", r2_score)

# Training scatter with the fitted regression line.
plt.scatter(X_train, Y_train)
plt.plot(X_train, model.predict(X_train), color="red")
plt.xlabel("TV")
plt.ylabel("Sales")
plt.show()

#2 — simple linear regression: predict Purchases from Houses on synthetic data.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Build a reproducible synthetic real-estate dataset (500 rows).
np.random.seed(42)
data_size = 500
columns = {
    'ID': np.arange(1, data_size + 1),
    'Flat': np.random.uniform(500, 5000, data_size),
    'Houses': np.random.uniform(1000, 10000, data_size),
    'Purchases': np.random.uniform(1, 100, data_size),
}
df = pd.DataFrame(columns)
df

# 70/30 train/test split on a single predictor.
x = df[['Houses']]
y = df['Purchases']
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Fit ordinary least squares.
model = LinearRegression()
model.fit(X_train, Y_train)

# Predict on the held-out split.
Y_pred = model.predict(X_test)

# Report error metrics on the test set.
mse = mean_squared_error(Y_test, Y_pred)
r2_score = model.score(X_test, Y_test)

print("Model Coefficient:", model.coef_)
print("Intercept:", model.intercept_)
print("Mean Squared error:", mse)
print("R2 score:", r2_score)

# Training scatter plus the fitted line.
plt.scatter(X_train, Y_train, color='red')
plt.plot(X_train, model.predict(X_train))

#3 — logistic regression: predict Purchased from label-encoded Gender.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt  # added: plt is used for the plots below

# Create dataset: a reproducible synthetic "social network ads" style table.
np.random.seed(42)
data_size = 500
df = pd.DataFrame({'UserID': np.arange(1, data_size + 1),
                   'Gender': np.random.choice(['Male', 'Female'], data_size),
                   'Age': np.random.randint(18, 70, data_size),
                   'EstimatedSalary': np.random.randint(20000, 150000, data_size),
                   'Purchased': np.random.choice([0, 1], data_size)})
df  # fixed: original read "Df", which raises NameError

# Convert categorical data into numeric form.
encoder = LabelEncoder()
df['Gender'] = encoder.fit_transform(df['Gender'])

# Split into training and testing datasets (70/30).
x = df[['Gender']]
y = df['Purchased']  # fixed: was df[['Purchased']]; fit expects a 1-D target, not a one-column DataFrame
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Train a logistic regression model.
model = LogisticRegression()
model.fit(X_train, Y_train)

# Make predictions.
Y_pred = model.predict(X_test)

# Evaluate model performance.
accuracy = accuracy_score(Y_test, Y_pred)
conf_matrix = confusion_matrix(Y_test, Y_pred)
class_report = classification_report(Y_test, Y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Class Report:", class_report)

# Training scatter with the model's class predictions overlaid.
plt.scatter(X_train, Y_train)
plt.plot(X_train, model.predict(X_train), color="red")
plt.show()


SET B

#1

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

%matplotlib inline

data = pd.read_csv('Fish.csv')

data

x = data[['Species','Length1', 'Length2', 'Length3', 'Height', 'Width']]

y = data[['Weight']]

encoder = LabelEncoder()

x['Species'] = encoder.fit_transform(df['Species'])

X_train, X_test, Y_train, Y_test = train_test_split(y, x, test_size = 0.3, random_state = 42)

model = LinearRegression()

model.fit(X_train,Y_train)

Y_pred = model.predict(X_test)

mse = mean_squared_error(Y_test, Y_pred)

r2_score = model.score(X_test, Y_test)

print("Model Coefficient:", model.coef_)

print("Intercept:", model.intercept_)

print("Mean Squared error:", mse)

print("R2 score:", r2_score)

plt.scatter(Y_test, Y_pred)

plt.xlabel("actual Weight")
plt.ylabel("Predicted Weight")

plt.title('Actual vs Predicted Fish Weight')

plt.show()

#2 — logistic regression on the iris dataset with a confusion-matrix heatmap.
from sklearn import datasets

iris = datasets.load_iris()
d = pd.DataFrame(data=iris.data, columns=iris.feature_names)
d['species'] = iris.target
# Map the integer targets to readable class names.
d['species'] = d['species'].map({0: 'Iris-setosa', 1: 'Iris-Versicolor', 2: 'Iris-virginica'})
d.head(5)

# Per-species descriptive statistics.
stats = d.groupby('species').describe()
print(stats)

# All feature columns vs the species label, 70/30 split.
x = d.iloc[:, :-1]
y = d['species']
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=42)

from sklearn.linear_model import LogisticRegression

# max_iter raised: the default (100) commonly triggers a ConvergenceWarning on iris.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

from sklearn import metrics
import seaborn as sns

# renamed from "confusion_matrix": that name shadows the confusion_matrix
# function imported from sklearn.metrics earlier in this script.
conf_mat = pd.crosstab(Y_test, Y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(conf_mat, annot=True)
plt.show()


ASSIGNMENT 2

#1
# Read the data, encode the data, mine frequent itemsets and association rules.
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# fixed: transactions were ['<id>', 'Item1, Item2, ...'] — TransactionEncoder
# treated the ID and the entire comma-joined string as single "items". Each
# transaction is now a list of individual items, which is what the encoder expects.
transactions = [['Bread', 'Milk'],
                ['Bread', 'Diaper', 'Beer', 'Eggs'],
                ['Milk', 'Diaper', 'Beer', 'Coke'],
                ['Bread', 'Milk', 'Diaper', 'Beer'],
                ['Bread', 'Milk', 'Diaper', 'Coke']]

# One-hot encode the baskets into a boolean item matrix.
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_array, columns=te.columns_)
df

# Frequent itemsets with support >= 0.2.
freq_items = apriori(df, min_support=0.2, use_colnames=True)
print(freq_items)

# Rules ranked strongest-first by support, then confidence.
rules = association_rules(freq_items, metric='support', min_threshold=0.05)
rules = rules.sort_values(['support', 'confidence'], ascending=[False, False])
print(rules)

#2 — frequent itemsets and association rules on a small grocery basket set.
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

transactions = [
    ['eggs', 'milk', 'bread'],
    ['eggs', 'apple'],
    ['milk', 'bread'],
    ['apple', 'milk'],
    ['milk', 'apple', 'bread'],
]

from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the baskets into a boolean item matrix.
enc = TransactionEncoder()
onehot = enc.fit(transactions).transform(transactions)
df = pd.DataFrame(onehot, columns=enc.columns_)
df

# Itemsets appearing in at least half of the baskets.
freq_items = apriori(df, min_support=0.5, use_colnames=True)
print(freq_items)

# Generate rules and rank them, strongest support/confidence first.
rules = association_rules(freq_items, metric='support', min_threshold=0.05)
rules = rules.sort_values(by=['support', 'confidence'], ascending=[False, False])
print(rules)

You might also like