Data Warehousing and Data Mining
3. The License Agreement terms will open. Read it thoroughly and click on “I Agree”.
4. Check the "Just Me" option and click Next.
def sampling(data):
    print("Displaying the first five records of the table without sampling.")
    print(data.head())
    print("A sample of size 3 is randomly selected (without replacement) from the original data.")
    sample = data.sample(n=3)
    print(sample)
    print("Randomly select 1% of the data (without replacement) and display the selected samples.")
    sample = data.sample(frac=0.01, random_state=1)
    print(sample)
    print("A sampling with replacement to create a sample whose size is equal to 1% of the entire data.")
    sample = data.sample(frac=0.01, replace=True, random_state=1)
    print(sample)
def remove_duplicate(data):
    dups = data.duplicated()
    print(f"Number of duplicate rows = {dups.sum()}")
    # Inspect two example rows for comparison.
    print(data.loc[[11, 28]])
    print(f'Number of rows before discarding duplicates = {data.shape[0]}')
    data2 = data.drop_duplicates()
    print(f'Number of rows after discarding duplicates = {data2.shape[0]}')
def outlier(data):
    data2 = data.drop(['Class'], axis=1)
    data2['Bare Nuclei'] = pd.to_numeric(data2['Bare Nuclei'])
    # Standardise each attribute to a Z-score.
    Z = (data2 - data2.mean()) / data2.std()
    print(Z[20:25])
    print(f'Number of rows before discarding outliers = {Z.shape[0]}')
    # Keep only rows where all 9 attribute values lie within (-3, 3] standard deviations.
    Z2 = Z.loc[((Z > -3).sum(axis=1) == 9) & ((Z <= 3).sum(axis=1) == 9), :]
    print(f'Number of rows after discarding outliers = {Z2.shape[0]}')
def remove_missing(data):
    print(f'Number of rows in original data = {data.shape[0]}')
    data = data.dropna()
    print(f'Number of rows after discarding missing values = {data.shape[0]}')
def replace_missing_value_by_median(data):
    data2 = data['Bare Nuclei']
    print('Before replacing missing values:')
    print(data2[20:25])
    data2 = data2.fillna(data2.median())
    print("\nAfter replacing missing values by median:")
    print(data2[20:25])
def noise_handle(data):
    # Drop the identifier column and mark '?' entries as missing values.
    data = data.drop(['Sample code'], axis=1)
    data = data.replace('?', np.nan)
    print("Number of instances = %d" % (data.shape[0]))
    print("Number of attributes = %d" % (data.shape[1]))
    print('Number of missing values:')
    for col in data.columns:
        print("\t%s: %d" % (col, data[col].isna().sum()))
    print("To further preprocess select option:\n"
          "0. Exit\n"
          "1. Replace missing value by median\n"
          "2. Remove missing value\n"
          "3. Handle outlier\n"
          "4. Remove duplicate\n"
          "5. Sampling\n"
          "6. Discretization:")
    option = int(input())
    while option != 0:
        if option == 1:
            replace_missing_value_by_median(data)
        elif option == 2:
            remove_missing(data)
        elif option == 3:
            outlier(data)
        elif option == 4:
            remove_duplicate(data)
        elif option == 5:
            sampling(data)
        elif option == 6:
            discretization(data)
        else:
            print("Enter correct choice")
        print("Select your option again:")
        option = int(input())
def view(data):
    print(data.head())
    print('Number of instances = %d' % (data.shape[0]))
    print('Number of attributes = %d' % (data.shape[1]))
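For reference, a minimal driver is sketched below. The functions above appear to target the Breast Cancer Wisconsin dataset; the file URL and the full list of column names are assumptions (only 'Sample code', 'Bare Nuclei' and 'Class' are taken from the code itself), so adjust them to your own copy of the data.
# Minimal driver sketch (assumed data source and column names).
import numpy as np
import pandas as pd

data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/'
                   'breast-cancer-wisconsin/breast-cancer-wisconsin.data', header=None)
data.columns = ['Sample code', 'Clump Thickness', 'Uniformity of Cell Size',
                'Uniformity of Cell Shape', 'Marginal Adhesion',
                'Single Epithelial Cell Size', 'Bare Nuclei',
                'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']
view(data)            # show basic shape information
noise_handle(data)    # replace '?' with NaN and open the preprocessing menu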
LAB 3: Implementing Apriori Algorithm
Source Code:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from IPython.display import display, HTML
def toy_dataset():
    data = [
        ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
        ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
        ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
        ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
        ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']
    ]
    print("Do you want to view the raw data? (yes/no)")
    choice = input()
    if choice.lower() == 'yes':
        print("Raw Data:")
        print(data)
    # One-hot encode the transactions into a boolean DataFrame.
    te = TransactionEncoder()
    te_ary = te.fit(data).transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    print("Do you want to view the encoded data? (yes/no)")
    choice = input()
    if choice.lower() == 'yes':
        print("Encoded Data:")
        display(HTML(df.to_html()))
    return df
def frequent_itemset(data):
    print("Enter the value of minimum support threshold:")
    support = float(input())
    frequent_itemsets = apriori(data, min_support=support, use_colnames=True)
    print("Do you want to view frequent itemsets generated by Apriori? (yes/no)")
    choice = input()
    if choice.lower() == 'yes':
        print("Frequent itemsets:")
        display(HTML(frequent_itemsets.to_html()))
    return frequent_itemsets
def association_rule(frequent_itemsets):
    print("Enter your metric of interest ('confidence' or 'lift'):")
    choice = input()
    if choice == 'confidence':
        print("Enter minimum confidence threshold value:")
        min_confidence = float(input())
        rule = association_rules(frequent_itemsets, metric="confidence",
                                 min_threshold=min_confidence)
    elif choice == 'lift':
        print("Enter minimum lift threshold value:")
        min_lift = float(input())
        rule = association_rules(frequent_itemsets, metric="lift", min_threshold=min_lift)
    else:
        print("Invalid choice.")
        return
    print("Do you want to view the learned association rules? (yes/no)")
    choice = input()
    if choice == 'yes':
        print(rule.drop(['leverage', 'conviction'], axis=1))

if __name__ == '__main__':
    df = toy_dataset()
    if df is not None:
        frequent_itemsets = frequent_itemset(df)
        association_rule(frequent_itemsets)
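As a quick hand check of the supports Apriori should report on this toy data with a minimum support of 0.6: Kidney Beans occurs in all 5 transactions (support 5/5 = 1.0), Eggs in 4 of 5 (0.8), and Onion, Milk and Yogurt each in 3 of 5 (0.6), so all of these survive as frequent 1-itemsets, while items such as Dill (1/5 = 0.2) are pruned.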
LAB 4: Implementing FP-growth
Source Code:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules
from IPython.display import display_html
def toy_dataset():
    data = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
            ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
            ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
            ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
            ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]
    print("Do you want to view the raw data? (yes/no)")
    choice = input()
    if choice == 'yes':
        print("Raw Data:")
        print(data)
    te = TransactionEncoder()
    te_ary = te.fit(data).transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    print("Do you want to view the encoded data? (yes/no)")
    choice = input()
    if choice == 'yes':
        print("Encoded Data:")
        display_html(df)
    return df
def frequent_itemset(data):
    print("Enter the value of minimum support threshold:")
    support = float(input())
    frequent_itemsets = fpgrowth(data, min_support=support, use_colnames=True)
    print("Do you want to view frequent itemsets generated by FP-growth? (yes/no)")
    choice = input()
    if choice == 'yes':
        print("Frequent itemsets:")
        display_html(frequent_itemsets)
    return frequent_itemsets
def association_rule(frequent_itemsets):
    print("Enter your metric of interest ('confidence' or 'lift'):")
    choice = input()
    if choice == 'confidence':
        print("Enter minimum confidence threshold value:")
        min_confidence = float(input())
        rule = association_rules(frequent_itemsets, metric="confidence",
                                 min_threshold=min_confidence)
    elif choice == 'lift':
        print("Enter minimum lift threshold value:")
        min_lift = float(input())
        rule = association_rules(frequent_itemsets, metric="lift", min_threshold=min_lift)
    else:
        print("Invalid choice.")
        return
    print("Do you want to view the learned association rules? (yes/no)")
    choice = input()
    if choice == 'yes':
        display_html(rule.drop(['leverage', 'conviction'], axis=1))

def main():
    data = toy_dataset()
    frequent_itemsets = frequent_itemset(data)
    association_rule(frequent_itemsets)

main()
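The rule metrics can also be verified by hand from the same toy transactions. For the rule Onion -> Eggs: Onion appears in 3 of 5 transactions (support 0.6), Eggs in 4 of 5 (0.8), and {Onion, Eggs} together in 3 of 5 (0.6), so confidence = 0.6 / 0.6 = 1.0 and lift = 1.0 / 0.8 = 1.25. The values printed by association_rules should agree with this calculation.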
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from IPython.display import display_html

def toy_dataset():
    # User-movie rating matrix used for clustering.
    ratings = [['Lokesh', 5, 5, 2, 1], ['Jyoti', 4, 5, 3, 2], ['Bijay', 4, 4, 4, 3],
               ['Sita', 2, 2, 4, 5], ['Manish', 1, 2, 3, 4], ['Ram', 2, 1, 5, 5]]
    titles = ['user', 'Loot', 'Chino', 'Ghar', 'Aatma']
    movies = pd.DataFrame(ratings, columns=titles)
    display_html(movies)
    return movies
def k_means_learn(k, movies):
    data = movies.drop('user', axis=1)
    # Fit K-means with k clusters on the rating columns.
    k_means = KMeans(n_clusters=k, max_iter=50, random_state=1, n_init='auto')
    k_means.fit(data)
    labels = k_means.labels_
    print(pd.DataFrame(labels, index=movies.user, columns=['Cluster ID']))
    print("Learned cluster centroids for two clusters 0 and 1:")
    centroids = k_means.cluster_centers_
    display_html(pd.DataFrame(centroids, columns=data.columns))
    print("Now you can use the cluster centroids to determine cluster assignments for other users.")
    return k_means
def cluster_new_data(k_means, movies):
    # Ratings of five new users for the same four movies.
    testData = np.array([[4, 5, 1, 2], [3, 2, 4, 4], [2, 3, 4, 1], [3, 2, 3, 3], [5, 4, 1, 4]])
    labels = k_means.predict(testData)
    labels = labels.reshape(-1, 1)
    usernames = np.array(['Radhe', 'Riya', 'Pratik', 'Prativa', 'Shyam']).reshape(-1, 1)
    cols = movies.columns.tolist()
    newusers = pd.DataFrame(np.concatenate((usernames, testData), axis=1), columns=cols)
    cols.append('Assigned Cluster')
    newusers_cluster = pd.DataFrame(np.concatenate((usernames, testData, labels), axis=1),
                                    columns=cols)
    print("Your new users (test data) are:")
    display_html(newusers)
    print("New users with their assigned cluster:")
    display_html(newusers_cluster)
def main():
    k = 2
    movies = toy_dataset()
    k_means = k_means_learn(k, movies)
    cluster_new_data(k_means, movies)

if __name__ == "__main__":
    main()
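For reference, KMeans.predict assigns each new user to the cluster whose learned centroid is nearest in Euclidean distance. A hand-rolled equivalent is sketched below purely for illustration (the helper name assign_cluster is not part of the lab code):
import numpy as np

def assign_cluster(x, centroids):
    # Index of the centroid with the smallest squared Euclidean distance to x.
    return int(np.argmin(((centroids - x) ** 2).sum(axis=1)))

# e.g. assign_cluster(np.array([3, 2, 4, 4]), k_means.cluster_centers_)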
Input/Output
Now, the trained model assigns new users to clusters based on the learned cluster centroids. The test dataset contains the following users:
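Radhe (4, 5, 1, 2), Riya (3, 2, 4, 4), Pratik (2, 3, 4, 1), Prativa (3, 2, 3, 3) and Shyam (5, 4, 1, 4), where the four numbers are each user's ratings for Loot, Chino, Ghar and Aatma, as listed in the testData array of cluster_new_data above.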
Source Code
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
def plot(data, centroids):
    plt.scatter(data[:, 0], data[:, 1], marker='.', color='gray', label='data points')
    plt.scatter(centroids[:-1, 0], centroids[:-1, 1], color='black', label='previously selected centroids')
    plt.scatter(centroids[-1, 0], centroids[-1, 1], color='red', label='next centroid')
    plt.title('Select %dth centroid' % (centroids.shape[0]))
    plt.legend()
    plt.xlim(-5, 12)
    plt.ylim(-10, 15)
    plt.show()
def distance(p1, p2):
    # Squared Euclidean distance between two points.
    return np.sum((p1 - p2) ** 2)
def initialize(data, k):
    centroids = []
    # Pick the first centroid uniformly at random from the data points.
    centroids.append(data[np.random.randint(data.shape[0]), :])
    plot(data, np.array(centroids))
    # Choose each remaining centroid as the data point farthest from its nearest already-selected centroid.
    for c_id in range(k - 1):
        dist = []
        for i in range(data.shape[0]):
            point = data[i, :]
            d = sys.maxsize
            # Distance from this point to the closest centroid selected so far.
            for j in range(len(centroids)):
                temp_dist = distance(point, centroids[j])
                d = min(d, temp_dist)
            dist.append(d)
        dist = np.array(dist)
        next_centroid = data[np.argmax(dist), :]
        centroids.append(next_centroid)
        plot(data, np.array(centroids))
    return centroids
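The listing never shows how the data array used below is built. A minimal sketch, assuming synthetic two-dimensional points; the blob locations are assumptions chosen only so the points fall inside the axis limits used in plot():
# Hypothetical sample data: four 2-D Gaussian blobs (locations are not from the lab handout).
rng = np.random.default_rng(1)
blob_centres = [(0, 0), (8, 10), (-2, 8), (6, -4)]
data = np.vstack([rng.normal(loc=c, scale=1.0, size=(50, 2)) for c in blob_centres])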
centroids = initialize(data, k=4)