BDA Manual
CONTEXT:
IMPLEMENTATION:
PreConfiguration:
A. JAVA INSTALLATION
1. Check the Java version: $java -version
$cd /home/hadoop/hadoop-1.2.1/
$./clean
$bin/hadoop namenode -format
$bin/start-all.sh    # enter the password whenever prompted
$jps
Check that the jps command lists all the required services:
Jps
DataNode
SecondaryNameNode
NameNode
JobTracker
TaskTracker
Open the NameNode web UI (http://localhost:50070) and the JobTracker UI (http://localhost:50030) in a browser to confirm that Hadoop is up and running.
OUTPUT:
Result:
EXP NO: 2A
Implement a word count program using MapReduce
Date:
AIM:
To implement a program that calculates the word count of a document using MapReduce
CONTEXT:
MapReduce is a Java-based, distributed execution framework within the Apache Hadoop
ecosystem. Using MapReduce, we can split and process petabytes of data in parallel. It
consists of two main tasks: mapping and reducing. This programming model is built around
key-value pairs.
Mapping: This process takes an input in the form of key-value pairs and produces
another set of intermediate key-value pairs after processing the input.
Reducing: This process takes the output from the map task and further processes it into
smaller, aggregated chunks of data. The outcome is still in the form of key-value pairs.
For word count, for example, each map task emits a (word, 1) pair for every word in its
input split, and each reduce task sums these values to produce (word, total count).
IMPLEMENTATION:
PRE-CONFIGURATION:
1. Set up an environment/IDE for running Java code
a. Install the latest Eclipse version
b. Install the Java JDK on your system
c. Open the environment variables dialog:
Right-click on My PC -> Properties -> Advanced System Settings ->
Environment Variables
d. Add a new variable: JAVA_HOME = C:\Program Files\Java\jre1.8.0_441
e. Append to the 'Path' variable: PATH = C:\Program Files\Java\jre1.8.0_441\bin
f. Download the required Hadoop jars from
https://mvnrepository.com/artifact/org.apache.hadoop
g. In Project -> Properties -> Build Path -> Add External JARs, add all the Hadoop jars
h. Apply and save the settings
JAVA CODE:
Build the Java code below and export it as a JAR named wc.jar
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {

  public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      String[] words = line.split("\\s+");
      for (String wordStr : words) {
        word.set(wordStr.trim());
        if (!word.toString().isEmpty()) {
          // emit (word, 1) for every non-empty token
          context.write(word, one);
        }
      }
    }
  }
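  // The reducer and driver below complete the WordCount class above. They are a minimal
  // sketch added because the original listing is truncated at this point; the class and
  // variable names (IntSumReducer, result, sum) are illustrative, not from the source.
  public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      // sum the counts emitted by the mappers for this word
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  // Driver: configures the job and reads the input/output paths from the command line
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "word count");   // Job.getInstance(conf, "word count") on Hadoop 2.x+
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}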
Result:
EXP NO: 2B
Implement matrix multiplication using MapReduce
Date:
AIM:
To implement multiplication of two matrices using MapReduce
CONTEXT:
Matrix-vector and matrix-matrix calculations fit nicely into the MapReduce style of
computing. Let M and N be two input matrices of dimensions p x q and q x r respectively,
and let P be the output matrix, P = M.N, of dimension p x r.
The Map and Reduce functions implement the following algorithm: for each element M(i,j),
the mapper emits the key (i,k) with value (M, j, M(i,j)) for every column k of N; for each
element N(j,k), it emits the key (i,k) with value (N, j, N(j,k)) for every row i of M. For a
given key (i,k), the reducer then computes P(i,k) as the sum over j of M(i,j) * N(j,k).
IMPLEMENTATION:
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class Map
extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, Text> {
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
int m = Integer.parseInt(conf.get("m"));
int p = Integer.parseInt(conf.get("p"));
String line = value.toString();
// (M, i, j, Mij);
String[] indicesAndValue = line.split(",");
Text outputKey = new Text();
Text outputValue = new Text();
if (indicesAndValue[0].equals("M")) {
for (int k = 0; k < p; k++) {
outputKey.set(indicesAndValue[1] + "," + k);
// outputKey.set(i,k);
outputValue.set(indicesAndValue[0] + "," +
indicesAndValue[2]
+ "," + indicesAndValue[3]);
// outputValue.set(M,j,Mij);
context.write(outputKey, outputValue);
}
} else {
// (N, j, k, Njk);
for (int i = 0; i < m; i++) {
outputKey.set(i + "," + indicesAndValue[2]);
outputValue.set("N," + indicesAndValue[1] + ","
+ indicesAndValue[3]);
context.write(outputKey, outputValue);
}
}
}
}
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MatrixMultiply {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Matrix dimensions: M is m x n, N is n x p (3 x 3 here, matching the sample input below)
        conf.set("m", "3");
        conf.set("n", "3");
        conf.set("p", "3");

        Job job = new Job(conf, "MatrixMultiply");
        job.setJarByClass(MatrixMultiply.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.HashMap;

public class Reduce extends Reducer<Text, Text, Text, Text> {

    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // key = (i,k),
        // Values = [(M/N, j, V/W), ..]
        String[] value;
        HashMap<Integer, Float> hashA = new HashMap<Integer, Float>();
        HashMap<Integer, Float> hashB = new HashMap<Integer, Float>();
        for (Text val : values) {
            value = val.toString().split(",");
            if (value[0].equals("M")) {
                hashA.put(Integer.parseInt(value[1]), Float.parseFloat(value[2]));
            } else {
                hashB.put(Integer.parseInt(value[1]), Float.parseFloat(value[2]));
            }
        }
        int n = Integer.parseInt(context.getConfiguration().get("n"));
        float result = 0.0f;
        float m_ij;
        float n_jk;
        for (int j = 0; j < n; j++) {
            m_ij = hashA.containsKey(j) ? hashA.get(j) : 0.0f;
            n_jk = hashB.containsKey(j) ? hashB.get(j) : 0.0f;
            result += m_ij * n_jk;
        }
        if (result != 0.0f) {
            context.write(null,
                    new Text(key.toString() + "," + Float.toString(result)));
        }
    }
}
OUTPUT:
[hadoop@master ~]$ cat matrix_a.txt
0,0,1
0,1,2
0,2,3
1,0,4
1,1,5
1,2,6
2,0,7
2,1,8
2,2,9
1,0 84
1,1 69
1,2 54
2,0 138
2,1 114
2,2 90
Result:
EXP NO: 3
Implement an MR program that processes a weather dataset
Date:
AIM: To develop a MapReduce program to find the maximum temperature
from a given weather dataset.
CONTEXT:
The weather data for any year is extracted from the National Climatic Data Center
(NCDC) website: ftp://ftp.ncdc.noaa.gov/pub/data/noaa/.
Map Phase: The input to the map phase is a set of weather data files. Each map task extracts
the temperature readings from the file for a given year. The output of the map phase is a set
of key-value pairs, where the keys are the years and the values are the temperatures recorded
in that year.
Reduce Phase: The reduce phase takes all the values associated with a particular key; that is,
all the temperature values belonging to a particular year are fed to the same reducer. Each
reducer then finds the highest recorded temperature for its year.
IMPLEMENTATION:
HighestMapper.java
import java.io.IOException;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
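The body of HighestMapper is missing from the listing above. A minimal sketch that reuses
the imports listed above and matches the old mapred API expected by the driver below is
given here; the substring offsets follow the common NCDC fixed-width record layout and are
an assumption that must be checked against the actual dataset.

public class HighestMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, IntWritable> {

    public void map(LongWritable key, Text value,
                    OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        String line = value.toString();
        // assumed NCDC layout: year at columns 15-18, signed temperature at columns 87-91
        String year = line.substring(15, 19);
        int temperature;
        if (line.charAt(87) == '+') {
            temperature = Integer.parseInt(line.substring(88, 92));
        } else {
            temperature = Integer.parseInt(line.substring(87, 92));
        }
        output.collect(new Text(year), new IntWritable(temperature));
    }
}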
HighestReducer.java
int max_temp = 0;
while (values.hasNext())
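Only the two lines above survive from HighestReducer. A complete reducer built around that
fragment (same old mapred API) might look like the sketch below; it simply keeps the largest
temperature seen for each year key.

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;

public class HighestReducer extends MapReduceBase
        implements Reducer<Text, IntWritable, Text, IntWritable> {

    public void reduce(Text key, Iterator<IntWritable> values,
                       OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        int max_temp = 0;
        while (values.hasNext()) {
            // track the maximum temperature for this year
            int current = values.next().get();
            if (current > max_temp) {
                max_temp = current;
            }
        }
        output.collect(key, new IntWritable(max_temp));
    }
}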
HighestDriver.java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

public class HighestDriver extends Configured implements Tool {

    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf(), HighestDriver.class);
        conf.setJobName("HighestDriver");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        conf.setMapperClass(HighestMapper.class);
        conf.setReducerClass(HighestReducer.class);
        Path inp = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.addInputPath(conf, inp);
        FileOutputFormat.setOutputPath(conf, out);
        JobClient.runJob(conf);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new HighestDriver(), args);
        System.exit(res);
    }
}
OUTPUT:
Result:
EXP NO: 4A
Implement Linear Regression
Date:
AIM:
To implement linear regression to predict per capita income from historical data.
CONTEXT:
Linear regression is best used in scenarios where you want to understand and predict the
relationship between a dependent variable and one or more independent variables,
particularly when that relationship appears to be linear. Best use cases are as follows:
Predicting numeric outcomes based on historical data
Examples include sales predictions, housing prices, or stock market trends
Works well when there's a clear linear relationship between variables
Understanding cause-and-effect relationships
SOURCE CODE:
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
df = pd.read_csv(r'D:\MiniWorks\ML Programs\canada_per_capita_income.csv')
df = df.rename(columns={'per capita income (US$)': 'income'})
plt.xlabel("year")
plt.ylabel("income")
plt.scatter(df.year, df.income, color='blue', marker='*')
newydf = df.income
newxdf = df.drop('income', axis='columns')
regressionModel = linear_model.LinearRegression()
regressionModel.fit(newxdf, newydf)
print('prediction', regressionModel.predict([[2020]]))
coef =regressionModel.coef_
intercept = regressionModel.intercept_
print('coeff', coef)
print('intercept', intercept)
plt.plot(df.year, coef*df.year + intercept, ls='-', marker=' ')
plt.plot(df.year, df.income)
plt.show()
OUTPUT:
Figure 3 Linear Regression Line Plot
Result:
EXP NO: 4B
Implement Binary Logistic Regression
Date:
AIM:
To perform logistic regression to predict whether a person will buy life insurance based on
their age
CONTEXT:
Logistic regression is a Supervised Learning technique used for predicting the categorical
dependent variable using a given set of independent variables. Logistic regression is
primarily used for binary classification problems. Logistic regression works best when:
The relationship between features and the outcome is approximately linear
There are no highly correlated independent variables
The sample size is relatively large
The outcome is truly binary
SOURCE CODE:
import pandas as pd
from matplotlib import pyplot as plt
import math
def sigmoid(x):
return 1 / (1 + math.exp(-x))
def prediction_function(age,inter,coeff):
z = coeff * age + inter
y = sigmoid(z)
return y
df = pd.read_csv("D:\MiniWorks\ML Programs\insurance_data.csv")
df.head()
plt.scatter(df.age,df.bought_insurance,marker='+',color='red')
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[['age']], df.bought_insurance, train_size=0.8)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)
model.predict_proba(X_test)
model.score(X_test,y_test)
# predict manually for one example age using the fitted coefficients
age = 35   # example value
val = prediction_function(age, model.intercept_[0], model.coef_[0][0])
if(val > 0.5):
    print("Yes - Buy Insurance")
else:
    print("No Insurance")
OUTPUT:
Result:
EXP NO: 5
Decision Tree Classifier
Date:
AIM:
To execute a decision tree classifier algorithm for predicting diabetic conditions
THEORY:
Decision tree classification starts with the entire dataset at its root and then selects the
best feature to split the data (using metrics like Gini impurity or information gain). It
then recursively creates branches by making decisions at each node. Splitting continues
until a stopping criterion is met (maximum depth, minimum number of samples, etc.).
Typical use cases include spam email detection, credit risk assessment and predicting
disease risk.
SOURCE CODE:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
# column names for the Pima Indians diabetes dataset (assumed standard order)
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
# load dataset
pima = pd.read_csv(r"D:\MiniWorks\ML Programs\inddiab.csv", header=None, names=col_names)
# select the features and target, then train the classifier
feature_cols = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age']
X = pima[feature_cols]
y = pima.label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

# visualise the trained tree
from six import StringIO
from IPython.display import Image
import pydotplus
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data, filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols, class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('diabetes.png')
Image(graph.create_png())
OUTPUT:
Result:
EXP NO: 6A
IMPLEMENT CLUSTERING TECHNIQUES – K Means
Date:
AIM:
To implement the K-means clustering algorithm for grouping a set of loan applicants.
THEORY:
K-Means Clustering Overview:
K-means is a fundamental partitioning clustering algorithm that divides a dataset into a
predefined number, K, of distinct, non-overlapping clusters. The algorithm operates by
identifying K centroids and assigning each data point to the nearest centroid, creating clusters
based on proximity. Its primary goal is to minimize the within-cluster variance, ensuring that
points within each cluster are as similar as possible.
IMPLEMENTATION:
#import libraries
import pandas as pd
import numpy as np
import random as rd
import matplotlib.pyplot as plt
data = pd.read_csv('clustering.csv')
data.head()
X = data[["LoanAmount","ApplicantIncome"]]
#Visualise data points
plt.scatter(X["ApplicantIncome"],X["LoanAmount"],c='black')
plt.xlabel('AnnualIncome')
plt.ylabel('Loan Amount (In Thousands)')
plt.show()
K=3
# choose K random points from the data as the initial centroids
Centroids = X.sample(n=K)
diff = 1
j=0
while(diff!=0):
    XD=X
    i=1
    # distance of every point from each current centroid
    for index1,row_c in Centroids.iterrows():
        ED=[]
        for index2,row_d in XD.iterrows():
            d1=(row_c["ApplicantIncome"]-row_d["ApplicantIncome"])**2
            d2=(row_c["LoanAmount"]-row_d["LoanAmount"])**2
            d=np.sqrt(d1+d2)
            ED.append(d)
        X[i]=ED
        i=i+1

    # assign each point to its nearest centroid
    C=[]
    for index,row in X.iterrows():
        min_dist=row[1]
        pos=1
        for i in range(K):
            if row[i+1] < min_dist:
                min_dist = row[i+1]
                pos=i+1
        C.append(pos)
    X["Cluster"]=C

    # recompute the centroids and measure how much they moved
    Centroids_new = X.groupby(["Cluster"]).mean()[["LoanAmount","ApplicantIncome"]]
    if j == 0:
        diff=1
        j=j+1
    else:
        diff = (Centroids_new['LoanAmount'] - Centroids['LoanAmount']).sum() + \
               (Centroids_new['ApplicantIncome'] - Centroids['ApplicantIncome']).sum()
        print(diff.sum())
    Centroids = X.groupby(["Cluster"]).mean()[["LoanAmount","ApplicantIncome"]]
# visualise the final clusters
colours=['blue','green','cyan']
for k in range(K):
    cluster=X[X["Cluster"]==k+1]
    plt.scatter(cluster["ApplicantIncome"],cluster["LoanAmount"],c=colours[k])
plt.xlabel('AnnualIncome')
plt.ylabel('Loan Amount (In Thousands)')
plt.show()
OUTPUT:
Figure 5 Dataset Description
Result:
EXP NO: 7
Implement data visualization techniques
Date:
AIM:
To perform exploratory data analysis using various visualization techniques
THEORY:
Exploratory data analysis summarises the main characteristics of a dataset, most often with
visual methods. Charts such as line plots, histograms, pie charts, area plots, scatter plots,
heat maps and box plots are used to study trends, distributions, proportions and relationships
between variables before any modelling is done.
IMPLEMENTATION:
1. Line Chart
import matplotlib.pyplot as plt
import numpy as np
#simple array
x = np.array([1, 2, 3, 4])
# generating y values
y = x*2
plt.plot(x, y)
plt.show()
#Sample #2
x = np.array([1, 2, 3, 4])
y = np.array([2, 4, 6, 8])
plt.plot(x, y)
plt.xlabel("Time in Hrs")
plt.ylabel("Distance in Km")
plt.title("Time Vs Distance -LINE CHART")
plt.savefig("time_distance.png")
plt.show()
2. Histogram
from matplotlib import pyplot as plt
import numpy as np
fig, ax = plt.subplots(1, 1)
a = np.array([25,42,48,55,60,62,67,70,30,38,44,50,54,58,75,78,85,88,89,28,35,90,95])
# group the marks into bins and draw the histogram
ax.hist(a, bins=[0, 20, 40, 60, 80, 100])
ax.set_xticks([0, 20, 40, 60, 80, 100])
ax.set_xlabel('Marks Scored')
ax.set_ylabel('No. of Students')
plt.show()
4. Pie Chart
from matplotlib import pyplot as plt
import numpy as np
Language = ['English', 'Spanish', 'Chinese',
'Russian', 'Japanese', 'French']
data = [379, 480, 918, 154, 128, 77.2]
# Creating plot
fig = plt.figure(figsize =(10, 7))
plt.pie(data, labels = Language)
plt.title("Pie Chart")
plt.show()
5. Area plot
import matplotlib.pyplot as plt
days = [1, 2, 3, 4, 5]
# hours spent per day on two sample activities (example data)
sleeping = [7, 8, 6, 11, 7]
working = [8, 8, 10, 5, 9]
plt.stackplot(days, sleeping, working, labels=['Sleeping', 'Working'])
plt.xlabel('Days')
plt.ylabel('No of Hours')
plt.legend(loc='upper left')
plt.show()
6. Scatter Plot
import matplotlib.pyplot as plt
x = [5,7,8,7,2,17,2,9,4,11,12,9]
y = [99,86,87,88,67,86,87,78,77,85,86,56]
plt.scatter(x, y)
plt.title('Scatter Plot')
plt.xlabel('x')
plt.ylabel('y')
plt.show()
7. Heat map
import seaborn as sn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df=pd.DataFrame(np.random.random((7,7)),columns=['a','b','c','d','e','f','g'])
sn.heatmap(df,annot=True,annot_kws={'size':7})
plt.show()
8. Box Plot
import matplotlib.pyplot as plt
import numpy as np
# three sample distributions to compare (example data)
to_plot = [np.random.normal(0, std, 100) for std in range(1, 4)]
fig, ax = plt.subplots()
bp = ax.boxplot(to_plot)
fig.savefig('boxplot.png', bbox_inches='tight')
OUTPUT:
Result:
EXP NO: 8
IMPLEMENT AN APPLICATION THAT STORES BIG DATA IN HBASE
Date:
AIM:
To implement an application that stores big data in an HBase table.
THEORY:
HBase is a distributed, column-oriented NoSQL database built on top of the Hadoop
Distributed File System (HDFS).
It is designed for random, real-time read/write access to large datasets. It provides strong
consistency and is horizontally scalable.
Key Storage Concepts
Data stored in tables
Each table has rows and column families
Rows are identified by unique row keys
Column families group related columns together
Supports sparse data storage
IMPLEMENTATION:
# Create a table
create 'users', 'personal', 'contact'
# Insert data
put 'users', 'user_1', 'personal:name', 'John Doe'
put 'users', 'user_1', 'personal:age', '30'
put 'users', 'user_1', 'contact:email', '[email protected]'
# Delete a row
deleteall 'users', 'user_1'
JAVA-API Implementation
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;

// Illustrative class name; the Connection/Admin/Table objects are created by the caller.
public class HBaseUsersExample {

    // Create the 'users' table with two column families (HBase 2.x builder API)
    public void createTable(Admin admin) throws IOException {
        TableDescriptor descriptor = TableDescriptorBuilder
                .newBuilder(TableName.valueOf("users"))
                .setColumnFamily(ColumnFamilyDescriptorBuilder.of("personal"))
                .setColumnFamily(ColumnFamilyDescriptorBuilder.of("contact"))
                .build();
        // Create table
        admin.createTable(descriptor);
    }

    // Insert one row with the given row key
    public void insertData(Table table, String rowKey) throws IOException {
        Put put = new Put(Bytes.toBytes(rowKey));
        // Add columns
        put.addColumn(
                Bytes.toBytes("personal"),
                Bytes.toBytes("name"),
                Bytes.toBytes("John Doe"));
        table.put(put);
    }

    // Delete the row with the given row key
    public void deleteRow(Table table, String rowKey) throws IOException {
        Delete delete = new Delete(Bytes.toBytes(rowKey));
        table.delete(delete);
    }
}
OUTPUT:
Result: