Information Retrieval Journal
c = a & b;
System.out.println("a & b = " + c);
c = a | b;
System.out.println("a | b = " + c);
c = a ^ b;
System.out.println("a ^ b = " + c);
c = ~a;
System.out.println("~a = " + c);
c = a << 2;
System.out.println("a << 2 = " + c);
c = a >> 2;
System.out.println("a >> 2 = " + c);
c = a >>> 2;
System.out.println("a >>> 2 = " + c);
}
}
OUTPUT:
PRACTICAL NO 2
Program Statement :- Implement the PageRank algorithm.
Steps:
Step 1: Open cmd and install numpy and scipy:
“pip install numpy”, “pip install scipy”
CODE:
import numpy as np
from scipy.sparse import csc_matrix

def pageRank(G, s=.85, maxerr=.0001):
    n = G.shape[0]
    # transform G into markov matrix A (np.float is removed in newer NumPy, so plain float is used)
    A = csc_matrix(G, dtype=float)
    rsums = np.array(A.sum(1))[:, 0]
    ri, ci = A.nonzero()
    A.data /= rsums[ri]
    # bool array of sink states
    sink = rsums == 0
    # compute pagerank r until we converge
    ro, r = np.zeros(n), np.ones(n)
    while np.sum(np.abs(r - ro)) > maxerr:
        ro = r.copy()
        # calculate each pagerank one state at a time
        for i in range(0, n):
            # inlinks of state i
            Ai = np.array(A[:, i].todense())[:, 0]
            # account for sink states
            Di = sink / float(n)
            # account for teleportation to state i
            Ei = np.ones(n) / float(n)
            r[i] = ro.dot(Ai * s + Di * s + Ei * (1 - s))
    # return normalized pagerank
    return r / float(sum(r))

if __name__ == '__main__':
    # Example extracted from 'Introduction to Information Retrieval'
    G = np.array([[0, 0, 1, 0, 0, 0, 0],
                  [0, 1, 1, 0, 0, 0, 0],
                  [1, 0, 1, 1, 0, 0, 0],
                  [0, 0, 0, 1, 1, 0, 0],
                  [0, 0, 0, 0, 0, 0, 1],
                  [0, 0, 0, 0, 0, 1, 1],
                  [0, 0, 0, 1, 1, 0, 1]])
    print(pageRank(G, s=.86))
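Each pass of the while loop applies the damped random-surfer update: with probability s the surfer follows a link (treating sink pages as linking everywhere), and with probability 1-s it teleports to a random page. As a quick sanity check of the function (an illustrative 3-node cycle, not part of the practical), a symmetric ring graph should give every page roughly the same rank:

# Sanity check (assumed example): a directed 3-node cycle 0 -> 1 -> 2 -> 0.
# By symmetry each page should receive a PageRank of roughly 1/3.
ring = np.array([[0, 1, 0],
                 [0, 0, 1],
                 [1, 0, 0]])
print(pageRank(ring, s=.85))   # expect values close to [0.333, 0.333, 0.333]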
OUTPUT:
PRACTICAL NO.3
Program Statement :- Write a program to compute the Levenshtein (edit) distance between two strings.
CODE:
public class Levenshtein {
    public static int distance(String a, String b) {
        a = a.toLowerCase();
        b = b.toLowerCase();
        // costs[j] holds the edit distance between the prefix of a seen so far and the first j characters of b
        int[] costs = new int[b.length() + 1];
        for (int j = 0; j <= b.length(); j++) costs[j] = j;
        for (int i = 1; i <= a.length(); i++) {
            int nw = costs[0];   // value diagonally up-left
            costs[0] = i;
            for (int j = 1; j <= b.length(); j++) {
                int cj = Math.min(1 + Math.min(costs[j], costs[j - 1]),
                        a.charAt(i - 1) == b.charAt(j - 1) ? nw : nw + 1);
                nw = costs[j];
                costs[j] = cj;
            }
        }
        return costs[b.length()];
    }
}
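As a quick check of the method (an illustrative call, not part of the original listing), distance("kitten", "sitting") should return 3: substitute k with s, substitute e with i, and insert g.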
OUTPUT:
PRACTICAL NO: 4
Program Statement :- Write a program to compute the similarity between two text
documents.
Steps:
Open cmd and type the following commands:
“pip install nltk”
“pip install numpy”
CODE:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import nltk
#nltk.download()
#nltk.download('punkt')
#nltk.download('stopwords')

def process(file):
    raw = open(file).read()
    tokens = word_tokenize(raw)
    words = [w.lower() for w in tokens]
    porter = nltk.PorterStemmer()
    stemmed_tokens = [porter.stem(t) for t in words]
    # removing stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in stemmed_tokens if w not in stop_words]
    # count words
    count = nltk.defaultdict(int)
    for word in filtered_tokens:
        count[word] += 1
    return count

def cos_sim(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

def getSimilarity(dict1, dict2):
    all_words_list = []
    for key in dict1:
        all_words_list.append(key)
    for key in dict2:
        all_words_list.append(key)
    all_words_list_size = len(all_words_list)
    # np.int is removed in newer NumPy, so plain int is used here
    v1 = np.zeros(all_words_list_size, dtype=int)
    v2 = np.zeros(all_words_list_size, dtype=int)
    i = 0
    for key in all_words_list:
        v1[i] = dict1.get(key, 0)
        v2[i] = dict2.get(key, 0)
        i = i + 1
    return cos_sim(v1, v2)

if __name__ == '__main__':
    dict1 = process('C://Users//DELL//Downloads//text1.txt')
    dict2 = process('C://Users//DELL//Downloads//text2.txt')
    print("Similarity between two text documents:", getSimilarity(dict1, dict2))
OUTPUT:
PRACTICAL NO: 5
Program Statement :- Write a map-reduce program to count the number of occurrences
of each alphabetic character in the given dataset. The count for each letter should be
case-insensitive (i.e., include both upper-case and lower-case versions of the letter;
ignore non-alphabetic characters).
Installation of Hadoop:
1) To install Hadoop on your Windows machine, first download and install the latest Java JDK
and set JAVA_HOME to your Java installation path; in this case it is “C:\Java\jdk1.8.0_171”.
The following software should be prepared to install Hadoop 3.1.2 on Windows 10 64-bit:
Download Hadoop 3.1.2 (Link: http://www-eu.apache.org/dist/hadoop/common/hadoop-3.1.2/hadoop-
3.1.2-src.tar.gz).
Java JDK 1.8.0 (Link: http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-
2133151.html)
2) Check whether Java 1.8.0 is already installed on your system by running "javac -version".
If Java is not installed, first install it under "C:\Java".
Set the JAVA_HOME environment variable on Windows 10 (see steps 1, 2, 3 and 4 below).
Next, add the Hadoop bin directory and the Java bin directory to the Path environment variable.
4) Configuration:-
Edit the file C:/hadoop-3.1.2/etc/hadoop/core-site.xml, paste the XML below into it, and save the file.
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
Edit the file C:/hadoop-3.1.2/etc/hadoop/hdfs-site.xml, paste the XML below into it, and save the file.
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>C:\hadoop-3.1.2\data\namenode</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>C:\hadoop-3.1.2\data\datanode</value>
</property>
</configuration>
Edit the file C:/hadoop-3.1.2/etc/hadoop/yarn-site.xml, paste the XML below into it, and save the file.
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
</configuration>
Edit the file C:/hadoop-3.1.2/etc/hadoop/hadoop-env.cmd and replace the line set "JAVA_HOME=%JAVA_HOME%"
with set "JAVA_HOME=C:\Java\jdk1.8.0_171" (i.e., the path to the JDK installed in step 1).
5)Hadoop Configuration:-
Download file Hadoop Configuration.zip (Link: https://github.com/MuhammadBilalYar/HADOOP-
INSTALLATION-ON-WINDOW-10/blob/master/Hadoop%20Configuration.zip)
Delete the bin folder at C:\hadoop-3.1.2\bin and replace it with the bin folder from the Hadoop
Configuration.zip just downloaded.
Open cmd and type the command "hdfs namenode -format". You will see the NameNode being formatted.
6) Testing :-
Open cmd, change directory to "C:\hadoop-3.1.2\sbin" and type "start-all.cmd" to start the Hadoop daemons.
Then open http://localhost:9870 in a browser to check the NameNode web UI.
7) Word Count:
Open cmd and go to the Hadoop folder; the commands used to run the job are sketched after the input files below.
iii) Create two Python files, a) mapper.py and b) reducer.py, in C:/hadoop-3.1.2 and write the
following code:
a)mapper.py:
code:
#!/usr/bin/env python
import sys

def read_input(file):
    for line in file:
        # split the line into words
        yield line.split()

def main(separator='\t'):
    # input comes from STDIN (standard input)
    data = read_input(sys.stdin)
    for words in data:
        for word in words:
            print('%s%s%d' % (word, separator, 1))

main()
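The mapper above emits one record per word. To match the practical statement exactly (case-insensitive counts of alphabetic characters only), a variant mapper could emit one record per letter instead; this is a hedged sketch, not part of the original submission:

#!/usr/bin/env python
# Sketch of a per-character mapper (assumption: same key<TAB>1 record format as above).
import sys

def main(separator='\t'):
    for line in sys.stdin:
        for ch in line:
            if ch.isalpha():                                    # ignore non-alphabetic characters
                print('%s%s%d' % (ch.lower(), separator, 1))    # lower-case key => case-insensitive count

main()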
b)reducer.py:
code:
#!/usr/bin/env python
import sys

last_turf = None
turf_count = 0
line1 = []
for l1 in sys.stdin:
    line1.append(l1)
line1.sort()
for line in line1:
    line = line.strip()
    turf, count = line.split("\t")
    count = int(count)
    # sum up counts for consecutive occurrences of the same key
    if turf == last_turf:
        turf_count += count
    else:
        if last_turf is not None:
            print('%s\t%d' % (last_turf, turf_count))
        last_turf = turf
        turf_count = count
# emit the final key
if last_turf is not None:
    print('%s\t%d' % (last_turf, turf_count))
file1.txt:
file2.txt:
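To run the job, the input files are copied into HDFS and the Hadoop streaming jar is invoked with the two scripts. A possible sequence (a sketch; the streaming jar usually sits under share\hadoop\tools\lib in Hadoop 3.1.2, but the exact path and file names may differ on your installation):
“hdfs dfs -mkdir /input”
“hdfs dfs -put file1.txt file2.txt /input”
“hadoop jar share\hadoop\tools\lib\hadoop-streaming-3.1.2.jar -input /input -output /output -mapper "python mapper.py" -reducer "python reducer.py" -file mapper.py -file reducer.py”
“hdfs dfs -cat /output/part-00000”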
OUTPUT:
PRACTICAL NO 6
Program Statement :- Implement a basic IR system using Lucene.
Steps:
Once your project is created successfully, you will have the following content in your Project Explorer −
LuceneConstants.java
This class is used to provide various constants to be used across the sample application.
CODE :
package com.tutorialspoint.lucene;

public class LuceneConstants {
   // field names shared by the Indexer and the Searcher, plus the maximum number of search results
   public static final String CONTENTS = "contents";
   public static final String FILE_NAME = "filename";
   public static final String FILE_PATH = "filepath";
   public static final int MAX_SEARCH = 10;
}
TextFileFilter.java
This class is used as a .txt file filter.
CODE :
package com.tutorialspoint.lucene;

import java.io.File;
import java.io.FileFilter;

public class TextFileFilter implements FileFilter {
   @Override
   public boolean accept(File pathname) {
      return pathname.getName().toLowerCase().endsWith(".txt");
   }
}
Indexer.java
This class is used to index the raw data so that we can make it searchable using the Lucene library.
CODE :
package com.tutorialspoint.lucene;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
Directory indexDirectory =
FSDirectory.open(new File(indexDirectoryPath));
writer.close();
file.getName(),Field.Store.YES,Field.Index.NOT_ANALYZED);
file.getCanonicalPath(),Field.Store.YES,Field.Index.NOT_ANALYZED);
document.add(contentField);
document.add(fileNameField);
document.add(filePathField);
return document;
System.out.println("Indexing "+file.getCanonicalPath());
writer.addDocument(document);
throws IOException {
&& !file.isHidden()
&& file.exists()
&& file.canRead()
&& filter.accept(file)
){
indexFile(file);
return writer.numDocs();
Searcher.java
This class is used to search the indexes created by the Indexer to search the requested content.
CODE :
package com.tutorialspoint.lucene;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
QueryParser queryParser;
Query query;
throws IOException {
Directory indexDirectory =
FSDirectory.open(new File(indexDirectoryPath));
LuceneConstants.CONTENTS,
new StandardAnalyzer(Version.LUCENE_35));
query = queryParser.parse(searchQuery);
return indexSearcher.doc(scoreDoc.doc);
indexSearcher.close();
LuceneTester.java
This class is used to test the indexing and search capability of the Lucene library.
CODE :
package com.tutorialspoint.lucene;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
Indexer indexer;
Searcher searcher;
LuceneTester tester;
try {
tester.createIndex();
tester.search("Mohan");
} catch (IOException e) {
e.printStackTrace();
} catch (ParseException e) {
e.printStackTrace();
int numIndexed;
indexer.close();
+(endTime-startTime)+" ms");
}
private void search(String searchQuery) throws IOException, ParseException {
System.out.println(hits.totalHits +
System.out.println("File: "
+ doc.get(LuceneConstants.FILE_PATH));
searcher.close();
OUTPUT:
Once you've run the program successfully, you will have the following content in your index
directory –
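For this version of Lucene, the index directory typically contains a segments.gen and a segments_N file together with per-segment files such as _0.fdt, _0.fdx, _0.fnm, _0.frq, _0.nrm, _0.prx, _0.tii and _0.tis; the exact set of files may vary.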
OUTPUT:
PRACTICAL NO 7
Program Statement :- Write a program for Pre-processing of a Text Document: stop
word removal.
Steps :-
Step 1: Open cmd and install nltk:
“pip install nltk”
Then, from a Python shell, download the required NLTK data once: nltk.download('punkt') and nltk.download('stopwords').
CODE:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
input_str = "NLTK is a leading platform for building Python programs to work with human language data."
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(input_str)
result = [i for i in tokens if i not in stop_words]
print(result)
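With the standard NLTK English stop-word list, the printed result should be roughly ['NLTK', 'leading', 'platform', 'building', 'Python', 'programs', 'work', 'human', 'language', 'data', '.']: the lower-case function words is, a, for, to and with are removed, while the content words and the punctuation token are kept.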
OUTPUT:
PRACTICAL NO:8
Program Statement :- Write a program for mining Twitter to identify tweets for a specific
period and to identify trends and named entities.
Steps :-
Install a few packages to create this:
tweepy, twitter, tkinter, textblob, matplotlib.
App created:
Generate a consumer key, consumer secret, access token and access token secret:
CODE :
import tweepy
from tkinter import *
from time import sleep
from datetime import datetime
from textblob import TextBlob
import matplotlib.pyplot
import twitter

def load_api():
    consumer_key = '8lGhUCuzaOprx5uYtkaGazqpj'
    consumer_secret = 'fv8bRgDSusvjQSMNgvSmL4CYTMy3Acc0NlesCgRQPvVapRzr4e'
    access_token = '1090831287902846976-OiCwMRqiWGa3aXrJsjDVQQi5sZqdNK'
    access_token_secret = 'Lxt4A9w3uq42h7cKRj9HAuYvtMotGptItqL1B9GBY9YpY'
    #auth=tweepy.OAuthHandler(consumer_key,consumer_secret)
    #auth.set_access_token(access_token,access_token_secret)
    #api=tweepy.API(auth)
    auth = twitter.oauth.OAuth(access_token, access_token_secret, consumer_key, consumer_secret)
    twitter_api = twitter.Twitter(auth=auth)
    return twitter_api

def getE1():
    return E1.get()

def getE2():
    return E2.get()

def getData():
    #getE1()
    keyword = getE1()
    #getE2()
    numberOfTweets = getE2()
    numberOfTweets = int(numberOfTweets)
    twitter_api = load_api()
    w_wo = 1            # WOEID 1 = worldwide trends
    us_woh = 23424977   # WOEID of the United States
    wTrend = twitter_api.trends.place(_id=w_wo)
    UTrend = twitter_api.trends.place(_id=us_woh)
    #abc=wTrend[3]
    print(wTrend)
    print(UTrend)

root = Tk()
label1 = Label(root, text="Search")
E1 = Entry(root, bd=5)
label2 = Label(root, text="Sample Size")
E2 = Entry(root, bd=5)
submit = Button(root, text="Submit", command=getData)
label1.pack()
E1.pack()
label2.pack()
E2.pack()
submit.pack(side=BOTTOM)
root.mainloop()
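The two print calls in getData() dump the raw JSON returned by the trends/place endpoint. Assuming the standard response shape (a list whose first element holds a 'trends' key), the trend names alone could be pulled out inside getData() with, for example: names = [t['name'] for t in wTrend[0]['trends']].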
OUTPUT:
PRACTICAL NO 9:
Program Statement :- Write a program to implement simple web crawler.
Steps :
Step 1: Open cmd and install requests and bs4:
“pip install requests”, “pip install bs4”
CODE:
import requests
from bs4 import BeautifulSoup

def web(page, WebUrl):
    if(page > 0):
        url = WebUrl
        code = requests.get(url)
        plain = code.text
        s = BeautifulSoup(plain, "html.parser")
        for link in s.findAll('a', {'class': 's-access-detail-page'}):
            tet = link.get('title')
            print(tet)
            tet_2 = link.get('href')
            print(tet_2)

web(1,'http://www.amazon.in/s/ref=s9_acss_bw_cts_VodooFS_T4_w?rh=i%3Aelectronics%2Cn%3A976419031%2Cn%3A%21976420031%2Cn%3A1389401031%2Cn%3A1389432031%2Cn%3A1805560031%2Cp_98%3A10440597031%2Cp_36%3A1500000-99999999&bbn=1805560031&rw_html_to_wsrp=1&pf_rd_m=A1K21FY43GMZF8&pf_rd_s=merchandised-search-3&pf_rd_r=2EKZMFFDEXJ5HE8RVV6E&pf_rd_t=101&pf_rd_p=c92c2f88-469b-4b56-')
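The CSS class 's-access-detail-page' is specific to the Amazon results page above and may change over time; the same crawling pattern works for any page. A minimal sketch (https://example.com is an illustrative URL, not part of the practical) that prints every hyperlink on a page:

# Minimal sketch of the same pattern on an arbitrary page (illustrative URL).
import requests
from bs4 import BeautifulSoup

def crawl_links(url):
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    for link in soup.find_all('a', href=True):   # every anchor tag that carries an href
        print(link.get('href'))

crawl_links('https://example.com')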
OUTPUT:
Practical No:10
Program Statement : Write a program to parse XML text, generate a web graph and
compute topic-specific PageRank.
Steps:
Step 1:
Open cmd and type following command:
“pip install requests”
(The csv and xml.etree modules used below are part of the Python standard library, so no separate install is needed for them.)
CODE:
import csv
import requests
import xml.etree.ElementTree as ET

def loadRSS():
    url = 'http://www.hindustantimes.com/rss/topnews/rssfeed.xml'
    resp = requests.get(url)
    with open('topnewsfeed.xml', 'wb') as f:
        f.write(resp.content)

def parseXML(xmlfile):
    tree = ET.parse(xmlfile)
    root = tree.getroot()
    newsitems = []
    for item in root.findall('./channel/item'):
        news = {}
        for child in item:
            if child.tag == '{http://search.yahoo.com/mrss/}content':
                news['media'] = child.attrib['url']
            else:
                news[child.tag] = child.text.encode('utf8')
        newsitems.append(news)
    return newsitems

def savetoCSV(newsitems, filename):
    fields = ['guid', 'title', 'pubDate', 'description', 'link', 'media']
    with open(filename, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        writer.writerows(newsitems)

loadRSS()
newsitems = parseXML('topnewsfeed.xml')
savetoCSV(newsitems, 'topnews.csv')

def generate_edges(graph):
    edges = []
    for node in graph:
        for neighbour in graph[node]:
            edges.append((node, neighbour))
    return edges
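The listing above downloads and parses the RSS feed and defines generate_edges, but stops short of the web-graph and topic-specific PageRank part of the statement. A minimal sketch of that part follows; the small link graph and the topic set are illustrative assumptions, and restricting the teleport vector to the topic pages is what makes the PageRank topic-specific:

import numpy as np

# Illustrative web graph: node -> list of outlinks (assumed example, not real data)
graph = {0: [1, 2], 1: [2], 2: [0, 3], 3: [2]}
print(generate_edges(graph))              # edge list of the web graph

topic = {2, 3}                            # assumed set of pages relevant to the topic
n = len(graph)

# Row-stochastic link matrix built from the graph
A = np.zeros((n, n))
for node, neighbours in graph.items():
    for nb in neighbours:
        A[node, nb] = 1.0 / len(neighbours)

# Teleport only to topic pages instead of uniformly over all pages
v = np.array([1.0 if i in topic else 0.0 for i in range(n)])
v /= v.sum()

s, r = 0.85, np.ones(n) / n
for _ in range(100):                      # power iteration
    r = s * A.T.dot(r) + (1 - s) * v

print("Topic-specific PageRank:", r / r.sum())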
OUTPUT:
topnewsfeed.xml:
topnews.csv: