IR Practical
PRACTICAL NO : 1
Name: Class:TYCS Date:
A. Build an inverted index over a small document collection using NLTK stopwords.
Code:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stopWords = stopwords.words('english')

document1 = "The quick brown fox jumped over the lazy dog"
document2 = "The lazy dog slept in the sun"  # assumed sample text; the original second document is not shown

tokens1 = document1.lower().split()
tokens2 = document2.lower().split()

inverted_index = {}
occ_num_doc1 = {}
occ_num_doc2 = {}

# For every distinct non-stopword term, record which documents contain it
# and how often it occurs in each.
for term in set(tokens1 + tokens2):
    if term in stopWords:
        continue
    documents = []
    if term in tokens1:
        documents.append("Document 1")
        occ_num_doc1[term] = tokens1.count(term)
    if term in tokens2:
        documents.append("Document 2")
        occ_num_doc2[term] = tokens2.count(term)
    inverted_index[term] = documents

for term in inverted_index:
    print(term, "->", inverted_index[term])
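Looking a term up in the finished index is a plain dictionary access. A minimal usage sketch (the query term is chosen for illustration):

query = "lazy"
if query in inverted_index:
    print(query, "appears in:", inverted_index[query])
else:
    print(query, "is a stopword or does not occur")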
Output:
PRACTICAL NO : 2
Name: Class:TYCS Date:
A. Implement the Boolean retrieval model and process Boolean queries (AND, OR, NOT).
Code:
documents = {
    1: "apple banana orange",  # assumed sample text; the original first document is not shown
    2: "apple banana",
    3: "banana orange",
    4: "apple"
}

def build_index(docs):
    index = {}
    for doc_id, text in docs.items():
        terms = set(text.split())
        for term in terms:
            if term not in index:
                index[term] = {doc_id}
            else:
                index[term].add(doc_id)
    return index

inverted_index = build_index(documents)

def boolean_and(operands, index):
    if not operands:
        return []
    result = set(index.get(operands[0], set()))
    for term in operands[1:]:
        result = result.intersection(index.get(term, set()))
    return list(result)

def boolean_or(operands, index):
    result = set()
    for term in operands:
        result = result.union(index.get(term, set()))
    return list(result)

def boolean_not(operand, index, all_docs_set):
    operand_set = index.get(operand, set())
    return list(all_docs_set.difference(operand_set))
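The three operators can be exercised against the index built above; a short usage sketch (query terms are illustrative):

all_docs_set = set(documents.keys())
print(boolean_and(["apple", "banana"], inverted_index))      # documents containing both terms
print(boolean_or(["apple", "orange"], inverted_index))       # documents containing either term
print(boolean_not("banana", inverted_index, all_docs_set))   # documents without the term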
Output:
B. Implement the vector space model with TF-IDF weighting and cosine similarity.
Code:
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

nltk.download('stopwords')
stopWords = stopwords.words('english')

# Assumed sample sentences; the original train/test sets are not shown.
train_set = ["The sky is blue.", "The sun is bright."]
test_set = ["The sun in the sky is bright."]

vectorizer = CountVectorizer(stop_words=stopWords)
transformer = TfidfTransformer()

trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
testVectorizerArray = vectorizer.transform(test_set).toarray()
print('Fit Vectorizer to train set', trainVectorizerArray)
print('Transform Vectorizer to test set', testVectorizerArray)

# Cosine similarity between two term-count vectors.
cx = lambda a, b: round(np.inner(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)), 3)

for vector in trainVectorizerArray:
    print(vector)
    for testV in testVectorizerArray:
        print(testV)
        cosine = cx(vector, testV)
        print(cosine)

transformer.fit(trainVectorizerArray)
print()
print(transformer.transform(trainVectorizerArray).toarray())

transformer.fit(testVectorizerArray)
print()
tfidf = transformer.transform(testVectorizerArray)
print(tfidf.todense())
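The count-then-transform pipeline above can be collapsed into one step with TfidfVectorizer and scikit-learn's pairwise cosine_similarity; a minimal sketch over the same assumed sentences:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf_vec = TfidfVectorizer(stop_words='english')
train_tfidf = tfidf_vec.fit_transform(train_set)
test_tfidf = tfidf_vec.transform(test_set)
print(cosine_similarity(train_tfidf, test_tfidf))  # one similarity per train/test pair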
Output:
PRACTICAL NO : 3
Name: Class:TYCS Date:
Compute the edit (Levenshtein) distance between two strings using recursion.
Code:
def editDistance(str1, str2, m, n):
    # Base cases: one string exhausted, insert the rest of the other.
    if m == 0:
        return n
    if n == 0:
        return m
    # Last characters match: no edit needed for them.
    if str1[m-1] == str2[n-1]:
        return editDistance(str1, str2, m-1, n-1)
    # Otherwise take the cheapest of insert, delete, replace.
    return 1 + min(editDistance(str1, str2, m, n-1),    # insert
                   editDistance(str1, str2, m-1, n),    # delete
                   editDistance(str1, str2, m-1, n-1))  # replace

str1 = "saturday"
str2 = "monday"
print("Edit Distance is:", editDistance(str1, str2, len(str1), len(str2)))
Output:
PRACTICAL NO : 4
Name: Class:TYCS Date:
A. Calculate precision, recall, and F-measure for a given set of retrieval results.
Code:
retrieved_set = {"doc1", "doc2", "doc3"}  # assumed sample sets; the originals are not shown
relevant_set = {"doc1", "doc3", "doc4"}

true_positive = len(retrieved_set.intersection(relevant_set))
false_positive = len(retrieved_set.difference(relevant_set))
false_negative = len(relevant_set.difference(retrieved_set))

'''
(Optional) PPT values:
true_positive = 20
false_positive = 10
false_negative = 30
'''

precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)
f_measure = 2 * precision * recall / (precision + recall)

print("True Positive: ", true_positive)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-measure: {f_measure}")
Output:
B. Use an evaluation toolkit to measure average precision and other evaluation metrics.
Code:
y_true = [0, 1, 1, 0, 1, 1]
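A minimal completion of the snippet using scikit-learn's average_precision_score; the ranking scores below are assumed for illustration:

from sklearn.metrics import average_precision_score

y_scores = [0.1, 0.4, 0.35, 0.8, 0.65, 0.9]  # assumed system scores, one per item
print("Average precision:", average_precision_score(y_true, y_scores))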
Output:
PRACTICAL NO : 5
Name: Class:TYCS Date:
Implement a text clustering algorithm (e.g., K-means clustering) and evaluate the clustering results.
Code:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

documents = ["apple banana fruit", "banana orange fruit", "dog cat pet", "cat mouse pet"]  # assumed corpus
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(X)
print(kmeans.labels_)
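Since the practical also calls for evaluating the clustering, one option is the silhouette coefficient; a sketch:

from sklearn.metrics import silhouette_score

# Ranges from -1 to 1; higher means tighter, better-separated clusters.
print("Silhouette:", silhouette_score(X, kmeans.labels_))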
Output:
PRACTICAL NO : 6
Name: Class:TYCS Date:
Write a program to crawl a website recursively while respecting robots.txt.
Code:
import requests
import time
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup

def get_html(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None

def save_robots_txt(url):
    try:
        robots_url = urljoin(url, '/robots.txt')
        robots_content = get_html(robots_url)
        if robots_content:
            with open('robots.txt', 'wb') as file:
                file.write(robots_content.encode('utf-8-sig'))
    except Exception as e:
        print(f"Error saving robots.txt: {e}")

def load_robots_txt():
    try:
        with open('robots.txt', 'rb') as file:
            return file.read().decode('utf-8-sig')
    except FileNotFoundError:
        return None

def extract_links(html, base_url):
    # Link extraction reconstructed with BeautifulSoup (assumed).
    links = []
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a', href=True):
        absolute_url = urljoin(base_url, anchor['href'])
        links.append(absolute_url)
    return links

def is_allowed_by_robots(url, robots_content):
    parser = RobotFileParser()
    parser.parse(robots_content.split('\n'))
    return parser.can_fetch('*', url)

visited_urls = set()

def recursive_crawl(url, depth, robots_content, max_depth=2, delay=1):
    if depth > max_depth or url in visited_urls:
        return
    if robots_content and not is_allowed_by_robots(url, robots_content):
        return
    visited_urls.add(url)
    time.sleep(delay)  # be polite between requests
    html = get_html(url)
    if html:
        print(f"Crawling {url}")
        for link in extract_links(html, url):
            recursive_crawl(link, depth + 1, robots_content, max_depth, delay)

start_url = 'https://example.com'  # assumed start URL
robots_content = load_robots_txt()
if not robots_content:
    save_robots_txt(start_url)
    robots_content = load_robots_txt()
recursive_crawl(start_url, 1, robots_content)
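The save/load round-trip through a local robots.txt file can be avoided: urllib.robotparser can fetch and parse the file directly. A minimal sketch (the URLs are assumed):

from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url('https://example.com/robots.txt')  # assumed URL
rp.read()
print(rp.can_fetch('*', 'https://example.com/somepage'))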
Output:
PRACTICAL NO : 7
Name: Class:TYCS Date:
Implement the PageRank algorithm for a small web graph.
Code:
import numpy as np

def page_rank(graph, damping_factor=0.85, max_iterations=100, tolerance=1e-6):
    num_nodes = len(graph)
    page_ranks = np.ones(num_nodes) / num_nodes  # start from a uniform distribution
    for _ in range(max_iterations):
        prev_page_ranks = np.copy(page_ranks)
        for node in range(num_nodes):
            # Nodes that link to `node`.
            incoming_links = [i for i in range(num_nodes) if node in graph[i]]
            if not incoming_links:
                continue
            page_ranks[node] = (1 - damping_factor) / num_nodes + damping_factor * sum(
                prev_page_ranks[i] / len(graph[i]) for i in incoming_links)
        if np.abs(page_ranks - prev_page_ranks).sum() < tolerance:
            break
    return page_ranks

# Adjacency list: graph[i] holds the nodes that page i links out to.
web_graph = [[1, 2], [0, 2], [0, 1], [1, 2]]

result = page_rank(web_graph)
for i, pr in enumerate(result):
    print(f"Page {i}: PageRank = {pr:.4f}")
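As a sanity check, the hand-rolled ranks can be compared with networkx's implementation (a sketch, assuming networkx is installed):

import networkx as nx

edges = [(src, dst) for src, targets in enumerate(web_graph) for dst in targets]
G = nx.DiGraph(edges)
print(nx.pagerank(G, alpha=0.85))  # dict of node -> rank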