The document contains a programming assignment for an Information Retrieval course at the University of the People, which involves creating a web crawler using Python and SQLite. The code includes functionalities for tokenizing text, removing stop words, basic stemming, and storing crawled data in a database. The assignment culminates in crawling a specified website and indexing the retrieved documents and terms.

CS 3308 - INFORMATION RETRIEVAL

UNIT 7 PROGRAMMING ASSIGNMENT

UNIVERSITY OF THE PEOPLE


SOURCE CODE OF THE ASSIGNMENT

import sqlite3
import re
import math
import time
import urllib.request as urllib2
import urllib.parse as urlparse
import html  # used by Term.stripTags() to decode HTML entities

# Stop words list
stopwords = ['the', 'of', 'and', 'to', 'in', 'you', 'it', 'with', 'that', 'or', 'was', 'he', 'is', 'for', 'this', 'his', 'as',
'not', 'at', 'by', 'all', 'they', 'but', 'be', 'on', 'from', 'had', 'her', 'work', 'are', 'any', 'she', 'if', 'said', 'so',
'which', 'have', 'do', 'we', 'no', 'my', 'were', 'them', 'their', 'him', 'one', 'will', 'me', 'there', 'who', 'up',
'other', 'an', 'its', 'when', 'what', 'can', 'may', 'into', 'out', 'must', 'your', 'then', 'would', 'could', 'more',
'now', 'has', 'like', 'down', 'where', 'been', 'through', 'did', 'away', 'these', 'such', 'set', 'back', 'some',
'than', 'way', 'made', 'our', 'after', 'well', 'should', 'get', 'even', 'am', 'go', 'saw', 'just', 'put', 'while', 'ever',
'off', 'here', 'also']

# Regular expressions
chars = re.compile(r'\W+')

# Global counters
tokens = 0
documents = 0
terms = 0

class Term:
    def __init__(self):
        self.termid = 0
        self.termfreq = 0
        self.docs = 0
        self.docids = {}

    @staticmethod
    def splitchars(line):
        # Split a line into tokens on runs of non-word characters
        return chars.split(line)

    @staticmethod
    def stripTags(s):
        # Drop <script>/<style> blocks, strip the remaining tags, and decode
        # HTML entities. This is a lightweight built-in alternative to
        # BeautifulSoup; the original HTMLParser().unescape() call only
        # decoded entities and was removed in Python 3.9.
        s = re.sub(r'<(script|style)[^>]*>.*?</\1>', ' ', s, flags=re.I | re.S)
        s = re.sub(r'<[^>]+>', ' ', s)
        return html.unescape(s)

    @staticmethod
    def printText(tags):
        # Recursively print text nodes; the original assumed BeautifulSoup's
        # NavigableString, so plain strings are treated as text nodes here.
        for tag in tags:
            if isinstance(tag, str):
                print(tag)
            else:
                Term.printText(tag)
        print("")

# Simple stemming function
def basic_stem(word):
    # Simple suffix stripping (just an example, not as robust as a Porter stemmer)
    suffixes = ['ing', 'ed', 'es', 's']
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word  # Return the word unchanged if no suffix matches
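# Illustration (not part of the original assignment): the stemmer above is a
# naive suffix stripper, e.g. basic_stem('crawling') -> 'crawl' and
# basic_stem('indexes') -> 'index', but basic_stem('running') -> 'runn',
# where a Porter stemmer would give 'run'.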

def parsetoken(db, line):
    global documents
    global tokens
    global terms

    # Clean up the line
    line = line.replace('\t', ' ').strip()

    # Split line into tokens
    tokens_list = Term.splitchars(line)

    # Process each token
    for elmt in tokens_list:
        elmt = elmt.replace('\n', '')
        lowerElmt = elmt.lower().strip()

        # Count tokens
        tokens += 1

        # Skip short tokens, stopwords, and numbers
        if len(lowerElmt) < 2 or lowerElmt in stopwords:
            continue

        try:
            int(lowerElmt)  # Check if token is a number
            continue
        except ValueError:
            stemword = lowerElmt

        # Apply basic stemming
        lowerElmt = basic_stem(stemword)

        # Add new term to dictionary if it doesn't exist
        if lowerElmt not in db:
            terms += 1
            db[lowerElmt] = Term()
            db[lowerElmt].termid = terms

        # Update document frequency and term frequency
        if documents not in db[lowerElmt].docids:
            db[lowerElmt].docs += 1
            db[lowerElmt].docids[documents] = 0

        db[lowerElmt].docids[documents] += 1

    return tokens_list
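# Usage sketch (illustrative; the assignment only calls parsetoken from main):
# index = {}
# parsetoken(index, "Crawlers index web pages and web links")
# leaves 'crawler', 'index', 'web', 'pag' and 'link' as keys of `index`
# (note the naive stemming of 'pages' to 'pag'); the stop word 'and' is skipped.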

def writeindex(db, cur):
    for k, term in db.items():
        cur.execute('INSERT INTO TermDictionary (Term, TermId) VALUES (?, ?)', (k, term.termid))

        docfreq = term.docs
        ratio = float(documents) / float(docfreq)
        idf = math.log10(ratio)

        for i, termfreq in term.docids.items():
            tfidf = float(termfreq) * float(idf)

            if tfidf > 0:
                cur.execute('INSERT INTO Posting (TermId, DocId, tfidf, docfreq, termfreq) VALUES (?, ?, ?, ?, ?)',
                            (term.termid, i, tfidf, docfreq, termfreq))
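# Worked example (illustrative): with 10 crawled documents, a term that
# appears in 2 of them has idf = log10(10 / 2) ~= 0.699; if its frequency in
# a given document is 3, that posting is stored with tfidf = 3 * 0.699 ~= 2.097.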

def main():
    global documents
    global tokens
    global terms

    # Get the starting URL to crawl
    start_url = input("Enter URL to crawl (must be in the form http://www.domain.com): ")

    # Initialize the in-memory term dictionary
    db = {}

    # Capture the start time
    t2 = time.localtime()
    print(f'Start Time: {t2.tm_hour:02d}:{t2.tm_min:02d}')

    # Create SQLite database
    con = sqlite3.connect("webcrawler.db")
    cur = con.cursor()

    # Create tables
    cur.execute("DROP TABLE IF EXISTS DocumentDictionary")
    cur.execute("DROP INDEX IF EXISTS idxDocumentDictionary")
    cur.execute("CREATE TABLE IF NOT EXISTS DocumentDictionary (DocumentName TEXT, DocId INTEGER)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxDocumentDictionary ON DocumentDictionary (DocId)")

    cur.execute("DROP TABLE IF EXISTS TermDictionary")
    cur.execute("DROP INDEX IF EXISTS idxTermDictionary")
    cur.execute("CREATE TABLE IF NOT EXISTS TermDictionary (Term TEXT, TermId INTEGER)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxTermDictionary ON TermDictionary (TermId)")

    cur.execute("DROP TABLE IF EXISTS Posting")
    cur.execute("DROP INDEX IF EXISTS idxPosting1")
    cur.execute("DROP INDEX IF EXISTS idxPosting2")
    cur.execute("CREATE TABLE IF NOT EXISTS Posting (TermId INTEGER, DocId INTEGER, tfidf REAL, docfreq INTEGER, termfreq INTEGER)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxPosting1 ON Posting (TermId)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxPosting2 ON Posting (DocId)")

    # Initialize crawling variables
    crawled = set()
    tocrawl = [start_url]
    links_queue = 0
    crawlcomplete = True

    while crawlcomplete:
        if links_queue >= 500:
            print("URL frontier reached its limit of 500 URLs.")
            break

        try:
            crawling = tocrawl.pop(0)
        except IndexError:
            crawlcomplete = False
            continue

        # Skip non-HTML files
        if crawling.endswith(('.pdf', '.png', '.jpg', '.gif', '.asp')):
            crawled.add(crawling)
            continue

        print(f'{len(tocrawl)} URLs remaining to crawl. Crawling: {crawling}')

        # Fetch the page
        url = urlparse.urlparse(crawling)
        try:
            response = urllib2.urlopen(crawling).read().decode('utf-8')
        except Exception as e:
            print(f'Error fetching {crawling}: {e}')
            continue

        # Parse the page content
        text = Term.stripTags(response)

        # Process tokens
        parsetoken(db, text)

        documents += 1

        # Store document info
        cur.execute("INSERT INTO DocumentDictionary (DocumentName, DocId) VALUES (?, ?)", (crawling, documents))

        # Extract and queue links
        if links_queue < 500:
            links = re.findall(r'href=["\'](.[^"\']+)["\']', response, re.I)
            for link in links:
                link = urlparse.urljoin(crawling, link)
                if link not in crawled and link not in tocrawl:
                    tocrawl.append(link)
                    links_queue += 1

        crawled.add(crawling)
        links_queue -= 1

    # Finish and write index to disk
    t2 = time.localtime()
    print(f'Indexing Complete, write to disk: {t2.tm_hour:02d}:{t2.tm_min:02d}')
    writeindex(db, cur)

    # Commit and close the database
    con.commit()
    con.close()

    # Print statistics
    print(f"Documents {documents}")
    print(f"Terms {terms}")
    print(f"Tokens {tokens}")

    t2 = time.localtime()
    print(f'End Time: {t2.tm_hour:02d}:{t2.tm_min:02d}')


# Call the main function directly
main()

THE OUTPUT OF THE ASSIGNMENT

I crawled the website: http://www.thesaurus.com
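The resulting webcrawler.db can be sanity-checked with a short script such as the one below (an illustrative sketch, not part of the assignment); the two counts should match the Documents and Terms figures printed by the crawler, and the last query lists the highest-weighted postings.

import sqlite3

con = sqlite3.connect("webcrawler.db")
cur = con.cursor()
print(cur.execute("SELECT COUNT(*) FROM DocumentDictionary").fetchone()[0], "documents indexed")
print(cur.execute("SELECT COUNT(*) FROM TermDictionary").fetchone()[0], "distinct terms")
for row in cur.execute("SELECT TermId, DocId, tfidf FROM Posting ORDER BY tfidf DESC LIMIT 10"):
    print(row)
con.close()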
