The document contains a programming assignment for an Information Retrieval course at the University of the People, which involves creating a web crawler using Python and SQLite. The code includes functionalities for tokenizing text, removing stop words, basic stemming, and storing crawled data in a database. The assignment culminates in crawling a specified website and indexing the retrieved documents and terms.

CS 3308 - INFORMATION RETRIEVAL

UNIT 7 PROGRAMMING ASSIGNMENT

UNIVERSITY OF THE PEOPLE


SOURCE CODE OF THE ASSIGNMENT

import sqlite3
import re
import math
import time
import urllib.request as urllib2
import urllib.parse as urlparse
import html  # used by Term.stripTags() to decode HTML entities

# Stop words list
stopwords = ['the', 'of', 'and', 'to', 'in', 'you', 'it', 'with', 'that', 'or', 'was', 'he', 'is', 'for', 'this', 'his', 'as',
'not', 'at', 'by', 'all', 'they', 'but', 'be', 'on', 'from', 'had', 'her', 'work', 'are', 'any', 'she', 'if', 'said', 'so',
'which', 'have', 'do', 'we', 'no', 'my', 'were', 'them', 'their', 'him', 'one', 'will', 'me', 'there', 'who', 'up',
'other', 'an', 'its', 'when', 'what', 'can', 'may', 'into', 'out', 'must', 'your', 'then', 'would', 'could', 'more',
'now', 'has', 'like', 'down', 'where', 'been', 'through', 'did', 'away', 'these', 'such', 'set', 'back', 'some',
'than', 'way', 'made', 'our', 'after', 'well', 'should', 'get', 'even', 'am', 'go', 'saw', 'just', 'put', 'while', 'ever',
'off', 'here', 'also']

# Regular expressions
chars = re.compile(r'\W+')

# Global counters
tokens = 0
documents = 0
terms = 0

class Term:
    def __init__(self):
        self.termid = 0
        self.termfreq = 0
        self.docs = 0
        self.docids = {}

    @staticmethod
    def splitchars(line):
        # Split a line into tokens on runs of non-word characters
        return chars.split(line)

    @staticmethod
    def stripTags(s):
        # Drop <script>/<style> blocks, strip the remaining tags, and decode
        # HTML entities. This is a lightweight built-in alternative to
        # BeautifulSoup; the original HTMLParser().unescape() call only
        # decoded entities and was removed in Python 3.9.
        s = re.sub(r'<(script|style)[^>]*>.*?</\1>', ' ', s, flags=re.I | re.S)
        s = re.sub(r'<[^>]+>', ' ', s)
        return html.unescape(s)

    @staticmethod
    def printText(tags):
        # Recursively print text nodes; the original assumed BeautifulSoup's
        # NavigableString, so plain strings are treated as text nodes here.
        for tag in tags:
            if isinstance(tag, str):
                print(tag)
            else:
                Term.printText(tag)
        print("")

# Simple stemming function
def basic_stem(word):
    # Simple suffix stripping (just an example, not as robust as a Porter stemmer)
    suffixes = ['ing', 'ed', 'es', 's']
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word  # Return the word unchanged if no suffix matches
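# Illustration (not part of the original assignment): the stemmer above is a
# naive suffix stripper, e.g. basic_stem('crawling') -> 'crawl' and
# basic_stem('indexes') -> 'index', but basic_stem('running') -> 'runn',
# where a Porter stemmer would give 'run'.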

def parsetoken(db, line):
    global documents
    global tokens
    global terms

    # Clean up the line
    line = line.replace('\t', ' ').strip()

    # Split line into tokens
    tokens_list = Term.splitchars(line)

    # Process each token
    for elmt in tokens_list:
        elmt = elmt.replace('\n', '')
        lowerElmt = elmt.lower().strip()

        # Count tokens
        tokens += 1

        # Skip short tokens, stopwords, and numbers
        if len(lowerElmt) < 2 or lowerElmt in stopwords:
            continue

        try:
            int(lowerElmt)  # Check if token is a number
            continue
        except ValueError:
            stemword = lowerElmt

        # Apply basic stemming
        lowerElmt = basic_stem(stemword)

        # Add new term to dictionary if it doesn't exist
        if lowerElmt not in db:
            terms += 1
            db[lowerElmt] = Term()
            db[lowerElmt].termid = terms

        # Update document frequency and term frequency
        if documents not in db[lowerElmt].docids:
            db[lowerElmt].docs += 1
            db[lowerElmt].docids[documents] = 0

        db[lowerElmt].docids[documents] += 1

    return tokens_list
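# Usage sketch (illustrative; the assignment only calls parsetoken from main):
# index = {}
# parsetoken(index, "Crawlers index web pages and web links")
# leaves 'crawler', 'index', 'web', 'pag' and 'link' as keys of `index`
# (note the naive stemming of 'pages' to 'pag'); the stop word 'and' is skipped.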

def writeindex(db, cur):
    for k, term in db.items():
        cur.execute('INSERT INTO TermDictionary (Term, TermId) VALUES (?, ?)', (k, term.termid))

        docfreq = term.docs
        ratio = float(documents) / float(docfreq)
        idf = math.log10(ratio)

        for i, termfreq in term.docids.items():
            tfidf = float(termfreq) * float(idf)

            if tfidf > 0:
                cur.execute('INSERT INTO Posting (TermId, DocId, tfidf, docfreq, termfreq) VALUES (?, ?, ?, ?, ?)',
                            (term.termid, i, tfidf, docfreq, termfreq))
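# Worked example (illustrative): with 10 crawled documents, a term that
# appears in 2 of them has idf = log10(10 / 2) ~= 0.699; if its frequency in
# a given document is 3, that posting is stored with tfidf = 3 * 0.699 ~= 2.097.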

def main():
    global documents
    global tokens
    global terms

    # Get the starting URL to crawl
    start_url = input("Enter URL to crawl (must be in the form http://www.domain.com): ")

    # Initialize the in-memory term dictionary
    db = {}

    # Capture the start time
    t2 = time.localtime()
    print(f'Start Time: {t2.tm_hour:02d}:{t2.tm_min:02d}')

    # Create SQLite database
    con = sqlite3.connect("webcrawler.db")
    cur = con.cursor()

    # Create tables
    cur.execute("DROP TABLE IF EXISTS DocumentDictionary")
    cur.execute("DROP INDEX IF EXISTS idxDocumentDictionary")
    cur.execute("CREATE TABLE IF NOT EXISTS DocumentDictionary (DocumentName TEXT, DocId INTEGER)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxDocumentDictionary ON DocumentDictionary (DocId)")

    cur.execute("DROP TABLE IF EXISTS TermDictionary")
    cur.execute("DROP INDEX IF EXISTS idxTermDictionary")
    cur.execute("CREATE TABLE IF NOT EXISTS TermDictionary (Term TEXT, TermId INTEGER)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxTermDictionary ON TermDictionary (TermId)")

    cur.execute("DROP TABLE IF EXISTS Posting")
    cur.execute("DROP INDEX IF EXISTS idxPosting1")
    cur.execute("DROP INDEX IF EXISTS idxPosting2")
    cur.execute("CREATE TABLE IF NOT EXISTS Posting (TermId INTEGER, DocId INTEGER, tfidf REAL, docfreq INTEGER, termfreq INTEGER)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxPosting1 ON Posting (TermId)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxPosting2 ON Posting (DocId)")

    # Initialize crawling variables
    crawled = set()
    tocrawl = [start_url]
    links_queue = 0
    crawlcomplete = True

    while crawlcomplete:
        if links_queue >= 500:
            print("URL frontier reached its limit of 500 URLs.")
            break

        try:
            crawling = tocrawl.pop(0)
        except IndexError:
            crawlcomplete = False
            continue

        # Skip non-HTML files
        if crawling.endswith(('.pdf', '.png', '.jpg', '.gif', '.asp')):
            crawled.add(crawling)
            continue

        print(f'{len(tocrawl)} URLs remaining to crawl. Crawling: {crawling}')

        # Fetch the page
        url = urlparse.urlparse(crawling)
        try:
            response = urllib2.urlopen(crawling).read().decode('utf-8')
        except Exception as e:
            print(f'Error fetching {crawling}: {e}')
            continue

        # Parse the page content
        text = Term.stripTags(response)

        # Process tokens
        parsetoken(db, text)

        documents += 1

        # Store document info
        cur.execute("INSERT INTO DocumentDictionary (DocumentName, DocId) VALUES (?, ?)", (crawling, documents))

        # Extract and queue links
        if links_queue < 500:
            links = re.findall(r'href=["\'](.[^"\']+)["\']', response, re.I)
            for link in links:
                link = urlparse.urljoin(crawling, link)
                if link not in crawled and link not in tocrawl:
                    tocrawl.append(link)
                    links_queue += 1

        crawled.add(crawling)
        links_queue -= 1

    # Finish and write index to disk
    t2 = time.localtime()
    print(f'Indexing Complete, write to disk: {t2.tm_hour:02d}:{t2.tm_min:02d}')
    writeindex(db, cur)

    # Commit and close the database
    con.commit()
    con.close()

    # Print statistics
    print(f"Documents {documents}")
    print(f"Terms {terms}")
    print(f"Tokens {tokens}")

    t2 = time.localtime()
    print(f'End Time: {t2.tm_hour:02d}:{t2.tm_min:02d}')


# Call the main function directly
main()

THE OUTPUT OF THE ASSIGNMENT

I crawled the website: http://www.thesaurus.com
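The resulting webcrawler.db can be sanity-checked with a short script such as the one below (an illustrative sketch, not part of the assignment); the two counts should match the Documents and Terms figures printed by the crawler, and the last query lists the highest-weighted postings.

import sqlite3

con = sqlite3.connect("webcrawler.db")
cur = con.cursor()
print(cur.execute("SELECT COUNT(*) FROM DocumentDictionary").fetchone()[0], "documents indexed")
print(cur.execute("SELECT COUNT(*) FROM TermDictionary").fetchone()[0], "distinct terms")
for row in cur.execute("SELECT TermId, DocId, tfidf FROM Posting ORDER BY tfidf DESC LIMIT 10"):
    print(row)
con.close()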
