cs-3308-unit-7-programming-assignment
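A simple crawler/indexer for CS 3308: it fetches pages, tokenizes the text into an in-memory term dictionary, and writes tf-idf weighted postings to a SQLite database.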
import sqlite3
import re
import math
import time
import html
# Python 3 modules aliased to their legacy Python 2 names
import urllib.request as urllib2
import urllib.parse as urlparse
# Regular expressions
chars = re.compile(r'\W+')
# Global counters
tokens = 0
documents = 0
terms = 0
class Term:
    def __init__(self):
        self.termid = 0      # unique id assigned on first sighting
        self.termfreq = 0    # total occurrences across the collection
        self.docs = 0        # number of documents containing the term
        self.docids = {}     # doc id -> occurrences within that document

    @staticmethod
    def splitchars(line):
        # Split a line on runs of non-word characters
        return chars.split(line)
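    # Usage sketch: splitchars("foo, bar!baz") -> ['foo', 'bar', 'baz']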
    @staticmethod
    def stripTags(s):
        # Built-in alternative to BeautifulSoup: drop markup with a regex,
        # then decode entities (HTMLParser.unescape was removed in Python 3.9)
        return html.unescape(re.sub(r'<[^>]*>', ' ', s))
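    # Usage sketch: stripTags('<p>Hello &amp; world</p>') yields
    # ' Hello & world ' (tags become spaces, entities are decoded)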
    @staticmethod
    def printText(tags):
        # Recursively print text nodes; plain strings are the leaves
        # (replaces the unimported BeautifulSoup NavigableString check)
        for tag in tags:
            if isinstance(tag, str):
                print(tag)
            else:
                Term.printText(tag)
        print("")
def parsetoken(db, line):
    global tokens, terms
    tokens_list = Term.splitchars(line)
    for elmt in tokens_list:
        lowerElmt = elmt.lower().strip()
        if not lowerElmt: continue
        # Count tokens
        tokens += 1
        try:
            int(lowerElmt)  # Skip tokens that are pure numbers
            continue
        except ValueError:
            stemword = lowerElmt
        if stemword not in db:  # First sighting anywhere
            terms += 1
            db[stemword] = Term()
            db[stemword].termid = terms
        if documents not in db[stemword].docids:  # First sighting in this doc
            db[stemword].docs += 1
            db[stemword].docids[documents] = 0
        db[stemword].termfreq += 1
        db[stemword].docids[documents] += 1
    return tokens_list
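# Usage sketch (hypothetical input): after parsetoken(db, "Hello world hello"),
# db['hello'].termfreq == 2 and db['world'].termfreq == 1, each with a
# posting recorded under the current global document id.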
def writetfidf(db, cur):
    # Store a tf-idf weighted posting for each term/document pair
    for term in db.values():
        docfreq = term.docs
        ratio = float(documents) / float(docfreq)
        idf = math.log10(ratio)
        for i, termfreq in term.docids.items():
            # tf normalized by total collection tokens (an assumed normalization)
            tfidf = (float(termfreq) / float(tokens)) * idf
            if tfidf > 0:
                cur.execute('INSERT INTO Posting (TermId, DocId, tfidf, docfreq, termfreq) VALUES (?, ?, ?, ?, ?)',
                            (term.termid, i, tfidf, docfreq, termfreq))
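# Worked example: with documents = 100 and docfreq = 10, idf = log10(10) = 1.0;
# a term occurring 5 times in a 1000-token collection gets
# tfidf = (5 / 1000) * 1.0 = 0.005.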
def main():
    global documents
    global tokens
    global terms
    # In-memory term dictionary: token -> Term
    db = {}
    # Open the SQLite database (the file name here is an assumption)
    con = sqlite3.connect("indexer.db")
    cur = con.cursor()
    # Create tables
    cur.execute("DROP TABLE IF EXISTS DocumentDictionary")
    cur.execute("DROP INDEX IF EXISTS idxDocumentDictionary")
    cur.execute("CREATE TABLE IF NOT EXISTS DocumentDictionary (DocumentName TEXT, DocId INTEGER)")
    cur.execute("CREATE INDEX IF NOT EXISTS idxDocumentDictionary ON DocumentDictionary (DocId)")
    # Posting table, matching the columns named in writetfidf's INSERT
    cur.execute("DROP TABLE IF EXISTS Posting")
    cur.execute("CREATE TABLE IF NOT EXISTS Posting (TermId INTEGER, DocId INTEGER, tfidf REAL, docfreq INTEGER, termfreq INTEGER)")
    # Crawl state: the seed URL is a placeholder, not the assignment's actual start page
    tocrawl = ["http://example.com"]
    crawled = set()
    links_queue = len(tocrawl)
    crawlcomplete = True
    # Crawl until the frontier is empty or the URL limit is reached
    while crawlcomplete:
        if links_queue >= 500:
            print("URL frontier reached its limit of 500 URLs.")
            break
        try:
            crawling = tocrawl.pop(0)
        except IndexError:
            crawlcomplete = False
            continue
        # Fetch the page and strip markup before tokenizing
        try:
            response = urllib2.urlopen(crawling, timeout=5)
            text = Term.stripTags(response.read().decode("utf-8", errors="ignore"))
        except Exception:
            continue
        # Map this URL to its document id
        cur.execute("INSERT INTO DocumentDictionary VALUES (?, ?)", (crawling, documents))
        # Process tokens
        parsetoken(db, text)
        documents += 1
        crawled.add(crawling)
        links_queue -= 1
    # Write the postings, commit, and print statistics
    writetfidf(db, cur)
    con.commit()
    print(f"Documents {documents}")
    print(f"Terms {terms}")
    print(f"Tokens {tokens}")
    t2 = time.localtime()
    print(f'End Time: {t2.tm_hour:02d}:{t2.tm_min:02d}')
    con.close()
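# Run the indexer when executed as a script (an assumed entry point)
if __name__ == "__main__":
    main()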