Docdist1: 6.006 Intro To Algorithms Recitation 2 September 14, 2011

This document contains code for several algorithms related to document comparison and frequency analysis. It defines functions for getting word frequencies from files, counting word frequencies, sorting frequency mappings, and calculating the cosine similarity between documents. The functions are refined in several versions to improve efficiency, such as using dictionaries instead of lists for frequency counts and merging sorted lists.

Uploaded by

Alireza Kafaei

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

43 views6 pages

Docdist1: 6.006 Intro To Algorithms Recitation 2 September 14, 2011

Uploaded by

Alireza Kafaei

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 6

6.006 Intro to Algorithms Recitation 2 September 14, 2011

docdist1
def main():
    """Print the angle between the word-frequency vectors of two files.

    The two filenames are taken from ``sys.argv[1]`` and ``sys.argv[2]``.
    When the argument count is not exactly three (script name plus two
    filenames), a usage line is printed instead and nothing else happens.
    """
    if len(sys.argv) != 3:
        # A parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print("Usage: docdist1.py filename_1 filename_2")
    else:
        filename_1 = sys.argv[1]
        filename_2 = sys.argv[2]
        sorted_word_list_1 = word_frequencies_for_file(filename_1)
        sorted_word_list_2 = word_frequencies_for_file(filename_2)
        distance = vector_angle(sorted_word_list_1, sorted_word_list_2)
        # The "%" formatting stays inside the parentheses so the statement
        # can wrap without leaving a dangling operator at the end of a line.
        print("The distance between the documents is: %0.6f (radians)"
              % distance)

def word_frequencies_for_file(filename):
    """Return the word-frequency mapping for the file named *filename*.

    The result of read_file(filename) is passed to get_words_from_line_list,
    and the resulting word list is tallied by count_frequency. The tally is
    returned unchanged.
    """
    lines = read_file(filename)
    words = get_words_from_line_list(lines)
    return count_frequency(words)

def get_words_from_line_list(L):
    """Return every word found in the lines of L, in order of appearance.

    Each line is split by get_words_from_string and the per-line results are
    joined by list concatenation, which builds a new list on every iteration.
    """
    all_words = []
    for text in L:
        all_words = all_words + get_words_from_string(text)
    return all_words


def get_words_from_string(line):
    """Split *line* into lower-cased words.

    A word is a maximal run of characters for which isalnum() is true; any
    other character ends the current word and is itself discarded.
    """
    words = []
    current = []
    for ch in line:
        if ch.isalnum():
            current.append(ch)
            continue
        # Non-alphanumeric character: close the word in progress, if any.
        if current:
            words.append("".join(current).lower())
            current = []
    # Flush a word that runs up to the end of the line.
    if current:
        words.append("".join(current).lower())
    return words

1
6.006 Intro to Algorithms Recitation 2 September 14, 2011

def count_frequency(word_list):
    """Return a list of [word, count] pairs for the words in word_list.

    Pairs appear in order of each word's first occurrence. Every incoming
    word is looked up by scanning the pairs collected so far.
    """
    pairs = []
    for word in word_list:
        seen = False
        for pair in pairs:
            if pair[0] == word:
                pair[1] += 1
                seen = True
                break
        if not seen:
            # First occurrence of this word.
            pairs.append([word, 1])
    return pairs

def vector_angle(L1, L2):
    """Return the angle, in radians, between frequency vectors L1 and L2.

    Computed as acos(<L1, L2> / sqrt(<L1, L1> * <L2, L2>)) using
    inner_product. There is no guard against a zero denominator.
    """
    dot = inner_product(L1, L2)
    squared_norm_product = inner_product(L1, L1) * inner_product(L2, L2)
    return math.acos(dot / math.sqrt(squared_norm_product))

def inner_product(L1, L2):
    """Return the inner product of two (word, count) sequences as a float.

    Every entry of L1 is compared with every entry of L2; the counts of
    entries with equal words are multiplied and accumulated.
    """
    total = 0.0
    for word1, count1 in L1:
        for word2, count2 in L2:
            if word1 != word2:
                continue
            total += count1 * count2
    return total

docdist2
# When executed as a script, run main() under cProfile, which prints the
# profiling report once main() returns.
if __name__ == "__main__":
    import cProfile

    cProfile.run("main()")

def get_words_from_line_list(L):
    """Return every word found in the lines of L, in order of appearance.

    Each line's words come from get_words_from_string and are appended to a
    single result list in place with list.extend.
    """
    collected = []
    for text in L:
        collected.extend(get_words_from_string(text))
    return collected

docdist3
def word_frequencies_for_file(filename):
    """Return the word-frequency mapping for *filename* after sorting it.

    The words obtained via read_file and get_words_from_line_list are tallied
    by count_frequency, and the tally is passed to insertion_sort before
    being returned.
    """
    words = get_words_from_line_list(read_file(filename))
    freq_mapping = count_frequency(words)
    # insertion_sort's return value is not used here; the same object that
    # was handed to it is what gets returned.
    insertion_sort(freq_mapping)
    return freq_mapping

2
6.006 Intro to Algorithms Recitation 2 September 14, 2011

def insertion_sort(A):
    """Sort the list A in place into ascending order and return it.

    Each element in turn is moved left past every larger element that
    precedes it; equal elements keep their relative order.
    """
    for end in range(len(A)):
        item = A[end]
        slot = end
        # Shift larger predecessors one position right to open a slot.
        while slot > 0 and A[slot - 1] > item:
            A[slot] = A[slot - 1]
            slot -= 1
        A[slot] = item
    return A

def inner_product(L1, L2):
    """Return the inner product of two (word, count) sequences as a float.

    Both sequences are walked in lock step with one cursor each, which
    relies on both being in ascending word order. Counts are multiplied
    only where the two cursors sit on the same word.
    """
    total = 0.0
    pos1 = 0
    pos2 = 0
    while pos1 < len(L1) and pos2 < len(L2):
        # Invariant: L1[pos1:] and L2[pos2:] are still to be processed.
        word1 = L1[pos1][0]
        word2 = L2[pos2][0]
        if word1 == word2:
            # Word present in both vectors.
            total += L1[pos1][1] * L2[pos2][1]
            pos1 += 1
            pos2 += 1
        elif word1 < word2:
            # word1 occurs in L1 only.
            pos1 += 1
        else:
            # word2 occurs in L2 only.
            pos2 += 1
    return total

docdist4
def count_frequency(word_list):
    """Return a list of (word, count) pairs for the words in word_list.

    Counting is done in a dictionary keyed by word. The pairs are returned
    as a real list: on Python 3 ``dict.items()`` is a view that supports
    neither indexing nor slicing, both of which a sort routine applied to
    the result would need. On Python 2 the result is the same list of
    pairs as before.
    """
    counts = {}
    for word in word_list:
        counts[word] = counts.get(word, 0) + 1
    return list(counts.items())

3
6.006 Intro to Algorithms Recitation 2 September 14, 2011

docdist5
# string.maketrans exists only on Python 2; Python 3 provides the same
# two-argument form as str.maketrans. The "or" short-circuits on Python 2,
# so str.maketrans is never looked up there.
_maketrans = getattr(string, "maketrans", None) or str.maketrans

# Maps every punctuation character to a space and every ASCII upper-case
# letter to its lower-case form. ascii_uppercase / ascii_lowercase are used
# because string.uppercase / string.lowercase do not exist on Python 3.
translation_table = _maketrans(
    string.punctuation + string.ascii_uppercase,
    " " * len(string.punctuation) + string.ascii_lowercase)


def get_words_from_string(line):
    """Return the lower-cased words of *line*.

    Punctuation is turned into spaces and ASCII capitals are lower-cased in
    one translate() pass; the result is then split on whitespace.
    """
    return line.translate(translation_table).split()

docdist6
def word_frequencies_for_file(filename):
    """Return the merge-sorted word-frequency mapping for *filename*.

    The words obtained via read_file and get_words_from_line_list are tallied
    by count_frequency; the value returned is whatever merge_sort produces
    from that tally.
    """
    words = get_words_from_line_list(read_file(filename))
    counted = count_frequency(words)
    return merge_sort(counted)

def merge_sort(A):
    """Return the elements of A in ascending order.

    A sequence of length 0 or 1 is returned as-is (the same object); longer
    input is split in half, each half sorted recursively, and the halves
    combined with merge() into a new list.
    """
    n = len(A)
    # n == 0 must stop here as well: splitting an empty sequence yields two
    # empty halves, so the recursion would never terminate.
    if n <= 1:
        return A
    mid = n // 2
    return merge(merge_sort(A[:mid]), merge_sort(A[mid:]))


def merge(L, R):
    """Merge two ascending sequences L and R into one new ascending list.

    On ties the element from L is emitted first, so the merge (and the
    merge sort built on it) is stable. Only the "<" operator is used on
    the elements.
    """
    i = 0
    j = 0
    answer = []
    while i < len(L) and j < len(R):
        if R[j] < L[i]:
            answer.append(R[j])
            j += 1
        else:
            answer.append(L[i])
            i += 1
    # At most one of the two inputs still has elements left.
    if i < len(L):
        answer.extend(L[i:])
    if j < len(R):
        answer.extend(R[j:])
    return answer

4
6.006 Intro to Algorithms Recitation 2 September 14, 2011

docdist7
def count_frequency(word_list):
    """Return a dictionary mapping each word in word_list to its count."""
    tally = {}
    for word in word_list:
        # A word not seen before starts from zero.
        tally[word] = tally.get(word, 0) + 1
    return tally

def word_frequencies_for_file(filename):
    """Return the word-frequency mapping for the file named *filename*.

    Chains read_file, get_words_from_line_list and count_frequency, and
    returns the value count_frequency produces without further processing.
    """
    return count_frequency(get_words_from_line_list(read_file(filename)))

def inner_product(D1, D2):
    """Return the inner product of two word -> count mappings as a float.

    Only words present in both D1 and D2 contribute; for each such word the
    two counts are multiplied and added to the running total.
    """
    total = 0.0
    for word in D1:
        if word not in D2:
            continue
        total += D1[word] * D2[word]
    return total

docdist8
def get_words_from_text(text):
    """Return the words of *text*.

    The text is passed through translate() with the module-level
    translation_table and the result is split on whitespace.
    """
    return text.translate(translation_table).split()


def word_frequencies_for_file(filename):
    """Return the word-frequency mapping for the file named *filename*.

    Whatever read_file returns is handed to get_words_from_text as a single
    value, and the resulting words are tallied by count_frequency.
    """
    words = get_words_from_text(read_file(filename))
    return count_frequency(words)

5
MIT OpenCourseWare
http://ocw.mit.edu

6.006 Introduction to Algorithms

Fall 2011

For information about citing these materials or our Terms of Use, visit: http://ocw.mit.edu/terms.

Speaking Forecast Q2 - 2024 Official
No ratings yet
Speaking Forecast Q2 - 2024 Official
32 pages
Muhs Nashik Thesis Guidelines
100% (3)
Muhs Nashik Thesis Guidelines
7 pages
Bachelor of Education Primary Program Code 3114 PDF
No ratings yet
Bachelor of Education Primary Program Code 3114 PDF
1 page
The Following Information Relates To Questions 1 and 2
No ratings yet
The Following Information Relates To Questions 1 and 2
3 pages
Python Cost Model: Docdist1
No ratings yet
Python Cost Model: Docdist1
12 pages
Unit 6
No ratings yet
Unit 6
39 pages
"Enter A Number:": Def If Return Else Return
No ratings yet
"Enter A Number:": Def If Return Else Return
5 pages
IR Practical Code
No ratings yet
IR Practical Code
13 pages
Python Output
No ratings yet
Python Output
11 pages
Pract 1 Measuring The Document Similarity in Python
No ratings yet
Pract 1 Measuring The Document Similarity in Python
6 pages
2: Models of Computation: Al-Khw Arizm I
No ratings yet
2: Models of Computation: Al-Khw Arizm I
8 pages
Assignment 02 - Spring 21
No ratings yet
Assignment 02 - Spring 21
4 pages
Python Lab Manual 2023 24
No ratings yet
Python Lab Manual 2023 24
15 pages
Python DS Notes Detailed
No ratings yet
Python DS Notes Detailed
6 pages
Python
No ratings yet
Python
11 pages
Akshat Sethi Practical File
No ratings yet
Akshat Sethi Practical File
50 pages
Ansh Tygai Practical File
No ratings yet
Ansh Tygai Practical File
48 pages
Lab2 24 - 07 - 2024
No ratings yet
Lab2 24 - 07 - 2024
10 pages
Python Programming Laboratory
No ratings yet
Python Programming Laboratory
35 pages
Python
No ratings yet
Python
10 pages
Python
No ratings yet
Python
10 pages
Code3 0
No ratings yet
Code3 0
28 pages
CS Boards Pracs
No ratings yet
CS Boards Pracs
13 pages
3&4 Units Python Programs
No ratings yet
3&4 Units Python Programs
13 pages
CS Practical
No ratings yet
CS Practical
74 pages
Print Lab Programs 1
No ratings yet
Print Lab Programs 1
17 pages
Practical File by Aksh Jaiswal
No ratings yet
Practical File by Aksh Jaiswal
48 pages
Imp (Practical File)
No ratings yet
Imp (Practical File)
51 pages
Pinaki Day2 Roll 11
No ratings yet
Pinaki Day2 Roll 11
7 pages
Python Assignment 3 AMAN GAUTAM 039
No ratings yet
Python Assignment 3 AMAN GAUTAM 039
5 pages
Cs Journal
No ratings yet
Cs Journal
43 pages
Ir Practical
No ratings yet
Ir Practical
13 pages
Class 12 Python Programs
No ratings yet
Class 12 Python Programs
6 pages
Atharv Bhambare
No ratings yet
Atharv Bhambare
73 pages
DAA Summarized Unit 5
No ratings yet
DAA Summarized Unit 5
21 pages
Python Practice Question
No ratings yet
Python Practice Question
5 pages
Pratham FDS
No ratings yet
Pratham FDS
9 pages
Krish 5-12 PR
No ratings yet
Krish 5-12 PR
16 pages
Lab 2: Algorithm Design and Performance - Complexity 1 Operation Counting: Assignments and Comparisons
No ratings yet
Lab 2: Algorithm Design and Performance - Complexity 1 Operation Counting: Assignments and Comparisons
3 pages
Practical Main
No ratings yet
Practical Main
22 pages
Vraj-198 PWP 5-12
No ratings yet
Vraj-198 PWP 5-12
16 pages
PWP - 5-12 Meet
No ratings yet
PWP - 5-12 Meet
16 pages
Data Structure Final Lab Manual
No ratings yet
Data Structure Final Lab Manual
57 pages
IR Practical B1
No ratings yet
IR Practical B1
15 pages
First - Year - Python - Programs - Jupyter Notebook - Python Lab Program VTU
No ratings yet
First - Year - Python - Programs - Jupyter Notebook - Python Lab Program VTU
6 pages
DSL Practical
No ratings yet
DSL Practical
49 pages
Record File
No ratings yet
Record File
35 pages
Lecture 10
No ratings yet
Lecture 10
7 pages
Flipkart Runway Coding Prep Final
No ratings yet
Flipkart Runway Coding Prep Final
5 pages
Xii Practical Updated On 22-10-22
No ratings yet
Xii Practical Updated On 22-10-22
31 pages
Daa Record
No ratings yet
Daa Record
63 pages
Python - Lab - Manual 2
100% (1)
Python - Lab - Manual 2
37 pages
Computer Scinece Practical File
No ratings yet
Computer Scinece Practical File
52 pages
Practical File
No ratings yet
Practical File
32 pages
Write A Python Program Using List and Their Built in Functions
No ratings yet
Write A Python Program Using List and Their Built in Functions
11 pages
Practical File Questions
No ratings yet
Practical File Questions
34 pages
CH 3 - CH 8 Answers
No ratings yet
CH 3 - CH 8 Answers
60 pages
Saanp pt.3-1
No ratings yet
Saanp pt.3-1
21 pages
Unit 4 Python
No ratings yet
Unit 4 Python
17 pages
Lesson 02 - Algorithm Analysis - Student
No ratings yet
Lesson 02 - Algorithm Analysis - Student
10 pages
Python Final Lab 2019
No ratings yet
Python Final Lab 2019
35 pages
Python Programs
No ratings yet
Python Programs
10 pages
Program Ms
No ratings yet
Program Ms
99 pages
The Essential R Reference
From Everand
The Essential R Reference
Mark Gardener
No ratings yet
Assessment - Marketing
No ratings yet
Assessment - Marketing
8 pages
Tutorial 3solutions
No ratings yet
Tutorial 3solutions
5 pages
Tutorial 5 Solutions
No ratings yet
Tutorial 5 Solutions
6 pages
Sample/practice Exam 4 November 2019, Questions Sample/practice Exam 4 November 2019, Questions
No ratings yet
Sample/practice Exam 4 November 2019, Questions Sample/practice Exam 4 November 2019, Questions
32 pages
Sample/practice Exam 1 January 2016, Questions and Answers Sample/practice Exam 1 January 2016, Questions and Answers
No ratings yet
Sample/practice Exam 1 January 2016, Questions and Answers Sample/practice Exam 1 January 2016, Questions and Answers
6 pages
This Page Intentionally Left Blank, Use If Needed But It Will Not Be Marked
No ratings yet
This Page Intentionally Left Blank, Use If Needed But It Will Not Be Marked
8 pages
Tutorial 5 - 2022
No ratings yet
Tutorial 5 - 2022
2 pages
Samplepractice Exam 3 March 2016 Questions
No ratings yet
Samplepractice Exam 3 March 2016 Questions
10 pages
Units 1 & 2 Mathematical Methods: P (X) 3 X x+2
No ratings yet
Units 1 & 2 Mathematical Methods: P (X) 3 X x+2
3 pages
Sample/practice Exam 2014, Questions and Answers Sample/practice Exam 2014, Questions and Answers
No ratings yet
Sample/practice Exam 2014, Questions and Answers Sample/practice Exam 2014, Questions and Answers
14 pages
Fit 1008 MST-Solution
No ratings yet
Fit 1008 MST-Solution
6 pages
MIPS Reference Sheet For FIT1008 and FIT2085
No ratings yet
MIPS Reference Sheet For FIT1008 and FIT2085
2 pages
Python Cheat Sheet - Lecture Notes 1-19 Python Cheat Sheet - Lecture Notes 1-19
No ratings yet
Python Cheat Sheet - Lecture Notes 1-19 Python Cheat Sheet - Lecture Notes 1-19
4 pages
Monash University: Semester Two Mid Semester Test 2016 Faculty of Information Technology
No ratings yet
Monash University: Semester Two Mid Semester Test 2016 Faculty of Information Technology
11 pages
Sums & Products: C. F. Gauss
No ratings yet
Sums & Products: C. F. Gauss
4 pages
Deviation of Repeated Trials: How Small?
No ratings yet
Deviation of Repeated Trials: How Small?
3 pages
Solutions To Quiz 1
No ratings yet
Solutions To Quiz 1
8 pages
Asymptotics Stirling's Formula,: Integral Method To Bound
No ratings yet
Asymptotics Stirling's Formula,: Integral Method To Bound
5 pages
Random Walks: Gambler's Ruin
No ratings yet
Random Walks: Gambler's Ruin
5 pages
Harmonic Series, Integral Method, Stirling's Formula: How Far Out?
No ratings yet
Harmonic Series, Integral Method, Stirling's Formula: How Far Out?
7 pages
Allocate+ MAT1830 Applied Class 10 CL 7InnCG02A Mon 1800-Alireza Kafaee Fanaeepour - 8964010 - 0
No ratings yet
Allocate+ MAT1830 Applied Class 10 CL 7InnCG02A Mon 1800-Alireza Kafaee Fanaeepour - 8964010 - 0
3 pages
Quiz 2: Massachusetts Institute of Technology 6.042J/18.062J, Fall '05 Prof. Albert R. Meyer Prof. Ronitt Rubinfeld
No ratings yet
Quiz 2: Massachusetts Institute of Technology 6.042J/18.062J, Fall '05 Prof. Albert R. Meyer Prof. Ronitt Rubinfeld
8 pages
Final
No ratings yet
Final
15 pages
Chapters 16 and 17 Questions For Review
No ratings yet
Chapters 16 and 17 Questions For Review
2 pages
Quiz 1: Massachusetts Institute of Technology 6.042J/18.062J, Fall '05 Prof. Albert R. Meyer Prof. Ronitt Rubinfeld
No ratings yet
Quiz 1: Massachusetts Institute of Technology 6.042J/18.062J, Fall '05 Prof. Albert R. Meyer Prof. Ronitt Rubinfeld
9 pages
Allocate+ MAT1830 Applied Class 10 CL 7InnCG02A Mon 1800-Alireza Kafaee Fanaeepour - 9386129 - 0
No ratings yet
Allocate+ MAT1830 Applied Class 10 CL 7InnCG02A Mon 1800-Alireza Kafaee Fanaeepour - 9386129 - 0
4 pages
Humming Bird - Olympiad & SpellBee
No ratings yet
Humming Bird - Olympiad & SpellBee
2 pages
Pay-For-Performance: The Evidence: Mcgraw-Hill/Irwin
No ratings yet
Pay-For-Performance: The Evidence: Mcgraw-Hill/Irwin
29 pages
The Resilience Bank Account: Skills For Optimal Performance: Michael Maddaus, MD
No ratings yet
The Resilience Bank Account: Skills For Optimal Performance: Michael Maddaus, MD
8 pages
Marx Resume For Sales and Marketing
No ratings yet
Marx Resume For Sales and Marketing
3 pages
Volunteer Teacher's Toolkit by I-To-I TEFL
No ratings yet
Volunteer Teacher's Toolkit by I-To-I TEFL
25 pages
From 22.08.2011 To 30.06.2016 (OCTMP) Worked As District Consultant Under WR Department Govt. of Odisha
No ratings yet
From 22.08.2011 To 30.06.2016 (OCTMP) Worked As District Consultant Under WR Department Govt. of Odisha
3 pages
Machine Learning Techniques For Civil Engineering Problems
No ratings yet
Machine Learning Techniques For Civil Engineering Problems
28 pages
Acknowledgement Sample in Term Paper
100% (1)
Acknowledgement Sample in Term Paper
7 pages
Aaos Elbow Surgical Approaches-1
No ratings yet
Aaos Elbow Surgical Approaches-1
9 pages
3P CEW545 Rubrics Level 3P-Ad Hoc-Covid19
No ratings yet
3P CEW545 Rubrics Level 3P-Ad Hoc-Covid19
2 pages
Career in Law
No ratings yet
Career in Law
31 pages
Resume For Job Training
100% (2)
Resume For Job Training
7 pages
Bunny School Fees
No ratings yet
Bunny School Fees
1 page
Exploring Anatomy Physiology in The Laboratory 3rd Edition Edition Erin C. Amerman - The Full Ebook With All Chapters Is Available For Download Now
No ratings yet
Exploring Anatomy Physiology in The Laboratory 3rd Edition Edition Erin C. Amerman - The Full Ebook With All Chapters Is Available For Download Now
44 pages
Faculty Application Form: Mepco Schlenk Engineering College, Sivakasi
No ratings yet
Faculty Application Form: Mepco Schlenk Engineering College, Sivakasi
5 pages
SZRZ6014 SilibusApprovedSenate2010
No ratings yet
SZRZ6014 SilibusApprovedSenate2010
8 pages
Urbana and Feliza
No ratings yet
Urbana and Feliza
3 pages
Empathy
No ratings yet
Empathy
2 pages
IELTS Writing
100% (1)
IELTS Writing
5 pages
Certificate (10 Files Merged)
No ratings yet
Certificate (10 Files Merged)
10 pages
Easwari Engineering College: (Autonomous Institution)
No ratings yet
Easwari Engineering College: (Autonomous Institution)
84 pages
ME 3217 Meta Cutting
No ratings yet
ME 3217 Meta Cutting
6 pages
Getting Started Guide PDF
No ratings yet
Getting Started Guide PDF
23 pages
Complete Bundle Semiconductor Physics and Devices Basic Principles 4th Edition Neamen
No ratings yet
Complete Bundle Semiconductor Physics and Devices Basic Principles 4th Edition Neamen
401 pages
Lesson 9 The School Head in School-Based Management (SBM)
100% (3)
Lesson 9 The School Head in School-Based Management (SBM)
6 pages
JT - SDT Final-Revised
No ratings yet
JT - SDT Final-Revised
13 pages
International Gastroenterology Conference 220324
No ratings yet
International Gastroenterology Conference 220324
6 pages