Menu

[r6320]: / trunk / py4science / examples / wordfreqs.py  Maximize  Restore  History

Download this file

52 lines (41 with data), 1.4 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python
"""Word frequencies - count word frequencies in a string."""
def word_freq(text):
    """Return a dictionary of word frequencies for the given text.

    Words are whatever ``text.split()`` yields: runs of non-whitespace
    characters.  No case folding or punctuation stripping is done, so
    'The', 'the' and 'the,' are counted as three different words.

    Inputs:
      - text: the string to analyze.

    Returns:
      A dictionary mapping each word to the number of times it occurs.
      Empty or whitespace-only text gives an empty dictionary.
    """
    freqs = {}
    for word in text.split():
        # get() supplies 0 the first time a word is seen
        freqs[word] = freqs.get(word, 0) + 1
    return freqs
def print_vk(lst):
    """Print a list of value/key pairs nicely formatted in key/value order.

    Each pair is printed on its own line as ``key -> value``, with the keys
    right-aligned to the width of the longest key.

    Inputs:
      - lst: a list of (value, key) pairs.  The keys must support len().
        An empty list prints nothing.
    """
    # Nothing to print, and max() below would raise ValueError on an
    # empty sequence.
    if not lst:
        return

    # Find the longest key: remember, the list has value/key pairs, so the key
    # is element [1], not [0]
    longest_key = max(len(key) for _, key in lst)

    # Make a format string out of it
    fmt = '%' + str(longest_key) + 's -> %s'

    # Do actual printing.  A single parenthesized argument prints the same
    # thing under both the Python 2 print statement and the Python 3 function.
    for value, key in lst:
        print(fmt % (key, value))
def freq_summ(freqs, n=10):
    """Print a simple summary of a word frequencies dictionary.

    Prints the number of distinct words, then the n least frequent and the
    n most frequent words, each section listed in ascending order of count.

    Inputs:
      - freqs: a dictionary of word frequencies.

    Optional inputs:
      - n: the number of words to show in each of the least-frequent and
        most-frequent sections (default 10).
    """
    # Build (count, word) pairs so sorting orders by count first and breaks
    # ties by word.  sorted() is used because zip() has no .sort() on
    # Python 3, where it returns an iterator.
    items = sorted((count, word) for word, count in freqs.items())

    print('Number of words: %d' % len(freqs))
    print('')
    print('%d least frequent words:' % n)
    print_vk(items[:n])
    print('')
    print('%d most frequent words:' % n)
    # items[-0:] is the whole list, so n == 0 must be handled separately
    print_vk(items[-n:] if n else [])
if __name__ == '__main__':
    import gzip

    # Read the whole compressed file; the with-block closes the handle
    # as soon as the data is in memory instead of leaving it open.
    with gzip.open('data/HISTORY.gz') as history:
        text = history.read()

    # NOTE(review): under Python 3 read() returns bytes here, so the words
    # are counted and printed as bytes objects.  Decode before counting if
    # str output is wanted -- the file's encoding is not known from this
    # script, confirm before choosing one.
    freqs = word_freq(text)
    freq_summ(freqs, 20)
Want the latest updates on software, tech news, and AI?
Get latest updates about software, tech news, and AI from SourceForge directly in your inbox once a month.