Text Processing
[1]: import nltk
#tokenizing
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
#stopwords
from nltk.corpus import stopwords
#regexp
import re
# pandas dataframe
import pandas as pd
[2]: nltk.download()
[2]: True
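Instead of the full interactive downloader, the specific resources used below can be fetched directly; a minimal sketch:

[ ]: #download just the resources this notebook relies on
nltk.download('book')       # the NLTK book corpora (text1 ... text9)
nltk.download('punkt')      # the Punkt sentence/word tokenizer models
nltk.download('stopwords')  # the stopword lists
nltk.download('gutenberg')  # the Gutenberg corpus
nltk.download('wordnet')    # WordNet, needed by the lemmatizer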
[3]: #load the data used in the book examples into the Python environment:
from nltk.book import *
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
This command loaded 9 of the text examples available from the corpora package (only a small fraction of what is there). It assigned them to the variable names text1 through text9. If you type one of these variable names, you get a description of the text:
[4]: text1
[4]: <Text: Moby Dick by Herman Melville 1851>
Note that the first sentence of Moby Dick is “Call me Ishmael.” and that this sentence has already been separated into tokens in the variable sent1:
[5]: #The variables sent1 through sent9 hold the list of tokens of the first sentence of each text.
sent1
[5]: ['Call', 'me', 'Ishmael', '.']
0.1 Counting
[8]: #gives the total number of tokens (words and punctuation symbols) in the text
len(text1)
[8]: 260819
[7]: #to find out how many distinct tokens there are, not counting repetitions
len(set(text3))
[7]: 2789
[12]: #Or we can print just the first 30 items in the sorted list of distinct tokens:
sorted(set(text3))[:30]
[12]: ['!',
"'",
'(',
')',
',',
',)',
'.',
'.)',
':',
';',
';)',
'?',
'?)',
'A',
'Abel',
'Abelmizraim',
'Abidah',
'Abide',
'Abimael',
'Abimelech',
'Abr',
'Abrah',
'Abraham',
'Abram',
'Accad',
'Achbor',
'Adah',
'Adam',
'Adbeel',
'Admah']
[13]: #to count how many times the word 'Moby' appears in text1
text1.count("Moby")
[13]: 84
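Putting the two counts together gives lexical diversity, the ratio of distinct tokens to total tokens; a small sketch (this cell is not in the original notebook):

[ ]: #lexical diversity: what fraction of the tokens are distinct?
def lexical_diversity(text):
    return len(set(text)) / len(text)
lexical_diversity(text1)  # roughly 0.074 for Moby Dick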
0.2 The Gutenberg Corpus
[19]: #list the plain-text books available in NLTK's Gutenberg corpus
nltk.corpus.gutenberg.fileids()
[19]: ['austen-emma.txt',
'austen-persuasion.txt',
'austen-sense.txt',
'bible-kjv.txt',
'blake-poems.txt',
'bryant-stories.txt',
'burgess-busterbrown.txt',
'carroll-alice.txt',
'chesterton-ball.txt',
'chesterton-brown.txt',
'chesterton-thursday.txt',
'edgeworth-parents.txt',
'melville-moby_dick.txt',
'milton-paradise.txt',
'shakespeare-caesar.txt',
'shakespeare-hamlet.txt',
'shakespeare-macbeth.txt',
'whitman-leaves.txt']
[22]: #pick the first file to work with
file1 = nltk.corpus.gutenberg.fileids()[0]
file1
[22]: 'austen-emma.txt'
[33]: #We can get the original text using the raw function:
emmatext = nltk.corpus.gutenberg.raw(file1)
#Since this is quite long, we can view part of it, e.g. the first 120 characters
emmatext[:120]
0.3 1. Tokenization
NLTK has several tokenizers available to break the raw text into tokens; here we use the word-punct tokenizer, which separates on whitespace and also on special characters (punctuation).
[32]: #tokenize the raw text
emmatokens = nltk.wordpunct_tokenize(emmatext)
#view the tokenized text
emmatokens[:15]
[32]: ['[',
'Emma',
'by',
'Jane',
'Austen',
'1816',
']',
'VOLUME',
'I',
'CHAPTER',
'I',
'Emma',
'Woodhouse',
',',
'handsome']
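The choice of tokenizer matters: wordpunct_tokenize splits on every punctuation character, while word_tokenize (the Punkt-based tokenizer) handles contractions differently. A quick sketch, not in the original cells:

[ ]: s = "Don't hesitate to ask questions."
nltk.wordpunct_tokenize(s)  # ['Don', "'", 't', 'hesitate', 'to', 'ask', 'questions', '.']
word_tokenize(s)            # ['Do', "n't", 'hesitate', 'to', 'ask', 'questions', '.']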
[34]: #Example
sentence = "I have no money at the moment."
nltk.wordpunct_tokenize(sentence)
[34]: ['I', 'have', 'no', 'money', 'at', 'the', 'moment', '.']
[44]: #the sentence tokenizer splits raw text into sentences
text = "God is Great! I won a lottery."
sent_tokenize(text)
[44]: ['God is Great!', 'I won a lottery.']
[45]: text2 = "Let us understand the difference between sentence & word tokenizer. It is going to be a simple example."
text2.split(". ")
[45]: ['Let us understand the difference between sentence & word tokenizer',
'It is going to be a simple example.']
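For comparison, the word tokenizer breaks the same string into word and punctuation tokens rather than sentences (a sketch, not in the original cells):

[ ]: word_tokenize(text2)[:8]
# ['Let', 'us', 'understand', 'the', 'difference', 'between', 'sentence', '&']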
0.4 2. Stopwords
[19]: #look at the stopwords listed for English
print(stopwords.words('english'))
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
"you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',
'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
"couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
"hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
"shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
"wouldn't"]
[49]: sent1 = """He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had
indeed the vaguest idea where the wood and river in question were."""
# tokens of words
word_tokens = word_tokenize(sent1)
word_tokens[:10]
[49]: ['He',
'determined',
'to',
'drop',
'his',
'litigation',
'with',
'the',
'monastry',
',']
[50]: #empty list to collect the text with the stop words removed
filtered_sentence = []
stop_words = set(stopwords.words('english'))
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
print("Original Sentence")
print(' '.join(word_tokens))
print("Filtered Sentence")
print(' '.join(filtered_sentence))
Original Sentence
He determined to drop his litigation with the monastry , and relinguish his
claims to the wood-cuting and fishery rihgts at once . He was the more ready to
do this becuase the rights had become much less valuable , and he had indeed the
vaguest idea where the wood and river in question were .
Filtered Sentence
He determined drop litigation monastry , relinguish claims wood-cuting fishery
rihgts . He ready becuase rights become much less valuable , indeed vaguest idea
wood river question .
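Since pandas is already imported, the same filtering can be applied to a whole column of texts; a minimal sketch with a hypothetical DataFrame (the names here are illustrative, not from the original notebook):

[ ]: df = pd.DataFrame({'text': ["He was ready to do this",
                            "the wood and river in question"]})
stop = set(stopwords.words('english'))
df['filtered'] = df['text'].apply(
    lambda s: ' '.join(w for w in word_tokenize(s) if w.lower() not in stop))
df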
0.5 3. Normalizing Word Formats
0.6 3.1 Lowercase
[51]: #Example
sentence = "I have NO moNey at tHE moMent."
sentence.lower()
[51]: 'i have no money at the moment.'
[53]: #lowercase every token in the Emma text
emmawords = [w.lower() for w in emmatokens]
emmawords[:15]
[53]: ['[',
'emma',
'by',
'jane',
'austen',
'1816',
']',
'volume',
'i',
'chapter',
'i',
'emma',
'woodhouse',
',',
'handsome']
[55]: # We can further view the words by getting the unique words and sorting them:
emmavocab = sorted(set(emmawords))
emmavocab[:10]
[55]: ['!', '!"', '!"--', "!'", "!'--", '!)--', '!--', '!--"', '!--(', '!--`']
[25]: #uppercased
sentence.upper()
#check Table 3.2 for more operations on strings (Chapter 3, Section 3.2 of the NLTK book)
[25]: 'I HAVE NO MONEY AT THE MOMENT.'
[26]: ['emma', 'woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',']
emma 1
woodhouse 1
, 8
handsome 1
clever 1
and 4
rich 1
with 2
a 3
comfortable 1
home 1
happy 1
disposition 1
seemed 1
to 3
unite 1
some 1
of 6
the 4
best 1
blessings 1
existence 1
; 2
had 3
lived 1
nearly 1
twenty 1
- 1
one 1
years 1
in 2
world 1
very 2
little 1
distress 1
or 1
vex 1
her 4
. 2
she 1
was 1
youngest 1
two 1
daughters 1
most 1
affectionate 1
indulgent 1
father 1
consequence 1
sister 1
' 1
s 1
marriage 1
been 1
mistress 1
his 1
house 1
from 1
early 1
period 1
mother 1
died 1
too 1
long 1
ago 1
for 1
have 1
more 1
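The listing above pairs each token of the opening sentences with its frequency. The input cell is missing from the export, but a table like this is typically produced with nltk.FreqDist; a minimal sketch:

[ ]: #count token frequencies over the first few hundred tokens
fdist = nltk.FreqDist(emmawords[:250])
for word, count in fdist.most_common(10):
    print(word, count)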
0.7 3.2 Stemming
[61]: #the Porter stemmer
porter = nltk.PorterStemmer()
emmaPstem = [porter.stem(t) for t in emmawords]
emmaPstem[1:10]
[61]: ['emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter']
[31]: #the Lancaster stemmer is more aggressive
lancaster = nltk.LancasterStemmer()
emmaLstem = [lancaster.stem(t) for t in emmawords]
emmaLstem[1:10]
[31]: ['emm', 'by', 'jan', 'aust', '1816', ']', 'volum', 'i', 'chapt']
[70]: #building our own simple stemmer by making a list of suffixes to take off
def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
stem('friends')
[70]: 'friend'
[71]: stem('relatives')
[71]: 'relativ'
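Applying the home-made stemmer to a few more words shows how crude plain suffix stripping is (a sketch, not in the original cells):

[ ]: [stem(w) for w in ['running', 'happily', 'cried']]
# ['runn', 'happi', 'cri'] -- none of these are real words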
0.8 3.3 Lemmatization
Naive suffix stripping produces non-words such as 'relativ'. A lemmatizer instead maps each word to a dictionary form:
[74]: #the WordNet lemmatizer
wnl = nltk.WordNetLemmatizer()
emmaLemma = [wnl.lemmatize(t) for t in emmawords]
emmaLemma[1:10]
[74]: ['emma', 'by', 'jane', 'austen', '1816', ']', 'volume', 'i', 'chapter']
[82]: wnl.lemmatize('friends')   #returns 'friend'; only the last result is displayed
wnl.lemmatize('relatives')
[82]: 'relative'
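By default the WordNet lemmatizer treats every word as a noun; passing a part-of-speech tag changes the result. A small sketch, not in the original cells:

[ ]: wnl.lemmatize('running')           # 'running' (treated as a noun)
wnl.lemmatize('running', pos='v')  # 'run' (treated as a verb)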
0.9 4. Regular Expressions
[85]: #use the replace function to replace all the newline characters '\n' with a space ' '
newemmatext = emmatext.replace('\n', ' ')
shorttext = newemmatext[:150]
shorttext
[85]: '[Emma by Jane Austen 1816] VOLUME I CHAPTER I Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to'
[38]: #a pattern that matches one or more word characters (letters, digits, underscore)
pword = re.compile(r'\w+')
re.findall(pword, shorttext)
[38]: ['Emma',
'by',
'Jane',
'Austen',
'1816',
'VOLUME',
'I',
'CHAPTER',
'I',
'Emma',
'Woodhouse',
'handsome',
'clever',
'and',
'rich',
'with',
'a',
'comfortable',
'home',
'and',
'happy',
'disposition',
'seemed',
'to']
[39]: #re.findall finds all the substrings that match anywhere in specialtext; note how U.S.A. and $12.40 are broken up
specialtext = 'U.S.A. poster-print costs $12.40, with 10% off.'
re.findall(pword, specialtext)
[39]: ['U', 'S', 'A', 'poster', 'print', 'costs', '12', '40', 'with', '10', 'off']
[40]: #to match tokens as words that may have an internal hyphen
ptoken = re.compile(r'(\w+(-\w+)*)')
re.findall(ptoken, specialtext)
[40]: [('U', ''), ('S', ''), ('A', ''), ('poster-print', '-print'), ('costs', ''),
('12', ''), ('40', ''), ('with', ''), ('10', ''), ('off', '')]
[41]: #to match abbreviations that might have a "." inside, like U.S.A.
#We only allow capitalized letters
pabbrev = re.compile(r'(([A-Z]\.)+)')
re.findall(pabbrev, specialtext)
[41]: [('U.S.A.', 'A.')]
[42]: #combine it with the words pattern to match either words or abbreviations
ptoken = re.compile(r'(\w+(-\w+)*|([A-Z]\.)+)')
re.findall(ptoken, specialtext)
[42]: [('U', '', ''),
('S', '', ''),
('A', '', ''),
('poster-print', '-print', ''),
('costs', '', ''),
('12', '', ''),
('40', '', ''),
('with', '', ''),
('10', '', ''),
('off', '', '')]
Because the \w+ alternative is tried first, U.S.A. is still broken into single letters. Putting the abbreviation pattern first, and adding an alternative for currency amounts, fixes this:
ptoken = re.compile(r'''([A-Z]\.)+        # abbreviations, e.g. U.S.A.
| \w+(-\w+)*       # words with optional internal hyphens
| \$?\d+(\.\d+)?   # currency amounts, e.g. $12.40
''', re.X)
Matching this pattern against shorttext and then specialtext:
[47]: ['',
'[',
'',
'Emma',
'',
'',
'by',
'',
'',
'Jane',
'',
'',
'Austen',
'',
'',
'1816',
'',
']',
'',
'',
'',
'VO',
'']
[48]: ['U.S.A.',
'',
'',
'poster-print',
'',
'',
'costs',
'',
'',
'$12.40',
'',
',',
'',
'',
'with',
'',
'',
'10',
'',
'',
'',
'off',
'',
'.',
'']
The empty strings in these results are an artifact of the capturing parentheses: when a pattern contains groups, findall-style matching reports group contents rather than whole matches. Non-capturing groups, written (?:...), avoid this.
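A corrected version of the pattern with non-capturing groups, so that whole matches are returned (this cell is not in the original export):

[ ]: #same token pattern, but with non-capturing groups
ptoken = re.compile(r'''(?:[A-Z]\.)+         # abbreviations, e.g. U.S.A.
| \w+(?:-\w+)*       # words with optional internal hyphens
| \$?\d+(?:\.\d+)?   # currency amounts, e.g. $12.40
''', re.X)
re.findall(ptoken, specialtext)
# ['U.S.A.', 'poster-print', 'costs', '$12.40', 'with', '10', 'off']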
The full table of regular expression symbols is in the NLTK book: https://www.nltk.org/book/ch03.html#tab-re-symbols
0.10 5. Document-Term Matrix
Finally, a collection of documents can be turned into a document-term matrix (DTM): one row per document, one column per term, each cell holding a count. The cell below needs scikit-learn's CountVectorizer; CORPUS is assumed to be a list of raw document strings (its definition is not in the export).
[ ]: from sklearn.feature_extraction.text import CountVectorizer
#CORPUS is assumed to be a list of document strings, e.g.
#CORPUS = ["the cat sat", "the dog sat", "the cat ran"]
countvectorizer = CountVectorizer()
DTM = pd.DataFrame(countvectorizer.fit_transform(CORPUS).toarray(),
                   columns=countvectorizer.get_feature_names_out(), index=None)
DTM
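With the illustrative three-document corpus from the comment above, the resulting DTM would be:

   cat  dog  ran  sat  the
0    1    0    0    1    1
1    0    1    0    1    1
2    1    0    1    0    1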