Python
import nltk
import string
import re
Text Lowercase
We lowercase the text to reduce the size of the vocabulary of our
text data.
Python
def text_lowercase(text):
    """Return *text* with every cased character converted to lowercase.

    Lowercasing reduces vocabulary size by mapping e.g. "Hey" and "hey"
    to the same token.
    """
    return text.lower()

input_str = "Hey, did you know that the summer break is coming? Amazing right !! It's only 5 more days !!"
text_lowercase(input_str)
Example:
Input: “Hey, did you know that the summer break is coming?
Amazing right!! It’s only 5 more days!!”
Output: “hey, did you know that the summer break is coming?
amazing right!! it’s only 5 more days!!”
Remove numbers
We can remove digits from the text when they are not relevant to the
analysis, using a regular expression.
Python
# Remove numbers
def remove_numbers(text):
    """Delete every run of ASCII digits from *text*.

    Digits are removed, not replaced, so surrounding whitespace is kept
    (e.g. "are 3 balls" -> "are  balls" with a double space).
    """
    return re.sub(r'\d+', '', text)

input_str = "There are 3 balls in this bag, and 12 in the other one."
remove_numbers(input_str)
Example:
Input: “There are 3 balls in this bag, and 12 in the other one.”
Output: ‘There are balls in this bag, and in the other one.’
We can also convert the numbers into words. This can be done by
using the inflect library.
Python
# import the inflect library (third-party; converts numbers to English words)
import inflect

p = inflect.engine()

# convert number into words
def convert_number(text):
    """Replace every purely-numeric whitespace-delimited token in *text*
    with its English spelling (e.g. "12" -> "twelve").

    Tokens with attached punctuation (e.g. "12.") are NOT all-digit and
    are left unchanged.
    """
    converted = []
    for token in text.split():
        # spell out all-digit tokens; keep everything else as-is
        if token.isdigit():
            converted.append(p.number_to_words(token))
        else:
            converted.append(token)
    # rejoin with single spaces (original spacing is not preserved)
    return ' '.join(converted)

input_str = 'There are 3 balls in this bag, and 12 in the other one.'
convert_number(input_str)
Example:
Input: “There are 3 balls in this bag, and 12 in the other one.”
Output: “There are three balls in this bag, and twelve in the other
one.”
Python
# remove punctuation
def remove_punctuation(text):
    """Delete every ASCII punctuation character from *text*.

    Uses a translation table built from ``string.punctuation``; one
    C-level pass instead of chained ``.replace()`` calls. Note this only
    covers ASCII punctuation, not Unicode quotes/dashes.
    """
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

input_str = "Hey, did you know that the summer break is coming? Amazing right !! It's only 5 more days !!"
remove_punctuation(input_str)
Example:
Input: “Hey, did you know that the summer break is coming?
Amazing right!! It’s only 5 more days!!”
Output: “Hey did you know that the summer break is coming
Amazing right Its only 5 more days”
Remove whitespace
We can use the join and split function to remove all the white spaces
in a string.
Python
# remove whitespace from text
def remove_whitespace(text):
    """Collapse all runs of whitespace in *text* to single spaces and
    strip leading/trailing whitespace.

    ``str.split()`` with no argument splits on any whitespace run and
    discards empty strings, so joining with one space normalizes spacing.
    """
    return " ".join(text.split())

input_str = "we don't need the given questions"
remove_whitespace(input_str)
Example:
Input: " we don't need the given questions"
Output: "we don't need the given questions"
Remove default stopwords
Example:
Python
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# remove stopwords function
def remove_stopwords(text):
    """Tokenize *text* and return the tokens that are not English stopwords.

    The membership test is case-sensitive: NLTK's stopword list is
    lowercase, so capitalized words like "This" are kept (as the example
    output below shows). Returns a list of tokens, not a string.
    """
    # build the set once per call for O(1) membership tests
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    return [word for word in word_tokens if word not in stop_words]

example_text = "This is a sample sentence and we are going to remove the stopwords from this."
remove_stopwords(example_text)
Example:
Input: “This is a sample sentence and we are going to remove the
stopwords from this”
Output: [‘This’, ‘sample’, ‘sentence’, ‘going’, ‘remove’, ‘stopwords’]
Stemming
Stemming strips affixes to reduce a word to its root form (stem). The
stem is produced by heuristic suffix rules and may not be a valid word,
as the examples below show:
books ---> book
looked ---> look
denied ---> deni
flies ---> fli
Python
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

# stem words in the list of tokenized words
def stem_words(text):
    """Tokenize *text* and return the Porter stem of each token.

    Porter stemming strips suffixes heuristically, so stems need not be
    valid words (e.g. "science" -> "scienc"). Returns a list of stems.
    """
    word_tokens = word_tokenize(text)
    return [stemmer.stem(word) for word in word_tokens]

text = 'data science uses scientific methods algorithms and many types of processes'
stem_words(text)
Example:
Input: ‘data science uses scientific methods algorithms and many
types of processes’
Output: [‘data’, ‘scienc’, ‘use’, ‘scientif’, ‘method’, ‘algorithm’, ‘and’,
‘mani’, ‘type’, ‘of’, ‘process’]
Lemmatization
Lemmatization reduces a word to its dictionary base form (lemma) using
a vocabulary and morphological analysis, so unlike stemming the output
is always a valid word.
Python
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

def lemma_words(text):
    """Tokenize *text* and return the WordNet lemma of each token.

    No part-of-speech tag is passed, so NLTK's default (noun) is used;
    verbs like "uses" are treated as noun plurals here. Returns a list
    of lemmas.
    """
    word_tokens = word_tokenize(text)
    return [lemmatizer.lemmatize(word) for word in word_tokens]

input_str = "data science uses scientific methods algorithms and many types of processes"
lemma_words(input_str)
Example:
Input: ‘data science uses scientific methods algorithms and many
types of processes’
Output: [‘data’, ‘science’, ‘use’, ‘scientific’, ‘methods’, ‘algorithms’,
‘and’, ‘many’, ‘type’, ‘of’, ‘process’]