NLTK Cheatsheet
nltk.download
Downloads the NLTK datasets and models a script needs. It's essential for setting up your environment with the required resources.
import nltk
nltk.download('punkt')  # e.g. the Punkt tokenizer models used by word_tokenize
word_tokenize
Tokenizes a string into individual words.
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data."
tokens = word_tokenize(text)
print(tokens)
sent_tokenize
Tokenizes a string into individual sentences.
from nltk.tokenize import sent_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources."
sentences = sent_tokenize(text)
print(sentences)
pos_tag
Tags each token with its part of speech.
from nltk import pos_tag
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data."
tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)
print(tagged_tokens)
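pos_tag can also map to the simplified Universal tagset; a quick sketch (assumes the 'universal_tagset' resource is downloaded):
print(pos_tag(tokens, tagset='universal'))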
ne_chunk
Performs named entity recognition on a list of tagged tokens.
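A minimal sketch (the sentence is illustrative; ne_chunk requires the 'maxent_ne_chunker' and 'words' resources):
from nltk import ne_chunk, pos_tag
from nltk.tokenize import word_tokenize
text = "Barack Obama was born in Hawaii."
print(ne_chunk(pos_tag(word_tokenize(text))))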
FreqDist
Counts how often each token occurs in a text.
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources."
tokens = word_tokenize(text)
fdist = FreqDist(tokens)
print(fdist.most_common(5))
ConditionalFreqDist
Counts frequencies conditioned on a grouping key, e.g. word frequencies per genre of the Brown corpus.
from nltk.probability import ConditionalFreqDist
from nltk.corpus import brown
cfd = ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
print(cfd['news'].most_common(10))
Text
Wraps a list of tokens for interactive exploration (concordance, similar, common_contexts, and more).
from nltk.text import Text
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data."
tokens = word_tokenize(text)
text_obj = Text(tokens)
text_obj.concordance('NLTK')
concordance
Shows every occurrence of a word with its surrounding context.
from nltk.text import Text
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data. NLTK provides easy-to-use interfaces to over 50 corpora and lexical resources."
tokens = word_tokenize(text)
text_obj = Text(tokens)
text_obj.concordance('NLTK')
similar
Lists words that appear in contexts similar to those of the given word.
from nltk.text import Text
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data. NLTK provides easy-to-use interfaces to over 50 corpora and lexical resources."
tokens = word_tokenize(text)
text_obj = Text(tokens)
text_obj.similar('NLTK')
common_contexts
Finds contexts shared by two or more words.
from nltk.text import Text
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data. NLTK provides easy-to-use interfaces to over 50 corpora and lexical resources."
tokens = word_tokenize(text)
text_obj = Text(tokens)
text_obj.common_contexts(['NLTK', 'platform'])
dispersion_plot
Plots where the given words occur across the text (requires matplotlib).
from nltk.draw.dispersion import dispersion_plot
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data. NLTK provides easy-to-use interfaces to over 50 corpora and lexical resources."
tokens = word_tokenize(text)
dispersion_plot(tokens, ['NLTK', 'platform'])
generate
Generates random text from a language model trained on the text (available in NLTK 3.4+).
from nltk.text import Text
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data. NLTK provides easy-to-use interfaces to over 50 corpora and lexical resources."
tokens = word_tokenize(text)
text_obj = Text(tokens)
text_obj.generate()
bigrams
Generates bigrams (pairs of adjacent tokens) from a sequence.
from nltk import bigrams
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data."
tokens = word_tokenize(text)
bigrams_list = list(bigrams(tokens))
print(bigrams_list)
ngrams
Generates n-grams of any order from a sequence of tokens.
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data."
tokens = word_tokenize(text)
trigrams_list = list(ngrams(tokens, 3))
print(trigrams_list)
wordnet.synsets
Returns all synsets (word senses) for a word.
from nltk.corpus import wordnet
synsets = wordnet.synsets('dog')
print(synsets)
wordnet.synset.lemmas
Returns the lemmas (word forms) of a synset.
synset = wordnet.synset('dog.n.01')
lemmas = synset.lemmas()
print(lemmas)
wordnet.synset
Looks up a single synset by its identifier, in the form 'word.pos.sense_number'.
synset = wordnet.synset('dog.n.01')
print(synset)
wordnet.morphy
Finds the base form of a word using WordNet's morphological rules.
base_form = wordnet.morphy('running')
print(base_form)
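morphy tries each part of speech in turn, so for verbs it is safer to pass the POS explicitly:
base_form = wordnet.morphy('running', wordnet.VERB)  # 'run'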
wordnet.synset.lemma_names
Returns the lemma names of a synset as plain strings.
synset = wordnet.synset('dog.n.01')
lemma_names = synset.lemma_names()
print(lemma_names)
wordnet.synset.definition
Returns the dictionary definition of a synset.
synset = wordnet.synset('dog.n.01')
definition = synset.definition()
print(definition)
wordnet.synset.examples
Returns example sentences for a synset.
synset = wordnet.synset('dog.n.01')
examples = synset.examples()
print(examples)
wordnet.synset.hypernyms
Returns the more general synsets (hypernyms) of a synset.
synset = wordnet.synset('dog.n.01')
hypernyms = synset.hypernyms()
print(hypernyms)
wordnet.synset.hyponyms
Returns the more specific synsets (hyponyms) of a synset.
synset = wordnet.synset('dog.n.01')
hyponyms = synset.hyponyms()
print(hyponyms)
wordnet.synset.member_holonyms
Returns the synsets of which this synset is a member.
synset = wordnet.synset('dog.n.01')
member_holonyms = synset.member_holonyms()
print(member_holonyms)
wordnet.synset.part_meronyms
Returns the synsets that name parts of this synset.
synset = wordnet.synset('dog.n.01')
part_meronyms = synset.part_meronyms()
print(part_meronyms)
nltk.corpus.words.words
Returns a list of English words from the Words corpus.
from nltk.corpus import words
word_list = words.words()
print(word_list[:10])
nltk.corpus.stopwords.words
Returns the stopword list for a language.
from nltk.corpus import stopwords
stopword_list = stopwords.words('english')
print(stopword_list)
nltk.corpus.gutenberg.raw
Returns the raw text of a document in the Project Gutenberg corpus.
from nltk.corpus import gutenberg
raw_text = gutenberg.raw('austen-emma.txt')
print(raw_text[:1000])
nltk.corpus.brown.words
Returns the words of the Brown corpus.
from nltk.corpus import brown
brown_words = brown.words()
print(brown_words[:10])
nltk.corpus.reuters.words
Returns the words of the Reuters corpus.
from nltk.corpus import reuters
reuters_words = reuters.words()
print(reuters_words[:10])
nltk.corpus.inaugural.words
Returns the words of the US presidential inaugural address corpus.
from nltk.corpus import inaugural
inaugural_words = inaugural.words()
print(inaugural_words[:10])
nltk.corpus.webtext.words
Returns the words of the Web Text corpus.
from nltk.corpus import webtext
webtext_words = webtext.words()
print(webtext_words[:10])
nltk.corpus.treebank.parsed_sents
Returns parsed (tree-structured) sentences from the Penn Treebank sample.
from nltk.corpus import treebank
parsed_sents = treebank.parsed_sents()
print(parsed_sents[:1])
nltk.corpus.semcor.tagged_sents
Gets tagged sentences from the SemCor corpus.
from nltk.corpus import semcor
tagged_sents = semcor.tagged_sents()
print(tagged_sents[:1])
nltk.corpus.names.words
Returns a list of first names from the Names corpus.
from nltk.corpus import names
names_list = names.words()
print(names_list[:10])
nltk.corpus.sentiwordnet.senti_synset
Returns the sentiment scores (positive, negative, objective) for a synset.
from nltk.corpus import sentiwordnet as swn
senti_synset = swn.senti_synset('dog.n.01')
print(senti_synset)
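A SentiSynset exposes its scores through accessor methods:
print(senti_synset.pos_score(), senti_synset.neg_score(), senti_synset.obj_score())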
nltk.corpus.wordnet_ic.ic
Loads an information content (IC) file for use with WordNet similarity measures.
from nltk.corpus import wordnet_ic
ic = wordnet_ic.ic('ic-brown.dat')
print(ic)
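The loaded IC dictionary is meant to be passed to an information-content similarity measure; a minimal sketch:
from nltk.corpus import wordnet
dog = wordnet.synset('dog.n.01')
cat = wordnet.synset('cat.n.01')
print(dog.res_similarity(cat, ic))  # Resnik similarity weighted by Brown corpus IC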
nltk.corpus.nps_chat.tagged_posts
Returns POS-tagged chat posts from the NPS Chat corpus.
from nltk.corpus import nps_chat
tagged_posts = nps_chat.tagged_posts()
print(tagged_posts[:1])
nltk.corpus.movie_reviews.words
Returns the words of the Movie Reviews corpus.
from nltk.corpus import movie_reviews
movie_reviews_words = movie_reviews.words()
print(movie_reviews_words[:10])
nltk.corpus.twitter_samples.strings
Returns tweets from the Twitter samples corpus as strings.
from nltk.corpus import twitter_samples
tweets = twitter_samples.strings()
print(tweets[:1])
nltk.classify.NaiveBayesClassifier
A Naive Bayes classifier for text classification, shown here on the movie reviews corpus.
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]  # the 2000 most frequent words as features
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
nltk.classify.DecisionTreeClassifier
A decision tree classifier for text classification.
import nltk
import random
from nltk.classify import DecisionTreeClassifier
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:100]  # keep the feature set small: decision trees train slowly
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = DecisionTreeClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
nltk.tag.PerceptronTagger
A fast pretrained part-of-speech tagger (requires the 'averaged_perceptron_tagger' resource).
from nltk.tag import PerceptronTagger
from nltk.tokenize import word_tokenize
tagger = PerceptronTagger()
text = "NLTK is a leading platform for building Python programs to work with human language data."
tokens = word_tokenize(text)
tagged_tokens = tagger.tag(tokens)
print(tagged_tokens)
nltk.tag.hmm.HiddenMarkovModelTrainer
Trains a Hidden Markov Model part-of-speech tagger on tagged sentences.
from nltk.tag import hmm
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize
trainer = hmm.HiddenMarkovModelTrainer()
tagged_sents = treebank.tagged_sents()
tagger = trainer.train(tagged_sents)
text = "NLTK is a leading platform for building Python programs to work with human language data."
tokens = word_tokenize(text)
tagged_tokens = tagger.tag(tokens)
print(tagged_tokens)
nltk.chunk.RegexpParser
Chunks tagged tokens with a regular-expression grammar, e.g. to extract noun phrases.
from nltk.chunk import RegexpParser
from nltk import pos_tag
from nltk.tokenize import word_tokenize
text = "The quick brown fox jumps over the lazy dog."
tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)
grammar = "NP: {<DT>?<JJ>*<NN>}"
parser = RegexpParser(grammar)
print(parser.parse(tagged_tokens))
nltk.translate.bleu_score
Computes the BLEU score, a standard metric for comparing a candidate translation against reference translations.
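A minimal sketch, with illustrative reference and candidate sentences:
from nltk.translate.bleu_score import sentence_bleu
reference = [['the', 'cat', 'is', 'on', 'the', 'mat']]
candidate = ['the', 'cat', 'is', 'on', 'the', 'mat']
print(sentence_bleu(reference, candidate))  # 1.0 for an exact match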
nltk.stem.PorterStemmer
The classic Porter stemming algorithm.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
word = "running"
stemmed_word = stemmer.stem(word)
print(stemmed_word)
nltk.stem.LancasterStemmer
The Lancaster stemmer, more aggressive than Porter.
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
word = "running"
stemmed_word = stemmer.stem(word)
print(stemmed_word)
nltk.stem.SnowballStemmer
The Snowball stemmer, which supports multiple languages.
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")
word = "running"
stemmed_word = stemmer.stem(word)
print(stemmed_word)
nltk.stem.WordNetLemmatizer
Lemmatizes words to their dictionary form using WordNet (requires the 'wordnet' resource).
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
word = "running"
lemmatized_word = lemmatizer.lemmatize(word, pos='v')
print(lemmatized_word)
Code Examples Using NLTK for Common Use Cases
1. Sentiment Analysis with a Naive Bayes Classifier
import nltk
from nltk.corpus import movie_reviews
import random
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = NaiveBayesClassifier.train(train_set)
print('Accuracy:', nltk_accuracy(classifier, test_set))
2. Named Entity Recognition
import nltk
from nltk import ne_chunk, pos_tag
from nltk.tokenize import word_tokenize
text = "Barack Obama was born in Hawaii. He was elected president in 2008."
tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)
named_entities = ne_chunk(tagged_tokens)
print(named_entities)
3. Part-of-Speech Tagging with PerceptronTagger
import nltk
from nltk.tag import PerceptronTagger
from nltk.tokenize import word_tokenize
tagger = PerceptronTagger()
text = "NLTK is a leading platform for building Python programs to work with human language data."
tokens = word_tokenize(text)
tagged_tokens = tagger.tag(tokens)
print(tagged_tokens)
4. Building a Language Model
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
# Sample text
text = "NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources."
# Tokenize into sentences, then words, and fit a bigram maximum-likelihood model
tokenized = [word_tokenize(sent.lower()) for sent in sent_tokenize(text)]
train_data, padded_sents = padded_everygram_pipeline(2, tokenized)
model = MLE(2)
model.fit(train_data, padded_sents)
# Generate text from a seed context
context = ('nltk', 'is')
print('Generated text:', ' '.join(model.generate(10, text_seed=context)))
5. Finding Synonyms of a Word
import nltk
from nltk.corpus import wordnet
word = "happy"
synonyms = []
for syn in wordnet.synsets(word):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
print(set(synonyms))
6. Word Sense Disambiguation
import nltk
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize
sentence = word_tokenize("I went to the bank to deposit my money")
sense = lesk(sentence, 'bank', 'n')
print(sense, '-', sense.definition())
7. Chunking with a Regular-Expression Grammar
import nltk
from nltk.chunk import RegexpParser
from nltk.tokenize import word_tokenize
from nltk import pos_tag
text = "The quick brown fox jumps over the lazy dog."
tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)
grammar = "NP: {<DT>?<JJ>*<NN>}"
parser = RegexpParser(grammar)
print(parser.parse(tagged_tokens))
8. Machine Translation Evaluation with BLEU
import nltk
from nltk.translate.bleu_score import sentence_bleu
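A minimal sketch, with illustrative reference and candidate sentences (bigram weights keep the score non-zero for such short inputs):
reference = [['the', 'cat', 'is', 'on', 'the', 'mat']]
candidate = ['the', 'cat', 'sat', 'on', 'the', 'mat']
print(sentence_bleu(reference, candidate, weights=(0.5, 0.5)))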
9. Text Classification with a Decision Tree
import nltk
from nltk.classify import DecisionTreeClassifier
from nltk.corpus import movie_reviews
import random
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:100]  # small feature set: decision trees train slowly
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = DecisionTreeClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
10. Stemming Words
import nltk
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
words = ["running", "jumps", "easily", "fairly"]
stemmed_words = [stemmer.stem(word) for word in words]
print(stemmed_words)
11. Lemmatizing Words
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
words = ["running", "jumps", "easily", "fairly"]
lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]
print(lemmatized_words)
12. Frequency Distribution
import nltk
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data. I
tokens = word_tokenize(text)
fdist = FreqDist(tokens)
print(fdist.most_common(5))
13. Conditional Frequency Distribution
import nltk
from nltk.probability import ConditionalFreqDist
from nltk.corpus import brown
cfd = ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
print(cfd['news'].most_common(10))
14. Finding Synonyms with WordNet
import nltk
from nltk.corpus import wordnet
word = "good"
synonyms = []
for syn in wordnet.synsets(word):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
print(set(synonyms))
15. Part-of-Speech Tagging
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data."
tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)
print(tagged_tokens)
16. Named Entity Recognition
import nltk
from nltk import ne_chunk, pos_tag
from nltk.tokenize import word_tokenize
text = "Barack Obama was born in Hawaii. He was elected president in 2008."
tokens = word_tokenize(text)
print(ne_chunk(pos_tag(tokens)))
17. Concordance
import nltk
from nltk.text import Text
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data. N
tokens = word_tokenize(text)
text_obj = Text(tokens)
text_obj.concordance('NLTK')
18. Finding Similar Words
import nltk
from nltk.text import Text
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data. N
tokens = word_tokenize(text)
text_obj = Text(tokens)
text_obj.similar('NLTK')
19. Common Contexts
import nltk
from nltk.text import Text
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data. N
tokens = word_tokenize(text)
text_obj = Text(tokens)
text_obj.common_contexts(['NLTK', 'platform'])
20. Dispersion Plot
import nltk
from nltk.draw.dispersion import dispersion_plot
from nltk.tokenize import word_tokenize
text = "NLTK is a leading platform for building Python programs to work with human language data. N
tokens = word_tokenize(text)
dispersion_plot(tokens, ['NLTK', 'platform'])