# 7 TextAnalysis
# --- TextBlob basics: tokenization, counts, parsing, string-like operations ---
import nltk
from textblob import TextBlob  # was used below but never imported

nltk.download('punkt')  # download the Punkt sentence-tokenizer models

b1 = TextBlob("beautifull is bettter level than ugly")
b1.words          # WordList of word tokens
b1.sentences      # list of Sentence objects
b1.words[3].pluralize()  # pluralize the 4th word: "level" -> "levels"

# Original line was wrapped mid-string; rejoined into one literal.
sen = TextBlob("My name name name is anthony gonsalvis main duniya mmein akela hoon")
sen.word_counts["name"]  # frequency of "name" in the blob (3)
print(sen.parse())       # shallow (chunk) parse of the text

sen[0:19]        # substring: TextBlob supports slicing like a str
b1.upper()       # uppercase copy, like str.upper()
b1.find("ugly")  # character index at which "ugly" is first found

apple = TextBlob("apples")
banana = TextBlob("banana")
apple > banana  # lexicographic comparison of the underlying strings

b1.ngrams(n=3)
# An n-gram is a contiguous sequence of n items from a given sample of
# text or speech.
# --- Sentence tokenization with NLTK ---
import nltk
from nltk import tokenize
from nltk.tokenize import sent_tokenize

text = """ Goood day it was today in pune, I loved the weather in Pune. Pune is the
best city to live in."""
text

tokenized_text = sent_tokenize(text)
print(tokenized_text)
# sent_tokenize splits a text into a list of sentences using an algorithm
# that considers punctuation and capitalization.
# --- Word frequencies and stopword filtering ---
# `fd`, `tokenizer_word` and `st` were used without being defined;
# reconstruct them from `text` (defined above) with the obvious intent.
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

tokenizer_word = word_tokenize(text)   # word-level tokens of `text`
fd = FreqDist(tokenizer_word)          # token -> frequency counter
fd.most_common(4)                      # the 4 most frequent tokens

nltk.download('stopwords')
from nltk.corpus import stopwords
# Stopwords are commonly used words (such as "the", "a", "an", "in", "on", etc.)
# Downloading the stopwords corpus is useful because it allows you to access a
# predefined list of stopwords that you can use to filter out irrelevant words
# from your text data during preprocessing.
st = set(stopwords.words("english"))   # set lookup is O(1) per token

# Keep only the tokens that are not stopwords.
filtered_sent = [w for w in tokenizer_word if w not in st]
# --- Part-of-speech tagging ---
nltk.download("wordnet")
nltk.download('averaged_perceptron_tagger')

# `tokens` was used without being defined; tokenize `text` (defined above).
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)
nltk.pos_tag(tokens)  # list of (token, POS-tag) pairs
# Tag abbreviations seen in the output:
#   NNP - proper noun, singular
#   VBD - verb, past tense
#   IN  - preposition / subordinating conjunction
#   ,   - punctuation mark (comma)
#   CD  - cardinal number