NLP Lab 1
code:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Input text to analyze. The original referenced an undefined `paragraph`
# variable; it must be defined before tokenizing.
paragraph = "sukumar is good boy. Sukumar in vitap"

# Tokenize into words and count how often each token occurs.
words = word_tokenize(paragraph)
fdist = FreqDist(words)

# Print each distinct token with its frequency
# (the original `print` had lost its loop indentation).
for word, frequency in fdist.items():
    print(f"{word}: {frequency}")
# Sentence tokenization example.
# `sent_tokenize` was used without being imported in the original snippet.
from nltk.tokenize import sent_tokenize

# Sample document
document = "sukumar is good boy. Sukumar in vitap"

# Tokenize the document into sentences
sentences = sent_tokenize(document)
# Stopword removal and rare-word detection example.
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

# Sample document (the original literal was broken across two source lines;
# re-joined into one valid string — original wording, typos included, kept).
document = ("running in the forest is most dangerous than any ting in world of "
            "human. sukumar sukumar hero model model run")

# Tokenize and build a frequency distribution for THIS document.
# The original reused `words`/`fdist` left over from the previous example,
# so the filtering below operated on the wrong text.
words = word_tokenize(document)
fdist = FreqDist(words)

# Remove stopwords (case-insensitive membership test against NLTK's English list).
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]

# Define a threshold for rare words (e.g., words that occur less than 2 times)
rare_words = [word for word, frequency in fdist.items() if frequency < 2]
print(filtered_words)
# Sample document (the original literal was split across two source lines,
# which is invalid Python; re-joined via implicit string concatenation).
document = ("NLTK is a leading platform for building Python programs. It provides "
            "easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet.")
6. Write a program to read the words from a string variable/text and perform
tokenization and Lancaster stemming on the input string.
code:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer
7.NGRAM:
CODE:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
import re

# NOTE(review): `s` (the input text) is read below but never defined in this
# snippet — confirm it is assigned earlier in the full source.

# Normalize: lowercase, then replace everything that is not a letter,
# digit or whitespace with a space.
s = s.lower()
s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)

# Split on single spaces and drop the empty tokens left by the substitution.
tokens = [token for token in s.split(" ") if token != ""]

# Build the list of 5-grams (fixed typo: `ouput` -> `output`).
output = list(ngrams(tokens, 5))
print(output)
10.Affix Tagger
code:
import nltk
from nltk.corpus import treebank
from nltk.tag import AffixTagger
# Sentences to be tagged by the AffixTagger example.
input_sentences = [
    'the dog chased the cat',
    'the man saw the ball',
]
13.Shallow parsing
import nltk

# Resources required by this pipeline: 'punkt' for nltk.word_tokenize
# (missing in the original — the NER section downloads it, this one did not)
# and the averaged-perceptron model for nltk.pos_tag.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Shallow (chunk) parsing: tokenize, POS-tag, then group tags into phrases.
text = "The quick brown fox jumps over the lazy dog"
tokens = nltk.word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)

# Regex chunk grammar: NP, PP, VP and CLAUSE phrase patterns over POS tags.
chunk_grammar = r"""
NP: {<DT|JJ|NN.*>+} # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>} # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
CLAUSE: {<NP><VP>} # Chunk NP followed by VP
"""
chunk_parser = nltk.RegexpParser(chunk_grammar)
chunks = chunk_parser.parse(pos_tags)
print(chunks)
#14 NER:
code:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

# Download the resources the NER pipeline needs.
nltk.download("punkt")
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Sample document (the original literal was split across three source lines,
# which is invalid Python; re-joined via implicit string concatenation).
doc = ("Harry Potter, the young wizard with a lightning-shaped scar, attended "
       "Hogwarts School, faced challenges, and triumphed over the dark wizard Voldemort, "
       "bringing an end to the magical conflict.")

# Pipeline: tokenize -> POS-tag -> named-entity chunk.
words = word_tokenize(doc)
pos_tags = pos_tag(words)
ne_tags = ne_chunk(pos_tags)

# Show the resulting NE tree (the original computed it but never displayed it).
print(ne_tags)