NLP Lab1

The document provides examples of code to perform various natural language processing tasks including tokenization, stemming, part-of-speech tagging, n-gram modeling, and shallow parsing.

1. Read the paragraph and obtain the frequency of words.

code:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

paragraph = "Sukumar is good at coding and pratcing lot of problems in


leetcode .sukumar is very nice guy"

words = word_tokenize(paragraph)

fdist = FreqDist(words)
for word, frequency in fdist.items():
print(f"{word}: {frequency}")

2. Write a program to split a document into sentences.


code:
import nltk
from nltk.tokenize import sent_tokenize

# Sample document
document = "sukumar is good boy. Sukumar in vitap"
# Tokenize the document into sentences
sentences = sent_tokenize(document)

# Print each sentence
for sentence in sentences:
    print(sentence)

3. Perform tokenizing and stemming by reading the input string.


code:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Sample input string
input_string = "i am running"

# Tokenize the input string into words
words = word_tokenize(input_string)

# Initialize the Porter stemmer
stemmer = PorterStemmer()

# Perform stemming on each word
stemmed_words = [stemmer.stem(word) for word in words]

# Print the original words and their stemmed forms
for original, stemmed in zip(words, stemmed_words):
    print(f"{original} -> {stemmed}")

4. Remove the stopwords and rare words from the document.


code:
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# Sample document
document = "running in the forest is most dangerous than any ting in world of
human. sukumar sukumar hero model model run"

# Tokenize the document into words


words = word_tokenize(document)

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]

# Calculate the frequency distribution of words
fdist = FreqDist(filtered_words)

# Define a threshold for rare words (e.g., words that occur less than 2 times)
rare_words = [word for word, frequency in fdist.items() if frequency < 2]

# Remove rare words from the filtered words
filtered_words = [word for word in filtered_words if word not in rare_words]

# Join the filtered words back into a document
filtered_document = ' '.join(filtered_words)

print(filtered_document)
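
FreqDist also exposes hapaxes(), which returns the tokens that occur exactly once, so with the threshold of 2 used above the rare-word list could equivalently be built as follows (a short sketch reusing fdist from the snippet above):

# hapaxes() lists tokens with frequency 1, i.e. the rare words under a threshold of 2
rare_words = fdist.hapaxes()
print(rare_words)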

5. Identify the parts of speech in the document.


code:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Sample document
document = "NLTK is a leading platform for building Python programs. It provides
easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet."

# Tokenize the document into words


words = word_tokenize(document)

# Perform part-of-speech tagging


pos_tags = pos_tag(words)

# Print the part-of-speech tags


for word, pos_tag in pos_tags:
print(f"{word}: {pos_tag}")

6. Write a program to read the words from a string variable/text and perform tokenizing and Lancaster stemming on the input string.
code:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer

# Sample input string
input_string = "NLTK is a leading platform for building Python programs."

# Tokenize the input string into words
words = word_tokenize(input_string)

# Initialize the Lancaster stemmer
stemmer = LancasterStemmer()

# Perform stemming on each word
stemmed_words = [stemmer.stem(word) for word in words]

# Print the original words and their stemmed forms
for original, stemmed in zip(words, stemmed_words):
    print(f"{original} -> {stemmed}")

7. N-grams:
CODE:

import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.util import ngrams
import re

s= """Natural language processing is the ability of a computer program to


understand
human language as it is spoken and written referred to as natural language. It is
a
component of Artificial intelligence."""

s = s.lower()
s = re.sub(r'[^a-zA-Z0-9\s]',' ',s)
tokens = [token for token in s.split(" ") if token!=""]
ouput = list(ngrams(tokens,5))
print(ouput)
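
The same ngrams helper produces bigrams or trigrams simply by changing n; a minimal self-contained sketch with an illustrative sentence:

from nltk.util import ngrams

tokens = "natural language processing is fun".split()

# n=2 gives bigrams, n=3 gives trigrams
print(list(ngrams(tokens, 2)))
print(list(ngrams(tokens, 3)))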

9. Unigram, Bigram, and Trigram taggers

CODE:
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

# Download the Treebank corpus if not already downloaded
nltk.download('treebank')

# Get tagged sentences from the Treebank corpus
tagged_sentences = treebank.tagged_sents()

# Split the tagged sentences into train and test sets
train_size = int(0.8 * len(tagged_sentences))
train_sents = tagged_sentences[:train_size]
test_sents = tagged_sentences[train_size:]

# Train Unigram, Bigram, and Trigram taggers, each backing off to the previous one
unigram_tagger = UnigramTagger(train_sents)
bigram_tagger = BigramTagger(train_sents, backoff=unigram_tagger)
trigram_tagger = TrigramTagger(train_sents, backoff=bigram_tagger)

# Evaluate the taggers on the test set
print(f"Unigram tagger accuracy: {unigram_tagger.evaluate(test_sents)}")
print(f"Bigram tagger accuracy: {bigram_tagger.evaluate(test_sents)}")
print(f"Trigram tagger accuracy: {trigram_tagger.evaluate(test_sents)}")

# Tag a sample sentence
sentence = "Barack Obama was born in Hawaii."
words = nltk.word_tokenize(sentence)
tags = trigram_tagger.tag(words)
print(tags)
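
Note that newer NLTK releases also provide accuracy() and mark evaluate() as deprecated, and that words never seen in training come back tagged None. A common remedy is to put a DefaultTagger at the bottom of the backoff chain; a minimal sketch reusing train_sents from above (the 'NN' default is an illustrative choice):

import nltk
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

# DefaultTagger('NN') guesses 'NN' for any token the n-gram taggers have not seen,
# so the chain never returns None tags
default_tagger = DefaultTagger('NN')
unigram_tagger = UnigramTagger(train_sents, backoff=default_tagger)
bigram_tagger = BigramTagger(train_sents, backoff=unigram_tagger)
trigram_tagger = TrigramTagger(train_sents, backoff=bigram_tagger)
print(trigram_tagger.tag(nltk.word_tokenize("Barack Obama was born in Hawaii.")))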

10. Affix Tagger
code:
import nltk
from nltk.corpus import treebank
from nltk.tag import AffixTagger

# Download the Treebank corpus if not already downloaded
nltk.download('treebank')

# Get tagged sentences from the Treebank corpus
tagged_sentences = treebank.tagged_sents()

# Split the tagged sentences into train and test sets
train_size = int(0.8 * len(tagged_sentences))
train_sents = tagged_sentences[:train_size]
test_sents = tagged_sentences[train_size:]

# Specify the affix tagger parameters
# (a positive affix_length uses word prefixes; a negative value uses suffixes)
prefix_length = 3
min_stem_length = 2

# Train an affix tagger on 3-character prefixes
affix_tagger = AffixTagger(train_sents, affix_length=prefix_length,
                           min_stem_length=min_stem_length)

# Tag a sample sentence
sentence = "Barack Obama was born in Hawaii."
words = nltk.word_tokenize(sentence)
tags = affix_tagger.tag(words)
print(tags)
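
For comparison, passing a negative affix_length trains the tagger on word suffixes instead of prefixes; a short sketch reusing train_sents and test_sents from above:

# A negative affix_length keys the tagger on 3-character suffixes
suffix_tagger = AffixTagger(train_sents, affix_length=-3, min_stem_length=2)
print(f"Suffix-based affix tagger accuracy: {suffix_tagger.evaluate(test_sents)}")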

12. Dependency parser


code:
import nltk

# Define a simple context-free grammar for parsing
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det N | N
VP -> V NP | V
Det -> 'the'
N -> 'dog' | 'cat' | 'man' | 'ball'
V -> 'chased' | 'saw' | 'caught'
""")

# Input sentences
input_sentences = ['the dog chased the cat', 'the man saw the ball']

# Create a chart parser
parser = nltk.ChartParser(grammar)

# Iterate over input sentences
for sent in input_sentences:
    # Tokenize the sentence
    tokens = nltk.word_tokenize(sent)
    # Parse the sentence
    for tree in parser.parse(tokens):
        # nltk.ChartParser yields constituency parse trees; convert to a
        # ParentedTree so each node keeps a reference to its parent
        parented_tree = nltk.tree.ParentedTree.convert(tree)
        # Print the original sentence
        print("Input Sentence:", sent)
        # Print the parse tree
        print("Parse Tree:")
        print(parented_tree)
        print()
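
The chart parser above yields constituency trees rather than dependency structures. For an actual dependency parse, NLTK provides a rule-based projective dependency parser; a minimal sketch for the first input sentence, where the head -> dependent rules are hand-written illustrative assumptions rather than part of the lab:

import nltk
from nltk.grammar import DependencyGrammar
from nltk.parse import ProjectiveDependencyParser

# Hand-written head -> dependent rules for "the dog chased the cat"
dep_grammar = DependencyGrammar.fromstring("""
'chased' -> 'dog' | 'cat'
'dog' -> 'the'
'cat' -> 'the'
""")

dep_parser = ProjectiveDependencyParser(dep_grammar)

# Each parse is returned as a tree rooted at the head word ('chased')
for dep_tree in dep_parser.parse("the dog chased the cat".split()):
    print(dep_tree)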

13. Shallow parsing

code:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
text = "The quick brown fox jumps over the lazy dog"

tokens = nltk.word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)

chunk_grammar = r"""
NP: {<DT|JJ|NN.*>+} # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>} # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
CLAUSE: {<NP><VP>} # Chunk NP followed by VP
"""

chunk_parser = nltk.RegexpParser(chunk_grammar)

chunks = chunk_parser.parse(pos_tags)

print(chunks)
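
To pull out only the noun-phrase chunks from the resulting tree, the subtrees can be filtered by label; a short sketch reusing chunks from above:

# Print each NP chunk, joining its (word, tag) leaves back into a phrase
for subtree in chunks.subtrees(filter=lambda t: t.label() == 'NP'):
    print(' '.join(word for word, tag in subtree.leaves()))
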
14. Named Entity Recognition (NER)
code:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
nltk.download("punkt")
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

doc = "Harry Potter, the young wizard with a lightning-shaped scar, attended
Hogwarts School, faced challenges, and triumphed over the dark wizard Voldemort,
bringing an end to the magical conflict."

words = word_tokenize(doc)

pos_tags = pos_tag(words)

ne_tags = ne_chunk(pos_tags)

# Print each named-entity chunk with its label
for chunk in ne_tags:
    if hasattr(chunk, 'label'):
        print(chunk.label(), ':', ' '.join(c[0] for c in chunk))
