0% found this document useful (0 votes)
26 views2 pages

Program 1

Uploaded by

mickeypinky123
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
26 views2 pages

Program 1

Uploaded by

mickeypinky123
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/2

Program 1: Counting words

import nltk
from nltk.corpus import gutenberg
import string
# Fetch only the Gutenberg corpus. A bare nltk.download() opens the
# interactive downloader, which blocks (or fails) when run unattended.
nltk.download('gutenberg')

# Example: load the full text of 'Moby Dick' as a single string.
raw_text = gutenberg.raw('melville-moby_dick.txt')
import re

def preprocess_text(text: str) -> list:
    """Normalize *text* and split it into word tokens.

    Every character in ``string.punctuation`` is deleted (not replaced by a
    space), so "don't" becomes "dont" and "stop-now" becomes "stopnow". The
    result is lowercased and split on runs of whitespace.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of lowercase tokens; empty if *text* has no word characters.
    """
    # Remove punctuation in a single pass.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    text = text.translate(strip_punctuation)

    # Convert to lowercase so counts are case-insensitive.
    text = text.lower()

    # Tokenize on whitespace.
    tokens = text.split()

    return tokens

# Tokenize the raw corpus text; the resulting token list feeds the statistics.
processed_text = preprocess_text(raw_text)
def calculate_statistics(tokens):
    """Return basic count statistics for a token list.

    Args:
        tokens: A sized collection of hashable tokens (e.g. a list of str).

    Returns:
        A tuple ``(document_count, word_count, unique_word_count)`` where
        ``document_count`` is always 1, ``word_count`` is ``len(tokens)``
        and ``unique_word_count`` is the number of distinct tokens.
    """
    # Assuming one document for simplicity.
    document_count = 1

    word_count = len(tokens)
    unique_word_count = len(set(tokens))

    return document_count, word_count, unique_word_count

# Compute the statistics for the tokenized text and report them.
# The call must sit on the same logical line as the assignment; a bare
# trailing "=" is a syntax error.
document_count, word_count, unique_word_count = calculate_statistics(
    processed_text
)

print("Document Count:", document_count)
print("Word Count:", word_count)
print("Unique Word Count:", unique_word_count)

You might also like