from typing import Callable, List, Optional, Tuple

import nltk
from nltk.chunk import ChunkParserI
from nltk.chunk.util import conlltags2tree, tree2conlltags
from nltk.classify import NaiveBayesClassifier
class ClassifierChunker(ChunkParserI):
    """Chunk parser that predicts one chunk tag per token with Naive Bayes.

    Tokens are tagged left to right; the tags already predicted for the
    current sentence are handed to the feature function as ``history``.
    """

    def __init__(self, train_sentences: List[nltk.Tree],
                 feature_func: Optional[Callable] = None):
        """Train the underlying classifier on chunked sentences.

        Args:
            train_sentences: Chunk trees used as training material.
            feature_func: Callable ``(sent, i, history) -> dict`` producing
                the feature set for token ``i``. ``sent`` is a sequence of
                ``(word, pos)`` pairs and ``history`` the list of tags
                assigned to tokens ``0..i-1``. Defaults to
                ``_default_features``.
        """
        self.feature_func = feature_func or self._default_features
        training_data = self._prepare_training_data(train_sentences)
        self.classifier = NaiveBayesClassifier.train(training_data)

    def _prepare_training_data(self, sentences: List[nltk.Tree]):
        """Convert chunk trees into ``(features, tag)`` training pairs.

        Args:
            sentences: Chunk trees to flatten with ``tree2conlltags``.

        Returns:
            A list with one ``(features, tag)`` pair per token.
        """
        data = []
        for tree in sentences:
            conll_tags = tree2conlltags(tree)
            # Drop the gold tag before feature extraction so the feature
            # function sees the same (word, pos) shape as in parse() and
            # cannot read the label it is supposed to predict.
            untagged = [(word, pos) for word, pos, _ in conll_tags]
            history = []
            for i, (_, _, tag) in enumerate(conll_tags):
                features = self.feature_func(untagged, i, history)
                data.append((features, tag))
                history.append(tag)
        return data

    def parse(self, sentence: List[Tuple[str, str]]) -> nltk.Tree:
        """Chunk a POS-tagged sentence.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            The tree built by ``conlltags2tree`` from the predicted tags.
        """
        history = []
        conll_tags = []
        for i, (word, pos) in enumerate(sentence):
            features = self.feature_func(sentence, i, history)
            tag = self.classifier.classify(features)
            conll_tags.append((word, pos, tag))
            history.append(tag)
        return conlltags2tree(conll_tags)

    def _default_features(self, sent, i, history):
        """Build the default feature dict for token ``i`` of ``sent``.

        Args:
            sent: Sequence whose items carry the word at index 0 and the
                POS tag at index 1.
            i: Index of the token to describe.
            history: Tags assigned so far; ``history[i - 1]`` is read when
                ``i > 0``.

        Returns:
            A dict of word, POS, neighbouring-POS, previous-tag and simple
            orthographic features.
        """
        word, pos = sent[i][0], sent[i][1]
        prev_pos = sent[i - 1][1] if i > 0 else "<START>"
        next_pos = sent[i + 1][1] if i < len(sent) - 1 else "<END>"
        prev_tag = history[i - 1] if i > 0 else "<START>"
        return {
            "word": word,
            "pos": pos,
            "prev_pos": prev_pos,
            "next_pos": next_pos,
            "prev_tag": prev_tag,
            # Slice instead of index so an empty token yields False
            # rather than raising IndexError.
            "is_capitalized": word[:1].isupper(),
            "is_numeric": word.isdigit(),
            "pos+word": f"{pos}_{word}",
        }