from
nltk.chunk
import
ChunkParserI
from
nltk.chunk.util
import
tree2conlltags, conlltags2tree
from
nltk.tag
import
UnigramTagger, BigramTagger
from
tag_util
import
backoff_tagger
def conll_tag_chunks(chunk_data):
    """Reduce chunk trees to per-sentence lists of ``(tag, chunk_tag)`` pairs.

    Each tree in *chunk_data* is converted with ``tree2conlltags`` and the
    resulting 3-tuples are stripped of their first element (the word), so
    only the second and third elements remain, in their original order.

    Args:
        chunk_data: An iterable of trees accepted by ``tree2conlltags``.

    Returns:
        A list with one entry per input tree; each entry is a list of
        2-tuples built from the last two fields of the converted triples.
    """
    pair_sents = []
    for chunk_tree in chunk_data:
        triples = tree2conlltags(chunk_tree)
        # Drop the word; keep only what follows it in each triple.
        pair_sents.append([(pos, iob) for (_word, pos, iob) in triples])
    return pair_sents
class TagChunker(ChunkParserI):
    """Chunk parser that predicts chunk tags with a tagger trained on tags.

    The tagger is trained on the ``(tag, chunk_tag)`` pairs produced by
    ``conll_tag_chunks`` and is then applied to the tag sequence of each
    sentence passed to :meth:`parse`.
    """

    def __init__(self, train_chunks, tagger_classes=None):
        """Train the underlying tagger from chunked training trees.

        Args:
            train_chunks: Iterable of chunk trees; passed through
                ``conll_tag_chunks`` to obtain the training pairs.
            tagger_classes: Tagger classes handed to ``backoff_tagger``.
                When omitted (or ``None``), ``[UnigramTagger, BigramTagger]``
                is used. NOTE(review): ``backoff_tagger`` lives in
                ``tag_util`` and is not visible here -- presumably it chains
                the classes as backoff taggers; confirm against that module.
        """
        # A fresh list per call: a list literal in the signature would be a
        # single object shared by every instance that relies on the default.
        if tagger_classes is None:
            tagger_classes = [UnigramTagger, BigramTagger]

        train_data = conll_tag_chunks(train_chunks)
        self.tagger = backoff_tagger(train_data, tagger_classes)

    def parse(self, tagged_sent):
        """Build a chunk tree for one tagged sentence.

        Args:
            tagged_sent: Sequence of ``(word, tag)`` pairs.

        Returns:
            ``None`` when *tagged_sent* is empty (or otherwise falsy);
            otherwise the result of ``conlltags2tree`` applied to
            ``(word, tag, chunk_tag)`` triples.
        """
        if not tagged_sent:
            return None

        # Split the pairs into parallel word and tag sequences.
        words, tags = zip(*tagged_sent)
        # The tagger labels each tag, yielding (tag, chunk_tag) pairs.
        chunks = self.tagger.tag(tags)
        triples = [(w, t, c) for (w, (t, c)) in zip(words, chunks)]
        return conlltags2tree(triples)