Ufal::NameTag - bindings to NameTag library http://ufal.mff.cuni.cz/nametag.
use strict;
use warnings;

use Ufal::NameTag;

# Load the recognizer model; load() returns a false value on failure.
my $ner_file = 'czech-cnec2.0-140304.ner';
my $ner = Ufal::NameTag::Ner::load($ner_file)
    or die "Cannot load NER from file '$ner_file'\n";

# Build the (already tokenized) input sentence, one form per token.
my $forms = Ufal::NameTag::Forms->new();
$forms->push($_) for qw(Jan Hus bydlel v Praze .);

# Run the recognizer; results are stored into $entities.
my $entities = Ufal::NameTag::NamedEntities->new();
$ner->recognize($forms, $entities);

# Each entity covers tokens start .. start+length-1 of $forms.
for my $i (0 .. $entities->size() - 1) {
    my $entity = $entities->get($i);
    my $first  = $entity->{start};
    my $last   = $entity->{start} + $entity->{length} - 1;
    printf "entity of type %s: '%s'\n",
        $entity->{type},
        join(' ', map { $forms->get($_) } ($first .. $last));
}
To compile the module, a C++11 compiler is needed: g++ 4.7 or newer, clang 3.2 or newer, or Visual Studio 2015.
g++
clang
Visual Studio 2015
Ufal::NameTag is a Perl binding to the NameTag library http://ufal.mff.cuni.cz/nametag.
Ufal::NameTag
All classes can be imported into the current namespace using the all export tag.
all
The bindings are a straightforward conversion of the C++ bindings API. Vectors do not have a native Perl interface; see the Ufal::NameTag::Forms source for reference. Static methods and enumerations are available only through the module, not through an object instance.
C++
The C++ API being wrapped follows. For an API reference of the original C++ API, see http://ufal.mff.cuni.cz/nametag/api-reference.
Helper Structures
-----------------

  typedef vector<string> Forms;

  struct TokenRange {
    size_t start;
    size_t length;
  };
  typedef vector<TokenRange> TokenRanges;

  struct NamedEntity {
    size_t start;
    size_t length;
    string type;

    NamedEntity();
    NamedEntity(size_t start, size_t length, const string& type);
  };
  typedef vector<NamedEntity> NamedEntities;


Main Classes
------------

  class Version {
   public:
    unsigned major;
    unsigned minor;
    unsigned patch;
    string prerelease;

    static Version current();
  };

  class Tokenizer {
   public:
    virtual void setText(const char* text);
    virtual bool nextSentence(Forms* forms, TokenRanges* tokens);

    static Tokenizer* newVerticalTokenizer();
  };

  class Ner {
   public:
    static Ner* load(const char* fname);

    virtual void recognize(Forms& forms, NamedEntities& entities) const;

    virtual void entityTypes(Forms& types) const;
    virtual void gazetteers(Forms& gazetteers, Ints& gazetteer_types) const;

    virtual Tokenizer* newTokenizer() const;
  };
Simple example performing named entity recognition.
use warnings;
use strict;
use open qw(:std :utf8);

use Ufal::NameTag;

# Escape the characters & < > " in $text as XML entity references and
# return the escaped copy.
sub encode_entities($) {
    my ($text) = @_;

    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );
    # A capture group plus lookup table avoids the $& match variable.
    $text =~ s/([&<>"])/$entity_for{$1}/g;

    return $text;
}

# Copy the entities out of the NamedEntities vector and return them as a
# list ordered by start token; for equal starts the longer entity comes
# first, so that enclosing entities are opened before nested ones.
sub sort_entities($) {
    my ($entities) = @_;

    my @entities = map { $entities->get($_) } 0 .. $entities->size() - 1;

    return sort {
        $a->{start} <=> $b->{start} || $b->{length} <=> $a->{length}
    } @entities;
}

@ARGV >= 1 or die "Usage: $0 recognizer_model\n";

print STDERR "Loading ner: ";
my $ner = Ufal::NameTag::Ner::load($ARGV[0]);
$ner or die "Cannot load recognizer from file '$ARGV[0]'\n";
print STDERR "done\n";
# Drop the model argument so that <> reads the remaining files or STDIN.
shift @ARGV;

my $forms    = Ufal::NameTag::Forms->new();
my $tokens   = Ufal::NameTag::TokenRanges->new();
my $entities = Ufal::NameTag::NamedEntities->new();
my @sorted_entities;
my @open_entities;    # stack of last-token indices of currently open entities

my $tokenizer = $ner->newTokenizer();
$tokenizer or die "No tokenizer is defined for the supplied model!";

for (my $not_eof = 1; $not_eof; ) {
    my $text = '';

    # Read block: accumulate lines up to and including the first empty
    # line, or until end of input.
    while (1) {
        my $line = <>;
        last unless ($not_eof = defined $line);
        $text .= $line;
        chomp($line);
        last unless length $line;
    }

    # Tokenize and recognize
    $tokenizer->setText($text);
    my $t = 0;    # offset in $text just past the last token written
    while ($tokenizer->nextSentence($forms, $tokens)) {
        $ner->recognize($forms, $entities);
        @sorted_entities = sort_entities($entities);

        # Write entities
        my $size = $tokens->size();
        my $e    = 0;    # index of the next entity not yet opened
        for my $i (0 .. $size - 1) {
            my $token = $tokens->get($i);
            my ($token_start, $token_length) = ($token->{start}, $token->{length});

            # Text between the previous token and this one is copied verbatim.
            print encode_entities(substr $text, $t, $token_start - $t);
            print '<sentence>' if $i == 0;

            # Open entities starting at current token
            for (; $e < @sorted_entities && $sorted_entities[$e]->{start} == $i; $e++) {
                printf '<ne type="%s">', encode_entities($sorted_entities[$e]->{type});
                push @open_entities,
                    $sorted_entities[$e]->{start} + $sorted_entities[$e]->{length} - 1;
            }

            # The token itself
            printf '<token>%s</token>',
                encode_entities(substr $text, $token_start, $token_length);

            # Close entities ending after current token
            while (@open_entities && $open_entities[-1] == $i) {
                print '</ne>';
                pop @open_entities;
            }
            print '</sentence>' if $i + 1 == $size;

            $t = $token_start + $token_length;
        }
    }

    # Write rest of the text
    print encode_entities(substr $text, $t);
}
Milan Straka <straka@ufal.mff.cuni.cz>
Jana Straková <strakova@ufal.mff.cuni.cz>
Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of Mathematics and Physics, Charles University in Prague, Czech Republic.
This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
To install Ufal::NameTag, copy and paste the appropriate command into your terminal.
cpanm
cpanm Ufal::NameTag
CPAN shell
perl -MCPAN -e shell install Ufal::NameTag
For more information on module installation, please visit the detailed CPAN module installation guide.