Milan Straka


Ufal::MorphoDiTa - bindings to the MorphoDiTa library


  use Ufal::MorphoDiTa;

  # Load the tagger model; a false return value is treated as a load failure.
  my $tagger_file = 'english-morphium-wsj-140407.tagger';
  my $tagger = Ufal::MorphoDiTa::Tagger::load($tagger_file) or die "Cannot load tagger from file '$tagger_file'\n";
  # Input forms and the container that tag() fills with one tagged lemma per form.
  my $forms  = Ufal::MorphoDiTa::Forms->new(); $forms->push($_) for qw(How are you ?);
  my $lemmas = Ufal::MorphoDiTa::TaggedLemmas->new();

  $tagger->tag($forms, $lemmas);

  for my $i (0 .. $lemmas->size()-1) {
    my $lemma = $lemmas->get($i);
    printf "token = %s, lemma = %s, tag = %s\n", $forms->get($i), $lemma->{lemma}, $lemma->{tag};
  }


To compile the module, a C++11 compiler is needed: g++ 4.7 or newer, clang 3.2 or newer, or Visual Studio 2015.


Ufal::MorphoDiTa is a Perl binding to the MorphoDiTa library.

The bindings are a straightforward conversion of the C++ bindings API. Vectors do not have a native Perl interface; see the Ufal::MorphoDiTa::Forms source for reference. Static methods and enumerations are available only through the module, not through an object instance.

Wrapped C++ API

The C++ API being wrapped follows. For an API reference of the original C++ API, see http://ufal.mff.cuni.cz/morphodita/api-reference.

  Helper Structures
    typedef vector<int> Indices;
    typedef vector<string> Forms;
    struct TaggedForm {
      string form;
      string tag;
    typedef vector<TaggedForm> TaggedForms;
    struct TaggedLemma {
      string lemma;
      string tag;
    typedef vector<TaggedLemma> TaggedLemmas;
    typedef vector<TaggedLemmas> Analyses;
    struct TaggedLemmaForms {
      string lemma;
      TaggedForms forms;
    typedef vector<TaggedLemmaForms> TaggedLemmasForms;
    struct TokenRange {
      size_t start;
      size_t length;
    typedef vector<TokenRange> TokenRanges;
    struct DerivatedLemma {
      std::string lemma;
    typedef vector<DerivatedLemma> DerivatedLemmas;
  Main Classes
    class Version {
      unsigned major;
      unsigned minor;
      unsigned patch;
      string prerelease;
      static Version current();
    class Tokenizer {
      virtual void setText(const char* text);
      virtual bool nextSentence(Forms* forms, TokenRanges* tokens);
      static Tokenizer* newVerticalTokenizer();
      static Tokenizer* newCzechTokenizer();
      static Tokenizer* newEnglishTokenizer();
      static Tokenizer* newGenericTokenizer();
    class Derivator {
      virtual bool parent(const char* lemma, DerivatedLemma& parent) const;
      virtual bool children(const char* lemma, DerivatedLemmas& children) const;
    class DerivationFormatter {
      virtual string formatDerivation(const char* lemma) const;
      static DerivationFormatter* newNoneDerivationFormatter();
      static DerivationFormatter* newRootDerivationFormatter(const Derivator* derivator);
      static DerivationFormatter* newPathDerivationFormatter(const Derivator* derivator);
      static DerivationFormatter* newTreeDerivationFormatter(const Derivator* derivator);
      static DerivationFormatter* newDerivationFormatter(const char* name, const Derivator* derivator);
    class Morpho {
      static Morpho* load(const char* fname);
      enum { NO_GUESSER = 0, GUESSER = 1 };
      virtual int analyze(const char* form, int guesser, TaggedLemmas& lemmas) const;
      virtual int generate(const char* lemma, const char* tag_wildcard, int guesser, TaggedLemmasForms& forms) const;
      virtual string rawLemma(const char* lemma) const;
      virtual string lemmaId(const char* lemma) const;
      virtual string rawForm(const char* form) const;
      virtual Tokenizer* newTokenizer() const;
      virtual Derivator* getDerivator() const;
    class Tagger {
      static Tagger* load(const char* fname);
      virtual const Morpho* getMorpho() const;
      virtual void tag(const Forms& forms, TaggedLemmas& tags, int guesser = -1) const;
      virtual void tagAnalyzed(const Forms& forms, const Analyses& analyses, Indices& tags) const;
      Tokenizer* newTokenizer() const;
    class TagsetConverter {
      static TagsetConverter* newIdentityConverter();
      static TagsetConverter* newPdtToConll2009Converter();
      static TagsetConverter* newStripLemmaCommentConverter(const Morpho& morpho);
      static TagsetConverter* newStripLemmaIdConverter(const Morpho& morpho);
      virtual void convert(TaggedLemma& lemma) const;
      virtual void convertAnalyzed(TaggedLemmas& lemmas) const;
      virtual void convertGenerated(TaggedLemmasForms& forms) const;



Simple example performing morphological analysis and generation.

  use warnings;
  use strict;
  use open qw(:std :utf8);
  use Ufal::MorphoDiTa;
  # Usage: script dict_file [input files...]
  # Each input line is either "form" (analyze it) or "lemma<TAB>tag_wildcard" (generate forms).
  @ARGV >= 1 or die "Usage: $0 dict_file\n";
  print STDERR "Loading dictionary: ";
  my $morpho = Ufal::MorphoDiTa::Morpho::load($ARGV[0]);
  $morpho or die "Cannot load dictionary from file '$ARGV[0]'\n";
  print STDERR "done\n";
  # Drop the dictionary name so that <> reads only the remaining files (or STDIN).
  shift @ARGV;
  my $lemmas = Ufal::MorphoDiTa::TaggedLemmas->new();
  my $lemmas_forms = Ufal::MorphoDiTa::TaggedLemmasForms->new();
  while (<>) {
    # Strip the line terminator so it does not become part of the form or tag wildcard.
    chomp;
    # Limit -1 keeps trailing empty fields, so "lemma<TAB>" still counts as two tokens.
    my @tokens = split /\t/, $_, -1;
    if (@tokens == 1) { #Analyze
      my $result = $morpho->analyze($tokens[0], $Ufal::MorphoDiTa::Morpho::GUESSER, $lemmas);
      my $guesser = $result == $Ufal::MorphoDiTa::Morpho::GUESSER ? "Guesser " : "";
      for (my ($i, $size) = (0, $lemmas->size()); $i < $size; $i++) {
        my $lemma = $lemmas->get($i);
        printf "%sLemma: %s %s\n", $guesser, $lemma->{lemma}, $lemma->{tag};
      }
    } elsif (@tokens == 2) { #Generate
      my $result = $morpho->generate($tokens[0], $tokens[1], $Ufal::MorphoDiTa::Morpho::GUESSER, $lemmas_forms);
      my $guesser = $result == $Ufal::MorphoDiTa::Morpho::GUESSER ? "Guesser " : "";
      for (my $i = 0; $i < $lemmas_forms->size(); $i++) {
        my $lemma_forms = $lemmas_forms->get($i);
        printf "%sLemma: %s\n", $guesser, $lemma_forms->{lemma};
        # Separate index for the forms of this lemma, so the outer $i is not shadowed.
        for (my $j = 0; $j < $lemma_forms->{forms}->size(); $j++) {
          my $form = $lemma_forms->{forms}->get($j);
          printf "  %s %s\n", $form->{form}, $form->{tag};
        }
      }
    }
  }


Simple example performing tokenization and PoS tagging.

  use warnings;
  use strict;
  use open qw(:std :utf8);
  use Ufal::MorphoDiTa;
  # Return a copy of the argument with &, <, > and " replaced by XML entities.
  sub encode_entities($) {
    my ($text) = @_;
    my %entity_for = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;');
    $text =~ s/([&<>"])/$entity_for{$1}/g;
    return $text;
  }
  # Usage: script tagger_file [input files...]
  @ARGV >= 1 or die "Usage: $0 tagger_file\n";
  print STDERR "Loading tagger: ";
  my $tagger = Ufal::MorphoDiTa::Tagger::load($ARGV[0]);
  $tagger or die "Cannot load tagger from file '$ARGV[0]'\n";
  print STDERR "done\n";
  # Drop the model name so that <> reads only the remaining files (or STDIN).
  shift @ARGV;
  my $forms = Ufal::MorphoDiTa::Forms->new();
  my $lemmas = Ufal::MorphoDiTa::TaggedLemmas->new();
  my $tokens = Ufal::MorphoDiTa::TokenRanges->new();
  my $tokenizer = $tagger->newTokenizer();
  $tokenizer or die "No tokenizer is defined for the supplied model!";
  for (my $not_eof = 1; $not_eof; ) {
    my $text = '';
    # Read block: accumulate lines up to and including the first empty line, or EOF.
    while (1) {
      my $line = <>;
      last unless ($not_eof = defined $line);
      $text .= $line;
      # Test emptiness on the chomped copy; $text keeps the original newline.
      chomp($line);
      last unless length $line;
    }
    # Tag
    $tokenizer->setText($text);
    # $t is the offset in $text just past the last token already printed.
    my $t = 0;
    while ($tokenizer->nextSentence($forms, $tokens)) {
      $tagger->tag($forms, $lemmas);
      for (my ($i, $size) = (0, $lemmas->size()); $i < $size; $i++) {
        my $lemma = $lemmas->get($i);
        my $token = $tokens->get($i);
        my ($token_start, $token_length) = ($token->{start}, $token->{length});
        # Emit the text between tokens verbatim (escaped), then the token wrapped in markup.
        printf "%s%s<token lemma=\"%s\" tag=\"%s\">%s</token>%s",
          encode_entities(substr $text, $t, $token_start - $t),
          $i == 0 ? "<sentence>" : "",
          encode_entities($lemma->{lemma}),
          encode_entities($lemma->{tag}),
          encode_entities(substr $text, $token_start, $token_length),
          $i + 1 == $size ? "</sentence>" : "";
        $t = $token_start + $token_length;
      }
    }
    # Print whatever follows the last token of the block.
    print encode_entities(substr $text, $t);
  }


Milan Straka <>

Jana Straková <>


Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of Mathematics and Physics, Charles University in Prague, Czech Republic.

This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.