Milan Straka


Ufal::MorphoDiTa - bindings to the MorphoDiTa library


Ufal::MorphoDiTa is a Perl binding to the MorphoDiTa library.

All classes can be imported into the current namespace using the :all export tag.

The bindings are a straightforward conversion of the C++ bindings API. Vectors do not have a native Perl interface; see the Ufal::MorphoDiTa::Forms source for reference. Static methods and enumerations are available only through the module, not through an object instance.

Wrapped C++ API

The C++ API being wrapped follows. For an API reference of the original C++ API, see the MorphoDiTa API reference documentation.

  Helper Structures
    typedef vector<string> Forms;
    struct TaggedForm {
      string form;
      string tag;
    typedef vector<TaggedForm> TaggedForms;
    struct TaggedLemma {
      string lemma;
      string tag;
    typedef vector<TaggedLemma> TaggedLemmas;
    struct TaggedLemmaForms {
      string lemma;
      TaggedForms forms;
    typedef vector<TaggedLemmaForms> TaggedLemmasForms;
    struct TokenRange {
      size_t start;
      size_t length;
    typedef vector<TokenRange> TokenRanges;
  Main Classes
    class Version {
      unsigned major;
      unsigned minor;
      unsigned patch;
      static Version current();
    class Tokenizer {
      virtual void setText(const char* text);
      virtual bool nextSentence(Forms* forms, TokenRanges* tokens);
      static Tokenizer* newVerticalTokenizer();
      static Tokenizer* newCzechTokenizer();
      static Tokenizer* newEnglishTokenizer();
      static Tokenizer* newGenericTokenizer();
    class Morpho {
      static Morpho* load(const char* fname);
      enum { NO_GUESSER = 0, GUESSER = 1 };
      virtual int analyze(const char* form, int guesser, TaggedLemmas& lemmas) const;
      virtual int generate(const char* lemma, const char* tag_wildcard, int guesser, TaggedLemmasForms& forms) const;
      virtual string rawLemma(const char* lemma) const;
      virtual string lemmaId(const char* lemma) const;
      virtual string rawForm(const char* form) const;
      virtual Tokenizer* newTokenizer() const;
    class Tagger {
      static Tagger* load(const char* fname);
      virtual const Morpho* getMorpho() const;
      virtual void tag(Forms& forms, TaggedLemmas& tags) const;
      Tokenizer* newTokenizer() const;
    class TagsetConverter {
      static TagsetConverter* newIdentityConverter();
      static TagsetConverter* newPdtToConll2009Converter();
      virtual void convert(TaggedLemma& lemma) const;
      virtual void convertAnalyzed(TaggedLemmas& lemmas) const;
      virtual void convertGenerated(TaggedLemmasForms& forms) const;



Simple example performing morphological analysis and generation.

  # Reads lines from the remaining arguments (or standard input).
  # A line with one tab-separated field is analyzed as a form; a line with
  # two fields (lemma, tag wildcard) is used to generate forms.
  use strict;
  use warnings;
  use open qw(:std :utf8);
  use Ufal::MorphoDiTa qw(:all);
  @ARGV >= 1 or die "Usage: $0 dict_file\n";
  print STDERR "Loading dictionary: ";
  my $morpho = Morpho::load($ARGV[0]);
  $morpho or die "Cannot load dictionary from file '$ARGV[0]'\n";
  print STDERR "done\n";
  # Drop the dictionary path so that <> reads only the remaining inputs.
  shift @ARGV;
  my $lemmas = TaggedLemmas->new();
  my $lemmas_forms = TaggedLemmasForms->new();
  while (my $line = <>) {
    # Remove the line terminator; otherwise it would end up inside the
    # form (or the tag wildcard) handed to the library.
    chomp $line;
    my @tokens = split /\t/, $line, -1;
    if (@tokens == 1) { #Analyze
      my $result = $morpho->analyze($tokens[0], $Morpho::GUESSER, $lemmas);
      my $guesser = $result == $Morpho::GUESSER ? "Guesser " : "";
      for (my ($i, $size) = (0, $lemmas->size()); $i < $size; $i++) {
        my $lemma = $lemmas->get($i);
        printf "%sLemma: %s %s\n", $guesser, $lemma->{lemma}, $lemma->{tag};
      }
    } elsif (@tokens == 2) { #Generate
      my $result = $morpho->generate($tokens[0], $tokens[1], $Morpho::GUESSER, $lemmas_forms);
      my $guesser = $result == $Morpho::GUESSER ? "Guesser " : "";
      for (my $i = 0; $i < $lemmas_forms->size(); $i++) {
        my $lemma_forms = $lemmas_forms->get($i);
        printf "%sLemma: %s\n", $guesser, $lemma_forms->{lemma};
        # Separate index name so the outer loop counter is not shadowed.
        for (my $j = 0; $j < $lemma_forms->{forms}->size(); $j++) {
          my $form = $lemma_forms->{forms}->get($j);
          printf "  %s %s\n", $form->{form}, $form->{tag};
        }
      }
    }
  }


Simple example performing tokenization and PoS tagging.

  # Reads text in blocks ended by an empty line, tokenizes and tags each
  # block, and prints it with <sentence> and <token> markup around the
  # original text.
  use strict;
  use warnings;
  use open qw(:std :utf8);
  use Ufal::MorphoDiTa qw(:all);
  my %entity_for = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;');
  # Returns a copy of the given text with &, <, > and " replaced by entities.
  sub encode_entities {
    my ($text) = @_;
    $text =~ s/([&<>"])/$entity_for{$1}/g;
    return $text;
  }
  @ARGV >= 1 or die "Usage: $0 tagger_file\n";
  print STDERR "Loading tagger: ";
  my $tagger = Tagger::load($ARGV[0]);
  $tagger or die "Cannot load tagger from file '$ARGV[0]'\n";
  print STDERR "done\n";
  # Drop the model path so that <> reads only the remaining inputs.
  shift @ARGV;
  my $forms = Forms->new();
  my $lemmas = TaggedLemmas->new();
  my $tokens = TokenRanges->new();
  my $tokenizer = $tagger->newTokenizer();
  $tokenizer or die "No tokenizer is defined for the supplied model!";
  for (my $not_eof = 1; $not_eof; ) {
    my $text = '';
    # Read block
    while (1) {
      my $line = <>;
      last unless ($not_eof = defined $line);
      $text .= $line;
      # Test for an empty line only after removing its terminator;
      # the unmodified line has already been appended to the block.
      chomp $line;
      last unless length $line;
    }
    # Tag
    # Hand the block to the tokenizer before asking it for sentences.
    $tokenizer->setText($text);
    # $t is the offset in $text just past the last token already printed.
    my $t = 0;
    while ($tokenizer->nextSentence($forms, $tokens)) {
      $tagger->tag($forms, $lemmas);
      for (my ($i, $size) = (0, $lemmas->size()); $i < $size; $i++) {
        my $lemma = $lemmas->get($i);
        my $token = $tokens->get($i);
        my ($token_start, $token_length) = ($token->{start}, $token->{length});
        # Six arguments, one for each %s of the format.
        printf "%s%s<token lemma=\"%s\" tag=\"%s\">%s</token>%s",
          encode_entities(substr $text, $t, $token_start - $t),
          $i == 0 ? "<sentence>" : "",
          encode_entities($lemma->{lemma}),
          encode_entities($lemma->{tag}),
          encode_entities(substr $text, $token_start, $token_length),
          $i + 1 == $size ? "</sentence>" : "";
        $t = $token_start + $token_length;
      }
    }
    # Emit whatever follows the last token of the block.
    print encode_entities(substr $text, $t);
  }


Milan Straka <>

Jana Straková <>


Copyright 2014 by Institute of Formal and Applied Linguistics, Faculty of Mathematics and Physics, Charles University in Prague, Czech Republic.

MorphoDiTa is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

MorphoDiTa is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License along with MorphoDiTa. If not, see <http://www.gnu.org/licenses/>.