NAME

Ufal::UDPipe - bindings to the UDPipe library http://ufal.mff.cuni.cz/udpipe.

SYNOPSIS

  use Ufal::UDPipe;
  use open qw(:std :utf8);

  my $model_file = '...';
  my $model = Ufal::UDPipe::Model::load($model_file);
  $model or die "Cannot load model from file '$model_file'\n";

  my $tokenizer = $model->newTokenizer($Ufal::UDPipe::Model::DEFAULT);
  my $conllu_output = Ufal::UDPipe::OutputFormat::newOutputFormat("conllu");
  my $sentence = Ufal::UDPipe::Sentence->new();

  $tokenizer->setText(join('', <>));
  while ($tokenizer->nextSentence($sentence)) {
    $model->tag($sentence, $Ufal::UDPipe::Model::DEFAULT);
    $model->parse($sentence, $Ufal::UDPipe::Model::DEFAULT);

    my $output = $conllu_output->writeSentence($sentence);
    print $output;
  }

REQUIREMENTS

To compile the module, a C++11 compiler is needed: either g++ 4.7 or newer, clang 3.2 or newer, or Visual C++ 2015.

DESCRIPTION

Ufal::UDPipe is a Perl binding to the UDPipe library http://ufal.mff.cuni.cz/udpipe.

The binding is a straightforward conversion of the C++ bindings API. Vectors do not have a native Perl interface, see Ufal::UDPipe::Words source for reference. Static methods and enumerations are available only through the module, not through object instances.

Wrapped C++ API

The C++ API being wrapped follows. For an API reference of the original C++ API, see http://ufal.mff.cuni.cz/udpipe/api-reference.

  Helper Structures
  -----------------
  
    typedef vector<int> Children;
  
    typedef vector<string> Comments;
  
    class ProcessingError {
    public:
      bool occurred();
      string message;
    };
  
    class Token {
     public:
      string form;
      string misc;
  
      Token(const string& form = string(), const string& misc = string());
  
      // CoNLL-U defined SpaceAfter=No feature
      bool getSpaceAfter() const;
      void setSpaceAfter(bool space_after);
  
      // UDPipe-specific all-spaces-preserving SpacesBefore and SpacesAfter features
      string getSpacesBefore() const;
      void setSpacesBefore(const string& spaces_before);
      string getSpacesAfter() const;
      void setSpacesAfter(const string& spaces_after);
      string getSpacesInToken() const;
      void setSpacesInToken(const string& spaces_in_token);
  
      // UDPipe-specific TokenRange feature
      bool getTokenRange() const;
      size_t getTokenRangeStart() const;
      size_t getTokenRangeEnd() const;
      void setTokenRange(size_t start, size_t end);
    };
  
    class Word : public Token {
     public:
      // form and misc are inherited from token
      int id;         // 0 is root, >0 is sentence word, <0 is undefined
      string lemma;   // lemma
      string upostag; // universal part-of-speech tag
      string xpostag; // language-specific part-of-speech tag
      string feats;   // list of morphological features
      int head;       // head, 0 is root, <0 is undefined
      string deprel;  // dependency relation to the head
      string deps;    // secondary dependencies
  
      Children children;
  
      Word(int id = -1, const string& form = string());
    };
    typedef vector<Word> Words;
  
    class MultiwordToken : public Token {
     public:
      // form and misc are inherited from token
      int idFirst, idLast;
  
      MultiwordToken(int id_first = -1, int id_last = -1, const string& form = string(), const string& misc = string());
    };
    typedef vector<MultiwordToken> MultiwordTokens;
  
    class EmptyNode {
     public:
      int id;          // 0 is root, >0 is sentence word, <0 is undefined
      int index;       // index for the current id, should be numbered from 1, 0=undefined
      string form;     // form
      string lemma;    // lemma
      string upostag;  // universal part-of-speech tag
      string xpostag;  // language-specific part-of-speech tag
      string feats;    // list of morphological features
      string deps;     // secondary dependencies
      string misc;     // miscellaneous information
  
      EmptyNode(int id = -1, int index = 0) : id(id), index(index) {}
    };
    typedef vector<EmptyNode> EmptyNodes;
  
    class Sentence {
     public:
      Sentence();
  
      Words words;
      MultiwordTokens multiwordTokens;
      EmptyNodes emptyNodes;
      Comments comments;
      static const string rootForm;
  
      // Basic sentence modifications
      bool empty();
      void clear();
      virtual Word& addWord(const char* form);
      void setHead(int id, int head, const string& deprel);
      void unlinkAllWords();
  
      // CoNLL-U defined comments
      bool getNewDoc() const;
      string getNewDocId() const;
      void setNewDoc(bool new_doc, const string& id = string());
      bool getNewPar() const;
      string getNewParId() const;
      void setNewPar(bool new_par, const string& id = string());
  
      string getSentId() const;
      void setSentId(const string& id);
      string getText() const;
      void setText(const string& text);
    };
    typedef vector<Sentence> Sentences;
  
  
  Main Classes
  ------------
  
    class InputFormat {
     public:
      virtual void resetDocument(const string& id = string());
      virtual void setText(const char* text);
      virtual bool nextSentence(Sentence& s, ProcessingError* error = nullptr);
  
      static InputFormat* newInputFormat(const string& name);
      static InputFormat* newConlluInputFormat(const string& id = string());
      static InputFormat* newGenericTokenizerInputFormat(const string& id = string());
      static InputFormat* newHorizontalInputFormat(const string& id = string());
      static InputFormat* newVerticalInputFormat(const string& id = string());
  
      static InputFormat* newPresegmentedTokenizer(InputFormat tokenizer);
  
      static const string CONLLU_V1;
      static const string CONLLU_V2;
      static const string GENERIC_TOKENIZER_NORMALIZED_SPACES;
      static const string GENERIC_TOKENIZER_PRESEGMENTED;
      static const string GENERIC_TOKENIZER_RANGES;
    };
  
    class OutputFormat {
     public:
      virtual string writeSentence(const Sentence& s);
      virtual string finishDocument();
  
      static OutputFormat* newOutputFormat(const string& name);
      static OutputFormat* newConlluOutputFormat(const string& options = string());
      static OutputFormat* newEpeOutputFormat(const string& options = string());
      static OutputFormat* newMatxinOutputFormat(const string& options = string());
      static OutputFormat* newHorizontalOutputFormat(const string& options = string());
      static OutputFormat* newPlaintextOutputFormat(const string& options = string());
      static OutputFormat* newVerticalOutputFormat(const string& options = string());
  
      static const string CONLLU_V1;
      static const string CONLLU_V2;
      static const string HORIZONTAL_PARAGRAPHS;
      static const string PLAINTEXT_NORMALIZED_SPACES;
      static const string VERTICAL_PARAGRAPHS;
    };
  
    class Model {
     public:
      static Model* load(const char* fname);
  
      virtual InputFormat* newTokenizer(const string& options) const;
      virtual bool tag(Sentence& s, const string& options, ProcessingError* error = nullptr) const;
      virtual bool parse(Sentence& s, const string& options, ProcessingError* error = nullptr) const;
  
      static const string DEFAULT;
      static const string TOKENIZER_PRESEGMENTED;
    };
  
    class Pipeline {
     public:
      Pipeline(const Model* m, const string& input, const string& tagger, const string& parser, const string& output);
  
      void setModel(const Model* m);
      void setInput(const string& input);
      void setTagger(const string& tagger);
      void setParser(const string& parser);
      void setOutput(const string& output);
  
      void setImmediate(bool immediate);
      void setDocumentId(const string& document_id);
  
      string process(const string& data, ProcessingError* error = nullptr) const;
  
      static const string DEFAULT;
      static const string NONE;
    };
  
    class Trainer {
     public:
  
      static string train(const string& method, const Sentences& train, const Sentences& heldout,
                          const string& tokenizer, const string& tagger, const string& parser,
                          ProcessingError* error = nullptr);
  
      static const string DEFAULT;
      static const string NONE;
    };
  
    class Evaluator {
     public:
      Evaluator(const Model* m, const string& tokenizer, const string& tagger, const string& parser);
  
      void setModel(const Model* m);
      void setTokenizer(const string& tokenizer);
      void setTagger(const string& tagger);
      void setParser(const string& parser);
  
      string evaluate(const string& data, ProcessingError* error = nullptr) const;
  
      static const string DEFAULT;
      static const string NONE;
    };
  
    class Version {
     public:
      unsigned major;
      unsigned minor;
      unsigned patch;
      string prerelease;
  
      // Returns current version.
      static Version current();
    };

Examples

run_udpipe

Simple pipeline loading data (tokenizing on request), tagging, parsing and writing to the specified output format.

  use warnings;
  use strict;
  use open qw(:std :utf8);
  
  use Ufal::UDPipe;
  
  @ARGV >= 3 or die "Usage: $0 input_format(tokenize|conllu|horizontal|vertical) output_format(conllu) model_file\n";
  my $input = shift @ARGV;
  my $output = shift @ARGV;
  my $model_file = shift @ARGV;
  
  print STDERR "Loading model: ";
  my $model = Ufal::UDPipe::Model::load($model_file);
  $model or die "Cannot load model from file '$model_file'\n";
  print STDERR "done\n";
  
  my $pipeline = Ufal::UDPipe::Pipeline->new($model, $input, $Ufal::UDPipe::Pipeline::DEFAULT, $Ufal::UDPipe::Pipeline::DEFAULT, $output);
  my $error = Ufal::UDPipe::ProcessingError->new();
  
  # Read whole input
  my $text = '';
  for (my $line; $line = <>; ) {
    $text .= $line;
  }
  
  # Process data
  my $processed = $pipeline->process($text, $error);
  $error->occurred() and die "An error occurred in run_udpipe: " . $error->{message};
  print $processed;

AUTHORS

Milan Straka <straka@ufal.mff.cuni.cz>

COPYRIGHT AND LICENCE

This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

To install Ufal::UDPipe, copy and paste the appropriate command into your terminal.

cpanm

cpanm Ufal::UDPipe

CPAN shell

perl -MCPAN -e shell
install Ufal::UDPipe

For more information on module installation, please visit the detailed CPAN module installation guide.

	Global
`s`	Focus search bar
`?`	Bring up this help dialog

	GitHub
`g` `p`	Go to pull requests
`g` `i`	Go to GitHub issues (only if GitHub is the preferred repository)

	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse

	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)