ok(
$CWB::CL::Registry
,
"check registry directory"
);
diag(
""
);
diag(
"- registry directory: $CWB::CL::Registry"
);
$CWB::CL::Registry
=
"data/registry"
;
our
$C
= new CWB::CL::Corpus
"VSS"
;
isa_ok(
$C
,
"CWB::CL::Corpus"
,
"corpus object for VSS corpus"
)
or BAIL_OUT(
"failed to access test corpus VSS"
);
our
$Word
=
$C
->attribute(
"word"
,
"p"
);
isa_ok(
$Word
,
"CWB::CL::PosAttrib"
,
"attribute object for VSS.word"
)
or BAIL_OUT(
"failed to access word attribute of corpus VSS"
);
our
$S
=
$C
->attribute(
"s"
,
"s"
);
isa_ok(
$S
,
"CWB::CL::StrucAttrib"
,
"attribute object for VSS.s"
)
or BAIL_OUT(
"failed to access <s> attribute of corpus VSS"
);
our
$StoryTitle
=
$C
->attribute(
"story_title"
,
"s"
);
isa_ok(
$S
,
"CWB::CL::StrucAttrib"
,
"attribute object for VSS.story_title"
)
or BAIL_OUT(
"failed to access <story title=''> annotations"
);
ok(
$StoryTitle
->struc_values,
"attribute <story title=''> contains annotations"
);
our
$Chapter
=
$C
->attribute(
"chapter"
,
"s"
)
or BAIL_OUT(
"failed to access <chapter> attribute of corpus VSS"
);
our
$Align
=
$C
->attribute(
"vss"
,
"a"
)
or BAIL_OUT(
"failed to access self-alignment (VSS.vss attribute)"
);
our
$corpus_size
=
$Word
->max_cpos;
is(
$corpus_size
, 8043,
"corpus size (tokens)"
);
our
$lex_size
=
$Word
->max_id;
is(
$lex_size
, 2111,
"lexicon size (types)"
);
diag(
"- VSS corpus has $corpus_size word form tokens and $lex_size types"
);
our
$id
=
$Word
->str2id(
"elephant"
);
is(
$id
, 1977,
"look up 'elephant' in word form lexicon"
);
ok((not
defined
$Word
->str2id(
"corpus"
)),
"unknown word type returns undef"
);
is(
$Word
->id2str(
$id
),
"elephant"
,
"look up word form 'elephant' from ID"
);
our
$f
=
$Word
->id2freq(
$id
);
is(
$f
, 14,
"frequency of 'elephant'"
);
diag(
"- 'elephant' occurs $f times in VSS corpus"
);
our
$regex
=
"[a-z]+(ally|ily)"
;
our
$perl_regex
=
qr/^(?:${regex})$/
i;
our
@id
=
$Word
->regex2id(
$regex
,
"c"
);
our
$n_types
=
@id
;
is(
$n_types
, 24,
"match regular expressions against lexicon"
);
our
@words
=
$Word
->id2str(
@id
);
ok(
@words
==
@id
,
"map matching IDs to words"
);
our
@errors
=
grep
{not /
$perl_regex
/}
@words
;
ok(
@errors
== 0,
"regular expression matches are correct"
);
diag(
"- these words should not have matched: @errors"
)
if
@errors
;
our
$total_f
=
$Word
->idlist2freq(
@id
);
our
$sum_f
= 0;
foreach
my
$f
(
$Word
->id2freq(
@id
)) {
$sum_f
+=
$f
};
is(
$total_f
,
$sum_f
,
"total frequency counts are consistent"
);
is(
$total_f
, 37,
"total frequency of matching words"
);
diag(
"- regexp \"$regex\"\%c matches $n_types types, $total_f tokens"
);
our
@cpos
=
$Word
->idlist2cpos(
@id
);
is(
@cpos
+0,
$total_f
,
"index lookup returns correct number of corpus positions"
);
@errors
=
grep
{
$Word
->cpos2str(
$_
) !~ /
$perl_regex
/}
@cpos
;
ok(
@errors
== 0,
"index entries are correct"
);
diag(
"- these corpus positions should not have been in the index: @errors"
)
if
@errors
;
our
@first5
=
$Word
->cpos2str(
@cpos
[0 .. 4]);
diag(
"- index entries: "
.
join
(
", "
,
@first5
,
"..."
));
our
$n_sentences
=
$S
->max_struc;
our
$n_stories
=
$StoryTitle
->max_struc;
is(
$n_sentences
, 459,
"number of sentences"
);
is(
$n_stories
, 6,
"number of stories"
);
diag(
"- VSS contains $n_stories stories with a total of $n_sentences sentences"
);
our
$sent_num
=
$S
->cpos2struc(7300);
is(
$sent_num
, 430,
"sentence number at corpus position 7300"
);
our
(
$start
,
$end
) =
$S
->struc2cpos(
$sent_num
);
our
@sentence
=
$Word
->cpos2str(
$start
..
$end
);
is(
"@sentence"
,
"It was an elephant ."
,
"full sentence containing corpus position 7300"
);
our
$title
=
$StoryTitle
->cpos2str(7300);
is(
$title
,
"The Garden"
,
"value of <story_title> region at cpos 7300"
);
diag(
"- found sentence '@sentence' in story '$title'"
);
our
@cpos_pairs
=
$S
->struc2cpos(1, 3, 4, 0);
is_deeply(\
@cpos_pairs
, [16, 39, 59, 69, 70, 73, 0, 15],
"vectorised struc2cpos() method"
);
@cpos_pairs
=
$S
->cpos2struc2cpos(20, 60, -10, 70, 15);
is_deeply(\
@cpos_pairs
, [16, 39, 59, 69,
undef
,
undef
, 70, 73, 0, 15],
"vectorised cpos2struc2cpos() method"
);
our
$f_i
=
$CWB::CL::Boundary
{
"inside"
};
our
$f_o
=
$CWB::CL::Boundary
{
"outside"
};
our
$f_l
=
$CWB::CL::Boundary
{
"left"
} |
$CWB::CL::Boundary
{
"inside"
};
our
$f_r
=
$CWB::CL::Boundary
{
"right"
} |
$CWB::CL::Boundary
{
"inside"
};
our
$f_lr
=
$CWB::CL::Boundary
{
"leftright"
} |
$CWB::CL::Boundary
{
"inside"
};
our
@flags
=
$Chapter
->cpos2boundary(2000, 1117, 1118, 5000, 7009, 6000);
is_deeply(\
@flags
, [
$f_i
,
$f_r
,
$f_l
,
$f_o
,
$f_r
,
$f_o
],
"test region boundaries (cpos2boundary)"
);
@flags
=
$S
->cpos2boundary(2701, 4000, 4380, 5000, 6250);
is_deeply(\
@flags
, [
$f_l
,
$f_r
,
$f_lr
,
$f_i
,
$f_lr
],
"test region boundaries of single-token region"
);
@flags
=
$Chapter
->cpos2is_boundary(
"inside"
, 2000, 1117, 1118, 5000, 7009, 6000);
is_deeply(\
@flags
, [1, 1, 1, 0, 1, 0],
"test whether inside region (cpos2is_boundary)"
);
@flags
=
$Chapter
->cpos2is_boundary(
"outside"
, 2000, 1117, 1118, 5000, 7009, 6000);
is_deeply(\
@flags
, [0, 0, 0, 1, 0, 1],
"test whether outside region (cpos2is_boundary)"
);
@flags
=
$Chapter
->cpos2is_boundary(
"right"
, 2000, 1117, 1118, 5000, 7009, 6000);
is_deeply(\
@flags
, [0, 1, 0, 0, 1, 0],
"test for right boundary (cpos2is_boundary)"
);
@flags
=
$S
->cpos2is_boundary(
"leftright"
, 2701, 4000, 4380, 5000, 6250);
is_deeply(\
@flags
, [0, 0, 1, 0, 1],
"test for double boundary (cpos2is_boundary)"
);
our
@align
=
$Align
->cpos2alg(2725, 4242, 7777);
is_deeply(\
@align
, [90,
undef
, 299],
"find alignment beads (cpos2alg)"
);
our
@beads
=
$Align
->alg2cpos(90, 299);
is_deeply(\
@beads
, [2723, 2727, 2728, 2732, 7777, 7792, 7760, 7776],
"expand alignment beads (alg2cpos)"
);
@beads
=
$Align
->cpos2alg2cpos(2725, 4242, 7777);
is_deeply(\
@beads
, [2723, 2727, 2728, 2732,
undef
,
undef
,
undef
,
undef
, 7777, 7792, 7760, 7776],
"expand alignment beads directly (cpos2alg2cpos)"
);