our
$VERSION
=
'0.82'
;
sub
termtagging {
my
(
$corpus_filename
,
$term_list_filename
,
$output_filename
,
$lemmatised_corpus_filename
,
$caseSensitive
) =
@_
;
my
@term_list
;
my
%term_listIdx
;
my
@regex_term_list
;
my
@regex_lemmawordterm_list
;
my
%corpus
;
my
%lc_corpus
;
my
%lemmatised_corpus
;
my
%lc_lemmatised_corpus
;
my
%corpus_index
;
my
%lemmatised_corpus_index
;
my
%idtrm_select
;
my
%idlemtrm_select
;
if
(!
defined
$caseSensitive
) {
$caseSensitive
= -1;
}
&load_TermList
(
$term_list_filename
,\
@term_list
, \
%term_listIdx
);
&get_Regex_TermList
(\
@term_list
, \
@regex_term_list
, \
@regex_lemmawordterm_list
);
&load_Corpus
(
$corpus_filename
, \
%corpus
, \
%lc_corpus
);
if
(
defined
$lemmatised_corpus_filename
) {
&load_Corpus
(
$lemmatised_corpus_filename
, \
%lemmatised_corpus
, \
%lc_lemmatised_corpus
);
}
&corpus_Indexing
(\
%lc_corpus
, \
%corpus
, \
%corpus_index
,
$caseSensitive
);
if
(
defined
$lemmatised_corpus_filename
) {
&corpus_Indexing
(\
%lc_lemmatised_corpus
, \
%lemmatised_corpus
, \
%lemmatised_corpus_index
,
$caseSensitive
);
}
&term_Selection
(\
%corpus_index
, \
@term_list
, \
%idtrm_select
,
$caseSensitive
);
if
(
defined
$lemmatised_corpus_filename
) {
&term_Selection
(\
%lemmatised_corpus_index
, \
@term_list
, \
%idlemtrm_select
,
$caseSensitive
);
}
&term_tagging_offset
(\
@term_list
, \
@regex_term_list
, \
%idtrm_select
, \
%corpus
,
$output_filename
,
$caseSensitive
);
if
(
defined
$lemmatised_corpus_filename
) {
&term_tagging_offset
(\
@term_list
, \
@regex_lemmawordterm_list
, \
%idlemtrm_select
, \
%lemmatised_corpus
,
$output_filename
,
$caseSensitive
);
}
return
(0);
}
sub
termtagging_brat {
my
(
$corpus_filename
,
$term_list_filename
,
$output_filename
,
$lemmatised_corpus_filename
,
$caseSensitive
) =
@_
;
my
@term_list
;
my
%term_listIdx
;
my
@regex_term_list
;
my
@regex_lemmawordterm_list
;
my
%corpus
;
my
%lc_corpus
;
my
%lemmatised_corpus
;
my
%lc_lemmatised_corpus
;
my
%corpus_index
;
my
%lemmatised_corpus_index
;
my
%idtrm_select
;
my
%idlemtrm_select
;
if
(!
defined
$caseSensitive
) {
$caseSensitive
= -1;
}
&load_TermList
(
$term_list_filename
,\
@term_list
, \
%term_listIdx
);
&get_Regex_TermList
(\
@term_list
, \
@regex_term_list
, \
@regex_lemmawordterm_list
);
&load_Corpus
(
$corpus_filename
, \
%corpus
, \
%lc_corpus
);
if
(
defined
$lemmatised_corpus_filename
) {
&load_Corpus
(
$lemmatised_corpus_filename
, \
%lemmatised_corpus
, \
%lc_lemmatised_corpus
);
}
&corpus_Indexing
(\
%lc_corpus
, \
%corpus
, \
%corpus_index
,
$caseSensitive
);
if
(
defined
$lemmatised_corpus_filename
) {
&corpus_Indexing
(\
%lc_lemmatised_corpus
, \
%lemmatised_corpus
, \
%lemmatised_corpus_index
,
$caseSensitive
);
}
&term_Selection
(\
%corpus_index
, \
@term_list
, \
%idtrm_select
,
$caseSensitive
);
if
(
defined
$lemmatised_corpus_filename
) {
&term_Selection
(\
%lemmatised_corpus_index
, \
@term_list
, \
%idlemtrm_select
,
$caseSensitive
);
}
&term_tagging_offset_brat
(\
@term_list
, \
@regex_term_list
, \
%idtrm_select
, \
%corpus
,
$output_filename
,
$caseSensitive
);
if
(
defined
$lemmatised_corpus_filename
) {
&term_tagging_offset_brat
(\
@term_list
, \
@regex_lemmawordterm_list
, \
%idlemtrm_select
, \
%lemmatised_corpus
,
$output_filename
,
$caseSensitive
);
}
return
(0);
}
sub
load_TermList {
my
(
$termlist_name
,
$ref_termlist
,
$ref_termlistIdx
) =
@_
;
my
$line
;
my
$line1
;
my
$term
;
my
$suppl_info
;
my
@tab
;
warn
"Loading the terminological resource\n"
;
open
DESC_TERMLIST,
$termlist_name
or
die
"$0: $termlist_name: No such file\n"
;
binmode
(DESC_TERMLIST,
":utf8"
);
while
(
$line1
= <DESC_TERMLIST>) {
chomp
$line1
;
utf8::decode(
$line1
);
$line
=
$line1
;
if
((
$line
!~ /^\s*\
my
@tab
=
split
/ ?[\|:] ?/,
$line
;
if
(
$tab
[0] !~ /^\s*$/) {
$tab
[0] =~ s/ +/ /go;
$tab
[0] =~ s/ $//go;
$tab
[0] =~ s/^ //go;
if
(!
exists
$ref_termlistIdx
->{
$tab
[0]}) {
push
@$ref_termlist
, \
@tab
;
$ref_termlistIdx
->{
$tab
[0]} =
scalar
(
@$ref_termlist
) -1;
}
else
{
$ref_termlist
->[
$ref_termlistIdx
->{
$tab
[0]}]->[2] .=
";"
.
$tab
[2];
}
}
}
}
close
DESC_TERMLIST;
print
STDERR
"\n\tTerm list size : "
.
scalar
(
@$ref_termlist
) .
"\n\n"
;
}
sub
get_Regex_TermList {
my
(
$ref_termlist
,
$ref_regex_termlist
,
$ref_regex_lemmaWordtermlist
) =
@_
;
my
$term_counter
;
warn
"Generating the regular expression from the terms\n"
;
for
(
$term_counter
= 0;
$term_counter
<
scalar
@$ref_termlist
;
$term_counter
++) {
$ref_regex_termlist
->[
$term_counter
] =
$ref_termlist
->[
$term_counter
]->[0];
if
(
defined
$ref_regex_lemmaWordtermlist
) {
if
(
defined
$ref_termlist
->[
$term_counter
]->[3]) {
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =
$ref_termlist
->[
$term_counter
]->[3];
}
else
{
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =
$ref_termlist
->[
$term_counter
]->[0];
}
}
$ref_regex_termlist
->[
$term_counter
] =~ s/([()\',\[\]\?\!:;\/.\+\-\*\
$ref_regex_termlist
->[
$term_counter
] =~ s/ /[\- \n]/og;
$ref_regex_termlist
->[
$term_counter
] =~ s/A/[\x{00C0}-\x{00C5}\x{00E0}-\x{00E5}A]/og;
$ref_regex_termlist
->[
$term_counter
] =~ s/AE/(\x{00C6}|AE)/og;
$ref_regex_termlist
->[
$term_counter
] =~ s/C/[\x{00C7}|C]/og;
$ref_regex_termlist
->[
$term_counter
] =~ s/E/[\x{00C8}-\x{00CB}E]/og;
$ref_regex_termlist
->[
$term_counter
] =~ s/I/[\x{00CC}-\x{00CF}I]/og;
$ref_regex_termlist
->[
$term_counter
] =~ s/D/[\x{00D0}D]/og;
$ref_regex_termlist
->[
$term_counter
] =~ s/N/[\x{00D1}N]/og;
$ref_regex_termlist
->[
$term_counter
] =~ s/O/[\x{00D2}-\x{00D8}O]/og;
$ref_regex_termlist
->[
$term_counter
] =~ s/U/[\x{00D9}-\x{00DC}U]/og;
$ref_regex_termlist
->[
$term_counter
] =~ s/Y/[\x{00DD}Y]/og;
if
(
defined
$ref_regex_lemmaWordtermlist
) {
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =~ s/([()\',\[\]\?\!:;\/.\+\-\*\
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =~ s/ /[\- \n]/og;
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =~ s/A/[\x{00C0}-\x{00C5}A]/og;
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =~ s/AE/(\x{00C6}|AE)/og;
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =~ s/C/[\x{00C7}C]/og;
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =~ s/E/[\x{00C8}-\x{00CB}E]/og;
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =~ s/I/[\x{00CC}-\x{00CF}I]/og;
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =~ s/D/[\x{00D0}D]/og;
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =~ s/N/[\x{00D1}N]/og;
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =~ s/O/[\x{00D2}-\x{00D8}O]/og;
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =~ s/U/[\x{00D9}-\x{00DC}U]/og;
$ref_regex_lemmaWordtermlist
->[
$term_counter
] =~ s/Y/[\x{00DD}Y]/og;
}
}
print
STDERR
"\n\tTerm/regex list size : "
.
scalar
(
@$ref_regex_termlist
);
if
(
defined
$ref_regex_lemmaWordtermlist
) {
print
STDERR
" / "
.
scalar
(
@$ref_regex_lemmaWordtermlist
);
}
print
STDERR
"\n\n"
;
}
sub
load_Corpus {
my
(
$corpus_filename
,
$ref_tabh_Corpus
,
$ref_tabh_Corpus_lc
) =
@_
;
my
$line
;
my
$sent_id
= 1;
my
$offset
= 0;
my
$lineLen
= 0;
warn
"Loading the corpus\n"
;
open
CORPUS,
$corpus_filename
or
die
"File $corpus_filename not found\n"
;
binmode
(CORPUS,
":utf8"
);
while
(
$line
=<CORPUS>){
$lineLen
=
length
(
$line
);
chomp
$line
;
$ref_tabh_Corpus
->{
$sent_id
}->{
'line'
} =
$line
;
$ref_tabh_Corpus
->{
$sent_id
}->{
'offset'
} =
$offset
;
$ref_tabh_Corpus_lc
->{
$sent_id
}->{
'line'
} =
lc
$line
;
$ref_tabh_Corpus_lc
->{
$sent_id
}->{
'offset'
} =
$offset
;
$sent_id
++;
$offset
+=
$lineLen
;
}
close
CORPUS;
print
STDERR
"\n\tCorpus size : "
.
scalar
(
keys
%$ref_tabh_Corpus
) .
"\n\n"
;
}
sub
corpus_Indexing {
my
(
$ref_corpus_lc
,
$ref_corpus
,
$ref_corpus_index
,
$caseSensitive
) =
@_
;
my
$word
;
my
@tab_words
;
my
@tab_words_lc
;
my
$sent_id
;
my
$i
;
warn
"Indexing the corpus\n"
;
foreach
$sent_id
(
keys
%$ref_corpus_lc
) {
@tab_words
=
split
/[ ()\
',\[\]\?\!:;\/\.\+\-\*\#\{\}\n]/, $ref_corpus->{$sent_id}->{'
line'};
@tab_words_lc
=
split
/[ ()\
',\[\]\?\!:;\/\.\+\-\*\#\{\}\n]/, $ref_corpus_lc->{$sent_id}->{'
line'};
for
(
$i
=0;
$i
<
scalar
(
@tab_words_lc
);
$i
++) {
if
((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$tab_words_lc
[
$i
]) <=
$caseSensitive
))) {
$word
=
$tab_words
[
$i
];
}
else
{
$word
=
$tab_words_lc
[
$i
];
}
if
(
$word
ne
""
) {
$word
=~ s/[\x{00C0}-\x{00C5}\x{00E0}-\x{00E5}]/A/og;
$word
=~ s/\x{00C6}/AE/og;
$word
=~ s/[\x{00C7}]/C/og;
$word
=~ s/[\x{00C8}-\x{00CB}]/E/og;
$word
=~ s/[\x{00CC}-\x{00CF}]/I/og;
$word
=~ s/[\x{00D0}]/D/og;
$word
=~ s/[\x{00D1}]/N/og;
$word
=~ s/[\x{00D2}-\x{00D8}]/O/og;
$word
=~ s/[\x{00D9}-\x{00DC}]/U/og;
$word
=~ s/[\x{00DD}]/Y/og;
if
(!
exists
$ref_corpus_index
->{
$word
}) {
my
@tabtmp
;
$ref_corpus_index
->{
$word
} = \
@tabtmp
;
}
push
@{
$ref_corpus_index
->{
$word
}},
$sent_id
;
}
}
}
print
STDERR
"\n\tSize of the first selected term list: "
.
scalar
(
keys
%$ref_corpus_index
) .
"\n\n"
;
}
sub
print_corpus_index {
my
(
$ref_corpus_index
) =
@_
;
my
$word
;
foreach
$word
(
sort
keys
%$ref_corpus_index
) {
print
STDERR
"$word\t"
;
print
STDERR
join
(
", "
, @{
$ref_corpus_index
->{
$word
}});
print
STDERR
"\n"
;
}
}
sub
_term_Selection2 {
my
(
$ref_corpus_index
,
$ref_termlist
,
$ref_tabh_idtrm_select
) =
@_
;
my
$counter
;
my
$term
;
my
@tab_termlex
;
my
$i
;
my
$word
;
my
$sent_id
;
my
$word_found
= 0;
warn
"Selecting the terms potentialy appearing in the corpus\n"
;
my
%tabh_numtrm_select
;
for
(
$counter
= 0;
$counter
<
scalar
@$ref_termlist
;
$counter
++) {
$term
=
lc
$ref_termlist
->[
$counter
]->[0];
@tab_termlex
=
split
/[ \-]+/,
$term
;
$word_found
= 0;
$i
=0;
do
{
$word
=
$tab_termlex
[
$i
];
if
((
$word
ne
""
) && ((
length
(
$word
) > 2) || (
scalar
(
@tab_termlex
)==1)) &&
((
exists
$ref_corpus_index
->{
$word
}))) {
$word_found
= 1;
if
(!
exists
$ref_tabh_idtrm_select
->{
$counter
}) {
my
%tabhtmp2
;
$ref_tabh_idtrm_select
->{
$counter
} = \
%tabhtmp2
;
}
foreach
$sent_id
(@{
$ref_corpus_index
->{
$word
}}) {
${
$ref_tabh_idtrm_select
->{
$counter
}}{
$sent_id
} = 1;
}
}
$i
++;
}
while
((!
$word_found
) && (
$i
<
scalar
@tab_termlex
));
}
warn
"\nEnd of selecting the terms potentialy appearing in the corpus\n"
;
}
sub
term_Selection {
my
(
$ref_corpus_index
,
$ref_termlist
,
$ref_tabh_idtrm_select
,
$caseSensitive
,
$termField
) =
@_
;
my
$counter
;
my
$term
;
my
@tab_termlex
;
my
$termCap
;
my
@tab_termlexCap
;
my
$i
;
my
$word
;
my
$sent_id
;
my
$word_found
= 0;
my
@recordedWords
;
if
(!
defined
$termField
) {
$termField
= 0;
}
warn
"Selecting the terms potentialy appearing in the corpus ($termField)\n"
;
my
%tabh_numtrm_select
;
for
(
$counter
= 0;
$counter
<
scalar
@$ref_termlist
;
$counter
++) {
if
(
defined
$ref_termlist
->[
$counter
]->[
$termField
]) {
if
((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField
]) <=
$caseSensitive
))) {
$term
=
$ref_termlist
->[
$counter
]->[
$termField
];
$termCap
=
$ref_termlist
->[
$counter
]->[
$termField
];
}
else
{
$term
=
lc
$ref_termlist
->[
$counter
]->[
$termField
];
$termCap
=
$ref_termlist
->[
$counter
]->[
$termField
];
}
}
else
{
$term
=
lc
$ref_termlist
->[
$counter
]->[0];
$termCap
=
$ref_termlist
->[
$counter
]->[0];
}
@tab_termlex
=
split
/[ ()\',\[\]\?\!:;\/\.\+\-\*\
@tab_termlexCap
=
split
/[ ()\',\[\]\?\!:;\/\.\+\-\*\
$word_found
= 0;
$i
=0;
@recordedWords
= ();
$word
=
$tab_termlex
[
$i
];
while
((
$i
<
scalar
(
@tab_termlex
)) && (
$i
<
scalar
(
@tab_termlexCap
)) &&
(((
$word
eq
""
) || (
exists
$ref_corpus_index
->{
$word
})) ||
(((
$caseSensitive
== 0) || (
length
(
$termCap
) >
$caseSensitive
)) &&
(
exists
$ref_corpus_index
->{
$tab_termlexCap
[
$i
]})))
) {
if
(
$word
ne
""
) {
push
@recordedWords
,
$word
;
}
$i
++;
$word
=
$tab_termlex
[
$i
];
}
if
(
$i
==
scalar
(
@tab_termlex
)) {
foreach
$word
(
@recordedWords
) {
if
(!
exists
$ref_tabh_idtrm_select
->{
$counter
}) {
my
%tabhtmp2
;
$ref_tabh_idtrm_select
->{
$counter
} = \
%tabhtmp2
;
}
foreach
$sent_id
(@{
$ref_corpus_index
->{
$word
}}) {
${
$ref_tabh_idtrm_select
->{
$counter
}}{
$sent_id
} = 1;
}
}
}
}
warn
"Size of the selected list: "
.
scalar
(
keys
%$ref_tabh_idtrm_select
) .
"\n"
;
warn
"\nEnd of selecting the terms potentialy appearing in the corpus\n"
;
}
sub
term_tagging_offset {
my
(
$ref_termlist
,
$ref_regex_termlist
,
$ref_tabh_idtrm_select
,
$ref_tabh_corpus
,
$offset_tagged_corpus_name
,
$caseSensitive
,
$termField
) =
@_
;
my
$counter
;
my
$term_regex
;
my
$sent_id
;
my
$line
;
my
$termField2
;
if
(!
defined
$termField
) {
$termField
= 0;
}
warn
"Term tagging\n"
;
open
TAGGEDCORPUS,
">>$offset_tagged_corpus_name"
or
die
"$0: $offset_tagged_corpus_name: No such file\n"
;
binmode
(TAGGEDCORPUS,
":utf8"
);
foreach
$counter
(
keys
%$ref_tabh_idtrm_select
) {
$term_regex
=
$ref_regex_termlist
->[
$counter
];
$termField2
= 0;
if
(
defined
$ref_termlist
->[
$counter
]->[
$termField
]) {
$termField2
=
$termField
;
}
foreach
$sent_id
(
keys
%{
$ref_tabh_idtrm_select
->{
$counter
}}){
$line
=
$ref_tabh_corpus
->{
$sent_id
}->{
'line'
};
print
STDERR
"."
;
if
((((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) <=
$caseSensitive
))) &&
(
$line
=~ /[,.?!:;\/ \n\-\/\*
'\#\{\}\(\)\[\]\+]($term_regex)[,.?!:;\/ \n\-\/\*'
\#\(\)\[\]\{\}\+]/)) ||
(((!
defined
$caseSensitive
) || (
$caseSensitive
< 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) >
$caseSensitive
)) &&
(
$line
=~ /[,.?!:;\/ \n\-\/\*
'\#\{\}\(\)\[\]\+]($term_regex)[,.?!:;\/ \n\-\/\*'
\#\(\)\[\]\{\}\+]/i))) {
printMatchingTerm(\
*TAGGEDCORPUS
,
$ref_termlist
->[
$counter
],
$sent_id
);
}
if
((((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) <=
$caseSensitive
))) &&
(
$line
=~ /^(
$term_regex
)[,.?!:;\/ \n\-\/\*'\
(((!
defined
$caseSensitive
) || (
$caseSensitive
< 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) >
$caseSensitive
)) &&
(
$line
=~ /^(
$term_regex
)[,.?!:;\/ \n\-\/\*'\
printMatchingTerm(\
*TAGGEDCORPUS
,
$ref_termlist
->[
$counter
],
$sent_id
);
}
if
((((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) <=
$caseSensitive
))) &&
(
$line
=~ /[,.?!:;\/ \n\-\/\*'\
(((!
defined
$caseSensitive
) || (
$caseSensitive
< 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) >
$caseSensitive
)) &&
(
$line
=~ /[,.?!:;\/ \n\-\/\*'\
printMatchingTerm(\
*TAGGEDCORPUS
,
$ref_termlist
->[
$counter
],
$sent_id
);
}
if
((((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) <=
$caseSensitive
))) &&
(
$line
=~ /^(
$term_regex
)$/i)) ||
(((!
defined
$caseSensitive
) || (
$caseSensitive
< 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) >
$caseSensitive
)) &&
(
$line
=~ /^(
$term_regex
)$/i))) {
printMatchingTerm(\
*TAGGEDCORPUS
,
$ref_termlist
->[
$counter
],
$sent_id
);
}
}
print
STDERR
"\n"
;
}
close
TAGGEDCORPUS;
warn
"\nEnd of term tagging\n"
;
}
sub
printMatchingTerm() {
my
(
$descriptor
,
$ref_matching_term
,
$sent_id
) =
@_
;
print
$descriptor
"$sent_id\t"
;
print
$descriptor
join
(
"\t"
,
@$ref_matching_term
);
print
$descriptor
"\n"
;
}
sub
term_tagging_offset_tab {
my
(
$ref_termlist
,
$ref_regex_termlist
,
$ref_tabh_idtrm_select
,
$ref_tabh_corpus
,
$ref_tab_results
,
$caseSensitive
,
$termField
) =
@_
;
my
$counter
;
my
$term_regex
;
my
$sent_id
;
my
$line
;
my
$i
;
my
$size_termselect
=
scalar
(
keys
%$ref_tabh_idtrm_select
);
my
$termField2
;
$i
= 0;
if
(!
defined
$termField
) {
$termField
= 0;
}
foreach
$counter
(
keys
%$ref_tabh_idtrm_select
) {
$term_regex
=
$ref_regex_termlist
->[
$counter
];
$termField2
= 0;
if
(
defined
$ref_termlist
->[
$counter
]->[
$termField
]) {
$termField2
=
$termField
;
}
foreach
$sent_id
(
keys
%{
$ref_tabh_idtrm_select
->{
$counter
}}){
$line
=
$ref_tabh_corpus
->{
$sent_id
}->{
'line'
};
if
((((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) <=
$caseSensitive
))) &&
(
$line
=~ /[,.?!:;\/ \n\-\/\*
'\#\{\}\(\)\[\]\+](?<term>$term_regex)[,.?!:;\/ \n\-\/\*'
\#\(\)\[\]\{\}\+]/s)) ||
(((!
defined
$caseSensitive
) || (
$caseSensitive
< 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) >
$caseSensitive
)) &&
(
$line
=~ /[,.?!:;\/ \n\-\/\*
'\#\{\}\(\)\[\]\+](?<term>$term_regex)[,.?!:;\/ \n\-\/\*'
\#\(\)\[\]\{\}\+]/is))) {
printMatchingTerm_tab(
$ref_termlist
->[
$counter
], $+{term},
$sent_id
,
$ref_tab_results
);
}
if
((((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) <=
$caseSensitive
))) &&
(
$line
=~ /^(?<term>
$term_regex
)[,.?!:;\/ \n\-\/\*'\
(((!
defined
$caseSensitive
) || (
$caseSensitive
< 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) >
$caseSensitive
)) &&
(
$line
=~ /^(?<term>
$term_regex
)[,.?!:;\/ \n\-\/\*'\
printMatchingTerm_tab(
$ref_termlist
->[
$counter
], $+{term},
$sent_id
,
$ref_tab_results
);
}
if
((((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) <=
$caseSensitive
))) &&
(
$line
=~ /[,.?!:;\/ \n\-\/\*'\
(((!
defined
$caseSensitive
) || (
$caseSensitive
< 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) >
$caseSensitive
)) &&
(
$line
=~ /[,.?!:;\/ \n\-\/\*'\
printMatchingTerm_tab(
$ref_termlist
->[
$counter
], $+{term},
$sent_id
,
$ref_tab_results
);
}
if
((((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) <=
$caseSensitive
))) &&
(
$line
=~ /^(?<term>
$term_regex
)$/s)) ||
(((!
defined
$caseSensitive
) || (
$caseSensitive
< 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) >
$caseSensitive
)) &&
(
$line
=~ /^(?<term>
$term_regex
)$/is))) {
printMatchingTerm_tab(
$ref_termlist
->[
$counter
], $+{term},
$sent_id
,
$ref_tab_results
);
}
}
$i
++;
}
print
STDERR
"\n"
;
warn
"\nEnd of term tagging\n"
;
}
sub
term_tagging_offset_brat {
my
(
$ref_termlist
,
$ref_regex_termlist
,
$ref_tabh_idtrm_select
,
$ref_tabh_corpus
,
$offset_tagged_corpus_name
,
$caseSensitive
,
$termField
) =
@_
;
my
$counter
;
my
$term_regex
;
my
$sent_id
;
my
$line
;
my
$i
;
my
$size_termselect
=
scalar
(
keys
%$ref_tabh_idtrm_select
);
my
$termField2
;
my
$termId
= 1;
my
$offset
;
my
$currOffset
;
$i
= 0;
warn
"Term tagging ($offset_tagged_corpus_name)\n"
;
open
TAGGEDCORPUS,
">$offset_tagged_corpus_name"
or
die
"$0: $offset_tagged_corpus_name: No such file\n"
;
binmode
(TAGGEDCORPUS,
":utf8"
);
if
(!
defined
$termField
) {
$termField
= 0;
}
foreach
$counter
(
keys
%$ref_tabh_idtrm_select
) {
$term_regex
=
$ref_regex_termlist
->[
$counter
];
$termField2
= 0;
if
(
defined
$ref_termlist
->[
$counter
]->[
$termField
]) {
$termField2
=
$termField
;
}
foreach
$sent_id
(
keys
%{
$ref_tabh_idtrm_select
->{
$counter
}}){
$line
=
$ref_tabh_corpus
->{
$sent_id
}->{
'line'
};
$offset
=
$ref_tabh_corpus
->{
$sent_id
}->{
'offset'
};
if
((((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) <=
$caseSensitive
))) &&
(
$line
=~ /(?<
before
>[,.?!:;\/ \n\-\/\*
'\#\{\}\(\)\[\]\+])(?<term>$term_regex)[,.?!:;\/ \n\-\/\*'
\#\(\)\[\]\{\}\+]/s)) ||
(((!
defined
$caseSensitive
) || (
$caseSensitive
< 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) >
$caseSensitive
)) &&
(
$line
=~ /(?<
before
>[,.?!:;\/ \n\-\/\*
'\#\{\}\(\)\[\]\+])(?<term>$term_regex)[,.?!:;\/ \n\-\/\*'
\#\(\)\[\]\{\}\+]/is))) {
$currOffset
=
$offset
+
length
($`)+
length
($+{
before
});
print_brat_output(\
*TAGGEDCORPUS
, \
$termId
, $+{term},
$currOffset
,
$currOffset
+
length
($+{term}),
$ref_termlist
->[
$counter
]->[2]);
}
if
((((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) <=
$caseSensitive
))) &&
(
$line
=~ /^(?<term>
$term_regex
)[,.?!:;\/ \n\-\/\*'\
(((!
defined
$caseSensitive
) || (
$caseSensitive
< 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) >
$caseSensitive
)) &&
(
$line
=~ /^(?<term>
$term_regex
)[,.?!:;\/ \n\-\/\*'\
$currOffset
=
$offset
+
length
($`);
print_brat_output(\
*TAGGEDCORPUS
, \
$termId
, $+{term},
$currOffset
,
$currOffset
+
length
($+{term}),
$ref_termlist
->[
$counter
]->[2]);
}
if
((((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) <=
$caseSensitive
))) &&
(
$line
=~ /(?<
before
>[,.?!:;\/ \n\-\/\*'\
(((!
defined
$caseSensitive
) || (
$caseSensitive
< 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) >
$caseSensitive
)) &&
(
$line
=~ /(?<
before
>[,.?!:;\/ \n\-\/\*'\
$currOffset
=
$offset
+
length
($`)+
length
($+{
before
});
print_brat_output(\
*TAGGEDCORPUS
, \
$termId
, $+{term},
$currOffset
,
$currOffset
+
length
($+{term}),
$ref_termlist
->[
$counter
]->[2]);
}
if
((((
defined
$caseSensitive
) && ((
$caseSensitive
== 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) <=
$caseSensitive
))) &&
(
$line
=~ /^(?<term>
$term_regex
)$/s)) ||
(((!
defined
$caseSensitive
) || (
$caseSensitive
< 0) || (
length
(
$ref_termlist
->[
$counter
]->[
$termField2
]) >
$caseSensitive
)) &&
(
$line
=~ /^(?<term>
$term_regex
)$/is))) {
$currOffset
=
$offset
+
length
($`);
print_brat_output(\
*TAGGEDCORPUS
, \
$termId
, $+{term},
$currOffset
,
$currOffset
+
length
($+{term}),
$ref_termlist
->[
$counter
]->[2]);
}
}
$i
++;
}
print
STDERR
"\n"
;
close
TAGGEDCORPUS;
warn
"\nEnd of term tagging\n"
;
}
sub
print_brat_output() {
my
(
$descriptor
,
$termId
,
$matching_term
,
$start_offset
,
$end_offset
,
$semtag
) =
@_
;
if
((!
defined
$semtag
) || (
$semtag
=~ /^\s*$/)) {
$semtag
=
"term"
;
}
print
$descriptor
"T$$termId\t$semtag $start_offset $end_offset\t$matching_term\n"
;
$$termId
++;
}
sub
printMatchingTerm_tab() {
my
(
$ref_matching_term
,
$term
,
$sent_id
,
$ref_tab_results
) =
@_
;
my
$tmp_line
=
""
;
my
$tmp_key
;
if
(
ref
(
$ref_tab_results
) eq
"ARRAY"
) {
$tmp_line
.=
"$sent_id\t"
;
$tmp_line
.=
join
(
"\t"
,
@$ref_matching_term
);
push
@$ref_tab_results
,
$tmp_line
;
}
else
{
if
(
ref
(
$ref_tab_results
) eq
"HASH"
) {
my
@tab_tmp
;
$term
=~ s/\\([\-\+\(\)\{\}])/$1/og;
$tmp_key
.=
$sent_id
.
"_"
;
$tmp_key
.=
$term
;
push
@tab_tmp
,
$sent_id
;
push
@tab_tmp
,
$term
;
push
@tab_tmp
,
@$ref_matching_term
;
if
(!
exists
(
$ref_tab_results
->{
$tmp_key
})) {
$ref_tab_results
->{
$tmp_key
} = \
@tab_tmp
;
}
else
{
if
(
defined
$tab_tmp
[4]) {
$ref_tab_results
->{
$tmp_key
}->[4] .=
";"
.
$tab_tmp
[4];
}
else
{
$ref_tab_results
->{
$tmp_key
}->[4] .=
";"
;
}
}
}
}
}
1;