#!/usr/bin/perl
if
(
$#ARGV
< 0)
{
printUsage();
exit
(0);
}
if
(
$ARGV
[0] =~ /--help/)
{
printHelp();
exit
(0);
}
if
(
$#ARGV
< 1)
{
print
"Input or Output missing!\n"
;
print
"Type Insertion.pl --help for help.\n"
;
exit
(0);
}
$tempIn
=
""
;
@inLemma
= (
""
,
""
,
""
,
""
,
""
);
@outLemma
= (
""
,
""
,
""
);
$tempOut
=
""
;
$wn
= WordNet::QueryData->new;
@wordNetNouns
;
@wordNetVerbs
;
%wnGlosses
= ();
@wnNounSenses
;
@wnVerbSenses
;
%wnHypes
= ();
%wnHypos
= ();
%wnSyns
= ();
%wnFreq
= ();
%senseScores
= ();
$highSenseScore
= 0;
$highSense
=
""
;
$wikParser
= Wiktionary::Parser->new();
$attachMerge
=
""
;
$count
=0;
sub
run()
{
open
TRIAL,
"$ARGV[0]"
or
die
$!;
open
(OUTTRIAL,
'>'
,
"$ARGV[1]"
) or
die
$!;
preProcessing();
while
(<TRIAL>)
{
for
$tempIn
(
split
(
"\n"
))
{
@inLemma
=
split
(
"\t"
);
processLemma();
$tempOut
=
"$outLemma[0]\t$outLemma[1]\t$outLemma[2]\n"
;
print
OUTTRIAL
"$tempOut"
;
print
"$tempOut"
;
}
}
}
sub
preProcessing()
{
@wordNetNouns
=
$wn
->listAllWords(
'noun'
);
@wordNetVerbs
=
$wn
->listAllWords(
'verb'
);
foreach
my
$noun
(
@wordNetNouns
)
{
my
@nSenses
=
$wn
->querySense(
"$noun\#n"
); #gets all senses
for
that word
foreach
my
$curNSense
(
@nSenses
)
{
push
(
@wnNounSenses
,
$curNSense
);
my
@nGlosses
=
$wn
->querySense(
$curNSense
,
"glos"
);
my
$tempSenseGloss
=
$nGlosses
[0];
$tempSenseGloss
=~ s/(\(|\)|\.)//g;
$tempSenseGloss
=~ s/^a-zA-Z//g;
$tempSenseGloss
=
lc
$tempSenseGloss
;
$tempSenseGloss
=~ s/\b(the|is|at|which|on|a|an|and|or|up)\b//g;
$wnGlosses
{
$curNSense
} =
$tempSenseGloss
;
my
@hypes
=
$wn
->querySense(
$curNSense
,
"hype"
);
$wnHypes
{
$curNSense
} = \
@hypes
;
my
@hypos
=
$wn
->querySense(
$curNSense
,
"hypo"
);
$wnHypos
{
$curNSense
} = \
@hypos
;
my
@syns
=
$wn
->querySense(
$curNSense
,
"syns"
);
$wnSyns
{
$curNSense
} = \
@syns
;
$wnFreq
{
$curNSense
} =
$wn
->frequency(
$curNSense
);
}
}
foreach
my
$verb
(
@wordNetVerbs
)
{
my
@vSenses
=
$wn
->querySense(
"$verb\#v"
); #gets all senses
for
that word
foreach
my
$curVSense
(
@vSenses
)
{
push
(
@wnVerbSenses
,
$curVSense
);
my
@vGlosses
=
$wn
->querySense(
$curVSense
,
"glos"
);
my
$tempSenseGloss
=
$vGlosses
[0];
$tempSenseGloss
=~ s/(\(|\)|\.)//g;
$tempSenseGloss
=~ s/^a-zA-Z//g;
$tempSenseGloss
=
lc
$tempSenseGloss
;
$tempSenseGloss
=~ s/\b(the|is|at|which|on|a|an|and|or|up)\b//g;
$wnGlosses
{
$curVSense
} =
$tempSenseGloss
;
my
@hypes
=
$wn
->querySense(
$curVSense
,
"hype"
);
$wnHypes
{
$curVSense
} = \
@hypes
;
my
@hypos
=
$wn
->querySense(
$curVSense
,
"hypo"
);
$wnHypos
{
$curVSense
} = \
@hypos
;
my
@syns
=
$wn
->querySense(
$curVSense
,
"syns"
);
$wnSyns
{
$curVSense
} = \
@syns
;
$wnFreq
{
$curVSense
} =
$wn
->frequency(
$curVSense
);
}
}
}
sub
processLemma()
{
%senseScores
= ();
$highSenseScore
= 0;
$highSense
=
""
;
if
(
$inLemma
[1] =~ /noun/)
{
foreach
$curSense
(
@wnNounSenses
)
{
scoreSense();
}
}
else
{
foreach
$curSense
(
@wnVerbSenses
)
{
scoreSense();
}
}
refineSense();
if
(
$wnFreq
{
$highSense
} == 0)
{
$attachMerge
=
"attach"
;
}
else
{
$attachMerge
=
"merge"
;
}
$outLemma
[0] =
$inLemma
[2];
$outLemma
[1] =
$highSense
;
$outLemma
[2] =
$attachMerge
;
}
sub
scoreSense()
{
$word
=
substr
(
$curSense
, 0,
index
(
$curSense
,
'#'
)); #extracts base word.
my
@curSenseGloss
=
split
(
' '
,
$wnGlosses
{
$curSense
});
my
@senseHypes
= @{
$wnHypes
{
$curSense
}};
my
@senseHypeGloss
= ();
my
$tempAllHypeGloss
=
""
;
for
my
$hype
(0..
$#senseHypes
)
{
my
$tempHypeGloss
=
$wnGlosses
{
$hype
};
$tempAllHypeGloss
=
$tempAllHypeGloss
.
" "
.
$tempHypeGloss
;
}
@senseHypeGloss
=
split
(
' '
,
$tempAllHypeGloss
);
my
@senseHypos
= @{
$wnHypos
{
$curSense
}};
my
@senseHypoGloss
= ();
my
$tempAllHypoGloss
=
""
;
for
my
$hypo
(0..
$#senseHypos
)
{
my
$tempHypoGloss
=
$wnGlosses
{
$hypo
};
$tempAllHypoGloss
=
$tempAllHypoGloss
.
" "
.
$tempHypoGloss
;
}
@senseHypoGloss
=
split
(
' '
,
$tempAllHypoGloss
);
my
@senseSyns
= @{
$wnSyns
{
$curSense
}};
my
@senseSynsGloss
= ();
my
$tempAllSynsGloss
=
""
;
for
my
$syns
(0..
$#senseSyns
)
{
if
(!(
$syns
=~ /\b
$word
\b/))
{
my
$tempSynsGloss
=
$wnGlosses
{
$syns
};
$tempAllSynsGloss
=
$tempAllSynsGloss
.
" "
.
$tempSynsGloss
;
}
}
@senseSynsGloss
=
split
(
' '
,
$tempAllSynsGloss
);
my
$tempLemmaGloss
=
$inLemma
[3];
$tempLemmaGloss
=~ s/(\(|\)|\.)//g;
$tempLemmaGloss
=~ s/^a-zA-Z//g;
$tempLemmaGloss
=
lc
$tempLemmaGloss
;
$tempLemmaGloss
=~ s/\b(the|is|at|which|on|a|an|and|or|up)\b//g;
my
@curLemmaGloss
=
split
(
' '
,
$tempLemmaGloss
);
my
$glossLength
= 0;
my
$overlaps
= 0.0;
for
my
$lWord
(0..
$#curLemmaGloss
)
{
$glossLength
=
$glossLength
+
length
$curLemmaGloss
[
$lWord
];
if
(
$curLemmaGloss
[
$lWord
] =~ /\b
$word
\b/)
{
$overlaps
=
$overlaps
+ 10*(
length
$word
);
}
$spaceWord
=
$word
;
$spaceWord
=~ s/_/ /g;
if
(
$spaceWord
=~ /(^\w+\s\b
$curLemmaGloss
[
$lWord
]\b$)|(^\b
$curLemmaGloss
[
$lWord
]\b\s\w+$)/)
{
$overlaps
=
$overlaps
+ 100*(
length
$curLemmaGloss
[
$lWord
]);
}
for
my
$sWord
(0..
$#curSenseGloss
)
{
if
(
$curLemmaGloss
[
$lWord
] =~ /\b\Q
$curSenseGloss
[
$sWord
]\E\b?/)
{
$overlaps
=
$overlaps
+
length
$curSenseGloss
[
$sWord
];
}
}
for
my
$hypeWord
(0..
$#senseHypeGloss
)
{
if
(
$curLemmaGloss
[
$lWord
] =~ /\b\Q
$senseHypeGloss
[
$hypeWord
]\E\b?/)
{
$overlaps
=
$overlaps
+
length
$senseHypeGloss
[
$hypeWord
];
}
}
for
my
$hypoWord
(0..
$#senseHypoGloss
)
{
if
(
$curLemmaGloss
[
$lWord
] =~ /\b\Q
$senseHypoGloss
[
$hypoWord
]\E\b?/)
{
$overlaps
=
$overlaps
+
length
$senseHypeGloss
[
$hypoWord
];
}
}
for
my
$synsWord
(0..
$#senseSynsGloss
)
{
if
(
$curLemmaGloss
[
$lWord
] =~ /\b\Q
$senseSynsGloss
[
$synsWord
]\E\b?/)
{
$overlaps
=
$overlaps
+
length
$senseSynsGloss
[
$synsWord
];
}
}
}
$score
=
$overlaps
/
$glossLength
;
if
(
$score
>=
$highSenseScore
)
{
$highSenseScore
=
$score
;
$highSense
=
$curSense
;
}
$senseScores
{
$curSense
} =
$score
;
}
sub
refineSense()
{
$word
=
substr
(
$highSense
, 0,
index
(
$highSense
,
'#'
)); #extracts base word.
$shortSense
=
substr
(
$inLemma
[1], 0, 1);
$sense
=
$word
.
"#"
.
$shortSense
;
$highSenseScore
= 0;
my
$rSenseScore
= 0;
my
$refineHigh
=
"$sense#1"
; #assume first sense.
my
$tempLemmaGloss
=
$inLemma
[3];
$tempLemmaGloss
=~ s/(\(|\)|\.)//g;
$tempLemmaGloss
=~ s/^a-zA-Z//g;
$tempLemmaGloss
=
lc
$tempLemmaGloss
;
$tempLemmaGloss
=~ s/\b(the|is|at|which|on|a|an|and|or|up)\b//g;
my
@refineLemmaGloss
=
split
(
' '
,
$tempLemmaGloss
);
my
$rGlossLength
= 0.0;
my
$rOverlaps
= 0.0;
my
@refineSenses
=
$wn
->querySense(
$sense
);
for
my
$rSense
(0..
$#refineSenses
)
{
my
$tempSenseGloss
=
$wnGlosses
{
$rSense
};
for
my
$rLemma
(0..
$#refineLemmaGloss
)
{
$rGlossLength
=
$rGlossLength
+
length
$refineLemmaGloss
[
$rLemma
];
if
(
$refineLemmaGlos
[
$rLemma
] ne
$word
)
{
if
(
$tempSenseGloss
=~ /
$refineLemmaGloss
[
$rLemma
]/)
{
$rOverlaps
=
$rOverlaps
+
length
$refineLemmaGloss
[
$rLemma
];
}
}
}
$rSenseScore
=
$rOverlaps
/
$rGlossLength
;
if
(
$rSenseScore
>
$highSenseScore
)
{
$highSenseScore
=
$rSenseScore
;
$refineHigh
=
$rHypo
;
}
}
$highSense
=
$refineHigh
;
}
sub
printUsage()
{
print
"Usage: Insertion.pl DATA SOURCE | DESTINATION SOURCE\n"
;
print
"\tType Insertion.pl --help for help.\n"
;
}
sub
printHelp()
{
printUsage();
print
"Takes in lemmas from file and attempts to\n"
;
print
"insert them into WordNet by first finding\n"
;
print
"a hypernym, then either a) merging the \n"
;
print
"lemma with the hypernym or b) attaching \n"
;
print
"the lemma to the hypernym.\n"
;
}
run();