# WordNet::Extend::Insert.pm version 0.01
# Updated: 09/19/16
# Ted Pedersen, University of Minnesota Duluth
# tpederse at d.umn.edu
# Jon Rusert, University of Minnesota Duluth
# ruse0008 at d.umn.edu
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
=head1 NAME
WordNet::Extend::Insert - Perl module for inserting a lemma into WordNet
=head2 Basic Usage Example
use WordNet::Extend::Insert;
my $insert = WordNet::Extend::Insert->new();
@in1 = ("crackberry","noun","withdef.1", "A BlackBerry, a handheld device considered addictive for its networking capability.");
@in2 = ("slackberry","noun","withdef.2", "A mocking name for crackberry.");
@loc1 = ("withdef.5","cellphone#n#1");
@loc2 = ("withdef.6","crackberry#n#1");
$insert->attach(\@in1, \@loc1);
$insert->merge(\@in2, \@loc2);
=head2 Introduction
WordNet is a widely used tool in NLP and other research areas. A drawback of WordNet is the amount of time between updates. WordNet::Extend::Insert aims to allow developers to insert their own lemmas into WordNet, which can help keep WordNet up to date with new language in the world. It can also revert to the original, untouched WordNet if the user makes a mistake or simply wants to access the untouched WordNet.
# Exporter configuration: the module is object-oriented and exports nothing.
@ISA = qw(Exporter);
@EXPORT_OK = ();
@EXPORT = ();
$VERSION = '0.01';

# Shared module state used by the subs below.
$wn = WordNet::QueryData->new; #to be used to access data from wordnet
$WNHOME = "/usr/local/WordNet-3.0";
# Every sub in this module builds its file paths from $WNSEARCHDICT, so make
# sure it has a value; an assignment made elsewhere is left untouched.
# NOTE(review): "$WNHOME/dict" is the standard WordNet layout -- confirm it
# matches the installation that $wn reads from.
$WNSEARCHDICT = "$WNHOME/dict" unless defined $WNSEARCHDICT;
$wnCRLength = 29; #number of lines the copyright takes up in data.pos and index.pos.

# --help prints the guide and stops. A plain truth test is used because
# $help stays undefined when the flag is absent.
GetOptions('help' => \$help);
if($help)
{
    printHelp();
    exit(0);
}
=head2 Methods
The following methods are defined in this package:
=head3 Public methods
=item $obj->new()
The constructor for WordNet::Extend::Insert objects.
Parameters: none.
Return value: the new blessed object
# Constructor for WordNet::Extend::Insert objects.
#
# Parameters: none besides the class name (or an existing object, whose
#             class is then reused).
# Returns:    a blessed hash reference with an empty error state.
sub new
{
    my $class = shift;
    $class = ref($class) || $class;

    # Both halves of the error state start out defined so that getError()
    # can report "no error" on a fresh object.
    my $self = {
        error       => 0,
        errorString => '',
    };

    return bless $self, $class;
}
=item $obj->getError()
Allows the object to check if any errors have occurred.
Returns an array ($error, $errString), where $error
value equal to 1 represents a warning and greater than that
represents an error and $errString contains the possible error.
Parameter: None
Returns: array of the form ($error, $errorString).
# Reports and consumes the object's error state.
#
# Parameters: none (besides the invocant).
# Returns:    the list ($error, $errString). $error is 0 when nothing has
#             been recorded; per the module documentation 1 is a warning and
#             larger values are errors. $errString has leading whitespace
#             removed.
sub getError
{
    my $self = shift;

    my $error     = defined $self->{error}       ? $self->{error}       : 0;
    my $errString = defined $self->{errorString} ? $self->{errorString} : '';

    # Reading the state clears both the code and the message, so a later call
    # does not report a stale code next to an empty message.
    $self->{error}       = 0;
    $self->{errorString} = "";

    $errString =~ s/^[\r\n\t ]+//;
    return ($error, $errString);
}
=item $obj->attach($newSynset, $location)
Takes in a new synset and inserts it into WordNet at the specified location
by attaching it to the specified location lemma as a hyponym.
Parameters: Synset array in form (lemma, part-of-speech, item-id, definition)
or "word\tpos\titem-id\tdef", and location to be inserted in form
(item-id, WordNet sense).
Returns: nothing
# attach: adds a new lemma to WordNet as a hyponym of an existing sense.
#
# Parameters (an optional invocant may precede them):
#   new synset - array ref (lemma, part-of-speech, item-id, definition), or a
#                one-element array holding the same fields tab-separated
#   location   - array ref (item-id, WordNet sense such as "cellphone#n#1")
# Returns: nothing meaningful; validation problems are recorded in the
#          object's error / errorString fields.
# Side effects: writes index.<pos>.new, data.<pos>.new and index.sense.new
#          under $WNSEARCHDICT, creates .backup copies of the originals when
#          none exist, then replaces the originals with the .new files.
#
# NOTE(review): in this view the block delimiters, the loops that read
# WNDATA / WNINDEX / WNSENSE / WNDATATEMP, and the counter updates
# ($curLine, $tmpPcnt) are not visible; the comments below describe the
# statements that are shown and hedge wherever nesting has to be assumed.
sub attach()
my $base = 0;
# Three entries in @_ means the call carried an invocant in slot 0.
if(scalar @_ == 3)#checks if method entered by object.
$base = 1;
my @newSyn = @{$_[$base]};
$base = $base +1;
if(scalar @newSyn == 1) #in second form
my @tempSyn = split("\t", $newSyn[0]);
@newSyn = @tempSyn;
# One-letter part-of-speech code: the first character of the pos field.
my $pos = substr($newSyn[1], 0, 1);
my @location = @{$_[$base]};
my $write = 1; #write flag changes to 0 if error occurs so no write() will occur.
# Argument validation. "shift" takes whatever is first in @_.
# NOTE(review): for a plain function call that is the synset array ref, not
# an object -- confirm callers always use method syntax.
if(scalar @newSyn < 4)
my $self = shift;
$self->{error} = 2;
$self->{errorString} = "New synset does not contain enough elements.";
$write = 0;
if(scalar @location < 2)
my $self = shift;
$self->{error} = 2;
$self->{errorString} = "Location does not contain enough elements.";
$write = 0;
if($write == 1)
my $newOffset = findNewOffset($newSyn[1]);
my %offsetMap; #used to stored changes in offsets.
# The three records to be added, filled in once the hypernym line is known.
my $indexPos = "";
my $dataPos = "";
my $indexSense = "";
my $posNum = 0;
my $locationLemma = $location[1];
$locationLemma =~ s/#.*//; #extract lemma
my $locationPos = $newSyn[1]; #must be same pos as new.
my $locationOffset = $wn->offset("$location[1]");
while(length($locationOffset) < 8) #QueryData->offset() does not keep the 8 digits, need to add back lost 0's
$locationOffset = "0".$locationOffset;
# NOTE(review): $WNSEARCHDICT is not assigned anywhere in the visible source.
my $indexFile = "$WNSEARCHDICT/index.$locationPos";
my $dataFile = "$WNSEARCHDICT/data.$locationPos";
my $senseFile = "$WNSEARCHDICT/index.sense";
# All output goes to ".new" siblings; the live files are swapped at the end.
open (WNINDEXNEW, '>', "$indexFile.new") or die $!;
open (WNDATANEW, '>', "$dataFile.new") or die $!;
open (WNSENSENEW, '>', "$senseFile.new") or die $!;
#make filehandles hot
# Each select() returns the previously selected handle.
# NOTE(review): no statement restoring the original handle is visible, so
# after these three calls WNSENSENEW is the default output handle.
my $fhIndex = select(WNINDEXNEW);
my $fhData = select(WNDATANEW);
my $fhSense = select(WNSENSENEW);
# Numeric part-of-speech code used when building the sense key below.
if($pos eq "n")
$posNum = 1;
if($pos eq "v")
$posNum = 2;
# NOTE(review): the four statements below presumably sit in an else branch
# for unsupported parts of speech -- the branch keyword is not visible.
my $self = shift;
$self->{error} = 2;
$self->{errorString} = "Part of speech must be verb or noun";
$write = 0;
# isNewWord() returns 0 when the lemma is absent from index.<pos>, so this
# first branch handles a lemma that does not exist yet.
if(isNewWord($newSyn[0], $newSyn[1]) == 0)
my %hypData = %{getDataInfo($locationOffset, $locationPos)};
my %hypInfo = %{getIndexInfo($locationLemma, $locationPos)};
#print from three required files.
open WNINDEX, "$indexFile" or die $!;
open WNDATA, "$dataFile" or die $!;
open WNSENSE, "$senseFile" or die $!;
open (WNDATATEMP, '>', "$dataFile.temp") or die $!;
# Pass 1 over the data lines: record an old->new offset for every synset and
# write the lines, with the hypernym line rebuilt, to the .temp file.
my $changed = 0;
my $curLine = 1;
for $tempIn (split("\n"))
# Lines up to $wnCRLength are the copyright header.
if($curLine > $wnCRLength)
my @tempLine = split /\s/, $tempIn;
# Once the hypernym line has been rewritten, later synsets are mapped to an
# offset 14 higher, zero-padded to 8 digits.
if($changed == 1)
my $newNewOffset = $tempLine[0] +14;
while(length($newNewOffset) < 8)
$newNewOffset = "0".$newNewOffset;
$offsetMap{$tempLine[0]} = $newNewOffset;
# NOTE(review): presumably the else case (offset unchanged); as shown here it
# would overwrite the shifted mapping stored just above.
$offsetMap{$tempLine[0]} = $tempLine[0];
if($tempLine[0] == $locationOffset)
# Rebuild the hypernym line with one more pointer: a "~" (hyponym) pointer
# to the new synset.
my $newPcnt = $hypData{'p_cnt'} + 1;
while(length $newPcnt < 3)#needs to be represented by 3 digits.
$newPcnt = "0".$newPcnt;
$newOffset = $newOffset + 14; #14 is the length of new data being added.
$tempIn = "$hypData{'synset_offset'} $hypData{'lex_filenum'} $hypData{'ss_type'} $hypData{'w_cnt'} $hypData{'word_lex_id'} $newPcnt $hypData{'ptr'} ~ $newOffset $pos 0000 | $hypData{'gloss'}";
$changed = 1;
print WNDATATEMP "$tempIn\n";
# Records for the new lemma: index line, data line with an "@" (hypernym)
# pointer back to the location synset, and the sense-index line.
$indexPos ="$newSyn[0] $pos 1 1 \@ 1 0 $newOffset";
$dataPos = "$newOffset $hypData{'lex_filenum'} $pos 01 $newSyn[0] 0 001 \@ $hypData{'synset_offset'} $pos 0000 | $newSyn[3]";
$indexSense = "$newSyn[0]%$posNum:$hypData{'lex_filenum'}:00:: $newOffset 1 0";
# Pass 2: re-read the .temp file and replace every field that is a key of
# %offsetMap, then append the new data record.
# NOTE(review): any field equal to an old offset is replaced, including
# tokens inside the gloss.
open WNDATATEMP, "$dataFile.temp" or die $!;
for $tempIn (split("\n"))
my @tempLine = split /\s/, $tempIn;
for my $i (0 .. $#tempLine)
if(exists $offsetMap{$tempLine[$i]})
$tempLine[$i] = "$offsetMap{$tempLine[$i]}";
$tempIn = join(' ', @tempLine);
print WNDATANEW "$tempIn \n";
print WNDATANEW "$dataPos \n";
# Index pass: rebuild the hypernym's index line if it has no "~" symbol yet,
# remap offsets, then append the new lemma's index line.
$curLine = 1;
for $tempIn (split("\n"))
if($curLine > $wnCRLength)
#need to add hyponym pointer if it did not exist before on hypernym
# NOTE(review): the lemma is interpolated into the pattern unescaped.
if($tempIn =~ /^$locationLemma\b[^-]/)
unless($tempIn =~ /\~/)
# NOTE(review): $newPcnt copies p_cnt without adding one although a "~"
# symbol is appended below -- confirm whether an increment is missing.
my $newPcnt = $hypInfo{'p_cnt'};
$tempIn ="$hypInfo{'lemma'} $hypInfo{'pos'} $hypInfo{'synset_cnt'} $newPcnt $hypInfo{'ptr_symbol'} ~ $hypInfo{'sense_cnt'} $hypInfo{'tagsense_cnt'} $hypInfo{'synset_offset'}";
my @tempLine = split /\s/, $tempIn;
# Field 2 is synset_cnt; the synset offsets are the last fields of the line.
# NOTE(review): no update of $tmpPcnt or $offsetPtr is visible in this loop.
my $tmpPcnt = $tempLine[2];
my $offsetPtr = scalar(@tempLine) - 1;
while($tmpPcnt > 0)
if(exists $offsetMap{$tempLine[$offsetPtr]})
$tempLine[$offsetPtr] = "$offsetMap{$tempLine[$offsetPtr]}";
$tempIn = join(' ', @tempLine);
print WNINDEXNEW "$tempIn \n";
print WNINDEXNEW "$indexPos \n";
# Sense pass: field 1 of each index.sense line is the synset offset.
for $tempIn (split("\n"))
my @tempLine = split /\s/, $tempIn;
if(exists $offsetMap{$tempLine[1]})
$tempLine[1] = "$offsetMap{$tempLine[1]}";
$tempIn = join(' ', @tempLine);
print WNSENSENEW "$tempIn\n";
print WNSENSENEW "$indexSense\n";
close WNINDEX;
close WNDATA;
close WNSENSE;
else #lemma already exists
# Same three passes as above, except that the lemma's existing index line is
# replaced by one listing an additional sense instead of appending a new
# line.
my %hypData = %{getDataInfo($locationOffset, $locationPos)};
my %hypInfo = %{getIndexInfo($locationLemma, $locationPos)};
my %lemmaIndex = %{getIndexInfo($newSyn[0], $newSyn[1])};
# Sense number of the sense being added.
my $newSynNum = $lemmaIndex{'synset_cnt'} + 1;
#print to three required files.
open WNINDEX, "$indexFile" or die $!;
open WNDATA, "$dataFile" or die $!;
open WNSENSE, "$senseFile" or die $!;
open (WNDATATEMP, '>', "$dataFile.temp") or die $!;
# Pass 1: build the offset map and write the .temp data file.
my $changed = 0;
my $curLine = 1;
for $tempIn (split("\n"))
if($curLine > $wnCRLength)
my @tempLine = split /\s/, $tempIn;
if($changed == 1)
my $newNewOffset = $tempLine[0] +14;
while(length($newNewOffset) < 8)
$newNewOffset = "0".$newNewOffset;
$offsetMap{$tempLine[0]} = $newNewOffset;
# NOTE(review): presumably the else case, as in the first branch.
$offsetMap{$tempLine[0]} = $tempLine[0];
if($tempLine[0] == $locationOffset)
my $newPcnt = $hypData{'p_cnt'} + 1;
while(length $newPcnt < 3)#needs to be represented by 3 digits.
$newPcnt = "0".$newPcnt;
$newOffset = $newOffset + 14; #14 is the length of new data being added.
$tempIn = "$hypData{'synset_offset'} $hypData{'lex_filenum'} $hypData{'ss_type'} $hypData{'w_cnt'} $hypData{'word_lex_id'} $newPcnt $hypData{'ptr'} ~ $newOffset $pos 0000 | $hypData{'gloss'}";
$changed = 1;
print WNDATATEMP "$tempIn\n";
# The index line keeps the lemma's existing offsets and appends the new one;
# the sense line carries the new sense number.
$indexPos ="$newSyn[0] $pos $newSynNum $lemmaIndex{'p_cnt'} $lemmaIndex{'ptr_symbol'} $newSynNum $lemmaIndex{'tagsense_cnt'} $lemmaIndex{'synset_offset'} $newOffset";
$dataPos = "$newOffset $hypData{'lex_filenum'} $pos 01 $newSyn[0] 0 001 @ $hypData{'synset_offset'} $pos 0000 | $newSyn[3]";
$indexSense = "$newSyn[0]%$posNum:$hypData{'lex_filenum'}:00:: $newOffset $newSynNum 0";
# Pass 2: remap offsets and append the new data record.
open WNDATATEMP, "$dataFile.temp" or die $!;
for $tempIn (split("\n"))
my @tempLine = split /\s/, $tempIn;
for my $i (0 .. $#tempLine)
if(exists $offsetMap{$tempLine[$i]})
$tempLine[$i] = "$offsetMap{$tempLine[$i]}";
$tempIn = join(' ', @tempLine);
print WNDATANEW "$tempIn \n";
print WNDATANEW "$dataPos \n";
# Index pass: patch the hypernym line if needed and swap in the lemma's
# updated line.
$curLine = 1;
for $tempIn (split("\n"))
if($curLine > $wnCRLength)
#need to add hyponym pointer if it did not exist before on hypernym
if($tempIn =~ /^$locationLemma\b[^-]/)
unless($tempIn =~ /\~/)
my $newPcnt = $hypInfo{'p_cnt'};
$tempIn ="$hypInfo{'lemma'} $hypInfo{'pos'} $hypInfo{'synset_cnt'} $newPcnt $hypInfo{'ptr_symbol'} ~ $hypInfo{'sense_cnt'} $hypInfo{'tagsense_cnt'} $hypInfo{'synset_offset'}";
# The existing lemma's index line is replaced wholesale by $indexPos.
if($tempIn =~ /^$newSyn[0]\b[^-]/)
$tempIn = "$indexPos";
my @tempLine = split /\s/, $tempIn;
my $tmpPcnt = $tempLine[2];
my $offsetPtr = scalar(@tempLine) - 1;
while($tmpPcnt > 0)
if(exists $offsetMap{$tempLine[$offsetPtr]})
$tempLine[$offsetPtr] = "$offsetMap{$tempLine[$offsetPtr]}";
$tempIn = join(' ', @tempLine);
print WNINDEXNEW "$tempIn \n";
# Sense pass: remap offsets and append the new sense line.
for $tempIn (split("\n"))
my @tempLine = split /\s/, $tempIn;
if(exists $offsetMap{$tempLine[1]})
$tempLine[1] = "$offsetMap{$tempLine[1]}";
$tempIn = join(' ', @tempLine);
print WNSENSENEW "$tempIn\n";
print WNSENSENEW "$indexSense\n";
close WNINDEX;
close WNDATA;
close WNSENSE;
# Finalisation: keep first-time backups, drop the .temp file and replace the
# live files with the freshly written .new files.
# NOTE(review): no close of WNINDEXNEW / WNDATANEW / WNSENSENEW is visible
# before the moves, and the copy/unlink/move results are not checked.
if($write == 1)#if write was successful, overwrite old files with new.
#if no backup files exists make for easy revert.
my $backupcheck = "$indexFile.backup";
unless(-f $backupcheck)
copy($indexFile, "$indexFile.backup");
copy($dataFile, "$dataFile.backup");
unless(-f "$senseFile.backup")
copy($senseFile, "$senseFile.backup");
if(-f "$dataFile.temp")
unlink "$dataFile.temp";
#overwrite old files with new updated files
unlink $indexFile;
unlink $dataFile;
unlink $senseFile;
move("$indexFile.new", $indexFile);
move("$dataFile.new", $dataFile);
move("$senseFile.new", $senseFile);
=item $obj->merge($newSynset, $location)
Takes in a new synset and inserts it into WordNet at the specified location
by merging it into the synset of the specified location lemma, i.e. adding it as a synonym.
Parameters: Synset array in form (lemma, part-of-speech, item-id, definition)
or "word\tpos\titem-id\tdef", and location to be inserted in form
(item-id, WordNet sense).
Returns: nothing
# merge: adds a new lemma to WordNet as an additional word of an existing
# synset (a synonym of the location sense).
#
# Parameters (an optional invocant may precede them):
#   new synset - array ref (lemma, part-of-speech, item-id, definition), or a
#                one-element array holding the same fields tab-separated
#   location   - array ref (item-id, WordNet sense such as "crackberry#n#1")
# Returns: nothing meaningful; validation problems are recorded in the
#          object's error / errorString fields.
# Side effects: writes index.<pos>.new, data.<pos>.new and index.sense.new
#          under $WNSEARCHDICT, creates .backup copies of the originals when
#          none exist, then replaces the originals with the .new files.
#
# NOTE(review): in this view the block delimiters, the loops that read
# WNDATA / WNINDEX / WNSENSE / WNDATATEMP, and the counter updates
# ($curLine, $tmpPcnt) are not visible; the comments below describe the
# statements that are shown and hedge wherever nesting has to be assumed.
sub merge()
my $base = 0;
# Three entries in @_ means the call carried an invocant in slot 0.
if(scalar @_ == 3)#checks if method entered by object.
$base = 1;
my @newSyn = @{$_[$base]};
$base = $base +1;
if(scalar @newSyn == 1) #in second form
my @tempSyn = split("\t", $newSyn[0]);
@newSyn = @tempSyn;
# One-letter part-of-speech code: the first character of the pos field.
my $pos = substr($newSyn[1], 0, 1);
my @location = @{$_[$base]};
my $write = 1; #write flag changes to 0 if error occurs so no write() will occur.
# Argument validation. "shift" takes whatever is first in @_.
# NOTE(review): for a plain function call that is the synset array ref, not
# an object -- confirm callers always use method syntax.
if(scalar @newSyn < 4)
my $self = shift;
$self->{error} = 2;
$self->{errorString} = "New synset does not contain enough elements.";
$write = 0;
if(scalar @location < 2)
my $self = shift;
$self->{error} = 2;
$self->{errorString} = "Location does not contain enough elements.";
$write = 0;
if($write == 1)
# NOTE(review): $newOffset is computed but not used by any visible statement
# of this sub.
my $newOffset = findNewOffset($newSyn[1]);
my %offsetMap; #used to stored changes in offsets.
# The three records to be written, filled in below.
my $indexPos = "";
my $dataPos = "";
my $indexSense = "";
my $posNum = 0;
my $locationLemma = $location[1];
$locationLemma =~ s/#.*//; #extract lemma
my $locationPos = $newSyn[1]; #must be same pos as new.
my $locationOffset = $wn->offset("$location[1]");
while(length($locationOffset) < 8) #QueryData->offset() does not keep the 8 digits, need to add back lost 0's
$locationOffset = "0".$locationOffset;
# NOTE(review): $WNSEARCHDICT is not assigned anywhere in the visible source.
my $indexFile = "$WNSEARCHDICT/index.$locationPos";
my $dataFile = "$WNSEARCHDICT/data.$locationPos";
my $senseFile = "$WNSEARCHDICT/index.sense";
# All output goes to ".new" siblings; the live files are swapped at the end.
open (WNINDEXNEW, '>', "$indexFile.new") or die $!;
open (WNDATANEW, '>', "$dataFile.new") or die $!;
open (WNSENSENEW, '>', "$senseFile.new") or die $!;
#make filehandles hot
# Each select() returns the previously selected handle.
# NOTE(review): no statement restoring the original handle is visible, so
# after these three calls WNSENSENEW is the default output handle.
my $fhIndex = select(WNINDEXNEW);
my $fhData = select(WNDATANEW);
my $fhSense = select(WNSENSENEW);
# Numeric part-of-speech code used when building the sense key below.
if($pos eq "n")
$posNum = 1;
if($pos eq "v")
$posNum = 2;
# NOTE(review): the four statements below presumably sit in an else branch
# for unsupported parts of speech -- the branch keyword is not visible.
my $self = shift;
$self->{error} = 2;
$self->{errorString} = "Part of speech must be verb or noun";
$write = 0;
# isNewWord() returns 0 when the lemma is absent from index.<pos>, so this
# first branch handles a lemma that does not exist yet.
if(isNewWord($newSyn[0], $newSyn[1]) == 0)
my %synIndex = %{getIndexInfo($locationLemma, $locationPos)};
my %synData = %{getDataInfo($locationOffset, $locationPos)};
#print to three required files.
open WNINDEX, "$indexFile" or die $!;
open WNDATA, "$dataFile" or die $!;
open WNSENSE, "$senseFile" or die $!;
open (WNDATATEMP, '>', "$dataFile.temp") or die $!;
# Pass 1 over the data lines: record an old->new offset for every synset and
# copy the lines to the .temp file.
my $changed = 0;
my $curLine = 1;
# Amount by which synsets after the target are shifted.
# NOTE(review): the rewritten synset line gains "$newSyn[0] 0" (word and
# lex_id); confirm the shift accounts for the lex_id field as well.
my $newWordLength = length($newSyn[0]) + 1;
for $tempIn (split("\n"))
# Lines up to $wnCRLength are the copyright header.
if($curLine > $wnCRLength)
my @tempLine = split /\s/, $tempIn;
if($changed == 1)
my $newNewOffset = $tempLine[0] + $newWordLength;
while(length($newNewOffset) < 8)
$newNewOffset = "0".$newNewOffset;
$offsetMap{$tempLine[0]} = $newNewOffset;
# NOTE(review): presumably the else case (offset unchanged); as shown here it
# would overwrite the shifted mapping stored just above.
$offsetMap{$tempLine[0]} = $tempLine[0];
if($tempLine[0] == $locationOffset)
$changed = 1;
print WNDATATEMP "$tempIn\n";
# New index line for the lemma: it points at the existing synset and reuses
# the location lemma's pointer symbols.
$indexPos = "$newSyn[0] $pos 1 $synIndex{'p_cnt'} $synIndex{'ptr_symbol'} 1 0 $locationOffset";
# The synset line is rebuilt with one more word.
# NOTE(review): $wcnt is not zero-padded here, while the existing-lemma
# branch further down pads it to two digits.
my $wcnt = $synData{'w_cnt'} + 1;
$dataPos = "$locationOffset $synData{'lex_filenum'} $synData{'ss_type'} $wcnt $synData{'word_lex_id'} $newSyn[0] 0 $synData{'p_cnt'} $synData{'ptr'} | $synData{'gloss'}";
$indexSense = "$newSyn[0]%$posNum:$synData{'lex_filenum'}:00:: $locationOffset 1 0";
# Pass 2: re-read the .temp file, substitute the rebuilt synset line and
# replace every field that is a key of %offsetMap.
open WNDATATEMP, "$dataFile.temp" or die $!;
for $tempIn (split("\n"))
if($tempIn =~ /^$locationOffset\b/)
$tempIn = $dataPos;
my @tempLine = split /\s/, $tempIn;
for my $i (0 .. $#tempLine)
if(exists $offsetMap{$tempLine[$i]})
$tempLine[$i] = "$offsetMap{$tempLine[$i]}";
$tempIn = join(' ', @tempLine);
print WNDATANEW "$tempIn \n";
# Index pass: remap offsets on existing lines, then append the new lemma.
$curLine = 1;
for $tempIn (split("\n"))
if($curLine > $wnCRLength)
my @tempLine = split /\s/, $tempIn;
# Field 2 is synset_cnt; the synset offsets are the last fields of the line.
# NOTE(review): no update of $tmpPcnt or $offsetPtr is visible in this loop.
my $tmpPcnt = $tempLine[2];
my $offsetPtr = scalar(@tempLine) - 1;
while($tmpPcnt > 0)
if(exists $offsetMap{$tempLine[$offsetPtr]})
$tempLine[$offsetPtr] = "$offsetMap{$tempLine[$offsetPtr]}";
$tempIn = join(' ', @tempLine);
print WNINDEXNEW "$tempIn \n";
print WNINDEXNEW "$indexPos \n";
# Sense pass: field 1 of each index.sense line is the synset offset.
for $tempIn (split("\n"))
my @tempLine = split /\s/, $tempIn;
if(exists $offsetMap{$tempLine[1]})
$tempLine[1] = "$offsetMap{$tempLine[1]}";
$tempIn = join(' ', @tempLine);
print WNSENSENEW "$tempIn\n";
print WNSENSENEW "$indexSense\n";
close WNINDEX;
close WNDATA;
close WNSENSE;
else #lemma already exists
# Same passes as above, except that the lemma's existing index line is
# replaced by one listing the additional synset instead of appending a line.
my %synIndex = %{getIndexInfo($locationLemma, $locationPos)};
my %synData = %{getDataInfo($locationOffset, $locationPos)};
my %lemmaIndex = %{getIndexInfo($newSyn[0], $newSyn[1])};
# Sense number of the sense being added.
my $newSynNum =$lemmaIndex{'synset_cnt'} + 1;
#print to three required files.
open WNINDEX, "$indexFile" or die $!;
open WNDATA, "$dataFile" or die $!;
open WNSENSE, "$senseFile" or die $!;
open (WNDATATEMP, '>', "$dataFile.temp") or die $!;
# Pass 1: build the offset map and copy the data lines to the .temp file.
my $changed = 0;
my $curLine = 1;
my $newWordLength = length($newSyn[0]) + 1;
for $tempIn (split("\n"))
if($curLine > $wnCRLength)
my @tempLine = split /\s/, $tempIn;
if($changed == 1)
my $newNewOffset = $tempLine[0] + $newWordLength;
while(length($newNewOffset) < 8)
$newNewOffset = "0".$newNewOffset;
$offsetMap{$tempLine[0]} = $newNewOffset;
# NOTE(review): presumably the else case, as in the first branch.
$offsetMap{$tempLine[0]} = $tempLine[0];
if($tempLine[0] == $locationOffset)
$changed = 1;
print WNDATATEMP "$tempIn\n";
# The index line keeps the lemma's existing offsets and appends the location
# synset's offset.
$indexPos = "$newSyn[0] $pos $newSynNum $lemmaIndex{'p_cnt'} $lemmaIndex{'ptr_symbol'} $newSynNum $lemmaIndex{'tagsense_cnt'} $lemmaIndex{'synset_offset'} $locationOffset";
my $wcnt = $synData{'w_cnt'} + 1;
if(length $wcnt < 2)
$wcnt = "0".$wcnt; #needs to be represented by 2 digit number.
$dataPos = "$locationOffset $synData{'lex_filenum'} $synData{'ss_type'} $wcnt $synData{'word_lex_id'} $newSyn[0] 0 $synData{'p_cnt'} $synData{'ptr'} | $synData{'gloss'}";
$indexSense = "$newSyn[0]%$posNum:$synData{'lex_filenum'}:00:: $locationOffset $newSynNum 0";
# Pass 2: substitute the rebuilt synset line and remap offsets.
open WNDATATEMP, "$dataFile.temp" or die $!;
for $tempIn (split("\n"))
if($tempIn =~ /^$locationOffset\b/)
$tempIn = $dataPos;
my @tempLine = split /\s/, $tempIn;
for my $i (0 .. $#tempLine)
if(exists $offsetMap{$tempLine[$i]})
$tempLine[$i] = "$offsetMap{$tempLine[$i]}";
$tempIn = join(' ', @tempLine);
print WNDATANEW "$tempIn \n";
# Index pass: swap in the lemma's updated line and remap offsets.
$curLine = 1;
for $tempIn (split("\n"))
if($curLine > $wnCRLength)
# NOTE(review): the lemma is interpolated into the pattern unescaped.
if($tempIn =~ /^$newSyn[0]\b[^-]/)
$tempIn = $indexPos;
my @tempLine = split /\s/, $tempIn;
my $tmpPcnt = $tempLine[2];
my $offsetPtr = scalar(@tempLine) - 1;
while($tmpPcnt > 0)
if(exists $offsetMap{$tempLine[$offsetPtr]})
$tempLine[$offsetPtr] = "$offsetMap{$tempLine[$offsetPtr]}";
$tempIn = join(' ', @tempLine);
print WNINDEXNEW "$tempIn \n";
# Sense pass: remap offsets and append the new sense line.
for $tempIn (split("\n"))
my @tempLine = split /\s/, $tempIn;
if(exists $offsetMap{$tempLine[1]})
$tempLine[1] = "$offsetMap{$tempLine[1]}";
$tempIn = join(' ', @tempLine);
print WNSENSENEW "$tempIn\n";
print WNSENSENEW "$indexSense\n";
close WNINDEX;
close WNDATA;
close WNSENSE;
# Finalisation: keep first-time backups, drop the .temp file and replace the
# live files with the freshly written .new files.
# NOTE(review): no close of WNINDEXNEW / WNDATANEW / WNSENSENEW is visible
# before the moves, and the copy/unlink/move results are not checked.
if($write == 1)#if write was successful, overwrite old files with new.
#if no backup files exists make for easy revert.
my $backupcheck = "$indexFile.backup";
unless(-f $backupcheck)
copy($indexFile, "$indexFile.backup");
copy($dataFile, "$dataFile.backup");
unless(-f "$senseFile.backup")
copy($senseFile, "$senseFile.backup");
if(-f "$dataFile.temp")
unlink "$dataFile.temp";
#overwrite old files with new updated files
unlink $indexFile;
unlink $dataFile;
unlink $senseFile;
move("$indexFile.new", $indexFile);
move("$dataFile.new", $dataFile);
move("$senseFile.new", $senseFile);
=item $obj->restoreWordNet()
Causes all WordNet dict files to be restored to their original
state before any inserts were performed.
Parameter: none
Returns: nothing
# Restores the WordNet dict files from the ".backup" copies made by the
# insert operations.
#
# For every part of speech whose index.<pos>.backup exists, index.<pos> and
# data.<pos> are replaced by their backups. When at least one part of speech
# was restored, index.sense is restored as well.
#
# Parameters: none (an invocant, if present, is ignored).
# Returns:    nothing.
sub restoreWordNet
{
    my $restored = 0;

    for my $pos (qw(noun verb adj adv))
    {
        my $indexFile = "$WNSEARCHDICT/index.$pos";
        next unless -f "$indexFile.backup";
        $restored = 1;

        for my $file ($indexFile, "$WNSEARCHDICT/data.$pos")
        {
            # Never delete the live file up front: if its backup is missing
            # or the move fails, the current file must survive.
            next unless -f "$file.backup";
            move("$file.backup", $file)
                or warn "Could not restore $file from backup: $!";
        }
    }

    my $senseFile = "$WNSEARCHDICT/index.sense";
    if($restored && -f "$senseFile.backup")
    {
        move("$senseFile.backup", $senseFile)
            or warn "Could not restore $senseFile from backup: $!";
    }

    return;
}
=item $obj->isNewWord($lemma, $pos)
Takes in a lemma and searches wordnet to see if it exists.
Parameter: the lemma to search against along with the part of speech.
Returns: 1 if the lemma is already present in index.pos, or 0 if it is not (callers treat 0 as "this is a new word").
# Looks a lemma up in index.<pos>.
#
# Parameters: $lemma, $pos (the index file suffix, e.g. "noun"); an invocant
#             may precede them.
# Returns:    1 if the lemma is present in the index file, 0 if it is not.
#             Despite the name, 0 is the "new word" answer that callers
#             test for.
# Dies if the index file cannot be opened.
sub isNewWord
{
    my @args = @_;
    shift @args if scalar @args == 3;    # drop the invocant of a method call
    my ($lemma, $pos) = @args;

    my $indexFile = "$WNSEARCHDICT/index.$pos"; #wn file to be searched
    my $found = 0;

    open(my $indexFh, '<', $indexFile) or die "Cannot open $indexFile: $!";
    while(my $line = <$indexFh>)
    {
        # The lemma must be the whole first field. Quoting it keeps
        # characters such as "." literal, and requiring whitespace after it
        # stops "dog" from matching "dog's_breakfast".
        if($line =~ /^\Q$lemma\E\s/)
        {
            $found = 1;
            last;
        }
    }
    close $indexFh;

    return $found;
}
=item $obj->getIndexInfo($lemma, $pos)
Takes in lemma and returns the information from the index.pos file.
Parameter: the lemma info required and part of speech
Returns: hash lemma info from index.pos with following information:
lemma pos synset_cnt p_cnt ptr_symbol sense_cnt tagsense_cnt synset_offset
# Reads a lemma's entry from index.<pos>.
#
# Parameters: $lemma, $pos (the index file suffix, e.g. "noun"); an invocant
#             may precede them.
# Returns:    hash ref with the keys
#               lemma pos synset_cnt p_cnt ptr_symbol sense_cnt
#               tagsense_cnt synset_offset
#             ptr_symbol and synset_offset are space-joined strings. When
#             the lemma is not found the scalar fields are undef and the
#             two joined fields are empty strings.
# Dies if the index file cannot be opened.
sub getIndexInfo
{
    my @args = @_;
    shift @args if scalar @args == 3;    # drop the invocant of a method call
    my ($lemma, $pos) = @args;

    my $indexFile = "$WNSEARCHDICT/index.$pos";
    my @index;

    open(my $indexFh, '<', $indexFile) or die "Cannot open $indexFile: $!";
    while(my $line = <$indexFh>)
    {
        # Exact match on the first field (see isNewWord for the rationale).
        next unless $line =~ /^\Q$lemma\E\s/;
        @index = split ' ', $line;
        last;
    }
    close $indexFh;

    my %indexInfo;
    @indexInfo{qw(lemma pos synset_cnt p_cnt)} = @index[0 .. 3];

    my $scnt = defined $index[2] ? $index[2] + 0 : 0;
    my $pcnt = defined $index[3] ? $index[3] + 0 : 0;

    # p_cnt pointer symbols follow the four fixed fields.
    my @symbols = grep { defined } @index[4 .. 3 + $pcnt];
    $indexInfo{'ptr_symbol'} = join(' ', @symbols);

    # sense_cnt and tagsense_cnt come right after the symbols. Counting from
    # 4 + p_cnt is also correct when there are no pointer symbols.
    my $cursor = 4 + $pcnt;
    $indexInfo{'sense_cnt'}    = $index[$cursor];
    $indexInfo{'tagsense_cnt'} = $index[$cursor + 1];
    $cursor += 2;

    # Finally synset_cnt offsets.
    my @offsets = grep { defined } @index[$cursor .. $cursor + $scnt - 1];
    $indexInfo{'synset_offset'} = join(' ', @offsets);

    return \%indexInfo;
}
=item $obj->getDataInfo($synsetOffset, $pos)
Takes in synset offset and pos to find data associated with it in data.pos.
Parameters: the synset offset and part of speech
Returns: hash offset info from data.pos with following information:
synset_offset lex_filenum ss_type w_cnt 'word_lex_id' p_cnt ptr | gloss
# Reads a synset's record from data.<pos>.
#
# Parameters: $synsetOffset (as written in the file, i.e. zero-padded) and
#             $pos (the data file suffix, e.g. "noun"); an invocant may
#             precede them.
# Returns:    hash ref with the keys
#               synset_offset lex_filenum ss_type w_cnt word_lex_id p_cnt
#               ptr gloss frames
#             word_lex_id holds every "word lex_id" pair and ptr every
#             "symbol offset pos source/target" group, each as one
#             space-joined string. frames holds whatever lies between the
#             pointers and the "|" separator (the verb frame list) and is
#             empty otherwise. When the offset is not found the scalar
#             fields are undef and the joined fields are empty strings.
# Dies if the data file cannot be opened.
sub getDataInfo
{
    my @args = @_;
    shift @args if scalar @args == 3;    # drop the invocant of a method call
    my ($synOffset, $pos) = @args;

    my $dataFile = "$WNSEARCHDICT/data.$pos";
    my $dataInfoLine = "";

    open(my $dataFh, '<', $dataFile) or die "Cannot open $dataFile: $!";
    while(my $line = <$dataFh>)
    {
        next unless $line =~ /^\Q$synOffset\E\b/;
        chomp $line;
        $dataInfoLine = $line;
        last;
    }
    close $dataFh;

    # Everything after the first "|" is the gloss; splitting there keeps
    # frame data and gloss text out of the field arithmetic below.
    my ($head, $gloss) = split /\|/, $dataInfoLine, 2;
    $head  = '' unless defined $head;
    $gloss = '' unless defined $gloss;
    $gloss =~ s/^\s+//;
    $gloss =~ s/\s+$//;

    my @data = split ' ', $head;
    my %dataInfo;
    @dataInfo{qw(synset_offset lex_filenum ss_type w_cnt)} = @data[0 .. 3];

    # w_cnt is a two-digit hexadecimal number in the WordNet data format.
    my $wcnt = defined $data[3] ? hex($data[3]) : 0;
    my $cursor = 4;
    my @words = grep { defined } @data[$cursor .. $cursor + 2 * $wcnt - 1];
    $dataInfo{'word_lex_id'} = join(' ', @words);
    $cursor += 2 * $wcnt;

    # p_cnt is a three-digit decimal number; each pointer takes four fields.
    $dataInfo{'p_cnt'} = $data[$cursor];
    my $pcnt = defined $data[$cursor] ? $data[$cursor] + 0 : 0;
    $cursor++;
    my @ptrs = grep { defined } @data[$cursor .. $cursor + 4 * $pcnt - 1];
    $dataInfo{'ptr'} = join(' ', @ptrs);
    $cursor += 4 * $pcnt;

    $dataInfo{'frames'} = join(' ', @data[$cursor .. $#data]);
    $dataInfo{'gloss'}  = $gloss;

    return \%dataInfo;
}
=item $obj->getSenseInfo($synsetOffset)
Takes in a synset offset and returns the sense associated with the offset.
Parameter: the synset offset of the desired lemma
Returns: a hash offset info from index.sense with data:
sense_key synset_offset sense_number tag_cnt
# Reads the index.sense entry that belongs to a synset offset.
#
# Parameters: $synsetOffset (as written in the file, i.e. zero-padded); an
#             invocant may precede it.
# Returns:    hash ref with the keys
#               sense_key synset_offset sense_number tag_cnt
#             taken from the last line whose synset_offset field equals the
#             requested offset; all values are undef when no line matches.
# Dies if index.sense cannot be opened.
sub getSenseInfo
{
    my @args = @_;
    shift @args if scalar @args == 2;    # drop the invocant of a method call
    my ($synOffset) = @args;

    my $senseFile = "$WNSEARCHDICT/index.sense";
    my @sense;

    open(my $senseFh, '<', $senseFile) or die "Cannot open $senseFile: $!";
    while(my $line = <$senseFh>)
    {
        my @fields = split ' ', $line;
        # Compare against the offset field only, so the number cannot match
        # inside the sense key, the sense number or the tag count.
        next unless scalar @fields >= 2 && $fields[1] eq $synOffset;
        @sense = @fields;    # keep scanning: the last matching line wins
    }
    close $senseFh;

    my %senseInfo;
    @senseInfo{qw(sense_key synset_offset sense_number tag_cnt)} = @sense[0 .. 3];

    return \%senseInfo;
}
=item $obj->findNewOffset()
Searches through and calculates the offset for inserting.
Parameters: pos of new lemma
Returns: new unused offset
# Calculates an unused synset offset for data.<pos>.
#
# Parameters: $pos (the data file suffix, e.g. "noun"); an invocant may
#             precede it.
# Returns:    offset of the last line in the file + that line's length + 3.
#             NOTE(review): the "+ 3" is kept from the original formula --
#             confirm it matches how the insert routines terminate lines.
# Dies if the data file cannot be opened.
sub findNewOffset
{
    my @args = @_;
    # A method call carries the invocant plus one argument, i.e. two entries.
    shift @args if scalar @args == 2;
    my ($pos) = @args;

    my $dataFile = "$WNSEARCHDICT/data.$pos";
    my $dataLastLine = "";

    open(my $dataFh, '<', $dataFile) or die "Cannot open $dataFile: $!";
    while(my $line = <$dataFh>)
    {
        chomp $line;
        next unless length $line;    # a blank line never counts as the last line
        $dataLastLine = $line;
    }
    close $dataFh;

    my @data = split ' ', $dataLastLine;
    my $lastOffset = scalar @data ? $data[0] : 0;

    return $lastOffset + length($dataLastLine) + 3;
}
# Prints the in-depth help guide to the screen.
sub printHelp()
{
    # The guide is emitted one line per print call, on the currently
    # selected output handle.
    my $printed;
    $printed = print($_) for (
        "Takes in lemmas from file and attempts to\n",
        "insert them into WordNet by first finding\n",
        "a hypernym, then either a) merging the \n",
        "lemma with the hypernym or b) attaching \n",
        "the lemma to the hypernym.\n",
    );
    return $printed;
}