#!/usr/bin/perl -w

package Text::SenseClusters::LabelEvaluation::ReadingFilesData;

use strict;
use encoding "utf-8";

# The following two lines will make this module inherit from the Exporter Class.
require Exporter;
our @ISA = qw(Exporter);

#######################################################################################################################

=head1 Name 

Text::SenseClusters::LabelEvaluation::ReadingFilesData - Module for reading the data from a file as single string object. 

=head1 SYNOPSIS

	The following code snippet will show how to use this module.

Example 1: Reading the label file generated by sense cluster.	

		use Text::SenseClusters::LabelEvaluation::ReadingFilesData;
		
		# Reading the cluster's labels file.
		my $clusterFileName = "TVS.label";
	
		# Getting the clusters file name.
		my $clusterFileName = $driverObject->{$senseClusterLabelFileName};
	
		# Creating the read file object and reading the label examples.
		my $readClusterFileObject = 
				Text::SenseClusters::LabelEvaluation::ReadingFilesData->new ($clusterFileName);
		my %labelSenseClustersHash = ();
		my $labelSenseClustersHashRef = 
				$readClusterFileObject->readLinesFromClusterFile(\%labelSenseClustersHash);
		%labelSenseClustersHash = %$labelSenseClustersHashRef;
					
		# Iterating the Hash to print the value.
		foreach my $key (sort keys %labelSenseClustersHash){
			foreach my $innerkey (sort keys %{$labelSenseClustersHash{$key}}){
				print "$key :: $innerkey :: $labelSenseClustersHash{$key}{$innerkey} \n";
			}
		}	
	
	
Example 2: Reading the user provided Gold Standard keys and their data.	
	
		use Text::SenseClusters::LabelEvaluation::ReadingFilesData;
		# Reading the topic file name.
		my $topicsFileName = "TVS.txt";
	
		# Creating the read object, which will read the gold-standard keys and data provided by user.
		my $readFileObject =
	  		Text::SenseClusters::LabelEvaluation::ReadingFilesData->new($topicsFileName);
	
		# Reading the Mapping with help of function.
		my ( $hashRef, $topicArrayRef ) = $readFileObject->readMappingFromTopicFile();
		
		# Reading the hash from its reference.
		my %mappingHash = %$hashRef;
		my @topicArray  = @$topicArrayRef;
		# Iterating the Hash to print the value.
		foreach my $key ( sort keys %mappingHash ) {
			print "$key=$mappingHash{$key}\n";
		}
		# Iterating the Hash to print the value.
		foreach my $key (@topicArray) {
			print "$key\n";
		}


=head1 DESCRIPTION

	This module provides the various functions to read the labels and topic files. 
	
	The first function reads the labelled data generated by the SenseClusters and 
	create hash from it. The data-format of the input file must match the format 
	of label-file generated by SenseClusters. 
	
	The second function reads a file into a string variable by removing all the 
	newline characters from it.
	
	The remaining functions read the user provided file that contains the mapping 
	of clusters labels with gold standard keys, and/or data about the gold standard
	key or list of topics.    	
			
=cut


# Hash key under which each object stores the name of the file it reads.
# The constructor writes $object->{$fileName}, and every read* method in this
# package fetches the file name back from that same slot.
our $fileName  = "fileName";


##########################################################################################

=head1 Constructor: new()   

This is the constructor which will create object for this class.
Reference : http://perldoc.perl.org/perlobj.html

This constructor takes the following argument:
	1. $fileNameArg :  
		 The name of the file whose data has to be read.

=cut

##########################################################################################
sub new {
	# Constructor: builds an object that remembers which file has to be read.
	#   $class       - name of the class the new object is blessed into.
	#   $fileNameArg - name of the file whose data has to be read.
	my ( $class, $fileNameArg ) = @_;

	# Bless an empty hash into the class, then record the file name in the
	# slot whose key is held by the package variable $fileName.
	my $self = bless {}, $class;
	$self->{$fileName} = $fileNameArg;

	# Hand the blessed hash reference back to the caller.
	return $self;
}
	

###########################################################################################

=head1 Function: readLinesFromClusterFile

This function will read lines from the file containing the Labels of the  
Clusters and make the hash file.

@argument1	: Name of the cluster file name.

@argument2	: Reference of Hash ($labelSenseClustersHash) which will hold   
			  the information in the following format:
			  
 		For e.g.:\tCluster0{
					Descriptive    => George Bush, Al Gore, White House, New York
					Discriminating => George Bush, York Times	
 				  } 
				  Cluster1{
					Descriptive    => George Bush, BRITAIN London, Prime Minister
					Discriminating => BRITAIN London, Prime Minister	
 				  } 


@return 	: It will return the reference of the Hash mentioned above: 
  			$labelSenseClustersHashRef.
		  			
@description	:

1. Read the file line by line.
2. Ignore the lines which do not follow one of the following format:
   		 Cluster 0 (Descriptive): George Bush, Al Gore, White House, New York
   		 Cluster 0 (Discriminating): George Bush, BRITAIN London
3. Create Key from the "Cluster # (Descriptive)" or "Cluster # (Discrim
	- inating)" as "OuterKey: Cluster#" "InnerKey: Descriptive".
4. Store the value of hash as the keywords similar to above example:
	for e.g:
	  $labelSenseClustersGlobalRef{Cluster0}{Discriminating}
	  			= "BRITAIN London, Prime Minister";
			  
			  
=cut

###########################################################################################

sub readLinesFromClusterFile {
	# Parses the SenseClusters label file named in the object into a two-level hash:
	#     $hash{"Cluster<N>"}{"Descriptive" | "Discriminating"} = "<label list>"
	#
	# Arguments:
	#   1. $readFileObject            - this object; the file name is taken from
	#                                   $readFileObject->{$fileName}.
	#   2. $labelSenseClustersHashRef - optional reference to a hash holding initial
	#                                   entries. Its top level is copied, so the
	#                                   caller must use the returned reference.
	# Returns: reference to the populated hash.
	# Dies   : when the label file cannot be opened.
	my $readFileObject = shift;

	# Getting the name of the file containing the cluster labels.
	my $clusterFileName = $readFileObject->{$fileName};

	# Reading the reference from the argument; a missing argument is treated as
	# an empty hash instead of dying on the dereference.
	my $labelSenseClustersHashRef = shift;
	my %labelSenseClustersHash =
		$labelSenseClustersHashRef ? %$labelSenseClustersHashRef : ();

	# Three-argument open with a lexical handle: the file name can no longer be
	# interpreted as an open mode, and no global bareword handle is shared.
	open( my $clusterFileHandle, '<', $clusterFileName )
		or die "Cannot open cluster label file '$clusterFileName': $!";

	# Reading all the lines of the cluster label file. A lexical line variable is
	# used so that the caller's $_ is left untouched.
	while ( my $line = <$clusterFileHandle> ) {
		chomp $line;

		# Removing the white space from the front and end of the line.
		$line =~ s/^\s+|\s+$//g;

		# Empty lines carry no information.
		next if $line eq '';

		# Contents of LabelFile.
		#     Cluster 0 (Descriptive): George Bush, Al Gore, White House, New York
		#     Cluster 0 (Discriminating): George Bush, BRITAIN London

		# Splitting at the FIRST colon only, so that a label which itself
		# contains a colon does not cause the whole line to be discarded.
		my ( $keyPart, $labels ) = split( /:/, $line, 2 );

		# No colon at all, or nothing after it: no data for this cluster.
		next if !defined $labels || $labels eq '';

		# Typical key structure --> "Cluster 0 (Descriptive)", i.e. exactly
		# three whitespace separated tokens. Anything else is ignored.
		my @keyArray = split( /\s+/, $keyPart );
		next if scalar(@keyArray) != 3;

		# Making of the outer key, which is "Cluster#".
		my $outerKey = $keyArray[0] . $keyArray[1];

		# The inner key indicates the type of label i.e. Descriptive or
		# Discriminating. Parentheses (and commas, as before) are removed.
		my $innerKey = $keyArray[2];
		$innerKey =~ tr/(),//d;

		# Setting the keywords associated with these keys as the value.
		# For e.g.: Cluster0{
		#				Descriptive =>		George Bush, Al Gore, White House, New York
		#				Discriminating =>	George Bush, BRITAIN London
		# 			}
		# NOTE: the value is stored exactly as it follows the colon, so it
		# keeps the leading blank of " George Bush, ..." as it always did.
		$labelSenseClustersHash{$outerKey}{$innerKey} = $labels;
	}

	# Close the file handle.
	close($clusterFileHandle);

	# Returning the reference of the hash containing the label information.
	return \%labelSenseClustersHash;
}

##########################################################################################

=head1 Function: readLinesFromTopicFile

This function will read lines from the topic file and list of all the topics. 

@argument1	: Name of the topicFile.
  
@return 	: String containing the list of all the topics(labels) for  
			  the clusters.
	  
@description	:
1. Read the file line by line.
2. Remove the new line characters and making string variable which contains the 
   list of all the topics.
  		   
=cut

##########################################################################################

sub readLinesFromTopicFile {
	# Reads the whole topic file into one string with every line terminator removed.
	#
	# Argument: $readFileObject - this object; the file name is taken from
	#                             $readFileObject->{$fileName}.
	# Returns : the concatenation of all chomped lines. No separator is inserted
	#           between consecutive lines.
	# Dies    : when the topic file cannot be opened.
	my $readFileObject = shift;

	# Getting the topic file name from the object.
	my $topicFileName = $readFileObject->{$fileName};

	# Three-argument open with a lexical handle: the file name cannot be
	# interpreted as an open mode, and no global bareword handle is shared
	# with the other readers of this package.
	open( my $topicFileHandle, '<', $topicFileName )
		or die "Cannot open topic file '$topicFileName': $!";

	# Defining the variable which will hold all the topics.
	my $topicData = "";

	# Reading the file line by line till end of file. A lexical line variable
	# keeps the caller's $_ untouched.
	while ( my $line = <$topicFileHandle> ) {
		chomp $line;

		# Concatenating it to the previous lines.
		$topicData .= $line;
	}

	# Close the file handle.
	close($topicFileHandle);

	# Returning the topic list.
	return $topicData;
}




##########################################################################################

=head1 Function: readMappingFromTopicFile

This function will read mapping provided by the user for the Cluster's label (Cluster#)
and gold standard key(topic-name). 

 	Syntax of the file:
		<Cluster><#><Seprator(:::)><topic>
	Example:
		 Cluster0:::topic1
		 Cluster1:::topic2
		 Cluster2:::topic0

@argument	: $readFileObject	: Object of the current file.
  
@return1 	: \%clusterTopicMappingHash : DataType : (Reference to Hash)
				Reference of Hash containing the mapping between the Cluster's 
				label and gold standard key.

@return2 	: \@topicArray : DataType : (Reference to array)				   
	   			Reference of array containg the gold standard keys.
	   			
@description	:
1. Read the file line by line.
2. Check the line, if it contains the "Cluster#:::".
3. Spliting these line with Seprator":::".
4. A WordArray do not have 2 elements, ignore it.
3. Otherwise ignore the remaining lines. 

  		
	Reason for selecting the separtor as ":::"
		1. It will ensure that it is unique and it has very rare chance of occuring
	   	   in a documents or text.  		   
  		   
=cut

##########################################################################################
sub readMappingFromTopicFile {
	# Reads the "Cluster#:::topic" mapping lines of the file named in the object.
	#
	# Argument: $readFileObject - this object; the file name is taken from
	#                             $readFileObject->{$fileName}.
	# Returns : a two element list
	#             1. reference to a hash   { cluster label => topic name }
	#             2. reference to an array of the topic names, in file order
	# Dies    : when the topic file cannot be opened.
	my $readFileObject = shift;

	# Getting the name of the file containing the cluster and topic mapping.
	my $topicFileName = $readFileObject->{$fileName};

	# Three-argument open with a lexical handle: the file name cannot be
	# interpreted as an open mode, and no global bareword handle is shared
	# with the other readers of this package.
	open( my $topicFileHandle, '<', $topicFileName )
		or die "Cannot open topic file '$topicFileName': $!";

	# Hash storing the cluster -> topic mapping.
	my %clusterTopicMappingHash = ();

	# Array holding the topic names in the order they are met.
	my @topicArray = ();

	# Reading the file line by line till end of file.
	while ( my $lineData = <$topicFileHandle> ) {
		chomp($lineData);

		# Removing space from the front and back.
		$lineData =~ s/^\s+|\s+$//g;

		# Only lines starting with "Cluster" (any letter case) hold a mapping.
		next unless $lineData =~ m/^cluster/i;

		# Splitting with the separator ":::".
		my @wordsOfSentenceArray = split( /:::/, $lineData );

		# A mapping line has exactly two parts: cluster label and topic name.
		next if scalar(@wordsOfSentenceArray) != 2;

		# Removing the surrounding whitespace from cluster name and topic name.
		my ( $clusterLabel, $topicName ) = @wordsOfSentenceArray;
		$clusterLabel =~ s/^\s+|\s+$//g;
		$topicName    =~ s/^\s+|\s+$//g;

		# Storing the mapping into the hash.
		$clusterTopicMappingHash{$clusterLabel} = $topicName;

		# Also storing the topic name in the list of all topics.
		push( @topicArray, $topicName );
	}

	# Close the file handle.
	close($topicFileHandle);

	# Returning the mapping and the topic list.
	return ( \%clusterTopicMappingHash, \@topicArray );
}


###########################################################################################

=head1 Function: readTopicDataFromTopicFile

This function will read data about the gold standard key(topic-name). 

 	Syntax of the file:
		<topicName><Seprator(:::)><multi lines topic data>
	Example:

	topic1:::data1, data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1
	data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1
	topic2:::data2, data2 data2 data2 data2 data2 data2 data2 data2 data2 data2 data2 data2
	data2 data2 data2 data2 data2 data2 data2 data2 data2 data1 data1 data1 data1 data1	
	
@argument	: $readFileObject	: Object of the current file.
  
@return 	: \%topicDataHash : DataType : (Reference to Hash)
			  Reference of Hash containing the topics and their corresponding 
			  data.	
	   			
@description	:
1. Read the file line by line.
2. Check the line, if it contains the ":::" and starts with one of the topic:
	a. This indicates the start of the topic's data.
	b. Read the line till we encounter another "topic-name:::" or "cluster#:::" 
4. Finally, make hash containing the topic as the key and topic's data as the 
   value.
3. Return the reference of this hash. 
  		   
=cut

##########################################################################################
sub readTopicDataFromTopicFile {
	# Collects, for every topic, the (possibly multi-line) data that follows its
	# "topicName:::" header in the file named in the object.
	#
	# Arguments:
	#   1. $readFileObject    - this object; the file name is taken from
	#                           $readFileObject->{$fileName}.
	#   2. $topicNameArrayRef - optional reference to an array of topic names.
	#                           When it is missing or empty, the names are
	#                           collected from the "name:::" lines of the file.
	# Returns: reference to a hash { topic name => topic data }. Topics whose
	#          header is not present in the file get no entry.
	# Dies   : when the topic file cannot be opened.
	my $readFileObject = shift;

	# Getting the name of the file containing the topic data.
	my $topicFileName = $readFileObject->{$fileName};

	# Getting the topic names; a missing argument is treated as an empty list
	# instead of dying on the dereference.
	my $topicNameArrayRef = shift;
	my @topicNameArray = $topicNameArrayRef ? @$topicNameArrayRef : ();

	# Three-argument open with a lexical handle. The file is read completely
	# and closed at once, because the lines are scanned once per topic below.
	open( my $topicFileHandle, '<', $topicFileName )
		or die "Cannot open topic file '$topicFileName': $!";
	my @fileData = <$topicFileHandle>;
	close($topicFileHandle);

	# Removing the new line character from every line, once.
	chomp(@fileData);

	# If no topic was supplied, every "name:::" line of the file names one.
	# (This includes "Cluster#:::" mapping lines, exactly as before.)
	if ( @topicNameArray == 0 ) {
		foreach my $lineData (@fileData) {

			# Non-greedy, so the name ends at the FIRST separator of the line.
			next unless $lineData =~ m/^(.+?):::/;

			# $1 is read-only: copy it before trimming. /g trims both sides.
			my $discoveredTopic = $1;
			$discoveredTopic =~ s/^\s+|\s+$//g;

			push( @topicNameArray, $discoveredTopic ) if $discoveredTopic ne '';
		}
	}

	# Defining the hash containing the topics and their corresponding data.
	my %topicDataHash = ();

	foreach my $topicName (@topicNameArray) {

		# An undefined or empty name can never identify a header line.
		next if !defined $topicName || $topicName eq '';

		# Header of this topic: the name, matched literally and without regard
		# to letter case, followed by ":::". Whitespace around the name is
		# tolerated so that the trimmed names produced by the other readers of
		# this package still find their header. Compiled once per topic.
		my $headerPattern = qr/^\s*\Q$topicName\E\s*:::/i;

		# Becomes true once the header of this topic has been seen.
		my $topicFound = 0;
		my $topicData  = "";

		# The lines are only inspected, never modified: stripping the header
		# from the stored line would remove the ":::" marker that terminates
		# the data of a topic requested later.
		foreach my $lineData (@fileData) {

			if ($topicFound) {
				# Any following "something:::" line ("topicName:::" or
				# "Cluster#:::") ends the data of the current topic.
				last if $lineData =~ m/^.+:::/;

				# Every other line is a continuation of the topic data.
				$topicData .= " " . $lineData;
				next;
			}

			if ( $lineData =~ $headerPattern ) {
				$topicFound = 1;

				# The rest of the header line is the first piece of data;
				# the header is removed from a copy of the line.
				( $topicData = $lineData ) =~ s/$headerPattern//;
			}
		}

		# Topics without a header in the file are not reported. An explicit
		# flag is used so that a topic literally named "0" is not lost.
		next unless $topicFound;

		# Removing space from the front and back.
		$topicData =~ s/^\s+|\s+$//g;

		$topicDataHash{$topicName} = $topicData;
	}

	# Returning the topic data.
	return \%topicDataHash;
}

###########################################################################################

=head1 Function: readTopicNamesFromTopicFile

This function will list all the topics from the file provided by user. 

 	Syntax of the file:
 		<Cluster#><Seprator(:::)><topicName>
		<topicName><Seprator(:::)><multi lines topic data>
		<topicName><Seprator(:::)><multi lines topic data>
		<topicName><Seprator(:::)><multi lines topic data>
		<Cluster#><Seprator(:::)><topicName>
		<Cluster#><Seprator(:::)><topicName>
		
	Example:

	topic1:::data1, data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1
	data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1
	topic2:::data2, data2 data2 data2 data2 data2 data2 data2 data2 data2 data2 data2 data2
	data2 data2 data2 data2 data2 data2 data2 data2 data2 data1 data1 data1 data1 data1
	cluster0:::topic1
	cluster1:::topic2	
	cluster2:::topic0
	
@argument	: $readFileObject	: Object of the current file.
  
@return 	: \@topicNameArray : DataType : (Reference to array)
			  Reference of array containing the list of topics.	
	   			
@description	:
1. Read the file line by line.
2. Check the line, if it contains the ":::"
	a. if starts with "cluster" ignore it.
	b. otherwise, split that line with separator, ":::" and store the results in array.
	c. The first element of the array is the topic-name.
	d. Push, this topic-name into the array.    
3. Return the reference of this array. 


Reason for selecting the separtor as ":::"
	1. It will ensure that it is unique and it has very rare chance of occuring
	   in a documents or text.  		   
=cut

##########################################################################################
sub readTopicNamesFromTopicFile {
	# Lists the topic names found in the file named in the object: the text in
	# front of the first ":::" of every line that does not start with "cluster".
	#
	# Argument: $readFileObject - this object; the file name is taken from
	#                             $readFileObject->{$fileName}.
	# Returns : reference to an array of topic names, in file order.
	# Dies    : when the topic file cannot be opened.
	my $readFileObject = shift;

	# Getting the name of the file containing the topics.
	my $topicFileName = $readFileObject->{$fileName};

	# Three-argument open with a lexical handle: the file name cannot be
	# interpreted as an open mode, and no global bareword handle is shared
	# with the other readers of this package.
	open( my $topicFileHandle, '<', $topicFileName )
		or die "Cannot open topic file '$topicFileName': $!";

	# Variable which will contain the topics.
	my @topicNameArray = ();

	# Reading the file line by line till end of file.
	while ( my $lineData = <$topicFileHandle> ) {
		chomp($lineData);

		# Removing space from the front AND the back (/g trims both sides).
		$lineData =~ s/^\s+|\s+$//g;

		# Lines starting with "Cluster" hold a mapping, not a topic.
		next if $lineData =~ m/^cluster/i;

		# Only lines containing the separator name a topic; the name is
		# everything in front of the first ":::".
		next unless $lineData =~ m/^(.*?):::/;
		my $topicName = $1;

		# The line is already trimmed, so only trailing space can remain.
		$topicName =~ s/\s+$//;

		# A line such as ":::data" names no topic at all.
		next if $topicName eq '';

		# Adding the name to the topic array.
		push( @topicNameArray, $topicName );
	}

	# Close the file handle.
	close($topicFileHandle);

	# Returning the topic list.
	return ( \@topicNameArray );
}

#######################################################################################################

=pod

=head1 SEE ALSO

http://senseclusters.cvs.sourceforge.net/viewvc/senseclusters/LabelEvaluation/

Last modified by :
$Id: ReadingFilesData.pm,v 1.5 2013/03/07 23:15:49 jhaxx030 Exp $
 
=head1 AUTHORS

 	Anand Jha, University of Minnesota, Duluth
 	jhaxx030 at d.umn.edu

 	Ted Pedersen, University of Minnesota, Duluth
 	tpederse at d.umn.edu

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2012-2013 Ted Pedersen, Anand Jha 

See http://dev.perl.org/licenses/ for more information.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to: 
 
	
	The Free Software Foundation, Inc., 59 Temple Place, Suite 330, 
	Boston, MA  02111-1307  USA
	
	
=cut

#######################################################################################################

# Making the default return statement as 1;
# Reference : http://lists.netisland.net/archives/phlpm/phlpm-2001/msg00426.html

1;