package String::Simrank; # Note to developers: # enable the VERSIONs (here and in the "use Inline C ..." # statement) to match each other when ready to install in sitelib or # when ready to distribute. Otherwise, disable them # with commenting symbol for testing. # use base 'Exporter'; # @EXPORT_OK = qw(add subtract); use strict; use File::Basename; use IO::File; use Fcntl; use Data::Dumper; use Storable; use vars qw($VERSION $NAME); $VERSION = '0.079'; $NAME = 'String::Simrank'; # Must specify a NAME and VERSION parameter. The NAME must match your module's package # name. The VERSION parameter must match the module's $VERSION # variable and they must both be of the form /^\d\.\d\d$/. # DATA refers to the __DATA__ section near bottom of this file. use Inline C => 'DATA'; =head1 NAME String::Simrank - Rapid comparisons of many strings for k-mer similarity. =cut =head1 SYNOPSIS use String::Simrank; my $sr = new String::Simrank( { data => 'db.fasta'}); $sr->formatdb(); $sr->match_oligos({ query => 'query.fasta' }); You may utilize k-mers of various sizes, e.g.: $sr->formatdb( { wordlen => 6 } ); =cut =head1 DESCRIPTION The String::Simrank module allows rapid searches for similarity between query strings and a database of strings. This module is maintained by molecular biologists who use it for searching for similarities among strings representing contiguous DNA or RNA sequences. This program does not construct an alignment but rather finds the ratio of nmers (A.K.A. ngrams) that are shared between a query and database records. The input file should be fasta formatted (either aligned or unaligned) multiple sequence file. The memory consumption is moderate and grows linearly with the number of sequences and depends on the nmer size defined by the user. Using 7-mers, ~20,000 strings each ~1,500 characters in length requires ~50 Mb. The module can be used from the command line through the script C provided in the examples folder. By default the output is written to STDOUT and represents the similarity of each query string to the top hits in the the database. The format is query_id, tab, best match database_id, colon, percent similarity, space second best match database_id, colon, percent similarity. query1 EscCol36:100.00 EscCol36:100.00 EscCol43:99.59 EscCol29:99.24 EscCol33:99.17 EscCol10:99.02 =cut =head1 METHODS =cut =head2 new( { data_path => $path }) my $sr = new String::Simrank( { data => 'many_seqs.fasta' }); Instantiates a new String::Simrank object. Each simrank database should live within its own String::Simrank object. In other words, each unique combination of a fasta database and n-mer length will have its own instance. If the database has already been formated then the wordlength will be read from the formatted database's .bin file. Parameters: =over 4 =item data Sets the path of where the data fasta file is (Database sequences, fasta format). The same terminal directory is where the formated database will reside with the same base name but the .fasta will be substituted with .bin. Sequence names(identifiers) will be first 10 characters matched between the '>' and the first space character. Be sure the sequences names are unique within this string. =back =cut sub new { # Todd Z. DeSantis, March 2008 my ($class, $cl_args) = @_; # Parameter hr is called $cl_args since in # many instances these will come directly from the command line. my $self = bless {}, $class; $self->_check_arguments_for_new($cl_args); # print STDERR "you are using String::Simrank version $VERSION \n"; return $self; } =head2 _check_arguments_for_new Internally used function. Validates the 'data' argument needed so this instance knows what data file will be linked. Method checks that fasta file exists and is readable and creates the name which corresponds to the .bin file. If errors are found, program exits. Returns a hash or hash ref depending on the context of the method call. =cut sub _check_arguments_for_new { # Niels Larsen, May 2004. # Todd Z. DeSantis, March 2008. my ($self, $cl_args, # Command Line ARGument hash # not really required to come from the command line ) = @_; my ( @errors, $error); if ( $cl_args->{"data"} ) { # $cl_args->{"data"} = &Common::File::full_file_path( $cl_args->{"data"} ); if ( -r $cl_args->{"data"}) { $cl_args->{"binary"} = $cl_args->{"data"}; $cl_args->{"binary"} =~ s/\.[^\.]+$//; $cl_args->{"binary"} .= ".bin"; $self->{param}{data} = $cl_args->{"data"}; $self->{param}{binary} = $cl_args->{"binary"}; if (-e $cl_args->{"binary"} && -r $cl_args->{"binary"}) { print $cl_args->{"binary"} . " has been formatted\n"; $self->{binary_ready} = 1; } else { $self->{binary_ready} = 0; } } else { push @errors, qq (Simrank.pm: Input fasta data file not found or not readable -> "$cl_args->{'data'}"); } } else { push @errors, qq (Fasta sequence file must be specified with *data* argument); } # Print errors if any and exit, if ( @errors ) { foreach $error ( @errors ) { print STDERR "$error\n"; # &Common::Messages::echo_red( "ERROR: "); # &Common::Messages::echo( "$error\n" ); } exit; } else { wantarray ? return %{ $cl_args } : return $cl_args; } } =head2 formatdb({ wordlen => $int, minlen => $int, silent = $bool }) $sr->formatdb({wordlen => 8, minlen => 500, silent => 0, valid_chars => 'ACGT'}); From an input collection of strings (data), generates a binary file optimized for retrieval of k-mer similarities. The file will contain a pre-computed map of which sequences - given by their index number in the input data file - contain a given k-mer. All the valid k-mers of the given length are mapped. This means each k-mer from a query string can be used to look up the database strings it occurs in, very quickly. The memory consumption is moderate and grows linearly with the number of sequences; 20,000 takes 50-55 mb. The first characters between the '>' character and the first space is recognized as the string (sequence) identifier and should be unique for each record. The number of sequences found in the input collection is returned and the file xyz.bin is written to disk. Parameters: =over 4 =item pre_subst An integer that determines how to interpret some special characters in the strings. The substitution is done before k-mers are extracted. Use 1 to eliminate all periods(.), hypens(-), and space characters(\s, which includes \s,\t,\n). This is the default behavior. Use 2 to convert same characters as above into underscores(_). =item wordlen An integer that specifies the k-mer length to index for each record in the input data file. Default is 7. =item minlen Specifies the minimum string length for a database sequence to be worthy of indexing. Default is 50. =item valid_chars Specifies the characters to allow for formatting. Entire k-mer must be composed of these characters otherwise it will be ignored. When not defined, all characters are valid. Default is undef (indicating all characters are valid) =item silent Specifies if caller wants progress messages printed to STDERR while formatting. Default is false, meaning not silent. =back =cut sub formatdb { # Niels Larsen, November 2005. # Todd Z. DeSantis, July 2010 my ( $self, $cl_args) = @_; $self->_check_arguments_for_formatdb($cl_args); # Returns an integer. my ( $data_fh, $bin_fh, $entry, @seq_oligos, $oligos, $oli2seqoffs, $seqnum, $oligo, @sids, $sids, $bytstr, $begpos, $seqtot, $bytlen, $olibeg, $oli_counts, $id, $seq, $count, $all_begpos, $all_bytlen, $valid_chars, $lastnums, $lastnum, @seqoffs, $off, @seqnums, $id_len); ## I think I can exlude $lastnums, never written to disk. my $cl_wordlen = $self->{param}{wordlen}; my $cl_silent = $self->{param}{silent}; my $cl_binary = $self->{param}{binary}; my $cl_data = $self->{param}{data}; print "cl_data: $cl_data\n" unless ($cl_silent); print "cl_binary: $cl_binary\n" unless ($cl_silent); # Find the number of sequences and the longest length identifier, print "Counting new fasta records and measuring size of identifiers... " if not $cl_silent; $seqtot = _count_records($cl_data); print "found $seqtot\n" if not $cl_silent; $seqnum = 0; $oli_counts = ""; $valid_chars = $self->{param}{valid_chars}; $id_len = 0; # init $data_fh = new IO::File $self->{param}{data}, "r"; { local $/ = '>'; print "Mapping sub-sequences ... \n" unless ($cl_silent); while ( <$data_fh> ) { next if ($_ eq '>'); # toss the first empty record chomp $_; my ($header, @seq_lines) = split /\n/, $_; my ($id) = split /\s/, $header; my $sequence = ''; if ($self->{param}{pre_subst} == 1) { $sequence = join '', @seq_lines; $sequence =~ s/[\.\-\s]//g; } elsif ($self->{param}{pre_subst} == 2) { $sequence = join '_', @seq_lines; $sequence =~ s/[\.\-\s]/_/g; } next unless ($id && $sequence); push @sids, $id; # format to a uniform character length AFTER longest is found $id_len = length($id) if (length($id) > $id_len); @seq_oligos = _create_oligos( $sequence, $cl_wordlen, $valid_chars); foreach $oligo ( @seq_oligos ) { if ( exists $oli2seqoffs->{ $oligo } ) { $lastnum = $lastnums->{ $oligo }; ## I think I can exlude $lastnums, never written to disk. $oli2seqoffs->{ $oligo } .= ",". ($seqnum - $lastnum); # Keep track of the hops needed to find this oligo in the seqnums } else { $oli2seqoffs->{ $oligo } = $seqnum; } $lastnums->{ $oligo } = $seqnum; # keep track of the last $seqnum in which this $oligo was found. ## I think I can exlude $lastnums, never written to disk. } $oli_counts .= ",". ( scalar @seq_oligos ); # string-encoded ordered list of count of unique oligos in each seqnum $seqnum++; if ( !$cl_silent && $seqnum % 5000 == 0 ) { print "$seqnum done of $seqtot ," . scalar(keys %{ $oli2seqoffs }) . " unique kmers found ...\n"; # this message is accurate since we just incremented # $seqnum. So if we just finshed index 4 (which is the # fifth sequence, then $seqnum would have a value of 5" } } } $seqtot = $seqnum; # this is true since $seqnum was incremented after last sequence if ( not $cl_silent ) { print "Total sequence count: $seqnum of $seqtot ," . scalar(keys %{ $oli2seqoffs }) . " unique kmers found.\n"; } $data_fh->close; $self->{db_string_count} = $seqnum; $self->{unique_kmer_count} = scalar(keys %{ $oli2seqoffs }); # Write binary file. We write binary representations using syswrite # and save the word length and number of sequences (the rest can be # calculated) eval { $bin_fh = IO::File->new( $cl_binary, 'w' ) }; if ($@) { print STDERR $@; # such as: your vendor has not defined Fcntl macro O_LARGEFILE if ( not $bin_fh = new IO::File $cl_binary, "w") { print STDERR "Could not syswrite-open file $cl_binary\n"; exit; } } # print STDERR Dumper($bin_fh); unless (defined $bin_fh) { print STDERR "A filehandle to $cl_binary has not been defined\n"; exit; } if ( not $cl_silent ) { print "Total number of unique oligos: " . scalar(keys %{ $oli2seqoffs } ) . "\n"; print "Writing oligo map to file ...\n"; } # Id length, word_length, and total number of sequences, $bytstr = pack "A10", $id_len; # the max_length is encoded as text syswrite $bin_fh, $bytstr; $bytstr = pack "A10", $cl_wordlen; syswrite $bin_fh, $bytstr; $bytstr = pack "A10", $seqtot; syswrite $bin_fh, $bytstr; # Short ids as a string of fixed-length character words, $bytstr = pack "A$id_len" x $seqtot, @sids; syswrite $bin_fh, $bytstr; # Write byte strings of matching sequence numbers, while keeping # track of their byte position offsets, $begpos = sysseek $bin_fh, 0, 1; # recommended "systell", perl book says $begpos = 0 if ($begpos =~ /^0/); # sometimes $begpos gets set from sysseek as "0 but true" # I think this has something to do with not using the Fcntl # flags: O_WRONLY|O_CREAT|O_EXCL|O_LARGEFILE # since this behavoir emerged when I stopped using them. # BUT, I expect sysseek to return a positive int # since many bytes have been already written to this filehandle. @seq_oligos = (); $all_begpos = ""; $all_bytlen = ""; $count = 0; foreach $oligo ( sort(keys %{ $oli2seqoffs }) ) # "sort" added by tzd Apr-11-2008 { @seqoffs = eval $oli2seqoffs->{ $oligo }; # print STDERR '@seqoffs:' . join(',', @seqoffs) . "\n"; @seqnums = shift @seqoffs; foreach $off ( @seqoffs ) { push @seqnums, $seqnums[-1] + $off; # -1 looks-up the value of the last element in the @seqnums } # print STDERR '@seqnums:' . join(',', @seqnums) . "\n"; $bytstr = pack "I*", @seqnums; # unsigned integers # the rest of this module # assumes that integers will # be 4-byte which is a fairly # safe assumption syswrite $bin_fh, $bytstr; $bytlen = length $bytstr; $all_begpos .= ",$begpos"; $all_bytlen .= ",$bytlen"; $begpos += $bytlen; delete $oli2seqoffs->{ $oligo }; push @seq_oligos, $oligo; $count += 1; } $oli2seqoffs = {}; $olibeg = $begpos; # Write the oligos found, $bytstr = pack "A$cl_wordlen" x ( scalar @seq_oligos ), @seq_oligos; syswrite $bin_fh, $bytstr; # The begin positions of the corresponding run of sequence indices, # and their lengths, $all_begpos =~ s/^,//; # get rid of initial comma $bytstr = pack "I*", eval $all_begpos; syswrite $bin_fh, $bytstr; $all_bytlen =~ s/^,//; $bytstr = pack "I*", eval $all_bytlen; syswrite $bin_fh, $bytstr; # Store the number of unique oligos for every sequence, $oli_counts =~ s/^,//; $bytstr = pack "I*", eval $oli_counts; syswrite $bin_fh, $bytstr; # Finally, we need to store the number of oligos encountered and the # byte position of where they were stored, $bytstr = pack "A10", scalar @seq_oligos; syswrite $bin_fh, $bytstr; $bytstr = pack "A10", $olibeg; syswrite $bin_fh, $bytstr; $bin_fh->close; print "format binary file complete\n" unless ($cl_silent); return $seqnum; } =head2 _check_arguments_for_formatdb It checks that word length and minimum sequence length is reasonable. Sets silent to true by default. Sets pre_subst to 1 by default. Defines binary file name. If errors are found, program exits. Returns a hash or hash ref depending on the context of the method call. =cut sub _check_arguments_for_formatdb { # Niels Larsen, May 2004. # Todd Z. DeSantis, March 2008. my ( $self, $cl_args, # Command Line ARGument hash # not really reqired to come from the command line ) = @_; my ( @errors, $error); ## see if caller wants to constrain the alphabet if ( exists $cl_args->{valid_chars} ) { $self->{param}{valid_chars} = $cl_args->{valid_chars}; } else { $self->{param}{valid_chars} = undef; } ## see if caller wants to make some pre-substitutions if ( exists $cl_args->{pre_subst} ) { $self->{param}{pre_subst} = $cl_args->{pre_subst}; } else { $self->{param}{pre_subst} = 1; } # Ensure mininum sequence length is 1 or more and fill in 50 # if user didnt specify it if ( $cl_args->{"minlen"} ) { if ( $cl_args->{"minlen"} < 1 ) { push @errors, qq (Minimum sequence length should be 1 or more); } } else { $cl_args->{"minlen"} = 50; } $self->{param}{minlen} = $cl_args->{"minlen"}; # Ensure word length is over 0 and less than # or equal to minlen and fill in 7 if the # user didnt specify it, if ( $cl_args->{"wordlen"} ) { if ( $cl_args->{"wordlen"} < 1 ) { push @errors, qq (Word length should be at least 1); } if ( $cl_args->{"wordlen"} > $cl_args->{"minlen"} ) { push @errors, qq (Word length no greater than the minlen); } } else { $cl_args->{"wordlen"} = 7; } $self->{param}{wordlen} = $cl_args->{"wordlen"}; if ( not defined $cl_args->{"silent"} ) { $cl_args->{"silent"} = 0 } $self->{param}{silent} = $cl_args->{"silent"}; # Print errors if any and exit, if ( @errors ) { foreach $error ( @errors ) { print STDERR "$error\n"; } exit; } else { wantarray ? return %{ $cl_args } : return $cl_args; } } =head2 match_oligos({query => $path}) $sr->match_oligos({query => 'query.fasta'}); $sr->match_oligos({ query => 'query.fasta', outlen => 10, minpct => 95, reverse => 1, outfile => '/home/donny/sr_results.txt', }); my $matches = $sr->match_oligos({query => 'query.fasta'}); print Dumper($matches); foreach my $k (keys %{$matches} ) { print "matches for $k :\n"; foreach my $hit ( @{ $matches->{$k} } ) { print "hit id:" . $hit->[0] . " perc:" . $hit->[1] . "\n"; } } This routine quickly estimates the overall similarity between a given set of DNA or RNA sequence(s) and a background set of database sequences (usually homologues). It returns a sorted list of similarities as a table. The similarity between sequences A and B are the number of unique k-words (short subsequence) that they share, divided by the smallest total unique k-word count in either A or B. The result are scores that do not depend on sequence lengths. When called in void context, the routine prints to the given output file or STDOUT; otherwise a hash_ref is returned. Parameters: =over 4 =item query Sets the path where the query fasta file will be found. Required. In future versions, query could be a data structure instead. Need to build an abstract iterator for this feature to be enabled. =item minpct A real number indicating the the minimum percent match that should be output/returned. Default = 50. =item outlen An integer indicating the maximum number of ranked db matches that should be output/returned. Default = 100. =item valid_chars Specifies the characters to allow for kmer-searching. Entire n-mer must be composed of these characters otherwise it will be ignored. When not defined, all characters are valid. Default is undef (indicating all characters are valid) =item reverse A boolean value. If true, reverses input sequence. Default = false. =item noids A boolean value. If true, prints database index numbers instead of sequence ids. Default = false. =item outfile Sets the path of the output file (instead of sending to STDOUT). Default = false. Meaning output is sent to STDOUT. =item silent Specifies if caller wants progress messages printed to STDERR while matching. Default is false, meaning not silent. =back =cut sub match_oligos { # Niels Larsen, May 2004. # Todd Z. DeSantis, March 2008. my ( $self, $cl_args, # Arguments hash ) = @_; $self->_check_arguments_for_match_oligos($cl_args); # Returns a list of matches if not called in void context. my ( $q_seqs, $count, $silent, @sids, $wordlen, $seqnum, $bin_fh, $query_fh, $bytstr, $olinum, $length, @oligos, $oligo, @scores, $oli2seqnums, $vector, @begpos, @endpos, $oli2pos, $i, @seqnums, $j, $seqtot, $score, @lengths, @temp, $valid_chars, $minscore, $olibeg, $olitot, $begpos, $endpos, $pack_bits, @scovec, $scovec, $query_sid, $out_fh, $outdir, $outline, $matches, @oli_counts, $oli_count, $scovec_null, $index, $outlen, $pos, $id, $seq, $id_len); my $void_context = 1; # Recall 'wantarray' returns # True if the context is looking for a list value, # False if the context is looking for a scalar, # Undef if the context is looking for no value (void context). $void_context = 0 if (defined wantarray); # caller wants data back $silent = $self->{param}{silent}; # >>>>>>>>>>>>>>>>>>>>>> FILE MAP SECTION <<<<<<<<<<<<<<<<<<<<<<<< # This section creates $oli2pos, where each key is a subsequence # and values are [ file byte position, byte length to read ]. This # map is used below to sum up similarities. eval { $bin_fh = IO::File->new( $self->{param}{binary}, O_RDONLY|O_LARGEFILE ) }; if ($@) { # such as: your vendor has not defined Fcntl macro O_LARGEFILE if ( not $bin_fh = new IO::File $self->{param}{binary}, "r") { print STDERR "Could not open file $self->{param}{binary} for reading\n"; exit; } } # Word length and total number of sequences, # reminder: sysread FILEHANDLE,SCALAR,LENGTH # Attempts to read LENGTH bytes of data into variable SCALAR # from the specified FILEHANDLE, sysread $bin_fh, $bytstr, 10; $id_len = $bytstr * 1; sysread $bin_fh, $bytstr, 10; $wordlen = $bytstr * 1; sysread $bin_fh, $bytstr, 10; $seqtot = $bytstr * 1; # String ids as a string of fixed-length character words, sysread $bin_fh, $bytstr, $seqtot * $id_len; @sids = unpack "A$id_len" x $seqtot, $bytstr; # Get the oligos and the begin and end positions of where the sequence # indices start and their lengths, sysseek $bin_fh, -10, 2; # goto 10 bytes before EOF sysread $bin_fh, $bytstr, 10; $olibeg = $bytstr * 1; sysseek $bin_fh, -20, 2; # goto 20 bytes before EOF sysread $bin_fh, $bytstr, 10; $olitot = $bytstr * 1; sysseek $bin_fh, $olibeg, 0; sysread $bin_fh, $bytstr, $olitot * $wordlen; @oligos = unpack "A$wordlen" x $olitot, $bytstr; sysread $bin_fh, $bytstr, 4 * $olitot; @begpos = unpack "I*", $bytstr; sysread $bin_fh, $bytstr, 4 * $olitot; @lengths = unpack "I*", $bytstr; sysread $bin_fh, $bytstr, 4 * $seqtot; @oli_counts = unpack "I*", $bytstr; # Create a hash that returns the file positions of where to get the # matching sequence indices, for ( $i = 0; $i < scalar @oligos; $i++ ) { $oli2pos->{ $oligos[$i] } = [ $begpos[$i], $lengths[$i] ]; } # >>>>>>>>>>>>>>>>>>>> PROCESS QUERY ENTRIES <<<<<<<<<<<<<<<<<<<<<<< if ( $void_context == 1 && $self->{param}{outfile} ) { # call was made in void context $out_fh = new IO::File $self->{param}{outfile}, "w"; print "Outfile opened.\n" unless ($self->{param}{silent}); } $query_fh = new IO::File $self->{param}{"query"}, "r"; $scovec_null = pack "I*", (0) x $seqtot; $minscore = ( $self->{param}{"minpct"} || 0 ) / 100; { local $/ = '>'; while ( <$query_fh>) { next if ($_ eq '>'); # toss the first empty record chomp $_; my ($header, @seq_lines) = split /\n/, $_; my ($query_sid) = split /\s/, $header; my $sequence = ''; if ($self->{param}{pre_subst} == 1) { $sequence = join '', @seq_lines; $sequence =~ s/[\.\-\s]//g; } elsif ($self->{param}{pre_subst} == 2) { $sequence = join '_', @seq_lines; $sequence =~ s/[\.\-\s]/_/g; } next unless ($query_sid && $sequence); if ( $self->{param}{"reverse"} ) { $sequence = reverse $sequence; } print "Processing $query_sid ... \n" unless ( $silent ); $scovec = $scovec_null; # >>>>>>>>>>>>>>>>>>>>>> COUNTING SECTION <<<<<<<<<<<<<<<<<<<<<<<< # Look up the sequences that contain each of the oligos found # in the sequence. The sort makes the disk jump around less, # since the oligos were written in sorted order to the bin file @oligos = _create_oligos( $sequence, $wordlen, $self->{param}{valid_chars} ); foreach $oligo ( sort @oligos ) { # "sort" added by tzd Apr-11-2008 if ( defined ( $pos = $oli2pos->{ $oligo }->[0] ) ) { sysseek $bin_fh, $pos, 0; $length = $oli2pos->{ $oligo }->[1]; sysread $bin_fh, $bytstr, $length; # $bytstr is a perl string. Every 4-byte block # in this string encodes a 4 byte integer # so >4 billion integers are available. # but when passed to C # it will be an integer array. # This is a C function for speed, the code is at the end of this # file. The third argument is the maximum index of $bytstr array, ####### testing only # print STDERR join (',', (unpack "I*", $scovec)) . "\n"; # print STDERR join (',', (unpack "I*", $bytstr)) . "\n"; ####### &update_scores_C( \$bytstr, \$scovec, ( $length - 1 ) / 4 ); ####### testing only # print STDERR '$query_sid:' . $query_sid . # ' $oligo:' . $oligo . "\n" . # Dumper(unpack "I*", $scovec) . "\n"; ####### } } @scovec = unpack "I*", $scovec; # >>>>>>>>>>>>>>>>>>>>>> EXTRACT OUTPUT DATA <<<<<<<<<<<<<<<<<<<<<< # The score vector, @scovec, now contains integer scores in some # slots, zeros in others. To get an ordered list of just the positive # scores we could grep and sort, but that would be slow because the # list is long. Instead, keep a list of scores that is only as long # as the output length; insert values that are higher than the last # element and ignore the rest, $oli_count = scalar @oligos; $outlen = $self->{param}{"outlen"}; @scores = (); for ( $i = 0; $i < scalar @scovec; $i++ ) { if ( $scovec[$i] ) { #### use for testing # print STDERR '$oli_count:' . $oli_count . ' $oli_counts[$i]:' . $oli_counts[$i] . ' $scovec[$i]:' . $scovec[$i] . "\n"; ##### if ( $oli_count < $oli_counts[$i] ) { $score = $scovec[$i] / $oli_count; } else { $score = $scovec[$i] / $oli_counts[$i]; } if ( $score >= $minscore ) { if ( scalar @scores < $outlen ) { $index = _nearest_index( \@scores, $score ); splice @scores, $index, 0, [ $i, $score ]; } elsif ( $score > $scores[0]->[1] ) { $index = _nearest_index( \@scores, $score ); splice @scores, $index, 0, [ $i, $score ]; shift @scores; } } } } if ( $cl_args->{"noids"} ) { foreach $score ( @scores ) { $score->[1] = sprintf "%.2f", 100 * $score->[1]; } } else { foreach $score ( @scores ) { $score->[0] = $sids[ $score->[0] ]; $score->[1] = sprintf "%.2f", 100 * $score->[1]; # Can we add a percision parameter here ? # may be able to save disk space in the output file by getting # just the 1/10 place for instance } } @scores = reverse @scores; # >>>>>>>>>>>>>>>>>>>>>> OUTPUT SECTION <<<<<<<<<<<<<<<<<<<<<<<<< # If called in non-void context, generate a data structure. If # in void context, print tabular data to stdout or an output file # if specified, if ( $void_context == 0 ) { # means caller wants a scalar back # which will be a hash ref $matches->{ $query_sid } = &Storable::dclone( \@scores ); } else { $outline = "$query_sid\t" . join "\t", map { $_->[0] .":". $_->[1] } @scores; if ( defined $out_fh ) { print $out_fh "$outline\n"; } else { print "$outline\n"; } } unless ( $silent ) { print "$query_sid done\n"; # &Common::Messages::echo_green( qq (done\n) ); } } # end while query_fh } # end local block $query_fh->close; $bin_fh->close; # Return data structure if non-void context, otherwise nothing, if ( $void_context == 0 ) { # means caller wants a scalar back return $matches; } else { return; } } =head2 _check_arguments_for_match_oligos Internally used function. Validates the arguments for outlen, minpct, reverse, query, noids, silent, pre_subst and output. If errors are found, program exits. Returns a hash or hash ref depending on the context of the method call. =cut sub _check_arguments_for_match_oligos { # Niels Larsen, May 2004. # Todd Z. DeSantis, March 2008. my ( $self, $cl_args, # Command Line ARGument hash # not really reqired to come from the command line ) = @_; my ( @errors, $error, $outdir ); ## see if caller wants to constrain the alphabet if ( exists $cl_args->{valid_chars} ) { $self->{param}{valid_chars} = $cl_args->{valid_chars}; } else { $self->{param}{valid_chars} = undef; } ## see if caller wants to make some pre-substitutions if ( exists $cl_args->{pre_subst} ) { $self->{param}{pre_subst} = $cl_args->{pre_subst}; } else { $self->{param}{pre_subst} = 1; } # Ensure output list has at least one similarity in it and set # it to 100 if the user didnt specify it, if ( $cl_args->{"outlen"} ) { if ( $cl_args->{"outlen"} < 1 ) { push @errors, qq (Output list length should be 1 or more); } } else { $cl_args->{"outlen"} = 100; } $self->{param}{outlen} = $cl_args->{"outlen"}; if ($cl_args->{"minpct"} ) { $self->{param}{minpct} = $cl_args->{"minpct"}; } else { $self->{param}{minpct} = 0; } if ($cl_args->{reverse}) { $self->{param}{"reverse"} = $cl_args->{reverse}; } else { $self->{param}{"reverse"} = 0; } # The mandatory query: # must be a file path # in the future, allow a hash ref of the structure # $hr->{$id} = $sequence_string # if file given, # check that its readable. If not given, error, if ( $cl_args->{"query"} ) { # $cl_args->{"query"} = &Common::File::full_file_path( $cl_args->{"query"} ); if ( not -r $cl_args->{"query"} ) { push @errors, qq (Query file not found or not readable -> "$cl_args->{'query'}"); } } else { push @errors, qq (Query sequence file must be specified); } $self->{param}{query} = $cl_args->{"query"}; # If output file given, check if it exists and if its directory # does not exist if ( $cl_args->{"outfile"} ) { if ( -e $cl_args->{"outfile"} ) { push @errors, qq (Output file exists -> "$cl_args->{'outfile'}"); } $outdir = File::Basename::dirname( $cl_args->{"outfile"} ); if ( not -d $outdir ) { push @errors, qq (Output directory does not exist -> "$outdir"); } elsif ( not -w $outdir ) { push @errors, qq (Output directory is not writable -> "$outdir"); } } else { $cl_args->{"outfile"} = ""; } $self->{param}{outfile} = $cl_args->{"outfile"}; # print STDERR 'outfile:' . $self->{param}{outfile} . "\n"; if ( defined $cl_args->{"silent"} ) { $self->{param}{silent} = $cl_args->{"silent"}; } if ( not defined $self->{param}{silent}) { $self->{param}{silent} = 1; } # Print errors if any and exit, if ( @errors ) { foreach $error ( @errors ) { print STDERR "$error\n"; # &Common::Messages::echo_red( "ERROR: "); # &Common::Messages::echo( "$error\n" ); } exit; } else { wantarray ? return %{ $cl_args } : return $cl_args; } } =head2 _create_oligos Internal method. Creates a list of unique words of a given length from a given sequence. Enforces k-mers composed of purely valid_char if requested. Converts characters to upper case where possible. This may become an option in the future. =cut sub _create_oligos { # Niels Larsen, November 2005. # Todd Z. DeSantis, March 2008 my ( $str, # Sequence string $word_len, # Word length $valid_chars ) = @_; my (@oligos, %oligos, $i, $len); $str = uc $str; my @good_spans = (); if ($valid_chars) { @good_spans = split (/[^$valid_chars]/, $str); # pattern match done once # instead of for every k-mer } else { # put whole string as first element $good_spans[0] = $str; } foreach my $good_span (@good_spans) { # print STDERR "good_span: $good_span\n"; my $span_len = length($good_span); next if ($span_len < $word_len); my $i; for ( $i = 0; $i <= $span_len - $word_len; $i++ ) { $oligos{ substr $good_span, $i, $word_len } = undef; } } @oligos = keys %oligos; ## use for testing: # foreach my $o (@oligos) { # if ($o =~ /[^ACGT]/) { # print STDERR $o . "\n"; # } # } wantarray ? return @oligos : \@oligos; } =head2 _nearest_index Finds the index of a given sorted array where a given number would fit in the sorted order. The returned index can then be used to insert the number, for example. The array may contain integers and/or floats, same for the number. =cut sub _nearest_index { # Niels Larsen, May 2004. my ( $array, # Array of numbers $value, # Lookup value ) = @_; # Returns an integer. my ( $low, $high ) = ( 0, scalar @{ $array } ); my ( $cur ); while ( $low < $high ) { $cur = int ( ( $low + $high ) / 2 ); if ( $array->[$cur]->[1] < $value ) { $low = $cur + 1; } else { $high = $cur; } } return $low; } =head2 _count_records Counts the number of records in a fasta formated file. =cut sub _count_records { # Todd Z. DeSantis, July 2010. my ( $fasta_file ) = @_; # Returns an integer. my $c = 0; my $ffh = new IO::File $fasta_file, "r"; while (<$ffh>) { $c++ if ($_ =~ /^\>/); } $ffh->close; return $c; } 1; =head2 _update_scores_C Receives a list of indicies, score vector, and the final index Updates the score vector by incrementing the oligo match count for the correct sequences. =cut __DATA__ =pod =cut __C__ /* Niels Larsen, May 2004. */ static void* get_ptr( SV* obj ) { return SvPVX( SvRV( obj ) ); } /* A sub which takes as input the ScalarVector pointer. Returns a C string. The SvPVX function is part of the Perl internal API and converts SV* to a C string SvRV dereferences a SV when its a perl reference */ // define four macros for compiler to substitute where needed: #define DEF_SCOPTR( str ) int* scoptr = get_ptr( str ) // receives a SV, returns scoptr: a pointer to an integer #define DEF_NDXPTR( str ) int* ndxptr = get_ptr( str ) // receives a SV, returns ndxptr: a pointer to an integer #define FETCH( idx ) ndxptr[ idx ] // receives an integer, looks up an array element in nxptr #define INCR( idx ) scoptr[ idx ]++ // receives an integer, increments the corresponding array element in scoptr void update_scores_C( SV* ndxstr, SV* scovec, int maxndx ) { DEF_NDXPTR( ndxstr ); DEF_SCOPTR( scovec ); int i; for ( i = 0; i <= maxndx; i++ ) { INCR( FETCH(i) ); } } /* helps test the InlineC connection. */ int add(int x, int y) { return x + y; } int subtract(int x, int y) { return x - y; } __END__