#!perl
use
strict;
use
Data::Dumper;
use
Carp;
#
# This is a SAS Component
#

=head1 all_entities_Family

Return all instances of the Family entity.

The KBase will support the maintenance of protein families (as sets of Features
with associated translations). We are initially only supporting the notion of a family
as composed of a set of isofunctional homologs. That is, the families we
initially support should be thought of as containing protein-encoding genes whose
associated sequences all implement the same function
(we do understand that the notion of "function" is somewhat ambiguous, so let
us sweep this under the rug by calling a functional role a "primitive concept").

We currently support families in which the members are
translations of features, and we think of Features as
having an associated function. Identical protein sequences
as products of translating distinct genes may or may not
have identical functions, and we allow multiple members of
the same Family to share identical protein sequences. This
may be justified, since in a very, very, very few cases
identical proteins do, in fact, have distinct functions.
We would prefer to reach the point where our Families are
sets of protein sequences, rather than sets of
protein-encoding Features.

Example:

    all_entities_Family -a

would retrieve all entities of type Family and include all fields
in the entities in the output.


=head2 Related entities

The Family entity has the following relationship links:

=over 4

=item HasMember Feature

=item HasProteinMember ProteinSequence

=item IsCoupledTo Family

=item IsCoupledWith Family

=item IsFamilyFor Role

=item IsRepresentedIn Genome

=back


=head2 Command-Line Options

=over 4

=item -a

Return all fields.

=item -h

Display a list of the fields available for use.

=item -show-fields

Same as -h.

=item -fields field-list

Choose a set of fields to return. Field-list is a comma-separated list of
strings. This option cannot be combined with -a. The following fields are
available:

=over 4

=item type

=item release

=item family_function

=item alignment

=back

=back


=head2 Output Format

The standard output is a tab-delimited file. Each line holds the key of one
Family entity followed by one column for each requested field, in the order
in which the fields were requested. A field that holds a list of values is
written as a single comma-separated column.

=cut

use
Getopt::Long;
# Fields of the Family entity that can be requested. The order here is the
# order used when the fields are listed for the user and when -a selects
# every field.
my @all_fields = qw(type release family_function alignment);

# Lookup table (field name => 1) used to validate names passed to -fields.
my %all_fields;
@all_fields{@all_fields} = (1) x @all_fields;

# Synopsis printed when the command line is malformed.
my $usage = 'usage: all_entities_Family [-show-fields] [-a | -f field list] > entity.data';
# Command-line state. These variables are handed by reference to the client
# constructor below and are read by the code that follows, so their names are
# kept unchanged (including $a, which shadows the sort() variable of the same
# name within this file).
my $a;              # set by -a: return every field
my $f;              # set by -fields: comma-separated list of field names
my @fields;         # the fields that will actually be requested
my $show_fields;    # set by -h or -show-fields: list the fields and exit

# The client class is invoked by name below, so it has to be loaded first;
# without this the method call fails at run time. require does nothing if the
# module has already been loaded.
require Bio::KBase::CDMI::CDMIClient;

# Create the CDMI client object used for all entity queries. The option
# specifications are passed to the constructor; presumably it parses @ARGV
# with Getopt::Long and stores the results through these references --
# confirm against Bio::KBase::CDMI::CDMIClient. Both -h and -show-fields
# write to the same variable.
my $geO = Bio::KBase::CDMI::CDMIClient->new_get_entity_for_script(
    "a"           => \$a,
    "show-fields" => \$show_fields,
    "h"           => \$show_fields,
    "fields=s"    => \$f,
);
# -h / -show-fields: list the selectable fields on stderr and stop
# successfully.
if ($show_fields) {
    print STDERR "Available fields: @all_fields\n";
    exit 0;
}

# Reject stray positional arguments, and reject -a (all fields) given together
# with an explicit -fields list. The usage line goes to stderr and the exit
# status is non-zero.
if (@ARGV != 0 || ($a && $f)) {
    print STDERR $usage, "\n";
    exit 1;
}
# Decide which fields to request. -a selects every known field; -fields
# selects the listed names after validating them. With neither option
# @fields is left empty.
if ($a) {
    @fields = @all_fields;
}
elsif ($f) {
    my @err;    # names from -fields that are not known fields

    # Keep valid names in the order the user gave them; collect the rest so
    # that all mistakes can be reported in a single message.
    for my $field (split(",", $f)) {
        if (!$all_fields{$field}) {
            push(@err, $field);
        }
        else {
            push(@fields, $field);
        }
    }

    if (@err) {
        print STDERR "all_entities_Family: unknown fields @err. Valid fields are: @all_fields\n";
        exit 1;
    }
}
# Fetch the entities page by page and write one tab-delimited line per entity
# to stdout. $start and $count are passed to the client as the paging
# arguments (presumably offset and maximum page size -- confirm against the
# client API).
my $start = 0;
my $count = 1_000_000;

my $h = $geO->all_entities_Family($start, $count, \@fields);

# Each call returns a hash reference; an empty hash ends the loop.
while (%$h) {
    # $k is the entity key; $v is a hash reference indexed by field name.
    while (my ($k, $v) = each %$h) {
        # Array-valued fields are collapsed into one comma-separated column
        # so that every entity occupies exactly one output line.
        my @columns = map { ref($_) eq 'ARRAY' ? join(",", @$_) : $_ } @$v{@fields};
        print join("\t", $k, @columns), "\n";
    }

    # Advance to the next page.
    $start += $count;
    $h = $geO->all_entities_Family($start, $count, \@fields);
}