#!/usr/bin/perl -w
# Command-line state, filled in by GetOptions below.
my (
    $output,                # -o / --output : file the result is written to
    $append,                # -a / --append : file the result is appended to
    $min_len,               # -l : sequences shorter than this are skipped
    $no_duplicate_check,    # -i : turn off duplicate detection
    $desc_count,            # -n : limit applied when merging duplicate descriptions
    $delimiter,             # -d : separator placed between merged descriptions
    $expect_prefixes,       # -p : arguments are 'file id' pairs, not bare files
    $help,                  # -h : show the documentation and exit
);

# Default separator between merged descriptions; -d overrides it.
$delimiter = ';';

GetOptions(
    'o|output:s' => \$output,
    'a|append:s' => \$append,
    'n:s'        => \$desc_count,
    'l:s'        => \$min_len,
    'd:s'        => \$delimiter,
    'p'          => \$expect_prefixes,
    'i'          => \$no_duplicate_check,
    'h'          => \$help,
);

# -n and -l are read as strings, so validate them by hand: when supplied they
# must be whole numbers of at least 1.  The first message names -n because
# $desc_count is bound to -n above (-d is the delimiter).
die("must supply a positive integer for -n")
    if ( defined $desc_count
    && ( $desc_count !~ /^\d+$/ || $desc_count < 1 ) );

die("must supply a positive integer for -l")
    if ( defined $min_len
    && ( $min_len !~ /^\d+$/ || $min_len < 1 ) );

# List of [ filename, id ] records, populated from the remaining arguments.
my @files;

# With -h, or with no input files at all, hand over to perldoc for this
# script.  exec only returns on failure, in which case we still exit.
if ( $help || !@ARGV ) {
    exec( 'perldoc', $0 );
    exit(0);
}
# Drain @ARGV into @files as [ filename, id ] records.  Without -p every
# argument is a filename and the id is the empty string; with -p arguments
# are consumed two at a time as 'filename id' and a missing (or false) id is
# a fatal error.
while (@ARGV) {
    my $file = shift @ARGV;
    my $id   = '';

    if ($expect_prefixes) {
        $id = shift @ARGV;
        $id or die("Must provide 'name id' pairing of dbfile and id");
    }

    push @files, [ $file, $id ];
}
# Pick the output stream.  -a wins over -o: the former opens its file with a
# '>>' prefix (append), the latter with '>' (overwrite).  When neither names a
# file the stream is created with no arguments.
# NOTE(review): a Bio::SeqIO built without -file presumably writes to STDOUT;
# confirm against the Bio::SeqIO documentation.
#
# The tests use defined/length rather than plain truth so that a file
# literally named "0" is honoured; an option given without a value (empty
# string) still falls through to the next choice.
my $out;
if ( defined $append && length $append ) {
    $out = Bio::SeqIO->new( -file => ">>$append" );
}
elsif ( defined $output && length $output ) {
    $out = Bio::SeqIO->new( -file => ">$output" );
}
else {
    $out = Bio::SeqIO->new();
}
# Read every input file, fold sequences with identical residues into a single
# entry, and write the surviving entries to $out.
#
# Two sequences are duplicates when the MD5 digest of their lower-cased
# residue strings match.  With -i ($no_duplicate_check) a running counter is
# used as the key instead, so no two sequences ever collide.
my %unique;      # key => first sequence object seen with that key
my %seqcount;    # key => number of duplicates seen after the first
my @order;       # keys in first-seen order, so output order is deterministic
my $counter = 0;

foreach my $pair (@files) {
    my ( $file, $id ) = @$pair;
    my $in = Bio::SeqIO->new( -file => $file );

    while ( my $seq = $in->next_seq ) {
        next if defined $min_len && $seq->length < $min_len;

        # Tag the sequence with the database id it came from.
        if ($id) {
            $seq->display_id( "$id:" . $seq->display_id );
        }

        # Only pay for the digest when duplicate detection is active.
        my $md5sum = $no_duplicate_check
            ? $counter++
            : md5_hex( lc( $seq->seq() ) );

        # First time this key is seen: keep the sequence as the representative.
        if ( !defined $unique{$md5sum} ) {
            $unique{$md5sum} = $seq;
            push @order, $md5sum;
            next;
        }

        # Duplicate: honour the -n limit.  The key must not be modified here,
        # it is still needed to look up the representative below.
        $seqcount{$md5sum}++;
        next if defined $desc_count && $seqcount{$md5sum} > $desc_count;

        my $kept      = $unique{$md5sum};
        my $kept_desc = $kept->description;
        my $dup_desc  = $seq->description;
        $kept_desc = '' unless defined $kept_desc;
        $dup_desc  = '' unless defined $dup_desc;

        # display_id already carries the "$id:" prefix when an id was given,
        # so it is not prepended a second time.
        my $id2 = sprintf( "%s %s %s", $delimiter, $seq->display_id, $dup_desc );
        $kept->desc( $kept_desc . $id2 );
    }
}

# Emit the representatives in the order they were first encountered.
foreach my $key (@order) {
    $out->write_seq( $unique{$key} );
}