#!/usr/bin/perl
use
constant
FATTRIBUTE
=>
'fattribute'
;
use
constant
FATTRIBUTE_TO_FEATURE
=>
'fattribute_to_feature'
;
my
$DO_FAST
=
eval
"use POSIX 'WNOHANG'; 1;"
;
Hide Show 95 lines of Pod
$Bio::DB::GFF::Adaptor::faux::VERSION
=
'1.7.4'
;
@ISA
=
'Bio::DB::GFF::Adaptor::dbi::mysqlopt'
;
sub
insert_sequence {
my
$self
=
shift
;
my
(
$id
,
$offset
,
$seq
) =
@_
;
print
join
"\t"
,
$id
,
$offset
,
$seq
,
"\n"
;
}
$main::VERSION
=
'1.7.4'
;
eval
"use Time::HiRes"
;
undef
$@;
my
$timer
=
defined
&Time::HiRes::time
;
my
(
$DSN
,
$CREATE
,
$USER
,
$PASSWORD
,
$FASTA
,
$FAILED
,
$LOCAL
,
%PID
,
$MAX_BIN
,
$GROUP_TAG
,
$LONG_LIST
,
$MUNGE
,
$TMPDIR
,
$SUMMARY_STATS
);
if
(
$DO_FAST
) {
$SIG
{CHLD} =
sub
{
while
((
my
$child
=
waitpid
(-1,
&WNOHANG
)) > 0) {
delete
$PID
{
$child
} or
next
;
$FAILED
++
if
$? != 0;
}
}
};
$SIG
{INT} =
$SIG
{TERM} =
sub
{cleanup();
exit
-1};
GetOptions (
'database:s'
=> \
$DSN
,
'create'
=> \
$CREATE
,
'user:s'
=> \
$USER
,
'local'
=> \
$LOCAL
,
'password:s'
=> \
$PASSWORD
,
'fasta:s'
=> \
$FASTA
,
'group:s'
=> \
$GROUP_TAG
,
'long_list:s'
=> \
$LONG_LIST
,
'maxbin|maxfeature:s'
=> \
$MAX_BIN
,
'gff3_munge'
=> \
$MUNGE
,
'summary'
=> \
$SUMMARY_STATS
,
'Temporary:s'
=> \
$TMPDIR
,
) or (
system
(
'pod2text'
,$0),
exit
-1);
$DSN
||=
'test'
;
$MAX_BIN
||= 1_000_000_000;
my
(
@args
,
$AUTH
);
if
(
defined
$USER
) {
push
@args
,(
-user
=>
$USER
);
$AUTH
.=
" -u$USER"
;
}
if
(
defined
$PASSWORD
) {
push
@args
,(
-pass
=>
$PASSWORD
);
$AUTH
.=
" -p$PASSWORD"
;
}
push
@args
,(
-preferred_groups
=>[
split
(/[,\s+]+/,
$GROUP_TAG
)])
if
defined
$GROUP_TAG
;
my
$db
= Bio::DB::GFF->new(
-adaptor
=>
'faux'
,
-dsn
=>
$DSN
,
@args
)
or
die
"Can't open database: "
,Bio::DB::GFF->error,
"\n"
;
$db
->gff3_name_munging(1)
if
$MUNGE
;
if
(
$CREATE
) {
$SUMMARY_STATS
++;
$MAX_BIN
?
$db
->initialize(
-erase
=>1,
-MAX_BIN
=>
$MAX_BIN
) :
$db
->initialize(1);
}
$MAX_BIN
||=
$db
->meta(
'max_bin'
) || 100_000_000;
if
(
$LONG_LIST
) {
-d
$LONG_LIST
or
die
"The --long_list argument must be a directory\n"
;
opendir
GFFDIR,
$LONG_LIST
or
die
"Could not open $LONG_LIST for reading: $!"
;
@ARGV
=
map
{
"$LONG_LIST\/$_"
}
readdir
GFFDIR;
closedir
GFFDIR;
if
(
defined
$FASTA
&& -d
$FASTA
) {
opendir
FASTA,
$FASTA
or
die
"Could not open $FASTA for reading: $!"
;
push
@ARGV
,
map
{
"$FASTA\/$_"
}
readdir
FASTA;
closedir
FASTA;
}
}
foreach
(
@ARGV
) {
$_
=
"gunzip -c $_ |"
if
/\.gz$/;
$_
=
"uncompress -c $_ |"
if
/\.Z$/;
$_
=
"bunzip2 -c $_ |"
if
/\.bz2$/;
}
my
(
@fasta
,
@gff
);
foreach
(
@ARGV
) {
if
(/\.(fa|fasta|dna|seq|fast)(?:\.|$)/i) {
push
@fasta
,
$_
;
}
else
{
push
@gff
,
$_
;
}
}
@ARGV
=
@gff
;
push
@fasta
,
$FASTA
if
defined
$FASTA
;
my
$FID
= 1;
my
$GID
= 1;
my
$FTYPEID
= 1;
my
$ATTRIBUTEID
= 1;
my
%GROUPID
= ();
my
%FTYPEID
= ();
my
%ATTRIBUTEID
= ();
my
%DONE
= ();
my
$FEATURES
= 0;
load_tables(
$db
->dbh)
unless
$CREATE
;
my
(
$major
,
$minor
,
$sub
) =
split
/\./,
$db
->dbh->get_info(18);
my
$can_disable_indexes
= (
$major
>= 4 and
$minor
>= 0);
my
(
%FH
,
%COMMAND
);
my
$MYSQL
= MYSQL;
my
$tmpdir
=
$TMPDIR
||
$ENV
{TMPDIR} ||
$ENV
{TMP} || File::Spec->tmpdir();
-d
$tmpdir
or
die
<<END;
I could not find a suitable temporary directory to write scratch files into ($tmpdir by default).
Please select a directory and indicate its location by setting the TMP environment variable, or
by using the --Temporary switch.
END
my
@fasta_files_to_be_unlinked
;
my
@files
= (FDATA,FTYPE,FGROUP,FDNA,FATTRIBUTE,FATTRIBUTE_TO_FEATURE);
foreach
(
@files
) {
my
$file
=
"$tmpdir/$_.$$"
;
print
STDERR
"creating load file $file..."
;
$DO_FAST
&&= (
system
(
"mkfifo $file"
) == 0);
print
STDERR
"ok\n"
;
my
$delete
=
$CREATE
?
"delete from $_"
:
''
;
my
$local
=
$LOCAL
?
'local'
:
''
;
my
$analyze
=
"analyze table $_"
;
my
$command
=
<<END;
$MYSQL $AUTH
-N
-s
-e "lock tables $_ write; $delete; load data $local infile '$file' replace into table $_; unlock tables; $analyze"
$DSN
END
;
$command
=~ s/\n/ /g;
$COMMAND
{
$_
} =
$command
;
if
(
$DO_FAST
) {
if
(
my
$pid
=
fork
) {
$PID
{
$pid
} =
$_
;
print
STDERR
"pausing for 0.5 sec..."
if
$DO_FAST
;
select
(
undef
,
undef
,
undef
,0.50);
print
STDERR
"ok\n"
;
}
else
{
die
"Couldn't fork: $!"
unless
defined
$pid
;
exec
$command
||
die
"Couldn't exec: $!"
;
exit
0;
}
}
print
STDERR
"opening load file for writing..."
;
$FH
{
$_
} = IO::File->new(
$file
,
'>'
) or
die
$_
,
": $!"
;
print
STDERR
"ok\n"
;
$FH
{
$_
}->autoflush;
}
print
STDERR
"Fast loading enabled\n"
if
$DO_FAST
;
my
(
$count
,
$gff3
,
$last
,
$start
,
$beginning
,
$current_file
);
$last
= Time::HiRes::
time
()
if
$timer
;
$beginning
=
$start
=
$last
;
if
(!
@ARGV
) {
$FH
{NULL} = IO::File->new(
">$tmpdir/null"
);
push
@ARGV
,
"$tmpdir/null"
;
}
while
(<>) {
$current_file
||=
$ARGV
;
unless
(
$current_file
eq
$ARGV
) {
undef
$gff3
;
$current_file
=
$ARGV
;
}
chomp
;
my
(
$ref
,
$source
,
$method
,
$start
,
$stop
,
$score
,
$strand
,
$phase
,
$group
);
if
( /^\
$FH
{FASTA}->
close
;
delete
$FH
{FASTA};
}
if
(
defined
$FH
{FASTA} ) {
$FH
{FASTA}->
print
(
"$_\n"
);
next
;
}
elsif
(/^>(\S+)/) {
$FH
{FASTA} = IO::File->new(
">$tmpdir/$1\.fa"
) or
die
"FASTA: $!\n"
;
$FH
{FASTA}->
print
(
"$_\n"
);
push
@fasta
,
"$tmpdir/$1\.fa"
;
push
@fasta_files_to_be_unlinked
,
"$tmpdir/$1\.fa"
;
print
STDERR
"Processing embedded sequence $1\n"
;
next
;
}
elsif
(/^\
$db
->preferred_groups(
split
(/\s+/,$1));
next
;
}
elsif
(/^\
$gff3
= ($1 >= 3);
$db
->print_gff3_warning()
if
$gff3
;
next
;
}
elsif
(/^\
(
$ref
,
$source
,
$method
,
$start
,
$stop
,
$score
,
$strand
,
$phase
,
$group
) =
($1,
'reference'
,
'Component'
,$2,$3,
'.'
,
'.'
,
'.'
,
$gff3
?
"ID=Sequence:$1"
:
qq(Sequence "$1")
);
}
elsif
(/^\
next
;
}
else
{
(
$ref
,
$source
,
$method
,
$start
,
$stop
,
$score
,
$strand
,
$phase
,
$group
) =
split
"\t"
;
}
next
unless
defined
$ref
;
$FEATURES
++;
warn
"Feature $group is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n"
if
$stop
-
$start
+1 >
$MAX_BIN
;
$source
=
'\N'
unless
defined
$source
;
$score
=
'\N'
if
$score
eq
'.'
;
$strand
=
'\N'
if
$strand
eq
'.'
;
$phase
=
'\N'
if
$phase
eq
'.'
;
my
(
$gclass
,
$gname
,
$target_start
,
$target_stop
,
$attributes
) =
$db
->split_group(
$group
,
$gff3
);
$gclass
= [
$gclass
]
unless
ref
$gclass
;
$gname
= [
$gname
]
unless
ref
$gname
;
for
(
my
$i
=0;
$i
<
@$gname
;
$i
++) {
my
$group_class
=
$gclass
->[
$i
];
my
$group_name
=
$gname
->[
$i
];
$group_class
||=
'\N'
;
$group_name
||=
'\N'
;
$target_start
||=
'\N'
;
$target_stop
||=
'\N'
;
$method
||=
'\N'
;
$source
||=
'\N'
;
my
$fid
=
$FID
++;
my
$gid
=
$GROUPID
{
lc
join
($;,
$group_class
,
$group_name
)} ||=
$GID
++;
my
$ftypeid
=
$FTYPEID
{
lc
join
($;,
$source
,
$method
)} ||=
$FTYPEID
++;
my
$bin
= bin(
$start
,
$stop
,
$db
->min_bin);
$FH
{ FDATA() }->
print
(
join
(
"\t"
,
$fid
,
$ref
,
$start
,
$stop
,
$bin
,
$ftypeid
,
$score
,
$strand
,
$phase
,
$gid
,
$target_start
,
$target_stop
),
"\n"
);
$FH
{ FGROUP() }->
print
(
join
(
"\t"
,
$gid
,
$group_class
,
$group_name
),
"\n"
)
unless
$DONE
{
"fgroup$;$gid"
}++;
$FH
{ FTYPE() }->
print
(
join
(
"\t"
,
$ftypeid
,
$method
,
$source
),
"\n"
)
unless
$DONE
{
"ftype$;$ftypeid"
}++;
foreach
(
@$attributes
) {
my
(
$key
,
$value
) =
@$_
;
my
$attributeid
=
$ATTRIBUTEID
{
lc
$key
} ||=
$ATTRIBUTEID
++;
$FH
{ FATTRIBUTE() }->
print
(
join
(
"\t"
,
$attributeid
,
$key
),
"\n"
)
unless
$DONE
{
"fattribute$;$attributeid"
}++;
$FH
{ FATTRIBUTE_TO_FEATURE() }->
print
(
join
(
"\t"
,
$fid
,
$attributeid
,
$value
),
"\n"
);
}
if
(
$FEATURES
% 1000 == 0) {
my
$now
= Time::HiRes::
time
()
if
$timer
;
my
$elapsed
=
$timer
?
sprintf
(
" in %5.2fs"
,
$now
-
$last
) :
''
;
$last
=
$now
;
print
STDERR
"$fid features parsed$elapsed..."
;
print
STDERR -t STDOUT && !
$ENV
{EMACS} ?
"\r"
:
"\n"
;
}
}
}
$FH
{FASTA}->
close
if
exists
$FH
{FASTA};
printf
STDERR
"Feature load time %5.2fs\n"
,(Time::HiRes::
time
() -
$start
)
if
$timer
;
$start
=
time
();
for
my
$fasta
(
@fasta
) {
warn
"Loading fasta "
,(-d
$fasta
?
"directory"
:
"file"
),
" $fasta\n"
;
my
$old
=
select
(
$FH
{FDNA()});
my
$loaded
=
$db
->load_fasta(
$fasta
);
warn
"$fasta: $loaded records loaded\n"
;
select
$old
;
}
printf
STDERR
"Fasta load time %5.2fs\n"
,(Time::HiRes::
time
() -
$start
)
if
$timer
;
$start
=
time
();
my
$success
= 1;
if
(
$DO_FAST
) {
warn
"Indexing and analyzing tables. This may take some time (you may see database messages during the process)...\n"
;
}
$_
->
close
foreach
values
%FH
;
if
(!
$DO_FAST
) {
warn
"Loading feature data and analyzing tables. You may see database messages here...\n"
;
$success
&&=
system
(
$COMMAND
{
$_
}) == 0
foreach
@files
;
}
while
(
%PID
) {
sleep
;
}
$success
&&= !
$FAILED
;
cleanup();
printf
STDERR
"Total parse & load time %5.2fs\n"
,(Time::HiRes::
time
() -
$beginning
)
if
$timer
;
if
(
$success
) {
print
"SUCCESS: $FEATURES features successfully loaded\n"
;
exit
0;
}
else
{
print
"FAILURE: Please see standard error for details\n"
;
exit
-1;
}
if
(
$SUMMARY_STATS
) {
warn
"Building summary statistics for coverage histograms...\n"
;
$db
->build_summary_statistics;
}
exit
0;
sub
cleanup {
foreach
(
@files
,
@fasta_files_to_be_unlinked
) {
unlink
"$tmpdir/$_.$$"
;
}
}
sub
load_tables {
my
$dbh
=
shift
;
print
STDERR
"loading normalized group, type and attribute information..."
;
$FID
= 1 + get_max_id(
$dbh
,
'fdata'
,
'fid'
);
$GID
= 1 + get_max_id(
$dbh
,
'fgroup'
,
'gid'
);
$FTYPEID
= 1 + get_max_id(
$dbh
,
'ftype'
,
'ftypeid'
);
$ATTRIBUTEID
= 1 + get_max_id(
$dbh
,
'fattribute'
,
'fattribute_id'
);
get_ids(
$dbh
,\
%DONE
,\
%GROUPID
,
'fgroup'
,
'gid'
,
'gclass'
,
'gname'
);
get_ids(
$dbh
,\
%DONE
,\
%FTYPEID
,
'ftype'
,
'ftypeid'
,
'fsource'
,
'fmethod'
);
get_ids(
$dbh
,\
%DONE
,\
%ATTRIBUTEID
,
'fattribute'
,
'fattribute_id'
,
'fattribute_name'
);
print
STDERR
"ok\n"
;
}
sub
get_max_id {
my
$dbh
=
shift
;
my
(
$table
,
$id
) =
@_
;
my
$sql
=
"select max($id) from $table"
;
my
$result
=
$dbh
->selectcol_arrayref(
$sql
) or
die
$dbh
->errstr;
$result
->[0];
}
sub
get_ids {
my
$dbh
=
shift
;
my
(
$done
,
$idhash
,
$table
,
$id
,
@columns
) =
@_
;
my
$columns
=
join
','
,
$id
,
@columns
;
my
$sql
=
"select $columns from $table"
;
my
$sth
=
$dbh
->prepare(
$sql
) or
die
$dbh
->errstr;
$sth
->execute or
die
$dbh
->errstr;
while
(
my
(
$id
,
@cols
) =
$sth
->fetchrow_array) {
my
$key
=
lc
join
$;,
@cols
;
$idhash
->{
$key
} =
$id
;
$done
->{
$table
,
$id
}++;
}
}