#!/usr/local/bin/perl -- # -*-Perl-*-
$VERSION
=
'3.4'
;
$DATE
=
'28 Jan 2002'
;
no
strict
"refs"
;
use
vars
qw( $VERSION $DATE %DBS %STYLES $RWGETZ $RGETZ %IDMATCH %IDLIST $XEMBL $FH )
;
BEGIN {
$RWGETZ
=
'/ebi/srs/srs/bin/osf_5/wgetz -e'
;
$RGETZ
=
'/ebi/srs/srs/bin/osf_5/getz -e'
;
$XEMBL
=
"cd /ebi/www/pages/cgi-bin/xembl/; ./XEMBL.pl"
;
%IDMATCH
= (
embl
=>
'ID (\w+)'
,
fasta
=>
'>\w+.(\w+)'
,
medlinefull
=>
'[\n><]MedlineID. ?(\w+)'
,
swissprot
=>
'ID (\w+)'
,
pdb
=>
'.{62}(\w+)'
,
bsml
=>
'DUMMY'
,
agave
=>
'DUMMY'
,
refseq
=>
'LOCUS ([\w_]+)'
);
%DBS
= (
embl
=> {
fields
=> [
'id'
,
'acc'
],
version
=>
'sv'
,
format
=> {
default
=>
'embl'
,
embl
=> 1,
fasta
=>
'FastaSeqs'
,
bsml
=> 1,
agave
=> 1
}
},
medline
=> {
fields
=> [
'id'
],
format
=> {
default
=>
'medlinefull'
,
medlinefull
=>
'MedlineFull'
}
},
ensembl
=> {
fields
=> [
'id'
],
format
=> {
default
=>
'embl'
,
embl
=> 1,
fasta
=>
'FastaSeqs'
}
},
swall
=> {
fields
=> [
'id'
,
'acc'
],
format
=> {
default
=>
'swissprot'
,
swissprot
=> 1,
fasta
=>
'FastaSeqs'
}
},
pdb
=> {
fields
=> [
'id'
],
format
=> {
default
=>
'pdb'
,
pdb
=>
'1'
}
},
refseq
=> {
fields
=> [
'id'
,
'acc'
],
format
=> {
default
=>
'refseq'
,
refseq
=> 1,
fasta
=>
'FastaSeqs'
}
}
);
%STYLES
= (
html
=> 1,
raw
=> 1
);
%IDLIST
= ();
}
my
$q
= new CGI;
my
$debug
= protect(
$q
->param(
'debug'
))
if
$q
->param(
'debug'
);
&debugging
if
not
$q
->user_agent and
$debug
;
if
(
$q
->param(
'id'
) or
$q
->param(
'keywords'
) ) {
my
$value
;
$value
= protect(
$q
->param(
'id'
))
if
$q
->param(
'id'
);
$value
= protect(
$q
->param(
'keywords'
))
if
$q
->param(
'keywords'
);
my
$db
=
lc
protect(
$q
->param(
'db'
));
my
$format
=
lc
protect(
$q
->param(
'format'
));
my
$style
=
lc
protect(
$q
->param(
'style'
));
$style
||=
'html'
;
input_error(
$q
,
$style
,
"2 Unknown style [$style]."
)
unless
$STYLES
{
$style
};
$db
||=
'embl'
;
input_error(
$q
,
$style
,
"1 Unknown database [$db]."
)
unless
$DBS
{
$db
};
$format
||=
$DBS
{
$db
}{
format
}{
default
};
input_error(
$q
,
$style
,
"3 Format [$format] not known for database [$db]"
)
unless
$DBS
{
$db
}{
format
}{
$format
};
$format
=
$DBS
{
$db
}{
format
}{
default
}
if
$format
eq
'default'
;
input_error(
$q
,
$style
,
"1 Unknown database [$db]."
)
if
(
$format
eq
'bsml'
or
$format
eq
'agave'
) and
$db
ne
'embl'
;
$style
= (
$format
=~ /(bsml|agave)/i) ?
'xml'
:
$style
;
if
(
$style
eq
'html'
) {
print
$q
->header(
-type
=>
'text/html'
,
-charset
=>
'UTF-8'
);
}
elsif
(
$style
eq
'raw'
) {
print
"Content-Type: text/plain; charset=UTF-8\n\n"
;
}
$FH
= tempfile(
'dbfetchXXXXXX'
,
DIR
=> TMPDIR,
UNLINK
=> 1 );
my
@ids
=
split
(/ /,
$value
);
input_error(
$q
,
$style
,
"6 Too many IDs ["
.
scalar
@ids
.
"]. Max ["
. MAXIDS.
"] allowed."
)
if
scalar
@ids
> MAXIDS;
if
(
$style
eq
'xml'
) {
&xml
(
$format
,
@ids
);
}
else
{
my
$counter
;
foreach
my
$id
(
@ids
) {
&$style
(
$db
,
$id
,
$format
);
}
no_entries(
$q
,
$value
)
if
$style
eq
'html'
and
tell
(
$FH
) == 0;
}
seek
$FH
, 0, 0;
print
'<pre>'
if
$style
eq
'html'
;
print
$_
while
<
$FH
>;
}
else
{
print_prompt(
$q
);
}
sub
print_prompt {
print
$q
->header(),
$q
->start_html(
-title
=>
'DB Entry Retrieval'
,
-bgcolor
=>
'white'
,
-author
=>
'heikki-at-bioperl-dot-org'
),
'<IMG align=middle SRC="/icons/ebibanner.gif">'
,
$q
->h1(
'Generic DB Entry Retrieval'
),
$q
->p(
"This page allows you to retrieve up to "
. MAXIDS .
" entries at the time from various up-to-date biological databases."
),
$q
->p("For EMBL, enter an accession number (e.g. J00231) or entry name (e.g.
BUM) or a sequence version (e.g. J00231.1), or any combination of them
separated by a non-word character into your browser's search dialog.
SWALL examples are: fos_human, p53_human.
For short Ensembl entries,
try
: AL122059, AL031002, AL031030 .
'Random'
Medline entry examples are: 20063307, 98276153.
PDB entry examples are: 100D, 1FOS. Try NM_006732
for
RefSeq.
Only one copy of the latest version of the entry is returned."),
$q
->hr,
$q
->startform,
$q
->popup_menu(
-name
=>
'db'
,
-values
=> [
'EMBL'
,
'SWALL'
,
'PDB'
,
'Medline'
,
'Ensembl'
,
'RefSeq'
]),
$q
->textfield(
-name
=>
'id'
,
-size
=> 40,
-maxlength
=> 1000),
$q
->popup_menu(
-name
=>
'format'
,
-values
=> [
'default'
,
'Fasta'
,
'bsml'
,
'agave'
]),
$q
->popup_menu(
-name
=>
'style'
,
-values
=> [
'html'
,
'raw'
]),
$q
->submit(
'Retrieve'
),
$q
->endform,
$q
->hr,
$q
->h2(
'Direct access'
),
$q
->p(
'For backward compatibility, the script defaults to EMBL:'
),
$q
->p(
'but the preferred way of calling it is:'
),
$q
->p('which can be extended to retrieve entries in alternative sequence formats
and other databases:'),
$q
->p('Set style to <code>raw</code> to retrieve plain text entries
for
computational purposes
and saving to disk:'),
$q
->p('There is now the possibility to retrieve EMBL sequences formatterd into two XML standards:
Bsml (Bioinformatic Sequence Markup Language - from
Labbook, Inc.) or as AGAVE (Architecture
for
Genomic Annotation,
Visualisation, and Exchange - from Labbook, Inc.). To
do
this,
use
the
formats \
'bsml\' or \'agave\', as follows:'
),
$q
->p(
"Version numbers are not supported with the XML retrieval."
),
$q
->hr,
$q
->address(
"Version $VERSION, $DATE, <a href=\"mailto:support\@ebi.ac.uk\">support\@ebi.ac.uk</a>"
),
$q
->end_html,
"\n"
;
}
sub
protect {
my
(
$s
) =
@_
;
$s
=~ s![^\w\.\_]+! !g;
$s
=~ s|^\W+||;
$s
=~ s|\W+$||;
return
$s
;
}
sub
input_error {
my
(
$q
,
$style
,
$s
) =
@_
;
if
(
$style
eq
'html'
) {
print
$q
->header,
$q
->start_html(
-title
=>
'DB Entry Retrieval: Input error'
,
-bgcolor
=>
'white'
),
"<h2>ERROR in input:<h2>$s\n"
,
$q
->end_html,
"\n"
;
}
else
{
print
"Content-type: text/plain\n\n"
,
"ERROR $s\n"
;
}
exit
0;
}
sub
no_entries {
my
(
$q
,
$value
) =
@_
;
print
$q
->start_html(
-title
=>
'DB Entry Retrieval: Input warning'
,
-bgcolor
=>
'white'
),
"<h2>Sorry, your query retrieved no entries.</h2>"
,
"Entries with [$value] where not found."
,
"Please go back or press <a href=\"dbfetch\"><b>here</b></a> to try again"
,
$q
->end_html,
"\n"
;
exit
0;
}
sub
raw {
my
(
$db
,
$value
,
$format
) =
@_
;
my
(
$srsq
,
$qdb
,
$entry
,
$id
);
my
(
$seqformat
) =
''
;
$seqformat
=
'-view '
.
$DBS
{
$db
}{
format
}{
$format
}
if
$format
ne
$DBS
{
$db
}{
format
}{
default
};
my
$version
=
''
;
$value
=~ /(.+)\.(.+)/;
$version
= $2
if
$2;
$value
= $1
if
$1;
$qdb
=
$db
;
$srsq
=
''
;
foreach
my
$field
(@{
$DBS
{
$db
}{fields}}) {
$srsq
.=
" [$qdb-$field:$value] |"
;
}
chop
$srsq
;
if
(
$version
) {
my
$vfname
=
$DBS
{
$db
}{version};
$srsq
=
"[$qdb-$vfname:$version] & ("
.
$srsq
.
")"
}
$entry
= `rsh srs
"$RGETZ $seqformat '$srsq'"
`;
$entry
=~ s|EMBL[^\n]+\n||;
$entry
=~ s|^\s+||g;
$entry
=~ s|\s+$|\n|g;
my
$idmatch
=
$IDMATCH
{
$format
};
(
$id
) =
$entry
=~ /
$idmatch
/;
input_error(
' '
,
'raw'
,
"5 ID [$value] not found in database [$db]."
)
unless
$id
;
print
$FH
$entry
unless
$IDLIST
{
$id
};
$IDLIST
{
$id
} = 1;
}
sub
html {
my
(
$db
,
$value
,
$format
) =
@_
;
my
(
$srsq
,
$qdb
,
$entry
,
$id
,
$idmatch
);
my
(
$seqformat
) =
''
;
$seqformat
=
'-view '
.
$DBS
{
$db
}{
format
}{
$format
}
if
$format
ne
$DBS
{
$db
}{
format
}{
default
};
my
$version
=
''
;
$value
=~ /(.+)\.(.+)/;
$version
= $2
if
$2;
$value
= $1
if
$1;
$seqformat
.=
' -vn 2 '
if
$db
eq
'swall'
or
$db
eq
'refseq'
;
$qdb
=
$db
;
$srsq
=
''
;
foreach
my
$field
(@{
$DBS
{
$db
}{fields}}) {
$srsq
.=
" [$qdb-$field:$value] |"
;
}
chop
$srsq
;
if
(
$version
) {
my
$vfname
=
$DBS
{
$db
}{version};
$srsq
=
"[$qdb-$vfname:$version] & ("
.
$srsq
.
")"
}
$entry
= `rsh srs
"$RWGETZ $seqformat '$srsq'"
`;
return
if
$entry
=~ /SRS error/;
$entry
=~ s|^Content-type:[^\n]+\n||;
$entry
=~ s|\n<A HREF[^\n]+\n||;
$entry
=~ s/\+-e\
"/\+-e/g; #"
$entry
=~ s|<BR>||g;
$entry
=~ s|</?pre>||g;
$entry
=~ s|\n+|\n|g;
$entry
=~ s|^\n+||g;
$idmatch
=
$IDMATCH
{
$format
};
(
$id
) =
$entry
=~ /
$idmatch
/;
print
$FH
$entry
unless
$IDLIST
{
$id
};
$IDLIST
{
$id
} = 1;
}
sub
xml {
my
(
$format
,
@ids
) =
@_
;
my
(
$entry
,
$id
,
$content
,
$counter
,
$reg
);
$content
= (
$ENV
{
'HTTP_USER_AGENT'
} =~ /MSIE/) ?
"Content-type: text/xml\n\n"
:
"Content-type: text/plain\n\n"
;
$entry
=
"--format "
.((
$format
eq
"bsml"
) ?
"Bsml"
:
"sciobj"
) .
" "
.
join
(
" "
,
@ids
);
$entry
= `rsh mercury
"$XEMBL $entry"
`;
$reg
= ((
$format
eq
"bsml"
) ?
'<Sequence id='
:
'<contig length-'
);
$counter
++
while
$entry
=~ /(
$reg
)/g;
foreach
my
$idl
(
@ids
) {
input_error(
$q
,
" "
,
"5 ID [$idl] not found in database [embl]."
)
if
(
$format
eq
"bsml"
&&
$entry
=~
"NOT EXIST: $idl"
) ||
(
$format
eq
"agave"
&&
$entry
=~
"NOT FOUND: $idl"
)
}
print
$FH
(
$content
.
$entry
);
}
sub
debugging {
foreach
my
$db
(
keys
%DBS
) {
my
$status
= 1;
print
"ERROR: [$db]: no SRS fields defined."
.
" Give an array of field names?\n"
and
$status
= 0
unless
$DBS
{
$db
}{fields};
print
"ERROR: [$db]: SRS fields are not defined as an array.\n"
and
$status
= 0
unless
ref
$DBS
{
$db
}{fields} eq
'ARRAY'
;
print
"ERROR: [$db]: no formats defined.\n"
and
$status
= 0
unless
$DBS
{
$db
}{
format
};
print
"ERROR: [$db]: no default format defined.\n"
and
$status
= 0
unless
$DBS
{
$db
}{
format
}{
default
};
my
$format
=
$DBS
{
$db
}{
format
}{
default
};
print
"ERROR: [$db]: no format [$format] defined."
.
" You declared it as a default and only.\n"
and
$status
= 0
unless
$DBS
{
$db
}{
format
}{
$format
};
foreach
my
$dbformat
(
keys
%{
$DBS
{
$db
}{
format
}}) {
print
"ERROR: [$db]: format [$format] not defined in %IDMATCH.\n"
and
$status
= 0
unless
$IDMATCH
{
$dbformat
} or
$dbformat
eq
'default'
;
}
printf
"%-12s%s"
,
"[$db]"
,
"OK\n"
if
$status
;
}
exit
;
}