#!/usr/bin/env perl
use v5.20;    # also switches on "use strict"
use warnings;
use feature qw(signatures);    # the subs in this file declare their parameters as signatures
no warnings qw(experimental::signatures);

use Data::Dumper;
use Getopt::Long;
use Mojo::UserAgent;
use Pod::Usage;
# ---------------------------------------------------------------------------
# Command-line options
# ---------------------------------------------------------------------------

# Plain switches.
my ($help, $man, $dry, $verbose);

# Query parameters together with the defaults used when the matching option
# is not given on the command line.
my ($sortBy, $sortOrder) = qw(submittedDate descending);
my ($id_list, $start, $max_results) = ("", 0, 200);

# Output-format switches. "amsrefs" and "raw" have no destination of their
# own in the GetOptions call below, so Getopt::Long records them in this hash.
my %type;

# Label requested with -l/--label; stays undefined unless the option is given.
my $label;

# A ":s" / ":i" specification makes the option's value optional.
GetOptions(
    \%type,
    "h|help"        => \$help,
    "man"           => \$man,
    "n|dry"         => \$dry,
    "v|verbose"     => \$verbose,
    "amsrefs",
    "raw",
    "l|label:s"     => \$label,
    "sortBy:s"      => \$sortBy,
    "sortOrder:s"   => \$sortOrder,
    "id_list:s"     => \$id_list,
    "start:i"       => \$start,
    "max_results:i" => \$max_results,
) or pod2usage(-verbose => 0);

# pod2usage() ends the program itself, so nothing placed after the call ever
# runs.  Without an explicit -exitval it exits with status 1 whenever the
# verbosity is 1 or more, which would make a successful request for help look
# like a failure; ask for status 0 explicitly instead.
pod2usage(-verbose => 1, -exitval => 0) if $help;
pod2usage(-verbose => 2, -exitval => 0) if $man;
# Convert one <entry> element of the result feed into a plain hash.
#
# Parameters:
#   $entry - object representing a single entry; only its at() and find()
#            methods are used (the script passes the nodes found in the
#            response DOM).
#
# Returns a reference to a hash with the keys
#   id, updated, published, title, summary
#       text of the child element of the same name, or '' when the entry has
#       no such child;
#   authors
#       collection with the text of every element matching 'author name';
#   category
#       collection with the 'term' attribute of every 'category' element;
#   abs, pdf
#       'href' attribute of the link whose type is text/html respectively
#       application/pdf, or undef when the entry has no such link.
#
# at() yields undef for an element that is not there; every lookup is checked
# so that one incomplete entry does not abort the whole run with
# "Can't call method ... on an undefined value".
sub entry2hsh($entry) {
    my %h;

    for my $field (qw(id updated published title summary)) {
        my $node = $entry->at($field);
        $h{$field} = defined $node ? $node->text : '';
    }

    $h{'authors'}  = $entry->find('author name')->map('text');
    $h{'category'} = $entry->find('category')->map(sub { $_->{'term'} });

    for my $pair ([abs => 'text/html'], [pdf => 'application/pdf']) {
        my ($key, $mime) = @$pair;
        my $link = $entry->at(qq{link[type="$mime"]});
        $h{$key} = defined $link ? $link->{'href'} : undef;
    }

    return \%h;
}
# Reduce an entry hash to the five values the formatters print.
#
# Parameters:
#   $h     - hash reference with the keys 'authors' (an object offering a
#            join() method), 'title', 'published' and 'id';
#   $label - label to use for the entry, or undef to derive one from the id.
#
# Returns an array reference [ label, authors, title, year, id ], where
#   authors is whatever $h->{authors}->join(', ') gives back,
#   year    is the run of digits at the very start of 'published',
#   label   is, unless supplied, everything after the last '/' on a line of
#           the id (a false value when the id holds no '/').
sub hsh2strings($h, $label) {
    my ($id, $title, $published) = @{$h}{qw(id title published)};
    my $authors = $h->{'authors'}->join(', ');

    # /^(\d*)/ matches every string, so the capture is always available.
    my ($year) = $published =~ /^(\d*)/;

    if (!defined $label) {
        # On failure the false match result itself becomes the label.
        my $found = $id =~ /.*\/(.*)/;
        $label = $found ? $1 : $found;
    }

    return [ $label, $authors, $title, $year, $id ];
}
# Render an entry hash as a BibTeX "@misc" record.
#
# Parameters:
#   $h     - entry hash reference, handed unchanged to hsh2strings();
#   $label - entry label, or undef to let hsh2strings() choose one.
#
# Returns the record as one string ending in a newline.
sub hsh2bbtx($h, $label) {
    my ($key, $authors, $title, $year, $id) = @{ hsh2strings($h, $label) };

    return <<"EOF";
\@misc{$key,
Author = {$authors},
Title = {$title},
Year = {$year},
note = {$id},
}
EOF
}
# Render an entry hash as an amsrefs "\bib" record of type "misc".
#
# Parameters:
#   $h     - entry hash reference, handed unchanged to hsh2strings();
#   $label - entry label, or undef to let hsh2strings() choose one.
#
# Returns the record as one string ending in a newline.
sub hsh2ams($h, $label) {
    my ($key, $authors, $title, $year, $id) = @{ hsh2strings($h, $label) };

    return <<"EOF";
\\bib{$key}{misc}{
author={$authors},
title={$title},
date={$year},
note={$id},
}
EOF
}
# Turn one feed entry into the text that gets printed for it.
#
# Parameters:
#   $entry - entry object, converted with entry2hsh();
#   $label - label passed on to the formatter (may be undef);
#   $type  - hash reference of format switches; only the presence of the
#            keys 'raw' and 'amsrefs' is looked at, and 'raw' wins when both
#            are there.
#
# Returns the Data::Dumper dump of the entry hash for 'raw', the hsh2ams()
# record for 'amsrefs', and the hsh2bbtx() record otherwise.
sub renderEntry($entry, $label, $type) {
    my $fields = entry2hsh($entry);

    if (exists $type->{raw}) {
        return Dumper($fields);
    }
    if (exists $type->{amsrefs}) {
        return hsh2ams($fields, $label);
    }
    return hsh2bbtx($fields, $label);
}
# ---------------------------------------------------------------------------
# Main: build the query, send it, print every entry that comes back
# ---------------------------------------------------------------------------

# Query endpoint of the arXiv API (see the API user manual).
my $baseURL = 'http://export.arxiv.org/api/query';

my $ua = Mojo::UserAgent->new->max_redirects(3);

# Whatever is left in @ARGV after option parsing is the search query.
my %form = (
    search_query => join(" ", @ARGV),
    id_list      => $id_list,
    start        => $start,
    max_results  => $max_results,
    sortBy       => $sortBy,
    sortOrder    => $sortOrder,
);

my $tx = $ua->build_tx(GET => $baseURL => form => \%form);

# -n/--dry: show the request that was built and stop before sending it.
if ($dry) {
    say Dumper($tx->req);
    exit;
}

$tx = $ua->start($tx);
my $res = $tx->res;

# -v/--verbose: show the response as received (failed ones included) and stop.
if ($verbose) {
    say Dumper($res);
    exit;
}

# Report a connection problem or an HTTP error status instead of silently
# printing nothing.
if (my $err = $tx->error) {
    my $status = $err->{code} ? "$err->{code} " : '';
    die "arxiv2bib: request to $baseURL failed: $status$err->{message}\n";
}

for my $entry ($res->dom->find('entry')->each) {
    say renderEntry($entry, $label, \%type);
}
__END__

=head1 NAME

App::arxiv2bib - Extract bibliographic data from the arXiv API


=head1 SYNOPSIS

    arxiv2bib [options] [args]

    arxiv2bib au:author1 AND au:author2 AND ti:title
    arxiv2bib --raw "(au:author1 AND ti:title) OR au:author2"
    arxiv2bib --amsrefs au:author1 OR ti:title
    arxiv2bib --sortBy=relevance --sortOrder=ascending --max_results=20 au:author1 AND au:author2


=head1 DESCRIPTION

The C<script/arxiv2bib> executable provided by the distribution extracts bibliographic information using the L<arXiv API|https://arxiv.org/help/api/user-manual>.
It defaults to the L<BibTeX|http://www.bibtex.org/> format for entries, but can optionally return L<AMSRefs|http://www.ams.org/arc/resources/amsrefs-about.html> entries or just dump raw info (a C<Perl> hash).


=head1 INSTALLATION

Using L<cpanm|https://metacpan.org/dist/App-cpanminus/view/bin/cpanm>: just plain

    $ cpanm App::arxiv2bib

should work once it's been indexed by L<CPAN|https://www.cpan.org/>. For more up-to-date versions clone this repo, C<cd> into it, and then:

    $ cpanm .

Manual install:

    $ perl Makefile.PL
    $ make
    $ make install


=head1 OPTIONS

    -h | --help       usage examples and a breakdown of options/arguments
         --man        full documentation
    -n | --dry        only dump the Mojo request object your options have formed
    -v | --verbose    dump the entire Mojo response object you've received
         --amsrefs    return entries in AMSRefs format instead of the BibTeX default
         --raw        dump a hash containing bibliographic info (authors, etc.), unformatted
    -l | --label      a string that will be used as the label of the BibTeX entry in place of the default (which is the arXiv identifier)
                      only really useful if you're interested in one of the entries being returned, since it labels all entries identically

The rest of the options go hand-in-hand with identically-named query parameters in L<the API|https://arxiv.org/help/api/user-manual>, so that will be essential documentation.

    --sortBy          "submittedDate" (default), "relevance" or "lastUpdatedDate"
    --sortOrder       "descending" (default) or "ascending"
    --id_list         comma-separated list of arXiv identifiers, e.g. 2106.16211,2106.16119,2106.15900; defaults to ""
    --start           index of the first displayed entry in the list returned by the search; defaults to 0
    --max_results     maximal number of displayed returned entries; defaults to 200


=head1 ARGUMENTS

The rest of the arguments constitute the C<search_query>, built as described in the L<API docs|https://arxiv.org/help/api/user-manual#query_details>.
The individual lexemes are of the form C<prefix:string>, where the prefix is one of the following (with the second column indicating what the prefix stands for):

    ti     Title
    au     Author
    abs    Abstract
    co     Comment
    jr     Journal Reference
    cat    Subject Category
    rn     Report Number
    id     Id (use id_list instead)
    all    All of the above

The lexemes can be connected by the logical operators C<AND>, C<OR> and C<ANDNOT>. So a script call might look like this:

    arxiv2bib au:author1 AND au:author2 ANDNOT au:author3

That'll search for papers coauthored by C<author1> and C<author2> but I<not> C<author3>.
You can also group your search terms parenthetically for more sophisticated logical constructs:

    arxiv2bib "(au:author1 AND ti:title1) OR (au:author2 AND ti:title2)"

I had to enclose that in quotes though, because otherwise the shell gets confused.
