The Perl and Raku Conference 2025: Greenville, South Carolina - June 27-29 Learn more

# ABSTRACT: Bible Gateway content conversion to Open Bible Markup Language
use 5.020;
use exact;
use Mojo::Util 'html_unescape';
our $VERSION = '2.09'; # VERSION
has translation => 'NIV';
has reference => sub { Bible::Reference->new( bible => 'Protestant' ) };
has url => sub { Mojo::URL->new('https://www.biblegateway.com/passage/') };
has ua => sub {
my $ua = Mojo::UserAgent->new( max_redirects => 3 );
$ua->transactor->name( __PACKAGE__ . '/' . ( __PACKAGE__->VERSION // '2.0' ) );
return $ua;
};
sub translations ($self) {
my $translations;
$self->ua->get( $self->url )->result->dom->find('select.search-dropdown option')->each( sub {
my $class = $_->attr('class') || '';
if ( $class eq 'lang' ) {
my @language = $_->text =~ /\-{3}(.+)\s\(([^\)]+)\)\-{3}/;
push( @$translations, {
language => $language[0],
acronym => $language[1],
} );
}
elsif ( not $class ) {
my @translation = $_->text =~ /\s*(.+)\s\(([^\)]+)\)/;
push( @{ $translations->[-1]{translations} }, {
translation => $translation[0],
acronym => $translation[1],
} );
}
} );
return $translations;
}
sub structure ( $self, $translation = $self->translation ) {
return $self->ua->get(
$self->url->clone->path( $self->url->path . 'bcv/' )->query( { version => $translation } )
)->result->json->{data}[0];
}
sub _retag ( $tag, $retag ) {
$tag->tag($retag);
delete $tag->attr->{$_} for ( keys %{ $tag->attr } );
}
sub fetch ( $self, $reference, $translation = $self->translation ) {
my $runs = $self->reference
->acronyms(0)->require_chapter_match(0)->require_book_ucfirst(0)
->clear->in($reference)->as_runs;
$reference = $runs->[0] unless ( @$runs != 1 or $runs->[0] !~ /\w\s*\d/ );
my $result = $self->ua->get(
$self->url->query( {
version => $translation,
search => $reference,
} )
)->result;
croak( $translation . ' "' . ( $reference // '(undef)' ) . '" did not match a chapter or run of verses' )
if ( $result->dom->at('div.content-section') );
return Mojo::ByteStream->new( $result->body )->decode->to_string;
}
sub parse ( $self, $html ) {
return unless ($html);
my $dom = Mojo::DOM->new($html);
my $ref_display = $dom->at('div.bcv div.dropdown-display-text');
croak('source appears to be invalid; check your inputs') unless ( $ref_display and $ref_display->text );
my $reference = $ref_display->text;
my $translation = $dom->at('div.passage-col')->attr('data-translation');
croak('EXB (Extended Bible) translation not supported') if ( $translation eq 'EXB' );
my $block = $dom->at('div.passage-text div.passage-content div:first-child');
$block->find('*[data-link]')->each( sub { delete $_->attr->{'data-link'} } );
$html = $block->to_string;
$html =~ s`<sup>(\d+)</sup>.<sub>(\d+)</sub>`$1/$2`g;
$html =~ s`(?:&lt;){2,}(.*?)(?:\x{2019}&gt;|(?:&gt;){2,})`\x{201c}$1\x{201d}`g;
$html =~ s`(?:&lt;)(.*?)(?:&gt;|\x{2019})`\x{2018}$1\x{2019}`g;
$html =~ s`\\\w+``g;
$html =~ s/(?:\.\s*){2,}\./\x{2026}/;
$html =~ s/\x{200a}//g;
$block = Mojo::DOM->new($html)->at('div');
$_->parent->strip if ( $_ = $block->find('div.poetry > h2')->first );
$block->descendant_nodes->grep( sub { $_->type eq 'comment' } )->each('remove');
$block->find(
'.il-text, hidden, hr, .translation-note, span.inline-note, a.full-chap-link, b.inline-h3, top1'
)->each('remove');
$block->find('.std-text, hgroup, b, em, versenum, char')->each('strip');
$block
->find('i, .italic, .trans-change, .idiom, .catch-word, selah, span.selah')
->each( sub { _retag( $_, 'i' ) } );
$block->find('.woj, u.jesus-speech')->each( sub { _retag( $_, 'woj' ) } );
$block->find('.divine-name, .small-caps')->each( sub { _retag( $_, 'small_caps' ) } );
$block->find('sup')->grep( sub { length $_->text == 1 } )->each( sub {
$_->content( '-' . $_->content );
$_->strip;
} );
$self->reference->acronyms(1)->require_chapter_match(1)->require_book_ucfirst(1);
my $footnotes = $block->at('div.footnotes');
if ($footnotes) {
$footnotes->find('a.bibleref')->each( sub {
( my $ref = $_->attr('data-bibleref') // '' ) =~ s/\.(\d+)\.(\d+)/ $1:$2/g;
$_->replace($ref);
} );
$footnotes->remove;
$footnotes = {
map {
'#' . $_->attr('id') => $self->reference->clear->in(
$_->at('span')->all_text
)->as_text
} $footnotes->find('ol li')->each
};
}
my $crossrefs = $block->at('div.crossrefs');
if ($crossrefs) {
$crossrefs->find('a.bibleref')->each( sub {
( my $ref = $_->attr('data-bibleref') // '' ) =~ s/\.(\d+)\.(\d+)/ $1:$2/g;
$_->replace($ref);
} );
$crossrefs->remove;
$crossrefs = {
map {
'#' . $_->attr('id') => $self->reference->clear->in(
$_->at('a:last-child')->attr('data-bibleref')
)->refs
} $crossrefs->find('ol li')->each
};
}
$block
->find('span.text > a.bibleref')
->map('parent')
->grep( sub { $_->content =~ /^\[<a/ } )
->each( sub {
$_->find('a')->each( sub {
( my $ref = $_->attr('data-bibleref') // '' ) =~ s/\.(\d+)\.(\d+)/ $1:$2/g;
$_->replace($ref);
} );
my $content = $_->content;
$content =~ s|\s+\[([^\]]+)\]|
'<footnote>' . $self->reference->clear->in($1)->as_text . '</footnote>'
|ge;
$_->content($content);
} );
$block
->find('i > a.bibleref, crossref > a.bibleref')
->map('parent')
->grep( sub { $_->children->size == 1 } )
->each( sub {
my $a = $_->at('a:last-child');
( my $ref = $_->attr('data-bibleref') // '' ) =~ s/\.(\d+)\.(\d+)/ $1:$2/g;
$_->tag('sup');
$_->attr({
'class' => 'crossreference',
'data-cr' => $a->attr('data-bibleref'),
});
$crossrefs = {
$a->attr('data-bibleref') => $self->reference->clear->in($ref)->refs
};
} );
$block->find('a.bibleref')->each('strip');
$block->find('sup.crossreference, sup.footnote')->each( sub {
if ( $_->attr('class') eq 'footnote' ) {
$_->replace(
( $footnotes->{ $_->attr('data-fn') } )
? '<footnote>' . $footnotes->{ $_->attr('data-fn') } . '</footnote>'
: ''
);
}
elsif ( $_->attr('class') eq 'crossreference' ) {
$_->replace(
( $crossrefs->{ $_->attr('data-cr') } )
? '<crossref>' . $crossrefs->{ $_->attr('data-cr') } . '</crossref>'
: ''
);
}
} );
$block->find('footnote, crossref')->each( sub {
_retag( $_, $_->tag );
# patch for if there's an error in the source HTML where some footnotes
# or crossrefs should have a space but don't
my $previous = $_;
$previous = $previous->previous_node while (
$previous and $previous->tag and
( $previous->tag eq 'crossref' or $previous->tag eq 'footnote' )
);
my $previous_char = substr( ( ($previous) ? $previous->all_text || $previous->content : '' ), -1 );
my $next = $_;
$next = $next->next_node while (
$next and $next->tag and
( $next->tag eq 'crossref' or $next->tag eq 'footnote' )
);
my $next_char = substr( ( ($next) ? $next->all_text || $next->content : '' ), 0, 1 );
$_->append(' ') if (
length $previous_char and
length $next_char and
$previous_char =~ /[:;,\w\!\.\?]/ and
$next_char =~ /\w/
);
} );
_retag( $block, 'obml' );
$block->child_nodes->first->prepend( $block->new_tag( 'reference', $reference ) );
$block->find('h3.chapter')->each('remove');
$block->find('h2 + h3')->each( sub { $_->tag('h4') } );
$block->find('h2, h3')->each( sub { _retag( $_, 'header' ) } );
$block->find('h4')->each( sub { _retag( $_, 'sub_header' ) } );
$block->find('.versenum')->grep( sub { $_->text =~ /^\s*\(/ } )->each('remove');
$block->find('.chapternum + .versenum')->each( sub { $_->previous->remove } );
$block->find('.chapternum + i > .versenum')->each( sub { $_->parent->previous->remove } );
$block->find('.chapternum')->each( sub {
_retag( $_, 'verse_number' );
$_->content(1);
} );
$block->find('.versenum')->each( sub {
_retag( $_, 'verse_number' );
my $verse_number = $_->content;
$verse_number =~ s/^.*://g;
($verse_number) = $verse_number =~ /(\d+)/;
$_->content($verse_number);
} );
$block->find('span.text')->each( sub { _retag( $_, 'text' ) } );
$block->find('table')->each( sub {
$_->find('tr')->each( sub {
$_->find('th')->each('remove');
unless ( $_->child_nodes->size ) {
$_->strip;
}
else {
$_->replace( join( '',
'<text>',
$_->find('td text')->map('content')->join(', '),
(
( $_->find('td text')->map('text')->last =~ /\W$/ ) ? '' :
( $_->following_nodes->size ) ? '; ' : '.'
),
( ( $_->following_nodes->size ) ? '</text> ' : '</text>' ),
) );
}
} );
$_->tag('div');
$_->content( '<p>' . $_->content . '</p>' );
} );
$block->find('ul, ol')->each( sub {
$_->find('li')->each( sub {
$_->tag('text');
$_->find('text > text')->each('strip');
$_->append_content('<br>') if ( $_->next and $_->next->tag eq 'li' );
} );
$_->tag('div');
$_->attr( class => 'left-1' );
$_->content( '<p>' . $_->content . '</p>' );
} );
$block->find( join( ', ', map { 'div.left-' . $_ } 1 .. 9 ) )->each( sub {
my ($left) = $_->attr('class') =~ /\bleft\-(\d+)/;
$_->find('text')->each( sub { $_->attr( indent => $left ) } );
$_->strip;
} );
$block->find('div.poetry')->each( sub { $_->attr( class => 'indent-1' ) } );
$block->find( join( ', ', map { '.indent-' . $_ } 1 .. 9 ) )->each( sub {
my ($indent) = $_->attr('class') =~ /\bindent\-(\d+)/;
$_->find('text')->each( sub {
$_->attr( indent => $indent + ( $_->attr('indent') || 0 ) );
} );
$_->strip;
} );
$block->find( join( ', ', map { '.indent-' . $_ . '-breaks' } 1 .. 5 ) )->each('remove');
$block->find('text[indent]')->each( sub {
my $level = $_->attr('indent');
_retag( $_, 'indent' );
$_->attr( level => $level );
} );
$block->find('text')->each('strip');
$block->find('indent + indent')->each( sub {
if ( $_->previous->attr('level') eq $_->attr('level') ) {
$_->previous->append_content( ' ' . $_->content );
$_->remove;
}
} );
$block->find('p')->each( sub { _retag( $_, 'p' ) } );
$block->at('p')->prepend_content('<verse_number>1</verse_number>')
if ( $block->at('p') and not $block->at('p')->at('verse_number') );
$block->find('div, span, u, sup, bk, verse, start-chapter')->each('strip');
$_->each('strip') while ( $_ = $block->find('i > i') and $_->size );
$html = html_unescape( $block->to_string );
$html =~ s/<p>[ ]+/<p>/g; # remove spaces immediately after a "<p>"
$html =~ s/,([A-z]|<i>)/, $1/g; # fix missing spaces after commas (error in source HTML)
$html =~ s/([A-z])(<i>)/$1 $2/g; # fix missing spaces before <i> (error in source HTML)
$html =~ s/([a-z])([A-Z])/$1 $2/g; # fix missing spaces from collapsed words (error in source HTML)
# hack: add back in any missing spaces after some punctuation
$html =~ s/([a-z])([:;,!?])([A-Za-z])/$1$2 $3/g;
return $html;
}
sub get ( $self, $reference, $translation = $self->translation ) {
return Bible::OBML->new->html( $self->parse( $self->fetch( $reference, $translation ) ) );
}
1;
__END__
=pod
=encoding UTF-8
=head1 NAME
Bible::OBML::Gateway - Bible Gateway content conversion to Open Bible Markup Language
=head1 VERSION
version 2.09
=head1 SYNOPSIS
use Bible::OBML::Gateway;
my $bg = Bible::OBML::Gateway->new;
$bg->translation('NIV');
my $obml_obj = $bg->get( 'Romans 12' );
print $bg->get( 'Romans 12', 'NASB' )->obml, "\n";
my $translations = $bg->translations;
my $structure = $bg->structure('NASB');
=head1 DESCRIPTION
This module consumes Bible Gateway content and returns useful data-bearing
objects or data structures. In the common case, it will accept a Bible reference
and return a L<Bible::OBML> object loaded with parsed content.
=head1 METHODS
The following methods are supported.
=head2 new
Instantiates a new gateway object. You can optionally pass a translation
acronym to be used on subsequent requests.
my $bg = Bible::OBML::Gateway->new( translation => 'NIV' );
=head2 get
This method requires a text input containing a Bible reference that can be
understood as a single chapter or single, unbroken run of verses. For example,
"Romans 12" or "Ro 12:13-17" are acceptable, but "Romans 12:13-17, 19" is not.
You can optionally also provide an overriding translation. If not specified,
the object's translation (set via the C<translation> attribute) will be used.
The method will get the raw HTML content from Bible Gateway, parse it, and
return a L<Bible::OBML> object loaded with the data.
my $obml_obj = $bg->get( 'Romans 12' );
print $bg->get( 'Romans 12', 'NASB' )->obml, "\n";
Internally, all this method does is call C<fetch>, pass that output to C<parse>,
and then load output that into a new L<Bible::OBML> object.
=head2 fetch
If all you want to do is fetch the HTML from Bible Gateway, you can use this
method. It uses the same signature as C<get> and returns the returned raw HTML.
=head2 parse
This method requires source HTML like what you might get from a C<fetch> call,
which it will then parse and return a special sort of HTML that can be loaded
directly into a L<Bible::OBML> object via it's C<html> method. (See
L<Bible::OBML> for more information.)
=head2 translations
This method will return a data structure consisting of data describing available
translations on Bible Gateway per spoken language. It returns an arrayref
containing a hashref per language. Each hashref contains an arrayref of
translations, each represented by a hashref.
my $translations = $bg->translations;
This a simplified example of the data structure:
[
{
acronym => 'EN',
language => 'English',
translations => [
{
acronym => 'NIV',
translation => 'New International Version',
},
],
},
]
=head2 structure
This method will return a data structure consisting of data describing the
structure of a given translation of the Bible from Bible Gateway. It can
optionally be provided an overriding translation. If not specified, the object's
translation (set via the C<translation> attribute) will be used. The data
structure returned is an arrayref of hashrefs, each representing a book.
my $structure = $bg->structure('NASB');
This a simplified example of the data structure:
[
{
testament => 'NT',
display => '2 John',
osis => '2John',
intro => 0,
num_chapters => 1,
chapters => [
{
chapter => 1,
type => 'heading',
content => [
"Walk According to His Commandments",
],
},
],
}
]
=head1 ATTRIBUTES
Attributes can be set in a call to C<new> or explicitly as a get/set method.
my $bg = Bible::OBML::Gateway->new( translation => 'NIV' );
$bg->translation('NIV');
say $bg->translation;
=head2 translation
Get or set the current translation acronym. The default if not explicitly set
will be "NIV".
say $bg->translation;
$bg->translation('NIV');
=head2 url
This provides access to the base URL, contained within a L<Mojo::URL> object.
$bg->url( Mojo::URL->new('https://www.biblegateway.com/passage/') );
=head2 ua
This provides access to the L<Mojo::UserAgent> user agent.
$bg->ua->transactor->name("Your Application's Name");
=head2 reference
This provides access to the L<Bible::Reference> object used to parse and
canonicalize Bible references.
$bg->reference->bible('Catholic');
Depending on which translation you C<get> from Bible Gateway, you may need to
alter the C<bible> setting of C<reference>, as in the example immediately above.
By default, C<bible> is set to "Protestant".
=head1 SEE ALSO
L<Bible::OBML>, L<Bible::Reference>, L<Mojo::URL>, L<Mojo::UserAgent>.
You can also look for additional information at:
=over 4
=item *
=item *
=item *
=item *
=item *
=item *
=back
=head1 AUTHOR
Gryphon Shafer <gryphon@cpan.org>
=head1 COPYRIGHT AND LICENSE
This software is Copyright (c) 2017-2050 by Gryphon Shafer.
This is free software, licensed under:
The Artistic License 2.0 (GPL Compatible)
=cut