#!/usr/bin/perl

use warnings;
use strict;

=head1 specs

Turns http://www.loc.gov/marc/bibliographic/ecbdist.html into the format
used by MARC::Lint.pm

Takes ecbdist.html as input.  Skips fixed fields and data marked
"[OBSOLETE]".  Also, the HTML file doesn't include the 841-88X tags,
so those are hardcoded here.

=head1 AUTHOR

Originally written by Colin Campbell at Sirsi, and taken over and modified
by Andy Lester.

=cut

# Copy the head of the existing module to STDOUT, up to and including its
# __DATA__ marker; everything the rest of this script prints follows it.
# The path is relative to the current working directory, so report both the
# path and the OS error when the open fails.
my $module_path = "../lib/MARC/Lint.pm";
open( my $fh, '<', $module_path )
    or die "Can't open module $module_path: $!";
while ( <$fh> ) {
    print;
    last if /^__DATA__/;    # stop right after echoing the marker line
}
close $fh;

# Slurp all input (files named on the command line, or STDIN) in one read.
# Slurp mode is confined to the do-block so that $/ is back to its previous
# value for the rest of the script.
my $text = do { local $/; <> };
$text = '' unless defined $text;    # no readable input: treat as empty

# Collapse every run of <BR> tags and CR/LF characters into one line break,
# then split into individual lines.
$text =~ s/(?:<BR>|\r|\n)+/\n/ig;
my @lines = split( /\n/, $text );


# Parser state for the walk over @lines.
my $in_tag = undef;        # true while inside a tag that is not marked OBSOLETE
my $i1;                    # values collected so far for the first indicator
my $i2;                    # values collected so far for the second indicator
my $curr_indicator = 0;    # 1 or 2 while reading indicator values, else 0
my $ntags = 0;             # number of tags printed so far
my $desc1;                 # description taken from the "First - ..." line
my $desc2;                 # description taken from the "Second - ..." line

my $started = 0;
LINE:
for my $line ( @lines ) {
    # Ignore everything up to and including the "Number and Code Fields"
    # heading.
    unless ( $started ) {
        $started = 1 if $line =~ /Number and Code Fields/;
        next LINE;
    }

    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    next LINE if $line eq "";

    # A line starting with three digits opens a new tag.
    if ( $line =~ /^(\d\d\d)/ ) {
        my $tag = $1;
        if ( $line =~ /OBSOLETE/ ) {
            # Skip the header and everything up to the next tag.
            $in_tag = 0;
            next LINE;
        }

        my ( $desc, $nr ) = $line =~ /$tag - (.+) \((N?R)\)/
            or die "Tag $tag is invalid format: '$line'";
        ++$ntags;
        $in_tag = 1;
        print "\n" if $ntags > 1;    # blank line between tag records
        print "$tag\t$nr\t$desc\n";
        $i1 = $i2 = "";
        # Start every tag with clean indicator state so that a previous tag
        # that never reached its "Subfield" line cannot leak into this one.
        $curr_indicator = 0;
        next LINE;
    }

    next LINE unless $in_tag;
    next LINE if $line =~ /OBSOLETE/;

    if ( $line =~ /^First - (.+)/ ) {
        $curr_indicator = 1;
        $desc1 = $1;
    }
    elsif ( $line =~ /^Second - (.+)/ ) {
        # The first indicator's value list is complete: emit it.
        print_indicator( 1, $i1, $desc1 );
        undef $desc1;
        $curr_indicator = 2;
        $desc2 = $1;
    }
    elsif ( $line =~ /^Subfield/ ) {
        # The second indicator's value list is complete: emit it.
        print_indicator( 2, $i2, $desc2 );
        undef $desc2;
        $curr_indicator = 0;
    }
    elsif ( $curr_indicator ) {
        # An indicator value line: keep only the leading value, either a
        # range such as "0-9" or a single digit or "#".
        my $data = '';
        if ( $line =~ /^(\d-\d)/ ) {
            $data = $1;
        }
        elsif ( $line =~ /^([#0123456789])/ ) {
            $data = $1;
        }
        $data = "b" if $data eq "#";    # "#" is written as "b" in the output

        if ( $curr_indicator == 1 ) {
            $i1 .= $data;
        }
        elsif ( $curr_indicator == 2 ) {
            $i2 .= $data;
        }
    }
    elsif ( $line =~ /^\$(.) - (.*\S)\s*\((N?R)\)/ ) {
        # Subfield line.  The description capture must end on a non-space
        # character; a plain greedy (.+) would swallow the blank before the
        # "(R)"/"(NR)" marker and print it as trailing whitespace.
        my ( $sub, $desc, $nr ) = ( $1, $2, $3 );
        print "$sub\t$nr\t$desc\n";
    }
} # main for loop

# Print one indicator record to STDOUT as "ind<N><TAB><values><TAB><desc>".
#   $n    - indicator number used in the "ind<N>" label
#   $val  - the collected indicator values; a value of exactly "b" is
#           written out as the word "blank"
#   $desc - description text for the indicator
sub print_indicator {
    my ( $n, $val, $desc ) = @_;

    if ( $val eq "b" ) {
        $val = "blank";
    }

    print "ind$n\t$val\t$desc\n";
}