=head1 NAME

File::OSS::Scan::Ruleset - initialize the scan rules

=head1 VERSION

version 0.02

=head1 SYNOPSIS

    use File::OSS::Scan::Ruleset;

    File::OSS::Scan::Ruleset->init($config_file);
    my $ruleset = File::OSS::Scan::Ruleset->get_ruleset();

=head1 DESCRIPTION

This is an internal module used by L<File::OSS::Scan> to initialise scan rules from
the configuration file, and should not be called directly.

=head1 SEE ALSO

=over 4

=item * L<File::OSS::Scan>

=back

=head1 AUTHOR

Harry Wang <harry.wang@outlook.com>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2014 by Harry Wang.

This is free software, licensed under:

    Artistic License 1.0

=cut

package File::OSS::Scan::Ruleset;

use strict;
use warnings FATAL => 'all';

use Fatal qw( open close );
use Carp;
use English qw( -no_match_vars );
use Data::Dumper; # for debug

use File::OSS::Scan::Constant qw(:all);

our $VERSION = '0.02';

# Configuration file used when init() is called without a path: the value of
# the OSSSCAN_CONFIG environment variable when it is set to a true value,
# otherwise ".ossscan.rc" (a relative path).
my $cfg_default = $ENV{OSSSCAN_CONFIG} || ".ossscan.rc";
# Section names accepted in a configuration file; init() warns about and
# skips any other section.
my @valid_sections = qw/GLOBAL FILE DIRECTORY LINE/;

# Parsed ruleset, rebuilt by every call to init(). It is a package variable
# (not a lexical) because get_ruleset() looks it up by name.
our $ruleset = undef;

{
    # Lines of the embedded ruleset (the __DATA__ section), read once and
    # cached. The DATA handle can only be consumed a single time, so later
    # calls to init() that fall back to the embedded rules reuse this copy.
    my $embedded_lines;

    # _config_lines( $config_file )
    #
    # Returns the raw lines of $config_file, or the lines of the embedded
    # ruleset (with a warning) when $config_file is not a plain file.
    # Croaks when the file exists but cannot be read or opened.
    sub _config_lines {
        my ($config_file) = @_;

        if ( !-f $config_file ) {
            carp "config file $config_file doesn't exist, using the embedded ruleset.";

            # read from __DATA__ section, only on first use
            $embedded_lines = [<DATA>]
                if ( not defined $embedded_lines );

            return @{$embedded_lines};
        }

        croak "config file $config_file is not readable."
            if ( !-r $config_file );

        # Three-argument open with a lexical handle: the file name is never
        # interpreted as an open mode, and the handle cannot leak.
        open( my $fh, '<', $config_file )
            or croak "Can't open $config_file, $!.";
        my @lines = <$fh>;
        close($fh);

        return @lines;
    }

    # init( $class, [ $config_file ] )
    #
    # Parses the configuration file (default: $cfg_default; the embedded
    # ruleset when the file does not exist) and rebuilds the package
    # variable $ruleset:
    #
    #   $ruleset->{GLOBAL}{<key>}      = [ words of a "key: value ..." line ]
    #   $ruleset->{FILE|DIRECTORY|LINE} = [ { cert, func, args }, ... ]
    #
    # Rule lines have the form "<certainty>[%] <function> [<args> ...]" with
    # a certainty between 0 and 100. Invalid sections and malformed rules
    # are reported with carp and skipped. Rules are finally ordered by
    # sort_ruleset(). Returns SUCCESS; croaks if the file cannot be read.
    sub init {
        my $self        = shift;
        my $config_file = shift || $cfg_default;

        my @lines = _config_lines($config_file);

        # clear previously set config $ruleset
        undef $ruleset;

        my $section;
        my $invalid_section_flag = UNI_FALSE;

        # A named loop variable is used so the caller's $_ is left alone.
        LINE:
        foreach my $line (@lines) {
            chomp $line;            # remove newline
            $line =~ s/#.*//;       # remove comments
            $line =~ s/^\s+//;      # remove leading spaces
            $line =~ s/\s+$//;      # remove trailing spaces

            # anything left ?
            next LINE unless length $line;

            # parse sections
            if ( $line =~ /^\[(\w+)\].*/ ) {
                $section = uc $1;

                # skip invalid sections
                if ( !grep { $_ eq $section } @valid_sections ) {
                    carp "Invalid section name $section, skipping ...";
                    $invalid_section_flag = UNI_TRUE;
                    next LINE;
                }

                $invalid_section_flag = UNI_FALSE;

                $ruleset->{$section} = undef
                    if ( not exists $ruleset->{$section} );

                # a section header can be neither a setting nor a rule
                next LINE;
            }

            # parse settings
            # put them all under 'GLOBAL' section
            if ( $line =~ /^(\w+)\s*\:\s*(.*)$/ ) {
                my ( $key, $val ) = ( $1, $2 );
                $ruleset->{'GLOBAL'}->{$key} = [ split /\s+/, $val ];
            }

            # parse rules
            if ( $line =~ /^(\d+\%?.*)$/ ) {
                my $rule = $1;

                # skip rules under invalid sections
                if ($invalid_section_flag) {
                    carp "also skipping rules $rule under invalid section $section ...";
                    next LINE;
                }

                # GLOBAL holds the settings hash, so a rule stored there
                # would clash with it (array vs. hash reference).
                if ( !defined $section or $section eq 'GLOBAL' ) {
                    carp "rule $rule is not under a FILE, DIRECTORY or LINE section, skipping ...";
                    next LINE;
                }

                my ( $certain, $func, @args ) = split( ' ', $rule );

                # valid level of certainty: 0 - 100, optional trailing '%'
                my ($percent) = $certain =~ /^(\d+)\%?$/;

                if ( !defined $percent ) {
                    carp "invalid level of certainty $certain, skipping rules $rule ...";
                    next LINE;
                }

                # only digits were captured, so the value cannot be negative
                if ( $percent > 100 ) {
                    carp "certainty $percent is not in the range of 0 to 100, skipping rules $rule ...";
                    next LINE;
                }

                # a rule without a function name cannot be applied
                if ( !defined $func ) {
                    carp "rule $rule has no check function, skipping ...";
                    next LINE;
                }

                push @{ $ruleset->{$section} }, {
                    'cert' => $percent,
                    'func' => $func,
                    'args' => \@args,
                };
            }
        }

        # sort rulesets by certainty level
        sort_ruleset();

        return SUCCESS;
    }
}

# get_ruleset( $class )
#
# Returns the package variable $ruleset of the package named by the first
# argument (the invocant when called as a class method), i.e. the value of
# $<class>::ruleset.
sub get_ruleset {
    my ($class) = @_;

    # the variable is looked up by name, which strict refs would forbid
    no strict 'refs';
    return ${ $class . '::ruleset' };
}

# sort_ruleset()
#
# Reorders the rules of every non-GLOBAL section of $ruleset from the
# highest certainty to the lowest. Rules sharing the same certainty value
# keep their relative order. Sections without rules are left untouched.
# Returns SUCCESS.
sub sort_ruleset {

    # GLOBAL only carries settings, so it is never sorted
    SECTION:
    for my $name ( grep { $_ ne 'GLOBAL' } @valid_sections ) {
        my $rules = $ruleset->{$name};
        next SECTION unless defined $rules;

        # group the rules by certainty, preserving their order in each group
        my %bucket_for;
        push @{ $bucket_for{ $_->{'cert'} } }, $_ for @$rules;

        # concatenate the groups, highest certainty first
        $ruleset->{$name} = [
            map  { @{ $bucket_for{$_} } }
            sort { $b <=> $a } keys %bucket_for
        ];
    }

    return SUCCESS;
}


1;



__DATA__

# Embedded default ruleset, read by init() when the configuration file
# does not exist.
#
# "key: value ..." lines are settings; rule lines have the form
#     <certainty>% <check function> <arguments ...>

exclude_extension: png jpg gif pdf doc docx html htm xml json xls

# section for directory check
[DIRECTORY]

# section for file check
[FILE]
    100% filename_match COPYING
    100% filename_match COPYING\.\w+
    100% filename_match LICEN[CS]E
    100% filename_match LICEN[CS]E\.\w+
    100% filename_match KEY[S]?
    100% filename_match KEY[S]?\.\w+
    50%  filename_match AUTHOR[S]?

# section for line check
[LINE]
    100% content_match GPL 1
    100% content_match GPLv\d 1
    100% content_match LGPL 1
    100% content_match LGPLv\d 1
    100% content_match BSD.*Licen[cs]e
    100% content_match Public\W*Licen[cs]e
    100% content_match Public\W*Attribution
    100% content_match Open\W*Licen[cs]e
    100% content_match Open\W*Source\W*Licen[cs]e
    100% content_match Software\W*Licen[cs]e
    100% content_match Library\W*Licen[cs]e
    100% content_match Free\W*Licen[cs]e
    100% content_match MIT\W*Licen[cs]e
    100% content_match Sleepycat\W*Licen[cs]e
    100% content_match Apache.*Licen[cs]e
    100% content_match GNU\W*General\W*Public
    100% content_match GNU\W*Affero
    100% content_match GNU\W*Lesser
    100% content_match GNU\W*Free.*Licen[cs]e
    100% content_match Netscape\W*Public
    100% content_match Netscape\W*Licen[cs]e
    100% content_match Academic\W*Free\W*Licen[cs]e
    100% content_match Apple\W*Public\W*Source\W*Licen[cs]e
    100% content_match Creative\W*Commons\W*Attribution
    100% content_match Artistic\W*Licen[cs]e
    100% content_match Common\W*Development.*Licen[cs]e
    100% content_match Educational\W*Community\W*Licen[cs]e
    100% content_match Free\W*Software\W*Foundation

    80%  content_match Open\W*Source
    # 80%  content_match licen[cs]e\W*

    50%  copyright_match ThomsonReuters Reuters Thomson TR EJV Bridge