package DBD::Google::parser;

# ----------------------------------------------------------------------
# This package needs to subclass SQL::Parser, in order that the
# functions defined can be used.  WIth SQL::Parser 1.005, the
# SELECT_CLAUSE method needs to be overridden.
#
# Jeff Zucker tells me that SQL::Parser 1.006 is coming out
# soon, and that it will support more functions and such.  There
# might need to be some logic in here to ensure that an incompatible
# version of SQL::Parser is not being used.
# ----------------------------------------------------------------------

use strict;
use base qw(SQL::Parser);
use vars qw($VERSION $FIELD_RE $FUNC_RE);

use Carp qw(carp);
use File::Spec::Functions qw(catfile);
use HTML::Entities qw(encode_entities);
use SQL::Parser;
use URI::Escape qw(uri_escape);

$VERSION = "2.00";

# Package-scoped variables
# These are not lexicals so that they can be used in tests
$FIELD_RE = '[a-zA-Z][a-zA-Z0-9_]';
$FUNC_RE = qr/$FIELD_RE*(?:::$FIELD_RE*)*(?:[-]>$FIELD_RE*)?/; # methods?
$FIELD_RE = qr/$FIELD_RE*/;

my @default_columns = sort qw( title url snippet summary
                               cachedSize directoryTitle
                               hostName directoryCategory
                             );
my %allowed_columns = map { $_ => 1 }
                      map { $_, lc $_, uc $_ }
                      @default_columns;
for my $dc (@default_columns) {
    $dc =~ s/([A-Z])/_\l$1/g;
    $allowed_columns{$dc} = 1;
}

# All functions are passed two items: the Net::Google::Search
# instanace and the text to be fiddled with.
my %functions = (
    'default'       => sub { $_[1]                  },
    'uri_escape'    => sub { uri_escape($_[1])      },
    'html_escape'   => sub { encode_entities($_[1]) },
    'count'         => \&count,
    'html_strip'    => \&striphtml,
);
$functions{''} = $functions{'default'};

# ----------------------------------------------------------------------
# new(@stuff)
# 
# Override SQL::Parser's new method, but only so that default values
# can be used.
# ----------------------------------------------------------------------
sub new { return shift->SUPER::new("Google", @_) }

# ----------------------------------------------------------------------
# SELECT_CLAUSE($sql)
#
# Parses the SELECT portion of $sql, which contains only the pieces 
# between SELECT and WHERE.  Understands the following syntax:
#
#   field
#
#   field AS alias
#
#   field AS "alias"
#
#   function(field)
#
#   function(field) AS alias
#
#   function(field) AS "alias"
#
#   package::function(field)
#
#   package::function(field) AS alias
#
#   package::function(field) AS "alias"
#
#   package->method(field)
#
#   package->method(field) AS alias
#
#   package->method(field) AS "alias"
#
# ----------------------------------------------------------------------
sub SELECT_CLAUSE {
    my ($self, $sql) = @_;
    #warn "Got: \$sql => '$sql'\n";
    my ($columns, $limit, @columns, @limit, $where, $parsed);

    # SQL::Parser::clean_sql does funny things to strings
    # that look like methods
    $sql = $self->unclean_cleaned_sql($sql);

    # Internal data structures, given shorter names
    my $column_names =  $self->{'struct'}->{'column_names'}     = [ ];
    my $ORG_NAME     =  $self->{'struct'}->{'ORG_NAME'}         = { };
    my $functions    =  $self->{'struct'}->{'column_functions'} = { };
    my $aliases      =  $self->{'struct'}->{'column_aliases'}   = { };
    my $errstr       = \$self->{'struct'}->{'errstr'};

    # columns
    while ($sql =~ /\G

                        # Field name, including possible function
                        (?:
                          ($FUNC_RE\s*\([^)]+\))   # $1 => function
                        |
                          ($FIELD_RE)               # $2 => field name
                        | (\*)                      # $3 => '*' 
                        )

                        # possible alias
                        (?:
                            \s+
                            [aA][sS]
                            \s+
                            (['"]?)                   # $4 => possibly quoted
                              \s*
                              ($FIELD_RE)             # $5 => alias (no spaces allowed!)
                              \s*
                            \4?
                        )?
                        \s*
                        ,?
                        \s*
                       /xsmg) {
        my $alias = $5 || "";
        my $function = $1 || "";

        #warn "\$function => '$function'\n\$alias => '$alias'\n";

        if (defined $3) {
            # SELECT * -> expanded to all column names
            my $df = $functions{'default'};
            for (@default_columns) {
                my $uc_ = uc $_;

                push @$column_names   => $_  ;
                $ORG_NAME->{  $uc_ }  =  $_  ;
                $functions->{ $uc_ }  =  $df ;
                $aliases->{   $uc_ }  =  $_  ;
            }
        }
        elsif ($function) {
            # SELECT foo(bar)
            my $original = $function;
            $original =~ /($FUNC_RE)\s*\((.*?)\)/;

            # XXX $n here might contains arguments; needs to be
            # passed to String::Shellquote to extract tokens
            my ($f, $n) = ($1, $2);
            $n =~ s/(^\s*|\s*$)//g;
            $f = "" unless defined $f;

            unless ($allowed_columns{$n}) {
                $$errstr = "Unknown column name '$n'";
                return;
            }

            # Possible cases include:
            #   1. No function defined
            #   2. Function defined that we know about
            #   3. Function defined we don't know about
            #       3a. Function/method to be loaded
            #       3b. Error
            if ($f) {
                if (defined $functions{$f}) {
                    # Common case:
                    #
                    #   SELECT html_strip(title) FROM google ...
                    #
                    # A pre-defined function.
                    $f = $functions{$f};
                }
                else {
                    # If a user specifies a function like:
                    #
                    #   SELECT Digest::MD5::md5_hex(title) FROM google ...
                    #
                    # or:
                    #
                    #   SELECT URI->new(URL) FROM google ...
                    #
                    if (my ($package, $type, $func) = $f =~ /(.*)(::|[-]>)(.*)/) {

                        eval "use $package;";
                        if ($@) {
                            $$errstr = $@;
                            return;
                        }
                        else {
                            if ($type eq '::') {
                                if (defined(my $g = \&{"$package\::$func"})) {
                                    $f = sub { shift; &$g(@_) };
                                } else {
                                    $$errstr = "Can't load $package\::$func";
                                }
                            }
                            elsif ($type eq '->') {
                                $f = sub { shift; $package->$func(@_) };
                            }
                            else {
                                $f = $functions{'default'};
                            }
                        }
                    }
                    else {
                        # Function that matches $FUNC_RE but doesn't contain
                        # :: or ->; might be a built-in, such as uc, lc, 
                        # gethostbyname, unlink, or even
                        # 'system("GET www.pr0n.com | mail ceo@my.company")'.
                        #
                        # This sucks, BTW.
                        $f = eval qq(sub { $f(\$_[1]) };);
                    }
                }
            }
            else {
                # No function:
                #
                #   SELECT title FROM google ...
                $f = $functions{'default'};
            }

            my $uc = uc $n;
            push @$column_names, $n;
            $ORG_NAME->{  $uc } = $n;
            $functions->{ $uc } = $f;
            $aliases->{   $uc } = $alias ? $alias : $n;
        }
        elsif (defined $2) {
            my $lc = lc $2;
            my $uc = uc $2;
            if ($allowed_columns{$lc}) {
                push @$column_names, $lc;
                $ORG_NAME->{  $uc } = $lc;
                $functions->{ $uc } = $functions{'default'};
                $aliases->{   $uc } = $alias ? $alias : $lc;
            } else {
                $$errstr = "Unknown column name '$2'";
                return;
            }
        }
    }

    1;
}

# ----------------------------------------------------------------------
# decompose()
#
# Returns a data structure, similar to the structure() method, that
# contains only what DBD::Google::db needs to pass to Net::Google.
# The data structure looks like:
#
# {
#   QUERY => "query string",
#   COLUMNS => [
#               {
#                 FIELD => "Net::Google methodname",
#                 FUNCTION => sub { },
#                 ALIAS => "alias",
#               },
#              ],
#   LIMIT => {
#               limit => X,
#               offset => Y,
#            },
# }
# ----------------------------------------------------------------------
sub decompose {
    my $self = shift;
    my $struct = $self->structure;
    my %data = ();

    # Limit (use defaults of 0, 10)
    $data{'LIMIT'} = $struct->{'limit_clause'} || { offset => 0, limit => 10 };

    # Where
    $data{'WHERE'} = $struct->{'where_clause'}->{'arg2'}->{'value'} || "";
    $data{'WHERE'} =~ tr/'"//d;

    # Columns
    $data{'COLUMNS'} = [
        map { { FIELD    => $struct->{'ORG_NAME'}->{$_},
                FUNCTION => $struct->{'column_functions'}->{$_},
                ALIAS    => $struct->{'column_aliases'}->{$_},
            } }          @{ $struct->{'column_names'} }
    ];

    return wantarray ? %data : \%data;
}

# ----------------------------------------------------------------------
# unclean_cleaned_sql($sql)
#
# Undo some of the damage that SQL::Parser::clean_sql does to functions
# that look like Perl methods, e.g., Foo::Bar->new(title) gets turned
# into Foo::Bar- > new (title), which is no good.
# ----------------------------------------------------------------------
sub unclean_cleaned_sql {
    my ($self, $sql) = @_;

    $sql =~ s/\s*([-<>])\s*/$1/g;

    return $sql;
}

# ----------------------------------------------------------------------
# striphtml($ng, $text)
#
# A function for stripping HTML.  Very naive; it it becomes an
# issue, I'll include TCHRIST's striphtml.
# ----------------------------------------------------------------------
sub striphtml {
    my $text = $_[1];
    $text =~ s#</?[^>]*>##smg;
    return $text;
}

# ----------------------------------------------------------------------
# count($ng)
#
# Returns the total number of results.
# ----------------------------------------------------------------------
sub count {
    my $ngs = shift; # Net::Google::Search instance
    my $res = $ngs->response;
    return $res->estimateTotalResultsNumber;
}

1;

__END__

NOTES

Tim Buunce suggested count(*) as a way to get the total number of search results.

Data structure of SQL::Parser instance after parsing looks like:

                 'struct' => {
                               'org_table_names' => [
                                                      'google'
                                                    ],
                               'column_names' => [
                                                   '*'
                                                 ],
                               'table_alias' => {},
                               'command' => 'SELECT',
                               'table_names' => [
                                                  'GOOGLE'
                                                ],
                               'org_col_names' => [
                                                    '*'
                                                  ]
                             },