package WWW::Google::PageRank;    # -*- perl -*-

# Client for the Google toolbar PageRank query endpoint.
#
# The public interface is new() and get().  Every sub whose name starts with
# an underscore is a private helper implementing the 32-bit checksum that
# get() places in the "ch" parameter of the query URL.

use strict;
use warnings;

use vars qw($VERSION);

use LWP::UserAgent;
use URI::Escape;

$VERSION = '0.16';

# Constructor.
#
# Options (all optional key/value pairs):
#   agent     - User-Agent string handed to LWP::UserAgent->new; a Google
#               Toolbar identification string is used when it is false.
#   env_proxy - when true, $ua->env_proxy is called.
#   proxy     - when true, used as the proxy for the 'http' scheme.
#   timeout   - when true, passed to $ua->timeout.
#   host      - host name used to build the query URL; defaults to
#               'toolbarqueries.google.com'.
#
# Returns the new object; returns nothing if LWP::UserAgent->new yields a
# false value.
sub new {
    my ($class, %par) = @_;

    my $agent = $par{agent}
        || 'Mozilla/4.0 (compatible; GoogleToolbar 2.0.111-big; Windows XP 5.1)';

    my $ua = LWP::UserAgent->new(agent => $agent)
        or return;

    $ua->env_proxy                  if $par{env_proxy};
    $ua->proxy('http', $par{proxy}) if $par{proxy};
    $ua->timeout($par{timeout})     if $par{timeout};

    my $self = {
        ua   => $ua,
        host => $par{host} || 'toolbarqueries.google.com',
    };

    return bless($self, $class);
}

# Query the rank of $url.
#
# $url must be defined and start with "http://" or "https://" (matched
# case-insensitively); otherwise a bare return is performed before any
# request is made.
#
# Scalar context: the captured rank digits on success, undef on failure.
# List context:   (rank, response) on success, (undef, response) on failure,
#                 where response is whatever $self->{ua}->get returned.
sub get {
    my ($self, $url) = @_;

    return unless defined $url and $url =~ m[^https?://]i;

    # The checksum is computed over the unescaped "info:<url>" string and is
    # sent with a literal '6' prepended.
    my $ch = '6' . _compute_ch_new('info:' . $url);

    my $query = 'http://' . $self->{host}
        . '/search?client=navclient-auto&ch=' . $ch
        . '&ie=UTF-8&oe=UTF-8&features=Rank&q=info:' . uri_escape($url);

    my $resp = $self->{ua}->get($query);

    # $rank stays undef unless the request succeeded AND the body contains
    # a "Rank_<n>:<n>:<rank>" token.
    my $rank;
    if ($resp->is_success && $resp->content =~ /Rank_\d+:\d+:(\d+)/) {
        $rank = $1;
    }

    return wantarray ? ($rank, $resp) : $rank;
}

# Second-generation checksum: hash the string, fold the result, expand it
# into a 20-word buffer and hash that buffer again.
sub _compute_ch_new {
    my $url = shift;

    my $ch = _compute_ch($url);
    $ch = (($ch % 0x0d) & 7) | (($ch / 7) << 2);

    # Word i of the buffer is ($ch - 9*i) mod 2**32; "V20" packs the twenty
    # words as unsigned 32-bit little-endian values.
    return _compute_ch(
        pack("V20", map { my $t = $ch; _wsub($t, $_ * 9); $t } 0 .. 19)
    );
}

# Hash the characters of the given string into a single 32-bit value.
#
# The input is consumed in 12-byte groups (three little-endian 32-bit words
# added into the three state words, followed by _mix); the remaining 0..11
# bytes and the total length are then folded in before a final _mix.
sub _compute_ch {
    my $url = shift;

    my @byte = unpack("C*", $url);

    # State words are named $ha/$hb/$hc rather than $a/$b/$c so the sort
    # globals $a and $b are not shadowed.
    my ($ha, $hb, $hc) = (0x9e3779b9, 0x9e3779b9, 0xe6359a60);
    my $k   = 0;
    my $len = scalar @byte;

    while ($len >= 12) {
        _wadd($ha, $byte[$k+0] | ($byte[$k+1] << 8) | ($byte[$k+2]  << 16) | ($byte[$k+3]  << 24));
        _wadd($hb, $byte[$k+4] | ($byte[$k+5] << 8) | ($byte[$k+6]  << 16) | ($byte[$k+7]  << 24));
        _wadd($hc, $byte[$k+8] | ($byte[$k+9] << 8) | ($byte[$k+10] << 16) | ($byte[$k+11] << 24));

        _mix($ha, $hb, $hc);

        $k   += 12;
        $len -= 12;
    }

    # The total length is added to $hc unshifted, which is why the tail
    # bytes that go into $hc start at a shift of 8 instead of 0.
    _wadd($hc, scalar @byte);

    _wadd($hc, $byte[$k+10] << 24) if $len > 10;
    _wadd($hc, $byte[$k+9]  << 16) if $len > 9;
    _wadd($hc, $byte[$k+8]  << 8)  if $len > 8;
    _wadd($hb, $byte[$k+7]  << 24) if $len > 7;
    _wadd($hb, $byte[$k+6]  << 16) if $len > 6;
    _wadd($hb, $byte[$k+5]  << 8)  if $len > 5;
    _wadd($hb, $byte[$k+4])        if $len > 4;
    _wadd($ha, $byte[$k+3]  << 24) if $len > 3;
    _wadd($ha, $byte[$k+2]  << 16) if $len > 2;
    _wadd($ha, $byte[$k+1]  << 8)  if $len > 1;
    _wadd($ha, $byte[$k])          if $len > 0;

    _mix($ha, $hb, $hc);

    return $hc;    # integer is positive always
}

# Mix three 32-bit state words IN PLACE.
#
# The arguments are aliased through @_, so the caller's variables receive
# the mixed values.  Left shifts are reduced modulo 2**32 so that bits
# shifted past bit 31 are discarded.
sub _mix {
    my ($ma, $mb, $mc) = @_;

    _wsub($ma, $mb); _wsub($ma, $mc); $ma ^= $mc >> 13;
    _wsub($mb, $mc); _wsub($mb, $ma); $mb ^= ($ma << 8) % 4294967296;
    _wsub($mc, $ma); _wsub($mc, $mb); $mc ^= $mb >> 13;

    _wsub($ma, $mb); _wsub($ma, $mc); $ma ^= $mc >> 12;
    _wsub($mb, $mc); _wsub($mb, $ma); $mb ^= ($ma << 16) % 4294967296;
    _wsub($mc, $ma); _wsub($mc, $mb); $mc ^= $mb >> 5;

    _wsub($ma, $mb); _wsub($ma, $mc); $ma ^= $mc >> 3;
    _wsub($mb, $mc); _wsub($mb, $ma); $mb ^= ($ma << 10) % 4294967296;
    _wsub($mc, $ma); _wsub($mc, $mb); $mc ^= $mb >> 15;

    # Write the results back into the caller's variables.
    @_[0 .. $#_] = ($ma, $mb, $mc);
}

# In-place addition modulo 2**32: the first argument is overwritten.
sub _wadd {
    $_[0] = int(($_[0] + $_[1]) % 4294967296);
}

# In-place subtraction modulo 2**32: the first argument is overwritten.
sub _wsub {
    $_[0] = int(($_[0] - $_[1]) % 4294967296);
}

1;

__END__

=head1 NAME

WWW::Google::PageRank - Query google pagerank of page

=head1 SYNOPSIS

    use WWW::Google::PageRank;
    my $pr = WWW::Google::PageRank->new;
    print scalar($pr->get('http://www.yahoo.com/')), "\n";

=head1 DESCRIPTION

The C<WWW::Google::PageRank> is a class implementing an interface for
querying google pagerank.

To use it, you should create a C<WWW::Google::PageRank> object and use its
method get() to query the page rank of a URL.

It uses C<LWP::UserAgent> for making requests to Google.

=head1 CONSTRUCTOR METHOD

=over 4

=item $gpr = WWW::Google::PageRank->new(%options);

This method constructs a new C<WWW::Google::PageRank> object and returns it.
Key/value pair arguments may be provided to set up the initial state.
The following options correspond to attribute methods described below:

    KEY          DEFAULT
    -----------  --------------------
    agent        "Mozilla/4.0 (compatible; GoogleToolbar 2.0.111-big; Windows XP 5.1)"
    proxy        undef
    timeout      undef
    env_proxy    undef
    host         "toolbarqueries.google.com"

C<agent> specifies the header 'User-Agent' when querying Google. If the
C<proxy> option is passed in, requests will be made through the specified
proxy.
C<host> is the host which serves requests from Googlebar.

If the C<env_proxy> option is passed in with a TRUE value, then proxy
settings are read from environment variables (see
C<LWP::UserAgent::env_proxy>).

=back

=head1 QUERY METHOD

=over 4

=item $pr = $gpr->get('http://www.yahoo.com');

Queries Google for the pagerank of the specified URL and returns it. If the
query is successful, an integer value from 0 to 10 is returned. If the query
fails for some reason (google unreachable, url does not begin with 'http://'
or 'https://', undefined url passed) it returns C<undef>.

In list context this function returns a list of two elements, where the
first is the result as in scalar context and the second is the
C<HTTP::Response> object (returned by C<LWP::UserAgent::get>). This can be
useful for debugging purposes and for querying failure details.

=back

=head1 BUGS

If you find any, please report ;)

=head1 AUTHOR

Yuri Karaban E<lt>tech@askold.netE<gt>.

Algorithm of computing checksum taken from mozilla module pagerankstatus
by Stephane Queraud E<lt>squeraud@toteme.comE<gt>.

Algorithm was modified (15-09-2004) according to the new algorithm of
computing checksum in googlebar.

=head1 COPYRIGHT

Copyright 2004-2006, Yuri Karaban, All Rights Reserved.

You may use, modify, and distribute this package under the
same terms as Perl itself.