package Extractor;
use strict;
use MyUtils;
use HTML::TreeBuilder;
use Fetcher;
use Carp;
#Êý×Ö±¨×¨ÓÃ#
sub extract_nav_link{
    my ($class,$seed,$is_utf) = @_;
    my $content = Fetcher->fetch($seed) ;
    if($is_utf){
        $content =MyUtils->u8_to_gbk($content);
    }
    my $tree= HTML::TreeBuilder->new_from_content($content);
    my $addr = q/@0.0.3/;
    my $str =$tree->address($addr)->attr('content');
    $tree->delete;
    $str =~s/0;\sURL=//;
    my $seed_uri = URI->new($seed);
    my $index_uri = URI->new_abs( $str, $seed_uri );
    return $index_uri ; 
}


####extract current page article links####
sub extract_ak_lks{
    my $class = shift ;
    my $index_uri  = shift || Carp->croak("ÐèÒªindex_uri¡£¡£¡£¡£¡£");
    my $cp_ak_addr = shift || Carp->croak("ÐèÒªµØÖ·¡£¡£¡£¡£¡£"); 
    my $is_utf =shift;
    my $nav_page_content =Fetcher->fetch($index_uri) ;
    if($is_utf){
        $nav_page_content =MyUtils->u8_to_gbk($nav_page_content );
    }

    my $tree= HTML::TreeBuilder->new_from_content($nav_page_content);
#µ±Ç°°æÃæÖ÷ÒªÐÂÎÅÁ´½Ó currentpage_article_addr
    my $cp_aks_tree=$tree->address($cp_ak_addr);
    my @cp_aks;
    for (@{ $cp_aks_tree->extract_links("a") }) {
        my($link, $element, $attr, $tag) = @$_;
        my $abs_link = URI->new_abs( $link, $index_uri );
        push @cp_aks,$abs_link;

    }
    $tree->delete;

    return @cp_aks; 
}
##extract article content######
sub extract_ak{
    my $class = shift ;
    my $ak_uri  = shift || Carp->croak("ÐèÒªak_uri¡£¡£¡£¡£¡£");
    my $is_utf =shift;   
    my $content =Fetcher-> fetch($ak_uri);
    if($is_utf){
        $content=MyUtils->u8_to_gbk($content);
    }
    my $tree= HTML::TreeBuilder->new_from_content($content);
###²»Í¬ÍøÕ¾Ê¹Óò»Í¬µ«ÊÇÏà¶Ô¹Ì¶¨µÄÄ£°å####nav_page=>(article_page_link)=>article
    my $addr_title =q/@0.0.1/;
    my $addr_content=q/@0.1.4.0.0.4.1.0.0/;#q/@0.1.4.0.1.5.0.0.0.0.0.0/;
    my $title=$tree->address($addr_title)->as_text();
    my $news = $tree->address($addr_content)->as_text();
    $tree->delete;
    #print $news;
    my $time = localtime(time);
    my %ak = ("uri",$ak_uri,"title",$title,"content",$news,"ftime",$time);
    return %ak;
}


#######################################
#ͨÓ÷½·¨                              
#######################################
###³éÈ¡Á´½Ó,¸ùÏÂËùÓÐ<A>ÀàÐͳ¬Á´½Ó###
sub extract_links{
    my $class = shift ;
    my $index_uri  = shift || Carp->croak("ÐèÒªpageÒ³Á´½Ó¡£¡£¡£¡£¡£");
    my $lk_addr = shift; 

    print "\nץȡ---------".$index_uri;
    my $c =Fetcher->fetch($index_uri);

    my $nav_page_content =MyUtils->u8_to_gbk($c) if $c;
    my $tree= HTML::TreeBuilder->new_from_content($nav_page_content);
#µ±Ç°°æÃæÖ÷ÒªÐÂÎÅÁ´½Ó currentpage_article_addr
    my $cp_lks_tree=$tree->address($lk_addr);
#    print "\n treeeeeee----------",$cp_lks_tree->address();
    my @cp_lks;
    for (@{ $cp_lks_tree->extract_links("a") }) {
        my($link, $element, $attr, $tag) = @$_;
        my $abs_link = URI->new_abs( $link, $index_uri );
        push @cp_lks,$abs_link;

    }
    $tree->delete;

    return @cp_lks;
}
##³éÈ¡Îı¾ÄÚÈÝ,µ÷ÓÃ×ÓÊ÷as_text·½·¨###
sub extract_text{
    my $class = shift ;
    my $ak_uri  = shift || Carp->croak("ÐèÒªÐÂÎÅÁ´½Ó");
    my $content_addr = shift || Carp->croak("ÐèÒªÐÂÎÅÄÚÈÝλÖÃÐÅÏ¢");    
    my $is_utf = shift;
    my $content =Fetcher-> fetch($ak_uri);
    if($is_utf) {
        $content=MyUtils->u8_to_gbk( $content) if $content;
    } 
    $content =~s/&.*quo;//g;##²»È¥µô»áÂÒÂëµÄ£¬Õã½­Ò©¼àÍø
    my $tree= HTML::TreeBuilder->new_from_content($content);
    my $stree =$tree->address($content_addr);
    if($stree == undef) {
        return '';
    }
    my $news = $stree->as_text() ;
    $tree->delete;
    return $news || '';
}
##´ÓÄ£°åÖлñµÃµØÖ·######
##############
#Ä£°å²»ÄÜʹÓÃä¯ÀÀÆ÷ÓÅ»¯¹ýµÄ´úÂ룬µÃʹÓóÌÐòÏÂÔØµÄ°æ±¾£¬·ñÔò²»Æ¥ÅäµÄŶ¡£Ë¥~~~~
#######
sub extract_cont_addr{
    #  my ($class,$tmpl_file ) = @_;
    my $class = shift;
    my $tmpl_file =shift;
    my @crit =@_;
    @crit = ("id","my_content") unless @_;
    
    my $tree =HTML::TreeBuilder->new_from_file($tmpl_file);

    if(wantarray){
        my @strees =$tree->look_down("id","my_content");# || $tree;
        croak qq"
        Error:No desired content marked in the template\t$tmpl_file!
        ÇëÏȱê¼ÇÄ£°åÎļþ¡£
        \n
        " unless @strees;

        my @cont_addrs=();
        for my $stree ( @strees){
            push @cont_addrs,$stree->address();
        }
        $tree->delete;
        return @cont_addrs;
    }else{

        my $stree =$tree->look_down("id","my_content");# || $tree;
        croak qq"
        Error:No desired content marked in the template\t$tmpl_file!!
        ÇëÏȱê¼ÇÄ£°åÎļþ¡£
        \n
        " unless $stree;
        my $addr = $stree->address();
        $tree->delete;
        return $addr;
    }

}

###¸ù¾Ý¹ýÂËÌõ¼þ£¬·µ»ØÒ»¸ö»òÕß¶à¸öµØÖ·######
sub extract_addrs{
    my $class = shift;
    my $tmpl_file =shift;
    my @crit =@_;
    @crit = ("id","my_content") unless @_;
#    print "crit----------",@crit;

    my $tree =HTML::TreeBuilder->new_from_file($tmpl_file);

    my @strees =$tree->look_down(@crit);
    my @cont_addrs ;
    for my $stree ( @strees){
        my $addr =$stree->address();
        push @cont_addrs,$addr;
        print $stree->address(),"\n";
    }
    $tree->delete;
    if(wantarray){
        return @cont_addrs ;
    }else{
        return pop @cont_addrs;
    } 
}

1;