特点:

    就是自动爬取google咯,做坏事和查东西都能用,谁用谁知道。配置好配置文件之后,直接执行就好了。代码中每次查询间隔5秒,但一次查询100个url,所以速度不慢而且不被封。

Google Scanner 源码:

#!/usr/bin/perl -w
#By xti9er
require LWP::UserAgent;
use LWP::Simple;
use Color::Output;
Color::Output::Init;
$|=1;
#----------------------------
# Read the configuration file (first command-line argument, default
# "inurl.ini").  Lines starting with '#' are comments, "country=XX" selects
# the region filter (stored as "countryXX"), a bare "country=" leaves the
# filter unchanged (empty by default), and every other non-blank line is
# queued in @inurl as a search query.
#----------------------------
my $inrulfile=shift||"inurl.ini";
my $countryset="";
# Three-argument open with a lexical handle: the file name comes from the
# command line and must never be interpreted as an open mode or a pipe.
open(my $cfgfh,'<',$inrulfile) or die "Cannot open $inrulfile: $!";
while(my $line=<$cfgfh>)
{
        $line=~s/\r?\n\z//;        # drop LF as well as CRLF line endings
        next if $line=~/^#/;
        next if $line!~/\S/;       # a blank line is not a query
        if($line=~/^country=(\w+)/i)
        {
           $countryset="country$1";
        }
        elsif($line=~/^country=$/i)
        {
           # Explicitly empty: keep whatever filter is currently set.
        }
        else{
           push(@inurl,$line);
        }
}
close $cfgfh;
# Main driver.  For every query read from the configuration file, fetch the
# first Google result page to learn the hit count, then hand each 100-hit
# page to spider(), which prints and logs the result URLs.  After the last
# query the total running time is printed.
my $runstart=time;          # start of the whole run, for the final report
QUERY:
for my $inurl(@inurl)
{
    $inurl=~s/\s/\+/g;      # whitespace becomes '+' so the query fits in the URL
    # Regex-escaped copy of the query; only the disabled debug line below
    # reads it.
    $reginurl=$inurl;
    $reginurl=~s/\?/\\?/g;
    $reginurl=~s/\=/\\=/g;
    $reginurl=~s/\:/\\:/g;
    $reginurl=~s/\+/\\s/g;
    $sleeptime=5;           # package global on purpose: spider() reads it too
    #cprin("Sleeptime=[$sleeptime]\t Url=[$inurl]\t RegUrl=[$reginurl]\t country=[$countryset]\n",7);
    sleep(2);
    my $p75="-"x80;
    my $ua = LWP::UserAgent->new;
    $ua->timeout(20);
    $ua->env_proxy;
    $ua->agent("Mozilla/5.0");
    #print "Start ...\n";
    cprin("\t\t\t --=G o o g l e    \t S c a n n e r=-- \n",5);
    cprin("\t\t\t                     By xti9er \n",13);
    cprin("$p75\n",7);
    $stime=time;            # package global on purpose: spider() names its log file after it
    $lstime=localtime();
    cprin("\t\t\tStart at $lstime\n",13);
    my $searchweb="http://www.google.com.hk";
    my $starturl="$searchweb/search?num=100&complete=1&hl=zh-CN&cr=$countryset&newwindow=1&q=$inurl&start=0&sa=N/";
    print "[Google] page:$starturl\n";
    # get() always returns a response object, so failure is detected with
    # is_success rather than by testing the return value for truth.
    my $response = $ua->get($starturl);
    unless ($response->is_success)
    {
        cprin($response->status_line,5);
        next QUERY;
    }

    # Find the hit count that follows the resultStats marker.  Only the
    # text up to the next '<' is taken, so digits appearing later in the same
    # chunk are not glued onto the count.  The page is split into a private
    # list and scanning stops at the first match.
    my $ttpageno;
    for my $chunk (split(/href=/,$response->content))
    {
        next unless $chunk=~/id=resultStats>([^<]*)</;
        my $digits=$1;
        $digits=~s/\D//g;
        $ttpageno=length($digits) ? int($digits/100) : 0;
        last;
    }
    next QUERY unless defined $ttpageno;    # no hit count found: nothing to read

    cprin("\t\t\t$ttpageno Google Pages To Read!\n$p75\n",5);
    sleep(2);
    # $ttpageno counts pages of 100 hits; the start offset passed to
    # spider() is the page index times 100.
    for my $page (0..$ttpageno)
    {
        my $startpage=$page*100;
        sleep($sleeptime);
        cprin("Now Read The ".$startpage."th Page!\n--------------------------------------\n",13);
        # spider() reports whether there is anything left to read for this
        # query; when there is not, move on to the next query.
        last unless spider($startpage,$inurl);
    }
}

# spider($pageno, $inurl)
#   Fetch the result page of query $inurl that starts at offset $pageno,
#   print every result URL found on it and append the URLs to "$stime.log".
#   Reads the globals $sleeptime, $stime and $countryset.
#   Returns 1 when the page links to the following result page (offset
#   $pageno+100) and 0 otherwise.  0 is also returned when the log file
#   cannot be opened or the page cannot be fetched, so the caller stops
#   paging this query instead of repeating a failing request.
sub spider
{
    my $pageno=shift;
    my $inurl=shift;
    sleep($sleeptime);
    my $logfile="$stime.log";
    my $logfh;
    unless (open($logfh,'>>',$logfile))
    {
        cprin("Cannot open $logfile: $!",5);
        return 0;
    }
    my $searchweb="http://www.google.com.hk";
    my $searchurl="$searchweb/search?num=100&complete=1&hl=zh-CN&cr=$countryset&newwindow=1&q=$inurl&start=$pageno&sa=N/";
    my $nextpageno=$pageno+100;
    cprin("Now URL: $searchurl\n",7);
    my $ua = LWP::UserAgent->new;
    $ua->timeout(20);
    $ua->env_proxy;
    $ua->agent("Mozilla/5.0");
    my $response = $ua->get($searchurl);
    unless ($response->is_success)
    {
        print "Get page count faild!\n";
        cprin($response->status_line,5);
        close $logfh;
        return 0;
    }

    # Each chunk starts right after an "href="; collect result links and
    # look for the link to the following result page.
    my @sites;
    my $nextpage=0;
    for my $chunk (split(/href=/,$response->content))
    {
        if($chunk=~/^\"http\:\/\/(.*?)\"\starget=_blank\sclass=l/){
            push(@sites,"http://$1");
        }
        if($chunk=~/\/search\?.*?q=.*?start=$nextpageno\&amp\;sa=N/){
            $nextpage++;
        }
    }

    # Drop duplicate URLs while keeping first-seen order.
    my %seen=();
    @sites = grep { !$seen{$_} ++ } @sites;
    for my $nowsite (@sites)
    {
        print "URL:$nowsite\n";
        print {$logfh} $nowsite,"\n";
    }
    # Buffered write errors only show up when the handle is closed.
    close($logfh) or cprin("Cannot close $logfile: $!",5);

    if($nextpage==0)
    {
        cprin("\t\t\tThe End\n",13);
        return 0;
    }
    return 1;
}

# Final report: total running time of the scan.  The SPIDEREND label is kept
# as a marker only; spider() no longer jumps to it.
SPIDEREND:
$etime=time;
$ttime=$etime-$runstart;
print stime($ttime);
# cprin($str, $i)
#   Print $str through cprint (Color::Output), prefixed with "\x03" and the
#   code $i and followed by a newline and "\x030" -- presumably "switch to
#   colour $i" and "back to colour 0"; confirm against the Color::Output docs.
#   A newline is always appended, in addition to any the caller supplies.
#   The arguments are held in lexicals so that calling cprin never overwrites
#   the package variables $str and $i.
sub cprin
{
   my ($str,$i)=@_;
   cprint("\x03" . $i . "$str\n\x030");
}
# stime($seconds)
#   Format a duration given in seconds as "H hours M mins S secs".
#   Hours are not capped at 24, and whatever is left after removing whole
#   hours and whole minutes is reported unchanged as the seconds part.
sub stime
{
   my $remaining=shift;

   # Peel off whole hours first, then whole minutes, from a running remainder.
   my $hours=int($remaining/3600);
   $remaining-=$hours*3600;

   my $minutes=int($remaining/60);
   $remaining-=$minutes*60;

   return ("$hours hours $minutes mins $remaining secs");
}

配置文件inurl.ini如下格式:

#########################
#=任意区域
#countryTD=乍得
#countryGI=直布罗陀
#countryCL=智利
#countryCF=中非共和国
#countryCN=中国
#countryMO=中国澳门特别行政区
#countryHK=中国香港特别行政区
country=all
inurl:/index.htm
#########################