Google Scanner 源码:
#!/usr/bin/perl -w
#By xti9er
require LWP::UserAgent;
use LWP::Simple;
use Color::Output;
Color::Output::Init;
$|=1;
#----------------------------
# Load the search strings from the configuration file.
#
# The file name is the first command-line argument, or "inurl.ini" when
# none is given.  Line handling:
#   - lines starting with '#' are comments and are skipped;
#   - blank lines are skipped (they would otherwise become empty queries);
#   - "country=XX" sets $countryset to "countryXX" (used as the cr= URL
#     parameter when search URLs are built);
#   - "country=" with no value is accepted and leaves $countryset as is;
#   - every other line is appended to @inurl as one search string.
#----------------------------
my $inrulfile=shift||"inurl.ini";
my $countryset="";
# Three-argument open with a lexical handle: the file name comes from the
# command line and must not be interpreted as an open mode or a pipe.
open(my $inurlfh,'<',$inrulfile) or die "Cannot open $inrulfile: $!";
while(my $line=<$inurlfh>)
{
chomp($line);
# Tolerate files saved with CRLF line endings.
$line=~s/\r\z//;
next if $line=~/^#/;
next if $line!~/\S/;
if($line=~/^country=(\w+)/i)
{
$countryset="country$1";
}
elsif($line=~/^country=\s*$/i)
{
# Explicitly empty country: keep the current (unrestricted) setting.
}
else{
push(@inurl,$line);
}
}
close $inurlfh;
# Wall-clock start of the whole run; used for the elapsed-time report that
# is printed after every query has been processed.
my $scanstart=time;

# Main loop: one pass per search string collected in @inurl.
QUERY:
for my $inurl(@inurl)
{
# Whitespace becomes '+' so the query can be embedded in the search URL.
$inurl=~s/\s/\+/g;
# Regex-escaped form of the query; only referenced by the debug line below.
$reginurl=$inurl;
$reginurl=~s/\?/\\?/g;
$reginurl=~s/\=/\\=/g;
$reginurl=~s/\:/\\:/g;
$reginurl=~s/\+/\\s/g;
# Package global on purpose: spider() reads it for its own delay.
$sleeptime=5;
#cprin("Sleeptime=[$sleeptime]\t Url=[$inurl]\t RegUrl=[$reginurl]\t country=[$countryset]\n",7);
sleep(2);
my $p75="-"x80;
#print "Start ...\n";
cprin("\t\t\t --=G o o g l e \t S c a n n e r=-- \n",5);
cprin("\t\t\t By xti9er \n",13);
cprin("$p75\n",7);
# Package global on purpose: spider() names its log file "$stime.log".
$stime=time;
my $lstime=localtime();
cprin("\t\t\tStart at $lstime\n",13);
my $starturl=_search_url($inurl,0);
print "[Google] page:$starturl\n";
# LWP::UserAgent::get always returns a response object, so failure is
# detected through is_success rather than through the return value.
my $response=_new_agent()->get($starturl);
unless($response->is_success)
{
cprin($response->status_line,5);
next QUERY;
}
# Extract the result count.  The capture stops at the first '<' so digits
# from any later markup on the same line are not folded into the number.
my $ttpageno;
if($response->content=~/id=resultStats>([^<]*)</)
{
(my $digits=$1)=~s/\D//g;
$ttpageno=length($digits)?int($digits/100):0;
}
unless(defined $ttpageno)
{
cprin("No result count found for [$inurl]\n",5);
next QUERY;
}
cprin("\t\t\t$ttpageno Google Pages To Read!\n$p75\n",5);
sleep(2);
# $ttpageno counts 100-result pages while $startpage is a result offset,
# so the bound has to be scaled by 100 to compare like with like.
for(my $startpage=0;$startpage<=$ttpageno*100;$startpage+=100)
{
sleep($sleeptime);
cprin("Now Read The ".$startpage."th Page!\n--------------------------------------\n",13);
# spider() returns false once no next-page link exists; stop paging for
# this query only and carry on with the next search string.
last unless spider($startpage,$inurl);
}
}

# Report the total running time of the scan.
my $etime=time;
my $ttime=$etime-$scanstart;
print stime($ttime);

# Build a fresh user agent with the settings used for every request
# (20 second timeout, proxy settings from the environment, fixed UA string).
sub _new_agent
{
my $ua = LWP::UserAgent->new;
$ua->timeout(20);
$ua->env_proxy;
$ua->agent("Mozilla/5.0");
return $ua;
}

# Build the search URL for query string $inurl starting at result offset
# $start.  $countryset (set while reading the config) fills the cr= parameter.
sub _search_url
{
my ($inurl,$start)=@_;
my $searchweb="http://www.google.com.hk";
return "$searchweb/search?num=100&complete=1&hl=zh-CN&cr=$countryset&newwindow=1&q=$inurl&start=$start&sa=N/";
}

# Fetch one result page and record the result links found on it.
#
# Arguments:
#   $pageno - result offset of the page to fetch (0, 100, 200, ...)
#   $inurl  - query string, already prepared for use in a URL
#
# Reads the package globals $sleeptime (delay before the request) and
# $stime (the log is appended to "$stime.log").  Each distinct link is
# printed to STDOUT and appended to the log.
#
# Returns 1 when the caller should go on to the following page.  This
# includes a failed log open or page fetch, which are reported and skipped,
# as before.  Returns 0 when the page has no link to the next result
# offset, i.e. the result list is exhausted.
sub spider
{
my ($pageno,$inurl)=@_;
sleep($sleeptime);
# Lexical handle: it is closed automatically on every return path.
open(my $wlog,'>>',"$stime.log") or do {
cprin($!,5);
return 1;
};
my $searchurl=_search_url($inurl,$pageno);
my $nextpageno=$pageno+100;
cprin("Now URL: $searchurl\n",7);
my $response=_new_agent()->get($searchurl);
unless($response->is_success)
{
print "Get page count faild!\n";
cprin($response->status_line,5);
return 1;
}
# Local list: the caller's data must not be touched from here.
my @chunks=split(/href=/,$response->content);
my @sites;
my $nextpage=0;
for my $urlre(@chunks)
{
if($urlre=~/^\"http\:\/\/(.*?)\"\starget=_blank\sclass=l/){
push(@sites,"http://$1");
}
# Accept "&sa=N", "&amp;sa=N" and the previously required "&;sa=N".
# NOTE(review): the old pattern demanded a literal "&;", which looks like
# a mangled "&amp;" - confirm against the markup actually served.
if($urlre=~/\/search\?.*?q=.*?start=$nextpageno\&(?:amp\;|\;)?sa=N/){
$nextpage++;
}
}
# Drop duplicate links while keeping first-seen order.
my %seen=();
@sites = grep { !$seen{$_} ++ } @sites;
for my $nowsite (@sites)
{
print "URL:$nowsite\n";
print {$wlog} $nowsite,"\n";
}
close $wlog;
if($nextpage==0)
{
cprin("\t\t\tThe End\n",13);
return 0;
}
return 1;
}
# Print a message through Color::Output's cprint().
#
# Arguments:
#   $str - text to print; a newline is always appended
#   $i   - colour number, placed directly after a \x03 control byte
#
# The string handed to cprint() is "\x03<i><str>\n\x030".
# NOTE(review): the trailing "\x030" presumably switches back to the default
# colour - confirm against the Color::Output documentation.
#
# Returns whatever cprint() returns.
sub cprin
{
# Lexicals, so same-named package variables elsewhere are not overwritten.
my ($str,$i)=@_;
return cprint("\x03" . $i . "$str\n\x030");
}
# Format a duration as "<H> hours <M> mins <S> secs".
#
# Argument:
#   a number of seconds (the caller passes the difference of two time() values)
#
# Returns the formatted string; hours are not capped at 24.
sub stime
{
my $total=shift;
my $hour=int($total/(60*60));
my $minute=int(($total-($hour*60*60))/60);
my $second=$total-$hour*60*60-$minute*60;
return ("$hour hours $minute mins $second secs");
}