#cpan https://cpan.metacpan.org/authors/id/S/SR/SRI/Mojolicious-7.31.tar.gz
#cpanm -n Mojolicious@7.31
use feature ':5.10';
use strict;
use warnings;
use utf8;
use Mojo;
use Encode qw(decode encode);
##########################################################################
$ENV{MOJO_REACTOR} = 'Mojo::Reactor::EV';
#使用 EV 具有更好的性能
my $ua = Mojo::UserAgent->new;
$ua->inactivity_timeout(60);
$ua->connect_timeout(60);
$ua->request_timeout(60);
#适当延长超时的时间,阻止过早的 http 请求失败,会有更好的性能
$ua->max_connections(1000);
#最大连接数 1000
$ua->max_redirects(0);
#阻止 http3xx 重定向
$ua->transactor->name('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0');
#使用正常浏览器的 user agent
$ua->cookie_jar->ignore( sub { 1 } );
#禁用 Mojo::UserAgent 自动处理 cookie
$ua->proxy->http('http://127.0.0.1:8080')->https('http://127.0.0.1:8080');
#使用代理服务器
##########################################################################
my @list = ();
#原始队列
my @urllist = ();
#下载队列
my $n = 0;
#下载数量
my $m = 0;
#出错数量
my $produce_num = 0;
#生产者数量
my $consumer_num = 0;
#消费者数量
my $cookie_num = 0;
#cookie 数量
my @cookielist = ();
#使用的 cookie 队列
my %cookieinvalid = ();
#失效的 cookie 散列
##########################################################################
open FILEIN, '<', "./url.txt" or die "$!";
while (<FILEIN>) {
my $content = $_;
chomp($content);
$content =~ s/\r//;
push( @list, $content );
}
close FILEIN;
#导入下载列表
##########################################################################
sub append_txt_to_file {
my $file_name = $_[0];
my $txt = $_[1];
local *FH;
open FH, '>>', $file_name;
print FH $txt;
close FH;
}
sub write_txt_to_file {
my $file_name = $_[0];
my $txt = $_[1];
local *FH;
open FH, '>', $file_name;
print FH $txt;
close FH;
}
my %safe_character = (
'<' => '<',
'>' => '>',
':' => ':',
'"' => '"',
'/' => '/',
'\\' => '\',
'|' => '|',
'?' => '?',
'*' => '*',
);
sub repace_safe {
my $per_char = $_[0];
my $one_txt = $_[1];
my $output_char;
if ( exists $safe_character{$per_char} ) {
$output_char = $safe_character{$per_char};
}
else {
$output_char = $per_char;
}
return $output_char;
}
sub find {
my $html_bin = $_[0];
my $id = $_[1];
if ( $html_bin =~ m/<\/html>/ ) {
return '####';
}
else {
return '@@@';
}
}
##########################################################################
sub get_multiplex {
my $id = $_[0];
my $delay = Mojo::IOLoop->delay( sub { get_multiplex($id) } );
#get_multiplex 递归迭代的开始标记
#$id 是每一个线程(端口的序号)
my $end = $delay->begin;
Mojo::IOLoop->timer( 0.1 => $delay->begin );
#每个 http 请求前暂停 0.1s
if ( scalar @urllist == 0 ) {
if ( $produce_num == $consumer_num ) {
Mojo::IOLoop->stop;
#异步循环结束
#当队列数量为 0 ,且所有的线程数据都处理完毕的时候,终止事件循环
#return 存在一个递归返回链,这里可以更快地结束
}
return;
#这里返回后异步任务数量为 0 时,系统会自动结束异步循环,不过速度较慢
#return 返回闭包函数的开始,并结束闭包函数,下面不开启递归自身
}
else {
my $object = shift @urllist;
$produce_num++;
my $url = $object;
my $filename = $object;
$filename =~ s/^http:\/\/www\.bing\.com\/w\///m;
$filename =~ s/(.)/repace_safe($1)/eg;
$filename = "./www.bing.com/" . $filename . ".html";
if ( -e $filename ) {
syswrite STDERR, encode( 'utf8', $n . "\t" . $object . "\t 跳过\n" ); #STDOUT 编码已改,输送到 STDOUT 会出现错误
$consumer_num++;
$end->();
}
else {
my $build_tx = $ua->build_tx( GET => $url );
$build_tx->req->headers->remove('Accept-Encoding');
#阻止网页压缩,保证更好的性能
$build_tx->req->headers->add( 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' );
$build_tx->req->headers->add( 'Accept-Language' => 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2' );
$ua->transactor->name( 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:' . int( rand(900) ) . ') Gecko/' . int( rand(40000001) ) . ' Firefox/' . int( rand(900) ) . '.0' );
#使用 2 万个 cookie
$ua->start(
$build_tx => sub {
my ( $ua, $tx ) = @_;
if ( !$tx->is_finished ) {
push( @urllist, $object );
syswrite STDERR, "http 传输未完成" . "\n";
syswrite STDERR, encode( 'utf8', $url . "\t" . $tx->error->{message} . "\n" );
}
else {
my $code = '';
$code = $tx->res->code if defined $tx->res->code;
if ( $code =~ /\A2/ ) {
my $size = $tx->res->content->asset->size;
my $content_length = $tx->res->headers->to_hash->{'Content-Length'};
if ( ( $size == $content_length ) || !( defined $content_length ) ) {
my $outnum = find( $tx->res->body, $id );
if ( $outnum ne '@@@' ) {
append_txt_to_file( "url.txt", $object . "\t" . $outnum . "\n" );
write_txt_to_file( $filename, $tx->res->body );
$n++;
syswrite STDERR, encode( 'utf8', $n . "\t" . $object . "\n" );
}
else {
$n++;
syswrite STDERR, encode( 'utf8', $m . "\t" . $object . "\t 网页下载完整但未提取到数据\n" );
}
}
else {
$m++;
push( @urllist, $object );
syswrite STDERR, encode( 'utf8', $m . "\t" . $object . "\t 网页未下载完整\n" );
}
}
elsif ( $code =~ /\A4/ ) {
syswrite STDERR, encode( 'utf8', $m . "\t" . $object . "\thttp 4xx\n" );
push( @urllist, $object );
Mojo::IOLoop->timer( 0.5 => $delay->begin );
#http 404
}
else {
$m++;
push( @urllist, $object );
syswrite STDERR, encode( 'utf8', $m . "\t" . $object . "\t 未发现 http code, http 3xx, http 5xx\n" );
#标记失效的从 cookie http 3xx
Mojo::IOLoop->timer( 0.5 => $delay->begin );
#服务器返回 5xx ,暂停 0.5s
#未发现 http code, http 302, http 503
}
}
$consumer_num++;
$end->();
#get_multiplex 递归迭代的结束标记
#从这里跳转到下一个 get_multiplex
}
);
}
}
}
##########################################################################
$produce_num = 0;
$consumer_num = 0;
@urllist = @list;
#异步下载前的变量准备
foreach my $id ( 1 .. 50 ) { get_multiplex($id) }
#使用 50 个线程(端口)下载
#如果线程数是 100 ,限制最大 cookie 数无法生效,并且 EV 会出现错误
Mojo::IOLoop->start;
#异步循环启动
##########################################################################
这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。
V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。
V2EX is a community of developers, designers and creative people.