@
webflier @
siw 搜索蜘蛛抓页面，无所谓啦，我 user-agent 都用 bing 的。抓取器终于支持 gzip 了，只考虑了 chunked、gzip，还有什么都没有的情况，暂时都能正常出人类可见文本。
<code>
<?php
/**
 * Decode an HTTP/1.1 "Transfer-Encoding: chunked" message body.
 *
 * Every chunk has the form "<hex size>[;extensions]\r\n<data>\r\n". A chunk
 * of size zero ends the body; anything after it (trailer headers) is ignored.
 * Whitespace in front of a chunk-size line is tolerated.
 *
 * Terminates the script with die() when no chunk-size line is found where
 * one is expected.
 *
 * @param string $str Raw chunked body (the part after the response headers).
 * @return string The chunk payloads concatenated in order.
 */
function dechunked($str){
    $decoded = '';
    $rest = (string) $str;
    while (true) {
        // Stop when only whitespace is left, or when all that remains is a
        // bare "0": a terminating chunk whose line ending is missing.
        $pending = trim($rest);
        if ($pending === '' || $pending === '0') break;

        // Anchor at the very start of the remaining data (\A, no "m" flag):
        // the offsets below are only valid if the size line begins there.
        // The message text is kept as-is; it fires when the data does NOT
        // look like a chunked body.
        if (!preg_match('/\A\s*([\da-fA-F]+)[^\r\n]*\r\n/', $rest, $m)) die('seem to be a chunked message');

        $size = hexdec($m[1]);
        // The zero-size chunk is the terminator; trailers may follow it.
        if ($size === 0) break;

        $headerLen = strlen($m[0]);
        $decoded .= substr($rest, $headerLen, $size);
        // Skip the payload plus the CRLF that closes the chunk. The cast
        // covers substr() returning false past the end on older PHP versions.
        $rest = (string) substr($rest, $headerLen + $size + 2);
    }
    return $decoded;
}
/**
 * Fetch a URL with a hand-written HTTP/1.1 GET request and return the
 * decoded response body.
 *
 * The request identifies itself as bingbot and asks for gzip/deflate. The
 * response is read until the server closes the connection, then de-chunked
 * (via dechunked()) and decompressed according to its headers.
 *
 * Side effects: prints the request headers and the response headers with
 * print_r() before returning. Terminates the script with
 * die('{gate_closed}') when the URL is unusable, the connection fails or the
 * response is empty.
 *
 * @param string $url Absolute URL. A port in the URL is honoured; https is
 *                    used when the "ssl" stream transport is available.
 * @return string|false Decoded body, or false when decompression fails.
 */
function open($url){
    // The URL is copied verbatim into the Referer header, so a CR or LF in
    // it would let the caller inject extra request headers.
    if (!is_string($url) || preg_match('/[\r\n]/', $url)) die('{gate_closed}');

    $parts = parse_url($url);
    if ($parts === false || empty($parts['host'])) die('{gate_closed}');

    $host = $parts['host'];
    $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';
    // Without the ssl transport fall back to plain HTTP, as before.
    $secure = ($scheme === 'https') && in_array('ssl', stream_get_transports(), true);
    $defaultPort = $secure ? 443 : 80;
    $port = isset($parts['port']) ? (int) $parts['port'] : $defaultPort;

    // An empty path would produce the invalid request line "GET  HTTP/1.1".
    $file = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
    if (isset($parts['query']) && $parts['query'] !== '') $file .= '?'.$parts['query'];
    // A literal space would end the request target early.
    $file = str_replace(' ', '%20', $file);

    $hostHeader = ($port === $defaultPort) ? $host : $host.':'.$port;

    $fp = fsockopen(($secure ? 'ssl://' : '').$host, $port, $errno, $errstr, 15);
    if (!$fp) die('{gate_closed}');

    $head = "GET $file HTTP/1.1\r\n";
    $head.= "Host: $hostHeader\r\n";
    // Must stay on one line: a line break inside the literal splits the header.
    $head.= "User-Agent: Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)\r\n";
    $head.= "Referer: $url\r\n";
    $head.= "Accept-Encoding: gzip,deflate\r\n";
    $head.= "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n";
    // "Connection: Close" makes the server end the stream after the body,
    // which is what terminates the read below.
    $head.= "Connection: Close\r\n\r\n";

    fwrite($fp, $head);
    $resp = stream_get_contents($fp);
    fclose($fp);
    if (empty($resp)) die('{gate_closed}');

    // Split headers from body at the first blank line; the body may be absent.
    $temq = explode("\r\n\r\n", $resp, 2);
    $respHead = $temq[0];
    $body = isset($temq[1]) ? $temq[1] : '';

    // Header dump kept from the original: it is part of the page's output.
    print_r($head);
    print_r($respHead);

    if (preg_match('/^Transfer-Encoding:[^\r\n]*chunked/mi', $respHead)) {
        $body = dechunked($body);
    }

    $encoding = '';
    if (preg_match('/^Content-Encoding:[ \t]*([^\r\n]*)/mi', $respHead, $m)) {
        $encoding = strtolower(trim($m[1]));
    }

    // Nothing to decompress (no body, or an encoding we did not ask for).
    if ($body === '' || $encoding === '') return $body;

    if ($encoding === 'gzip' || $encoding === 'x-gzip') {
        // gzdecode() parses the real gzip header; stripping a fixed 10 bytes
        // only works when the header has no optional fields. Keep the old
        // approach as a fallback for PHP builds without gzdecode().
        return function_exists('gzdecode') ? gzdecode($body) : gzinflate(substr($body, 10));
    }

    if ($encoding === 'deflate') {
        // Servers send either a zlib-wrapped stream or raw deflate under this
        // name. A zlib header has compression method 8 in the low nibble and
        // its first two bytes form a multiple of 31.
        $b0 = ord($body[0]);
        $b1 = strlen($body) > 1 ? ord($body[1]) : 0;
        $isZlib = (($b0 & 0x0F) === 8) && (((($b0 << 8) | $b1) % 31) === 0);
        return $isZlib ? gzuncompress($body) : gzinflate($body);
    }

    return $body;
}
// Entry point: fetch the page named by the "url" query parameter and echo it.
// The is_string() check rejects array input such as ?url[]=x, which would
// otherwise reach parse_url() inside open().
// NOTE(review): any client can make this server request arbitrary hosts
// (server-side request forgery) and the fetched markup is echoed unescaped;
// restrict the allowed hosts before exposing this publicly.
if (!empty($_GET['url']) && is_string($_GET['url'])) echo open($_GET['url']);
?>
</code>