php网页采集 相对高效版
相对前面写的版本,极大地减小了IO开销,减少了对主机的解析
<?php
header("content-type: text/html; charset=utf-8");

/**
 * Small socket-based HTTP client used as a page collector.
 *
 * Starting from one URL it downloads the page, stores it under ./download/,
 * looks for the "下一页" (next page) link inside <ul class="image"> and keeps
 * following it. When the next-page link points back at the current page, the
 * group is considered finished and the link to the next group is taken from
 * <ul class="page">.
 *
 * Only plain http:// is supported (the socket is opened without TLS).
 * The code sticks to PHP 5.4-level syntax.
 */
class HttpWrap
{
    /** Connect / read timeout in seconds. */
    public $timeout = 10;
    /** Human-readable result of the most recent operation. */
    public $status = "";
    public $host;
    public $port = 80;
    private $ip;
    private $conn;
    private $path;
    private $url;
    private $scheme;
    public $http_method = "GET";
    public $http_version = "HTTP/1.1";
    public $agent = "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0";
    public $accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";
    /** Non-empty value enables "Accept-encoding: gzip". */
    public $gzip = "gzip";
    public $referer;
    /** Either a raw cookie string or an array of name => value pairs. */
    public $cookie;
    public $submit_type = "application/x-www-form-urlencoded";
    private $accept_language = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    public $connection = "keep-alive";
    private $cmd_line;
    private $header;
    public $post_content;
    private $redirect;
    private $is_gzip;
    public $response_num;
    public $response_header;
    public $response_body_length = 0;
    public $response_body;
    /** Relative link of the next page, or false when the page has none. */
    public $roll_link;
    public $roll_group;
    public $filename;
    /** Source encoding of the pages; detected on first use when left empty. */
    public $encoding;

    /** Query string of the current URL ("" when absent). */
    private $query = "";
    /** Host name that $ip was resolved for, so DNS is only queried on host change. */
    private $resolved_host;
    /** True when the current response uses chunked transfer encoding. */
    private $is_chunked = false;
    /** True when the current response carried a Content-Length header. */
    private $has_content_length = false;

    /**
     * Crawl starting at $url until no further page or group can be found.
     *
     * The method terminates the script when the crawl ends: with "没有下一页"
     * on a normal end, or with the current $status text when a request failed.
     *
     * @param string $url Absolute http:// URL of the first page.
     */
    public function init($url)
    {
        // Iterating (instead of recursing once per page) keeps the call stack
        // flat, and the visited set stops redirect or group-link cycles.
        $visited = array();
        $failed = false;

        while (is_string($url) && $url !== "") {
            if (isset($visited[$url])) {
                $this->status = "检测到重复链接,停止采集";
                $failed = true;
                break;
            }
            $visited[$url] = true;

            if (!$this->prepare($url) || !$this->connect()) {
                $failed = true;
                break;
            }

            $ok = $this->sendRequest();
            // One socket per request: always release it, success or not.
            $this->disconnect();
            if (!$ok) {
                $failed = true;
                break;
            }

            $url = $this->nextUrl();
        }

        die($failed ? $this->status : "没有下一页");
    }

    /**
     * Parse $url, resolve the host if needed and reset all per-request state.
     *
     * @param string $url
     * @return bool False when the URL is unusable or the host cannot be resolved.
     */
    private function prepare($url)
    {
        $parts = parse_url($url);
        if ($parts === false || empty($parts["host"])) {
            $this->status = "URL无效: " . $url;
            return false;
        }

        $this->url = $url;
        $this->host = $parts["host"];
        $this->path = empty($parts["path"]) ? "/" : $parts["path"];
        $this->query = isset($parts["query"]) ? $parts["query"] : "";
        $this->scheme = isset($parts["scheme"]) ? $parts["scheme"] : "http";
        if (!empty($parts["port"])) {
            $this->port = $parts["port"];
        }

        // Resolve once per host; later pages on the same host reuse the IP.
        if ($this->resolved_host !== $this->host) {
            $ip = gethostbyname($this->host);
            // gethostbyname() hands back its argument unchanged on failure.
            if ($ip === $this->host && filter_var($ip, FILTER_VALIDATE_IP) === false) {
                $this->status = "dns查询失败";
                return false;
            }
            $this->ip = $ip;
            $this->resolved_host = $this->host;
        }

        // State left over from the previous response must not leak into this one.
        $this->header = "";
        $this->redirect = null;
        $this->is_gzip = false;
        $this->is_chunked = false;
        $this->has_content_length = false;
        $this->response_num = null;
        $this->response_header = array();
        $this->response_body_length = 0;
        $this->response_body = "";
        $this->roll_link = false;

        return true;
    }

    /**
     * Decide which URL to request after the current response.
     *
     * @return string|null Next URL, or null when the crawl is finished.
     */
    private function nextUrl()
    {
        if ($this->redirect) {
            // Only redirects that stay on the same host over plain http are followed.
            $same_host = '#^http://' . preg_quote($this->host, '#') . '(?=[:/?]|$)#i';
            if (!preg_match($same_host, $this->redirect)) {
                return null;
            }
            $this->referer = $this->host . parse_url($this->redirect, PHP_URL_PATH);
            return $this->redirect;
        }

        if (!$this->roll_link) {
            return null;
        }

        // The next-page link is relative to the directory of the current URL.
        $next_url = substr($this->url, 0, strripos($this->url, "/") + 1) . $this->roll_link;

        $current_page = strtolower(trim(basename($this->url, ".html")));
        $next_page = strtolower(trim(basename($next_url, ".html")));
        if ($current_page != $next_page) {
            return $next_url;
        }

        // "Next page" pointing at the current page means this group is done.
        $next_group = $this->getNextGroup($this->response_body);
        if ($next_group === null) {
            return null;
        }
        echo "<font color=\"color\">即将采集下一组</font><br />";
        sleep(1);
        return $next_group;
    }

    /**
     * Open the TCP connection to the resolved IP.
     *
     * @return bool True on success; on failure $status describes the error.
     */
    private function connect()
    {
        $errno = 0;
        $errstr = "";
        $this->conn = fsockopen($this->ip, $this->port, $errno, $errstr, $this->timeout);

        if ($this->conn) {
            // Without a read timeout a stalled server would block the crawl forever.
            stream_set_timeout($this->conn, (int) $this->timeout);
            $this->status = "链接成功";
            return true;
        }

        switch ($errno) {
            case -3:
                $this->status = "创建socket链接失败";
                break;
            case -4:
                $this->status = "dns查询失败";
                break;
            case -5:
                $this->status = "链接被拒绝或超时";
                break;
            default:
                $this->status = "创建连接失败";
        }
        return false;
    }

    /** Close the socket if one is open. */
    private function disconnect()
    {
        if (is_resource($this->conn)) {
            fclose($this->conn);
        }
        $this->conn = null;
    }

    /**
     * Send the request for the current URL and process the response.
     *
     * Response headers are always parsed; the body is read, saved and scanned
     * for the next-page link only when the status code is 200.
     *
     * @return bool False when the request could not be written or no valid
     *              response header was received.
     */
    private function sendRequest()
    {
        $target = $this->path;
        if ($this->query !== "") {
            $target .= "?" . $this->query;
        }

        $this->cmd_line = $this->http_method . " " . $target . " " . $this->http_version . "\r\n";
        $this->header = $this->buildHeader();

        if (!$this->writeAll($this->cmd_line . $this->header . $this->post_content)) {
            $this->status = "发送请求failed";
            return false;
        }

        if (!$this->readHeader()) {
            $this->status = "读取响应失败";
            return false;
        }

        if ($this->response_num == 200) {
            $this->handleBody();
        }
        return true;
    }

    /**
     * Assemble the request header block, including the terminating blank line.
     *
     * @return string
     */
    private function buildHeader()
    {
        $eol = "\r\n";
        $header = "";

        if (!empty($this->host)) {
            // A non-default port is part of the Host header value.
            $host = $this->host;
            if ((int) $this->port !== 80) {
                $host .= ":" . $this->port;
            }
            $header .= "Host: " . $host . $eol;
        }
        if (!empty($this->agent)) {
            $header .= "User-Agent: " . $this->agent . $eol;
        }
        if (!empty($this->accept)) {
            $header .= "Accept: " . $this->accept . $eol;
        }
        if (!empty($this->gzip)) {
            // Only advertise gzip when the response can actually be decoded.
            if (function_exists("gzdecode")) {
                $header .= "Accept-encoding: gzip" . $eol;
            } else {
                $this->status = "不支持压缩";
            }
        }
        if (!empty($this->referer)) {
            $header .= "Referer: " . $this->referer . $eol;
        }
        if (!empty($this->accept_language)) {
            $header .= "Accept-Language: " . $this->accept_language . $eol;
        }
        if (!empty($this->cookie)) {
            if (is_array($this->cookie)) {
                $pairs = array();
                foreach ($this->cookie as $key => $val) {
                    $pairs[] = $key . "=" . urlencode($val);
                }
                $header .= "Cookie: " . implode(";", $pairs) . $eol;
            } else {
                $header .= "Cookie: " . $this->cookie . $eol;
            }
        }
        if (!empty($this->submit_type)) {
            $header .= "Content-Type: " . $this->submit_type . $eol;
        }
        if (!empty($this->post_content)) {
            $header .= "Content-length: " . strlen($this->post_content) . $eol;
        }
        if (!empty($this->connection)) {
            $header .= "Connection: " . $this->connection . $eol;
        }

        return $header . $eol;
    }

    /**
     * Write the whole string to the socket, retrying on short writes.
     *
     * @param string $data
     * @return bool
     */
    private function writeAll($data)
    {
        $total = strlen($data);
        $written = 0;
        while ($written < $total) {
            $n = fwrite($this->conn, substr($data, $written));
            if ($n === false || $n === 0) {
                return false;
            }
            $written += $n;
        }
        return true;
    }

    /**
     * Read and interpret the response header lines up to the blank line.
     *
     * Fills response_num, response_header, redirect, is_gzip, is_chunked and
     * response_body_length.
     *
     * @return bool True when a status line was seen and the header block ended.
     */
    private function readHeader()
    {
        $seen_status = false;

        while (($line = fgets($this->conn)) !== false) {
            // A blank line ends the header block.
            if (rtrim($line, "\r\n") === "") {
                return $seen_status;
            }

            // Status line, e.g. "HTTP/1.1 200 OK": keep the numeric code.
            if (preg_match('#^HTTP/\S*\s+(\d{3})#', $line, $status)) {
                $this->response_num = $status[1];
                $seen_status = true;
            }

            if (preg_match('#^(?:Location|URI):\s*(.*)$#i', trim($line), $matches)) {
                $target = $matches[1];
                if (!preg_match('#://#', $target)) {
                    // Relative redirect: complete it with host and port.
                    if (!preg_match('|^/|', $target)) {
                        $target = "/" . $target;
                    }
                    $target = "http://" . $this->host . ":" . $this->port . $target;
                }
                $this->redirect = $target;
            }

            if (preg_match('#^Content-Encoding:\s*gzip#i', $line)) {
                $this->is_gzip = true;
            }

            if (preg_match('#^Transfer-Encoding:.*\bchunked\b#i', $line)) {
                $this->is_chunked = true;
            }

            if (preg_match('#^Content-Length:\s*(\d+)#i', $line, $len)) {
                $this->response_body_length = (int) $len[1];
                $this->has_content_length = true;
            }

            $this->response_header[] = $line;
        }

        return false;
    }

    /**
     * Read the body of a 200 response, decode it, save it to disk and look
     * for the next-page link.
     */
    private function handleBody()
    {
        $body = $this->readBody();

        if ($this->is_gzip) {
            // The payload is gzip-framed, so gzdecode() (not gzinflate()) applies.
            $decoded = function_exists("gzdecode") ? gzdecode($body) : false;
            if ($decoded === false) {
                $this->status = "解压失败";
            } else {
                $body = $decoded;
            }
        }

        $this->response_body = $body;
        $this->response_body_length = strlen($body);

        // Pages are grouped by the first all-digit path segment of the URL,
        // falling back to today's date.
        if (preg_match('#/(\d+)/#', $this->url, $sub_dir)) {
            $dirname = "./download/" . $sub_dir[1];
        } else {
            $dirname = "./download/" . date("Ymd");
        }

        $name = basename($this->path);
        if ($name === "") {
            $name = "index.html";
        }

        if (!is_dir($dirname) && !mkdir($dirname, 0777, true) && !is_dir($dirname)) {
            $this->status = "创建目录失败";
        } elseif (file_put_contents($dirname . "/" . $name, $body) === false) {
            // Written once and not appended, so a re-run replaces the file.
            $this->status = "保存文件失败";
        }

        // Padding so that browsers start rendering the progress output at once.
        echo str_repeat(" ", 2048);
        echo "对链接" . htmlspecialchars($this->url, ENT_QUOTES, "UTF-8") . "发起请求<br />";

        $this->getRollLink($this->response_body);
    }

    /**
     * Read the response body according to the framing the server announced.
     *
     * @return string Raw (still possibly compressed) body bytes.
     */
    private function readBody()
    {
        if ($this->is_chunked) {
            return $this->readChunked();
        }
        if ($this->has_content_length) {
            return $this->readBytes($this->response_body_length);
        }

        // No framing information: read until the server closes or the read times out.
        $body = "";
        while (!feof($this->conn)) {
            $piece = fread($this->conn, 8192);
            if ($piece === false || $piece === "") {
                break;
            }
            $body .= $piece;
        }
        return $body;
    }

    /**
     * Read up to $length bytes, stopping early on EOF or timeout.
     *
     * @param int $length
     * @return string
     */
    private function readBytes($length)
    {
        $data = "";
        while (strlen($data) < $length && !feof($this->conn)) {
            $piece = fread($this->conn, min(8192, $length - strlen($data)));
            if ($piece === false || $piece === "") {
                break;
            }
            $data .= $piece;
        }
        return $data;
    }

    /**
     * Decode a body sent with "Transfer-Encoding: chunked".
     *
     * @return string
     */
    private function readChunked()
    {
        $body = "";

        while (($line = fgets($this->conn)) !== false) {
            // Chunk header is "<hex size>[;extensions]".
            $size_field = trim($line);
            $semicolon = strpos($size_field, ";");
            if ($semicolon !== false) {
                $size_field = substr($size_field, 0, $semicolon);
            }
            if (!preg_match('/^[0-9a-fA-F]+$/', $size_field)) {
                break;
            }

            $size = (int) hexdec($size_field);
            if ($size === 0) {
                // Last chunk: skip optional trailer lines up to the blank line.
                while (($trailer = fgets($this->conn)) !== false
                    && rtrim($trailer, "\r\n") !== "") {
                    continue;
                }
                break;
            }

            $body .= $this->readBytes($size);
            // Each chunk is followed by its own line terminator.
            fgets($this->conn);
        }

        return $body;
    }

    /**
     * Return $body converted to UTF-8.
     *
     * When $encoding has not been set, it is detected from the body and
     * remembered for the following pages.
     *
     * @param string $body
     * @return string
     */
    private function toUtf8($body)
    {
        if (empty($this->encoding)) {
            if (mb_check_encoding($body, "UTF-8")) {
                $detected = "UTF-8";
            } else {
                $detected = mb_detect_encoding(
                    $body,
                    array("GB2312", "GBK", "UTF-8", "BIG5", "LATIN1"),
                    true
                );
            }
            if ($detected === false) {
                // Unknown encoding: use the bytes as they are and retry next time.
                return $body;
            }
            $this->encoding = $detected;
        }

        if (strtoupper($this->encoding) === "UTF-8") {
            return $body;
        }
        return mb_convert_encoding($body, "UTF-8", $this->encoding);
    }

    /**
     * Extract the "下一页" (next page) link from <ul class="image"> and store
     * it in $roll_link; $roll_link is false when there is no such link.
     *
     * @param string $body Page HTML in its source encoding.
     */
    private function getRollLink($body)
    {
        $this->roll_link = false;
        $content = $this->toUtf8($body);

        if (preg_match('#<ul\s+class="image"[^>]*?>.*?</ul>#is', $content, $match)
            && preg_match('#<a\s+href="([^"]+?)">下一页</a>#ui', $match[0], $next)) {
            $this->roll_link = trim($next[1]);
        }
    }

    /**
     * Work out the URL of the next group from the links in <ul class="page">.
     *
     * With two links, the one whose numeric file name is smaller is taken;
     * with one link, that link is taken. Only links of the form "../..." are
     * supported; they are resolved against the first path segment of the
     * current URL.
     *
     * @param string $body Page HTML in its source encoding.
     * @return string|null Absolute URL, or null when none can be determined
     *                     (in which case $status may hold the reason).
     */
    private function getNextGroup($body)
    {
        $content = $this->toUtf8($body);

        if (!preg_match('#<ul\s+class="page"[^>]*?>.*?</ul>#is', $content, $match)) {
            $this->status = "failed to match class=page";
            return null;
        }
        if (!preg_match_all('#<a\s+href="([^"]*?)">.*?</a>#usi', $match[0], $next)) {
            $this->status = "failed to match href";
            return null;
        }

        $links = $next[1];
        if (count($links) == 2) {
            // Paging moves towards the smaller number.
            $first = intval(basename($links[0], ".html"));
            $second = intval(basename($links[1], ".html"));
            $chosen = $first < $second ? $links[0] : $links[1];
        } elseif (count($links) == 1) {
            $chosen = $links[0];
        } else {
            return null;
        }

        if (substr($chosen, 0, 2) != "..") {
            return null;
        }

        // "../x/y.html" is resolved against the first path segment of the current URL.
        $segments = explode("/", $this->path);
        return $this->scheme . "://" . $this->host . "/" . $segments[1] . substr($chosen, 2);
    }
}

ob_implicit_flush(true);
set_time_limit(0);

$url = "http://www.mmkao.com/Beautyleg/201412/7066.html";
$http = new HttpWrap();
$http->cookie = "safedog-flow-item=41E2DBFEF121A8A2835ADB4476E5D3EC";
$http->referer = "www.mmkao.com";
$http->init($url);
?>
声明:该文观点仅代表作者本人,入门客AI创业平台信息发布平台仅提供信息存储空间服务,如有疑问请联系rumenke@qq.com。
- 上一篇: php多字节编码字符长度检测
- 下一篇: php网页采集 修改版