php网页采集 修改版
改进了检查逻辑,出错时会显示错误提示信息,匹配模式也更通用。
<?php
// NOTE(review): the charset announced here must match the encoding this file
// is actually saved in, otherwise the Chinese status messages render as
// mojibake -- confirm before deploying.
header("content-type: text/html; charset=gb2312");

/**
 * Small HTTP/1.1 client over a raw socket that crawls a paginated gallery.
 *
 * For every page it:
 *   1. sends a request and parses the response header,
 *   2. stores the (decoded) body under ./download/<dir>/<basename of url>,
 *   3. looks for the "next page" link inside <ul class="image">,
 *   4. when the next page is the current page, jumps to the next group taken
 *      from <ul class="page">.
 *
 * Crawling stops with die() when no further page can be determined.
 */
class HttpWrap
{
    public $timeout = 10;
    public $status = "";
    public $host;
    public $port = 80;
    private $ip;
    private $conn;
    private $path;
    private $url;
    private $scheme;
    public $http_method = "GET";
    public $http_version = "HTTP/1.1";
    public $agent = "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0";
    public $accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";
    public $gzip = "gzip";
    public $referer;
    public $cookie;
    public $submit_type = "application/x-www-form-urlencoded";
    private $accept_language = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    public $connection = "keep-alive";
    private $cmd_line;
    private $header;
    public $post_content;
    private $redirect;
    private $is_gzip;
    public $response_num;
    public $response_header;
    public $response_body_length = 0;
    public $response_body;
    public $roll_link;
    public $roll_group;
    public $filename;
    public $encoding;
    public $match_status = 0;

    // Query string of the current URL (without "?"), or "" when there is none.
    private $query = "";
    // Host name that $ip was resolved from, so a host change triggers a new lookup.
    private $resolved_host;
    // True when the response declared "Transfer-Encoding: chunked".
    private $is_chunked = false;
    // Content-Length of the current response, or null when the header was absent.
    private $content_length;

    /**
     * Crawl starting at $url and keep following next-page / next-group links.
     *
     * The crawl is iterative: each pass of the loop handles one request, so
     * the call depth stays constant no matter how many pages are visited.
     * The method ends the script with die() when the connection fails or no
     * further page can be found.
     *
     * @param string $url absolute http:// URL of the first page
     */
    public function init($url)
    {
        $max_redirects = 10;
        $redirect_count = 0;
        // Referer configured by the caller; a redirect overrides it for one request only.
        $configured_referer = $this->referer;

        while (true) {
            $this->url = $url;
            $url_pair = parse_url($url);
            if ($url_pair === false || empty($url_pair["host"])) {
                die('<font color="red">没有下一页,原因是</font>:' . "invalid url " . $url);
            }
            $this->host = $url_pair["host"];
            $this->path = isset($url_pair["path"]) ? $url_pair["path"] : "/";
            $this->query = isset($url_pair["query"]) ? $url_pair["query"] : "";
            $this->scheme = isset($url_pair["scheme"]) ? $url_pair["scheme"] : "http";

            if (empty($this->ip) || $this->resolved_host !== $this->host) {
                $this->ip = gethostbyname($this->host);
                $this->resolved_host = $this->host;
            }
            // A port given in the URL sticks for later URLs that omit it
            // (next-group URLs are rebuilt without a port).
            if (!empty($url_pair["port"])) {
                $this->port = $url_pair["port"];
            }

            if (!$this->connect()) {
                die($this->status);
            }
            $this->sendRequest();
            $this->referer = $configured_referer;

            // If the response carried a redirect to the same host, request it.
            if ($this->redirect
                && preg_match("#^http://" . preg_quote($this->host, "#") . "#i", $this->redirect)
            ) {
                if (++$redirect_count > $max_redirects) {
                    $this->status = "重定向次数过多";
                    die($this->status);
                }
                $this->referer = $this->url;
                $url = $this->redirect;
                continue;
            }
            $redirect_count = 0;

            if (!$this->roll_link) {
                die('<font color="red">没有下一页,原因是</font>:' . $this->match_status);
            }

            $next_url = substr($this->url, 0, strripos($this->url, "/") + 1) . $this->roll_link;
            $current_name = strtolower(trim(basename($this->url, ".html")));
            $next_name = strtolower(trim(basename($next_url, ".html")));

            if ($current_name == $next_name) {
                // The "next page" link points at the current page: this group
                // is finished, so move on to the next group.
                $next_group = $this->getNextGroup($this->response_body);
                if (empty($next_group)) {
                    die('<font color="red">没有下一页,原因是</font>:' . $this->status);
                }
                echo '<font color="color">即将采集下一组</font><br />';
                sleep(1);
                $url = $next_group;
            } else {
                $url = $next_url;
            }
        }
    }

    /**
     * Open a TCP connection to $ip:$port, closing any previous socket first.
     *
     * @return bool true on success; on failure $status holds the reason
     */
    private function connect()
    {
        if (is_resource($this->conn)) {
            fclose($this->conn);
        }
        $this->conn = fsockopen($this->ip, $this->port, $errno, $errstr, $this->timeout);
        if ($this->conn) {
            // Bound every later read so a stalled server cannot block forever.
            stream_set_timeout($this->conn, (int) $this->timeout);
            $this->status = "链接成功";
            return true;
        }

        switch ($errno) {
            case -3:
                $this->status = "创建socket链接失败";
                break;
            case -4:
                $this->status = "dns查询失败";
                break;
            case -5:
                $this->status = "链接被拒绝或超时";
                break;
            default:
                $this->status = "创建连接失败";
        }
        return false;
    }

    /**
     * Send one request for the current URL and consume the response.
     *
     * On a 200 response the body is read in full, gzip-decoded when needed,
     * saved to disk and scanned for the next-page link. The socket is closed
     * afterwards; connect() opens a fresh one for every request.
     */
    private function sendRequest()
    {
        $this->resetResponseState();

        if (empty($this->path)) {
            $this->path = "/";
        }
        $target = $this->path;
        if ($this->query !== "") {
            $target .= "?" . $this->query;
        }
        $this->cmd_line = $this->http_method . " " . $target . " " . $this->http_version . "\r\n";
        // Rebuilt from scratch each time so headers never pile up across requests.
        $this->header = $this->buildRequestHeader();

        $request = $this->cmd_line . $this->header . $this->post_content;
        $len = strlen($request);
        if ($len != fwrite($this->conn, $request, $len)) {
            $this->status = "发送请求failed";
            die($this->status);
        }

        $this->readResponseHeader();

        if ($this->response_num == 200) {
            $body = $this->readResponseBody();
            if ($this->is_gzip && $body !== "") {
                // gzip responses carry a gzip header/trailer, which gzdecode()
                // understands and gzinflate() does not.
                $decoded = function_exists("gzdecode") ? gzdecode($body) : false;
                if ($decoded !== false) {
                    $body = $decoded;
                } else {
                    $this->status = "不支持压缩";
                }
            }
            $this->response_body = $body;
            $this->saveResponseBody();

            // Padding so buffering browsers/proxies show the progress line at once.
            echo str_repeat(" ", 2048);
            echo "对链接" . $this->url . "发起请求<br />";
            $this->getRollLink($this->response_body);
        } else {
            $this->match_status = "请求失败,响应状态码: " . $this->response_num;
        }

        fclose($this->conn);
        $this->conn = null;
    }

    /** Clear everything that describes the previous response. */
    private function resetResponseState()
    {
        $this->redirect = null;
        $this->roll_link = null;
        $this->is_gzip = false;
        $this->is_chunked = false;
        $this->content_length = null;
        $this->response_num = null;
        $this->response_header = array();
        $this->response_body_length = 0;
        $this->response_body = "";
    }

    /**
     * Build the request header block, including the terminating blank line.
     *
     * @return string CRLF-separated header lines followed by an empty line
     */
    private function buildRequestHeader()
    {
        $header = "";
        if (!empty($this->host)) {
            $header .= "Host: " . $this->host . "\r\n";
        }
        if (!empty($this->agent)) {
            $header .= "User-Agent: " . $this->agent . "\r\n";
        }
        if (!empty($this->accept)) {
            $header .= "Accept: " . $this->accept . "\r\n";
        }
        if (!empty($this->gzip)) {
            if (function_exists("gzdecode")) {
                $header .= "Accept-encoding: gzip\r\n";
            } else {
                $this->status = "不支持压缩";
            }
        }
        $header .= "Referer: " . (empty($this->referer) ? $this->url : $this->referer) . "\r\n";
        if (!empty($this->accept_language)) {
            $header .= "Accept-Language: " . $this->accept_language . "\r\n";
        }
        if (!empty($this->cookie)) {
            if (!is_array($this->cookie)) {
                $header .= "Cookie: " . $this->cookie . "\r\n";
            } else {
                $pairs = array();
                foreach ($this->cookie as $key => $val) {
                    $pairs[] = $key . "=" . urlencode($val);
                }
                $header .= "Cookie: " . implode("; ", $pairs) . "\r\n";
            }
        }
        if (!empty($this->submit_type)) {
            $header .= "Content-Type: " . $this->submit_type . "\r\n";
        }
        if (!empty($this->post_content)) {
            $header .= "Content-length: " . strlen($this->post_content) . "\r\n";
        }
        if (!empty($this->connection)) {
            $header .= "Connection: " . $this->connection . "\r\n";
        }
        return $header . "\r\n";
    }

    /**
     * Read the response header line by line up to the blank separator line
     * and record status code, redirect target, encodings, length and cookie.
     */
    private function readResponseHeader()
    {
        while (($line = fgets($this->conn, 1024)) !== false) {
            // A blank line ends the header section.
            if (preg_match('/^\r?\n$/', $line)) {
                break;
            }

            // Status line: capture the numeric code (200 means success).
            if (preg_match('#^HTTP/[^\s]*\s(.*?)\s#', $line, $status)) {
                $this->response_num = $status[1];
            }

            // Redirect target. A value without "://" is completed with
            // scheme + host + port; a full URL is used as is.
            if (preg_match('#^(Location:|URI:)\s*(.+)#i', trim($line), $matches)) {
                $target = $matches[2];
                if (!preg_match("#://#", $target)) {
                    $this->redirect = "http://" . $this->host . ":" . $this->port;
                    $this->redirect .= preg_match("|^/|", $target) ? $target : "/" . $target;
                } else {
                    $this->redirect = $target;
                }
            }

            if (preg_match('#^Content-Encoding:\s*gzip#i', $line)) {
                $this->is_gzip = true;
            }
            if (preg_match('#^Transfer-Encoding:.*\bchunked\b#i', $line)) {
                $this->is_chunked = true;
            }
            if (preg_match('#^Content-Length:\s*(\d+)#i', $line, $len)) {
                $this->content_length = (int) $len[1];
                $this->response_body_length = $this->content_length;
            }
            if (preg_match('#^Set-Cookie:#i', $line)) {
                // Limit of 2 keeps ":" characters inside the cookie value.
                // Only the name=value part of the most recent Set-Cookie is kept.
                $items = explode(":", $line, 2);
                $this->cookie = trim(explode(";", $items[1])[0]);
            }

            $this->response_header[] = $line;
        }
    }

    /**
     * Read the complete response body according to how the server framed it.
     *
     * @return string raw body bytes, still compressed if the response was gzip
     */
    private function readResponseBody()
    {
        if ($this->is_chunked) {
            $body = $this->readChunkedBody();
        } elseif ($this->content_length !== null) {
            // Reading exactly Content-Length bytes avoids blocking on a
            // connection the server keeps alive.
            $body = $this->readBytes($this->content_length);
        } else {
            // No framing information: the body ends when the server closes
            // the connection (or the stream timeout fires).
            $body = "";
            while (!feof($this->conn)) {
                $chunk = fread($this->conn, 8192);
                if ($chunk === false || $chunk === "") {
                    break;
                }
                $body .= $chunk;
            }
        }
        $this->response_body_length = strlen($body);
        return $body;
    }

    /**
     * Read up to $count bytes from the socket.
     *
     * @param int $count number of bytes wanted
     * @return string the bytes read; shorter than $count on EOF or timeout
     */
    private function readBytes($count)
    {
        $data = "";
        while (strlen($data) < $count && !feof($this->conn)) {
            $chunk = fread($this->conn, min(8192, $count - strlen($data)));
            if ($chunk === false || $chunk === "") {
                break;
            }
            $data .= $chunk;
        }
        return $data;
    }

    /**
     * Decode a "Transfer-Encoding: chunked" body.
     *
     * @return string the concatenated chunk payloads
     */
    private function readChunkedBody()
    {
        $body = "";
        while (($line = fgets($this->conn, 1024)) !== false) {
            // The size line is "<hex>[;extensions]".
            $size_field = trim($line);
            $semicolon = strpos($size_field, ";");
            if ($semicolon !== false) {
                $size_field = trim(substr($size_field, 0, $semicolon));
            }
            if (!ctype_xdigit($size_field)) {
                break;
            }
            $size = hexdec($size_field);
            if ($size <= 0) {
                // Last chunk: skip optional trailer lines up to the blank line.
                while (($trailer = fgets($this->conn, 1024)) !== false) {
                    if (preg_match('/^\r?\n$/', $trailer)) {
                        break;
                    }
                }
                break;
            }
            $body .= $this->readBytes($size);
            // Every chunk payload is followed by CRLF.
            fgets($this->conn, 1024);
        }
        return $body;
    }

    /**
     * Write the current response body to ./download/<dir>/<basename of url>.
     *
     * <dir> is the first all-digit path segment of the URL, or today's date
     * (Ymd) when the URL has none. An empty body creates no file. The file is
     * overwritten, so re-running the crawl does not duplicate content.
     */
    private function saveResponseBody()
    {
        if ($this->response_body === "") {
            return;
        }
        if (preg_match('#/(\d+)/#', $this->url, $sub_dir)) {
            $dirname = "./download/" . $sub_dir[1];
        } else {
            $dirname = "./download/" . date("Ymd");
        }
        if (!is_dir($dirname)) {
            mkdir($dirname, 0777, true);
        }
        file_put_contents($dirname . "/" . basename($this->url), $this->response_body);
    }

    /**
     * Find the next-page link: the href of the last <a> inside
     * <ul class="image">. Sets $roll_link to that href, or to false with the
     * reason in $match_status.
     *
     * @param string $content page HTML
     * @param string $flag    unused; kept for signature compatibility
     */
    private function getRollLink($content, $flag = "")
    {
        $this->roll_link = false;

        if (!preg_match('#<ul\s+class="image"[^>]*?>.*?</ul>#is', $content, $match)) {
            $this->match_status = "匹配image分页组失败";
            return;
        }
        if (!preg_match_all('#<a\s+[^>]+?>.*?</a>#is', $match[0], $items)) {
            $this->match_status = "匹配<a>组失败";
            return;
        }
        $next_page = $items[0][count($items[0]) - 1];
        if (!preg_match('#<a\s+href="([^"]+)"\s*>#i', $next_page, $link)) {
            $this->match_status = "匹配下一页失败";
            return;
        }
        $this->roll_link = $link[1];
    }

    /**
     * Find the URL of the next group from the links inside <ul class="page">.
     *
     * With two links, the one whose basename has the smaller numeric value is
     * taken; with one link, that link is taken. Only links starting with ".."
     * are accepted.
     *
     * @param string $content page HTML
     * @return string|null absolute URL, or null with the reason in $status
     */
    private function getNextGroup($content)
    {
        if (!preg_match('#<ul\s+class="page"[^>]*?>.*?</ul>#is', $content, $match)) {
            $this->status = "failed to match class=page";
            return null;
        }
        if (!preg_match_all('#<a\s+href="([^"]*?)">.*?</a>#si', $match[0], $next)) {
            $this->status = "failed to match href";
            return null;
        }

        $links = $next[1];
        if (count($links) == 2) {
            $first = basename($links[0], ".html");
            $second = basename($links[1], ".html");
            // Page backwards: the lower-numbered link is the next group.
            $choice = intval($first) < intval($second) ? $first : $second;
            if ($choice !== "") {
                foreach ($links as $item) {
                    if (strripos($item, $choice) !== false && substr($item, 0, 2) == "..") {
                        return $this->groupUrl($item);
                    }
                }
            }
        } elseif (count($links) == 1) {
            // Last group: there is only one neighbouring group to go to.
            if (substr($links[0], 0, 2) == "..") {
                return $this->groupUrl($links[0]);
            }
        }

        $this->status = "failed to resolve next group link";
        return null;
    }

    /**
     * Turn a "../xxx" link into an absolute URL under the first path segment
     * of the current URL.
     *
     * @param string $item href starting with ".."
     * @return string absolute URL
     */
    private function groupUrl($item)
    {
        $link = substr($item, 2);
        $sub_path = explode("/", $this->path);
        $first_segment = isset($sub_path[1]) ? $sub_path[1] : "";
        return $this->scheme . "://" . $this->host . "/" . $first_segment . $link;
    }
}

ob_implicit_flush(true);
set_time_limit(0);
$url = "http://www.mmkao.com/Beautyleg/201412/7066_2.html";
$http = new HttpWrap();
$http->init($url);
?>
声明:该文观点仅代表作者本人,入门客AI创业平台信息发布平台仅提供信息存储空间服务,如有疑问请联系rumenke@qq.com。
- 上一篇: php网页采集 相对高效版
- 下一篇:没有了