获取前端网页的 PHP 爬虫:get_html.php
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>spider</title>
</head>
<body>
<!-- Sends the page address to get_html.php as the "url" query parameter. -->
<form method="get" action="get_html.php">
<label for="url">crawl web html address:</label><input type="url" id="url" name="url" placeholder="http://example.com/" required>
<input type="submit" value="crawl">
</form>
<?php
// "PRC" is only kept by PHP as a backward-compatibility alias; this is the
// canonical identifier for the same zone.
date_default_timezone_set("Asia/Shanghai");
/**
 * Debug helper: prints a var_dump() of the value inside a <pre> element and
 * then terminates the script, using the current date and time as the exit
 * message.
 *
 * @param mixed $var value to inspect
 * @return void does not return; the script ends here
 */
function dump($var){
    echo "<pre>";
    var_dump($var);
    // Close the element; the original opened a second <pre> instead.
    echo "</pre>";
    // date() already defaults to the current time, so no explicit time().
    exit(date("Y-m-d H:i:s"));
}
/**
 * Mirrors one web page into a local folder next to this script.
 *
 * The page is fetched in the constructor. get_css(), get_js() and get_image()
 * download the assets the page references into "<host>/style/css",
 * "<host>/style/js" and "<host>/style/img"; formate_html() writes a copy of
 * the page whose asset links point at those local copies.
 *
 * NOTE(review): the address comes from the user, so this class can be used to
 * make the server request arbitrary hosts (including internal ones). Fetching
 * is limited to http/https here, but access to this script should still be
 * restricted.
 */
class spider{
    /** Permissions applied to created directories and files. */
    const DIR_MODE = 0755;
    const FILE_MODE = 0644;

    /** @var string address of the page being mirrored */
    public $url;
    /** @var string lower-cased scheme of $url ("http" or "https") */
    public $http;
    /** @var string host name of $url (without port) */
    public $host;
    /** @var string HTML of the fetched page */
    public $html;
    /** @var string local directory everything is saved under */
    public $path;
    /** @var string name of the output folder; equal to $host */
    public $title;

    /**
     * Fetches the page and prepares the output directories.
     *
     * Like the original, failures end the script through exit() with a short
     * message, because the page that hosts this class prints that message to
     * the visitor.
     *
     * @param string $url        absolute http(s) address of the page
     * @param string $imagesPath unused; kept so existing callers keep working
     */
    public function __construct($url,$imagesPath=""){
        set_time_limit(60);

        $url = is_string($url) ? trim($url) : '';
        if ($url === '') {
            exit("Please input url!");
        }

        // Validate before using any part of the address. The host becomes a
        // directory name, so it must be a plain dotted name ("..", a leading
        // dot or path separators would escape the script directory).
        $parts = parse_url($url);
        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : '';
        $host = isset($parts['host']) ? $parts['host'] : '';
        $hostIsSafe = preg_match('/\A[\w\-]+(?:\.[\w\-]+)*\z/', $host) === 1;
        if (($scheme !== 'http' && $scheme !== 'https') || !$hostIsSafe) {
            exit("Invalid url: only http(s) addresses are supported.");
        }

        $this->http = $scheme;
        $this->host = $host;
        $this->url = $url;
        $this->title = $host;

        $page = $this->fetch($url);
        if ($page === false || $page === '') {
            exit("could not load html webpage.");
        }
        $this->html = $page;

        $this->path = dirname(__FILE__)."/".$this->title;
        if (!$this->ensure_dir($this->path) || !$this->ensure_dir($this->path."/style")) {
            exit("could not create output directory.");
        }
    }

    /**
     * Downloads every address in the list into the root of the output folder.
     * Addresses that cannot be fetched or have no usable file name are skipped.
     *
     * @param array $url_array absolute addresses to download
     * @return void
     */
    public function get_resource($url_array){
        foreach ($url_array as $url) {
            $name = $this->local_name($url);
            if ($name === null) {
                continue;
            }
            $body = $this->fetch($url);
            if ($body === false) {
                continue;
            }
            $this->save($this->path, $name, $body);
        }
    }

    /**
     * Downloads the gif/png/jpg images referenced by <img src> into style/img.
     *
     * @return void
     */
    public function get_image(){
        $this->download_assets('img');
        echo "<br />get image over.";
    }

    /**
     * Downloads the stylesheets referenced by <link href> into style/css.
     *
     * @return void
     */
    public function get_css(){
        $this->download_assets('css');
        echo "<br />get css over.";
    }

    /**
     * Downloads the scripts referenced by <script src> into style/js.
     *
     * @return void
     */
    public function get_js(){
        $this->download_assets('js');
        echo "<br />get js over.";
    }

    /**
     * Writes "<page name>.html" into the output folder, with every css, js
     * and image address replaced by the path of its local copy.
     *
     * Only the href/src value is replaced, so other attributes of the tag
     * (media, async, alt, ...) are preserved.
     *
     * @return void
     */
    public function formate_html(){
        $html = (string)$this->html;
        foreach (array('css', 'js', 'img') as $type) {
            $html = $this->rewrite_links($html, $type);
        }
        $this->save($this->path, $this->page_file_name().".html", $html);
    }

    /**
     * Fetches one address over http(s).
     *
     * @param string $url absolute address
     * @return string|false response body, or false on a transport error or an
     *                      HTTP status of 400 and above
     */
    private function fetch($url){
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 5);
        // Do not store error pages as if they were the requested asset.
        curl_setopt($ch, CURLOPT_FAILONERROR, true);
        // Never let a crafted link reach file://, gopher:// and the like.
        curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        $body = curl_exec($ch);
        curl_close($ch);
        return $body;
    }

    /**
     * Creates the directory when it is missing.
     *
     * @param string $dir directory path
     * @return bool true when the directory exists afterwards
     */
    private function ensure_dir($dir){
        if (is_dir($dir)) {
            return true;
        }
        // Re-check with is_dir(): another request may have created it meanwhile.
        if (!mkdir($dir, self::DIR_MODE, true) && !is_dir($dir)) {
            return false;
        }
        // mkdir() is subject to the umask, so set the mode explicitly.
        chmod($dir, self::DIR_MODE);
        return true;
    }

    /**
     * Writes content to "$dir/$name", creating $dir when needed.
     *
     * @param string $dir     target directory
     * @param string $name    file name inside $dir
     * @param string $content bytes to write
     * @return bool true on success
     */
    private function save($dir, $name, $content){
        if (!$this->ensure_dir($dir)) {
            return false;
        }
        $file = $dir."/".$name;
        if (file_put_contents($file, $content) === false) {
            return false;
        }
        chmod($file, self::FILE_MODE);
        return true;
    }

    /**
     * Finds the asset addresses of one kind in a piece of HTML.
     *
     * The captured value is the whole attribute, including a query string
     * (scripts and stylesheets often carry one as a cache-buster).
     *
     * @param string $type "css", "js" or "img"
     * @param string $html markup to scan
     * @return array list of array(raw attribute value, byte offset in $html)
     */
    private function find_links($type, $html){
        $patterns = array(
            'css' => '/<link\b[^>]*?\shref\s*=\s*["\']([^"\']+?\.css(?:[?#][^"\']*)?)["\']/i',
            'js'  => '/<script\b[^>]*?\ssrc\s*=\s*["\']([^"\']+?\.js(?:[?#][^"\']*)?)["\']/i',
            'img' => '/<img\b[^>]*?\ssrc\s*=\s*["\']([^"\']+?\.(?:gif|png|jpe?g)(?:[?#][^"\']*)?)["\']/i',
        );
        if (!isset($patterns[$type]) || !is_string($html) || $html === '') {
            return array();
        }
        if (!preg_match_all($patterns[$type], $html, $found, PREG_OFFSET_CAPTURE)) {
            return array();
        }
        return $found[1];
    }

    /**
     * Derives the local file name for an asset address: the last segment of
     * its path, with every character outside [A-Za-z0-9_.-] replaced by "_".
     *
     * @param mixed $link asset address as written in the page
     * @return string|null file name, or null when none can be derived
     */
    private function local_name($link){
        if (!is_string($link)) {
            return null;
        }
        $path = parse_url($link, PHP_URL_PATH);
        if (!is_string($path)) {
            return null;
        }
        $name = preg_replace('/[^\w.\-]/', '_', basename($path));
        if ($name === '' || $name === '.' || $name === '..') {
            return null;
        }
        return $name;
    }

    /**
     * Turns an address found in the page into an absolute http(s) address.
     *
     * @param string $link address as written in the page
     * @return string|null absolute address, or null for empty values and
     *                     non-http schemes such as "data:"
     */
    private function resolve_url($link){
        // Attribute values are HTML-encoded ("&amp;" inside query strings).
        $link = trim(html_entity_decode($link, ENT_QUOTES));
        if ($link === '') {
            return null;
        }
        if (preg_match('#^https?://#i', $link)) {
            return $link;
        }
        // Protocol-relative: must be tested before the single-slash case.
        if (strpos($link, '//') === 0) {
            return $this->http.":".$link;
        }
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $link)) {
            return null;
        }

        $origin = $this->http."://".$this->host;
        $port = parse_url((string)$this->url, PHP_URL_PORT);
        if ($port) {
            $origin .= ":".$port;
        }
        if ($link[0] === '/') {
            return $origin.$link;
        }

        // Document-relative: resolve against the directory of the page.
        $pagePath = parse_url((string)$this->url, PHP_URL_PATH);
        $dir = "/";
        if (is_string($pagePath)) {
            $cut = strrpos($pagePath, '/');
            if ($cut !== false) {
                $dir = substr($pagePath, 0, $cut + 1);
            }
        }
        return $origin.$dir.$link;
    }

    /**
     * Downloads every asset of one kind into "style/<type>", skipping
     * duplicates and anything that cannot be fetched.
     *
     * @param string $type "css", "js" or "img"
     * @return void
     */
    private function download_assets($type){
        $dir = $this->path."/style/".$type;
        $seen = array();
        foreach ($this->find_links($type, $this->html) as $hit) {
            $name = $this->local_name($hit[0]);
            $url = $this->resolve_url($hit[0]);
            if ($name === null || $url === null || isset($seen[$url])) {
                continue;
            }
            $seen[$url] = true;
            $body = $this->fetch($url);
            if ($body === false) {
                continue;
            }
            $this->save($dir, $name, $body);
        }
    }

    /**
     * Replaces the addresses of one asset kind with "./style/<type>/<name>".
     *
     * @param string $html markup to rewrite
     * @param string $type "css", "js" or "img"
     * @return string rewritten markup
     */
    private function rewrite_links($html, $type){
        $hits = $this->find_links($type, $html);
        // Walk backwards so the offsets of earlier matches stay valid.
        for ($i = count($hits) - 1; $i >= 0; $i--) {
            list($raw, $offset) = $hits[$i];
            $name = $this->local_name($raw);
            if ($name === null) {
                continue;
            }
            $html = substr_replace($html, "./style/".$type."/".$name, $offset, strlen($raw));
        }
        return $html;
    }

    /**
     * Name (without extension) of the saved page: the last path segment of
     * the page address with its extension removed, falling back to the host
     * name when the address has no path, as the original did.
     *
     * @return string
     */
    private function page_file_name(){
        $path = parse_url((string)$this->url, PHP_URL_PATH);
        $name = is_string($path) ? basename($path) : '';
        if ($name === '') {
            $name = (string)$this->host;
        }
        $name = preg_replace('/\.\w+$/', '', $name);
        $name = preg_replace('/[^\w.\-]/', '_', $name);
        if ($name === '' || $name === '.' || $name === '..') {
            $name = 'index';
        }
        return $name;
    }
}
/**
 * Mirrors one page: builds a spider for the address, then downloads its
 * stylesheets, scripts and images and finally writes the rewritten HTML.
 *
 * @param string $url address of the page to mirror
 * @return void
 */
function crawl($url){
    $mirror = new spider($url);
    // Same order as before: assets first, rewritten page last.
    foreach (array("get_css", "get_js", "get_image", "formate_html") as $step) {
        $mirror->$step();
    }
}
// Start crawling only when the form supplied a non-empty string; array input
// such as ?url[]=x is ignored instead of being passed on as an address.
if (!empty($_GET["url"]) && is_string($_GET["url"])) {
    crawl($_GET["url"]);
}
?>
</body>
</html>
声明:该文观点仅代表作者本人,入门客AI创业平台信息发布平台仅提供信息存储空间服务,如有疑问请联系 rumenke@qq.com。
- 上一篇: php生成图片的一个小示例
- 下一篇: PHP无限级分类(嵌套集合模型)
