phpspider 简单使用
phpspider 是一款优秀的、使用 PHP 开发的蜘蛛爬虫框架。
官方下载地址:https://github.com/owner888/phpspider
官方开发手册:https://doc.phpspider.org/
关于这个爬虫的使用:下载下来之后,里面有写好的实例。我在这里只是说一下我遇到的一个坑。
// Demo crawler script for phpspider (qiushibaike.com example).
// Place this code in a PHP file (after the PHP opening tag) and run it from
// the command line.
//
// Setup for a copy of phpspider downloaded from GitHub: load the autoloader
// that sits one directory above this script.
require_once __DIR__ . "/../autoloader.php";
use phpspider\core\phpspider;

/* Do NOT delete this comment */
/* (The notice above was originally repeated here in Chinese.) */

// Crawler configuration passed to the phpspider constructor below.
$configs = array(
    "name" => "糗事百科",
    "log_show" => true,
    "tasknum" => 1,
    //"save_running_state" => true,

    "domains" => array(
        "qiushibaike.com",
        "www.qiushibaike.com"
    ),

    // NOTE(review): the site is reported to have moved to HTTPS. If the crawl
    // fetches nothing, change the http:// scheme here and in the two regex
    // lists below to https://.
    "scan_urls" => array(
        "http://www.qiushibaike.com/"
    ),

    // Regex patterns are single-quoted so the backslashes reach the regex
    // engine untouched: \d+ matches digits and \? matches a literal "?".
    "list_url_regexes" => array(
        'http://www.qiushibaike.com/8hr/page/\d+\?s=\d+'
    ),
    "content_url_regexes" => array(
        'http://www.qiushibaike.com/article/\d+',
    ),

    "max_try" => 5,

    // Optional proxy list (disabled).
    //"proxies" => array(
    //"http://H784U84R444YABQD:57A8B0B743F9B4D2@proxy.abuyun.com:9010"
    //),

    // Active export target: a CSV file. The alternatives below are disabled.
    "export" => array(
        "type" => "csv",
        "file" => "../data/qiushibaike.csv",
    ),
    //"export" => array(
    //"type" => "sql",
    //"file" => "../data/qiushibaike.sql",
    //"table" => "content",
    //),
    // "export" => array(
    // "type" => "db",
    // "table" => "content",
    // ),

    // Database connection settings (demo credentials).
    "db_config" => array(
        "host" => "127.0.0.1",
        "port" => 3306,
        "user" => "root",
        "pass" => "123456",
        "name" => "spider",
    ),

    // Optional queue settings (disabled).
    //"queue_config" => array(
    //"host" => "127.0.0.1",
    //"port" => 6379,
    //"pass" => "",
    //"db" => 5,
    //"prefix" => "phpspider",
    //"timeout" => 30,
    //),

    // Fields to extract from each content page. The selectors are XPath
    // expressions; they are single-quoted in PHP so the double quotes inside
    // the XPath need no escaping.
    "fields" => array(
        array(
            "name" => "article_title",
            "selector" => '//*[@id="single-next-link"]//div[contains(@class,"content")]/text()[1]',
            "required" => true,
        ),
        array(
            "name" => "article_author",
            "selector" => '//div[contains(@class,"author")]//h2',
            "required" => true,
        ),
        array(
            "name" => "article_headimg",
            "selector" => '//div[contains(@class,"author")]//a[1]',
            "required" => true,
        ),
        array(
            "name" => "article_content",
            "selector" => '//*[@id="single-next-link"]//div[contains(@class,"content")]',
            "required" => true,
        ),
        array(
            "name" => "article_publish_time",
            "selector" => '//div[contains(@class,"author")]//h2',
            "required" => true,
        ),
        array(
            "name" => "url",
            // Placeholder selector; per the original note, the value is
            // replaced inside the on_extract_field callback.
            "selector" => '//div[contains(@class,"author")]//h2',
            "required" => true,
        ),
    ),
);

// Build the spider from the configuration and start crawling.
$spider = new phpspider($configs);
$spider->start();
这是官网文档中的一个实例。使用说明中提到,爬虫文件只能通过命令行模式运行。
结果我运行了一下 demo,并没有成功。
之后发现糗事百科已经换成了 HTTPS 协议,但是代码中的 URL 还是 http 的。
我抱着试试的想法,把它们改成了 https。
果然成功了,爬虫已经可以正常运行了。
具体的参数详情还是得看官网的文档。
声明:该文观点仅代表作者本人,入门客AI创业平台信息发布平台仅提供信息存储空间服务,如有疑问请联系rumenke@qq.com。
- 上一篇:没有了
- 下一篇:没有了
