php爬虫教程(四)抓取数据并进行处理
经过链接的分析,数据的分析,再加上规则的验证。
我们很容易地就 get 到了打算抓取的数据,
so,我们就可以做我们想做的事情了。例如:
<?php
/**
 * Semi-automatic Qzone comment statistics script.
 *
 * Fetches several pages of a Qzone feed, collects every user that appears in
 * the comment lists, looks up each user's profile once (sex, age, birthday),
 * and prints the users ordered by how often they commented, followed by a
 * short summary line.
 *
 * Relies on the project-local `client` class from client.php; only its
 * setHeader($name, $value) and get($url) methods are used here.
 */
header("Content-type:text/html;charset=utf8");
set_time_limit(0);
require("client.php");

/**
 * Strips the fixed-size wrapper around a response body and decodes the JSON inside.
 *
 * The first 10 and the last 2 bytes are discarded before decoding.
 * NOTE(review): presumably this removes a "_Callback(" ... ");" JSONP wrapper -
 * confirm against a live response.
 *
 * @param mixed $raw Response body as returned by $client->get().
 * @return array|null Decoded payload, or null when the body is not a string,
 *                    is too short, or does not contain a JSON object/array.
 */
function decode_jsonp_body($raw)
{
    if (!is_string($raw)) {
        return null;
    }

    $json = substr(substr($raw, 10), 0, -2);
    if (!is_string($json) || $json === "") {
        return null;
    }

    $decoded = json_decode($json, true);

    return is_array($decoded) ? $decoded : null;
}

/**
 * Escapes a remote-supplied value for output in the text/html response.
 *
 * @param mixed $value Scalar value taken from a decoded response.
 * @return string HTML-safe text.
 */
function html_text($value)
{
    return htmlspecialchars((string) $value, ENT_QUOTES, "UTF-8");
}

$client = new client();

// NOTE(review): these are hard-coded session cookies copied from a browser.
// They are credentials and should be loaded from configuration or the
// environment rather than committed with the script.
$listCookie = "pt2gguin=o0056707892; RK=MBl/Y/W2em; ptcz=3c94d72206e5c146a03701b2cd5baa2dbf898ced78a80ca14afcb1c4347815d3; pgv_pvid=9725655970; g_ut=2; 3g_guest_id=-9042816631926882304; o_cookie=56707892; pgv_pvi=1429736448; eas_sid=K1S4H5o7F6b68265o2T8t240H5; luin=o0056707892; lskey=00010000d8b324c3df16b631120077e9d27f35b7d564ebc529087b9dcbc2f7556d9126fe81efd33c2d046cfd; pgv_si=s9506151424; pgv_info=pgvReferrer=&ssid=s6703251255; ptisp=ctc; ptui_loginuin=; uin=; skey=@5ZzsPWzRc; verifysession=h01a106acab1cddfbb02999f5bd471c902ebe5ab556be3b40de657fe21ffea2f01c24e692c37c2bd63c; IED_LOG_INFO=uin*|nick*%u7B11%u7740%u770B%u4F60%u54ED%20|time*1461910804; qzone_check=56707892_1461913345; rv2=802C9F7A654B37CD767C9691A7A5A7BF7F09CAB51D6341AA0B; property20=41424F4482BCD05C0A25B282DF8B360B38C86FEB7860B26C51C256022F9C1879FF87187E60572F65; qqmusic_uin=0056707892; qqmusic_key=@5ZzsPWzRc; qqmusic_fromtag=6";
$profileCookie = "randomSeed=824410; QZ_FE_WEBP_SUPPORT=0; cpu_performance_v8=31; pt2gguin=; RK=MBl/Y/W2em; ptcz=3c94d72206e5c146a03701b2cd5baa2dbf898ced78a80ca14afcb1c4347815d3; pgv_pvid=9725655970; g_ut=2; 3g_guest_id=-9042816631926882304; o_cookie=; pgv_pvi=1429736448; eas_sid=K1S4H5o7F6b68265o2T8t240H5; luin=; lskey=00010000d8b324c3df16b631120077e9d27f35b7d564ebc529087b9dcbc2f7556d9126fe81efd33c2d046cfd; pgv_si=s9506151424; pgv_info=pgvReferrer=&ssid=s6703251255; ptisp=ctc; ptui_loginuin=675365043; uin=; skey=@5ZzsPWzRc; verifysession=h01a106acab1cddfbb02999f5bd471c902ebe5ab556be3b40de657fe21ffea2f01c24e692c37c2bd63c; IED_LOG_INFO=uin*675365043|nick*%u7B11%u7740%u770B%u4F60%u54ED%20|time*1461910804; zzpaneluin=; zzpanelkey=; p_skey=bAQZCU78gH4Qy0BSWeZ5pOsOdoKEnmVDRCdEi2HTIUY_; pt4_token=MNU3KRdqZCn9wQhASxnjt2lE*Ikt29Yf-6r8jHUPFMw_; p_uin=; qzone_check=56707892_1461913345; rv2=802C9F7A654B37CD767C9691A7A5A7BF7F09CAB51D6341AA0B; property20=41424F4482BCD05C0A25B282DF8B360B38C86FEB7860B26C51C256022F9C1879FF87187E60572F65; qqmusic_uin=; qqmusic_key=@5ZzsPWzRc; qqmusic_fromtag=6; __Q_w_s_hat_seed=1";

$pageCount = 5;   // number of feed pages to walk
$pageSize = 20;   // entries requested per page (the "num" query parameter)

// Commenters keyed by account number (uin) so that two different users who
// share a nickname are not merged into one record.
$commenters = array();

for ($page = 0; $page < $pageCount; $page++) {
    echo "page:", $page, " ";

    // The profile lookup below replaces the Cookie header, so the feed
    // cookie has to be restored at the start of every page.
    $client->setHeader("Cookie", $listCookie);
    $client->setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    $client->setHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");

    // "pos" advances with the page index; a fixed offset would fetch the same
    // page on every iteration and count each comment $pageCount times.
    $listUrl = "https://h5.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6?uin=1312024342&inCharset=utf-8&outCharset=utf-8&hostUin=1312024342&notice=0&sort=0&pos=" . ($page * $pageSize) . "&num=" . $pageSize . "&cgi_host=http%3A%2F%2Ftaotao.qq.com%2Fcgi-bin%2Femotion_cgi_msglist_v6&code_version=1&format=jsonp&need_private_comment=1&g_tk=978158941";

    $feed = decode_jsonp_body($client->get($listUrl));
    if ($feed === null || empty($feed["msglist"]) || !is_array($feed["msglist"])) {
        // Nothing usable on this page (request failed, session expired, or no entries).
        continue;
    }

    foreach ($feed["msglist"] as $message) {
        if (empty($message["commentlist"]) || !is_array($message["commentlist"])) {
            continue;
        }

        foreach ($message["commentlist"] as $comment) {
            if (!isset($comment["uin"], $comment["name"])) {
                continue;
            }

            $uin = (string) $comment["uin"];

            if (isset($commenters[$uin])) {
                // Already known: count the comment, skip the profile request.
                $commenters[$uin]["num"]++;
                continue;
            }

            // First time this user is seen: fetch the profile once.
            $client->setHeader("Cookie", $profileCookie);
            $profileUrl = "http://base.s11.qzone.qq.com/cgi-bin/user/cgi_userinfo_get_all?uin=" . rawurlencode($uin) . "&vuin=56707892&fupdate=1&rd=0.3045121533378856&g_tk=1845089435";
            $profile = decode_jsonp_body($client->get($profileUrl));
            $data = (isset($profile["data"]) && is_array($profile["data"])) ? $profile["data"] : array();

            $commenters[$uin] = array(
                "name"     => $comment["name"],
                "qq"       => $uin,
                "num"      => 1,
                "sex"      => isset($data["sex"]) ? $data["sex"] : "",
                "age"      => isset($data["age"]) ? $data["age"] : "",
                "birthday" => isset($data["birthday"]) ? $data["birthday"] : "",
            );
        }
    }

    // Pause between pages to avoid hammering the remote service.
    sleep(1);
}

if (empty($commenters)) {
    die;
}

// Most active commenters first.
$ranked = array_values($commenters);
usort($ranked, function ($left, $right) {
    return $right["num"] - $left["num"];
});

$femaleCount = 0;
$otherCount = 0;
$topCount = 0;

foreach ($ranked as $row) {
    echo "昵称:", html_text($row["name"]),
        "账号:", html_text($row["qq"]),
        "访问次数:", $row["num"],
        "性别", html_text($row["sex"]),
        "年龄", html_text($row["age"]),
        "生日", html_text($row["birthday"]), " ";

    // A "sex" value of 2 is counted as female; everything else as "other".
    if ($row["sex"] == 2) {
        $femaleCount++;
    } else {
        $otherCount++;
    }

    if ($row["num"] > $topCount) {
        $topCount = $row["num"];
    }
}

echo "共有妹子:{$femaleCount} 人,其他:{$otherCount} 人,最高访问次数:{$topCount}";
这是我之前写过的一个脚本,用来抓取某个QQ好友空间里所有点过赞、评过论的用户,也就是他的QQ好友 :)
并且进行数据的整理和分析,找出一些好玩的数据,
例如:共有妹子 $num 人,其他 $num2 人,最高访问次数 $num3。
这个脚本是半自动的,需要手动写入cookie来保持登录状态。
想写一个全自动的来着,实在是搞不懂tx的加密规则就放弃了(破涕为笑)
总结:至此,恭喜你已经学会抓取数据了,但是人的创造力是无限的。
声明:该文观点仅代表作者本人,入门客AI创业平台信息发布平台仅提供信息存储空间服务,如有疑问请联系rumenke@qq.com。
- 上一篇: php比较两个二维数组是否相同,多维数组
- 下一篇: php爬虫教程(一) 简单的页面抓取