Single-Page Crawler
1. Project Overview
Task: crawl data from a fund website and store it one record per page.
Capabilities: fetch data in various formats, store it in the database as dicts, and export it to a text file as JSON.
In the end it refreshes the roughly 3,500 funds on the whole site within 90 minutes, with about 100 errors.
2. Module Description
List_page.py
Scrapes the fund codes from the listing page; they are later used to build the URL queue (a rough sketch follows).
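List_page.py itself is not pasted below, so here is a minimal sketch of what it does, assuming the fund codes sit in links on a listing page. The listing URL, the "fund-link" class, and the data-code attribute are placeholders rather than the site's real markup; only the shape of fund.json ({"fund": [{"code": ..., "name": ...}, ...]}) matches what Gen() in Crawl.py reads back.

import codecs
import json

import requests
from bs4 import BeautifulSoup


def list_page(list_url):
    # Fetch the listing page; list_url stands in for the redacted address.
    res = requests.get(list_url)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "html.parser")
    funds = []
    # "fund-link" and "data-code" are hypothetical; substitute the real selectors.
    for a in soup.find_all("a", attrs={"class": "fund-link"}):
        funds.append({"code": a.get("data-code"), "name": a.get_text()})
    # Write the queue source that Gen() in Crawl.py loads.
    with codecs.open("fund.json", "w", "utf-8") as f:
        json.dump({"fund": funds}, f, ensure_ascii=False, indent=4)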
Crawl.py
Split into three parts: Gen, Parse, and Clean.
Gen builds the URL queue, Parse parses a single page, and Clean does some light data cleaning.
- Libraries:
requests + bs4 + pymongo for the main crawling and storage
codecs + json for the text output
threading + datetime for controlling the multithreaded run and for progress feedback
- Components:
Gen: implemented as a generator function, which saves memory and keeps the structure clearer.
Parse: after studying the page layout, the parsing is split into one section per page block; this keeps the logic clearer and easier to debug, and also lets the sections run in parallel.
Clean: there is not much processing in this project; it only splits out the fields in the raw page data that need splitting.
3. Summary
Through this project I have gained a fairly complete picture of data collection; the various detail techniques were all picked up along the way.
I tried quite a few new things in this project, such as the error reporting and the progress feedback. These optional pieces make debugging easier and also make the data more reliable.
To keep the program flexible, I did not use a crawler framework and wrote it with general-purpose libraries as far as possible, so it can be adapted by hand to different pages.
There are still some issues with the multithreading control: why does the thread list have to be re-initialized for every single-page parse? I tried moving the list outside the loop, but the thread control then raises an error (see the sketch below).
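The likely reason is that a threading.Thread object can only be started once; starting it a second time raises "RuntimeError: threads can only be started once", so the Thread objects, and therefore the list holding them, have to be rebuilt for every page. A minimal sketch of the two alternatives, using a placeholder parse function instead of Part01/Part02/Part03:

import threading
from concurrent.futures import ThreadPoolExecutor


def parse_section():
    pass  # stand-in for Part01 / Part02 / Part03


# Option 1: what the crawler does now - fresh Thread objects for every page.
for _page in range(3):
    threads = [threading.Thread(target=parse_section) for _ in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # Reusing the same Thread objects in the next iteration would raise the error above.

# Option 2: one pool created outside the loop and reused for every page.
with ThreadPoolExecutor(max_workers=3) as pool:
    for _page in range(3):
        futures = [pool.submit(parse_section) for _ in range(3)]
        for f in futures:
            f.result()  # wait for all three sections, like join()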
- Notes on data storage structures:
Depending on the need and on the characteristics of the data itself, I settled on a few simple storage patterns. What ultimately determines the storage format is, of course, the structure of dict and JSON themselves.
1. dict: {'attribute' : ...}
2. list: [a, b, c...]
For tables, different needs call for different layouts:
1. dict: {date : {f(d), g(d)}, d : {}, d : {}...}
2. list: [{date, f(d), g(d)}, {}, {}...]
3. multi-list: {date : [d1, d2, d3...], f : [f1, f2, f3...], g : [g1, g2, g3...]}
A dict makes single-row lookups in the table easy, a list keeps each record intact, and a multi-list is convenient for visualizing the data; a small illustration follows.
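As a concrete illustration, here is the same two-row net-value table in the three layouts. The dates and numbers are made up; the column names mirror the history dict that Part02 builds (which itself uses the multi-list layout).

# 1. dict keyed by date: convenient for looking up a single row.
as_dict = {
    "2017-05-02": {"per_million": "0.61", "7day": "2.3%"},
    "2017-05-03": {"per_million": "0.60", "7day": "2.2%"},
}
print(as_dict["2017-05-02"]["7day"])  # -> 2.3%

# 2. list of records: each row stays intact, easy to dump as JSON lines.
as_list = [
    {"date": "2017-05-02", "per_million": "0.61", "7day": "2.3%"},
    {"date": "2017-05-03", "per_million": "0.60", "7day": "2.2%"},
]

# 3. multi-list (column-wise): one list per column, handy for plotting.
as_multi_list = {
    "date": ["2017-05-02", "2017-05-03"],
    "per_million": ["0.61", "0.60"],
    "7day": ["2.3%", "2.2%"],
}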
I would rather not paste this long, ugly code, but I have not yet found a better way to manage it.
The URLs in the code block have been blanked out to protect the site.
import requests
from bs4 import BeautifulSoup
import pymongo
import codecs
import json
import datetime
import threading

def Gen():
    # URL queue generator: reads the fund list from fund.json and yields one page URL
    # per fund, printing progress every 100 funds.
    global dataframe
    global home
    tit = 0
    file = codecs.open("fund.json", "r", "utf-8")
    tmp = json.load(file)
    file.close()
    datagram = tmp.get("fund")
    for dataframe in datagram:
        home = {}
        code = dataframe.get("code")
        tit += 1
        if tit % 100 == 0:
            now = datetime.datetime.now()
            print(tit, " / ", len(datagram))
            print(now.strftime("%Y-%m-%d %H:%M:%S"))
            print(str(tit * 100 / len(datagram)) + "%")
            # input()
        url = ...  # page URL redacted by the author
        yield url

def Getsoup(url):
    # Fetch a page and return its parsed soup.
    res = requests.get(url)
    res.encoding = "utf-8"
    html = res.text
    soup = BeautifulSoup(html, "html.parser")
    return soup

def Part01():
    # 01---infoOfItem---
    global soup
    global home
    global eros
    item = {}
    ks = []
    vs = []
    flag = False  # set to True once either data table has been parsed
    try:
        head = soup.find(attrs={"class": "fundInfoItem"})
    except:
        print("Error 01: at", dataframe.get("code"))
        return
    # 01.1-infoOfFund
    info = {}
    info_keys = ["type", "scale", "manager", "est_date", "damin"]
    try:
        div = head.find(attrs={"class": "infoOfFund"})
        table = div.find_all("td")
        for i in range(5):
            td = table[i]
            tmp = str(td.get_text())
            p = tmp.find(":")
            tmp = tmp[p + 1:].replace("\xa0", "")
            info[info_keys[i]] = tmp
        ks.append("info")
        vs.append(info)
    except:
        print("Error 01.1: at " + dataframe.get("code"))
    # 01.2-dataOfFund
    data = {}
    data_keys = ["estimation", "1month", "1year", "unit_net", "3month",
                 "3year", "accum_net", "6month", "since_est"]
    try:
        div = head.find(attrs={"class": "dataOfFund"})
        table = div.find_all("dd")
        for i in range(9):
            dd = table[i]
            tmp = str(dd.get_text())
            p = tmp.find(":")
            tmp = tmp[p + 1:].replace("\xa0", "")
            data[data_keys[i]] = tmp
        ks.append("data")
        vs.append(data)
        flag = True
    except:
        pass
    # 01.3-dataOfFund_hb (alternative table layout, e.g. money-market funds)
    if flag == False:
        data = {}
        hb_keys = ["per_million", "7day", "14day", "28day", "1month",
                   "1year", "3month", "3year", "6month", "since_est"]
        try:
            div = head
            table = div.find_all("dd")
            for i in range(10):
                dd = table[i]
                tmp = str(dd.get_text())
                p = tmp.find(":")
                tmp = tmp[p + 1:].replace("\xa0", "")
                data[hb_keys[i]] = tmp
            ks.append("data")
            vs.append(data)
            flag = True
        except:
            pass
    if flag == False:
        eros += 1
        print("Error 01.2/3: at " + dataframe.get("code"))
    # 01------
    for i in range(len(ks)):
        item[ks[i]] = vs[i]
    home["item"] = item

def Part02():
    # 02---historyReturnRate---
    global soup
    global home
    global eros
    history = {}
    flag = False
    # 02.1-Monetary funds: date / per-million return / 7-day annualized
    try:
        head = soup.find(attrs={"id": "historyReturnRate"})
        table = head.find_all("tr")
        date = []
        per_million = []
        seven_day = []
        for i in range(len(table)):
            if i == 0:
                continue  # skip the header row
            tr = table[i]
            date.append(tr.find_all("td")[0].get_text())
            per_million.append(tr.find_all("td")[1].get_text())
            seven_day.append(tr.find_all("td")[2].get_text())
        history["date"] = date
        history["per_million"] = per_million
        history["7day"] = seven_day
        home["history"] = history
        flag = True
    except:
        pass
    if flag == True:
        return
    # 02.2-Stock funds: date / unit net value / accumulated net value / rate
    try:
        head = soup.find(attrs={"id": "Div2"})
        table = head.find("table")
        table = table.find_all("tr")
        date = []
        unit_net = []
        accum_net = []
        rate = []
        for i in range(len(table)):
            if i == 0:
                continue  # skip the header row
            tr = table[i]
            date.append(tr.find_all("td")[0].get_text())
            unit_net.append(tr.find_all("td")[1].get_text())
            accum_net.append(tr.find_all("td")[2].get_text())
            rate.append(tr.find_all("td")[3].span.get_text())
        history["date"] = date
        history["unit_net"] = unit_net
        history["accum_net"] = accum_net
        history["rate"] = rate
        home["history"] = history
        flag = True
    except:
        pass
    if flag == False:
        eros += 1
        print("Error 02: at " + dataframe.get("code"))
    # 02------

def Part03():
    # 03---IncreaseAmount---
    global soup
    global home
    global eros
    increase = []
    period = []
    inc = []
    avg = []
    hs300 = []
    rank = []
    f = False
    try:
        head = soup.find(attrs={"class": "IncreaseAmount"})
        table = head.find_all("tr")
    except:
        f = True
    if f:
        return
    try:
        for i in range(5):
            tr = table[i]
            if i == 0:
                cols = tr.find_all("th")
                for th in cols[1:]:
                    period.append(th.get_text())
            else:
                cols = tr.find_all("td")
                for td in cols[1:]:
                    if i == 1:
                        inc.append(td.get_text())
                    if i == 2:
                        avg.append(td.get_text())
                    if i == 3:
                        hs300.append(td.get_text())
                    if i == 4:
                        rank.append(td.get_text())
        # Regroup the columns row-wise: one record per period.
        for i in range(len(period)):
            tmp = {}
            tmp["period"] = period[i]
            tmp["inc"] = inc[i]
            tmp["avg"] = avg[i]
            tmp["hs300"] = hs300[i]
            tmp["rank"] = rank[i]
            increase.append(tmp)
        home["increase"] = increase
    except:
        eros += 1
        print("Error 03: at " + dataframe.get("code"))
    # 03------

def Parse():
    global home
    global col
    global partision
    # 00***fundInfoItem***
    home["fund"] = {"name": dataframe.get("name"), "code": dataframe.get("code")}
    # Run the three section parsers in parallel and wait for all of them.
    for thread in partision:
        thread.daemon = True
        thread.start()
    for thread in partision:
        thread.join()

def Clean():
    # Split the composite "type" and "scale" fields taken from the raw page text.
    tmp = str(home.get("item").get("info").get("type"))
    p = tmp.find("|")
    a = tmp[:p]
    b = tmp[p + 1:]
    home["item"]["info"]["type"] = {"a": a, "b": b}
    tmp = str(home.get("item").get("info").get("scale"))
    p = tmp.find("(")
    num = tmp[:p]
    date = tmp[p + 1:-1]
    home["item"]["info"]["scale"] = {"num": num, "date": date}

file_w = codecs.open("...", "w", "utf-8")  # output path redacted
eros = 0
# col = pymongo.MongoClient("localhost", 27017).Easy2.Easy2
for url in Gen():
    # Caution: the thread list has to be rebuilt inside the loop; moving it outside
    # fails because a Thread object can only be started once.
    partision = [
        threading.Thread(target=Part01),
        threading.Thread(target=Part02),
        threading.Thread(target=Part03),
    ]
    soup = Getsoup(url)
    Parse()
    Clean()
    # col.insert(dict(home))
    file_w.write(json.dumps(home, ensure_ascii=False, indent=4) + "\n")
print("Errors in tot: %d" % eros)
file_w.close()