import requests
from multiprocessing import Process,Queue
# Number of worker processes that main() starts.
_WORKER_PROCESS_NUM = 5
# URL template for the pages to download; "{}" is filled in with the page
# number by get_page().
_url = "http://axe-level-1.herokuapp.com/lv2/?page={}"
def get_page(page_num, timeout=30):
    """Download one page and save its HTML as ``<page_num>.html``.

    Args:
        page_num: Value substituted into the ``_url`` template; it is also
            used as the base name of the output file, which is written to
            the current working directory.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely, which would
            also stall any process waiting on this download.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        OSError: If the output file cannot be written.
    """
    resp = requests.get(_url.format(page_num), timeout=timeout)
    # The context manager closes the file even if write() raises, and the
    # explicit UTF-8 encoding keeps non-ASCII page text from failing (or
    # being written differently) depending on the platform default.
    with open("{}.html".format(page_num), "w", encoding="utf-8") as fileout:
        fileout.write(resp.text)
def worker(queue):
    """Take page numbers from ``queue`` and download each with get_page().

    The function returns once no item can be fetched from the queue within
    a short timeout.

    Args:
        queue: Queue of page numbers (anything with a ``get(timeout=...)``
            method that raises ``queue.Empty``, such as
            ``multiprocessing.Queue`` or ``queue.Queue``).
    """
    # Local import: the parameter name shadows the stdlib ``queue`` module
    # inside this function, so only the exception class is pulled in.
    from queue import Empty

    # Short wait so an item still in transit to the queue is not missed.
    poll_timeout = 0.1

    while True:
        try:
            # A single timed get() replaces the "check empty(), then get()"
            # pattern: with several workers, another one could take the last
            # item between the two calls and leave this get() blocked
            # forever, so join() in the parent would never return.
            page_num = queue.get(timeout=poll_timeout)
        except Empty:
            return
        get_page(page_num)
def main():
    """Download pages 1 through 12 using a pool of worker processes.

    The page numbers are placed on a shared queue, then
    ``_WORKER_PROCESS_NUM`` daemon processes running ``worker`` are started
    and waited for.
    """
    # Multi-processing
    tasks = Queue()
    first_page, last_page = 1, 12
    page_num = first_page
    while page_num <= last_page:
        tasks.put(page_num)
        page_num += 1

    workers = []
    for _ in range(_WORKER_PROCESS_NUM):
        proc = Process(target=worker, args=(tasks,), daemon=True)
        proc.start()
        workers.append(proc)

    # Block until every worker process has finished.
    for proc in workers:
        proc.join()
# Timed run with multi-processing enabled
import time

# Entry-point guard: under the "spawn" start method each child process
# re-imports this module, so starting processes at import time would recurse
# or raise RuntimeError.
if __name__ == "__main__":
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    t_start = time.perf_counter()
    main()
    t_stop = time.perf_counter()
    # Elapsed wall-clock time in seconds.
    print(t_stop - t_start)
# Output (elapsed seconds): 1.0364434719085693
# Timed run, serial (one page after another in this process)
import time

# Entry-point guard: keeps the downloads from being repeated whenever this
# module is imported, including the re-import done by child processes under
# the "spawn" start method.
if __name__ == "__main__":
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    t_start = time.perf_counter()
    for i in range(1, 12 + 1):
        get_page(i)
    t_stop = time.perf_counter()
    # Elapsed wall-clock time in seconds.
    print(t_stop - t_start)
# Output (elapsed seconds): 3.6738579273223877
重點:使用 Chrome 的「開發人員工具」觀察 Network
=> 先透過瀏覽器的「檢視原始碼」或 Quick Javascript Switcher,確認內容是否由 JS 產生;
若確定是 JS 產生,就觀察 Network,找出能拿到資料的網址。
主要以lxml,搭配XPath可以做很詳細的語法選擇。
遇到真的很棘手的情況,會搭配正規表示式(re),或直接把整區的純文字存下來,再用 split 去切。
能不以「位於第幾個位置」(位置索引)來選取元素,就盡量不要用。
不建議(依賴位置)://*[@id="PaymentContainer"]/div[2]/ul/li[1]/span/span
建議(依賴屬性)://div[@id="PaymentContainer"]//span[@class='price']/span[@class='value' and not(@id)]

