原理
通过 Python 的 Selenium 模块，配合 PhantomJS 这个无界面（headless）浏览器，模拟用户请求网页；等待页面数据加载完成后，在页面中执行 JavaScript，从 HTML 中提取数据。
代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
|
from selenium import webdriver import sys,time,json
def loadHistoricPrices():
    """Load the NYSE quote page for IBM in PhantomJS and scrape its price table.

    The page is opened in a PhantomJS-backed WebDriver, given a fixed
    10 seconds to finish loading, and then queried with a jQuery snippet
    executed inside the page. A screenshot named after the current local
    time (``YYYYmmddHHMMSS.png``) is saved to the working directory.

    Returns:
        The value returned by the in-page script: a JSON string encoding a
        list of objects, one per ``.flex_tr`` row, keyed by 'Time', 'Open',
        'High', 'Low', 'Close' and 'Volume'. Callers must decode it (for
        example with ``json.loads``) to work with rows.

    Raises:
        Whatever the WebDriver raises when the browser cannot be started,
        the page cannot be loaded, or the script fails (for instance if the
        page does not provide ``$``). The browser is shut down in every case.
    """
    driver = webdriver.PhantomJS(executable_path='phantomjs')
    try:
        # Window size is controlled through the WebDriver API; assigning a
        # `viewportSize` attribute on the Python driver object has no effect,
        # so only maximize_window() is used here.
        driver.maximize_window()
        driver.get('https://www.nyse.com/quote/XXXX:IBM')

        # Fixed pause so the page's asynchronous requests can populate the table.
        time.sleep(10)

        # The script text is split across adjacent literals for readability;
        # the concatenated string is what gets sent to the browser.
        js = (
            " var classes = ['Time', 'Open', 'High', 'Low', 'Close', 'Volume'];"
            " var data = [];"
            " $('.data-table-container').find('.flex_tr').each(function () {"
            " var cell = {};"
            " for (var i = 0; i < classes.length; i ++) {"
            " cell[classes[i]] = $(this).find('.' + classes[i]).find('div').html();"
            " }"
            " data.push(cell);"
            " });"
            " return JSON.stringify(data); "
        )
        data = driver.execute_script(js)

        currentTime = time.strftime("%Y%m%d%H%M%S", time.localtime())
        driver.save_screenshot("%s.png" % currentTime)
        return data
    finally:
        # Always terminate the PhantomJS process, even when loading or
        # scraping fails, so repeated attempts do not leak browsers.
        driver.quit()
# Keep requesting the page until the scraped table has enough rows, then
# report the rows, how many were found, and how long the whole run took.

# Number of table rows that counts as a complete load.
MIN_ROWS = 100
# Upper bound on retries so a changed page layout (or a table that never
# reaches MIN_ROWS) cannot keep spawning browsers forever.
MAX_ATTEMPTS = 10

attemptTimes = 0
start = time.time()
data = []

while len(data) < MIN_ROWS and attemptTimes < MAX_ATTEMPTS:
    attemptTimes += 1
    print("The %s time to request data......" % attemptTimes)
    sys.stdout.flush()
    # loadHistoricPrices() returns a JSON string; decode it so that
    # len(data) counts rows rather than characters.
    raw = loadHistoricPrices()
    data = json.loads(raw) if raw else []

end = time.time()
cost = end - start
print(data)
print("len of data : %s" % len(data))
# "%s seconds": the earlier "% seconds" was parsed as the conversion "% s"
# followed by the literal text "econds".
print("cost : %s seconds, attempt times : %s" % (cost, attemptTimes))
|