Scraping Dynamic Pages with Scrapy, Selenium and PhantomJS

Scrapy has long been one of the more mature crawling solutions in the Python world. JavaScript is now used more and more widely on the web, and a growing number of sites generate their content dynamically with JavaScript, which breaks crawlers that only parse static HTML. There are many ways to crawl such dynamic sites; Selenium + PhantomJS is one of the simpler and more stable ones.

Selenium is a browser automation tool for testing web applications; here we use its Python bindings. PhantomJS can be thought of as a headless browser built on the WebKit engine. By pointing Selenium's WebDriver at PhantomJS, we let PhantomJS load the page and execute its JavaScript, so we can parse the fully rendered result.
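To make the division of labor concrete, here is a minimal, standalone sketch (independent of the spider below) of driving PhantomJS through Selenium and reading back the rendered HTML; the executable path is an assumption and should point at your own PhantomJS binary:

```python
from selenium import webdriver

# Launch a headless PhantomJS browser through Selenium's WebDriver interface.
# The path below is an assumption; replace it with your PhantomJS location.
driver = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs')
driver.get('http://finance.ccb.com/cn/finance/product.html')
# page_source is the DOM *after* PhantomJS has executed the page's JavaScript.
html = driver.page_source
print(len(html))
driver.quit()
```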

This post uses China Construction Bank (CCB) wealth-management product data as an example.

Below is a screenshot of CCB's wealth-management page. The product data highlighted in the red box is loaded later by JavaScript; it is not baked into the initial HTML.
The wealth-management page

I also found that simply loading the page with PhantomJS does not trigger the script that fetches the product list. After inspecting the page, it turned out that selecting a region first and then loading the page a second time is enough to get the corresponding product list.
The product list

Without further ado, here is the complete code.

The spider module

```python
# -*- coding: utf-8 -*-
import scrapy, urlparse, re
from selenium import webdriver
from scrapy.http import HtmlResponse, Request
from scrapy.loader.processors import MapCompose
from robot.items import FinanceItem
from w3lib.html import remove_tags
from datetime import datetime
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep
from robot.db.modules import FinanceInfo
from robot.util import FinanceLoader


class CcbSpider(scrapy.Spider):
    '''
    China Construction Bank (CCB) wealth-management spider
    '''
    name = "ccb"
    allowed_domains = ["ccb.com"]
    module = FinanceInfo

    def __init__(self, *args, **kwargs):
        try:
            PHANTOMJS_PATH = kwargs['PHANTOMJS_PATH']
            self.driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH,
                                              service_args=["--ssl-protocol=any",
                                                            "--ignore-ssl-errors=true",
                                                            "--load-images=false",
                                                            "--disk-cache=true"])
        except Exception as e:
            self.logger.error(e, exc_info=True)
            exit(-2)
        super(CcbSpider, self).__init__(*args, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Pass the PhantomJS path from settings.py into __init__
        kwargs['PHANTOMJS_PATH'] = crawler.settings['PHANTOMJS_PATH']
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    def start_requests(self):
        url = 'http://finance.ccb.com/cn/finance/product.html'
        self.driver.get(url)
        # Click the element with id "txt"
        self.driver.find_element_by_id("txt").click()
        wait = WebDriverWait(self.driver, 2)
        # Wait for the element with class "select_hide" to become visible
        wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'select_hide')))
        # Click the element with id "500000" (Chongqing)
        self.driver.find_element_by_id("500000").click()
        self.driver.get(url)
        # Iterate over the product-category tabs, selected via CSS
        for element in self.driver.find_elements_by_css_selector(".life_tab>a"):
            element.click()
            sleep(1)
            while True:
                content = self.driver.page_source.encode('utf-8')
                url = self.driver.current_url.encode('utf-8')
                resp = HtmlResponse(url, encoding='utf-8', status=200, body=content)
                div = resp.css(".insurance_tab_detail[style*='display: block']")
                hrefs = div.css("td[class='list_title'] a::attr(href)").extract()
                for href in hrefs:
                    req = Request(url=urlparse.urljoin(url, href), callback=self.parse)
                    req.meta['parse'] = True
                    yield req

                if self.driver.find_element_by_id("pageDiv").is_displayed():
                    current, total = resp.css("#pageNum").xpath("./text()").extract()[0].split("/", 1)
                    if int(current) == int(total):
                        break
                    else:
                        self.driver.find_element_by_id("next").click()
                else:
                    break

    def parse(self, response):
        self.logger.info("Start to parse the url %s \n", response.url)
        self.logger.info("url: %s", response.url)
        load = FinanceLoader(item=FinanceItem(), response=response)
        load.add_value('updatetime', datetime.now())
        load.add_css('name', "#name", MapCompose(remove_tags))
        load.add_css('id', "#pdId", MapCompose(remove_tags))
        load.add_value('type', u"理财")
        expected_annual_return = response.css("#yieldRate2").xpath("./text()").extract()
        if len(expected_annual_return) > 0:
            expected_annual_return = expected_annual_return[0]
            tmp = re.compile(u"\d+.\d+%").findall(expected_annual_return)
            if len(tmp) == 0:
                load.add_value("expected_annual_return", expected_annual_return)
            else:
                load.add_value("expected_annual_return", u",".join(tmp))
        invest_duration = response.css("#investPeriod2").xpath("./text()").extract()
        if len(invest_duration) > 0:
            invest_duration = invest_duration[0]
            tmp = re.compile(u"(\d+)天").findall(invest_duration)
            if len(tmp) == 0:
                load.add_value("invest_duration", invest_duration)
            else:
                load.add_value("invest_duration", u",".join(tmp))
        load.add_css("currency", "#currencyType", MapCompose(remove_tags))
        load.add_css("launch_area", "#saleCitys", MapCompose(remove_tags))
        load.add_css("subtype", "#yieldSpec", MapCompose(remove_tags))
        load.add_css("risk_level", "#riskLevel", MapCompose(remove_tags))
        load.add_css("redeem", "#proMode", MapCompose(remove_tags))
        detail = response.css("#instructionUrl a::attr(href)").extract()
        if len(detail) > 0:
            detail = detail[0]
            if not detail.strip().startswith("http"):
                detail = urlparse.urljoin("http://finance.ccb.com", detail)
            load.add_value("detail", detail)
        minimum_amount = response.css("#purFloorAmt2").xpath("./text()").extract()
        if len(minimum_amount) > 0:
            minimum_amount = minimum_amount[0]
            try:
                tmp = re.compile(u"(\d+)万").search(minimum_amount).group(1)
                tmp = str(int(tmp) * 10000)
            except AttributeError as e:
                tmp = '0'
            load.add_value('minimum_amount', tmp)
        start_date = response.css("#collBgnDate3").xpath("./text()").extract()
        if len(start_date) > 0:
            start_date = start_date[0].strip()
            try:
                start_date = datetime.strptime(start_date, "%Y.%m.%d %H:%M").date()
                load.add_value("start_date", start_date)
            except Exception as e:
                pass
        end_date = response.css("#collEndDate3").xpath("./text()").extract()
        if len(end_date) > 0:
            end_date = end_date[0].strip()
            try:
                end_date = datetime.strptime(end_date, "%Y.%m.%d %H:%M").date()
                load.add_value("end_date", end_date)
            except Exception as e:
                pass
        item = load.load_item()
        self.logger.debug("ID: %s", load.get_value(response.css("#pdId").extract()[0], MapCompose(remove_tags))[0])
        self.logger.debug("item: %s", str(item))
        return item

    def closed(self, reason):
        # Explicitly shut PhantomJS down when the spider finishes
        self.driver.quit()

    def __str__(self):
        return "CcbSpider"
```

The Scrapy settings file

```python
# -*- coding: utf-8 -*-
BOT_NAME = 'robot'

SPIDER_MODULES = ['robot.spiders']
NEWSPIDER_MODULE = 'robot.spiders'

# Logging settings
# LOG_FILE = os.path.normpath(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "log/spider.log"))
LOG_LEVEL = "INFO"
LOG_STDOUT = False
LOG_FORMAT = '%(asctime)s %(filename)s[line:%(lineno)d] [%(name)s] %(levelname)s: %(message)s'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'

# Configure a delay for requests to the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True


# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'robot.middlewares.PhantomJSMiddleware': 1000,
}


# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'robot.pipelines.DBPipeline': 1000,
}

PHANTOMJS_PATH = r'/root/phantomjs/bin/phantomjs'
DB_PATH = r'mysql+pymysql://robot:passw0rd@172.23.23.113:3306/robot'
```
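Note that the settings register a ```robot.middlewares.PhantomJSMiddleware``` downloader middleware and a ```robot.pipelines.DBPipeline``` item pipeline whose source is not shown in this post. As a rough idea of what such a middleware could look like, here is a hedged sketch that renders marked requests through the spider's PhantomJS driver; the class body and the use of the ```parse``` meta flag are assumptions, not the project's actual code:

```python
# Sketch of a possible PhantomJS downloader middleware (assumed, not the
# project's actual robot/middlewares.py). It relies on the spider exposing
# a `driver` attribute, as CcbSpider does above.
from scrapy.http import HtmlResponse


class PhantomJSMiddleware(object):
    def process_request(self, request, spider):
        # Only render requests the spider explicitly marked for PhantomJS.
        if not request.meta.get('parse'):
            return None
        spider.driver.get(request.url)
        body = spider.driver.page_source.encode('utf-8')
        # Returning a Response here short-circuits Scrapy's normal download.
        return HtmlResponse(url=request.url, body=body, encoding='utf-8', status=200)
```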

Code walkthrough

First, add the PhantomJS installation path to Scrapy's settings.py, i.e. the ```PHANTOMJS_PATH = r'/root/phantomjs/bin/phantomjs'``` line in the settings above.

Next, let's walk through the spider module. In the spider's ```__init__```, we start Selenium's WebDriver and set its browser to PhantomJS: ```self.driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH, service_args=["--ssl-protocol=any", "--ignore-ssl-errors=true", "--load-images=false", "--disk-cache=true"])```. The service arguments are:

- ```--ssl-protocol=any, --ignore-ssl-errors=true``` relax SSL handling: accept any SSL protocol and ignore certificate errors
- ```--load-images=false``` tells PhantomJS not to download images, which speeds it up
- ```--disk-cache=true``` enables the local disk cache, which also speeds PhantomJS up

Then, in ```start_requests```:

1. Simulate a click on the element with id ```txt```: ```self.driver.find_element_by_id("txt").click()```, as shown here: ![The element with id txt](http://img.blog.csdn.net/20170817124919420?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvSnVsa290/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)
2. Use WebDriver's explicit wait to wait for the region popup to become visible: ```wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'select_hide')))```
3. In the popup, simulate a click on Chongqing (id ```500000```): ```self.driver.find_element_by_id("500000").click()```, as shown here: ![Chongqing](http://img.blog.csdn.net/20170817144201979?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvSnVsa290/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)
4. Fetch the page again: ```self.driver.get(url)```.
   > PS: *My guess is that the clicks above store the region information in PhantomJS's page cache, so on the second request the region no longer has to be set and the corresponding product list can be retrieved right away.*
5. Iterate over the product-category tabs: ```for element in self.driver.find_elements_by_css_selector(".life_tab>a"): element.click()```
   Product category tabs

6. Iterate over the products listed on the current page (the rendered page source is wrapped in an ```HtmlResponse```; see the sketch after this list), then check whether there is another page; if so, simulate a click on the "next page" button:
```python
if self.driver.find_element_by_id("pageDiv").is_displayed():
    current, total = resp.css("#pageNum").xpath("./text()").extract()[0].split("/", 1)
    if int(current) == int(total):
        break
    else:
        self.driver.find_element_by_id("next").click()
else:
    break
```
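One detail worth highlighting from step 6: because the list pages only exist inside PhantomJS, the spider wraps ```driver.page_source``` in a Scrapy ```HtmlResponse``` so the familiar ```.css()``` and ```.xpath()``` selectors can be applied to the rendered DOM. A condensed, standalone version of that pattern (the selector here is illustrative):

```python
from scrapy.http import HtmlResponse
from selenium import webdriver

driver = webdriver.PhantomJS()  # assumes phantomjs is on the PATH
driver.get('http://finance.ccb.com/cn/finance/product.html')

# Wrap the JS-rendered DOM in an HtmlResponse so Scrapy selectors work on it.
resp = HtmlResponse(driver.current_url, encoding='utf-8', status=200,
                    body=driver.page_source.encode('utf-8'))
hrefs = resp.css("td.list_title a::attr(href)").extract()
driver.quit()
```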

That is the gist of how Selenium and PhantomJS are used together here.

PS: Make sure the spider class defines ```def closed(self, reason):``` and explicitly quits PhantomJS inside it; otherwise the PhantomJS process lingers after every run. When you deploy with scrapyd, this leak means only a few spiders can run before everything hangs.
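For reference, the hook is only a few lines (it already appears in the spider above):

```python
def closed(self, reason):
    # Explicitly terminate PhantomJS when the spider finishes; without this,
    # orphaned phantomjs processes accumulate and eventually block scrapyd.
    self.driver.quit()
```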