解决网站需要 Cookies 登录以及内容需要动态加载的问题
phantomjsMiddleware
class PhantomJSMiddleware(object):
    """Scrapy downloader middleware that renders each request in PhantomJS.

    The request URL is opened in one PhantomJS session to collect the cookies
    the site sets, then opened again in a second, clean session with those
    cookies replayed. The page source of the second (cookie-bearing) session
    is returned to Scrapy as an ``HtmlResponse``, so the normal downloader is
    bypassed for every request that passes through this middleware.

    NOTE(review): ``webdriver.PhantomJS`` is not available in Selenium 4 and
    later -- confirm the Selenium version pinned for this project.
    """

    # Location of the PhantomJS binary. Override on a subclass to run on a
    # machine where it is installed elsewhere.
    executable_path = r'C:\InstallFile\Phantomjs\bin\phantomjs.exe'

    # Cookie fields copied from the first session into the second one; any
    # other field reported by the browser is dropped.
    cookie_fields = ('name', 'value', 'domain', 'path', 'expiry')

    # Lifetime, in seconds from "now", given to cookies that carry no expiry
    # (session cookies). A fixed timestamp would eventually lie in the past.
    default_cookie_ttl = 24 * 60 * 60

    @classmethod
    def _cookie_for_replay(cls, cookie, default_expiry):
        """Return a copy of *cookie* reduced to the fields that are replayed.

        Args:
            cookie: Cookie dict as returned by ``driver.get_cookies()``.
            default_expiry: Unix timestamp used when the cookie has no
                ``'expiry'`` entry.

        Returns:
            A new dict holding only the keys listed in ``cookie_fields`` that
            are present in *cookie*, plus ``'expiry'``. *cookie* itself is not
            modified.
        """
        replayed = {k: cookie[k] for k in cls.cookie_fields if k in cookie}
        replayed.setdefault('expiry', default_expiry)
        return replayed

    @classmethod
    def process_request(cls, request, spider):
        """Fetch ``request.url`` through PhantomJS and return the rendered page.

        Args:
            request: The Scrapy request being processed; only ``request.url``
                is read, and the request is attached to the response.
            spider: The spider issuing the request (unused).

        Returns:
            An ``HtmlResponse`` whose body is the UTF-8 encoded page source of
            the cookie-bearing browser session.
        """
        # Kept function-local, as in the original, so that importing this
        # module does not itself require selenium or scrapy.
        import logging
        import time

        from selenium import webdriver
        from scrapy.http import HtmlResponse

        logger = logging.getLogger(__name__)

        # First session: visit the page only to harvest the cookies it sets.
        harvest_driver = webdriver.PhantomJS(cls.executable_path)
        try:
            harvest_driver.get(request.url)
            # NOTE(review): implicitly_wait configures the element-lookup
            # timeout; it is not a pause for page scripts -- confirm whether
            # an explicit wait is needed for the dynamic content.
            harvest_driver.implicitly_wait(1)
            saved_cookies = harvest_driver.get_cookies()
        finally:
            # Always release the browser process, even if the page load fails.
            harvest_driver.quit()

        # Second session: replay the cookies, then load the page with them.
        replay_driver = webdriver.PhantomJS(cls.executable_path)
        try:
            # The browser has to be on the target domain before cookies can
            # be set for it, hence the initial load.
            replay_driver.get(request.url)
            replay_driver.implicitly_wait(1)
            replay_driver.delete_all_cookies()

            default_expiry = int(time.time()) + cls.default_cookie_ttl
            for cookie in saved_cookies:
                replay_driver.add_cookie(
                    cls._cookie_for_replay(cookie, default_expiry))
                # Log the name only: cookie values are credentials.
                logger.debug('Replayed cookie %r', cookie.get('name'))

            replay_driver.get(request.url)
            # Read the page from the session that actually holds the cookies;
            # reading it from the first session would discard the replay.
            content = replay_driver.page_source.encode('utf-8')
        finally:
            replay_driver.quit()

        return HtmlResponse(request.url, encoding='utf-8', body=content,
                            request=request)