The framework won't run. The spider and the downloader middleware are pasted below.
# zhipin/spiders/boss.py (default Scrapy project layout assumed)
import scrapy

from zhipin.req import SeleniumRequest


class BossSpider(scrapy.Spider):
    name = 'boss'
    # allowed_domains must cover the domain actually crawled; the original
    # value 'zhipin.com' does not match the ifeng.com start URL.
    allowed_domains = ['ifeng.com']
    start_urls = ['https://mil.ifeng.com/shanklist/14-35083-']

    def start_requests(self):  # was start_request(); Scrapy only calls start_requests()
        yield SeleniumRequest(
            url=self.start_urls[0],
            method="GET",  # was methoad="get"
            callback=self.parse,
        )

    def parse(self, response, **kwargs):
        li_list = response.xpath("/html/body/div/div[5]/div[1]/div/ul/li")
        for li in li_list:
            title = li.xpath("./div/h2/a/text()").extract_first()
            print(title)
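The module zhipin/req.py is not shown in the question. For the spider above and the middleware below to work together, it presumably only needs to define SeleniumRequest as a marker subclass of scrapy.Request, so the middleware can pick these requests out with isinstance(). A minimal sketch under that assumption (file name and docstring are mine, not from the original):

# zhipin/req.py (sketch; the real file is not shown)
import scrapy


class SeleniumRequest(scrapy.Request):
    """Marker request type: requests of this class are rendered by the
    Selenium-driven downloader middleware instead of Scrapy's downloader."""
    pass

The downloader middleware that intercepts these requests: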
# zhipin/middlewares.py (default Scrapy project layout assumed)
import time

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service

from zhipin.req import SeleniumRequest
class ZhipinDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy uses this method to create the middleware; the signal hooks
        # tie the browser's lifetime to spider_opened / spider_closed.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s
    def process_request(self, request, spider):
        # Only SeleniumRequest objects are rendered in the browser; for any
        # other request this returns None and Scrapy downloads it normally.
        if isinstance(request, SeleniumRequest):
            self.web.get(request.url)
            time.sleep(2)  # crude wait for JavaScript-rendered content
            page_source = self.web.page_source
            return HtmlResponse(
                url=request.url,
                status=200,
                body=page_source,
                request=request,
                encoding="utf-8",
            )
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass
    def spider_opened(self, spider):
        # Start one shared browser when the spider opens (Selenium 4 style;
        # on Selenium 3 use Chrome(executable_path="./chromedriver") instead).
        self.web = Chrome(service=Service("./chromedriver"))
        self.web.implicitly_wait(10)

    def spider_closed(self, spider):
        # quit() (rather than close()) shuts down the whole browser session,
        # so no chromedriver process is left behind.
        if self.web:
            self.web.quit()
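None of the middleware code runs unless it is enabled in the project's settings; an inactive downloader middleware is a common reason a Scrapy + Selenium setup appears not to run at all. A minimal sketch, assuming the middleware lives in zhipin/middlewares.py as in the default project layout (the priority 543 is just the conventional slot from the Scrapy template):

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    "zhipin.middlewares.ZhipinDownloaderMiddleware": 543,
}

With that in place, the crawl is started from the project root with: scrapy crawl boss (matching the spider's name attribute).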