Scrapy + Selenium: the driver is created in the downloader middleware. I enabled multithreading (concurrency), but only one driver ever gets created and does all the work. Why not several? Does the concurrency setting only apply to the spider?
In settings.py:
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 3
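Worth checking first: CONCURRENT_REQUESTS does not start any threads. Scrapy runs on Twisted's event loop, and every downloader middleware call happens on the same reactor thread; the setting only caps how many requests may be in flight at once as asynchronous I/O. A minimal probe makes this visible (ThreadProbeMiddleware is a hypothetical name, not part of your project):

import threading

class ThreadProbeMiddleware:
    # Hypothetical probe: even with CONCURRENT_REQUESTS = 3, the log shows the
    # same thread name for every request, because Scrapy's concurrency is
    # asynchronous I/O on one thread, not multithreading.
    def process_request(self, request, spider):
        spider.logger.info('fetching %s on thread %s',
                           request.url, threading.current_thread().name)
        return None  # let the normal download proceed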
In the middleware (middlewares.py):
from random import choice
from time import sleep

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import ChromeOptions


class DangdangDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self):
        self.chrome_options = ChromeOptions()
        self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # self.chrome_options.add_argument('--headless')
        # self.chrome_options.add_argument('--disable-gpu')
        self.chrome_options.add_argument('log-level=3')
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.implicitly_wait(10)

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this method exactly once to create the middleware
        # instance, so __init__ (and the single driver) runs only once.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def get_ip(self):
        path = r'C:\Users\BigFish\PycharmProjects\pythonProject37264\辅助工具\爬虫辅助工具\IP_pool\ip_pool.txt'
        with open(path, 'r', encoding='utf8') as f:
            ip_list = [i.strip() for i in f.readlines()]
        return ip_list

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        ip_list = self.get_ip()
        # Note: because this method returns a Response itself, Scrapy's own
        # downloader never fetches the request, so this proxy setting is
        # effectively unused -- and the Selenium driver does not pick it up.
        request.meta['proxy'] = 'http://' + choice(ip_list)
        self.driver.get(request.url)
        sleep(2)
        response = HtmlResponse(request.url, body=self.driver.page_source,
                                request=request, encoding='utf8')
        # print(self.driver.page_source)
        # self.driver.quit()
        return response

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
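This is also why you only ever see one browser: from_crawler is called once per crawler, so __init__ runs once and builds a single Chrome, and because process_request blocks inside self.driver.get(), requests pass through it strictly one at a time no matter what CONCURRENT_REQUESTS says. If you want several browsers inside one Scrapy process, one commonly suggested pattern is a small pool of drivers with the blocking work pushed onto Twisted's thread pool. A hedged sketch, not a drop-in fix (POOL_SIZE and the middleware name are assumptions; recent Scrapy versions resolve a Deferred returned from process_request, but verify on your version):

from queue import Queue

from scrapy.http import HtmlResponse
from selenium import webdriver
from twisted.internet.threads import deferToThread


class SeleniumPoolMiddleware:
    POOL_SIZE = 3  # hypothetical; match CONCURRENT_REQUESTS

    def __init__(self):
        # Several independent drivers, handed out one per in-flight request.
        self.pool = Queue()
        for _ in range(self.POOL_SIZE):
            self.pool.put(webdriver.Chrome())

    def _fetch(self, request):
        # Runs in a worker thread, so blocking here does not stall the reactor.
        driver = self.pool.get()
        try:
            driver.get(request.url)
            body = driver.page_source
        finally:
            self.pool.put(driver)  # always return the driver to the pool
        return HtmlResponse(request.url, body=body, request=request,
                            encoding='utf8')

    def process_request(self, request, spider):
        # Returning a Deferred lets the engine keep dispatching other requests
        # while this one renders in a browser on a worker thread.
        return deferToThread(self._fetch, request)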
Open the settings.py file and modify the following: DOWNLOADER_MIDDLEWARES should reference the middleware class you just wrote in middlewares.py, and ITEM_PIPELINES the class in pipelines.py. For example (from an unrelated project):
BOT_NAME = 'air_history'
SPIDER_MODULES = ['air_history.spiders']
NEWSPIDER_MODULE = 'air_history.spiders'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
DOWNLOADER_MIDDLEWARES = {
    'air_history.middlewares.AreaSpiderMiddleware': 543,
}
ITEM_PIPELINES = {
    'air_history.pipelines.AirHistoryPipeline': 300,
}
In principle what you need here is multiprocessing, not multithreading: Scrapy never runs your middleware on multiple threads, so within one process the single blocking driver is the ceiling.
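Each extra OS process gets its own reactor, its own engine, and its own middleware instance, hence its own Chrome. A minimal sketch of that route (the spider name 'dangdang' and the URL split are assumptions):

from multiprocessing import Process

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def crawl(url_chunk):
    # One engine, one DangdangDownloaderMiddleware, one Chrome per process.
    process = CrawlerProcess(get_project_settings())
    process.crawl('dangdang', start_urls=url_chunk)  # spider name assumed
    process.start()


if __name__ == '__main__':
    # Hypothetical split of the start URLs across two processes.
    chunks = [
        ['http://category.dangdang.com/cp01.54.00.00.00.00.html'],
        ['http://category.dangdang.com/cp01.54.06.00.00.00.html'],
    ]
    workers = [Process(target=crawl, args=(chunk,)) for chunk in chunks]
    for w in workers:
        w.start()
    for w in workers:
        w.join()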