
# Scrapy settings for xiaoshuopro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings by consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
# Project identity: the bot name plus the package(s) where spiders live.
BOT_NAME = "xiaoshuopro"

# Modules searched for spider classes.
SPIDER_MODULES = ["xiaoshuopro.spiders"]
# Module that receives spiders created with `scrapy genspider`.
NEWSPIDER_MODULE = "xiaoshuopro.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent.
# NOTE: the value below is a desktop browser string, not a bot identifier.
# User-Agent header value, split across lines for readability; the adjacent
# literals are concatenated into a single string at compile time.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.44"
)
# Obey robots.txt rules (set to False below, so robots.txt is NOT honored)
# False: robots.txt policies are not respected by the crawler.
ROBOTSTXT_OBEY = False

# Minimum severity that gets logged; anything below ERROR is suppressed.
LOG_LEVEL = "ERROR"
#CONCURRENT_REQUESTS = 32
# Delay between requests to the same website.
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1  # seconds to wait between consecutive requests to one site
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
#SPIDER_MIDDLEWARES = {
#    'xiaoshuopro.middlewares.XiaoshuoproSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
#DOWNLOADER_MIDDLEWARES = {
#    'xiaoshuopro.middlewares.XiaoshuoproDownloaderMiddleware': 543,
#}
# Enable or disable extensions
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Item pipelines to run, keyed by dotted import path. The integer is the
# ordering priority: lower values run first, so the Mongo pipeline (300)
# processes each item before the bd pipeline (301).
# The keys must be plain dotted paths. A trailing ':' would become part of
# the class name Scrapy looks up, and the pipeline would fail to load.
ITEM_PIPELINES = {
    "xiaoshuopro.pipelines.mongoPipeline": 300,
    "xiaoshuopro.pipelines.bdPipeline": 301,
}
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'