Scrapy distributed crawler managed through RedisSpider: the spider on Win10 just sits waiting, while the one on Ubuntu receives the command and runs normally

The command has been issued from both the Ubuntu and the Windows terminal, but the spider on Windows never picks it up.
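By "command" I mean pushing the seed URL into the spider's redis_key list. A minimal sketch of what was run, using redis-py (the host/port match settings.py below; 'python' stands in for whatever keyword was actually used):

import redis

r = redis.Redis(host='192.168.233.129', port=6379)
# The spider blocks on this list (redis_key in the spider below).
seed = ('https://careers.tencent.com/tencentcareer/api/post/Query'
        '?timestamp=1630141770377&keyword=python&pageIndex=1&pageSize=10'
        '&language=zh-cn&area=cn')
r.lpush('tencent_job:spider', seed)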

spider.py


import json
from time import time
from math import ceil

import scrapy
from scrapy_redis.spiders import RedisSpider

from ..items import TencentjobItem


class TencentJobSpider(RedisSpider):
    name = 'tencent_job'
    allowed_domains = ['careers.tencent.com']
    # Class body runs at import time, so every node prompts for its own keyword.
    keyword = input('Enter a keyword: ')
    # start_urls = [f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1630141770377&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={keyword}&pageIndex=1&pageSize=10&language=zh-cn&area=cn']
    # The list this spider blocks on; seed URLs are pushed into it via lpush.
    redis_key = 'tencent_job:spider'

    # def start_requests(self):
    #     self.keyword = input('Enter a keyword: ')
    #     start_url = f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1630141770377&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={self.keyword}&pageIndex=1&pageSize=10&language=zh-cn&area=cn'
    #     yield scrapy.Request(url=start_url, callback=self.get_pages, dont_filter=True)

    def get_pages(self, response):
        # Work out how many result pages there are and request each one.
        print(response.status)
        pages = ceil(json.loads(response.text)['Data']['Count'] / 10)
        for i in range(1, pages + 1):
            url = f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1630141770377&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={self.keyword}&pageIndex={i}&pageSize=10&language=zh-cn&area=cn'
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Extract the job listings on one result page.
        posts = json.loads(response.text)['Data']['Posts']
        for post in posts:
            item = TencentjobItem()
            item['PostId'] = post['PostId']
            item['RecruitPostName'] = post['RecruitPostName']
            item['CountryName'] = post['CountryName']
            item['LocationName'] = post['LocationName']
            item['BGName'] = post['BGName']
            item['CategoryName'] = post['CategoryName']
            item['LastUpdateTime'] = post['LastUpdateTime']
            url = f"https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp={int(time()*1000)}&postId={item['PostId']}&language=zh-cn"
            item['PostURL'] = url
            yield scrapy.Request(url=url, meta={'item': item}, callback=self.get_post)

    def get_post(self, response):
        # Fill in the detail fields, then hand the finished item to the pipelines.
        item = response.meta['item']
        msg = json.loads(response.text)['Data']
        item['Responsibility'] = msg['Responsibility']
        item['Requirement'] = msg['Requirement']
        yield item
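One thing worth noting: scrapy-redis dispatches URLs popped from redis_key to parse() by default, so get_pages() only runs if the seed is routed to it explicitly. A hedged sketch of one way to do that (an addition to the spider above, not part of the original code), overriding the scrapy-redis hook make_request_from_data:

    def make_request_from_data(self, data):
        # scrapy-redis calls this with the raw bytes popped from redis_key.
        # By default the resulting Request uses self.parse; route it to
        # get_pages so the pagination logic actually runs.
        url = data.decode('utf-8')
        return scrapy.Request(url, callback=self.get_pages, dont_filter=True)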

settings.py


# Scrapy settings for TencentJob project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'TencentJob'

SPIDER_MODULES = ['TencentJob.spiders']
NEWSPIDER_MODULE = 'TencentJob.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'TencentJob (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# 1. Use the scrapy-redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# 2. Use the scrapy-redis duplicate filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# 3. Redis server IP address and port
REDIS_HOST = '192.168.233.129'
REDIS_PORT = 6379
# 4. (Optional) Add the scrapy-redis pipeline to actually store items in redis
# 5. Persist request fingerprints and the queue between runs
# SCHEDULER_PERSIST = True


# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
import random
# Evaluated once when the settings module is imported: a fixed delay in [0, 1) s.
DOWNLOAD_DELAY = random.random()
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'cookie': '_ga=GA1.2.995650264.1628241994; _gcl_au=1.1.1095672804.1628241996; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22b3d217b8577c49e925712f893a5357bf%40devS%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22%24device_id%22%3A%2217b1ac9218a8fd-08233e32f50399-4343363-1327104-17b1ac9218b832%22%7D; Hm_lvt_eaa57ca47dacb4ad4f5a257001a3457c=1630141522; Hm_lpvt_eaa57ca47dacb4ad4f5a257001a3457c=1630144033'
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'TencentJob.middlewares.TencentjobSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'TencentJob.middlewares.TencentjobDownloaderMiddleware': 543,
    'TencentJob.middlewares.TencentJobUserAgentDownloaderMiddleware': 300,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'TencentJob.pipelines.TencentjobPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 200,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# LOG_LEVEL='ERROR'
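
Since the symptom is the Windows spider sitting idle while the Ubuntu one works, a quick check worth running on the Windows box is whether it can reach this redis at all (a sketch, assuming redis-py; redis on the Ubuntu VM must also listen on a non-loopback interface, e.g. bind 0.0.0.0 with protected-mode no in redis.conf, to accept remote connections):

import redis

r = redis.Redis(host='192.168.233.129', port=6379, socket_connect_timeout=3)
print(r.ping())                      # raises ConnectionError if unreachable
print(r.llen('tencent_job:spider'))  # pending seed URLs in the spider's key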