import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import SciencedirectItem
from scrapy.pipelines.images import ImagesPipeline
class SciiSpider(CrawlSpider):
    """Crawl ScienceDirect search results for 'composting'.

    Follows article-detail links (parsed by ``parse_item``) and
    pagination links (followed, not parsed).
    """

    name = 'sci'
    allowed_domains = ['sciencedirect.com']
    start_urls = ['https://www.sciencedirect.com/search?qs=composting']

    rules = (
        # Article detail pages -> extract item fields.
        Rule(LinkExtractor(allow=r'/science/article/pii/S\d+'), callback='parse_item'),
        # Result-list pagination -> follow to subsequent pages.
        Rule(LinkExtractor(allow=r'&show=25&offset=\d+'), follow=True),
    )

    def parse_item(self, response):
        """Extract title, abstract and download URL from one article page.

        Yields a single ``SciencedirectItem``.
        """
        item = SciencedirectItem()
        # .get() returns one string (or None), not a one-element list as
        # the original .extract() did — keeps 'article' consistent with
        # 'abstract'.
        item['article'] = response.xpath('//span[@class="title-text"]/text()').get()
        item['abstract'] = response.xpath('//p[@id="abspara0010"]/text()').get()
        article_url = response.xpath('//div[@class="buttons text-s"]/a/@href').get()
        # urljoin resolves relative hrefs against the page URL and avoids
        # the double slash produced by naive concatenation; the guard
        # prevents a TypeError when the link is absent (e.g. login wall).
        item['article_url'] = response.urljoin(article_url) if article_url else None
        yield item
import scrapy
class SciencedirectItem(scrapy.Item):
    """Container for one scraped ScienceDirect article."""

    article = scrapy.Field()      # article title text
    abstract = scrapy.Field()     # abstract paragraph text
    article_url = scrapy.Field()  # URL of the article's download link
# Scrapy project settings (settings.py fragment).
BOT_NAME = 'sciencedirect'
SPIDER_MODULES = ['sciencedirect.spiders']
NEWSPIDER_MODULE = 'sciencedirect.spiders'
# Suppress INFO/DEBUG noise; only warnings and errors are logged.
LOG_LEVEL='WARNING'
# Politeness throttle: wait 3 seconds between requests to the site.
DOWNLOAD_DELAY = 3
# Browser-like default headers sent with every request; some sites
# serve different (or blocked) content without them.
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Language': 'zh-CN,zh;q=0.9',
}
# Route scraped items through the project pipeline (priority 300;
# lower numbers run first).
ITEM_PIPELINES = {
'sciencedirect.pipelines.SciencedirectPipeline': 300
}
要爬取的网址是这个:https://www.sciencedirect.com/search?qs=composting&show=100,刚0基础学python两个月,用目前所学的知识写了这个,找不到原因在哪,望各位大神帮帮忙。
翻页信息:
要爬取的下载网址:
每篇文章详情页面要爬取的摘要信息:
你有没有查看你爬取的都是什么东西。
你可以用requests来测试你爬取的结果是什么。
我用浏览器打开那个网址,呈现的是需要我登录邮箱,所以我估计你用 scrapy 得到的也是输入邮箱的页面
您好,我是有问必答小助手,你的问题已经有小伙伴为您解答了问题,您看下是否解决了您的问题,可以追评进行沟通哦~
如果有您比较满意的答案 / 帮您提供解决思路的答案,可以点击【采纳】按钮,给回答的小伙伴一些鼓励哦~~
ps:问答VIP仅需29元,即可享受5次/月 有问必答服务,了解详情>>> https://vip.csdn.net/askvip?utm_source=1146287632