关于爬虫爬取页数的问题

想要翻页爬取,而不是只爬一页,但是下面这段代码每次只能爬到同一页,无论怎么改 Referer 里的 page 都不行

import requests
import re
import json
import time
# Question code: tries to crawl pages 1..19 of a Douban Read book's comments by
# POSTing a GraphQL query, then appends the extracted comment text to a file.
# NOTE: the loop counter `i` only reaches the progress message and the Referer
# header; the request body below always sends "start": 20, so every iteration
# posts an identical payload.
for i in range(1,20,1):
 print("正在爬取" +str(i))
 # Browser-like request headers; only Referer varies with the page number.
 headers = {
    "Accept": "application/json",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "https://read.douban.com",
    "Referer": "https://read.douban.com/ebook/57468782/comments?page=" +str(i),
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63",
    # Same value as the "ck" cookie below (presumably the CSRF token pair -- confirm).
    "X-CSRF-Token": "HrwZ",
    "X-Requested-With": "XMLHttpRequest",
    "sec-ch-ua": "\"Chromium\";v=\"104\", \" Not A;Brand\";v=\"99\", \"Microsoft Edge\";v=\"104\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\""
 }
 # Hard-coded cookies sent with every request.
 # NOTE(review): these look like values copied from a logged-in browser session -- confirm
 # before sharing or committing them.
 cookies = {
     "bid": "CtjGQQts6qw",
     "_ga": "GA1.3.854662765.1661781900",
     "__gads": "ID=d5118ecfd9b7e0af-2274d7276ad60026:T=1661830434:RT=1661830434:S=ALNI_Mal2pDexGkPbLkfBDTGaasIQBwdVg",
     "ll": "\"108288\"",
     "__utma": "30149280.854662765.1661781900.1661864911.1661864911.1",
     "__utmz": "30149280.1661864911.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
     "dbcl2": "\"262046614:rlftVkO/IYI\"",
     "ck": "HrwZ",
     "_pk_ses.100001.a7dd": "*",
     "_gid": "GA1.3.1915073618.1662102188",
     "_gat": "1",
     "__gpi": "UID=0000093d4aa6ff27:T=1661830434:RT=1662102190:S=ALNI_MaJaflHC8_SKOY1tp75x72g4tBu-g",
     "_ga_RXNMP372GL": "GS1.1.1662102176.14.1.1662102202.34.0.0",
     "_pk_id.100001.a7dd": "684c82dba8991abc.1661781900.10.1662102203.1661928898."
 }
 url = "https://read.douban.com/j/graphql"
 # GraphQL request body: the query text, its variables and the operation name.
 data = {
     "query": "\n    query getWorksComment($worksId: ID!, $start: Int, $limit: Int, $sort: ReviewSortEnum, $onlyUserId: ID, $onlyCompetition: Boolean, $reviewType: String) {\n      works: works(worksId: $worksId) {\n        worksType\n        \n  ... on WorksBase {\n    mixedCommentCount\n  }\n\n\n  ... on WorksBase {\n    isCompetition\n  }\n\n\n    ... on WorksBase {\n      comments: reviews(\n        start: $start, limit: $limit, sort: $sort, onlyUserId: $onlyUserId, competitionOnly: $onlyCompetition, lengthType: $reviewType\n      ) {\n        list {\n          \n  ... on CommentBase {\n    id\n    isHidden\n    isDeleted\n    \n  ... on CommentBase {\n    id\n    works {\n      agent {\n        id\n      }\n      \n  title\n  url\n  isChapter\n\n    }\n    user {\n      id\n      avatar: picture(size: MEDIUM)\n      name\n      url\n      isVip\n      ... on Agent {\n        agentName\n        hasMedal\n        agentId\n      }\n    }\n    createTime\n    commentType\n    donation {\n      amount\n    }\n    ... on Review {\n      url\n      badge {\n        url\n        image\n        title\n        color\n      }\n    }\n    ... on Annotation {\n      url\n    }\n    ... on WorksRecommend {\n      score\n      isEditorChoice\n    }\n    hasPurchasedAllBadge\n  }\n\n    \n  ... on CommentBase {\n    id\n    content\n    commentType\n    user {\n      id\n      ... on Agent {\n        agentName\n      }\n    }\n    ... on Discussion {\n      refComment {\n        id\n        user {\n          id\n          name\n          url\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n        isDeleted\n        createTime\n        content\n      }\n    }\n    ... on Review {\n      title\n      badge {\n        label\n        color\n      }\n    }\n    ... on Annotation {\n      \n  ... 
on Annotation {\n    originContent {\n      rawTexts\n      startOffset\n      endOffset\n      image {\n        url\n        size { width height }\n      }\n    }\n  }\n\n    }\n    ... on WorksRecommend {\n      title\n    }\n    ... on ReviewComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n    ... on AnnotationComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n    ... on WorksRecommendComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n  }\n\n    \n  ... on CommentBase {\n    id\n    commentType\n    isHidden\n    isDeleted\n    content\n    user {\n      id\n      name\n      isBlocked\n      ... on Agent {\n        agentName\n      }\n    }\n    works {\n      id\n      cover(useSmall: true)\n      \n  title\n  url\n  isChapter\n\n    }\n    operationInfo {\n      editor {\n        id\n        name\n        url\n      }\n      time\n    }\n    ... on Review {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Annotation {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Discussion {\n      targetId\n      upvoted\n      upvoteCount\n      works {\n        title\n        url\n      }\n    }\n    ... on WorksRecommend {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on ReviewComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n    ... 
on AnnotationComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n    ... on WorksRecommendComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n  }\n\n    \n  ... on WorksRecommend {\n    id\n    works {\n      id\n      \n    title\n    cover(useSmall: true)\n    url\n    isBundle\n    coverLabel(preferVip: true)\n  \n      \n  url\n  title\n\n      \n  author {\n    name\n    url\n  }\n  origAuthor {\n    name\n    url\n  }\n  translator {\n    name\n    url\n  }\n\n      isColumn\n      isFinished\n      wordCount\n      wordCountUnit\n      isInLibrary\n      \n    }\n  }\n\n    \n  ... on CommentBase {\n    id\n    commentType\n    content\n    works {\n      id\n      title\n    }\n    user {\n      id\n      name\n      isBlocked\n      ... on Agent {\n        agentName\n      }\n    }\n    isHidden\n    isDeleted\n    operationInfo {\n      editor {\n        id\n        name\n        url\n      }\n      time\n    }\n    ... on Review {\n      title\n      rating\n      reviewId\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Annotation {\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on WorksRecommend {\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Discussion {\n      targetId\n    }\n    ... on ReviewComment {\n      targetId\n    }\n    ... on AnnotationComment {\n      targetId\n    }\n    ... on WorksRecommendComment {\n      targetId\n    }\n  }\n  \n  ... on CommentBase {\n    id\n    commentType\n    content\n    works {\n      id\n    }\n    user {\n      id\n      name\n      ... on Agent {\n        agentName\n      }\n    }\n    ... on Review {\n      reviewId\n    }\n  }\n\n  \n  ... on CommentBase {\n    id\n    commentType\n    works {\n      id\n    }\n    ... on Review {\n      reviewId\n    }\n    ... on Discussion {\n      targetId\n    }\n    ... on ReviewComment {\n      targetId\n    }\n    ... 
on AnnotationComment {\n      targetId\n    }\n    ... on WorksRecommendComment {\n      targetId\n    }\n  }\n\n\n  }\n\n        }\n        total\n      }\n    }\n  \n      }\n    }\n  ",
     "variables": {
         "worksId": "57468782",
         # Constant on every iteration: this is why each request returns the same comments.
         "start": 20,
         "limit": 20,
         "sort": "SCORE_DESC",
         "onlyUserId": None,
         "onlyCompetition": False
     },
     "operationName": "getWorksComment"
 }
 # The bare except reports any failure in serialising, sending or scanning as "page failed".
 # NOTE: when that happens `content` is not reassigned, so the code after the try block
 # either raises NameError (first iteration) or rewrites the previous iteration's matches.
 try:
         data = json.dumps(data)
         response = requests.post(url, headers=headers,cookies=cookies ,data=data)
         # Pause five seconds after each request.
         time.sleep(5)
         # Lazily capture whatever sits between `content":"` and the next `","` in the raw body.
         conzhen = 'content":"(.*?)","'
         content = re.compile(conzhen).findall(response.text)
 except:
         print("本页爬取失败")
 # Reopened in append mode on every iteration, using the platform's default text encoding.
 fh = open('测试写入.txt', 'a')
 # NOTE: reuses the name `i` and starts at index 1, so content[0] is never printed or written.
 for i in range(1 , len(content)):
     print(content[i])
     fh.write(content[i] + '\n')
 fh.close()


img

import requests
import re
import json
import time

# Douban Read comment crawler.
#
# Pages through the comments of one book via the site's GraphQL endpoint and
# appends every comment body to a UTF-8 text file, one write per comment.
# Run as a script; importing the module performs no network or file I/O.

GRAPHQL_URL = "https://read.douban.com/j/graphql"
WORKS_ID = "57468782"
PAGE_SIZE = 20  # comments requested per page (GraphQL `limit`)
FIRST_PAGE = 1
LAST_PAGE = 19  # inclusive
REQUEST_DELAY_SECONDS = 5  # pause between pages
REQUEST_TIMEOUT_SECONDS = 30  # requests never times out unless told to
OUTPUT_PATH = '测试写入.txt'

# Browser-like headers shared by every request; Referer is added per page.
BASE_HEADERS = {
    "Accept": "application/json",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "https://read.douban.com",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63",
    # Same value as the "ck" cookie (presumably the CSRF token pair -- confirm).
    "X-CSRF-Token": "HrwZ",
    "X-Requested-With": "XMLHttpRequest",
    "sec-ch-ua": "Chromium;v=104, Not A;Brand;v=99, Microsoft Edge;v=104",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "Windows",
}

# NOTE(review): these look like cookies copied from a logged-in browser
# session; replace them with your own and avoid publishing real values.
COOKIES = {
    "bid": "CtjGQQts6qw",
    "_ga": "GA1.3.854662765.1661781900",
    "__gads": "ID=d5118ecfd9b7e0af-2274d7276ad60026:T=1661830434:RT=1661830434:S=ALNI_Mal2pDexGkPbLkfBDTGaasIQBwdVg",
    "ll": "108288",
    "__utma": "30149280.854662765.1661781900.1661864911.1661864911.1",
    "__utmz": "30149280.1661864911.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
    "dbcl2": "262046614:rlftVkO/IYI",
    "ck": "HrwZ",
    "_pk_ses.100001.a7dd": "*",
    "_gid": "GA1.3.1915073618.1662102188",
    "_gat": "1",
    "__gpi": "UID=0000093d4aa6ff27:T=1661830434:RT=1662102190:S=ALNI_MaJaflHC8_SKOY1tp75x72g4tBu-g",
    "_ga_RXNMP372GL": "GS1.1.1662102176.14.1.1662102202.34.0.0",
    "_pk_id.100001.a7dd": "684c82dba8991abc.1661781900.10.1662102203.1661928898.",
}

# GraphQL document sent with every request. It is one logical string, split
# into adjacent literals only to keep each source line a valid literal.
COMMENT_QUERY = (
    "query getWorksComment($worksId: ID!, $start: Int, $limit: Int, $sort: ReviewSortEnum, $onlyUserId: ID, $onlyCompetition: Boolean, $reviewType: String) {\n      works: works(worksId: $worksId) {\n        worksType\n        \n  ... on WorksBase {\n    mixedCommentCount\n  }\n\n\n  ... on WorksBase {\n    isCompetition\n  }\n\n\n    ... on WorksBase {\n      comments: reviews(\n        start: $start, limit: $limit, sort: $sort, onlyUserId: $onlyUserId, competitionOnly: $onlyCompetition, lengthType: $reviewType\n      ) {\n        list {\n          \n  ... on CommentBase {\n    id\n    isHidden\n    isDeleted\n    \n  ... on CommentBase {\n    id\n    works {\n      agent {\n        id\n      }\n      \n  title\n  url\n  isChapter\n\n    }\n    user {\n      id\n      avatar: picture(size: MEDIUM)\n      name\n      url\n      isVip\n      ... on Agent {\n        agentName\n        hasMedal\n        agentId\n      }\n    }\n    createTime\n    commentType\n    donation {\n      amount\n    }\n    ... on Review {\n      url\n      badge {\n        url\n        image\n        title\n        color\n      }\n    }\n    ... on Annotation {\n      url\n    }\n    ... on WorksRecommend {\n      score\n      isEditorChoice\n    }\n    hasPurchasedAllBadge\n  }\n\n    \n  ... on CommentBase {\n    id\n    content\n    commentType\n    user {\n      id\n      ... on Agent {\n        agentName\n      }\n    }\n    ... on Discussion {\n      refComment {\n        id\n        user {\n          id\n          name\n          url\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n        isDeleted\n        createTime\n        content\n      }\n    }\n    ... on Review {\n      title\n      badge {\n        label\n        color\n      }\n    }\n    ... on Annotation {\n      \n  ... "
    "on Annotation {\n    originContent {\n      rawTexts\n      startOffset\n      endOffset\n      image {\n        url\n        size { width height }\n      }\n    }\n  }\n\n    }\n    ... on WorksRecommend {\n      title\n    }\n    ... on ReviewComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n    ... on AnnotationComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n    ... on WorksRecommendComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n  }\n\n    \n  ... on CommentBase {\n    id\n    commentType\n    isHidden\n    isDeleted\n    content\n    user {\n      id\n      name\n      isBlocked\n      ... on Agent {\n        agentName\n      }\n    }\n    works {\n      id\n      cover(useSmall: true)\n      \n  title\n  url\n  isChapter\n\n    }\n    operationInfo {\n      editor {\n        id\n        name\n        url\n      }\n      time\n    }\n    ... on Review {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Annotation {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Discussion {\n      targetId\n      upvoted\n      upvoteCount\n      works {\n        title\n        url\n      }\n    }\n    ... on WorksRecommend {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on ReviewComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n    ... "
    "on AnnotationComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n    ... on WorksRecommendComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n  }\n\n    \n  ... on WorksRecommend {\n    id\n    works {\n      id\n      \n    title\n    cover(useSmall: true)\n    url\n    isBundle\n    coverLabel(preferVip: true)\n  \n      \n  url\n  title\n\n      \n  author {\n    name\n    url\n  }\n  origAuthor {\n    name\n    url\n  }\n  translator {\n    name\n    url\n  }\n\n      isColumn\n      isFinished\n      wordCount\n      wordCountUnit\n      isInLibrary\n      \n    }\n  }\n\n    \n  ... on CommentBase {\n    id\n    commentType\n    content\n    works {\n      id\n      title\n    }\n    user {\n      id\n      name\n      isBlocked\n      ... on Agent {\n        agentName\n      }\n    }\n    isHidden\n    isDeleted\n    operationInfo {\n      editor {\n        id\n        name\n        url\n      }\n      time\n    }\n    ... on Review {\n      title\n      rating\n      reviewId\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Annotation {\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on WorksRecommend {\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Discussion {\n      targetId\n    }\n    ... on ReviewComment {\n      targetId\n    }\n    ... on AnnotationComment {\n      targetId\n    }\n    ... on WorksRecommendComment {\n      targetId\n    }\n  }\n  \n  ... on CommentBase {\n    id\n    commentType\n    content\n    works {\n      id\n    }\n    user {\n      id\n      name\n      ... on Agent {\n        agentName\n      }\n    }\n    ... on Review {\n      reviewId\n    }\n  }\n\n  \n  ... on CommentBase {\n    id\n    commentType\n    works {\n      id\n    }\n    ... on Review {\n      reviewId\n    }\n    ... on Discussion {\n      targetId\n    }\n    ... on ReviewComment {\n      targetId\n    }\n    ... "
    "on AnnotationComment {\n      targetId\n    }\n    ... on WorksRecommendComment {\n      targetId\n    }\n  }\n\n\n  }\n\n        }\n        total\n      }\n    }\n  \n      }\n    }\n  "
)


def page_offset(page, page_size=PAGE_SIZE):
    """Return the zero-based ``start`` offset for a 1-based page number.

    Page 1 maps to offset 0, so the first ``page_size`` comments are not
    skipped (``page_size * page`` would start at the second page).
    """
    return (page - 1) * page_size


def build_payload(page):
    """Return the JSON request body (as a string) that asks for *page*."""
    return json.dumps({
        "query": COMMENT_QUERY,
        "variables": {
            "worksId": WORKS_ID,
            "start": page_offset(page),
            "limit": PAGE_SIZE,
            "sort": "SCORE_DESC",
            "onlyUserId": None,
            "onlyCompetition": False,
        },
        "operationName": "getWorksComment",
    })


def extract_contents(response_text):
    """Return the ``content`` string of every comment in a response body.

    The body is parsed as JSON, so escape sequences in the comment text are
    decoded. The lookup path follows the aliases used in ``COMMENT_QUERY``
    (``works`` -> ``comments`` -> ``list``), which a GraphQL response mirrors.
    An empty list is returned when that path is absent (for example when the
    server answers with ``"data": null``).

    Raises:
        ValueError: if *response_text* is not valid JSON.
    """
    payload = json.loads(response_text)
    try:
        comments = payload["data"]["works"]["comments"]["list"]
    except (KeyError, TypeError):
        return []
    if not isinstance(comments, list):
        return []
    return [
        item["content"]
        for item in comments
        if isinstance(item, dict) and isinstance(item.get("content"), str)
    ]


def fetch_page(page):
    """POST the comment query for *page* and return its comment texts.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx HTTP status.
        ValueError: if the response body is not valid JSON.
    """
    referer = "https://read.douban.com/ebook/" + WORKS_ID + "/comments?page=" + str(page)
    response = requests.post(
        GRAPHQL_URL,
        headers=dict(BASE_HEADERS, Referer=referer),
        cookies=COOKIES,
        data=build_payload(page),
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    return extract_contents(response.text)


def main():
    """Crawl pages FIRST_PAGE..LAST_PAGE and append each comment to OUTPUT_PATH."""
    # Explicit UTF-8: the platform default (e.g. GBK on Chinese Windows) cannot
    # encode every character that may appear in a comment.
    with open(OUTPUT_PATH, 'a', encoding='utf-8') as out:
        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            print("正在爬取" + str(page))
            try:
                contents = fetch_page(page)
            except (requests.RequestException, ValueError) as err:
                # A failed page is reported and skipped; the crawl continues.
                print("本页爬取失败", err)
            else:
                print(len(contents))
                for text in contents:
                    print(text)
                    out.write(text + '\n')
            # Pause between pages so the server is not hit in a tight loop.
            time.sleep(REQUEST_DELAY_SECONDS)


if __name__ == "__main__":
    main()

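需要改变请求体里的参数,否则每次发送的都是同一个请求(Referer 只是请求头,返回哪一页由请求体里的 start 决定)。改变 start 参数:
把 "start": 20, 改为 "start": 20*i,(start 是偏移量;i 从 1 开始时 20*i 会跳过最前面的 20 条,如果要从第一页开始,应写成 20*(i-1))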