The crawler keeps looping and the process never exits


import requests
from lxml import etree
import re
from selenium import webdriver
import time
import csv
lst1=[]
lst2=[]
lst3=[]
url = 'https://www.mi.com/shop/category/list'
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
respost = requests.get(url,headers=headers).text
ele = etree.HTML(respost)
ulall = ele.xpath('//ul[@class="children-list clearix"]')
for i in ulall:
    url_all = i.xpath('./li/a/@href')  # collect every product URL under this category
    # patch up defective product links (some hrefs are protocol-relative)
    for href in url_all:  # new name: reusing i would shadow the outer loop variable
        if 'https:' in href:
            url1 = href
            headers1 = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
            respost1 = requests.get(url1, headers=headers1).text
            ele1 = etree.HTML(respost1)
            script1 = ele1.xpath('//script[@type="text/javascript"]/text()')
            for aq in script1:
                con1 = aq.split(',')
                for aw in con1:
                    ac = re.findall('"product_id":"(.*?)"', aw)
                    if ac:  # save the matched IDs
                        for xc in ac:
                            lst1.append(xc)
                            # print(xc)
        else:
            # protocol-relative link: prepend the scheme
            url2 = 'https:' + href
            drive = webdriver.Chrome()
            drive.maximize_window()
            drive.get(f'{url2}')
            time.sleep(1)  # give the page a second to render
            idall = drive.page_source  # grab the rendered page source
            ida = re.findall('6.64.2.(.*?)&', idall)  # extract the current page's id
            for qe in ida:
                if qe.isdigit():  # keep only purely numeric matches
                    lst2.append(qe)
                    # print(lst2)
            drive.quit()
    # note: this block sits inside the for-i-in-ulall loop, so it reruns per category;
    # dedenting it to run once after that loop would avoid the repeated work
    lst3 = lst1 + lst2
    lst4 = list(set(lst3))  # deduplicate; lst4 holds all collected IDs
    # print(lst4)
    lst5=[]
    lst6=[]
    lst7=[]
    lst8=[]
    lst9=[]
    acx = 0
    for w2 in lst4:
        goods_id = w2  # avoid shadowing the built-in id()
        url3 = f'https://api2.service.order.mi.com/user_comment/get_summary?show_all_tag=1&goods_id={goods_id}&v_pid=17972&support_start=0&support_len=10&add_start=0&add_len=10&profile_id=0&show_img=0&callback=__jp6'
        headers3 = {'referer': 'https://www.mi.com/',
                           'accept': 'application/json, text/plain, */*',
                           'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
                           'sec-ch-ua-mobile': '?0',
                           'sec-ch-ua-platform': "Windows",
                           'sec-fetch-dest': 'script',
                           'sec-fetch-mode': 'no-cors',
                           'sec-fetch-site': 'same-site',
                           'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}
        respost3 = requests.get(url3, headers=headers3)
        data = respost3.text
        con1 = data.split(',')
        for i in con1:
            idp = re.findall('"product_id":"(.*?)"', i)   # product ID
            if idp:
                lst6.extend(idp)  # extend, not append: findall returns a list
            mani = re.findall('"comments_total":(.*)', i)  # total review count
            if mani:
                lst7.extend(mani)
            zop = re.findall('"comments_good":(.*)', i)    # positive review count
            if zop:
                lst8.extend(zop)
            hop = re.findall('"satisfy_per":"(.*?)"', i)   # satisfaction rate
            if hop:
                lst9.extend(hop)
        url4 = f'https://www.mi.com/shop/comment/{goods_id}.html'
        headers4 = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
        respost4 = requests.get(url4, headers=headers4).text
        name = re.findall('【(.*)怎么样,好不好】用户评价-小米商城', respost4)  # product name from the page title
        if name:
            lst5.extend(name)  # extend so each entry is a string, not a list
        data_list = []
        for a, b, c, d, e in zip(lst5, lst6, lst7, lst8, lst9):
            data_list.append({'商品名称': a, 'id': b, '总评数': c, '好评数': d, '满意度': e})
        # note: 'w' rewrites the whole file on every product; gbk can also raise
        # UnicodeEncodeError on characters it cannot encode (utf-8 is safer)
        with open('小米商城.csv', 'w', encoding='gbk', newline='') as f:
            write = csv.DictWriter(f, fieldnames=['商品名称', 'id', '总评数', '好评数', '满意度'])
            write.writeheader()
            write.writerows(data_list)
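
A revised version of the script follows; it wraps the whole crawl in a counter-bounded while loop so that it performs at most five passes and then exits.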
import requests
from lxml import etree
import re
from selenium import webdriver
import time
import csv

lst1=[]
lst2=[]
lst3=[]

# counter for the number of crawl passes
count = 0

# run at most five passes (count is incremented at the end of each pass)
while count < 5:
    url = 'https://www.mi.com/shop/category/list'
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
    respost = requests.get(url,headers=headers).text
    ele = etree.HTML(respost)
    ulall = ele.xpath('//ul[@class="children-list clearix"]')
    for i in ulall:
        url_all = i.xpath('./li/a/@href')  # collect every product URL under this category
        # patch up defective product links (some hrefs are protocol-relative)
        for href in url_all:  # new name: reusing i would shadow the outer loop variable
            if 'https:' in href:
                url1 = href
                headers1 = {
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
                respost1 = requests.get(url1, headers=headers1).text
                ele1 = etree.HTML(respost1)
                script1 = ele1.xpath('//script[@type="text/javascript"]/text()')
                for aq in script1:
                    con1 = aq.split(',')
                    for aw in con1:
                        ac = re.findall('"product_id":"(.*?)"', aw)
                        if ac:  # save the matched IDs
                            for xc in ac:
                                lst1.append(xc)
                                # print(xc)
            else:
                # protocol-relative link: prepend the scheme
                url2 = 'https:' + href
                drive = webdriver.Chrome()
                drive.maximize_window()
                drive.get(f'{url2}')
                time.sleep(1)  # give the page a second to render
                idall = drive.page_source  # grab the rendered page source
                ida = re.findall('6.64.2.(.*?)&', idall)  # extract the current page's id
                for qe in ida:
                    if qe.isdigit():  # keep only purely numeric matches
                        lst2.append(qe)
                        # print(lst2)
                drive.quit()
    lst3 = lst1 + lst2
    lst4 = list(set(lst3))  # deduplicate; lst4 holds all collected IDs
    # print(lst4)
    lst5=[]
    lst6=[]
    lst7=[]
    lst8=[]
    lst9=[]
    acx = 0
    for w2 in lst4:
        goods_id = w2  # avoid shadowing the built-in id()
        url3 = f'https://api2.service.order.mi.com/user_comment/get_summary?show_all_tag=1&goods_id={goods_id}&v_pid=17972&support_start=0&support_len=10&add_start=0&add_len=10&profile_id=0&show_img=0&callback=__jp6'
        headers3 = {'referer': 'https://www.mi.com/',
                               'accept': 'application/json, text/plain, */*',
                               'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="86", "Google Chrome";v="86"',
                               'sec-fetch-site': 'same-origin',
                               'sec-fetch-mode': 'cors',
                               'sec-fetch-dest': 'empty',
                               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
        respost3 = requests.get(url3, headers=headers3).text
        # print(respost3)
        # the response is JSONP text, not HTML; etree.HTML would find no
        # <script> tags in it, so split and search the raw text directly
        con3 = respost3.split(',')
        for z in con3:
            xc3 = re.findall('"content":"(.*?)"', z)
            if xc3:
                for ax in xc3:
                    lst5.append(ax)
        good = re.findall('"good":"(.*?)"', respost3)
        for g in good:
            lst6.append(g)
        general = re.findall('"general":"(.*?)"', respost3)
        for g in general:
            lst7.append(g)
        poor = re.findall('"poor":"(.*?)"', respost3)
        for p in poor:
            lst8.append(p)
        ac = re.findall('"all":"(.*?)"', respost3)
        for a in ac:
            lst9.append(a)
        acx = acx + 1
        print(f'crawling item {acx}')
    # save; zip stops at the shortest list, so unequal list lengths
    # cannot raise the IndexError the index-based loop risked
    with open('mi.csv', 'a', newline='', encoding='utf-8') as f:
        write = csv.writer(f)
        for row in zip(lst4, lst5, lst6, lst7, lst8, lst9):
            write.writerow(row)
    # clear the lists so the next pass starts fresh
    lst1.clear()
    lst2.clear()
    lst3.clear()
    lst4.clear()
    lst5.clear()
    lst6.clear()
    lst7.clear()
    lst8.clear()
    lst9.clear()
    acx = 0
    # advance the pass counter; without this, count < 5 stays true forever
    # and the crawler never exits, which is exactly the reported symptom
    count += 1
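
The root cause of the never-ending run is the loop control: count was never incremented anywhere in the loop body, so while count < 5 could never become false. Advancing the counter once per pass, as above, lets the process exit after five rounds. The original terminating check, if ulall.index(i) == 4: break, could not work either: by that point i held an href string rather than a member of ulall, so index() raises ValueError.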



If you study how this page actually serves its data, there is a much simpler way to scrape it, both faster and cleaner; none of this complexity is needed.
The page already hands you the complete data in dictionary (JSON) format.
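
For example, here is a minimal sketch of that idea, reusing the get_summary endpoint from the code above. The goods_id value is a placeholder, and the exact nesting of the returned structure is an assumption; print it once to see the real layout:

import json
import re
import requests

goods_id = '10000'  # hypothetical id; substitute one collected by the crawl
url = ('https://api2.service.order.mi.com/user_comment/get_summary'
       f'?show_all_tag=1&goods_id={goods_id}&v_pid=17972&support_start=0'
       '&support_len=10&add_start=0&add_len=10&profile_id=0&show_img=0&callback=__jp6')
headers = {'referer': 'https://www.mi.com/',
           'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}
text = requests.get(url, headers=headers).text
# The response is JSONP: __jp6({...}); strip the callback wrapper and parse
# the payload as JSON instead of fishing fields out with regexes.
payload = re.search(r'__jp6\((.*)\)', text, re.S).group(1)
data = json.loads(payload)
# Assumed layout: the fields the regexes above matched ("comments_total",
# "comments_good", "satisfy_per", ...) sit somewhere inside this dict.
print(json.dumps(data, ensure_ascii=False, indent=2))

Once the structure is known, each field is a plain dictionary lookup, which is both faster and far less fragile than splitting the response text on commas.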