How should I handle the row counter i? I can only save one page of data, because every time the method is called, i gets reset to 1.

import xlwt
from selenium import webdriver
import time
from bs4 import BeautifulSoup


class GmarketSpdier:

    def __init__(self):
        self.start_url = "http://browse.gmarket.co.kr/search?keyword=t-shirt&k=21&p={}"
        self.driver = webdriver.Chrome()

    def get_content_list(self):  # extract the item data from the current page

        html = self.driver.page_source
        soup = BeautifulSoup(html,'lxml')
        div_list = soup.find_all('div',class_='box__component box__component-itemcard box__component-itemcard--general')
        content_list = []
        
        for div in div_list:
            item = {}
            for h1 in div.find_all('span', class_='text__item'):
                item['상품명'] = h1.text
            for h2 in div.find_all('strong', class_='text text__value'):
                item['상품금액'] = h2.text
            for h3 in div.find_all('span', class_='text__tag'):
                item['배송'] = h3.text
            for h4 in div.find_all('span', class_='image__awards-points'):
                item['평점'] = h4.text
            for h5 in div.find_all('li', class_='list-item list-item__pay-count'):
                item['구매'] = h5.text
            for h6 in div.find_all('a', class_='link__item'):
                item['net'] = str(h6)

            content_list.append(item)

        return content_list  # return a list of item dicts

    def next_url(self):  # build the list of next-page URLs
        return [self.start_url.format(a) for a in range(2, 1000)]

    def save_content_list(self, i=1):
        worksheet = xlwt.Workbook(encoding='utf-8')
        excelTabel = worksheet.add_sheet('GmarketData')
        excelTabel.write(0, 0, '상품명')
        excelTabel.write(0, 1, '상품금액')
        excelTabel.write(0, 2, '배송')
        excelTabel.write(0, 3, '평점')
        excelTabel.write(0, 4, '구매')
        excelTabel.write(0, 5, 'net')
        col = excelTabel.col(1)
        col.width = 5000

        for shirt in self.get_content_list():
            excelTabel.write(i, 0, shirt['상품명'])
            excelTabel.write(i, 1, shirt['상품금액'])
            excelTabel.write(i, 2, shirt['배송'])
            excelTabel.write(i, 3, shirt['평점'] if None else '')
            excelTabel.write(i, 4, shirt['구매'] if None else '')
            excelTabel.write(i, 5, shirt['net'])
            i += 1

        worksheet.save('Gmarket.xls')


    def run(self):  # main entry point
        next_url_list = self.next_url()
        for url in next_url_list:
            self.driver.get(url)
            time.sleep(3)  # wait 3 seconds for the page to load before extracting
            self.get_content_list()
            self.save_content_list()


if __name__ == '__main__':
    gmarket = GmarketSpdier()
    gmarket.run()

Writing the data with xlwt makes the code a bit convoluted.

It is more convenient to collect the data in a pandas DataFrame and then save it as an Excel or CSV file. The code is as follows.

from selenium import webdriver
import time
from bs4 import BeautifulSoup
import pandas as pd

class GmarketSpdier:
    def __init__(self):
        self.start_url = "http://browse.gmarket.co.kr/search?keyword=t-shirt&k=21&p={}"
        self.driver = webdriver.Chrome()

    def get_content_list(self):  # extract the item data from the current page
        html = self.driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        div_list = soup.find_all('div',class_='box__component box__component-itemcard box__component-itemcard--general')
        content_list = []
        for div in div_list:
            item = {}
            for h1 in div.find_all('span', class_='text__item'):
                item['상품명'] = h1.text
            for h2 in div.find_all('strong', class_='text text__value'):
                item['상품금액'] = h2.text
            for h3 in div.find_all('span', class_='text__tag'):
                item['배송'] = h3.text
            for h4 in div.find_all('span', class_='image__awards-points'):
                item['평점'] = h4.text
            for h5 in div.find_all('li', class_='list-item list-item__pay-count'):
                item['구매'] = h5.text
            for h6 in div.find_all('a', class_='link__item'):
                item['net'] = str(h6)
            content_list.append(item)

        df=pd.DataFrame(content_list)
        print(len(df))
        return df

    def next_url(self):  # build the list of next-page URLs
        return [self.start_url.format(a) for a in range(2, 4)]

    def run(self):  # main entry point
        next_url_list = self.next_url()
        df_all = pd.DataFrame()
        for url in next_url_list:
            self.driver.get(url)
            time.sleep(3)  # wait 3 seconds for the page to load before extracting
            df = self.get_content_list()
            df_all = pd.concat([df_all, df], axis=0)

        df_all.to_excel('data_all.xlsx', index=False)  # note: recent pandas versions dropped to_excel's encoding argument

if __name__ == '__main__':
    gmarket = GmarketSpdier()
    gmarket.run()

Code notes: in the run method, a DataFrame named df_all is created to hold all of the data. Inside the URL loop, each page's df is appended to df_all, and at the end df_all is exported to an Excel file.
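A side note on the pattern above: each pd.concat([df_all, df]) inside the loop copies every row accumulated so far, so with many pages it is a little cheaper to collect the per-page frames in a list and concatenate once at the end. A minimal runnable sketch, with dummy data standing in for what get_content_list() returns:

import pandas as pd

# Dummy per-page results standing in for get_content_list() output.
pages = [
    [{'상품명': 'shirt A', '상품금액': '9,900'}],
    [{'상품명': 'shirt B', '상품금액': '12,000'}],
]

frames = [pd.DataFrame(items) for items in pages]  # one DataFrame per page
df_all = pd.concat(frames, ignore_index=True)      # renumber rows 0..N-1 across pages
df_all.to_excel('data_all.xlsx', index=False)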

If you want to keep using xlwt, you can try modifying the code like this.

import xlwt
from selenium import webdriver
import time
from bs4 import BeautifulSoup

i = 1  # global row counter, shared across calls to save_content_list
class GmarketSpdier:
    def __init__(self):
        self.start_url = "http://browse.gmarket.co.kr/search?keyword=t-shirt&k=21&p={}"
        self.driver = webdriver.Chrome()

    def get_content_list(self):  # extract the item data from the current page
        html = self.driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        div_list = soup.find_all('div',
                                 class_='box__component box__component-itemcard box__component-itemcard--general')
        content_list = []
        for div in div_list:
            item = {}
            for h1 in div.find_all('span', class_='text__item'):
                item['상품명'] = h1.text
            for h2 in div.find_all('strong', class_='text text__value'):
                item['상품금액'] = h2.text
            for h3 in div.find_all('span', class_='text__tag'):
                item['배송'] = h3.text
            for h4 in div.find_all('span', class_='image__awards-points'):
                item['평점'] = h4.text
            for h5 in div.find_all('li', class_='list-item list-item__pay-count'):
                item['구매'] = h5.text
            for h6 in div.find_all('a', class_='link__item'):
                item['net'] = str(h6)
            content_list.append(item)

        return content_list  # return a list of item dicts

    def next_url(self):  # build the list of next-page URLs
        return [self.start_url.format(a) for a in range(2, 4)]

    def create_wb(self):
        worksheet = xlwt.Workbook(encoding='utf-8')
        excelTabel = worksheet.add_sheet('GmarketData')
        excelTabel.write(0, 0, '상품명')
        excelTabel.write(0, 1, '상품금액')
        excelTabel.write(0, 2, '배송')
        excelTabel.write(0, 3, '평점')
        excelTabel.write(0, 4, '구매')
        excelTabel.write(0, 5, 'net')
        col = excelTabel.col(1)
        col.width = 5000
        return worksheet,excelTabel

    def save_content_list(self, worksheet,excelTabel):
        global i  # use the module-level counter so the row index persists across pages
        for shirt in self.get_content_list():
            excelTabel.write(i, 0, shirt['상품명'])
            excelTabel.write(i, 1, shirt['상품금액'])
            excelTabel.write(i, 2, shirt['배송'])
            excelTabel.write(i, 3, shirt['평점'] if None else '')
            excelTabel.write(i, 4, shirt['구매'] if None else '')
            excelTabel.write(i, 5, shirt['net'])
            i += 1

        worksheet.save('Gmarket.xls')

    def run(self):  # main entry point
        next_url_list = self.next_url()
        worksheet, excelTabel = self.create_wb()
        for url in next_url_list:
            self.driver.get(url)
            time.sleep(3)  # wait 3 seconds for the page to load before extracting
            self.save_content_list(worksheet, excelTabel)  # parses the page itself via get_content_list

if __name__ == '__main__':
    gmarket = GmarketSpdier()
    gmarket.run()

This version can save multiple pages, but the field extraction in the middle seems to have some problems.
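The extraction problem is most likely the shirt['평점'] if None else '' ternary: None is the condition being tested there, and since None is always falsy, the expression always evaluates to '', so the 평점 and 구매 columns come out empty on every row even when the page had a value. If the intent was "use the value when the key exists, otherwise an empty string", dict.get expresses exactly that. A minimal runnable demo:

shirt = {'상품명': 'shirt A'}            # the '평점' key is missing for this item

print(shirt['상품명'] if None else '')   # prints '' because the condition None is always falsy
print(shirt.get('평점', ''))             # prints '' because the key is absent
print(shirt.get('상품명', ''))           # prints 'shirt A', the real value is used when present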

Just define i as a global variable and then increment it (i += 1; Python has no i++).
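A global works; an alternative that avoids module-level state is to keep the counter on the instance. This is only a sketch, and self.row is a hypothetical attribute name, not something from the code above:

class RowCounterDemo:
    # Sketch: the counter lives on the instance, so it persists across calls
    # without a global. 'self.row' is a made-up name for illustration.
    def __init__(self):
        self.row = 1  # row 0 holds the header; data starts at row 1

    def save_page(self, items):
        for item in items:
            # excelTabel.write(self.row, 0, item['상품명'])  # write as before
            self.row += 1  # unlike a default argument, this survives across calls

demo = RowCounterDemo()
demo.save_page([{'상품명': 'a'}, {'상품명': 'b'}])
demo.save_page([{'상품명': 'c'}])
print(demo.row)  # 4 -> the counter kept counting across both pages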

This version scrapes all of the pages first and then writes everything in one go with worksheet.save('Gmarket.xls').

 

import xlwt
from selenium import webdriver
import time
from bs4 import BeautifulSoup


class GmarketSpdier:
    def __init__(self):
        self.start_url = "http://browse.gmarket.co.kr/search?keyword=t-shirt&k=21&p={}"
        self.driver = webdriver.Chrome()

    def get_content_list(self):  # extract the item data from the current page
        html = self.driver.page_source
        soup = BeautifulSoup(html,'lxml')
        div_list = soup.find_all('div',class_='box__component box__component-itemcard box__component-itemcard--general')
        content_list = []
        for div in div_list:
            item = {}
            for h1 in div.find_all('span', class_='text__item'):
                item['상품명'] = h1.text
            for h2 in div.find_all('strong', class_='text text__value'):
                item['상품금액'] = h2.text
            for h3 in div.find_all('span', class_='text__tag'):
                item['배송'] = h3.text
            for h4 in div.find_all('span', class_='image__awards-points'):
                item['평점'] = h4.text
            for h5 in div.find_all('li', class_='list-item list-item__pay-count'):
                item['구매'] = h5.text
            for h6 in div.find_all('a', class_='link__item'):
                item['net'] = str(h6)
            content_list.append(item)

        return content_list  # return a list of item dicts

    def next_url(self):  # build the list of next-page URLs
        return [self.start_url.format(a) for a in range(2, 7)]  # only 5 pages, to keep the test fast

    def save_content_list(self, i=1):
        worksheet = xlwt.Workbook(encoding='utf-8')
        excelTabel = worksheet.add_sheet('GmarketData')
        excelTabel.write(0, 0, '상품명')
        excelTabel.write(0, 1, '상품금액')
        excelTabel.write(0, 2, '배송')
        excelTabel.write(0, 3, '평점')
        excelTabel.write(0, 4, '구매')
        excelTabel.write(0, 5, 'net')
        col = excelTabel.col(1)
        col.width = 5000

        next_url_list = self.next_url()
        for url in next_url_list:
            self.driver.get(url)
            time.sleep(3)  # wait 3 seconds for the page to load before extracting
            print(url)

            for shirt in self.get_content_list():
                print(i,shirt['상품금액'],shirt['상품명'])
                excelTabel.write(i, 0, shirt['상품명'])
                excelTabel.write(i, 1, shirt['상품금액'])
                excelTabel.write(i, 2, shirt['배송'])
                excelTabel.write(i, 3, shirt.get('평점', ''))  # .get writes the real value when present; the old 'if None' ternary always wrote ''
                excelTabel.write(i, 4, shirt.get('구매', ''))
                excelTabel.write(i, 5, shirt['net'])
                i += 1

        worksheet.save('Gmarket.xls')

    def run(self):  # main entry point
        self.save_content_list()

if __name__ == '__main__':
    gmarket = GmarketSpdier()
    gmarket.run()
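One more optional tweak: the fixed time.sleep(3) can be replaced with an explicit Selenium wait, which returns as soon as the item cards are present instead of always pausing 3 seconds. A minimal sketch, assuming the item-card class from the selectors above is still what the page renders:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("http://browse.gmarket.co.kr/search?keyword=t-shirt&k=21&p=2")

# Block until at least one item card is in the DOM, or raise after 10 seconds.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, "div.box__component-itemcard")
    )
)
html = driver.page_source  # now safe to hand to BeautifulSoup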

 
