import xlwt
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import json

class GmarketSpdier:
    def __init__(self):
        self.start_url = "http://browse.gmarket.co.kr/search?keyword=t-shirt&k=21&p={}"
        self.driver = webdriver.Chrome()

    def get_content_list(self):  # extract the data from the current page
        html = self.driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        div_list = soup.find_all('div', class_='box__component box__component-itemcard box__component-itemcard--general')
        content_list = []
        for div in div_list:
            item = {}
            for h1 in div.find_all('span', class_='text__item'):
                item['상품명'] = h1.text      # product name
            for h2 in div.find_all('strong', class_='text text__value'):
                item['상품금액'] = h2.text    # price
            for h3 in div.find_all('span', class_='text__tag'):
                item['배송'] = h3.text        # shipping
            for h4 in div.find_all('span', class_='image__awards-points'):
                item['평점'] = h4.text        # rating
            for h5 in div.find_all('li', class_='list-item list-item__pay-count'):
                item['구매'] = h5.text        # purchase count
            for h6 in div.find_all('a', class_='link__item'):
                item['net'] = str(h6)         # product link markup
            content_list.append(item)
        return content_list  # return a list of dicts

    def next_url(self):  # build the URLs of the following pages
        return [self.start_url.format(a) for a in range(2, 1000)]

    def save_content_list(self, i=1):  # note: a new workbook is created and i restarts at 1 on every call
        worksheet = xlwt.Workbook(encoding='utf-8')
        excelTabel = worksheet.add_sheet('GmarketData')
        excelTabel.write(0, 0, '상품명')
        excelTabel.write(0, 1, '상품금액')
        excelTabel.write(0, 2, '배송')
        excelTabel.write(0, 3, '평점')
        excelTabel.write(0, 4, '구매')
        excelTabel.write(0, 5, 'net')
        col = excelTabel.col(1)
        col.width = 5000
        for shirt in self.get_content_list():
            excelTabel.write(i, 0, shirt['상품명'])
            excelTabel.write(i, 1, shirt['상품금액'])
            excelTabel.write(i, 2, shirt['배송'])
            excelTabel.write(i, 3, shirt['평점'] if None else '')
            excelTabel.write(i, 4, shirt['구매'] if None else '')
            excelTabel.write(i, 5, shirt['net'])
            i += 1
        worksheet.save('Gmarket.xls')

    def run(self):  # entry point
        next_url_list = self.next_url()
        for url in next_url_list:
            html_str = self.driver.get(url)
            time.sleep(3)  # wait 3 seconds for the page to load before extracting
            self.get_content_list()
            self.save_content_list()  # called once per page, so each page overwrites Gmarket.xls

if __name__ == '__main__':
    gmarket = GmarketSpdier()
    gmarket.run()
Writing the data with xlwt makes the code fairly convoluted.
It is more convenient to collect the data in a pandas DataFrame and then export it to an Excel or CSV file. The code is as follows.
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import pandas as pd

class GmarketSpdier:
    def __init__(self):
        self.start_url = "http://browse.gmarket.co.kr/search?keyword=t-shirt&k=21&p={}"
        self.driver = webdriver.Chrome()

    def get_content_list(self):  # extract the data from the current page
        html = self.driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        div_list = soup.find_all('div', class_='box__component box__component-itemcard box__component-itemcard--general')
        content_list = []
        for div in div_list:
            item = {}
            for h1 in div.find_all('span', class_='text__item'):
                item['상품명'] = h1.text
            for h2 in div.find_all('strong', class_='text text__value'):
                item['상품금액'] = h2.text
            for h3 in div.find_all('span', class_='text__tag'):
                item['배송'] = h3.text
            for h4 in div.find_all('span', class_='image__awards-points'):
                item['평점'] = h4.text
            for h5 in div.find_all('li', class_='list-item list-item__pay-count'):
                item['구매'] = h5.text
            for h6 in div.find_all('a', class_='link__item'):
                item['net'] = str(h6)
            content_list.append(item)
        df = pd.DataFrame(content_list)
        print(len(df))
        return df

    def next_url(self):  # build the URLs of the following pages
        return [self.start_url.format(a) for a in range(2, 4)]

    def run(self):  # entry point
        next_url_list = self.next_url()
        df_all = pd.DataFrame()
        for url in next_url_list:
            self.driver.get(url)
            time.sleep(3)  # wait 3 seconds for the page to load before extracting
            df = self.get_content_list()
            df_all = pd.concat([df_all, df], axis=0)
        df_all.to_excel('data_all.xlsx')  # recent pandas no longer accepts encoding= here; .xlsx export needs openpyxl

if __name__ == '__main__':
    gmarket = GmarketSpdier()
    gmarket.run()
Explanation: in the run method a DataFrame named df_all is created to hold all of the data; inside the URL loop each page's df is appended to df_all, and after the loop df_all is exported to an Excel file.
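A variant of the same idea: instead of concatenating inside the loop, each page's DataFrame can be collected in a list and concatenated once at the end, and a CSV export works as well if Excel is not required. A minimal sketch of such a run method, assuming the same get_content_list and next_url methods as above:

    def run(self):  # collect per-page frames, concatenate once at the end
        frames = []
        for url in self.next_url():
            self.driver.get(url)
            time.sleep(3)  # let the page finish loading before reading page_source
            frames.append(self.get_content_list())
        df_all = pd.concat(frames, ignore_index=True)
        df_all.to_excel('data_all.xlsx', index=False)  # .xlsx export needs openpyxl installed
        df_all.to_csv('data_all.csv', index=False, encoding='utf-8-sig')  # or export to CSV instead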
If you would rather stick with xlwt, you can try modifying the code like this.
import xlwt
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import json

i = 1  # global row counter, shared across all pages

class GmarketSpdier:
    def __init__(self):
        self.start_url = "http://browse.gmarket.co.kr/search?keyword=t-shirt&k=21&p={}"
        self.driver = webdriver.Chrome()

    def get_content_list(self):  # extract the data from the current page
        html = self.driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        div_list = soup.find_all('div', class_='box__component box__component-itemcard box__component-itemcard--general')
        content_list = []
        for div in div_list:
            item = {}
            for h1 in div.find_all('span', class_='text__item'):
                item['상품명'] = h1.text
            for h2 in div.find_all('strong', class_='text text__value'):
                item['상품금액'] = h2.text
            for h3 in div.find_all('span', class_='text__tag'):
                item['배송'] = h3.text
            for h4 in div.find_all('span', class_='image__awards-points'):
                item['평점'] = h4.text
            for h5 in div.find_all('li', class_='list-item list-item__pay-count'):
                item['구매'] = h5.text
            for h6 in div.find_all('a', class_='link__item'):
                item['net'] = str(h6)
            content_list.append(item)
        return content_list  # return a list of dicts

    def next_url(self):  # build the URLs of the following pages
        return [self.start_url.format(a) for a in range(2, 4)]

    def create_wb(self):  # create the workbook and write the header row once
        worksheet = xlwt.Workbook(encoding='utf-8')
        excelTabel = worksheet.add_sheet('GmarketData')
        excelTabel.write(0, 0, '상품명')
        excelTabel.write(0, 1, '상품금액')
        excelTabel.write(0, 2, '배송')
        excelTabel.write(0, 3, '평점')
        excelTabel.write(0, 4, '구매')
        excelTabel.write(0, 5, 'net')
        col = excelTabel.col(1)
        col.width = 5000
        return worksheet, excelTabel

    def save_content_list(self, worksheet, excelTabel):
        global i
        for shirt in self.get_content_list():
            excelTabel.write(i, 0, shirt['상품명'])
            excelTabel.write(i, 1, shirt['상품금액'])
            excelTabel.write(i, 2, shirt['배송'])
            excelTabel.write(i, 3, shirt['평점'] if None else '')
            excelTabel.write(i, 4, shirt['구매'] if None else '')
            excelTabel.write(i, 5, shirt['net'])
            i += 1
        worksheet.save('Gmarket.xls')

    def run(self):  # entry point
        next_url_list = self.next_url()
        worksheet, excelTabel = self.create_wb()
        for url in next_url_list:
            self.driver.get(url)
            time.sleep(3)  # wait 3 seconds for the page to load before extracting
            self.get_content_list()
            self.save_content_list(worksheet, excelTabel)

if __name__ == '__main__':
    gmarket = GmarketSpdier()
    gmarket.run()
This code can save multiple pages, but the extraction of some of the fields still seems to have problems.
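One likely cause of the field problem is that not every product card contains all of the elements, so some keys are missing from the item dict; shirt['평점'] then raises KeyError, and the expression shirt['평점'] if None else '' always writes an empty string because if None is always false. A possible fix is to read every field with dict.get(); a minimal sketch of the inner write loop only, as a drop-in for the loop in save_content_list above (same columns and counter assumed):

        for shirt in self.get_content_list():
            # dict.get() falls back to '' when a card did not contain the element,
            # so a missing rating or pay-count no longer raises KeyError
            excelTabel.write(i, 0, shirt.get('상품명', ''))
            excelTabel.write(i, 1, shirt.get('상품금액', ''))
            excelTabel.write(i, 2, shirt.get('배송', ''))
            excelTabel.write(i, 3, shirt.get('평점', ''))
            excelTabel.write(i, 4, shirt.get('구매', ''))
            excelTabel.write(i, 5, shirt.get('net', ''))
            i += 1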
As for the row counter: in the create_wb / save_content_list version above, i is defined as a global variable and simply incremented with i += 1.
The version below instead scrapes all of the pages first and only then writes the file once with worksheet.save('Gmarket.xls').
import xlwt
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import json

class GmarketSpdier:
    def __init__(self):
        self.start_url = "http://browse.gmarket.co.kr/search?keyword=t-shirt&k=21&p={}"
        self.driver = webdriver.Chrome()

    def get_content_list(self):  # extract the data from the current page
        html = self.driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        div_list = soup.find_all('div', class_='box__component box__component-itemcard box__component-itemcard--general')
        content_list = []
        for div in div_list:
            item = {}
            for h1 in div.find_all('span', class_='text__item'):
                item['상품명'] = h1.text
            for h2 in div.find_all('strong', class_='text text__value'):
                item['상품금액'] = h2.text
            for h3 in div.find_all('span', class_='text__tag'):
                item['배송'] = h3.text
            for h4 in div.find_all('span', class_='image__awards-points'):
                item['평점'] = h4.text
            for h5 in div.find_all('li', class_='list-item list-item__pay-count'):
                item['구매'] = h5.text
            for h6 in div.find_all('a', class_='link__item'):
                item['net'] = str(h6)
            content_list.append(item)
        return content_list  # return a list of dicts

    def next_url(self):  # build the URLs of the following pages
        return [self.start_url.format(a) for a in range(2, 7)]  # only 5 pages, to keep the test quick

    def save_content_list(self, i=1):
        worksheet = xlwt.Workbook(encoding='utf-8')
        excelTabel = worksheet.add_sheet('GmarketData')
        excelTabel.write(0, 0, '상품명')
        excelTabel.write(0, 1, '상품금액')
        excelTabel.write(0, 2, '배송')
        excelTabel.write(0, 3, '평점')
        excelTabel.write(0, 4, '구매')
        excelTabel.write(0, 5, 'net')
        col = excelTabel.col(1)
        col.width = 5000
        next_url_list = self.next_url()
        for url in next_url_list:
            self.driver.get(url)
            time.sleep(3)  # wait 3 seconds for the page to load before extracting
            print(url)
            for shirt in self.get_content_list():
                print(i, shirt['상품금액'], shirt['상품명'])
                excelTabel.write(i, 0, shirt['상품명'])
                excelTabel.write(i, 1, shirt['상품금액'])
                excelTabel.write(i, 2, shirt['배송'])
                excelTabel.write(i, 3, shirt['평점'] if None else '')
                excelTabel.write(i, 4, shirt['구매'] if None else '')
                excelTabel.write(i, 5, shirt['net'])
                i += 1
        worksheet.save('Gmarket.xls')  # written once, after all pages have been scraped

    def run(self):  # entry point
        self.save_content_list()

if __name__ == '__main__':
    gmarket = GmarketSpdier()
    gmarket.run()
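One small point that applies to all of the versions above: the Chrome window opened in __init__ is never closed. Calling self.driver.quit() when scraping is done, ideally in a try/finally so it also runs after an error, avoids leaving browser processes behind. A minimal sketch of such a run method for the last version:

    def run(self):  # close the browser even if scraping fails halfway
        try:
            self.save_content_list()
        finally:
            self.driver.quit()  # shut down Chrome and the webdriver process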