import csv
import requests
from lxml import etree
# Crawl listing pages 1-110 of shifair.com exhibitions and print the raw
# extracted fields (exploratory first pass, no persistence yet).
for page in range(1, 111):
    # timeout added: requests.get without a timeout can hang forever on a
    # stalled connection, freezing the whole crawl.
    resp = requests.get(
        f"https://www.shifair.com/exhibition/0-0-0-0-0-0-add_time-desc-{page}/",
        timeout=10,
    )
    html = etree.HTML(resp.text)
    # NOTE(review): positional paths like div[5]/div[2] are brittle against
    # layout changes; class-based selectors would be safer.
    divs = html.xpath("/html/body/div/div[5]/div[2]/div")
    for div in divs:
        zhanhui = div.xpath("./div/h1/a/text()")            # exhibition title
        tuijian = div.xpath("./div[2]/div[1]/span/text()")  # recommendation tags
        buttons = div.xpath("./div[2]/div[3]/button/text()")
        scales = div.xpath("./div[2]/div[4]/div/text()")    # scale / size info
        time = div.xpath("./div[2]/div[2]/div[1]/text()")   # exhibition dates
        day = div.xpath("./div[2]/div[2]/div[2]/span/text()")
        hot = div.xpath("./div[1]/div/span/text()")         # popularity score
        pinglun = div.xpath("./div[2]/div[5]/div[1]/span/text()")  # comment count
        dingyue = div.xpath("./div[2]/div[5]/div[2]/button/text()")  # subscribe label
        print(zhanhui, tuijian, buttons, scales, time, day, hot, pinglun, dingyue)
存储数据
# data 数据 path 存储路径 data.csv
def write_to_csv(data, path, encoding='utf-8'):
    """Write rows of data to a CSV file and return its path.

    Args:
        data: iterable of rows, each row an iterable of cell values.
        path: destination file path, e.g. ``data.csv``.
        encoding: text encoding of the output file.  Defaults to UTF-8 so
            non-ASCII text (e.g. Chinese) is not garbled by the
            platform-default codec.

    Returns:
        The ``path`` the data was written to.
    """
    # newline='' is required by the csv module to avoid blank lines on Windows.
    with open(path, 'w', newline='', encoding=encoding) as f:
        csv.writer(f).writerows(data)
    return path
import csv
import requests
from lxml import etree

# Crawl all 110 listing pages and save one CSV row per exhibition.
path = 'SAVE.CSV'
# Explicit utf-8 so Chinese text is not garbled by the platform default codec.
with open(path, 'w', newline='', encoding='utf-8') as f:
    my_writer = csv.writer(f)
    for i in range(1, 111):
        resp = requests.get(
            f"https://www.shifair.com/exhibition/0-0-0-0-0-0-add_time-desc-{i}/",
            timeout=10,  # avoid hanging the crawl on a stalled connection
        )
        html = etree.HTML(resp.text)
        divs = html.xpath("/html/body/div/div[5]/div[2]/div")
        for div in divs:
            # ''.join flattens each xpath result list into a plain string;
            # otherwise csv writes the list repr with brackets.
            zhanhui = ''.join(div.xpath("./div/h1/a/text()"))
            tuijian = ''.join(div.xpath("./div[2]/div[1]/span/text()"))
            buttons = ''.join(div.xpath("./div[2]/div[3]/button/text()"))
            scales = ''.join(div.xpath("./div[2]/div[4]/div/text()"))
            time = ''.join(div.xpath("./div[2]/div[2]/div[1]/text()"))
            day = ''.join(div.xpath("./div[2]/div[2]/div[2]/span/text()"))
            hot = ''.join(div.xpath("./div[1]/div/span/text()"))
            pinglun = ''.join(div.xpath("./div[2]/div[5]/div[1]/span/text()"))
            dingyue = ''.join(div.xpath("./div[2]/div[5]/div[2]/button/text()"))
            data = [zhanhui, tuijian, buttons, scales, time, day, hot, pinglun, dingyue]
            # BUG FIX: writerows(data) wrote each field as its own CSV row;
            # writerow writes one row per exhibition record.
            my_writer.writerow(data)
保存成这样是不是就行了?
运行了一下,有一些小细节的地方需要注意,比如编码如果不注意,写入就会是乱码,还有列表如果不转为str,写入之后都是带[]的,很不友好。
优化完之后,就是我上面截图的效果。代码如下,可以直接运行即可:(生成的test.csv文件在当前目录,如果有需要自己修改即可)
import csv
import requests
from lxml import etree
# Final working version: crawl all pages and append rows to test.csv.
# The file is opened ONCE, before the page loop -- the original reopened it
# in append mode on every one of the 110 iterations, which is wasteful.
# NOTE(review): GBK is kept from the original (Windows/Excel friendly) but
# will raise UnicodeEncodeError on characters outside GBK; use utf-8 on
# Linux/macOS.
with open("test.csv", 'a', encoding='GBK', newline="") as f:
    writer = csv.writer(f)
    for i in range(1, 111):
        resp = requests.get(
            f"https://www.shifair.com/exhibition/0-0-0-0-0-0-add_time-desc-{i}/",
            timeout=10,  # avoid hanging forever on a stalled connection
        )
        html = etree.HTML(resp.text)
        divs = html.xpath("/html/body/div/div[5]/div[2]/div")
        for div in divs:
            # ''.join converts each xpath result list to a plain string so the
            # CSV cells are not bracketed list reprs.
            zhanhui = ''.join(div.xpath("./div/h1/a/text()"))
            tuijian = ''.join(div.xpath("./div[2]/div[1]/span/text()"))
            buttons = ''.join(div.xpath("./div[2]/div[3]/button/text()"))
            scales = ''.join(div.xpath("./div[2]/div[4]/div/text()"))
            time = ''.join(div.xpath("./div[2]/div[2]/div[1]/text()"))
            day = ''.join(div.xpath("./div[2]/div[2]/div[2]/span/text()"))
            hot = ''.join(div.xpath("./div[1]/div/span/text()"))
            pinglun = ''.join(div.xpath("./div[2]/div[5]/div[1]/span/text()"))
            dingyue = ''.join(div.xpath("./div[2]/div[5]/div[2]/button/text()"))
            writer.writerow([zhanhui, tuijian, buttons, scales, time, day, hot, pinglun, dingyue])
# -*- coding:utf-8 -*-
import pandas as pd
import requests,time
from lxml import etree
def getHtmlContent(url):
    """Fetch *url* and return the response body, or ``None`` on a non-200 status.

    A browser-like User-Agent is sent; presumably the site rejects the default
    requests UA -- TODO confirm against the server's behavior.

    Args:
        url: absolute URL of the listing page to download.

    Returns:
        The HTML text when the server answers 200, otherwise ``None`` --
        callers must check before parsing.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        'Authority': 'www.shifair.com',
        'Sec-Ch-Ua-Platform': "Windows",
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
    }
    # timeout added: without it a stalled connection blocks the crawl forever.
    html = requests.get(url=url, headers=headers, timeout=10)
    if html.status_code == 200:
        return html.text
    return None  # explicit (was implicit fall-through): signals a failed page
def _first(nodes):
    """Return the first xpath text result, or '' when the element is absent.

    Guards against the IndexError the original raised via ``xpath(...)[0]``
    whenever a listing item was missing one of the optional sub-elements.
    """
    return nodes[0] if nodes else ''


def parseHtml(html):
    """Parse one listing page into a list of 10-field records.

    Class-based XPaths are used instead of positional div[5]/div[2] chains:
    positions are brittle and easy to confuse.

    Args:
        html: raw HTML text of a listing page.

    Returns:
        A list of ``[title, tags, time, days, buttons, scale, visitors,
        comments, subscribe, hot]`` rows, all plain strings.
    """
    data = []
    html = etree.HTML(html)
    divs = html.xpath('//div[@class="container"]//div[@class="info_list_block_item"]')
    for div in divs:
        zhanhui = _first(div.xpath('./div[ @class ="info_list_block_item_info"]/h1/a/ text()'))
        tuijian = div.xpath('./div[@class="info_list_block_item_info"]/div[@class="info_list_block_item_info_tab"]/span/text()')
        show_time = _first(div.xpath('./div[@class="info_list_block_item_info"]//div[@class="info_list_block_item_info_time"]/text()'))
        day = _first(div.xpath('./div[@class="info_list_block_item_info"]//div[@class="info_list_block_item_info_com"]/span/text()'))
        buttons = div.xpath('.//div[@class="info_list_block_item_info_btn"]/button/text()')
        guimo = _first(div.xpath('./div[@class="info_list_block_item_info"]//div[@class="info_list_block_item_info_gm_left"]/text()'))
        guanzhong = _first(div.xpath('./div[@class="info_list_block_item_info"]//div[@class="info_list_block_item_info_gm_right"]/text()'))
        pinglun = _first(div.xpath('./div[@class="info_list_block_item_info"]//div[@class="info_list_block_item_info_djs_list_dingyue info_list_block_item_info_com"]/span/text()'))
        dingyue = _first(div.xpath('./div[@class="info_list_block_item_info"]//div[@class="info_list_block_item_info_djs_dingyue"]/button/text()'))
        hot = _first(div.xpath('.//div[@class="info_list_block_item_logo_hot"]/span/text()'))
        # ','.join instead of str(list): avoids writing "[...]" reprs to CSV
        # (the very problem called out earlier in this thread).
        data.append([zhanhui, ','.join(tuijian), show_time, day, ','.join(buttons),
                     guimo, guanzhong, pinglun, dingyue, hot])
    return data
def main():
    """Crawl every listing page and save the combined result to result.csv."""
    result = []
    start_url = 'https://www.shifair.com/exhibition/0-0-0-0-0-0-add_time-desc-{}'
    for i in range(1, 110):
        html = getHtmlContent(start_url.format(str(i)))
        time.sleep(1)  # throttle: pause 1s between pages to reduce IP-ban risk
        if not html:
            # BUG FIX: getHtmlContent returns None on a non-200 response;
            # passing None into parseHtml would crash in etree.HTML.
            continue
        result.extend(parseHtml(html))
    # pandas builds the DataFrame once from the accumulated rows and writes
    # the CSV in a single pass, which scales to large result sets.
    df = pd.DataFrame(result, columns=['展会名称', '推荐', '时间', '距离天数', '按钮标签',
                                       '规模', '观众人数', '评论数量', '订阅', '热度'])
    # gbk suits Windows/Excel; switch to 'utf-8' on Linux or macOS.
    df.to_csv('result.csv', index=False, encoding='gbk')


if __name__ == '__main__':
    main()
我可以帮你解决这个问题。你可以按照以下步骤来运行代码并将生成的数据保存到CSV文件中:
import pandas as pd
import numpy as np
data = pd.DataFrame()
可以用 loc 逐行添加数据,或者在 for 循环中把每行数据先收集起来再合并(注意:DataFrame.append 方法已在 pandas 2.0 中移除,应改用 pd.concat):
# 生成数据
item = {}
item['公司名称'] = html.xpath('//*[@id="wap_header_top"]/div[1]/div[1]/div[1]/text()')[0]
# 将生成的数据添加到数据框中
data = pd.concat([data, pd.DataFrame([item])], ignore_index=True)  # 注:DataFrame.append 已在 pandas 2.0 中移除,用 pd.concat 代替
最后使用 to_csv 方法将数据保存为CSV文件:data.to_csv('data.csv', index=False)
整个过程示例代码如下:
import pandas as pd
import numpy as np
data = pd.DataFrame()
for ...
# 生成数据
item = {}
item['公司名称'] = html.xpath('//*[@id="wap_header_top"]/div[1]/div[1]/div[1]/text()')[0]
# 将生成的数据添加到数据框中(DataFrame.append 已在 pandas 2.0 中移除,改用 pd.concat)
data = pd.concat([data, pd.DataFrame([item])], ignore_index=True)
data.to_csv('data.csv', index=False)
请注意,代码中的...
表示你需要根据实际情况添加代码来生成数据,并将其添加到数据框中。另外,你还需要根据实际情况修改生成数据的方式,这里只提供了一个示例。
希望这个解决方案能对你有所帮助!如果你还有其他问题,请随时提出。