怎么把这些代码运行出来的数据保存到csv里?


import csv
import requests

from lxml import etree
# Debug/preview scraper: walk all 110 exhibition listing pages and print the
# raw xpath extraction for each card, to confirm the selectors work before
# writing anything to CSV.
for i in range(1, 111):
    # timeout guards against a stalled connection blocking forever
    resp = requests.get(
        f"https://www.shifair.com/exhibition/0-0-0-0-0-0-add_time-desc-{i}/",
        timeout=10,
    )
    html = etree.HTML(resp.text)
    divs = html.xpath("/html/body/div/div[5]/div[2]/div")
    for div in divs:
        # each xpath() call returns a (possibly empty) list of text nodes
        zhanhui = div.xpath("./div/h1/a/text()")
        tuijian = div.xpath("./div[2]/div[1]/span/text()")
        buttons = div.xpath("./div[2]/div[3]/button/text()")
        scales = div.xpath("./div[2]/div[4]/div/text()")
        time = div.xpath("./div[2]/div[2]/div[1]/text()")
        day = div.xpath("./div[2]/div[2]/div[2]/span/text()")
        hot = div.xpath("./div[1]/div/span/text()")
        pinglun = div.xpath("./div[2]/div[5]/div[1]/span/text()")
        dingyue = div.xpath("./div[2]/div[5]/div[2]/button/text()")

        print(zhanhui, tuijian, buttons, scales, time, day, hot, pinglun, dingyue)


存储数据

后续追加数据 'w' 改成 'a'

# data 数据    path  存储路径  data.csv
def write_to_csv(data, path, mode='w', encoding='utf-8'):
    """Write rows of data to a CSV file.

    Args:
        data: iterable of rows, each row itself an iterable of cell values.
        path: output file path, e.g. 'data.csv'.
        mode: 'w' to overwrite, 'a' to append to an existing file.
        encoding: text encoding of the output file.  Explicit 'utf-8'
            avoids the platform-default (locale) encoding, which produced
            mojibake in the original; on Windows Excel you may prefer
            'gbk' or (for mode='w' only) 'utf-8-sig'.

    Returns:
        The path the data was written to.
    """
    # newline='' is required so the csv module controls line endings itself
    with open(path, mode, newline='', encoding=encoding) as f:
        csv.writer(f).writerows(data)
    return path


def scrape_exhibitions_to_csv(path='SAVE.CSV'):
    """Scrape all 110 exhibition listing pages and stream the rows to *path*.

    NOTE(review): in the original paste this code sat *after* the ``return``
    inside ``write_to_csv`` and was unreachable dead code; it is extracted
    here into its own callable function.  The original also called
    ``writerows`` on the list of nine field-lists, which wrote each field
    as its own CSV row — fixed to one ``writerow`` per exhibition, with
    each field list joined into a plain string.
    """
    import requests
    from lxml import etree

    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        my_writer = csv.writer(f)
        for i in range(1, 111):
            resp = requests.get(
                f"https://www.shifair.com/exhibition/0-0-0-0-0-0-add_time-desc-{i}/",
                timeout=10,
            )
            html = etree.HTML(resp.text)
            divs = html.xpath("/html/body/div/div[5]/div[2]/div")
            for div in divs:
                row = [
                    ''.join(div.xpath("./div/h1/a/text()")),
                    ''.join(div.xpath("./div[2]/div[1]/span/text()")),
                    ''.join(div.xpath("./div[2]/div[3]/button/text()")),
                    ''.join(div.xpath("./div[2]/div[4]/div/text()")),
                    ''.join(div.xpath("./div[2]/div[2]/div[1]/text()")),
                    ''.join(div.xpath("./div[2]/div[2]/div[2]/span/text()")),
                    ''.join(div.xpath("./div[1]/div/span/text()")),
                    ''.join(div.xpath("./div[2]/div[5]/div[1]/span/text()")),
                    ''.join(div.xpath("./div[2]/div[5]/div[2]/button/text()")),
                ]
                my_writer.writerow(row)

保存成这样是不是就行了?

img

运行了一下,有一些小细节的地方需要注意,比如编码如果不注意,写入就会是乱码,还有列表如果不转为str,写入之后都是带[]的,很不友好。
优化完之后,就是我上面截图的效果。代码如下,可以直接运行即可:(生成的test.csv文件在当前目录,如果有需要自己修改即可)

 
import csv
import requests
 
from lxml import etree
# CSV-writing version of the scraper.  The output file is opened once (the
# original reopened it in append mode on every one of the 110 pages), and
# errors="replace" prevents a UnicodeEncodeError from killing the whole run
# when a page contains characters outside the GBK repertoire.
with open("test.csv", 'a', encoding='GBK', newline="", errors="replace") as f:
    writer = csv.writer(f)
    for i in range(1, 111):
        # timeout guards against a stalled connection blocking forever
        resp = requests.get(
            f"https://www.shifair.com/exhibition/0-0-0-0-0-0-add_time-desc-{i}/",
            timeout=10,
        )
        html = etree.HTML(resp.text)
        divs = html.xpath("/html/body/div/div[5]/div[2]/div")
        for div in divs:
            # join each node-list into a plain string so cells are not
            # written as "['...']"
            zhanhui = ''.join(div.xpath("./div/h1/a/text()"))
            tuijian = ''.join(div.xpath("./div[2]/div[1]/span/text()"))
            buttons = ''.join(div.xpath("./div[2]/div[3]/button/text()"))
            scales = ''.join(div.xpath("./div[2]/div[4]/div/text()"))
            time = ''.join(div.xpath("./div[2]/div[2]/div[1]/text()"))
            day = ''.join(div.xpath("./div[2]/div[2]/div[2]/span/text()"))
            hot = ''.join(div.xpath("./div[1]/div/span/text()"))
            pinglun = ''.join(div.xpath("./div[2]/div[5]/div[1]/span/text()"))
            dingyue = ''.join(div.xpath("./div[2]/div[5]/div[2]/button/text()"))

            writer.writerow([zhanhui, tuijian, buttons, scales, time, day, hot, pinglun, dingyue])


 
 

# -*- coding:utf-8 -*-

import pandas as pd
import requests,time
from lxml import etree


def getHtmlContent(url, timeout=10):
    """Fetch *url* and return its HTML text, or None on a non-200 response.

    Args:
        url: page URL to download.
        timeout: seconds before the request is aborted; without a timeout
            a stalled connection would block forever.

    Returns:
        The response body as text when the server answers 200, else None
        (callers must check for None before parsing).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        'Authority': 'www.shifair.com',
        'Sec-Ch-Ua-Platform': "Windows",
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
    }
    html = requests.get(url=url, headers=headers, timeout=timeout)
    if html.status_code == 200:
        return html.text
    return None  # explicit: non-200 responses yield None


def parseHtml(html):
    """Parse one listing page and return a list of 10-field rows.

    Args:
        html: raw HTML text of a listing page (as returned by
            getHtmlContent); must not be None.

    Returns:
        A list of rows: [name, tags, time, days, buttons, scale,
        visitors, comments, subscribe, hot].  List-valued fields (tags,
        buttons) are stringified with str(), matching the original output.
    """
    def _first(nodes, default=''):
        # xpath() returns a list; the original indexed [0] unconditionally
        # and crashed with IndexError whenever a card was missing a node
        # (e.g. no "hot" badge).
        return nodes[0] if nodes else default

    data = []
    html = etree.HTML(html)

    # Locate cards by class name rather than positional div[5]/div[2]
    # indices, which are brittle and hard to read.
    divs = html.xpath('//div[@class="container"]//div[@class="info_list_block_item"]')
    info = './div[@class="info_list_block_item_info"]'
    for div in divs:
        zhanhui = _first(div.xpath(info + '/h1/a/text()'))
        tuijian = div.xpath(info + '/div[@class="info_list_block_item_info_tab"]/span/text()')
        time = _first(div.xpath(info + '//div[@class="info_list_block_item_info_time"]/text()'))
        day = _first(div.xpath(info + '//div[@class="info_list_block_item_info_com"]/span/text()'))
        buttons = div.xpath('.//div[@class="info_list_block_item_info_btn"]/button/text()')
        guimo = _first(div.xpath(info + '//div[@class="info_list_block_item_info_gm_left"]/text()'))
        guanzhong = _first(div.xpath(info + '//div[@class="info_list_block_item_info_gm_right"]/text()'))
        pinglun = _first(div.xpath(info + '//div[@class="info_list_block_item_info_djs_list_dingyue info_list_block_item_info_com"]/span/text()'))
        dingyue = _first(div.xpath(info + '//div[@class="info_list_block_item_info_djs_dingyue"]/button/text()'))
        hot = _first(div.xpath('.//div[@class="info_list_block_item_logo_hot"]/span/text()'))

        data.append([zhanhui, str(tuijian), time, day, str(buttons), guimo, guanzhong, pinglun, dingyue, hot])
    return data

def main():
    """Crawl listing pages 1..109, parse each, and save all rows to result.csv."""
    result = []
    start_url = 'https://www.shifair.com/exhibition/0-0-0-0-0-0-add_time-desc-{}'
    # NOTE(review): other snippets in this file use range(1, 111); confirm
    # the intended page count before relying on this one.
    for i in range(1, 110):
        html = getHtmlContent(start_url.format(str(i)))
        time.sleep(1)  # pause 1s between requests to reduce the risk of an IP ban
        if html is None:
            # getHtmlContent returns None on a non-200 response; the
            # original passed None straight to parseHtml and crashed.
            continue
        data = parseHtml(html)
        result.extend(data)
    # pandas handles the full result set comfortably and writes CSV directly
    df = pd.DataFrame(result, columns=['展会名称', '推荐', '时间', '距离天数', '按钮标签', '规模', '观众人数', '评论数量', '订阅', '热度'])
    # gbk suits Excel on Windows; switch to utf-8 (or utf-8-sig) on Linux/macOS
    df.to_csv('result.csv', index=False, encoding='gbk')


if __name__ == '__main__':
    main()





  • 帮你找了个相似的问题, 你可以看下: https://ask.csdn.net/questions/7616157
  • 你也可以参考下这篇文章:怎样将浏览器中的数据保存成csv格式文件
  • 除此之外, 这篇博客: 机器学习数据读取总结中的 这是csv文件: 部分也许能够解决你的问题, 你可以仔细阅读以下内容或跳转源博客中阅读:
  • 在这里插入图片描述

  • 您还可以看一下 李宁老师的数据分析“薪”法修炼-面试篇课程中的 读写CSV文件小节, 巩固相关知识点
  • 以下回答由chatgpt基于相关博客总结生成:

    我可以帮你解决这个问题。你可以按照以下步骤来运行代码并将生成的数据保存到CSV文件中:

    1. 首先,导入所需的库(pandas和numpy):
    import pandas as pd
    import numpy as np
    
    1. 然后,你可以使用pandas的DataFrame函数创建一个空的数据框来存储你生成的数据:
    data = pd.DataFrame()
    
    1. 接下来,使用你的代码生成数据,并将每一条数据收集到一个列表中。注意:DataFrame.append 已在 pandas 1.4 中弃用、2.0 中移除,不要再逐行 append 到 DataFrame;先收集到列表再一次性构建 DataFrame,既兼容新版本也更高效:
    rows = []
    for .....
        # 生成数据
        item = {}
        item['公司名称'] = html.xpath('//*[@id="wap_header_top"]/div[1]/div[1]/div[1]/text()')[0]
        # 将生成的数据收集到列表中
        rows.append(item)
    
    1. 最后,用列表构建 DataFrame,并使用to_csv方法将数据保存为CSV文件:
    data = pd.DataFrame(rows)
    data.to_csv('data.csv', index=False)
    
    

    整个过程示例代码如下:

    import pandas as pd
    import numpy as np
    
    rows = []
    
    for ...
        # 生成数据
        item = {}
        item['公司名称'] = html.xpath('//*[@id="wap_header_top"]/div[1]/div[1]/div[1]/text()')[0]
        # 将生成的数据收集到列表中
        rows.append(item)
    
    data = pd.DataFrame(rows)
    data.to_csv('data.csv', index=False)
    
    

    请注意,代码中的...表示你需要根据实际情况添加代码来生成数据,并将其添加到数据框中。另外,你还需要根据实际情况修改生成数据的方式,这里只提供了一个示例。

    希望这个解决方案能对你有所帮助!如果你还有其他问题,请随时提出。