爬取商品详细信息时,如何将未知内容生成表中新的字段?可供挑选的套餐内容又该如何爬取?

我将页面上每个信息单独爬取出来,但是感觉这样写浪费了很多时间,而且部分内容不知道该如何存入数据库,比如套餐信息、不同套餐的价格。不仅如此,写完后感觉代码怎么看怎么 low,如何才能让代码更简洁?

import tarfile
import mysql
import driver as driver
from selenium import webdriver
from bs4 import BeautifulSoup
import pymysql


# 1. Create the browser object (the Chrome session the page is loaded in).
# Raw string: the Windows path contains backslashes that must not be
# interpreted as escape sequences.
driver = webdriver.Chrome(executable_path=r"C:\Program Files\Google\Chrome\Application\chromedriver.exe")
# Load the product detail page whose fields are read through `driver`.
driver.get('https://item.jd.com/100021007440.html')
# driver.maximize_window()
# sleep(5)
def search_content():
    """Scrape the loaded product page and insert one row into ``product_details``.

    Reads the base price and twenty ``<li>`` entries of the product
    parameter list through the module-level ``driver``, turns every value
    into a quoted SQL string literal and passes the finished INSERT
    statement to ``insert_or_update``.

    The row is built positionally: page URL, base price, then the list
    items in the order given by ``item_order`` below.

    An element that cannot be located makes the Selenium lookup raise;
    nothing is caught here, so no partial row is written.
    """
    page_url = 'https://item.jd.com/100021007440.html'
    price_xpath = '/html/body/div[6]/div/div[2]/div[4]/div/div[1]/div[2]/span[1]/span[2]'

    # The parameter list is addressed through two different XPath prefixes;
    # every item keeps the prefix it has always been looked up with.
    absolute_list = '/html/body/div[10]/div[2]/div[1]/div[2]/div[1]/div[1]/ul[3]'
    detail_list = '//*[@id="detail"]/div[2]/div[1]/div[1]/ul[3]'
    via_detail_id = {3, 4, 6}

    # <li> positions in column order. li[11] (TopPixel) is deliberately
    # stored before li[10] (BackPixel) to keep the existing column layout.
    item_order = list(range(1, 10)) + [11, 10] + list(range(12, 21))

    values = [page_url, driver.find_element_by_xpath(price_xpath).text]
    for position in item_order:
        prefix = detail_list if position in via_detail_id else absolute_list
        xpath = '{}/li[{}]'.format(prefix, position)
        values.append(driver.find_element_by_xpath(xpath).text)

    # Scraped text is untrusted: escape quotes/backslashes before wrapping
    # each value in single quotes, otherwise one apostrophe on the page
    # breaks (or injects into) the statement.
    # NOTE(review): escape_string assumes the server does not run with
    # NO_BACKSLASH_ESCAPES -- confirm, or switch to a parameterized execute.
    escape = pymysql.converters.escape_string
    quoted = ["'" + escape(value) + "'" for value in values]
    sql = "insert into product_details()value(" + ','.join(quoted) + ")"

    # insert_or_update opens and closes its own connection, so no separate
    # connect() call is made here (it would only leak an unused connection).
    insert_or_update(sql)  # insert the row
def connect():
    """Open and return a new pymysql connection to the ``jd_db`` database."""
    settings = {
        'host': '127.0.0.1',   # server host
        'user': 'root',        # login user
        'password': 'root',    # login password
        'database': 'jd_db',   # database name
        'port': 3306,          # server port
        'charset': 'utf8',     # connection character set
    }
    return pymysql.connect(**settings)
def insert_or_update(sql, args=None):
    """Execute one write statement on a fresh connection and commit it.

    Args:
        sql: The statement to run. It may contain ``%s`` placeholders when
            ``args`` is supplied.
        args: Optional parameters forwarded to ``cursor.execute`` so the
            driver does the quoting. The default ``None`` runs ``sql``
            exactly as given, which is what existing callers rely on.

    Any exception raised while executing or committing propagates to the
    caller; the cursor and the connection are closed in either case.
    """
    con = connect()  # open a connection
    try:
        # The cursor context manager closes the cursor on exit.
        with con.cursor() as cursor:
            cursor.execute(sql, args)  # run the statement
        con.commit()  # make the change permanent
    finally:
        # Close even when execute/commit fails, so the connection never leaks.
        con.close()

    # for introduction in text1_list:
    #     print(introduction.text)
    #     intro = introduction.text
if __name__ == '__main__':
    try:
        search_content()
    finally:
        # End the browser session whether or not scraping succeeded;
        # otherwise the Chrome/chromedriver processes are left running.
        driver.quit()

 

首先,对于所有的未知内容,你可以把它们整合成一个json字符串:'{"name1":"value1", "name2":"value2"}',表中用一个字段存储

其次,关于套餐内容,如果你只是简单的存储的话,可以把所有的套餐数据存放到数组arr中,通过 ','.join(arr) 获取一个字符串,表中用一个字段存储该字符串;如果后续还有其他的例如展示产品工作,数据库存储方式更详细会比较好。

然后,关于代码简洁:你可以在最前面定义 url = 'https://item.jd.com/100021007440.html',这样原来的 driver.get('https://item.jd.com/100021007440.html') 就可以写成 driver.get(url),search_content() 里也不必再重复写一遍这个地址;

关于解析数据,可以先定位到一个包含所有信息的元素(这里是一个 ul): info_div = driver.find_element_by_xpath('/html/body/div[10]/div[2]/div[1]/div[2]/div[1]/div[1]/ul[3]')(注意 find_element_by_xpath 返回的是单个元素,不需要再加 [0];只有 find_elements_by_xpath 才返回列表)

在这个div中查找需要的信息:sku_number = info_div.find_element_by_xpath('./li[2]').text

最后,数据库的插入应该是可以稍微封装一下的,让代码简洁一些

个人的数据库封装示例:

# Mysql: a self-written wrapper library
from myLib.mysql import Mysql

# Debug mode: output the SQL
mysql = Mysql(debug=True)

row_values = ['value1', 'value2', 'value3']
statement = mysql.insert('table_name', row_values)
print(statement)

# Printed result:
# INSERT INTO `table_name` VALUES ('value1','value2','value3');

 

您好,我是有问必答小助手,您的问题已经有小伙伴解答了,您看下是否解决,可以追评进行沟通哦~

如果有您比较满意的答案 / 帮您提供解决思路的答案,可以点击【采纳】按钮,给回答的小伙伴一些鼓励哦~~

ps:问答VIP仅需29元,即可享受5次/月 有问必答服务,了解详情>>>https://vip.csdn.net/askvip?utm_source=1146287632