from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
import re
import pandas as pd
import lxml.html
import requests
from PIL import Image
from io import BytesIO
from selenium.webdriver.common.by import By
# Open douyu.com in Chrome, follow the first module-title link and capture
# the HTML of the page that opens.  Result: `html` holds the page source.

# Chrome options: drop the "enable-automation" switch and turn off the
# AutomationControlled blink feature, and run without extensions.
opt = Options()
opt.add_experimental_option('excludeSwitches', ['enable-automation'])
opt.add_argument('--disable-blink-features')
opt.add_argument('--disable-extensions')
opt.add_argument('--disable-blink-features=AutomationControlled')

driver = webdriver.Chrome(options=opt)  # create the driver object
try:
    driver.get('https://www.douyu.com/')  # open the home page
    time.sleep(5)  # fixed pause before querying the DOM
    a = driver.find_element(By.XPATH, '//h2[@class="layout-Module-title"]/a')
    a.click()
    time.sleep(2)  # fixed pause after the click
    # Switch to the last window handle; presumably the link opens a new tab.
    driver.switch_to.window(driver.window_handles[-1])
    html = driver.page_source
finally:
    # quit() ends the whole browser session.  close() would only close the
    # current window and leave the other window and the driver process
    # running, and without try/finally nothing is released on an error.
    driver.quit()
# Parse the captured page and build one record per stream card.
# Results: `df` (table of all records), plus the parallel lists `title`
# and `volume` holding the title and numeric play count of every card whose
# play-count text contains a number.
xp = lxml.html.fromstring(html)  # lxml element tree of the page
units = xp.xpath('//li[@class="layout-Cover-item"]')  # card elements (no text() here)

# Matches an integer or a decimal number ("8765" as well as "12.3").  The
# earlier pattern required a decimal point, so an integer play count gave an
# empty match list and the following [0] lookup raised IndexError.
# NOTE(review): the number is used as-is; if the site appends a unit suffix
# to some values, the collected numbers are not on one scale - confirm.
volume_re = re.compile(r'\d+(?:[.]\d+)?')

rows = []    # scraped records; converted to a DataFrame once, after the loop
volume = []
title = []
for u in units:
    intro = u.xpath('./div/a/div[2]/div/h3[@class="DyListCover-intro"]/text()')
    user = u.xpath('./div/a/div[2]/div[@class="DyListCover-info"]/h2[@class="DyListCover-user"]')
    hot = u.xpath('./div/a/div[2]/div[@class="DyListCover-info"]/span[@class="DyListCover-hot"]/text()')

    # Check for missing nodes BEFORE indexing; indexing an empty result
    # list would raise IndexError and abort the whole run.
    if not (intro and user and hot):
        print('无')
        continue

    t = {
        '标题': intro[0],
        '主播': user[0].xpath('string()'),
        '播放量': hot[0],
    }
    rows.append(t)

    found = volume_re.search(t['播放量'])
    if found is None:
        # No number in the play-count text: keep the row in the table but
        # leave it out of the chart data.
        print('无')
    else:
        b = found.group()
        print(b)
        volume.append(float(b))
        title.append(t['标题'])

    # Stop once more than 80 records have been collected.
    if len(rows) > 80:
        break

# Build the table in one step (row-by-row concat is quadratic); the explicit
# column list keeps the frame well-formed even when no card was found.
df = pd.DataFrame(rows, columns=['标题', '主播', '播放量'])
print(df)
print(df.dtypes)
# Draw a bar chart of the collected play counts, one bar per title.
import pandas as pd
df = pd.DataFrame({'播放量': volume}, index=title)

import matplotlib.pyplot as plt
# Set the default sans-serif font to SimHei so that non-ASCII labels can be
# rendered (requires the font to be installed).
plt.rcParams['font.sans-serif'] = ['SimHei']

play_counts = pd.to_numeric(df['播放量'])
play_counts.plot(kind='bar')
plt.show()
报错的原因是正则表达式没有匹配到任何内容。
你的正则表达式要求数字中必须带有小数点；如果播放量是没有小数点的整数，re.findall 就会返回空列表，再对空列表取 [0] 自然就会抛出 IndexError。出问题的是下面这一行：
b= re.findall(r'\d+[.]\d+',t['播放量'])[0]