Python爬取网页的表格信息

img


这到底咋写呀,各位帮一帮!
1111111111111111111111111111111111111111111111111111111111111

参考以下代码:

from bs4 import BeautifulSoup as bs
from requests import get
import re
# Download the page, take its first <table>, and write the text of every
# <td> cell to college.csv, one CSV line per table row.
import csv  # kept local to this script section; handles quoting of commas/quotes in cells

Agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
url = 'http://www.gaosan.com/gaokao/311315.html'
# Without a timeout the request can block forever on an unresponsive server.
data = get(url, headers={'User-Agent': Agent}, timeout=10)
# Fail early on 4xx/5xx instead of trying to parse an error page.
data.raise_for_status()
data.encoding = 'utf-8'
soup = bs(data.text, 'html.parser')
table = soup.find('table')
if table is None:
    # soup.find returns None when there is no match; stop with a clear message
    # rather than an AttributeError on the next line.
    raise SystemExit('页面中未找到table标签')
# newline='' hands line-ending control to the csv module; lineterminator='\n'
# keeps one '\n' per row in the written file.
with open('college.csv', 'w', encoding='utf-8', newline='') as fn:
    writer = csv.writer(fn, lineterminator='\n')
    # Follow the table's own <tr> structure instead of assuming four cells
    # per row, so no trailing partial row is dropped.
    for tr in table.find_all('tr'):
        # get_text copes with empty cells and nested tags, where a regex over
        # the cell's HTML would raise IndexError or capture markup.
        t = [td.get_text(strip=True) for td in tr.find_all('td')]
        if t:  # skip rows that contain no <td> cells (e.g. <th>-only rows)
            writer.writerow(t)
print('处理完成!\n')
try:  # if pandas is installed locally, read the file back as a quick check
    import pandas as pd
    df = pd.read_csv('college.csv')
    print(df)
except (ImportError, OSError, ValueError):
    # ImportError: pandas missing; OSError: file unreadable;
    # ValueError: covers pandas' parser/empty-data errors and decode errors.
    print('未安装pandas库,或读文件失败。')

瞅瞅这个
https://zhuanlan.zhihu.com/p/464198490

题主可以把网址发一下,我可以帮你研究研究

不同的网站要分别分析的

不是说直接运行一段已有的代码就可以的

只需要得到table标签下的源码吗?不需要解析数据内容吗?如果是的话就这样:

import requests
from bs4 import BeautifulSoup

# Fetch the page and print the raw HTML of its first <table> element.
url = "http://tjj.hunan.gov.cn/hntj/tjfx/tjgb/rkpc/202105/t20210519_19079329.html"
# The context manager closes the response even if parsing raises, and the
# timeout keeps the request from blocking forever on an unresponsive server.
with requests.get(url, timeout=10) as resp:
    # Fail early on 4xx/5xx instead of parsing an error page.
    resp.raise_for_status()
    resp.encoding = "utf-8"
    page = BeautifulSoup(resp.text, "html.parser")
bg = page.find("table")
print(bg)  # prints None when the page contains no <table>