import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent header sent along with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"}

# Download the Douban movie chart page and parse it with the built-in parser.
page = requests.get('https://movie.douban.com/chart', headers=headers)
soup = BeautifulSoup(page.text, "html.parser")

# Visit every <div class="pl2"> on the page and look at its first <a>.
for entry in soup.findAll("div", attrs={"class": "pl2"}):
    # Iterating a tag walks its direct children, so every child of the link
    # (plain text nodes as well as nested tags) is printed on its own line.
    for child in entry.find('a'):
        text = child.string
        # Remove every space and newline before printing; a child that
        # consists only of whitespace therefore prints as an empty line.
        print(text.replace(' ', '').replace('\n', ''))
疾速追杀4/
杀神JohnWick4(港)/捍卫任务4(台)
杀死福顺/
格杀福顺/KillBoksoon
俄罗斯方块/
俄罗斯方块:版权之争
鲸/
庞然大物/我的鲸鱼老爸(台)
网络谜踪2/
人肉搜索2:失踪搜救(台)/人肉搜寻2(港)
断网/
断网24小时/CyberHeist
金爆行动/
玩命特攻:武演行动(台)/伙星行动:扭计特攻(港)
三线轮洄/
三线轮回
下一个素熙/
NextSohee/陰影下的她(台)
阿凡达:水之道/
阿凡达2/阿凡达2:水之道
代码下面的部分是我爬出来的电影排行榜结果。但是我只需要每部电影的第一个名字,斜杠后面的外文名字不需要,中间的空行也不需要。有没有好心的老哥教教我,该怎么处理爬出来的数据?
试试这个代码:
import requests
from bs4 import BeautifulSoup
import re
# Browser-like User-Agent header sent along with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"}

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors instead of silently parsing
# an error page and printing nothing.
response = requests.get('https://movie.douban.com/chart', headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Print one name per <div class="pl2">: the text in front of the first slash.
for title in soup.find_all("div", attrs={"class": "pl2"}):
    link = title.find('a')
    if link is None:
        # No link inside this block, so there is nothing to print.
        continue
    for child in link.children:
        # Only the link's own text nodes are considered. Nested tags are
        # skipped so that their text (the alternate names, judging by the
        # sample output of the original script) is never printed.
        if not isinstance(child, str):
            continue
        # Keep what precedes the first slash and trim the surrounding
        # whitespace; spaces inside the name itself are preserved.
        first_name = child.partition('/')[0].strip()
        if first_name:  # skip whitespace-only text nodes
            print(first_name)
            break  # at most one name per chart entry
你的代码有问题:首先,把 find 修改为 findAll;其次,使用 contents 获取子节点,这样可以只解析 a 标签本身的文本,而不包括其子节点的文本。代码如下:
import requests
from bs4 import BeautifulSoup
import re
# Browser-like User-Agent header sent along with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"}

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors instead of silently parsing
# an error page and printing nothing.
response = requests.get('https://movie.douban.com/chart', headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# For every link inside a <div class="pl2">, print the link's leading text.
for title in soup.find_all("div", attrs={"class": "pl2"}):
    for link in title.find_all('a'):
        if not link.contents:
            # An empty <a></a> has no first child to read.
            continue
        first_child = link.contents[0]
        if not isinstance(first_child, str):
            # The first child is a nested tag rather than text, so the link
            # has no leading text of its own.
            continue
        # Drop the slash separator(s), then trim the surrounding whitespace.
        name = first_child.replace('/', '').strip()
        if name:  # skip empty strings
            print(name)
运行结果: