把正则表达式末尾的 r'</ul>' 去掉,否则匹配不到内容:页面中 </p> 结束之后并不是紧接着 </ul>。
示例代码如下:
import requests as pc
import re
def get_html(url, timeout=10):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text.
    """
    # Send a desktop-browser User-Agent instead of the default requests one.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
    }
    res = pc.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 rather than relying on the encoding guessed from headers.
    res.encoding = 'utf-8'
    return res.text
def get_movie(html):
    """Extract ranking entries from *html* and print the first one.

    Each entry is a ``(first_line, second_line)`` tuple holding the text of
    the ``first-line`` and ``second-line`` paragraphs inside a
    ``<ul class="row" data-com="hrefTo...">`` element (presumably the movie
    title and its release info -- confirm against the page markup).

    Args:
        html: Page source to search.

    Returns:
        The list of all matched tuples; empty when nothing matches.
    """
    # re.S lets ``.`` span newlines, since one entry covers several lines.
    # The pattern deliberately does not end with ``</ul>``: the closing tag
    # does not directly follow the second ``</p>`` in the page.
    pattern = re.compile(
        r'<ul class="row" data-com="hrefTo.*?>'
        r'.*?first-line">(.*?)</p>'
        r'.*?second-line">(.*?)</p>',
        re.S,
    )
    movies = pattern.findall(html)
    # Guard against an empty result (changed markup, blocked request, ...)
    # instead of failing with IndexError on movies[0].
    if movies:
        print(movies[0])
    return movies
def main():
    """Download the yearly rankings page and print its first entry."""
    page_source = get_html("https://piaofang.maoyan.com/rankings/year")
    get_movie(page_source)


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()