html = '''<dl class="bigtr cl">
<dt class="li01 ta_c"><b class="nob1">3</b></dt>
<dt class="li02"><samp class="holdPIC"></samp></dt>
<dt class="li03 oh"><a href="https://www.1905.com/vod/play/516604.shtml" target="_blank" title="杨贵妃" class=" pl28">杨贵妃</a></dt>
<dt class="li04 oh"><span><a href="https://www.1905.com/mdb/star/2996732/" target="_blank" title="周洁">周洁</a>/<a href="https://www.1905.com/mdb/star/1973493/" target="_blank" title="刘文治">刘文治</a>/<a href="https://www.1905.com/mdb/star/1403/" target="_blank" title="濮存昕">濮存昕</a>/<a href="https://www.1905.com/mdb/star/2998726/" target="_blank" title="程文宽">程文宽</a></span></dt>
<dt class="li05 ta_c"><span>39,189</span></dt>
</dl>
<dl class="cl">
<dt class="li01 ta_c"><b class="ptnob">5</b></dt>
<dt class="li02"><samp class="holdPIC"></samp></dt>
<dt class="li03 oh"><a href="https://www.1905.com/vod/play/85426.shtml" target="_blank" title="神话" class=" pl28">神话</a></dt>
<dt class="li04 oh"><span><a href="https://www.1905.com/mdb/star/242/" target="_blank" title="成龙">成龙</a>/<a href="https://www.1905.com/mdb/star/596/" target="_blank" title="金喜善">金喜善</a>/<a href="https://www.1905.com/mdb/star/1297/" target="_blank" title="梁家辉">梁家辉</a>/<a href="https://www.1905.com/mdb/star/1935/" target="_blank" title="于荣光">于荣光</a></span></dt>
<dt class="li05 ta_c"><span>34,348</span></dt>
</dl>
<dl class="cl">
<dt class="li01 ta_c"><b class="ptnob">6</b></dt>
<dt class="li02"><samp class="holdPIC"></samp></dt>
<dt class="li03 oh"><a href="https://www.1905.com/vod/play/85340.shtml" target="_blank" title="我和姐姐" class=" pl28">我和姐姐</a></dt>
<dt class="li04 oh"><span><a href="https://www.1905.com/mdb/star/3065837/" target="_blank" title="张梦露">张梦露</a>/<a href="https://www.1905.com/mdb/star/3406/" target="_blank" title="刘洋">刘洋</a>/<a href="https://www.1905.com/mdb/star/3065838/" target="_blank" title="易含">易含</a></span></dt>
<dt class="li05 ta_c"><span>30,709</span></dt>
</dl>
'''
如题, 如果都四位演员,我能正常抓, 如果少几个演员,或者多几个演员, 如何用正则抓取, 我写了一个长长的正则, 只能固定抓四位的
import requests, re
def one_page(url):
response = requests.get(url)
if response.status_code == 200:
# response.encoding = 'utf8'
return response.text
return None
def parse_one_page(html):
partter = re.compile(
'<dl.*?"li01 ta_c".*?".*?">(.*?)</b>.*?"li03.*?href="(.*?)".*?pl28">(.*?)</a>.*?"li04.*?<a.*?>(.*?)</a>.*?<a.*?>(.*?)</a>.*?<a.*?>(.*?)</a>.*?<a.*?>(.*?)</a>.*?"li05.*?<span>(.*?)</span></dt>.*?</dl>',
re.S)
items = re.findall(partter, html)
print(items)
def main():
url = 'https://www.1905.com/vod/rank/tao1.shtml'
html = one_page(url)
parse_one_page(html)
if __name__ == '__main__':
main()
结果 很多都漏抓了, 小白刚学习, 请高人指教
import requests, re
def one_page(url):
response = requests.get(url)
if response.status_code == 200:
# response.encoding = 'utf8'
return response.text
return None
def parse_one_page(html):
partter = re.compile(
'<dl.*?"li01 ta_c".*?".*?">(.*?)</b>.*?"li03.*?href="(.*?)".*?pl28">(.*?)</a>.*?"li04.*?<span>(.*?)</span>.*?"li05.*?<span>(.*?)</span></dt>.*?</dl>',
re.S)
items = re.findall(partter, html)
for i, item in enumerate(items):
r = list(item)
r[3] = re.findall(r'>(.*?)</a>', r[3])
items[i] = r
print(items)
def main():
url = 'https://www.1905.com/vod/rank/tao1.shtml'
html = one_page(url)
parse_one_page(html)
if __name__ == '__main__':
main()
唉,看了你给的表达式,发现我真的笨的可以, 我掉进了<a></a>标签黑洞, 用span就可以拿到结果 了. 当时我还用了判断 有或者没有 的 (<a></a>)?, 还是没获取到想要的信息
您好,我是问答小助手,你的问题已经有小伙伴为您解答了问题,您看下是否解决了您的问题,可以追评进行沟通哦~
如果有您比较满意的答案 / 帮您提供解决思路的答案,可以点击【采纳】按钮,给回答的小伙伴一些鼓励哦~~
ps:问答VIP仅需29元,即可享受5次/月 有问必答服务,了解详情>>> https://vip.csdn.net/askvip?utm_source=1146287632