下面是代码,刚开始的时候能爬一点图片,过几秒就报错,不知道是怎么回事,求大佬指点!
import requests
import re
import os  # local to this snippet: its import header only brings in requests and re

# Send a browser-like User-Agent. The header name has to be 'User-Agent';
# stored under the key 'headers' the string goes out as an unrelated custom
# header and the request still carries the default python-requests agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'
}

# Seconds to wait on a request before giving up; requests has no default
# timeout, so a stalled connection would otherwise block forever.
request_timeout = 10

# Compiled once instead of being re-parsed for every page.
page_link_pattern = re.compile(r'<div class=".*?"> <span class=".*?"> <a href="(.*?)" class=".*?">')
image_src_pattern = re.compile(r'<img src="(.*?)" width=".*?" referrerpolicy=".*?">')


def _fetch(url):
    """Return the response for *url*, or None if the request failed.

    Connection errors, timeouts, malformed URLs and HTTP error statuses are
    reported and swallowed so that one bad link does not abort the crawl.
    """
    try:
        result = requests.get(url, headers=headers, timeout=request_timeout)
        result.raise_for_status()
    except requests.RequestException as exc:
        print("请求失败,已跳过:", url, exc)
        return None
    return result


# open() does not create missing directories, so make sure the target exists.
os.makedirs('IMG', exist_ok=True)

for i in range(0, 200):
    base_url = "https://www.buxiuse.com/?page={}".format(i)
    response = _fetch(base_url)
    if response is None:
        continue
    page_urls = page_link_pattern.findall(response.text)
    for urls in page_urls:
        response = _fetch(urls)
        if response is None:
            continue
        jpg_urls = image_src_pattern.findall(response.text)
        for jpg in jpg_urls:
            jpg_name = jpg.split('/')[-1]
            if not jpg_name:
                # URL ends with '/': there is no file name to save under.
                continue
            response = _fetch(jpg)
            if response is None:
                continue
            # os.path.join keeps the path valid on non-Windows systems too.
            with open(os.path.join('IMG', jpg_name), 'wb') as f:
                f.write(response.content)
            print("下载完成=======>", jpg_name)
可以先试试下面的代码;不过这类 I/O 密集型的操作,建议还是使用多线程。
# -*- coding:utf-8 -*-
import requests
import re
import os
import time
from urllib.request import urlretrieve
# Send a browser-like User-Agent. The header name has to be 'User-Agent';
# stored under the key 'headers' the string goes out as an unrelated custom
# header and the request still carries the default python-requests agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'
}

# Loop-invariant setup: compile the pattern and create the output directory
# once rather than on every page.
regx = re.compile(r'<img class="height_min" title=".*?" alt=".*?".*?src="(.*?)"')
os.makedirs('image', exist_ok=True)

for i in range(0, 200):
    base_url = "https://www.buxiuse.com/?page={}".format(i)
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the whole crawl.
        response = requests.get(base_url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("请求失败,已跳过:", base_url, exc)
        continue
    image_urls = regx.findall(response.text)
    for item in image_urls:
        name = item.rsplit('/', 1)[-1]
        # Short pause between downloads to avoid hammering the server.
        time.sleep(0.5)
        try:
            urlretrieve(item, "image/%s" % name)
        except (OSError, ValueError) as exc:
            # OSError covers network/HTTP failures raised by urllib;
            # ValueError is raised for URLs without a usable scheme.
            print("下载失败,已跳过:", item, exc)
多线程代码(利用生产者和消费者模式实现):
# -*- coding:utf-8 -*-
import requests
import re
import os
import time
from urllib.request import urlretrieve
from queue import Queue
import threading
# Serialises the check-and-append on the shared ``urls`` list so that every
# listing page is claimed by exactly one producer thread.
_PAGE_CLAIM_LOCK = threading.Lock()


def Productor(q, urls):
    """Crawl the listing pages and put every image URL found onto *q*.

    Pages 0..199 are visited. A page whose URL is already present in *urls*
    is skipped, so several producer threads sharing the same list divide the
    pages between them instead of each crawling all of them.

    Args:
        q: queue that receives the extracted image URLs.
        urls: shared list of listing-page URLs that have been claimed; each
            page URL is appended before it is fetched.
    """
    # The header name has to be 'User-Agent'; under the key 'headers' the
    # string is sent as an unrelated custom header.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'
    }
    # Compiled once instead of on every page.
    regx = re.compile(r'<img class="height_min" title=".*?" alt=".*?".*?src="(.*?)"')

    for i in range(0, 200):
        base_url = "https://www.buxiuse.com/?page={}".format(i)
        with _PAGE_CLAIM_LOCK:
            if base_url in urls:
                # Another producer already owns this page.
                continue
            urls.append(base_url)
        try:
            # requests has no default timeout; without one a stalled
            # connection would block this producer forever.
            response = requests.get(base_url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            print("请求失败,已跳过:", base_url, exc)
            continue
        for item in regx.findall(response.text):
            q.put(item)
def Consumer(q, urls, idle_timeout=10):
    """Download image URLs taken from *q* into the ``image`` directory.

    The worker stops when it receives ``None`` from the queue, or when the
    queue has stayed empty for *idle_timeout* seconds while *urls* holds at
    least 200 entries. Other falsy items are ignored. A failed download is
    reported and skipped rather than ending the worker.

    NOTE(review): the idle check is a heuristic; it assumes *urls* reaching
    200 entries means the listing pages have been handed out. Confirm against
    the producer in use.

    Args:
        q: queue yielding image URLs.
        urls: shared list of listing-page URLs recorded by the producers.
        idle_timeout: seconds to wait on an empty queue before checking
            whether the crawl is finished.
    """
    from queue import Empty  # the file header imports only Queue from this module

    # exist_ok avoids the race between several workers creating the directory.
    os.makedirs('image', exist_ok=True)

    while True:
        try:
            # A timeout keeps the worker from blocking forever once the
            # queue has drained.
            image_url = q.get(timeout=idle_timeout)
        except Empty:
            if len(urls) >= 200:
                # Returning ends the thread; exit() inside a thread only
                # raises SystemExit there and its message is never shown.
                print('爬取完成')
                return
            continue
        if image_url is None:
            print('爬取完成')
            return
        if not image_url:
            continue
        name = image_url.rsplit('/', 1)[-1]
        # Short pause between downloads to avoid hammering the server.
        time.sleep(0.2)
        try:
            urlretrieve(image_url, "image/%s" % name)
        except (OSError, ValueError) as exc:
            # OSError covers network/HTTP failures raised by urllib;
            # ValueError is raised for URLs without a usable scheme.
            print("下载失败,已跳过:", image_url, exc)
if __name__ == '__main__':
    # Bounded queue: producers block in put() once 1000 URLs are waiting.
    q = Queue(maxsize=1000)
    urls = []

    # Keep the thread handles so the script can wait for the workers instead
    # of discarding them right after start().
    producers = []
    for i in range(4):
        p = threading.Thread(target=Productor, args=(q, urls))
        p.start()
        producers.append(p)

    consumers = []
    for x in range(5):
        c = threading.Thread(target=Consumer, args=(q, urls))
        c.start()
        consumers.append(c)

    # Every image URL has been queued once all producers have returned.
    for p in producers:
        p.join()

    # One None per consumer as an end-of-work marker. None is falsy, so it is
    # never mistaken for an image URL.
    # NOTE(review): a consumer has to treat None as a stop signal for the
    # join below to return - confirm against Consumer.
    for c in consumers:
        q.put(None)

    for c in consumers:
        c.join()