The full code is below:
# Multi-threaded scraper for 搜图网 (aisoutu.com)
import requests
import threading
from bs4 import BeautifulSoup
import os

class WebSpider:
    def __init__(self):
        self.start_url = 'https://www.aisoutu.com/'
        self.dict = {}

    # Get the response object
    def get_text(self, url):
        self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67'}
        return requests.get(url, headers=self.headers)

    # Parse the start page
    def pages(self):
        self.text = self.get_text(self.start_url).text
        self.soup = BeautifulSoup(self.text, features="html.parser")
        # Find all li tags
        self.all_suburl = self.soup.find_all('li')
        for each in self.all_suburl:
            self.dict[each.a.text] = 'https://www.aisoutu.com%s' % each.a['href']
        return self.dict

    def detail(self, title, url):
        self.new_url = url
        self.title = title
        self.more_text = get_text(self.new_url).text
        self.sub_soup = BeautifulSoup(self.more_text, features="html.parser")
        targets = self.sub_soup.find_all('div', class_="product-img")
        for each in targets:
            self.img_url = each.a.img['src']
            self.save_img(self.title, self.img_url)

    def save_img(self, title, url):
        self.title = title
        self.img_url = url
        os.mkdir(self.title)
        os.chdir(self.title)
        bytes = self.get_text(self.img_url).content
        filename = self.url.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(bytes)
        os.chdir(os.pardir)

    def start_work(self):
        os.mkdir(r'D:\下载\python\学习\自我提升\爬虫\搜图网')
        os.chdir(r'D:\下载\python\学习\自我提升\爬虫\搜图网')
        self.dict = self.pages()
        thread_list = []
        for each in self.dict:
            t = threading.Thread(target=self.detail, args=(each, self.dict.each))
            t.start()
            t.thread_list.append(t)
        for i in thread_list:
            t.join()

if __name__ == '__main__':
    spider = WebSpider()
    spider.start_work()
When it runs, it always throws this error:
Traceback (most recent call last):
  File "D:\下载\python\学习\自我提升\爬虫\爬取搜图网图片.py", line 79, in <module>
    spider.start_work()
  File "D:\下载\python\学习\自我提升\爬虫\爬取搜图网图片.py", line 66, in start_work
    self.dict = self.pages()
  File "D:\下载\python\学习\自我提升\爬虫\爬取搜图网图片.py", line 27, in pages
    self.dict[each.a.text] = 'https://www.aisoutu.com%s' % each.a['href']
  File "D:\下载\python\lib\site-packages\bs4\element.py", line 1406, in __getitem__
    return self.attrs[key]
KeyError: 'href'
But when I inspect the page in the browser's devtools, I can see
<li><a href="/l/8">宣传单</a></li>
Why is that?
Line 25, self.all_suburl = self.soup.find_all('li'): add a constraint to the li lookup, for example a parent container. Right now it also picks up the li elements in the footer, which don't contain the a links you want.
In def pages(self), the function that parses the start page, the filter self.all_suburl = self.soup.find_all('li') is too loose: it pulls out every li on the page, and under some of those li's the a tag has no href attribute at all, for example
<li><a><i class="fab fa-facebook-f"></i></a></li>
When each.a['href'] is executed on one of those, the href attribute can't be found, so it raises the error.
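As a quick illustration (this snippet is not from the original code, it just reproduces the behaviour with bs4), indexing a tag with ['href'] raises KeyError when the attribute is missing, while .get('href') simply returns None:

from bs4 import BeautifulSoup

# A footer-style li like the one above: the <a> tag has no href attribute
html = '<li><a><i class="fab fa-facebook-f"></i></a></li>'
a_tag = BeautifulSoup(html, 'html.parser').li.a
print(a_tag.get('href'))  # None - .get() returns None for a missing attribute
print(a_tag['href'])      # raises KeyError: 'href', exactly the error in the traceback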
Print it out and take a look:
def pages(self):
    self.text = self.get_text(self.start_url).text
    self.soup = BeautifulSoup(self.text, features="html.parser")
    # Find all li tags
    self.all_suburl = self.soup.find_all('li')
    # print(self.all_suburl)
    for each in self.all_suburl:
        print(each)  # print each li here to make it easier to spot the cause of the error
        self.dict[each.a.text] = 'https://www.aisoutu.com%s' % each.a['href']
    return self.dict
How to fix it:
# Change self.all_suburl = self.soup.find_all('li') to the following line:
self.all_suburl = self.soup.find("div", class_="product-sidebar-tag").find_all('li')
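If you would rather not depend on the product-sidebar-tag container class (it comes from the answer above and may change if the site's layout changes), another option is to keep find_all('li') but skip any li whose a tag has no href. A rough sketch of pages() along those lines:

def pages(self):
    self.text = self.get_text(self.start_url).text
    self.soup = BeautifulSoup(self.text, features="html.parser")
    for each in self.soup.find_all('li'):
        # href=True keeps only <a> tags that actually carry an href attribute
        link = each.find('a', href=True)
        if link is None:
            continue  # skip footer/social li entries such as the facebook icon above
        self.dict[link.text] = 'https://www.aisoutu.com%s' % link['href']
    return self.dict

Note that this keeps every li that has a real link anywhere on the page, so the container-based find above is still the more precise filter for the category list.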