Problem with a multithreaded scraper for aisoutu.com

The full code is as follows:

# Multithreaded scraper for 搜图网 (aisoutu.com)
import requests
import threading
from bs4 import BeautifulSoup
import os

class WebSpider:
    def __init__(self):
        self.start_url = 'https://www.aisoutu.com/'
        self.dict = {}
    
    # Fetch a response object
    def get_text(self,url):
        self.headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67'}
        return requests.get(url,headers = self.headers)
    
   

    # Parse the start page
    def pages(self):
        self.text = self.get_text(self.start_url).text
        self.soup = BeautifulSoup(self.text,features="html.parser")
        
        # Find every li tag on the page
        self.all_suburl = self.soup.find_all('li')
        for each in self.all_suburl:
            self.dict[each.a.text] = 'https://www.aisoutu.com%s' % each.a['href']
        
        return self.dict
        
    
    
    def detail(self,title,url):
        self.new_url = url
        self.title = title
        
        self.more_text = self.get_text(self.new_url).text
        self.sub_soup = BeautifulSoup(self.more_text,features="html.parser")
        
        targets = self.sub_soup.find_all('div',class_="product-img")
        for each in targets:
            self.img_url = each.a.img['src']
            self.save_img(self.title,self.img_url)
            
    
    
    def save_img(self,title,url):
        self.title = title
        self.img_url = url
        
        # makedirs with exist_ok avoids a FileExistsError when several images
        # land in the same category folder (os.mkdir raises on the second one)
        os.makedirs(self.title, exist_ok=True)
        
        # build the target path instead of calling os.chdir: the working
        # directory is process-wide, so chdir from several threads races
        img_bytes = self.get_text(self.img_url).content
        filename = self.img_url.split('/')[-1]
        with open(os.path.join(self.title,filename),'wb') as f:
            f.write(img_bytes)
        
        
    def start_work(self):
        os.makedirs(r'D:\下载\python\学习\自我提升\爬虫\搜图网', exist_ok=True)
        os.chdir(r'D:\下载\python\学习\自我提升\爬虫\搜图网')
        
        self.dict = self.pages()
        thread_list = []
        
        for each in self.dict:
            t = threading.Thread(target = self.detail,args = (each,self.dict[each]))
            t.start()
            thread_list.append(t)
            
        for i in thread_list:
            i.join()
            
if __name__ == '__main__':
    spider = WebSpider()
    spider.start_work()
        

The scrape always fails with this error:

Traceback (most recent call last):
  File "D:\下载\python\学习\自我提升\爬虫\爬取搜图网图片.py", line 79, in <module>
    spider.start_work()
  File "D:\下载\python\学习\自我提升\爬虫\爬取搜图网图片.py", line 66, in start_work
    self.dict = self.pages()
  File "D:\下载\python\学习\自我提升\爬虫\爬取搜图网图片.py", line 27, in pages
    self.dict[each.a.text] = 'https://www.aisoutu.com%s' % each.a['href']
  File "D:\下载\python\lib\site-packages\bs4\element.py", line 1406, in __getitem__
    return self.attrs[key]
KeyError: 'href'

But when I open the page in the browser's Inspect tool, I can see:

<li><a href="/l/8">宣传单</a></li>

Why does this happen?

Line 25, self.all_suburl = self.soup.find_all('li'): add a constraint to the li lookup, for example a parent container. As written it also picks up the li tags in the page footer, and those contain no a links.
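
If you would rather keep find_all('li'), a minimal defensive sketch is to ask BeautifulSoup for an a tag with href=True, which matches only tags that actually carry that attribute, and skip the rest:

        for each in self.all_suburl:
            link = each.find('a', href=True)  # None unless an <a href=...> exists
            if link is None:
                continue
            self.dict[link.text] = 'https://www.aisoutu.com%s' % link['href']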

In the pages(self) function that parses the start page, the filter self.all_suburl = self.soup.find_all('li') is set too loosely: it pulls every li off the page, and under some of those li tags the a tag has no href attribute at all, for example this one:

<li><a><i class="fab fa-facebook-f"></i></a></li>

When each.a['href'] is executed on one of those, no href attribute can be found, so it raises the error.
Print them out to see:


    def pages(self):
        self.text = self.get_text(self.start_url).text
        self.soup = BeautifulSoup(self.text,features="html.parser")
        # Find every li tag on the page
        self.all_suburl = self.soup.find_all('li')
        #print(self.all_suburl)
        for each in self.all_suburl:
            print(each)  # print each li here to make the culprit easy to spot
            self.dict[each.a.text] = 'https://www.aisoutu.com%s' % each.a['href']
        return self.dict
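
Instead of printing every li, wrapping the assignment in try/except shows only the offending items. This is just a debugging sketch; the AttributeError clause covers any li that has no a tag at all:

        for each in self.all_suburl:
            try:
                self.dict[each.a.text] = 'https://www.aisoutu.com%s' % each.a['href']
            except (KeyError, AttributeError):
                print('skipping li without a usable link:', each)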

How to fix it:

        #self.all_suburl = self.soup.find_all('li')  -- change it to the line below:
        self.all_suburl = self.soup.find("div", class_="product-sidebar-tag").find_all('li')
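
An equivalent CSS-selector form is sketched below, assuming the sidebar keeps the product-sidebar-tag class. The a[href] part only matches links that actually carry an href, and select simply returns an empty list if the div ever disappears, whereas the find(...) chain above would raise AttributeError in that case:

        for link in self.soup.select('div.product-sidebar-tag li a[href]'):
            self.dict[link.text] = 'https://www.aisoutu.com%s' % link['href']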