How can I rewrite this crawler using functions (def)? I need some ideas; this basic crawling technique is all I know.

import requests
from bs4 import BeautifulSoup


for page_number in range(23000, 23957):
    url = 'http://www.netbian.com/desk/{}.htm'.format(page_number)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36'}

    bian_res = requests.get(url, headers=headers)
    bian_res.encoding = 'gbk'  # the site is GBK-encoded
    soup = BeautifulSoup(bian_res.text, 'html.parser')
    comment_page = soup.find_all('div', class_='pic')
    for comment in comment_page:
        comment_name = comment.find('a').find('img')['alt']
        comment_url = comment.find('a').find('img')['src']
        comment_res = requests.get(comment_url)
        pic = comment_res.content

        # save inside the loop, otherwise only the last image of each page is written
        with open('D:\\图片\\cars\\' + comment_name + '.jpg', 'wb') as f:
            f.write(pic)
            print('({}) image saved'.format(comment_name))
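
One way is to first wrap the whole per-page routine in a single function and just call it from the page loop: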

import requests
from bs4 import BeautifulSoup

def geturl(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36'}
    bian_res = requests.get(url, headers=headers)
    bian_res.encoding = 'gbk'
    soup = BeautifulSoup(bian_res.text, 'html.parser')
    comment_page = soup.find_all('div', class_='pic')
    for comment in comment_page:
        comment_name = comment.find('a').find('img')['alt']
        comment_url = comment.find('a').find('img')['src']
        comment_res = requests.get(comment_url)
        pic = comment_res.content
        # write each image as it is fetched
        with open('D:\\图片\\cars\\' + comment_name + '.jpg', 'wb') as f:
            f.write(pic)
            print('({}) image saved'.format(comment_name))

for page_number in range(23000, 23957):
    url = 'http://www.netbian.com/desk/{}.htm'.format(page_number)
    geturl(url)


The point of using functions is to improve modularity and code reuse: put the logic that collects the image URLs and the logic that downloads and saves the images into two separate functions, then call them from a main function. Also remember to sleep between requests so you don't hit the site too frequently. Modified code below:

import os
import time

import requests
from bs4 import BeautifulSoup


def geturl(url):
    """Fetch one list page and return a {name: image_url} dict."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36'}
    bian_res = requests.get(url, headers=headers)
    bian_res.encoding = 'gbk'
    soup = BeautifulSoup(bian_res.text, 'html.parser')
    comment_page = soup.find_all('div', class_='pic')
    imgs = {}
    for comment in comment_page:
        comment_name = comment.find('a').find('img')['alt']
        comment_url = comment.find('a').find('img')['src']
        imgs[comment_name] = comment_url
    return imgs


def get_imgs(name, url):
    """Download one image and save it under imgs/ab/."""
    comment_res = requests.get(url)
    pic = comment_res.content
    with open('imgs/ab/' + name + '.jpg', 'wb') as f:
        f.write(pic)
        print('({}) image saved'.format(name))


def main():
    os.makedirs('imgs/ab', exist_ok=True)  # make sure the output directory exists
    for page_number in range(23000, 23003):
        url = 'http://www.netbian.com/desk/{}.htm'.format(page_number)
        img_dict = geturl(url)
        time.sleep(1)  # pause between page requests
        for n, h in img_dict.items():
            get_imgs(n, h)
            time.sleep(1)  # pause between image downloads


main()
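
If you want it a bit more robust, each requests.get call could take a timeout and check the HTTP status before saving anything. A minimal sketch, assuming a hypothetical fetch helper (the name and retry count are my own, not part of the answer above); geturl and get_imgs could call it in place of requests.get:

import requests


def fetch(url, headers=None, retries=3):
    """Hypothetical helper: GET with a timeout, retrying a few times.

    raise_for_status() turns 4xx/5xx responses into exceptions, so error
    pages are never silently written to disk; returns None if all
    attempts fail.
    """
    for attempt in range(retries):
        try:
            res = requests.get(url, headers=headers, timeout=10)
            res.raise_for_status()
            return res
        except requests.RequestException as exc:
            print('request failed ({}), attempt {}/{}'.format(exc, attempt + 1, retries))
    return None

The callers then only need a None check before using the response, so all the network error handling stays in one place.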

If this helped, please click the accept button on my answer.