Scraping Lianjia second-hand housing data with Python

I'm using the code below to scrape Lianjia second-hand housing data. Why does it only pick up the links, without the actual price information, and also throw errors?

[screenshot of the error output]

###### Below is the code I used

# -*- coding: UTF-8 -*-

# Import the required libraries
import re
import time
import requests
from random import uniform
from bs4 import BeautifulSoup
import pandas as pd
from sqlalchemy import create_engine
import pymysql


# Set a request header, otherwise the requests will not pass Lianjia's check
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
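
# --- Editor's sketch, not part of the original script -----------------------------------
# The comment above notes that the User-Agent is needed to pass Lianjia's check. When that
# check still fails, the server returns a verification page instead of listing HTML, and
# every CSS selector below then comes back empty. This purely illustrative helper fetches
# one URL and shows what actually came back, so that case is easy to spot:
def debug_response(url):
    resp = requests.get(url, headers=headers)
    print(resp.status_code)                         # 200 does not guarantee listing HTML
    print(resp.text[:300])                          # listing pages contain content__list nodes
    return resp
# -----------------------------------------------------------------------------------------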


def get_parent_url(city):
    """
    Get all parent (listing-page) URLs for the chosen city
    :param city:
    :return:
    """
    url = 'http://{city}.lianjia.com/zufang'.format(city=city)
    html = requests.get(url, headers=headers)                                           # fetch the page HTML
    Soup = BeautifulSoup(html.text, 'lxml')                                             # parse the HTML
    Selector = Soup.select('ul[data-target="area"] > li.filter__item--level2')          # pick out the district filter items
    Selector = Selector[1:]                                                             # drop the first item, "不限" (no filter)
    url_parent_all = []                                                                 # final list of parent URLs

    for i in Selector:                                                                  # loop over every district
        url_region = "https://{city}.lianjia.com".format(city=city) + i.select('a')[0]['href']   # build the district URL
        html_region = requests.get(url_region, headers=headers)                         # fetch the district page
        Soup_region = BeautifulSoup(html_region.text, 'lxml')                           # parse it
        number_data = int(Soup_region.select('span.content__title--hl')[0].text)        # number of listings in this district
        if number_data <= 3000:                                                         # 3000 listings or fewer: crawl the pages directly
            index = Soup_region.select('div.content__pg')[0]                            # the pager element
            index = str(index)                                                          # convert the bs4 tag to str so the regex can run on it
            re_set = re.compile(r'data-totalpage="(.*?)"')
            index = re.findall(re_set, index)[0]                                        # extract the total page count
            for j in range(1, int(index)+1):                                            # loop over every page
                url_parent = url_region + "pg{}".format(j)
                url_parent_all.append(url_parent)                                       # add each page URL of this district to the parent list
                print(url_parent)
            t = uniform(0, 1)
            time.sleep(t)                                                               # pause briefly after each district to avoid getting banned
        else:                                                                           # more than 3000 listings: split by rent tier (rp1-rp7)
            for k in range(1, 8):                                                       # loop over the seven rent tiers
                url_region_rp = url_region + "rp{}/".format(k)
                html_region_rp = requests.get(url_region_rp, headers=headers)
                Soup_region_rp = BeautifulSoup(html_region_rp.text, 'lxml')
                number_data = int(Soup_region_rp.select('span.content__title--hl')[0].text)
                if number_data > 0:                                                     # the pager only exists when the tier has listings
                    index = Soup_region_rp.select('div.content__pg')[0]                 # same steps as above
                    index = str(index)
                    re_set = re.compile(r'data-totalpage="(.*?)"')
                    index = re.findall(re_set, index)[0]
                    for j in range(1, int(index) + 1):
                        url_parent = url_region + "rp{}/".format(k) + "pg{}".format(j)
                        url_parent_all.append(url_parent)
                        print(url_parent)
            t = uniform(0, 1)
            time.sleep(t)
    return url_parent_all
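
# --- Editor's sketch, not part of the original script -----------------------------------
# get_parent_url() extracts data-totalpage by running a regex over str(tag). BeautifulSoup
# can read the same value straight from the tag's attribute, which is a little clearer and
# makes the "no pager on this page" case explicit. A minimal alternative, assuming the same
# div.content__pg element:
def get_total_pages(soup):
    pager = soup.select_one('div.content__pg')
    if pager is None or not pager.has_attr('data-totalpage'):
        return 0                                    # no pager, e.g. an empty or verification page
    return int(pager['data-totalpage'])
# -----------------------------------------------------------------------------------------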


def get_detail_url(url_parent_all, city):
    """
    Walk every parent URL and build the complete list of detail (child) URLs
    :param url_parent_all:
    :param city:
    :return:
    """
    url_detail_all = []                                                                 # final list of detail URLs

    for url in url_parent_all:                                                          # loop over every parent URL
        html = requests.get(url, headers=headers)
        Soup = BeautifulSoup(html.text, 'lxml')
        Selector = Soup.select('div a.content__list--item--aside')                      # parse the page and pick out the detail-link tags
        for i in Selector:
            i = i['href']
            i = 'http://{city}.lianjia.com'.format(city=city) + i                       # build the full detail URL from each href
            url_detail_all.append(i)                                                    # add it to the detail-URL list
            print(i)
        t = uniform(0, 0.01)
        time.sleep(t)                                                                   # pause t seconds after each parent URL
    return url_detail_all
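
# --- Editor's sketch, not part of the original script -----------------------------------
# If the listing order shifts while the crawl runs (new posts push items onto later pages),
# the same flat can be collected from two different pages and url_detail_all may contain
# duplicates. If that matters, order-preserving de-duplication is a one-liner (dict keeps
# insertion order in Python 3.7+), e.g. url_detail_all = dedupe_urls(url_detail_all) before
# calling get_data():
def dedupe_urls(urls):
    return list(dict.fromkeys(urls))
# -----------------------------------------------------------------------------------------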


def get_data(url_detail_all):
    """
    Fetch the data behind every URL in the detail-URL list
    :param url_detail_all:
    :return:
    """
    data = []                                                                           # list that will hold the scraped records
    num_error = 0                                                                       # error counter

    for i in url_detail_all:                                                            # loop over and scrape every detail URL
        try:                                                                            # try...except keeps one bad page from aborting the whole loop
            info = {}
            url = i
            html = requests.get(url, headers=headers)
            Soup = BeautifulSoup(html.text, 'lxml')
            info['房源编号'] = Soup.select('i.house_code')[0].text
            info['链接'] = i
            info['标题'] = Soup.select('p.content__title')[0].text
            info['价格'] = Soup.select('p.content__aside--title')[0].text
            Selector1 = Soup.select('p.content__article__table')[0].text.split('\n')
            info['租赁方式'] = Selector1[1]
            info['户型'] = Selector1[2]
            info['面积'] = Selector1[3]
            info['朝向'] = Selector1[4]
            info['发布时间'] = Soup.select('li[class^="fl oneline"]')[1].text[3:]
            info['入住'] = Soup.select('li[class^="fl oneline"]')[2].text[3:]
            info['租期'] = Soup.select('li[class^="fl oneline"]')[4].text[3:]
            info['看房'] = Soup.select('li[class^="fl oneline"]')[5].text[3:]
            info['楼层'] = Soup.select('li[class^="fl oneline"]')[7].text[3:]
            info['电梯'] = Soup.select('li[class^="fl oneline"]')[8].text[3:]
            info['车位'] = Soup.select('li[class^="fl oneline"]')[10].text[3:]
            info['用水'] = Soup.select('li[class^="fl oneline"]')[11].text[3:]
            info['用电'] = Soup.select('li[class^="fl oneline"]')[13].text[3:]
            info['燃气'] = Soup.select('li[class^="fl oneline"]')[14].text[3:]
            info['采暖'] = Soup.select('li[class^="fl oneline"]')[16].text[3:]
            info['房源标签'] = Soup.select('p.content__aside--tags')[0].text[1:-1].replace('\n', ', ')
            Selector2 = Soup.select('li[class^="fl oneline"]')                          # the facility tags
            info['配套设施'] = []
            for item in Selector2:                                                      # process each facility tag
                if len(item['class']) > 2:
                    if item['class'][2][-2:] != 'no':                                   # keep only the facilities the flat actually has
                        info['配套设施'].append(item['class'][2])
            info['配套设施'] = ",".join(info['配套设施'])                                # join the list into a str
            if info['配套设施'] == '':                                                   # no facilities at all
                info['配套设施'] = None
            if not Soup.select('.threeline'):                                           # no description on the page
                info['房源描述'] = None
            else:
                info['房源描述'] = Soup.select('.threeline')[0].text
            info['城区'] = Soup.select('p.bottom__list--item__des > span:nth-of-type(1) > a:nth-of-type(1)')[0].text
            info['小区地址'] = Soup.select('p.bottom__list--item__des > span:nth-of-type(1)')[0].text.replace(' ', '').replace('\n', '')
            data.append(info)                                                           # append the record (a dict) to the data list
            print(info)
            t = uniform(0, 0.01)
            time.sleep(t)                                                               # pause t seconds after each record
        except Exception as e:                                                          # log which URL failed and why, then move on
            num_error += 1
            print("oops, an error occurred on %s: %s" % (url, e))
    print("Number of records that errored out: %d" % num_error)
    df = pd.DataFrame(data)                                                             # convert the records to a DataFrame
    return df
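
# --- Editor's sketch, not part of the original script -----------------------------------
# get_data() reads the "fl oneline" items by fixed position ([1], [2], ..., [16]). If a
# listing omits a field, every index after it shifts and the whole record drops into the
# except branch, which is one plausible reason only the links survive. A more forgiving
# pattern is to key the items by their label text, assuming each item renders roughly as
# "标签：值" with a full-width colon (as the .text[3:] slicing in get_data() also assumes):
def parse_oneline_items(soup):
    fields = {}
    for li in soup.select('li[class^="fl oneline"]'):
        text = li.get_text(strip=True)
        if '：' in text:
            label, value = text.split('：', 1)
            fields[label] = value
    return fields                                   # e.g. fields.get('租期'), fields.get('电梯')
# -----------------------------------------------------------------------------------------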


def to_mysql(df, table_name):
    """
    Save the scraped data to MySQL
    :param df:
    :param table_name:
    :return:
    """
    # Create the engine for connecting to MySQL, with pymysql as the driver
    engine = create_engine('mysql+pymysql://root:123456@localhost:3306/lianjia',
                           connect_args={'charset': 'utf8mb4'})

    # Save the DataFrame as a MySQL table
    df.to_sql(name='{}'.format(table_name), con=engine, if_exists='replace', index=False)
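
# --- Editor's sketch, not part of the original script -----------------------------------
# to_sql() creates the table if needed, but it does not create the database itself, so the
# `lianjia` database has to exist before to_mysql() runs. If it might not, it can be created
# once up front; the credentials here simply mirror the connection string above and are only
# an example:
def ensure_database():
    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           password='123456', charset='utf8mb4')
    try:
        with conn.cursor() as cur:
            cur.execute('CREATE DATABASE IF NOT EXISTS lianjia DEFAULT CHARACTER SET utf8mb4')
    finally:
        conn.close()
# -----------------------------------------------------------------------------------------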


def to_csv(df, table_name):
    """
    Save the scraped data to a CSV file
    :param df:
    :param table_name:
    :return:
    """
    df.to_csv('{}'.format(table_name), index=False)                                     # do not keep the row index
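
# --- Editor's sketch, not part of the original script -----------------------------------
# The DataFrame columns are Chinese, and Excel on Windows tends to show them garbled when a
# CSV is written as plain UTF-8 without a BOM. If the files are meant to be opened in Excel,
# an alternative writer that adds the BOM looks like this (otherwise to_csv above is fine):
def to_csv_excel(df, table_name):
    df.to_csv('{}'.format(table_name), index=False, encoding='utf-8-sig')
# -----------------------------------------------------------------------------------------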


if __name__ == '__main__':
    city = input("Enter the pinyin abbreviation of the city to scrape (lowercase): ")
    # city = 'sh'
    url_parent_all = get_parent_url(city)
    url_detail_all = get_detail_url(url_parent_all, city)
    all_url = pd.DataFrame(url_detail_all, columns=['url'])
    to_mysql(all_url, '{}_all_url'.format(city))
    to_csv(all_url, '{}_all_url.csv'.format(city))
    home_df = get_data(url_detail_all)
    to_mysql(home_df, '{}_home_data'.format(city))
    to_csv(home_df, '{}_home_data.csv'.format(city))