###### Below is the code I used
# -*- coding: UTF-8 -*-
# Import the required libraries
import re
import time
import requests
from random import uniform
from bs4 import BeautifulSoup
import pandas as pd
from sqlalchemy import create_engine
import pymysql  # MySQL driver used by SQLAlchemy via the mysql+pymysql URL
# Set a request header, otherwise the requests will not pass Lianjia's validation
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
def get_parent_url(city):
    """
    Get all of the parent (list-page) URLs for the chosen city
    :param city:
    :return:
    """
    url = 'http://{city}.lianjia.com/zufang'.format(city=city)
    html = requests.get(url, headers=headers)  # fetch the page HTML
    Soup = BeautifulSoup(html.text, 'lxml')  # parse the HTML
    Selector = Soup.select('ul[data-target="area"] > li.filter__item--level2')  # pick out the district entries
    Selector = Selector[1:]  # drop the first entry, "不限" (no limit)
    url_parent_all = []  # initialise the final list of parent URLs
    for i in Selector:  # loop over every district
        url_region = 'https://{city}.lianjia.com'.format(city=city) + i.select('a')[0]['href']  # build the district URL (was hard-coded to "sh")
        html_region = requests.get(url_region, headers=headers)  # fetch the district page HTML
        Soup_region = BeautifulSoup(html_region.text, 'lxml')  # parse the HTML
        number_data = int(Soup_region.select('span.content__title--hl')[0].text)  # number of listings in this district
        if number_data <= 3000:  # 3000 listings or fewer: crawl the district directly
            index = Soup_region.select('div.content__pg')[0]  # element holding the page count
            index = str(index)  # convert the bs4 object to str so the regex can be applied
            re_set = re.compile(r'data-totalpage="(.*?)"')
            index = re.findall(re_set, index)[0]  # extract the page count with a regex
            for j in range(1, int(index) + 1):  # loop over every page
                url_parent = url_region + "pg{}".format(j)
                url_parent_all.append(url_parent)  # add each page URL of this district to the parent list
                print(url_parent)
            t = uniform(0, 1)
            time.sleep(t)  # pause briefly after each district to avoid getting banned
        else:  # more than 3000 listings: split the district by rent bracket
            for rp in range(1, 8):
                url_region_rp = url_region + "rp{}/".format(rp)
                html_region_rp = requests.get(url_region_rp, headers=headers)
                Soup_region_rp = BeautifulSoup(html_region_rp.text, 'lxml')
                number_data = int(Soup_region_rp.select('span.content__title--hl')[0].text)
                if number_data > 0:
                    index = Soup_region_rp.select('div.content__pg')[0]  # same steps as above
                    index = str(index)
                    re_set = re.compile(r'data-totalpage="(.*?)"')
                    index = re.findall(re_set, index)[0]
                    for j in range(1, int(index) + 1):
                        url_parent = url_region + "rp{}/".format(rp) + "pg{}".format(j)
                        url_parent_all.append(url_parent)
                        print(url_parent)
                t = uniform(0, 1)
                time.sleep(t)
    return url_parent_all
def get_detail_url(url_parent_all, city):
    """
    Visit every parent URL and collect the full list of detail-page URLs
    :param url_parent_all:
    :param city:
    :return:
    """
    url_detail_all = []  # the final list of detail URLs
    for url in url_parent_all:  # loop over every parent URL
        html = requests.get(url, headers=headers)
        Soup = BeautifulSoup(html.text, 'lxml')
        Selector = Soup.select('div a.content__list--item--aside')  # parse and pick out the detail-link elements
        for i in Selector:
            i = i['href']
            i = 'http://{city}.lianjia.com'.format(city=city) + i  # build the full detail URL for every link element
            url_detail_all.append(i)  # add it to the detail-URL list
            print(i)
        t = uniform(0, 0.01)
        time.sleep(t)  # pause t seconds after each parent URL
    return url_detail_all
def get_data(url_detail_all):
    """
    Scrape the data from every URL in the detail-URL list
    :param url_detail_all:
    :return:
    """
    data = []  # initialise the list of scraped records
    num_error = 0  # count of failed pages
    for i in url_detail_all:  # loop over every detail URL
        try:  # use try...except so one bad page does not abort the whole loop
            info = {}
            url = i
            html = requests.get(url, headers=headers)
            Soup = BeautifulSoup(html.text, 'lxml')
            info['房源编号'] = Soup.select('i.house_code')[0].text
            info['链接'] = i
            info['标题'] = Soup.select('p.content__title')[0].text
            info['价格'] = Soup.select('p.content__aside--title')[0].text
            Selector1 = Soup.select('p.content__article__table')[0].text.split('\n')
            info['租赁方式'] = Selector1[1]
            info['户型'] = Selector1[2]
            info['面积'] = Selector1[3]
            info['朝向'] = Selector1[4]
            info['发布时间'] = Soup.select('li[class^="fl oneline"]')[1].text[3:]
            info['入住'] = Soup.select('li[class^="fl oneline"]')[2].text[3:]
            info['租期'] = Soup.select('li[class^="fl oneline"]')[4].text[3:]
            info['看房'] = Soup.select('li[class^="fl oneline"]')[5].text[3:]
            info['楼层'] = Soup.select('li[class^="fl oneline"]')[7].text[3:]
            info['电梯'] = Soup.select('li[class^="fl oneline"]')[8].text[3:]
            info['车位'] = Soup.select('li[class^="fl oneline"]')[10].text[3:]
            info['用水'] = Soup.select('li[class^="fl oneline"]')[11].text[3:]
            info['用电'] = Soup.select('li[class^="fl oneline"]')[13].text[3:]
            info['燃气'] = Soup.select('li[class^="fl oneline"]')[14].text[3:]
            info['采暖'] = Soup.select('li[class^="fl oneline"]')[16].text[3:]
            info['房源标签'] = Soup.select('p.content__aside--tags')[0].text[1:-1].replace('\n', ', ')
            Selector2 = Soup.select('li[class^="fl oneline"]')  # elements describing the facilities
            info['配套设施'] = []
            for item in Selector2:  # process every facility element
                if len(item['class']) > 2:
                    if item['class'][2][-2:] != 'no':  # keep only the facilities the listing actually offers
                        info['配套设施'].append(item['class'][2])
            info['配套设施'] = ",".join(info['配套设施'])  # join the list into a str
            if info['配套设施'] == '':  # no facilities at all
                info['配套设施'] = None
            if not Soup.select('.threeline'):  # no listing description
                info['房源描述'] = None
            else:
                info['房源描述'] = Soup.select('.threeline')[0].text
            info['城区'] = Soup.select('p.bottom__list--item__des > span:nth-of-type(1) > a:nth-of-type(1)')[0].text
            info['小区地址'] = Soup.select('p.bottom__list--item__des > span:nth-of-type(1)')[0].text.replace(' ', '').replace('\n', '')
            data.append(info)  # append the record (a dict) to the data list
            print(info)
            t = uniform(0, 0.01)
            time.sleep(t)  # pause t seconds after each record
        except Exception:
            num_error += 1
            print("oops, some errors occurred")
            continue
    print("Number of failed records: %d" % num_error)
    df = pd.DataFrame(data)  # convert the data to a DataFrame
    return df
def to_mysql(df, table_name):
    """
    Save the scraped data to MySQL
    :param df:
    :param table_name:
    :return:
    """
    # Create the engine for connecting to MySQL, with pymysql as the driver
    engine = create_engine('mysql+pymysql://root:123456@localhost:3306/lianjia',
                           connect_args={'charset': 'utf8mb4'})
    # Save the given df as a MySQL table
    df.to_sql(name='{}'.format(table_name), con=engine, if_exists='replace', index=False)
def to_csv(df, table_name):
    """
    Save the scraped data to a csv file
    :param df:
    :param table_name:
    :return:
    """
    df.to_csv('{}'.format(table_name), index=False)  # do not keep the row index
if __name__ == '__main__':
    city = input("Enter the pinyin abbreviation of the city to crawl (lower case): ")
    # city = 'sh'
    url_parent_all = get_parent_url(city)
    url_detail_all = get_detail_url(url_parent_all, city)
    all_url = pd.DataFrame(url_detail_all, columns=['url'])
    to_mysql(all_url, '{}_all_url'.format(city))
    to_csv(all_url, '{}_all_url'.format(city))
    home_df = get_data(url_detail_all)
    to_mysql(home_df, '{}_home_data'.format(city))
    to_csv(home_df, '{}_home_data'.format(city))
Page: https://bd.ke.com/ershoufang/
Information to collect from it:
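The exact fields are not listed here, so the following is only a minimal sketch of fetching that page with the same requests + BeautifulSoup approach as the code above. The `div.info` selector and the assumption that the first `<a>` inside each listing card carries the title and detail link are guesses about Beike's markup, not confirmed by the post, and the printed fields should be replaced with whatever information is actually needed.

# Minimal sketch -- the selectors below are assumptions about Beike's page
# structure and should be verified against the live page before use.
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
html = requests.get('https://bd.ke.com/ershoufang/', headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
# Assumed structure: each listing card has a div.info block whose first <a>
# holds the title text and the link to the detail page.
for card in soup.select('div.info'):
    link = card.select_one('a')
    if link is not None:
        print(link.get_text(strip=True), link.get('href'))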