Python crawler question

When the crawler fetches housing info, it returns the same entry every time, and the ul query keeps raising errors. The later price lookup also returns the same entry. Urgent.

You may not be sending request headers or cookies with the request. If you can, post your code.
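For reference, a minimal sketch of a request that sends both headers and cookies (the cookie name and value here are hypothetical placeholders; copy the real ones from your browser's developer tools):

```python
import requests

# Placeholder values, replace with headers/cookies captured from your browser
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
cookies = {'session_id': 'YOUR_COOKIE_VALUE'}  # hypothetical cookie name

r = requests.get('https://lz.esf.fang.com/house/', headers=headers, cookies=cookies)
print(r.status_code, len(r.text))
```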



```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
import random
import time
import urllib3
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

# Suppress the warning triggered by verify=False below
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

headers = {'User-Agent': ''}
user_agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
]
def get_detail_url(url):
    headers['User-Agent'] = random.choice(user_agents)
    try:
        r = requests.get(url, headers=headers, verify=False)
        tree = etree.HTML(r.text)
        # The listing page first serves a redirect page; follow it to the real URL
        real_url = tree.xpath('//a[@class="btn-redir"]/@href')[0]
        r = requests.get(real_url, headers=headers, verify=False)
        tree = etree.HTML(r.text)
        hrefs = tree.xpath('//div[@class="shop_list shop_list_4"]/dl/dt/a/@href')
        channels = tree.xpath('//div[@class="shop_list shop_list_4"]/dl/dt/a/@data_channel')
        next_urls = ['https://lz.esf.fang.com' + href + '?channel=' + channel
                     for href, channel in zip(hrefs, channels)]
        house.extend(next_urls)
    except (IndexError, requests.RequestException):
        # A missing redirect link usually means a captcha page appeared;
        # solve it manually, then retry (note: this recursion is unbounded)
        process_captcha()
        get_detail_url(url)

def process_captcha():

    # This URL is just one that triggers the captcha page; nothing specific about it
    url = 'https://lz.esf.fang.com/chushou/3_416752691.htm?channel=2,2'
    driver = webdriver.Firefox()
    driver.get(url)
    # Give yourself 12 seconds to type the captcha manually
    time.sleep(12)
    driver.find_element(By.NAME, 'submit').click()
    driver.close()

if __name__ == '__main__':

    '''
    It seems only about 100 pages can be crawled this way; to get more, narrow
    the query, e.g. crawl district by district, and subdivide further if needed.
    '''
    house = []
    for i in range(1, 3):
        print('--------------------------------')
        print(f'Crawling page {i}')
        url = f'https://lz.esf.fang.com/house/i3{i}/'
        get_detail_url(url)
    print('Crawl finished!')
    with open('urls.txt', 'a+', encoding='utf8') as f:
        for detail_url in house:
            f.write(detail_url + '\n')

```
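One caveat with the code above: the `except` branch in `get_detail_url` recurses without bound if the captcha page keeps coming back. A minimal sketch of a bounded alternative, reusing the same module-level `headers`, `user_agents`, `house` and `process_captcha` (the `max_retries` parameter is my own addition, not part of the original code):

```python
def get_detail_url_bounded(url, max_retries=3):
    """Same logic as get_detail_url, but gives up after max_retries captcha rounds."""
    for _ in range(max_retries):
        headers['User-Agent'] = random.choice(user_agents)
        try:
            r = requests.get(url, headers=headers, verify=False)
            tree = etree.HTML(r.text)
            real_url = tree.xpath('//a[@class="btn-redir"]/@href')[0]
            r = requests.get(real_url, headers=headers, verify=False)
            tree = etree.HTML(r.text)
            hrefs = tree.xpath('//div[@class="shop_list shop_list_4"]/dl/dt/a/@href')
            channels = tree.xpath('//div[@class="shop_list shop_list_4"]/dl/dt/a/@data_channel')
            house.extend('https://lz.esf.fang.com' + h + '?channel=' + c
                         for h, c in zip(hrefs, channels))
            return  # success
        except (IndexError, requests.RequestException):
            process_captcha()  # solve the captcha manually, then retry
    print(f'Giving up on {url} after {max_retries} captcha rounds')
```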


This is the Lianjia code:

```python
import requests
from parsel import Selector
import pandas as pd
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'
}
# pages is the list of URLs for the different page numbers
# pages = ['https://bj.lianjia.com/ershoufang/zizhuqiao/pg{}/'.format(x) for x in range(1, 3)]
lj_shanghai = pd.DataFrame(columns=['hou_code', 'title', 'infotitle', 'alt', 'positionIcon_new',
                                    'positionInfo_new', 'position_xiaoqu', 'starIcon',
                                    'price_total_new', 'unitPrice'])
count = 0

def l_par_html(url):
    wr = requests.get(url, headers=headers, stream=True)
    sel = Selector(wr.text)
    # hou_code is the listing's unique ID
    hou_code = sel.xpath('//div[@class="title"]/a/@data-housecode').extract()
    # Title
    title = sel.xpath('//div[@class="title"]//text()').extract()
    infotitle = sel.xpath('//div[@class="title"]/a/text()').extract()
    # Image URL (left commented out)
    # src = sel.xpath('//img[@class="lj-lazy"]//@src').extract()
    # print('src:%s' % src)
    # Image alt text (the listing name)
    alt = sel.xpath('//img[@class="lj-lazy"]//@alt').extract()
    positionIcon_region = sel.xpath('//div[@class="houseInfo"]/a/text()').extract()
    # House details
    positionIcon = sel.xpath('//div[@class="houseInfo"]//text()').extract()
    positionIcon_new = [x for x in positionIcon if x not in positionIcon_region]
    positionInfo = sel.xpath('//div[@class="positionInfo"]//text()').extract()
    position_xiaoqu = sel.xpath('//div[@class="positionInfo"]/a/text()').extract()
    positionInfo_new = [x for x in positionInfo if x not in position_xiaoqu]
    starIcon = sel.xpath('//div[@class="followInfo"]//text()').extract()
    price_total = sel.xpath('//div[@class="totalPrice"]//text()').extract()
    price_total_new = [x for x in price_total if x != '万']
    unitPrice = sel.xpath('//div[@class="unitPrice"]//text()').extract()
    tag = sel.xpath('//div[@class="tag"]//text()').extract()
    # print("tag:%s" % tag)

    pages_info = pd.DataFrame(
        list(zip(hou_code, title, infotitle, alt, positionIcon_new,
                 positionInfo_new, position_xiaoqu, starIcon, price_total_new, unitPrice)),
        columns=['hou_code', 'title', 'infotitle', 'alt', 'positionIcon_new',
                 'positionInfo_new', 'position_xiaoqu', 'starIcon',
                 'price_total_new', 'unitPrice'])

    # The scraped fields live in lists and can be one-to-many, so tag, title
    # and infotitle are also returned separately for analysis
    return pages_info, tag, title, infotitle
urls=[["https://bj.lianjia.com/ershoufang/zizhuqiao/",13],["https://bj.lianjia.com/ershoufang/dinghuisi/",11],["https://bj.lianjia.com/ershoufang/maliandao1/",12],["https://bj.lianjia.com/ershoufang/yongdingmen/",6],["https://bj.lianjia.com/ershoufang/dongzhimen/",9],["https://bj.lianjia.com/ershoufang/chongwenmen/",11]]
index=0
limits=["车道沟","定慧","红莲","西革","胡家园","兴隆"]
for ui in urls:
    pages = [ui[0]+'pg{}/'.format(x) for x in range(1, ui[1])]
    print(pages)

    for page in pages:
        a=l_par_html(page)[0]
        b=l_par_html(page)[1:]
        print('advantage:{}'.format(b))
        count=count+1
        print('the '+str(count)+' page is sucessful')
        #每隔20s翻页一次
        time.sleep(20)
        a=a[a["position_xiaoqu"].str.contains(limits[index])]
        # print(a)
        lj_shanghai=pd.concat([lj_shanghai,a],ignore_index=True)
        print(lj_shanghai)
    index+=1
 #将数据存储到excel表格中
lj_shanghai.to_excel(r'lianjia_ershou_shanghai.xlsx')
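Since the original complaint was getting the same listing back repeatedly, it may also help to drop duplicate rows by listing ID before exporting. A short sketch using pandas, assuming the `lj_shanghai` DataFrame built above:

```python
# Keep only the first occurrence of each listing ID, then export
lj_shanghai = lj_shanghai.drop_duplicates(subset='hou_code', ignore_index=True)
lj_shanghai.to_excel(r'lianjia_ershou_shanghai.xlsx')
```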