import pandas as pd
import re
import requests
from requests import RequestException
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, decoded with the encoding that ``requests`` detects
        from the body (``apparent_encoding``), or an empty string when the
        request fails: connection error, no response within the 30-second
        timeout, or an HTTP 4xx/5xx status.
    """
    try:
        r = requests.get(url, timeout=30)
        # Convert 4xx/5xx responses into an exception so they take the
        # same failure path as network errors.
        r.raise_for_status()
    except RequestException:
        # Only request failures mean "no page". Anything else (for example
        # KeyboardInterrupt or a programming error) is allowed to propagate
        # instead of being silently turned into an empty string.
        return ""
    # The body is already downloaded at this point; pick the encoding
    # detected from its bytes rather than the one declared in the headers.
    r.encoding = r.apparent_encoding
    return r.text
# Request pages 2, 3 and 4 of the bj.lianjia.com /xiaoqu/ listing, printing
# each URL before fetching it with getHTMLText.
for i in range(2,5):
url = "https://bj.lianjia.com/xiaoqu/pg" + str(i) +"/?from=rec"
print(url)
html = getHTMLText(url)
# NOTE(review): the regex literal on the next line is cut off in this paste,
# so the intended pattern cannot be recovered from here.
pattern = re.compile('
我把你写的爬虫代码简单改了一下。你原来用正则匹配的写法有问题,后面需要加强正则表达式的学习;我这里改用 lxml 来解析网页源代码。因为你写的是同步执行,所以我也保持同步执行,没有改成异步——不过这类爬虫最好还是采用异步方式。
#-*- coding:utf-8 -*-
import pandas as pd
import requests
from lxml import etree
def getHTMLText(url):
    """Fetch *url* with a browser-like User-Agent and return the body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8, or an empty string when the
        request fails (connection error, no response within the 30-second
        timeout, HTTP 4xx/5xx status) or the body is not valid UTF-8.
    """
    # Send a desktop Chrome User-Agent instead of the default one that
    # requests would use.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
    try:
        r = requests.get(url, timeout=30, headers=headers)
        # Convert 4xx/5xx responses into an exception so they take the
        # same failure path as network errors.
        r.raise_for_status()
        # The raw bytes are decoded explicitly, so the response's own
        # ``encoding`` attribute is never consulted and is left alone.
        return r.content.decode('utf-8')
    except (requests.RequestException, UnicodeDecodeError):
        # Request and decode failures both mean "no usable page"; any other
        # exception (e.g. KeyboardInterrupt) is allowed to propagate.
        return ""
if __name__ == '__main__':
    # Scrape listing pages 2-4 of bj.lianjia.com/xiaoqu and write the
    # (name, price) pairs that were found to 1.csv.
    info = []  # one (name, price) tuple per listing, across all pages
    for i in range(2, 5):
        url = "https://bj.lianjia.com/xiaoqu/pg" + str(i) + "/?from=rec"
        print(url)
        html = getHTMLText(url)
        if not html:
            # getHTMLText returns "" when the download failed; skip the page.
            continue
        datas = etree.HTML(html)
        name = datas.xpath("//div[@class='info']/div[@class='title']/a/text()")
        price = datas.xpath("//div[@class='totalPrice']/span/text()")
        # Pair names with prices page by page. Zipping two flat lists after
        # the loop would let a count mismatch on one page shift the
        # name/price alignment of every page that follows it.
        info.extend(zip(name, price))
    # Column labels: 小区 = residential community, 价格 = price.
    headers = ['小区', '价格']
    filen_name = '1.csv'
    data3 = pd.DataFrame(columns=headers, data=info)
    data3.to_csv(filen_name, encoding='utf-8')