# Page URLs follow the pattern b81-b9<N>; loop over all pages
for i in range(33):  # 20 complexes per page, 648 complexes in total
    url = 'https://newhouse.fang.com/house/s/b81-b9' + str(i + 1) + '/'
from lxml import etree
import requests
from fake_useragent import UserAgent
import pandas as pd
import random
import time
import csv
headers = {
    'User-Agent': UserAgent().random,
    'cookie': "global_cookie=kxyzkfz09n3hnn14le9z39b9g3ol3wgikwn; city=www; city.sig=OGYSb1kOr8YVFH0wBEXukpoi1DeOqwvdseB7aTrJ-zE; __utmz=147393320.1664372701.10.4.utmcsr=mp.csdn.net|utmccn=(referral)|utmcmd=referral|utmcct=/mp_blog/creation/editor; csrfToken=KUlWFFT_pcJiH1yo3qPmzIc_; g_sourcepage=xf_lp^lb_pc; __utmc=147393320; unique_cookie=U_bystp5cfehunxkbjybklkryt62fl8mfox4z*3; __utma=147393320.97036532.1606372168.1664431058.1664433514.14; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; __utmt_t3=1; __utmt_t4=1; __utmb=147393320.5.10.1664433514",
    # tell the server which page we navigated from
    'referer': 'https://newhouse.fang.com/house/s/b81-b91/'
}
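One thing to note: UserAgent().random is evaluated once when headers is built, so every request in this script reuses the same User-Agent string. If you want a fresh UA per request, a minimal sketch (fresh_headers is a hypothetical helper; ua.random returns a newly drawn UA string on each access):

ua = UserAgent()

def fresh_headers():
    # copy the base headers and swap in a freshly drawn User-Agent
    h = dict(headers)
    h['User-Agent'] = ua.random
    return h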
url = 'https://newhouse.fang.com/house/s/b81-b91/'  # URL of the first listing page
page_text = requests.get(url=url, headers=headers).text  # send the request
tree = etree.HTML(page_text)  # parse the HTML
# Complex names
name = [i.strip() for i in tree.xpath("//div[@class='nlcd_name']/a/text()")]
print(name)
print(len(name))
# Comment counts
commentCounts = tree.xpath("//span[@class='value_num']/text()")
print(commentCounts)
print(len(commentCounts))
# Floor areas
buildingarea = [i.strip() for i in tree.xpath("//div[@class='house_type clearfix']/text()")]
print(buildingarea)
print(len(buildingarea))
# Detailed addresses
detailAddress = tree.xpath("//div[@class='address']/a/@title")
print(detailAddress)
print(len(detailAddress))
# Districts
district = [i.strip() for i in tree.xpath("//div[@class='address']//span[@class='sngrey']/text()")]
print(district)
print(len(district))
# Average prices (the number and the unit sit in separate tags)
num = tree.xpath(
"//div[@class='nlc_details']/div[@class='nhouse_price']/span/text() | //div[@class='nlc_details']/div[@class='nhouse_price']/i/text()")
unit = tree.xpath("//div[@class='nlc_details']/div[@class='nhouse_price']/em/text()")
price = [i + j for i, j in zip(num, unit)]
print(price)
print(len(price))
# Parse comment counts: raw values look like '(123条)'
commentCounts = [int(i.split('(')[1].split('条')[0]) for i in commentCounts]
print(commentCounts)
# Clean detailed addresses: drop the leading '[district]' prefix
detailAddress = [i.split(']')[1] for i in detailAddress]
print(detailAddress)
# Extract the district name from between the square brackets
district = [i.split('[')[1].split(']')[0] for i in district]
print(district)
# Parse floor areas: keep the upper bound of ranges like '40—120平米'
t = []
for i in buildingarea:
    if i != '/' and i != '':
        t.append(i.split('—')[1].split('平米')[0])
print(t)
print(len(t))
# Accumulator DataFrame for all pages
df = pd.DataFrame(columns=['小区名称', '详细地址', '所在区', '均价', '评论数'])
for k in range(10):
    url = 'https://newhouse.fang.com/house/s/b81-b9' + str(k + 1) + '/'
    page_text = requests.get(url=url, headers=headers).text  # send the request
    tree = etree.HTML(page_text)  # parse the HTML
    # Complex names
    name = [i.strip() for i in tree.xpath("//div[@class='nlcd_name']/a/text()")]
    # Comment counts
    commentCounts = tree.xpath("//span[@class='value_num']/text()")
    # Detailed addresses
    detailAddress = tree.xpath("//div[@class='address']/a/@title")
    # Districts
    district = [i.strip() for i in tree.xpath("//div[@class='address']//text()")]
    # Average prices: the number and the unit sit in separate tags, so join them
    num = tree.xpath(
        "//div[@class='nlc_details']/div[@class='nhouse_price']/span/text() | //div[@class='nlc_details']/div[@class='nhouse_price']/i/text()")
    unit = tree.xpath("//div[@class='nlc_details']/div[@class='nhouse_price']/em/text()")
    price = [i + j for i, j in zip(num, unit)]
    # Parse comment counts: raw values look like '(123条)'
    commentCounts = [int(i.split('(')[1].split('条')[0]) for i in commentCounts]
    # Clean detailed addresses: drop the leading '[district]' prefix when present
    tmp1 = []
    for i in detailAddress:
        if ']' in i:
            tmp1.append(i.split(']')[1])
            continue
        tmp1.append(i)
    detailAddress = tmp1
    # Keep only entries that contain a '[district]' marker
    tmp2 = []
    for i in district:
        if ']' in i and '[' in i:
            tmp2.append(i.split(']')[0].split('[')[1])
    district = tmp2
    dic = {'小区名称': name, '详细地址': detailAddress, '所在区': district, '均价': price, '评论数': commentCounts}
    df2 = pd.DataFrame(dic)
    df = pd.concat([df, df2], axis=0)
    print('Page {} scraped successfully, {} rows'.format(k + 1, len(df2)))
print('All pages scraped successfully')
df.to_csv('北京小区数据信息.csv', index=None)
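Incidentally, random and time are imported at the top but never used; a polite delay between page requests is presumably what they were meant for. A minimal sketch of what could go at the end of each loop iteration (the 1-3 second interval is an assumption):

import random
import time

# sleep a random 1-3 seconds between pages so the crawler
# does not hammer the server
time.sleep(random.uniform(1, 3))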
Use pip install to install the missing libraries; your problem is simply that libraries are missing.
Here is a quick test script:
if __name__ == '__main__':
    f = open(r"./share.jpeg", "rb")
    text = input('Input text:')
    res = share_weibo(text, f)
    f.close()
Run it:
[root@localhost share_weibo]# py sina_weibo.py
Input text:hello sky
An image post was published successfully:
hello sky
If you want to post a video, unfortunately Sina provides no corresponding API. The only workaround is to include a link to an existing video in the post text, which gives a similar effect. For example, to share this video: http://t.cn/Ail9eamb?m=4396440127989330&u=5296864682
Run it:
[root@localhost share_weibo]# py sina_weibo.py
Input text:http://t.cn/Ail9eamb?m=4396440127989330&u=5296864682
A video post was sent successfully:
video
The video post is now visible on my Weibo profile.
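For reference, the thread never shows share_weibo itself. A minimal sketch of how such a helper might wrap Weibo's statuses/share.json endpoint; the token handling and exact parameters here are assumptions, not the author's actual code:

import requests

ACCESS_TOKEN = 'your-oauth2-token'  # assumed to be obtained via Weibo OAuth2 beforehand

def share_weibo(text, pic=None):
    # statuses/share.json publishes text, optionally with one image,
    # on behalf of the authorized user
    data = {'access_token': ACCESS_TOKEN, 'status': text}
    files = {'pic': pic} if pic else None  # multipart image upload when a file is given
    resp = requests.post('https://api.weibo.com/2/statuses/share.json', data=data, files=files)
    return resp.json()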
From the code and the error message you provided, the problem is that the bs4 module cannot be found. One likely cause is that the installed beautifulsoup4 package is not on Python's search path. Here is one way to resolve it:

1. Check whether the beautifulsoup4 package is installed: pip show beautifulsoup4
   If the command prints the package information, it is installed.
2. Reinstall the beautifulsoup4 package: pip install beautifulsoup4
   Make sure you run the command in a command-line window and wait for the installation to finish.
3. If the bs4 module still cannot be found, beautifulsoup4 may have been installed under the wrong Python interpreter. In that case:
   a. Run pip show beautifulsoup4 to find the package's installation path and note it down.
   b. Copy the bs4 folder from that path into the Lib folder under your Python installation directory, so the interpreter can find the bs4 module.
4. Use the lxml parser instead of html.parser: soup = BeautifulSoup(response.text, 'lxml')
   The lxml parser performs better than html.parser.

Work through these steps one at a time and rerun your code to see whether the problem is resolved. If it persists, reply with more details and the error message so I can help you further.
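A simpler way to avoid the wrong-interpreter problem in step 3 is to invoke pip through the exact interpreter you run the script with, which guarantees the package lands on that interpreter's search path:

python -m pip install beautifulsoup4 lxml

You can then confirm which installation the interpreter actually imports:

import bs4
print(bs4.__file__)  # path of the bs4 package being imported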
Diagnosis: the current environment is missing the third-party libraries lxml, requests, fake_useragent and pandas (all four are flagged in red). Run the following four commands one by one in a CMD window; once they finish, the code runs normally. (Also delete line 10 of the code, it is unused.)
pip install lxml
pip install requests
pip install fake_useragent
pip install pandas
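The four installs can also be combined into a single command:

pip install lxml requests fake_useragent pandas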