Can anyone help me figure out why this code runs fine, but the exported CSV file ends up blank, with only the header row? Oddly, the file does keep growing in size as more data gets scraped.
```python
# -*- coding: utf-8 -*-
import logging
logging.getLogger("bs4.dammit").setLevel(logging.ERROR)
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool
import urllib2
import csv
import unicodecsv as ucsv
from itertools import islice
import gc
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack: default str<->unicode conversions to UTF-8
house_urls = []
region = '台山市'
# Collect every unit URL recorded for the target region (skip the header row).
with open('./基础信息/所有单元链接.csv') as f:
    f_csv = csv.reader(f)
    for row in islice(f_csv, 1, None):
        if row[0] == region:
            house_urls.append({'url': row[1]})
count_house = len(house_urls)
head = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en-GB;q=0.8,en-US;q=0.7,en;q=0.6',
'Connection': 'keep-alive',
'Host': 'jmzjj.jiangmen.cn:8085',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
# Track the pages that still fail after repeated retries.
count_fail = 0
fail_house = []
def gethtml(url):
    global count_house
    global count_fail
    global fail_house
    fails = 0
    while True:
        try:
            # Give up on this URL after 10 failed attempts and record it.
            if fails >= 10:
                count_fail += 1
                fail_house.append({'url': url['url']})
                break
            head['Referer'] = url['url']
            req = urllib2.Request(url['url'], headers=head)
            response = urllib2.urlopen(req, None, 15)
            html = response.read()
            soup_house_detail(html)
            response.close()
            count_house -= 1
            print('Remaining: ' + str(count_house) + ', failed: ' + str(count_fail))
        except Exception:
            fails += 1
            print('Retrying request, attempt ' + str(fails))
        else:
            break
csv_headers = ['登记号','开发商名称','开发商地址','开发商电话','项目地址','预售证号','房屋结构','销售情况','项目名称','栋号','房屋坐落','房屋号','套内面积','建筑面积','房屋用途','申报单价','申报总价','房屋朝向','签约日期','备注','物业管理公司','物业管理费']
# Write the header row once; the worker threads append the data rows later.
with open('./结果/' + region + '.csv', 'wb') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(csv_headers)
def soup_house_detail(html):
    soup = BeautifulSoup(html, "html.parser")
    try:
        for link in soup.find_all('div', style="max-width:1200px;width:100%; margin:0 auto;"):
            PresellName = link.find_all('td', id="PresellName")
            # id must be passed as a keyword argument (id="CorpName");
            # subscripting the built-in id() as id["CorpName"] raises TypeError.
            House_Detail_list = [
                link.find('td', id="djhtd").get_text().strip(),
                link.find('td', id="CorpName").get_text().strip(),
                link.find('td', id="CorpAddress").get_text().strip(),
                link.find('td', id="CorpPhone").get_text().strip(),
                link.find('td', id="HouseRepose").get_text(),
                link.find('td', id="PresellBookID").get_text(),
                link.find('td', id="HouseFrame").get_text(),
                link.find('td', id="statusfont1").get_text() + link.find('td', id="statusfont2").get_text() + link.find('td', id="statusfont3").get_text(),
                PresellName[0].string,
                link.find('td', id="DongNo").get_text(),
                PresellName[1].string,
                link.find('td', id="HouseNO").get_text(),
                link.find('td', id="HouseArea").get_text().split('m')[0],
                link.find('td', id="SumBuildArea1").get_text().split('m')[0],
                link.find('td', id="HouseUse").get_text(),
                link.find('td', id="sbdj").get_text().split('元')[0],
                link.find('td', id="sbzj").get_text().split('元')[0],
                link.find('td', id="CHX").get_text(),
                link.find('td', id="VisaDate").get_text(),
                link.find('td', id="BZGS").get_text(),
                link.find('td', id="ManagerCom").get_text(),
                link.find('td', id="ManagerCharge").get_text().split('元')[0]
            ]
            print(House_Detail_list)
            # Append this row; 'ab' keeps the Python 2 csv module in binary mode.
            with open('./结果/' + region + '.csv', 'ab') as f:
                f_csv = csv.writer(f)
                f_csv.writerow(House_Detail_list)
    except Exception as e:
        print 'Failed to parse page content:', e
    # Release the parse tree promptly; these pages are fetched by the hundred.
    soup.decompose()
    del soup
    gc.collect()
# Fetch all unit pages concurrently with a pool of 100 worker threads.
pool = ThreadPool(100)
pool.map(gethtml, house_urls)
pool.close()
pool.join()
# Dump the URLs that still failed after repeated retries.
csv_headers = ['url']
with open('./fail/' + region + '.csv', 'wb') as f:
    f_csv = ucsv.DictWriter(f, csv_headers)
    f_csv.writeheader()
    f_csv.writerows(fail_house)
```
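One more thing worth checking in this setup: the script runs 100 worker threads, and each one opens the same result CSV and appends to it on its own. Concurrent appends from many threads can interleave and mangle rows. Below is a minimal sketch of serializing the writes with a lock (Python 2.7 assumed; `csv_lock` and `append_row` are illustrative names, not part of the original code):

```python
# -*- coding: utf-8 -*-
# Minimal sketch: funnel all CSV appends through one lock so rows written
# by different worker threads cannot interleave.
import csv
import threading

csv_lock = threading.Lock()

def append_row(path, row):
    # Only one thread at a time may hold the lock and touch the file.
    with csv_lock:
        with open(path, 'ab') as f:
            csv.writer(f).writerow(row)
```

Each call to `soup_house_detail` would then hand its finished row to `append_row` instead of opening the result file itself.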
Not sure whether you've solved this problem yet; if not: the Python version here is Python 2.7.
The code is below.
```python
# a.csv is a CSV file with three rows
# This script tallies one column of a CSV file: it groups the values in
# that column and counts how many times each value appears.
import pandas as pd
import numpy as np

path = 'C:\\Users\\hg\\Desktop\\a\\a.csv'  # the file path must not contain Chinese characters, or it errors out
df = pd.read_csv(path, encoding='utf-8', engine='python')  # encoding must be passed by keyword, not positionally
print(df.head())
print(df.describe())

# Common part: tally the travel modes
address = pd.read_csv(path, usecols=[0])  # extract the wanted column; 0 is the column index
address.to_csv("C:\\Users\\hg\\Desktop\\b\\d.csv")  # write the column out
path2 = 'C:\\Users\\hg\\Desktop\\b\\d.csv'  # read it back in
df2 = pd.read_csv(path2, encoding='utf-8', engine='python')
df3 = np.unique(address)  # np.unique groups the column's values and returns each group's name
print(df3)
ts = pd.Series(address['driving'].values, index=address['driving'])  # after grouping, count the occurrences of each name
print(ts.describe())
print(ts.value_counts())
wuqu = ts.value_counts()  # convert before writing: a Series cannot be written out to CSV directly here
wuqu1 = pd.DataFrame(ts.value_counts())
wuqu1.to_csv('C:\\Users\\hg\\Desktop\\b\\e.csv')  # the output file holds each group name and its count
```
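As an aside, the round trip through d.csv is not needed for the counting itself; pandas can group and count a column in one step with `value_counts`. A minimal sketch, assuming the same `driving` column and file paths as in the snippet above:

```python
import pandas as pd

# Minimal sketch: group one column and count each distinct value directly.
# Assumes the column is named 'driving', as in the snippet above.
df = pd.read_csv('C:\\Users\\hg\\Desktop\\a\\a.csv', encoding='utf-8')
counts = df['driving'].value_counts()  # index: group name, value: count
pd.DataFrame(counts).to_csv('C:\\Users\\hg\\Desktop\\b\\e.csv')
```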