这个问题没有提供具体的数据，所以只能用模拟数据给出一个大致的思路。
任务1:
import pandas as pd
import os
# Directory that holds the CSV files to merge. The trailing slash lets a
# path be built by appending a file name directly to this string.
file = 'data/tieba/'
# os.listdir returns names in arbitrary order; sorting makes the order in
# which the files are loaded (and therefore the merged row order) reproducible.
fileList = sorted(os.listdir(file))
def readCSV(dataAll, fileName, directory=None):
    """Read one CSV file and append its rows to an accumulated frame.

    Args:
        dataAll: DataFrame holding the rows collected so far (may be empty).
        fileName: Name of the CSV file, relative to ``directory``.
        directory: Folder containing the file. Defaults to the module-level
            ``file`` setting when omitted.

    Returns:
        A new DataFrame with the rows of ``dataAll`` followed by the rows
        read from the file. The inputs are not modified.
    """
    if directory is None:
        directory = file
    # os.path.join copes with a directory given with or without a trailing
    # separator, unlike plain string concatenation.
    data = pd.read_csv(os.path.join(directory, fileName))
    return pd.concat([dataAll, data])
# Task 1: merge every listed CSV file and export all rows, newest first.
# Read all files first and concatenate once: calling pd.concat inside the
# loop re-copies every previously accumulated row on each iteration.
frames = [pd.read_csv(file + filename) for filename in fileList]
# With no files, fall back to an empty frame, as the incremental loop did.
dataAll = pd.concat(frames) if frames else pd.DataFrame()
# Each file contributes its own 0..k row labels; replace them with a single
# continuous index.
df = dataAll.reset_index(drop=True)
df['Time'] = pd.to_datetime(df['Time'])
# Only the exported copy is sorted; df keeps the load order.
df.sort_values(by='Time', ascending=False).to_csv('data/posts.csv', encoding='gbk')
任务2:
# Task 2: number of rows per user, exported in descending order of count.
df2 = df.groupby('Username')['Username'].count()
# Build the frame column by column so Count_Stats keeps its integer dtype
# (transposing a two-row frame turns both columns into object dtype).
df3 = pd.DataFrame({'Username': df2.index, 'Count_Stats': df2.values})
# sort_values returns a new frame rather than sorting in place; the result
# must be kept, otherwise the CSV is written unsorted.
df3 = df3.sort_values(by='Count_Stats', ascending=False).reset_index(drop=True)
df3.to_csv('data/user.csv', encoding='gbk')
任务3:
# Task 3: per calendar day, the number of rows and the number of distinct users.
# Vectorized extraction of the date part; unlike indexing df['Time'][i] in a
# loop, this does not depend on the index being exactly 0..n-1.
df['dateDay'] = df['Time'].dt.date
rows_per_day = df.groupby('dateDay').size()
# nunique counts each non-null Username once per day.
users_per_day = df.groupby('dateDay')['Username'].nunique()
df4 = pd.concat([rows_per_day, users_per_day], axis=1)
df5 = df4.reset_index()
df5.columns = ['Date', 'Count_Stats', 'Count_Users']
df5.to_csv('data/days.csv')
任务4:
# Task 4: overall totals. The values are stored and printed because a bare
# expression statement is discarded when this runs as a script.
# Number of posts: distinct non-null PostID values.
post_count = df['PostID'].nunique()
# Number of users: distinct non-null Username values.
user_count = df['Username'].nunique()
print(post_count, user_count)
用上面得到的用户数除以帖子数（用户数 / 帖子数），结果就是本任务的答案。
任务5:
待考虑：是要按小时段来统计发言数吗？