Python数据分析,数据分析,爬虫

img

这个问题没有具体的数据,只能用模拟数据给你一个思路。
任务1:

import pandas as pd
import os

# Directory holding the scraped CSV files. The trailing slash matters:
# readCSV builds each path by plain string concatenation (file + fileName).
file = 'data/tieba/'
# os.listdir returns entries in arbitrary order; sort them so the rows are
# concatenated in the same order (and get the same index) on every run.
fileList = sorted(os.listdir(file))

def readCSV(dataAll,fileName):
    """Append the rows of one CSV file to an accumulated DataFrame.

    Args:
        dataAll: DataFrame holding the rows gathered so far.
        fileName: Name of the CSV file; it is appended to the module-level
            ``file`` directory prefix to form the path.

    Returns:
        A new DataFrame made of ``dataAll`` followed by the rows just read.
    """
    csv_path = file + fileName
    pieces = [dataAll, pd.read_csv(csv_path)]
    return pd.concat(pieces)

# Read every file once and concatenate in a single call. Growing a DataFrame
# with pd.concat inside a loop re-copies all accumulated rows on each pass
# and concatenates against an empty seed frame.
frames = [pd.read_csv(file + filename) for filename in fileList]
# pd.concat raises on an empty list, so keep the empty-frame result for an
# empty directory.
dataAll = pd.concat(frames) if frames else pd.DataFrame()

# Each file brought its own 0-based index; rebuild a single 0..n-1 index.
df = dataAll.reset_index(drop=True)
df['Time'] = pd.to_datetime(df['Time'])

# Export all posts, newest first. df itself keeps its original row order.
df.sort_values(by='Time', ascending=False).to_csv('data/posts.csv', encoding='gbk')

任务2:

# Number of posts per user (Series indexed by Username).
df2 = df.groupby('Username')['Username'].count()
# Build the two-column table from a dict so the counts keep their integer
# dtype instead of being coerced to object by a transpose.
df3 = pd.DataFrame({'Username': df2.index, 'Count_Stats': df2.values})
# sort_values returns a new DataFrame; assign it back, otherwise the file is
# written in the unsorted order.
df3 = df3.sort_values(by='Count_Stats', ascending=False)
df3.to_csv('data/user.csv', encoding='gbk')

任务3:

# Calendar date of every post, taken with the vectorised .dt accessor rather
# than a Python loop that looks each timestamp up by index label.
dateDay = df['Time'].dt.date.tolist()
df['dateDay'] = dateDay

# Posts per day: every row counts.
posts_per_day = df.groupby('dateDay').size()
# Users per day: collapse to distinct (day, user) pairs first, then count
# the remaining rows for each day.
users_per_day = df[['dateDay', 'Username']].drop_duplicates().groupby('dateDay').count()

df4 = pd.concat([posts_per_day, users_per_day], axis=1)
df5 = df4.reset_index()
df5.columns = ['Date', 'Count_Stats', 'Count_Users']
df5.to_csv('data/days.csv')

任务4:

# Number of distinct posts. nunique() ignores missing values, exactly like
# the drop_duplicates().value_counts().sum() chain it replaces.
post_count = df['PostID'].nunique()
# Number of distinct users.
user_count = df['Username'].nunique()
# Keep the results in names and show them; bare expressions are discarded
# when this runs as a script.
print(post_count, user_count)

上面两行分别得到帖子数和用户数,用户数/帖子数就是答案。

任务5:
待考虑,是根据小时段来统计发言数吗?