import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the "Data/input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("data/input"))
# Any results you write to the current directory are saved as output.
train = pd.read_csv('data/input/train.csv')
test = pd.read_csv('data/input/test.csv')
train.head() # we can see first 5 samples with this function
test.head()
print(train.isnull().sum().sort_values(ascending = False)) #as you can see there is no null value in the columns
print("**"*50)
print(test.isnull().sum().sort_values(ascending = False))
print(train.info()) # we can see type of features with this info()
print('**'*50)
print(test.info())
train.datetime = pd.to_datetime(train.datetime)
test.datetime = pd.to_datetime(test.datetime)
print(train.info())
print('**'*50)
print(test.info())
train['year'] = train['datetime'].dt.year
train['month'] = train['datetime'].dt.month
train['day'] = train['datetime'].dt.day
train['hour'] = train['datetime'].dt.hour
train['dayofweek'] = train['datetime'].dt.weekday_name
test['year'] = test['datetime'].dt.year
test['month'] = test['datetime'].dt.month
test['day'] = test['datetime'].dt.day
test['hour'] = test['datetime'].dt.hour
test['dayofweek'] = test['datetime'].dt.weekday_name
train.tail() # we can see last 5 samples with this function!
train.describe().T # we can see statistical results with this function
plt.figure(figsize=(16,8))
sns.heatmap(train.corr(), annot=True)
plt.show()
plt.figure(figsize=(16,8))
sns.pairplot(train)
plt.show()
plt.figure(figsize=(16,8))
sns.distplot(train['count'])
plt.show()
def scatter_plot():
for i in test.columns:
plt.scatter(train[i],train['count'])
plt.title(f"Scatter plot for {i}")
plt.show()
scatter_plot()
plt.figure(figsize=(16,8))
plt.plot(train.set_index('datetime')["count"][0:300])
plt.show()
plt.figure(figsize=(16,8))
sns.boxplot(x='dayofweek',y='count', data=train)
# we need to convert categorical data to numeric data.分类数据转换成数字数据
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train['dayofweek'] = le.fit_transform(train['dayofweek'])
test['dayofweek'] = le.transform(test['dayofweek'])
plt.figure(figsize=(16,8))
sns.boxplot(x='season', y='count', data=train)
plt.figure(figsize=(16,8))
sns.boxplot(x='hour',y='count', data=train) # as we can see there is difference for each hour. We need to use it !
plt.figure(figsize=(16,8))
sns.boxplot(x='year',y='count', data=train) # bike were rented in 2012!
plt.figure(figsize=(16,8))
plt.hist(train['count'][train['year'] == 2011], alpha=0.5, label='2011')
plt.hist(train['count'][train['year'] == 2012], alpha=0.5, label='2012', color='red')
train.head()
train.set_index('datetime', inplace=True)
train['2011-01-19 23:00:00':]
Q1 = train.quantile(0.25)
Q3 = train.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
train_without_outliers =train[~((train < (Q1 - 1.5 * IQR)) |(train > (Q3 + 1.5 * IQR))).any(axis=1)]
train_without_outliers.dropna(inplace=True)
print(train.info())
print('*********************************************************************************')
print(train_without_outliers.info())
train_without_outliers.head(2)
plt.figure(figsize=(12, 7))
sns.boxplot(x='season',y='windspeed',data=train_without_outliers,palette='winter')
train_without_outliers['windspeed'] = train_without_outliers['windspeed'].replace(0,np.NaN)
test['windspeed'] = test['windspeed'].replace(0,np.NaN)
train_without_outliers['windspeed'].fillna(method='bfill',inplace=True)
train_without_outliers['windspeed'] = train_without_outliers['windspeed'].interpolate()
test['windspeed'] = test['windspeed'].interpolate()
train_without_outliers['windspeed'].isnull().sum()
test.head()
train_without_outliers.head(5)
train_without_outliers[['season','holiday','workingday','weather', 'year','month','day','hour','dayofweek']] = train_without_outliers[['season','holiday','workingday','weather', 'year','month','day','hour','dayofweek']].astype('category')
test[['season','holiday','workingday','weather', 'year','month','day','hour','dayofweek']] = test[['season','holiday','workingday','weather', 'year','month','day','hour','dayofweek']].astype('category')
train_without_outliers.info()
from sklearn.model_selection import train_test_split
X = train_without_outliers[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp','humidity', 'year', 'month', 'day', 'hour', 'dayofweek','windspeed']]
y = train_without_outliers['count']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.1)
y_train
from sklearn.preprocessing import MinMaxScaler
sc_X = MinMaxScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
rf_prediction = rf.predict(X_test)
from sklearn.metrics import mean_squared_error
from sklearn import metrics
print('RMSE:',np.sqrt(metrics.mean_squared_error(y_test, rf_prediction)))
plt.scatter(y_test,rf_prediction)
plt.figure(figsize=(16,8))
plt.plot(rf_prediction[0:200],'r')
plt.plot(y_test[0:200].values)
from sklearn.tree import DecisionTreeRegressor
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)
dt_prediction = dt_reg.predict(X_test)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, dt_prediction)))
plt.scatter(y_test,dt_prediction)
test.head()
test[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp','humidity', 'year', 'month', 'day', 'hour', 'dayofweek','windspeed']] = sc_X.fit_transform(test[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp','humidity', 'year', 'month', 'day', 'hour', 'dayofweek','windspeed']])
test_pred= rf.predict(test[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp','humidity', 'year', 'month', 'day', 'hour', 'dayofweek','windspeed']])
test_pred
test_pred=test_pred.reshape(-1,1)
test_pred = pd.DataFrame(test_pred, columns=['count'])
df = pd.concat([test['datetime'], test_pred],axis=1)
df.head()
df['count'] = df['count'].astype('int')
df.to_csv('submission1.csv' , index=False)