#%%
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files live under the read-only "../input/" directory;
# walk it and print every file so the available inputs show up in the log.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Up to 20GB may be written to /kaggle/working/ (preserved as output on "Save & Run All").
# /kaggle/temp/ accepts temporary files that are not kept across sessions.
#%%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
#%%
# Load the "Predict Future Sales" competition tables.
train=pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv')
shops=pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/shops.csv')
items=pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/items.csv')
cat=pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv')
# Work on a reproducible 10% sample to keep memory/runtime manageable.
train=train.sample(frac=0.1,random_state=42)
#%%
# Some shops are duplicates of one another (same physical shop re-listed
# under a new id); fold each duplicate id into its canonical id.
shop_id_map = {11: 10, 0: 57, 1: 58, 40: 39}
dup_rows = train['shop_id'].isin(shop_id_map)
train.loc[dup_rows, 'shop_id'] = train.loc[dup_rows, 'shop_id'].map(shop_id_map)
# Sanity check: should now be empty.
train.loc[train['shop_id'].isin(shop_id_map), 'shop_id']
#%%
# City is the first token of the shop name (minus any stray leading '!').
shops['shop_city'] = shops['shop_name'].map(lambda x: x.split(' ')[0].strip('!'))
shop_types = ['ТЦ', 'ТРК', 'ТРЦ', 'ТК', 'МТРЦ']
# The second token is the venue type when it matches a known abbreviation.
shops['shop_type'] = shops['shop_name'].map(
    lambda x: x.split(' ')[1] if x.split(' ')[1] in shop_types else 'Others')
# Shops 12 and 56 are online stores
shops.loc[shops['shop_id'].isin([12, 56]), ['shop_city', 'shop_type']] = 'Online'
shops.head(13)
#%%
# Label-encode city and type for use as model features.
shop_city_map = {city: code for code, city in enumerate(shops['shop_city'].unique())}
shop_type_map = {stype: code for code, stype in enumerate(shops['shop_type'].unique())}
shops['shop_city_code'] = shops['shop_city'].map(shop_city_map)
shops['shop_type_code'] = shops['shop_type'].map(shop_type_map)
shops.head(7)
#%%
# Normalize item names by deleting all spaces so near-duplicate names
# (differing only in whitespace) compare equal.
items['item_name'] = items['item_name'].map(lambda x: ''.join(x.split(' '))) # remove spaces
duplicated_item_name = items[items['item_name'].duplicated()]
duplicated_item_name_rec = items[items['item_name'].isin(duplicated_item_name['item_name'])]
#%%
# Map the first occurrence of each duplicated name onto the second.
# NOTE(review): the even/odd slicing assumes every duplicated name occurs
# exactly twice and that rows are ordered (old id, new id) — TODO confirm
# this holds for the items table.
old_id = duplicated_item_name_rec['item_id'].values[::2]
new_id = duplicated_item_name_rec['item_id'].values[1::2]
old_new_map = dict(zip(old_id, new_id))
train.loc[train['item_id'].isin(old_id), 'item_id'] = train.loc[train['item_id'].isin(old_id), 'item_id'].map(old_new_map)
# Sanity check: should now be empty.
train[train['item_id'].isin(old_id)]
#%%
# Coarse item type: the text before ' -' in the category name.
# NOTE(review): x.find('Игры ') > 0 only matches 'Игры ' appearing
# mid-string (a leading match gives find() == 0 and is excluded) —
# confirm this is intentional for names that start with 'Игры'.
cat['item_type'] = cat['item_category_name'].map(lambda x: 'Игры' if x.find('Игры ')>0 else x.split(' -')[0].strip('\"'))
# Manual corrections for categories whose names don't follow the pattern.
cat.iloc[[32,-3, -2], -1] = ['Карты оплаты', 'Чистые носители', 'Чистые носители' ]
item_type_map = dict([(v,k) for k, v in enumerate(cat['item_type'].unique())])
cat['item_type_code'] = cat['item_type'].map(item_type_map)
cat.head()
#%%
# Sub-type: text after the first '-' (whole name when there is no '-').
cat['sub_type'] = cat['item_category_name'].map(lambda x: x.split('-',1)[-1])
sub_type_map = dict([(v,k) for k, v in enumerate(cat['sub_type'].unique())])
cat['sub_type_code'] = cat['sub_type'].map(sub_type_map)
#%%
# Attach the category-level codes to each item, then free the category table.
items = items.merge(
    cat[['item_category_id', 'item_type_code', 'sub_type_code']],
    on='item_category_id',
    how='left')
items.head()
#%%
import gc
del cat
gc.collect()
#%%
# Scatter of daily units sold vs. price to eyeball outliers.
# Fixed: seaborn >= 0.12 removed positional data arguments for jointplot;
# the old positional call raises a TypeError, so pass keywords explicitly.
sns.jointplot(x='item_cnt_day', y='item_price', data=train, kind='scatter')
#%%
# Drop only the most extreme records before re-plotting.
train_filtered = train.loc[(train['item_cnt_day'] < 800) & (train['item_price'] < 70000)].copy()
#%%
# Records that still look suspicious at a tighter threshold.
outer = train.loc[(train['item_cnt_day'] > 400) | (train['item_price'] > 40000)]
#%%
# Plot each suspicious item's (count, price) points in its own color.
outer_set = train_filtered[train_filtered['item_id'].isin(outer['item_id'].unique())].groupby('item_id')
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
# Combine two palettes so colors repeat less often than the default 10.
colors = sns.color_palette() + sns.color_palette('bright')
# Fixed: wrap the palette index with modulo — the old bare colors[i]
# raised IndexError as soon as the number of groups exceeded the palette.
for i, (name, group) in enumerate(outer_set):
    ax.plot(group['item_cnt_day'], group['item_price'], marker='o',
            linestyle='', ms=12, label=name, color=colors[i % len(colors)])
ax.legend()
plt.show()
#%%
# Final cleaning thresholds, chosen from the outlier plots above.
filtered = train.loc[(train['item_cnt_day'] < 400) & (train['item_price'] < 45000)].copy()
filtered.head()
#%%
# Items 7238 and 14173 were identified as pathological — remove them entirely.
filtered.drop(index=filtered[filtered['item_id'].isin([7238, 14173])].index, inplace=True)
#%%
del train, train_filtered
gc.collect()
#%%
# Replace non-positive prices with 1249.0 (mean price of the same item in
# the same shop and month).
filtered.loc[filtered['item_price'] <= 0, 'item_price'] = 1249.0
filtered[filtered['item_price'] <= 0]  # sanity check: should be empty now
#%%
# Daily turnover = unit price * units sold.
filtered['turnover_day'] = filtered['item_price'] * filtered['item_cnt_day']
#%%
# Monthly sales matrix: rows = month index (date_block_num), columns = item.
# Fixed: use the string 'sum' — passing the builtin sum as aggfunc is
# deprecated in pandas 2.x and emits a FutureWarning.
item_sales_monthly = filtered.pivot_table(columns='item_id',
                                          index='date_block_num',
                                          values='item_cnt_day',
                                          fill_value=0,
                                          aggfunc='sum')
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
item_sales_monthly.sum(1).plot(ax=axes[0], title='Total sales of each month', xticks=[i for i in range(0,34,2)])  # total sales per month
item_sales_monthly.sum(0).plot(ax=axes[1], title='Total sales of each item')  # total sales per item
plt.subplots_adjust(wspace=0.2)
#%%
# Items ranked by total sales over the whole period.
top_sales = item_sales_monthly.sum().sort_values(ascending=False)
#%%
# Monthly turnover matrix, same layout as item_sales_monthly.
# Fixed: string 'sum' aggfunc (builtin sum is deprecated in pandas 2.x);
# also dropped the redundant axis=1 alongside columns=.
item_turnover_monthly = filtered.pivot_table(index='date_block_num',
                                             columns='item_id',
                                             values='turnover_day',
                                             fill_value=0,
                                             aggfunc='sum')
# Remove items whose total sales are zero or negative (net returns only).
dead_items = top_sales[top_sales <= 0].index
item_sales_monthly = item_sales_monthly.drop(columns=dead_items)
item_turnover_monthly = item_turnover_monthly.drop(columns=dead_items)
total_turnover = item_turnover_monthly.sum().sum()
#%%
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
item_turnover_monthly.sum(1).plot(ax=axes[0], title='Total turnovers of each month', xticks=[i for i in range(0,34,2)])  # monthly turnover
item_turnover_monthly.sum(0).plot(ax=axes[1], title='Total turnovers of each item')  # turnover per item
plt.subplots_adjust(wspace=0.2)
#%%
top_turnover = item_turnover_monthly.sum().sort_values(ascending=False)
#%%
# Overlay monthly turnover (left axis, red) with monthly sales (right axis, blue).
monthly_turnover_series = item_turnover_monthly.sum(1)
monthly_sales_series = item_sales_monthly.sum(1)
fig, ax_left = plt.subplots(1, 1, figsize=(16, 6))
ax_right = ax_left.twinx()
ax_left.plot(monthly_turnover_series.index, monthly_turnover_series.values, c='r')
ax_right.plot(monthly_sales_series.index, monthly_sales_series.values, c='b')
ax_right.grid(c='c', alpha=0.3)
ax_left.legend(['Monthly Turnover'], fontsize=13, bbox_to_anchor=(0.95, 1))
ax_right.legend(['Monthly Sales'], fontsize=13, bbox_to_anchor=(0.93, 0.9))
ax_left.set_ylabel('Monthly Turnover', c='r')
ax_right.set_ylabel('Monthly Sales', c='b')
plt.show()
#%%
# Year-over-year comparison: block 23 (Dec 2014) vs block 11 (Dec 2013).
sales_growth = item_sales_monthly.loc[23].sum() - item_sales_monthly.loc[11].sum()
sales_growth_rate = sales_growth / item_sales_monthly.loc[11].sum() * 100
turnover_growth = item_turnover_monthly.loc[23].sum() - item_turnover_monthly.loc[11].sum()
turnover_growth_rate = turnover_growth / item_turnover_monthly.loc[11].sum() * 100
print(
    ' 销售同比增长量为: %.2f ,同比增长率为: %.2f%%;\n' % (sales_growth, sales_growth_rate),
    '营收同比增长量为: %.2f ,同比增长率为: %.2f%%。' % (turnover_growth, turnover_growth_rate)
)
#%%
# Total units sold per shop, ascending.
filtered.groupby('shop_id')['item_cnt_day'].sum().sort_values().plot(kind='bar', figsize=(12, 6))
#%%
# Total turnover per shop, ascending.
filtered.groupby('shop_id')['turnover_day'].sum().sort_values().plot(kind='bar', figsize=(12, 6))
#%%
# Attach item-level features (category id, type/sub-type codes) to the records.
filtered = filtered.merge(items.iloc[:,1:], on='item_id', how='left')
filtered.head()
#%%
# Turnover per item category, ascending.
filtered.groupby('item_category_id')['turnover_day'].sum().sort_values().plot(kind='bar',figsize=(16,6), rot=0)
#%%
# Attach shop-level features (city and type codes) to the records.
filtered = filtered.merge(shops[['shop_id','shop_city_code','shop_type_code']], on='shop_id', how='left')
filtered.head()
#%%
# Sales per city.
filtered.groupby('shop_city_code')['item_cnt_day'].sum().plot(kind='bar',figsize=(12,6))
#%%
# Sales per shop type.
filtered.groupby('shop_type_code')['item_cnt_day'].sum().plot(kind='bar',figsize=(12,6))
#%%
# Monthly sales per shop. Fixed: string 'sum' aggfunc — the builtin sum
# is deprecated as aggfunc in pandas 2.x.
shop_sales_monthly = filtered.pivot_table(index='date_block_num',
                                          columns='shop_id',
                                          values='item_cnt_day',
                                          fill_value=0,
                                          aggfunc='sum')
# For each shop: in how many of the final 6 months did it record any sales?
shop_open_month_cnt = (shop_sales_monthly.iloc[-6:] > 0).sum()
shop_open_month_cnt.head()
#%%
# Same for items; keep shops active in all 6 of the last months and items
# that sold at least once in that window.
item_selling_month_cnt = (item_sales_monthly.iloc[-6:] > 0).sum()
open_shop = shop_sales_monthly[shop_open_month_cnt[shop_open_month_cnt == 6].index]
item_zero = item_sales_monthly[item_selling_month_cnt[item_selling_month_cnt == 0].index]
selling_item = item_sales_monthly[item_selling_month_cnt[item_selling_month_cnt > 0].index]
cl_set = filtered[filtered['shop_id'].isin(open_shop.columns) & filtered['item_id'].isin(selling_item.columns)]
#%%
# Cartesian (month, shop, item) grid built from the cleaned subset cl_set.
# NOTE: the next cell rebuilds `martix` from `filtered`, so this result
# is immediately overwritten.
from itertools import product
import time
ts = time.time()
martix = []
for i in range(34):
    record = cl_set[cl_set['date_block_num'] == i]
    group = product([i], record.shop_id.unique(), record.item_id.unique())
    martix.append(np.array(list(group)))
cols = ['date_block_num', 'shop_id', 'item_id']
martix = pd.DataFrame(np.vstack(martix), columns=cols)
martix
#%%
# Cartesian grid of every (month, shop, item) combination that appears in
# `filtered`, assembled month by month.
from itertools import product
import time
ts = time.time()
grid_parts = []
for block in range(34):
    month_rec = filtered[filtered['date_block_num'] == block]
    combos = product([block], month_rec.shop_id.unique(), month_rec.item_id.unique())
    grid_parts.append(np.array(list(combos)))
cols = ['date_block_num', 'shop_id', 'item_id']
martix = pd.DataFrame(np.vstack(grid_parts), columns=cols)
martix
#%%
del cl_set
del item_sales_monthly
del item_turnover_monthly
gc.collect()
#%%
# Aggregate daily sales to monthly per (month, shop, item) — the target.
# Fixed: use the string 'sum' — passing np.sum to .agg is deprecated in
# pandas 2.x and emits a FutureWarning.
group = filtered.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day': 'sum'})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)
del filtered
#%%
# Attach the monthly target to the grid; combinations with no sales get NaN.
martix = pd.merge(martix, group, on=['date_block_num', 'shop_id', 'item_id'], how='left')
martix.head()
#%%
# No recorded sales means zero units sold that month.
martix = martix.fillna(0)
#%%
# Attach shop- and item-level categorical features.
martix = martix.merge(shops[['shop_id', 'shop_type_code', 'shop_city_code']], on='shop_id', how='left')
martix = martix.merge(items.drop(columns='item_name'), on='item_id', how='left')
martix
#%%
# Calendar features derived from the month index (block 0 = Jan 2013).
martix['year'] = martix['date_block_num'] // 12 + 2013
martix['month'] = martix['date_block_num'] % 12
martix.head()
#%%
# Mean-encoding features: average monthly sales at several aggregation
# levels. Refactored: the eight copy-pasted groupby/merge stanzas are now
# driven from one spec table — same columns, same order, same values.
mean_enc_specs = [
    (['date_block_num', 'item_id'], 'item_cnt_month_avg'),
    (['date_block_num', 'shop_id'], 'shop_cnt_month_avg'),
    (['date_block_num', 'item_category_id'], 'cat_cnt_month_avg'),
    (['date_block_num', 'shop_id', 'item_category_id'], 'shop_cat_cnt_month_avg'),
    (['date_block_num', 'item_type_code'], 'itemtype_cnt_month_avg'),
    (['date_block_num', 'sub_type_code'], 'subtype_cnt_month_avg'),
    (['date_block_num', 'shop_city_code', 'item_id'], 'city_item_cnt_month_avg'),
    (['date_block_num', 'shop_type_code', 'item_id'], 'shoptype_item_cnt_month_avg'),
]
for keys, feat_name in mean_enc_specs:
    group = martix.groupby(keys).agg({'item_cnt_month': 'mean'})
    group.columns = [feat_name]
    group.reset_index(inplace=True)
    martix = martix.merge(group, on=keys, how='left')
martix.head()
#%%
del group
gc.collect()
#%%
def lag_feature(df, lags, col):
    """Append lagged copies of `col` as new feature columns.

    For each lag i in `lags`, adds a column '<col>_lag_<i>' containing the
    value of `col` for the same (shop_id, item_id) i months earlier.
    Rows with no matching history are left as NaN.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    for lag in lags:
        shifted = df[keys + [col]].copy()
        shifted = shifted.rename(columns={col: '%s_lag_%d' % (col, lag)})
        shifted['date_block_num'] = shifted['date_block_num'] + lag
        df = pd.merge(df, shifted, on=keys, how='left')
    return df
martix = lag_feature(martix, [1, 2, 3, 6, 12], 'item_cnt_month')
# Create lags for every mean-encoded column, then drop the current-month
# encoding itself so only historical information remains (keeping the
# same-month encoding would leak the target).
# Refactored: one loop replaces the copy-pasted create/drop stanzas;
# the resulting columns and values are identical.
lag_months = [1, 2, 3, 6, 12]
for avg_col in ['item_cnt_month_avg', 'shop_cnt_month_avg',
                'cat_cnt_month_avg', 'shop_cat_cnt_month_avg',
                'itemtype_cnt_month_avg', 'subtype_cnt_month_avg',
                'city_item_cnt_month_avg', 'shoptype_item_cnt_month_avg']:
    martix = lag_feature(martix, lag_months, avg_col)
    martix.drop(columns=[avg_col], inplace=True)
    gc.collect()
#%%
# Early months cannot have a full 12-month lag history — inspect the NaNs.
martix[martix.columns[:20]].isna().any()
#%%
# Keep only months with a full year of history; remaining NaNs (shorter
# missing lags) become 0.
train_set = martix[martix['date_block_num'] > 11].fillna(0)
del martix
gc.collect()
#%%
# Downcast to save memory: label codes fit in int8, everything else drops
# from 64-bit to 32/16-bit.
for col in train_set.columns:
    if 'code' in col:
        train_set[col] = train_set[col].astype(np.int8)
    elif train_set[col].dtype == 'float64':
        train_set[col] = train_set[col].astype(np.float32)
    elif train_set[col].dtype == 'int64':
        train_set[col] = train_set[col].astype(np.int16)
# Mark the type codes as categorical so LightGBM treats them natively.
train_set['item_type_code'] = train_set['item_type_code'].astype('category')
train_set['sub_type_code'] = train_set['sub_type_code'].astype('category')
train_set.info()
#%%
import lightgbm as lgb
# Time-based split: months < 32 train, month 32 validates, month 33 tests.
X_train = train_set[train_set['date_block_num'] < 32].drop(columns=['item_cnt_month']) # training features
Y_train = train_set[train_set['date_block_num'] < 32]['item_cnt_month'] # training labels
X_validate = train_set[train_set['date_block_num'] == 32].drop(columns=['item_cnt_month']) # validation set
Y_validate = train_set[train_set['date_block_num'] == 32]['item_cnt_month']
X_test = train_set[train_set['date_block_num'] == 33].drop(columns=['item_cnt_month']) # test set
#%%
# Ground truth for the test month, used for the final RMSE check below.
Y_true=train_set[train_set['date_block_num'] == 33]['item_cnt_month']
#%%
X_test[0:50]
#%%
del train_set
gc.collect()
#%%
# Wrap the splits in LightGBM's Dataset format.
train_data = lgb.Dataset(data=X_train, label=Y_train)
validate_data = lgb.Dataset(data=X_validate, label=Y_validate)
#%%
import time
ts = time.time()
# Fixed for LightGBM >= 4: `verbose_eval` and in-params `early_stopping_rounds`
# were removed from lgb.train and caused a TypeError; both are now supplied
# via callbacks (early_stopping / log_evaluation).
params = {"objective": "regression", "metric": "rmse", 'n_estimators': 10000,
          "num_leaves": 200, "learning_rate": 0.01, "bagging_fraction": 0.9,
          "feature_fraction": 0.3, "bagging_seed": 0}
print('Start....', ts)
lgb_model = lgb.train(params, train_data,
                      valid_sets=[train_data, validate_data],
                      callbacks=[lgb.early_stopping(stopping_rounds=50),
                                 lgb.log_evaluation(period=1000)])
print('End...', time.time() - ts)
#%%
# Top-40 feature importances from the trained booster.
lgb.plot_importance(lgb_model, max_num_features=40, figsize=(12, 8))
plt.title("Feature Importances")  # fixed typo: was "Featurertances"
plt.show()
#%%
X_test.shape
#%%
Y_true.shape
#%%
# Predict month 33 and clamp to the competition's [0, 20] target range.
Y_test = lgb_model.predict(X_test).clip(0, 20)
#%%
# RMSE of the clipped predictions against the held-out truth.
error = Y_test - Y_true
rmse = (error**2).mean()**0.5
rmse
#%%
X_test.head(50)
#%%
Y_test[0:50]
#%%
Y_true[0:50]
#%%
# Leftover discussion text (not code), commented out so the script stays
# valid Python. Translation: "Here is the source code. Also, what causes
# the error in this section? Not sure whether this problem has been solved
# yet; if not:" — the error was most likely the removed `verbose_eval`
# argument in lightgbm>=4 (see the lgb.train cell) or the removal of
# positional arguments in seaborn>=0.12 (see the jointplot cell).
# 这是源代码 另外这段的错误是因为什么
# 不知道你这个问题是否已经解决, 如果还没有解决的话: