"""
.. module:: run_model
:synopsis: Collection of Models
.. moduleauthor:: MA Raza
This module consists of a collection of various machine learning models. We start with LightGBM.
Depending on the time available, more models can be added.
Todo:
* Add more machine learning models, such as GBM, RF and XGBoost
* Spark Compatible GBM and Light GBM Models
* Add Model Diagnostic plots using SHAP Library
* Feature Reduction
* Config file
"""
import sys
sys.path.append('.')
import os
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from retail_sales_prediction import logger
def _stack_predictions(pred_matrix, index, start_date, periods):
    """Reshape a wide prediction matrix into a long ``unit_sales`` frame.

    Args:
        pred_matrix: 2-D array with one row per entry of ``index`` and one
            column per forecast day.
        index: Row index for the wide frame (presumably a
            (store_nbr, item_nbr) MultiIndex -- confirm against caller).
        start_date: First forecast date, passed to ``pd.date_range``.
        periods: Number of consecutive daily columns.

    Returns:
        DataFrame with a single ``unit_sales`` column whose index levels are
        named ``store_nbr``, ``item_nbr`` and ``date``.
    """
    long_frame = pd.DataFrame(
        pred_matrix, index=index,
        columns=pd.date_range(start_date, periods=periods)
    ).stack().to_frame("unit_sales")
    long_frame.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
    return long_frame


def run_model_lgbm(feature_prep, X_train, y_train, X_val, y_val, X_test, config, num_days=6):
    """
    Train one LightGBM regressor per forecast day and write the predictions.

    A separate booster is trained for each of the 16 target columns. The
    validation predictions are written to ``lgb_cv.csv`` and the test
    predictions to ``lgb_sub.csv`` inside ``config['output_dir']``.

    Args:
        feature_prep: Feature-preparation object. This function reads
            ``feature_prep.items["perishable"]`` (sample weights),
            ``feature_prep.df_2017.index`` (row index of the predictions) and
            ``feature_prep.test[["id"]]`` (submission ids).
        X_train: Training features; must expose ``.columns``.
        y_train: 2-D array of training targets, one column per forecast day.
        X_val: Validation features.
        y_val: 2-D array of validation targets, one column per forecast day.
        X_test: Test features.
        config: Mapping with the keys ``'model_params'`` (LightGBM parameter
            dict), ``'MAX_ROUNDS'`` (maximum boosting rounds) and
            ``'output_dir'`` (directory for the CSV outputs).
        num_days: Number of times the perishable weight vector is repeated to
            line up with the rows of ``X_train``.

    Returns:
        None. Results are written to CSV files and progress is logged.
    """
    horizon = 16  # number of forecast days == number of target columns used

    logger.info("Training and predicting models...")

    params = config['model_params']
    max_rounds = config['MAX_ROUNDS']
    output_dir = config['output_dir']

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)
    logger.info('output directory : {}'.format(output_dir))

    # Perishable items get a 1.25 weight, everything else 1.0. These vectors
    # do not change between forecast days, so build them once.
    perishable = feature_prep.items["perishable"]
    val_weight = perishable * 0.25 + 1
    train_weight = pd.concat([perishable] * num_days) * 0.25 + 1

    val_pred = []
    test_pred = []
    cate_vars = []
    for step in range(horizon):
        logger.info("=" * 50)
        logger.info("Step %d" % (step + 1))
        logger.info("=" * 50)
        dtrain = lgb.Dataset(
            X_train, label=y_train[:, step],
            categorical_feature=cate_vars,
            weight=train_weight
        )
        dval = lgb.Dataset(
            X_val, label=y_val[:, step], reference=dtrain,
            weight=val_weight,
            categorical_feature=cate_vars)
        # NOTE(review): the early_stopping_rounds / verbose_eval keywords were
        # removed from lgb.train in LightGBM 4.0; switch to the
        # lgb.early_stopping / lgb.log_evaluation callbacks when upgrading.
        bst = lgb.train(
            params, dtrain, num_boost_round=max_rounds,
            valid_sets=[dtrain, dval], early_stopping_rounds=125, verbose_eval=50
        )
        # Log features ordered by descending gain importance.
        ranked_gain = sorted(
            zip(X_train.columns, bst.feature_importance("gain")),
            key=lambda pair: pair[1], reverse=True
        )
        logger.info("\n".join(("%s: %.2f" % pair) for pair in ranked_gain))

        # best_iteration is falsy when early stopping never triggered.
        best_rounds = bst.best_iteration or max_rounds
        val_pred.append(bst.predict(X_val, num_iteration=best_rounds))
        test_pred.append(bst.predict(X_test, num_iteration=best_rounds))

    logger.info('**** Finished Training *****')

    # One row per series, one column per forecast day.
    val_matrix = np.array(val_pred).transpose()
    logger.info("Validation mse: {}".format(mean_squared_error(y_val, val_matrix)))

    # Weighted RMSE over all series and forecast days.
    err = (y_val - val_matrix) ** 2
    err = err.sum(axis=1) * val_weight
    err = np.sqrt(err.sum() / val_weight.sum() / horizon)
    logger.info('nwrmsle = {}'.format(err))

    df_preds = _stack_predictions(
        val_matrix, feature_prep.df_2017.index, "2017-07-26", horizon)
    # expm1 undoes a log1p target transform (presumably applied upstream --
    # confirm against feature preparation); sales are clipped to [0, 1000].
    df_preds["unit_sales"] = np.clip(np.expm1(df_preds["unit_sales"]), 0, 1000)
    df_preds.reset_index().to_csv(
        os.path.join(output_dir, 'lgb_cv.csv'), index=False)

    logger.info("Making submission...")
    test_matrix = np.array(test_pred).transpose()
    df_preds = _stack_predictions(
        test_matrix, feature_prep.df_2017.index, "2017-08-16", horizon)
    # Test rows without a prediction are filled with 0 before the inverse
    # transform (expm1(0) == 0, so they stay 0).
    submission = feature_prep.test[["id"]].join(df_preds, how="left").fillna(0)
    submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
    submission.to_csv(
        os.path.join(output_dir, 'lgb_sub.csv'), float_format='%.4f', index=False)