Source code for retail_sales_prediction.utils.run_model

"""
.. module:: run_model
   :synopsis: Collection of Models
.. moduleauthor:: MA Raza

This modules consists of collection of various machine learning models. We start with Light GBM.

Depending on the time, we can add more

Todo:
    * Add more machine learning models, such as GBM, RF and XGBoost
    * Spark Compatible GBM and Light GBM Models
    * Add Model Diagnostic plots using SHAP Library
    * Feature Reduction
    * Config file
"""
import sys
sys.path.append('.')

import os

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from retail_sales_prediction import logger


[docs]def run_model_lgbm(feature_prep, X_train, y_train, X_val, y_val, X_test, config, num_days=6):
    """
    Training the Light GBM Model.
    Args:
        feature_prep:
        X_train:
        y_train:
        X_val:
        y_val:
        X_test:
        num_days:

    Returns:
    :param model_params:

    """

    logger("Training and predicting models...")
    # params = {
    #     'num_leaves': 3,
    #     'objective': 'regression',
    #     'min_data_in_leaf': 200,
    #     'learning_rate': 0.02,
    #     'feature_fraction': 0.8,
    #     'bagging_fraction': 0.7,
    #     'bagging_freq': 1,
    #     'metric': 'l2',
    #     'num_threads': 20
    # }
    params = config['model_params']
    # MAX_ROUNDS = 200
    MAX_ROUNDS = config['MAX_ROUNDS']
    output_dir = config['output_dir']
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    logger.info('output directory : {}'.fromat(output_dir))
    val_pred = []
    test_pred = []
    cate_vars = []
    for i in range(16):
        logger.info("=" * 50)
        logger.info("Step %d" % (i+1))
        logger.info("=" * 50)
        dtrain = lgb.Dataset(
            X_train, label=y_train[:, i],
            categorical_feature=cate_vars,
            weight=pd.concat([feature_prep.items["perishable"]] * num_days) * 0.25 + 1
        )
        dval = lgb.Dataset(
            X_val, label=y_val[:, i], reference=dtrain,
            weight=feature_prep.items["perishable"] * 0.25 + 1,
            categorical_feature=cate_vars)
        bst = lgb.train(
            params, dtrain, num_boost_round=MAX_ROUNDS,
            valid_sets=[dtrain, dval], early_stopping_rounds=125, verbose_eval=50
        )
        logger.info("\n".join(("%s: %.2f" % x) for x in sorted(
            zip(X_train.columns, bst.feature_importance("gain")),
            key=lambda x: x[1], reverse=True
        )))
        val_pred.append(bst.predict(
            X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
        test_pred.append(bst.predict(
            X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

    logger.info('**** Finished Training *****')

    logger.info("Validation mse:", mean_squared_error(
        y_val, np.array(val_pred).transpose()))

    weight = feature_prep.items["perishable"] * 0.25 + 1
    err = (y_val - np.array(val_pred).transpose())**2
    err = err.sum(axis=1) * weight
    err = np.sqrt(err.sum() / weight.sum() / 16)
    logger.info('nwrmsle = {}'.format(err))

    y_val = np.array(val_pred).transpose()
    df_preds = pd.DataFrame(
        y_val, index=feature_prep.df_2017.index,
        columns=pd.date_range("2017-07-26", periods=16)
    ).stack().to_frame("unit_sales")
    df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
    df_preds["unit_sales"] = np.clip(np.expm1(df_preds["unit_sales"]), 0, 1000)
    df_preds.reset_index().to_csv(output_dir + 'lgb_cv.csv', index=False)

    logger.info("Making submission...")
    y_test = np.array(test_pred).transpose()
    df_preds = pd.DataFrame(
        y_test, index=feature_prep.df_2017.index,
        columns=pd.date_range("2017-08-16", periods=16)
    ).stack().to_frame("unit_sales")
    df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

    submission = feature_prep.test[["id"]].join(df_preds, how="left").fillna(0)
    submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
    submission.to_csv(output_dir + 'lgb_sub.csv', float_format='%.4f', index=None)