7

I'm working on a deep neural model for text classification using Keras. To fine-tune some hyperparameters I'm using the Keras wrappers for the Scikit-Learn API, so I built a Scikit-Learn Pipeline for that:

def create_model(optimizer="adam", nbr_features=100):
    """Build and compile a Keras binary classifier.

    Parameters
    ----------
    optimizer : str or keras optimizer instance, default "adam"
        Passed straight to ``model.compile``.
    nbr_features : int, default 100
        Width of the input layer; must equal the number of features the
        preceding TF-IDF step produces.
    """
    model = Sequential()
    # input_shape ties the first layer to nbr_features input columns.
    model.add(Dense(512, activation='relu', input_shape=(nbr_features,)))
    ...  # remaining layers elided in the original post
    model.compile(loss='binary_crossentropy', optimizer=optimizer,metrics=["accuracy"])
    return model

estimator = Pipeline([("tfidf", TfidfVectorizer()),
                      ('norm', StandardScaler(with_mean=False)),
                      ("km", KerasClassifier(build_fn=create_model, verbose=1))])
grid_params = {
     'tfidf__max_df': (0.1, 0.25, 0.5, 0.75, 1.0),
     'tfidf__max_features': (100, 500, 1000, 5000,),
      ... }

gs = GridSearchCV(estimator,
                   grid_params,
                   ...)

I want to pass max_features parameters from tfidf stage to km stage as nbr_features. Any Hack/Workaround to do that ?

Amine Benatmane
  • 251
  • 2
  • 9

1 Answer

6

I figured out how to do that by monkey patching ParameterGrid.__iter__ and GridSearchCV._run_search methods.

ParameterGrid.__iter__ iterates over all possible combinations of hyperparameters (dicts of param_name: value), so I modified what it yields (one configuration of hyperparameters, params) by adding "km__nbr_features" equal to 'tfidf__max_features':

params["km__nbr_features"] = params['tfidf__max_features']

Important: "km__nbr_features" must be missing from grid_params so the trick works.

Here is some code:

from sklearn.model_selection import GridSearchCV, ParameterGrid
import numpy as np
from itertools import product

def patch_params(params):
    """Link dependent hyperparameters inside one grid-search candidate.

    Copies the value chosen for ``'tfidf__max_features'`` into
    ``'km__nbr_features'`` so the Keras model's input width always matches
    the TF-IDF feature count for that candidate.

    Parameters
    ----------
    params : dict
        One hyperparameter configuration; mutated in place.

    Returns
    -------
    dict
        The same dict, with ``'km__nbr_features'`` added.
    """
    params["km__nbr_features"] = params['tfidf__max_features']
    # Bug fix: the original returned the undefined name `out`, which would
    # raise NameError the first time the patched grid was iterated.
    return params

def monkey_iter__(self):
    """Iterate over the points in the grid.

    Returns
    -------
    params : iterator over dict of string to any
        Yields dictionaries mapping each estimator parameter to one of its
        allowed values, with the dependent 'km__nbr_features' entry
        injected by patch_params.
    """
    for grid in self.param_grid:
        # An empty sub-grid contributes the single empty configuration.
        if not grid:
            yield {}
            continue
        # Sort keys so the iteration order is reproducible across runs.
        keys = sorted(grid)
        value_lists = [grid[key] for key in keys]
        for combo in product(*value_lists):
            candidate = dict(zip(keys, combo))
            yield patch_params(candidate)


# replacing address of "__iter__" with "monkey_iter__"
ParameterGrid.__iter__  = monkey_iter__

def monkey_run_search(self, evaluate_candidates):
    """Search all candidates in param_grid."""
    # Build the (patched) grid first, then hand every candidate to the
    # evaluation callback supplied by BaseSearchCV.fit.
    candidates = ParameterGrid(self.param_grid)
    evaluate_candidates(candidates)

# replacing address of "_run_search " with "monkey_run_search"
GridSearchCV._run_search = monkey_run_search

Then I performed the grid search normally:

def create_model(optimizer="adam", nbr_features=100):
    """Build and compile a Keras binary classifier.

    Parameters
    ----------
    optimizer : str or keras optimizer instance, default "adam"
        Passed straight to ``model.compile``.
    nbr_features : int, default 100
        Width of the input layer; kept in sync with 'tfidf__max_features'
        by the monkey-patched ParameterGrid above.
    """
    model = Sequential()
    # input_shape ties the first layer to nbr_features input columns.
    model.add(Dense(512, activation='relu', input_shape=(nbr_features,)))
    ...  # remaining layers elided in the original post
    model.compile(loss='binary_crossentropy', optimizer=optimizer,metrics=["accuracy"])
    return model

estimator = Pipeline([("tfidf", TfidfVectorizer()),
                      ('norm', StandardScaler(with_mean=False)),
                      ("km", KerasClassifier(build_fn=create_model, verbose=1))])
grid_params = {
     'tfidf__max_df': (0.1, 0.25, 0.5, 0.75, 1.0),
     'tfidf__max_features': (100, 500, 1000, 5000,),
      ... }

# Performing Grid Search
gs = GridSearchCV(estimator,
                   grid_params,
                   ...)

Update: In case you use RandomizedSearchCV, you must monkey patch ParameterGrid.__getitem__ instead.

def monkey_getitem__(self, ind):
    """Get the parameters that would be ``ind``th in iteration

    Parameters
    ----------
    ind : int
        The iteration index

    Returns
    -------
    params : dict of string to any
        Equal to list(self)[ind], with the dependent 'km__nbr_features'
        entry injected by patch_params.

    Raises
    ------
    IndexError
        If ``ind`` is outside the total number of candidates.
    """
    # This is used to make discrete sampling without replacement memory
    # efficient.
    for sub_grid in self.param_grid:
        # XXX: could memoize information used here
        if not sub_grid:
            if ind == 0:
                return {}
            else:
                ind -= 1
                continue

        # Reverse so most frequent cycling parameter comes first
        keys, values_lists = zip(*sorted(sub_grid.items())[::-1])
        sizes = [len(v_list) for v_list in values_lists]
        # np.product was deprecated and removed in NumPy 2.0; np.prod is
        # the supported spelling and is numerically identical here.
        total = np.prod(sizes)

        if ind >= total:
            # Try the next grid
            ind -= total
        else:
            # Decode `ind` as a mixed-radix number whose digits index into
            # each parameter's value list.
            out = {}
            for key, v_list, n in zip(keys, values_lists, sizes):
                ind, offset = divmod(ind, n)
                out[key] = v_list[offset]
            return patch_params(out)

    raise IndexError('ParameterGrid index out of range')

ParameterGrid.__getitem__ = monkey_getitem__
Amine Benatmane
  • 251
  • 2
  • 9