[scikit-learn] Automatic ThresholdClassifier based on cost-function - Classifier Interface?

Fri Nov 11 05:23:12 EST 2016

Hi!

I tried writing a ThresholdClassifier that wraps any classifier providing
predict_proba() and adjusts the decision threshold used by predict() based
on a cost function. This helps with imbalanced data.
My current cost function assigns a score of +cost_factor for a true positive
and -1 for a false positive.
It seems to run, but I'm not sure if I got the API for a classifier right.

Can you tell me whether this is how the functions should be implemented to
play together with other parts of sklearn?

In particular, the parameter handling required by base.clone, both in
klass.__init__ and in .set_params(), seemed weird.

Here is the code. The class ThresholdClassifier wraps a classifier (clf),
a RandomForestClassifier in this case.

Anton

from functools import partial

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin

def find_threshold_cost_factor(clf, X, y, cost_factor):
    """Find the probability threshold that maximises a simple cost score.

    Samples are ranked by their predicted probability in column 1 of
    ``clf.predict_proba(X)``, highest first.  Every sample classified as
    positive contributes ``+cost_factor`` if its label is 1 and ``-1`` if its
    label is 0; the threshold with the highest cumulative score wins.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba(X)``.
    X : array-like
        Samples passed straight to ``clf.predict_proba``.
    y : iterable
        True labels, expected to be 0 or 1 (other values are ignored).
    cost_factor : number
        Reward for each true positive; each false positive costs 1.

    Returns
    -------
    (top_threshold, top_score) : tuple
        ``top_threshold`` is the lowest probability that should still be
        classified positive (to be used with ``>=``), or ``None`` if no
        threshold reaches a score above 0.  ``top_score`` is the score
        achieved at that threshold (0 when ``top_threshold`` is ``None``).
    """
    y_pred = clf.predict_proba(X)

    # FIXME: assumes 2 classes
    ranked = sorted(zip(y_pred[:, 1], y), reverse=True)
    last_idx = len(ranked) - 1

    top_score = 0
    top_threshold = None
    cur_score = 0
    for idx, (y_pred_el, y_el) in enumerate(ranked):
        if y_el == 0:
            cur_score -= 1
        if y_el == 1:
            cur_score += cost_factor

        # A threshold applied with ">=" accepts every sample sharing the same
        # probability, so a candidate is only meaningful once the whole group
        # of tied probabilities has been accumulated.  Scoring in the middle
        # of a tie would overstate what the threshold can actually achieve.
        end_of_tie_group = idx == last_idx or ranked[idx + 1][0] != y_pred_el
        if end_of_tie_group and cur_score > top_score:
            top_score = cur_score
            top_threshold = y_pred_el
    return top_threshold, top_score

class ThresholdClassifier(BaseEstimator, ClassifierMixin):
    """Wrap a probabilistic classifier and choose its decision threshold.

    The wrapped classifier ``clf`` is fitted as usual; afterwards
    ``find_threshold(clf, X, y)`` is called to obtain a ``(threshold, score)``
    pair and the threshold is stored for use in ``predict``.

    Parameters
    ----------
    clf : estimator
        Classifier providing ``fit``, ``predict_proba``, ``get_params`` and
        ``set_params``.
    find_threshold : callable
        Called as ``find_threshold(clf, X, y)``; must return a
        ``(threshold, score)`` tuple.
    **params
        Forwarded to ``set_params``: ``clf``, ``find_threshold`` and
        ``threshold`` are set on the wrapper, anything else is passed on to
        the wrapped classifier.

    Notes
    -----
    ``get_params`` / ``set_params`` expose the wrapped classifier's
    parameters un-prefixed, next to the wrapper's own, so a search grid can
    name e.g. ``n_estimators`` directly.
    """

    def __init__(self, clf, find_threshold, **params):
        self.clf = clf
        self.find_threshold = find_threshold
        self.threshold = None
        self.set_params(**params)

    def score(self, X, y, sample_weight=None):
        """Return the score reported by ``find_threshold`` on ``(X, y)``.

        ``sample_weight`` is accepted for signature compatibility and is
        not used.
        """
        _threshold, score = self.find_threshold(self.clf, X, y)
        return score

    def fit(self, X, y):
        """Fit the wrapped classifier, then derive the threshold on ``(X, y)``.

        Returns ``self`` so that calls can be chained, as scikit-learn
        estimators are expected to do.
        """
        self.clf.fit(X, y)
        self.threshold, _score = self.find_threshold(self.clf, X, y)
        self.classes_ = self.clf.classes_
        return self

    def predict(self, X):
        """Return a boolean array: probability of column 1 >= threshold.

        A threshold of ``None`` (no cut-off was found worthwhile) yields an
        all-``False`` result instead of failing on the comparison.
        """
        y_score = self.clf.predict_proba(X)
        if self.threshold is None:
            return np.zeros(y_score.shape[0], dtype=bool)
        # FIXME: assumes 2 classes
        return np.array(y_score[:, 1] >= self.threshold)

    def predict_proba(self, X):
        """Return the wrapped classifier's class probabilities unchanged."""
        return self.clf.predict_proba(X)

    def set_params(self, **params):
        """Set wrapper parameters; forward all remaining ones to ``clf``."""
        # The wrapper's own parameters are consumed first (so a new ``clf``
        # is in place before the remaining parameters are forwarded to it).
        for param_name in ("clf", "find_threshold", "threshold"):
            if param_name in params:
                setattr(self, param_name, params.pop(param_name))
        self.clf.set_params(**params)
        return self

    def get_params(self, deep=True):
        """Return wrapper parameters merged with those of ``clf``."""
        params = {
            "clf": self.clf,
            "find_threshold": self.find_threshold,
            "threshold": self.threshold,
        }
        params.update(self.clf.get_params(deep))
        return params

if __name__ == '__main__':
    # Demo: random search over RandomForest parameters on an imbalanced
    # synthetic data set, scored with the cost-based threshold function.
    import random

    import numpy as np
    # sklearn.grid_search and sklearn.cross_validation were deprecated in
    # scikit-learn 0.18 and removed later; model_selection replaces both.
    from sklearn.model_selection import RandomizedSearchCV, train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.datasets import make_classification
    from sklearn.metrics import (make_scorer, classification_report,
                                 confusion_matrix)

    np.random.seed(111)
    random.seed(111)

    X, y = make_classification(1000,
                               n_features=20,
                               n_informative=4,
                               n_redundant=0,
                               n_repeated=0,
                               n_clusters_per_class=4,
                               # class_sep=0.5,
                               weights=[0.90]
                               )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y)

    for cost in [10]:
        # Use the loop variable so that every listed cost is actually applied.
        find_threshold = partial(find_threshold_cost_factor, cost_factor=cost)

        def scorer(clf, X, y):
            # Scorer callable for the search: the best achievable cost score.
            return find_threshold(clf, X, y)[1]

        clfs = [
            RandomizedSearchCV(
                ThresholdClassifier(RandomForestClassifier(), find_threshold),
                {"n_estimators": [100, 200],
                 "criterion": ["entropy"],
                 "min_samples_leaf": [1, 5],
                 "class_weight": ["balanced", None],
                 },
                cv=3,
                # TODO: get rid of this by letting the classifier report its
                # cost-based score itself?
                scoring=scorer,
                n_iter=8,
                n_jobs=4),
        ]

        for clf in clfs:
            clf.fit(X_train, y_train)
            clf_best = clf.best_estimator_
            print(clf_best, cost, clf_best.score(X_test, y_test))
            print(confusion_matrix(y_test, clf_best.predict(X_test)))
            # Debugging aids:
            # print(find_threshold(clf_best, X_train, y_train))
            # print(clf_best.threshold,
            #       sorted(zip(clf_best.predict_proba(X_train)[:, 1], y_train),
            #              reverse=True)[:20])
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/scikit-learn/attachments/20161111/8bce70ab/attachment.html>