[scikit-learn] Automatic ThresholdClassifier based on cost-function - Classifier Interface?
Anton Suchaneck
a.suchaneck at gmail.com
Fri Nov 11 05:23:12 EST 2016
Hi!
I tried writing a ThresholdClassifier that wraps any classifier with
predict_proba() and, based on a cost function, adjusts the threshold for
predict(). This helps with imbalanced data.
My current cost function scores +cost_factor for a true positive and -1 for
a false positive.
It seems to run, but I'm not sure if I got the API for a classifier right.
Can you tell me whether this is how the functions should be implemented to
play together with other parts of sklearn?
Especially parameter settings for base.clone both in klass.__init__ and
.set_params() seemed weird.
Here is the code. The class ThresholdClassifier wraps a clf (a
RandomForestClassifier in this case).
Anton
from sklearn.base import BaseEstimator, ClassifierMixin
from functools import partial
def find_threshold_cost_factor(clf, X, y, cost_factor):
    """Find the probability threshold that maximises a simple cost score.

    Samples are ranked by the probability in column 1 of
    ``clf.predict_proba(X)``, highest first.  Walking down the ranking,
    each sample with label 1 adds ``cost_factor`` to a running score and
    each sample with label 0 subtracts 1; other labels are ignored.

    The running score is only compared against the best score once all
    samples sharing the same probability have been counted: a
    ``proba >= threshold`` cut always accepts the whole tied group, so a
    score taken in the middle of a group is not achievable.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba(X)`` whose result can
        be indexed as ``[:, 1]``.
    X : array-like
        Samples passed through to ``clf.predict_proba``.
    y : iterable
        True labels, aligned with the rows of ``X``.
    cost_factor : number
        Score gained for each label-1 sample at or above the threshold.

    Returns
    -------
    (top_threshold, top_score) : tuple
        ``top_threshold`` is the probability at which the best score was
        reached, or ``None`` when no cut scores above 0.  ``top_score`` is
        that best score (0 when no cut is beneficial).
    """
    # FIXME: assumes 2 classes
    positive_proba = clf.predict_proba(X)[:, 1]
    # Sort on the probability only, so labels never act as a tie-breaker.
    ranked = sorted(zip(positive_proba, y),
                    key=lambda pair: pair[0], reverse=True)

    top_score = 0
    top_threshold = None
    cur_score = 0
    last_index = len(ranked) - 1
    for index, (proba, label) in enumerate(ranked):
        if label == 0:
            cur_score -= 1
        elif label == 1:
            cur_score += cost_factor

        # Only a complete group of equal probabilities is a usable cut.
        group_ends = index == last_index or ranked[index + 1][0] != proba
        if group_ends and cur_score > top_score:
            top_score = cur_score
            top_threshold = proba
    return top_threshold, top_score
class ThresholdClassifier(BaseEstimator, ClassifierMixin):
    """Wrap a classifier and choose its decision threshold with a cost function.

    ``clf`` is any classifier with ``fit``, ``predict_proba``,
    ``get_params`` and ``set_params``.  ``find_threshold`` is a callable
    ``find_threshold(clf, X, y) -> (threshold, score)``.

    Parameters of the wrapped classifier are exposed flat (not prefixed
    with ``clf__``): ``get_params`` merges them into this estimator's own
    parameters and ``set_params`` / ``__init__`` forward every unknown
    keyword to ``clf.set_params``.
    """

    def __init__(self, clf, find_threshold, **params):
        """Store the wrapped classifier and threshold finder.

        Extra keyword arguments go through ``set_params``, so they may set
        ``threshold`` or any parameter of ``clf``.
        """
        self.clf = clf
        self.find_threshold = find_threshold
        self.threshold = None
        self.set_params(**params)

    def score(self, X, y, sample_weight=None):
        """Return the best cost score ``find_threshold`` reaches on ``(X, y)``.

        NOTE: the threshold is re-optimised on the given data rather than
        using ``self.threshold``, and ``sample_weight`` is ignored.
        """
        _threshold, score = self.find_threshold(self.clf, X, y)
        return score

    def fit(self, X, y):
        """Fit the wrapped classifier, then pick the threshold on the same data.

        Returns ``self`` so calls can be chained.
        """
        self.clf.fit(X, y)
        self.threshold, _score = self.find_threshold(self.clf, X, y)
        self.classes_ = self.clf.classes_
        return self

    def predict(self, X):
        """Predict class labels by thresholding the column-1 probability.

        Samples whose probability is at or above ``self.threshold`` get
        ``clf.classes_[1]``; all others get ``clf.classes_[0]``.  When the
        threshold is ``None`` (no beneficial threshold was found), every
        sample gets ``clf.classes_[0]``.
        """
        # FIXME assumes 2 classes
        positive_proba = self.clf.predict_proba(X)[:, 1]
        # An infinite threshold accepts nothing, which avoids comparing
        # the probabilities against None.
        threshold = float("inf") if self.threshold is None else self.threshold
        is_positive = positive_proba >= threshold
        return self.clf.classes_[is_positive.astype(int)]

    def predict_proba(self, X):
        """Return the wrapped classifier's probabilities unchanged."""
        return self.clf.predict_proba(X)

    def set_params(self, **params):
        """Set wrapper parameters; forward all other keywords to ``clf``.

        ``clf`` is replaced first (when given), so forwarded parameters
        are applied to the new wrapped classifier.
        """
        for param_name in ("clf", "find_threshold", "threshold"):
            if param_name in params:
                setattr(self, param_name, params.pop(param_name))
        if params:
            self.clf.set_params(**params)
        return self

    def get_params(self, deep=True):
        """Return wrapper parameters merged with those of ``clf``.

        On a name clash the wrapped classifier's value wins, because its
        parameters are merged in last.
        """
        params = {
            "clf": self.clf,
            "find_threshold": self.find_threshold,
            "threshold": self.threshold,
        }
        params.update(self.clf.get_params(deep))
        return params
# Demo: tune a thresholded RandomForest on an imbalanced synthetic dataset
# and report the cost-based score and the confusion matrix on held-out data.
if __name__ == '__main__':
    import numpy as np
    import random
    # sklearn.grid_search and sklearn.cross_validation were removed from
    # scikit-learn; sklearn.model_selection provides both names.
    from sklearn.model_selection import RandomizedSearchCV, train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.datasets import make_classification
    from sklearn.metrics import (make_scorer, classification_report,
                                 confusion_matrix)

    np.random.seed(111)
    random.seed(111)

    X, y = make_classification(1000,
                               n_features=20,
                               n_informative=4,
                               n_redundant=0,
                               n_repeated=0,
                               n_clusters_per_class=4,
                               # class_sep=0.5,
                               weights=[0.90]
                               )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y)

    for cost in [10]:
        # Use the loop variable so each listed cost is actually applied.
        find_threshold = partial(find_threshold_cost_factor, cost_factor=cost)

        def scorer(clf, X, y):
            """Score an estimator by the best cost score on (X, y)."""
            return find_threshold(clf, X, y)[1]

        clfs = [RandomizedSearchCV(
            ThresholdClassifier(RandomForestClassifier(),
                                find_threshold),
            {"n_estimators": [100, 200],
             "criterion": ["entropy"],
             "min_samples_leaf": [1, 5],
             "class_weight": ["balanced", None],
             },
            cv=3,
            # Get rid of this, by letting the classifier tell its
            # cost-based score?
            scoring=scorer,
            n_iter=8,
            n_jobs=4),
        ]
        for clf in clfs:
            clf.fit(X_train, y_train)
            clf_best = clf.best_estimator_
            print(clf_best, cost, clf_best.score(X_test, y_test))
            print(confusion_matrix(y_test, clf_best.predict(X_test)))
            # print(find_threshold(clf_best, X_train, y_train))
            # print(clf_best.threshold,
            #       sorted(zip(clf_best.predict_proba(X_train)[:, 1], y_train),
            #              reverse=True)[:20])
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/scikit-learn/attachments/20161111/8bce70ab/attachment.html>
More information about the scikit-learn
mailing list