[SciPy-User] Logistic regression using SciPy

David Warde-Farley d.warde.farley at gmail.com
Sun Dec 9 23:22:35 EST 2012


First, the way you've written the log likelihood is numerically
unstable. Consider simplifying the expression (using logarithm laws
and breaking apart the logistic function) and using the log1p function
where appropriate.
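
For example, with z = np.dot(mX, vBeta): log(sigmoid(z)) =
-log(1 + exp(-z)) and log(1 - sigmoid(z)) = -log(1 + exp(z)), and
np.logaddexp(0, z) evaluates log(1 + exp(z)) stably (it applies the
log1p trick internally). A sketch of a stable replacement for your
logLikelihoodLogit, untested:

    import numpy as np

    def logLikelihoodLogitStable(vBeta, mX, vY):
        # -log(sigmoid(z))     = logaddexp(0, -z)
        # -log(1 - sigmoid(z)) = logaddexp(0, z)
        vZ = np.dot(mX, vBeta)
        return np.sum(vY * np.logaddexp(0, -vZ)
                      + (1 - vY) * np.logaddexp(0, vZ))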

Second, the optimization problem is going to be extremely
ill-conditioned given the very different scales of your predictors.
You should probably mean-center each predictor and divide it by its
standard deviation.
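
Something along these lines, assuming your intercept stays in
column 0 (untested):

    # standardize each predictor column; leave the intercept alone
    mX_std = mX.copy()
    mX_std[:, 1:] = ((mX[:, 1:] - mX[:, 1:].mean(axis=0))
                     / mX[:, 1:].std(axis=0))

If you need the coefficients on the original scale, remember to undo
the scaling on the estimates afterwards.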

Third, there's a check_grad function in scipy.optimize that can be
used to troubleshoot gradient issues.
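
Usage is roughly the following; note that check_grad expects the
gradient to return a 1-d array the same shape as the parameter vector
(yours currently returns a column vector):

    from scipy.optimize import check_grad

    # compares the analytic gradient against a finite-difference
    # approximation; the returned norm should be close to zero
    err = check_grad(logLikelihoodLogit, likelihoodScore, vBeta0, mX, vY)
    print(err)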

Fourth, there's a pre-rolled implementation of this in scikit-learn
that will probably be a good deal faster (it wraps a C library) and is
certainly better tested than a home-rolled version.
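
Roughly, assuming you have scikit-learn installed (note that liblinear
always applies an L2 penalty, so a large C approximates the plain
maximum-likelihood fit):

    from sklearn.linear_model import LogisticRegression

    # fit on the raw predictors; fit_intercept adds the constant term,
    # so don't prepend a column of ones yourself
    clf = LogisticRegression(C=1e6, fit_intercept=True)
    clf.fit(data[:, 1:], vY)
    print(clf.intercept_, clf.coef_)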

David

On Sun, Dec 9, 2012 at 11:01 PM, Fg Nu <fgnu32 at yahoo.com> wrote:
>
>
> I am trying to code up logistic regression in Python using the SciPy "fmin_bfgs" function, but am running into some issues. I wrote functions for the logistic (sigmoid) transformation and for the cost function, and those work fine (I tested them against the optimized parameter values found via canned software, and the results match). I am not so sure of my implementation of the gradient function, but it looks reasonable.
>
> Here is the code:
>
> #==================================================
>
>     # purpose: logistic regression
>     import numpy as np
>     import scipy as sp
>     import scipy.optimize
>
>     import matplotlib as mpl
>     import os
>
>     # prepare the data
>     data = np.loadtxt('data.csv', delimiter=',', skiprows=1)
>     vY = data[:, 0]
>     mX = data[:, 1:]
>     intercept = np.ones(mX.shape[0]).reshape(mX.shape[0], 1)
>     mX = np.concatenate((intercept, mX), axis = 1)
>     iK = mX.shape[1]
>     iN = mX.shape[0]
>
>     # logistic transformation
>     def logit(mX, vBeta):
>         return((1/(1.0 + np.exp(-np.dot(mX, vBeta)))))
>
>     # test function call
>     vBeta0 = np.array([-.10296645, -.0332327, -.01209484, .44626211, .92554137, .53973828,
>         1.7993371, .7148045  ])
>     logit(mX, vBeta0)
>
>     # cost function
>     def logLikelihoodLogit(vBeta, mX, vY):
>         return(-(np.sum(vY*np.log(logit(mX, vBeta)) + (1-vY)*(np.log(1-logit(mX, vBeta))))))
>     logLikelihoodLogit(vBeta0, mX, vY) # test function call
>
>     # gradient function
>     def likelihoodScore(vBeta, mX, vY):
>         return(np.dot(mX.T,
>                       ((np.dot(mX, vBeta) - vY)/
>                        np.dot(mX, vBeta)).reshape(iN, 1)).reshape(iK, 1))
>
>     likelihoodScore(vBeta0, mX, vY).shape # test function call
>
>     # optimize the function (without gradient)
>     optimLogit = scipy.optimize.fmin_bfgs(logLikelihoodLogit,
>                                       x0 = np.array([-.1, -.03, -.01, .44, .92, .53,
>                                                 1.8, .71]),
>                                       args = (mX, vY), gtol = 1e-3)
>
>     # optimize the function (with gradient)
>     optimLogit = scipy.optimize.fmin_bfgs(logLikelihoodLogit,
>                                       x0 = np.array([-.1, -.03, -.01, .44, .92, .53,
>                                                 1.8, .71]), fprime = likelihoodScore,
>                                       args = (mX, vY), gtol = 1e-3)
> #=====================================================
>
> * The first optimization (without gradient) ends with a slew of divide-by-zero warnings.
>
> * The second optimization (with gradient) ends with a "matrices are not aligned" error, which probably means I have the shape of the returned gradient wrong.
>
> Any help with this is appreciated. If anyone wants to try this, the data is included below.
>
>     low,age,lwt,race,smoke,ptl,ht,ui
>     0,19,182,2,0,0,0,1
>     0,33,155,3,0,0,0,0
>     0,20,105,1,1,0,0,0
>     0,21,108,1,1,0,0,1
>     0,18,107,1,1,0,0,1
>     0,21,124,3,0,0,0,0
>     0,22,118,1,0,0,0,0
>     0,17,103,3,0,0,0,0
>     0,29,123,1,1,0,0,0
>     0,26,113,1,1,0,0,0
>     0,19,95,3,0,0,0,0
>     0,19,150,3,0,0,0,0
>     0,22,95,3,0,0,1,0
>     0,30,107,3,0,1,0,1
>     0,18,100,1,1,0,0,0
>     0,18,100,1,1,0,0,0
>     0,15,98,2,0,0,0,0
>     0,25,118,1,1,0,0,0
>     0,20,120,3,0,0,0,1
>     0,28,120,1,1,0,0,0
>     0,32,121,3,0,0,0,0
>     0,31,100,1,0,0,0,1
>     0,36,202,1,0,0,0,0
>     0,28,120,3,0,0,0,0
>     0,25,120,3,0,0,0,1
>     0,28,167,1,0,0,0,0
>     0,17,122,1,1,0,0,0
>     0,29,150,1,0,0,0,0
>     0,26,168,2,1,0,0,0
>     0,17,113,2,0,0,0,0
>     0,17,113,2,0,0,0,0
>     0,24,90,1,1,1,0,0
>     0,35,121,2,1,1,0,0
>     0,25,155,1,0,0,0,0
>     0,25,125,2,0,0,0,0
>     0,29,140,1,1,0,0,0
>     0,19,138,1,1,0,0,0
>     0,27,124,1,1,0,0,0
>     0,31,215,1,1,0,0,0
>     0,33,109,1,1,0,0,0
>     0,21,185,2,1,0,0,0
>     0,19,189,1,0,0,0,0
>     0,23,130,2,0,0,0,0
>     0,21,160,1,0,0,0,0
>     0,18,90,1,1,0,0,1
>     0,18,90,1,1,0,0,1
>     0,32,132,1,0,0,0,0
>     0,19,132,3,0,0,0,0
>     0,24,115,1,0,0,0,0
>     0,22,85,3,1,0,0,0
>     0,22,120,1,0,0,1,0
>     0,23,128,3,0,0,0,0
>     0,22,130,1,1,0,0,0
>     0,30,95,1,1,0,0,0
>     0,19,115,3,0,0,0,0
>     0,16,110,3,0,0,0,0
>     0,21,110,3,1,0,0,1
>     0,30,153,3,0,0,0,0
>     0,20,103,3,0,0,0,0
>     0,17,119,3,0,0,0,0
>     0,17,119,3,0,0,0,0
>     0,23,119,3,0,0,0,0
>     0,24,110,3,0,0,0,0
>     0,28,140,1,0,0,0,0
>     0,26,133,3,1,2,0,0
>     0,20,169,3,0,1,0,1
>     0,24,115,3,0,0,0,0
>     0,28,250,3,1,0,0,0
>     0,20,141,1,0,2,0,1
>     0,22,158,2,0,1,0,0
>     0,22,112,1,1,2,0,0
>     0,31,150,3,1,0,0,0
>     0,23,115,3,1,0,0,0
>     0,16,112,2,0,0,0,0
>     0,16,135,1,1,0,0,0
>     0,18,229,2,0,0,0,0
>     0,25,140,1,0,0,0,0
>     0,32,134,1,1,1,0,0
>     0,20,121,2,1,0,0,0
>     0,23,190,1,0,0,0,0
>     0,22,131,1,0,0,0,0
>     0,32,170,1,0,0,0,0
>     0,30,110,3,0,0,0,0
>     0,20,127,3,0,0,0,0
>     0,23,123,3,0,0,0,0
>     0,17,120,3,1,0,0,0
>     0,19,105,3,0,0,0,0
>     0,23,130,1,0,0,0,0
>     0,36,175,1,0,0,0,0
>     0,22,125,1,0,0,0,0
>     0,24,133,1,0,0,0,0
>     0,21,134,3,0,0,0,0
>     0,19,235,1,1,0,1,0
>     0,25,95,1,1,3,0,1
>     0,16,135,1,1,0,0,0
>     0,29,135,1,0,0,0,0
>     0,29,154,1,0,0,0,0
>     0,19,147,1,1,0,0,0
>     0,19,147,1,1,0,0,0
>     0,30,137,1,0,0,0,0
>     0,24,110,1,0,0,0,0
>     0,19,184,1,1,0,1,0
>     0,24,110,3,0,1,0,0
>     0,23,110,1,0,0,0,0
>     0,20,120,3,0,0,0,0
>     0,25,241,2,0,0,1,0
>     0,30,112,1,0,0,0,0
>     0,22,169,1,0,0,0,0
>     0,18,120,1,1,0,0,0
>     0,16,170,2,0,0,0,0
>     0,32,186,1,0,0,0,0
>     0,18,120,3,0,0,0,0
>     0,29,130,1,1,0,0,0
>     0,33,117,1,0,0,0,1
>     0,20,170,1,1,0,0,0
>     0,28,134,3,0,0,0,0
>     0,14,135,1,0,0,0,0
>     0,28,130,3,0,0,0,0
>     0,25,120,1,0,0,0,0
>     0,16,95,3,0,0,0,0
>     0,20,158,1,0,0,0,0
>     0,26,160,3,0,0,0,0
>     0,21,115,1,0,0,0,0
>     0,22,129,1,0,0,0,0
>     0,25,130,1,0,0,0,0
>     0,31,120,1,0,0,0,0
>     0,35,170,1,0,1,0,0
>     0,19,120,1,1,0,0,0
>     0,24,116,1,0,0,0,0
>     0,45,123,1,0,0,0,0
>     1,28,120,3,1,1,0,1
>     1,29,130,1,0,0,0,1
>     1,34,187,2,1,0,1,0
>     1,25,105,3,0,1,1,0
>     1,25,85,3,0,0,0,1
>     1,27,150,3,0,0,0,0
>     1,23,97,3,0,0,0,1
>     1,24,128,2,0,1,0,0
>     1,24,132,3,0,0,1,0
>     1,21,165,1,1,0,1,0
>     1,32,105,1,1,0,0,0
>     1,19,91,1,1,2,0,1
>     1,25,115,3,0,0,0,0
>     1,16,130,3,0,0,0,0
>     1,25,92,1,1,0,0,0
>     1,20,150,1,1,0,0,0
>     1,21,200,2,0,0,0,1
>     1,24,155,1,1,1,0,0
>     1,21,103,3,0,0,0,0
>     1,20,125,3,0,0,0,1
>     1,25,89,3,0,2,0,0
>     1,19,102,1,0,0,0,0
>     1,19,112,1,1,0,0,1
>     1,26,117,1,1,1,0,0
>     1,24,138,1,0,0,0,0
>     1,17,130,3,1,1,0,1
>     1,20,120,2,1,0,0,0
>     1,22,130,1,1,1,0,1
>     1,27,130,2,0,0,0,1
>     1,20,80,3,1,0,0,1
>     1,17,110,1,1,0,0,0
>     1,25,105,3,0,1,0,0
>     1,20,109,3,0,0,0,0
>     1,18,148,3,0,0,0,0
>     1,18,110,2,1,1,0,0
>     1,20,121,1,1,1,0,1
>     1,21,100,3,0,1,0,0
>     1,26,96,3,0,0,0,0
>     1,31,102,1,1,1,0,0
>     1,15,110,1,0,0,0,0
>     1,23,187,2,1,0,0,0
>     1,20,122,2,1,0,0,0
>     1,24,105,2,1,0,0,0
>     1,15,115,3,0,0,0,1
>     1,23,120,3,0,0,0,0
>     1,30,142,1,1,1,0,0
>     1,22,130,1,1,0,0,0
>     1,17,120,1,1,0,0,0
>     1,23,110,1,1,1,0,0
>     1,17,120,2,0,0,0,0
>     1,26,154,3,0,1,1,0
>     1,20,106,3,0,0,0,0
>     1,26,190,1,1,0,0,0
>     1,14,101,3,1,1,0,0
>     1,28,95,1,1,0,0,0
>     1,14,100,3,0,0,0,0
>     1,23,94,3,1,0,0,0
>     1,17,142,2,0,0,1,0
>     1,21,130,1,1,0,1,0
>
> Thanks.
>


