OneAtATime

Lawrence D'Oliveiro ldo at geek-central.gen.new_zealand
Sat Apr 15 03:24:42 EDT 2006


#!/usr/bin/python
#+
# This script ensures there is only one instance of a particular command
# running at a time. It may be useful, for example, for potentially
# lengthy cron tasks, to ensure that a new task is not started if the
# previous one has not finished yet.
#
# Invoke this script from the command line as follows:
#
#     OneAtATime [opts] cmd
#
# where cmd is the Shell command line to execute. Valid options are:
#
#     --id=s
#         use the string s as the unique identifier for this command
#         (should not contain "/" or nul characters). Any other execution
#         of OneAtATime specifying the same id will not proceed as long as
#         this instance is still executing.
#         If omitted, the id is computed as a hash on the cmd string.
#
#     --timeout=n
#         If an existing instance has been executing for at least n
#         seconds, assume it's taking too long and try to kill it
#         before proceeding. If omitted, the timeout is infinity.
#
#     --verbose
#         increases the verbosity level by 1 each time it is specified.
#         The initial default verbosity level is 0.
#
#     --wait
#         If specified, then this invocation should sleep until any
#         previous instance is finished (or until the timeout elapses,
#         if specified). If not specified, then this invocation will
#         simply exit if an existing instance is already running.
#         Note that, if more than one invocation is waiting on a
#         previous instance to complete, there is no guarantee in
#         what order they'll execute.
#
# Start of development 2006 March 23 by Lawrence D'Oliveiro
#     <ldo at geek-central.gen.nz>.
# First working version 2006 March 24.
# Fix hang if previous process dies without freeing instance lock and
#     --timeout was not specified 2006 March 24.
# Ensure lock-breaking recovery is still invoked even without --wait,
#     include effective user ID in instance lock name for uniqueness
#     2006 March 25.
# Last modified 2006 March 27.
#-

import sys
import os
import time
import errno
import signal
import getopt
import sha

# Table mapping each signal number to its symbolic name (e.g. the value
# of signal.SIGTERM maps to "SIGTERM"), used when reporting that the
# child was killed by a signal. Where several names share one number,
# the name that dir() lists last is the one kept.
SignalName = {}
for Name in dir(signal) :
  # collect symbolic names for all signals
    Value = getattr(signal, Name)
    if (
            Name.startswith("SIG")
        and
            not Name.startswith("SIG_")
              # SIG_DFL and SIG_IGN are handler dispositions, not
              # signals; being integers too, they would otherwise be
              # entered under their own values and could replace the
              # name of the real signal with the same number
        and
            isinstance(Value, int)
    ) :
        SignalName[Value] = Name
    #end if
#end for

#+
# Mainline
#-

# Parse the command line. The options end up in ID, Timeout, Wait and
# Verbosity; the single remaining argument is the Shell command line Cmd.
# Raises getopt.GetoptError on an unrecognized option, on the wrong
# number of arguments, or on an unusable --id; a non-integer --timeout
# raises ValueError from int().
(Opts, Args) = getopt.getopt \
  (
    sys.argv[1:],
    "",
    ["id=", "timeout=", "verbose", "wait"]
  )
if len(Args) != 1 :
    raise getopt.GetoptError("need exactly one arg, the cmd to execute")
#end if
Cmd = Args[0]
ID = None
Timeout = None
Wait = False
Verbosity = 0
for Keyword, Value in Opts :
    if Keyword == "--id" :
        ID = Value
    elif Keyword == "--timeout" :
        Timeout = int(Value)
    elif Keyword == "--verbose" :
        Verbosity += 1
    elif Keyword == "--wait" :
        Wait = True
    #end if
#end for
if ID is None :
    ID = sha.new(Cmd).hexdigest()[:16]
      # less than 40-char file name should be OK
elif "/" in ID or "\0" in ID :
    # the id becomes part of the lock file names, so a path separator
    # or nul in it cannot yield a usable name; say so up front rather
    # than failing later on some file operation
    raise getopt.GetoptError("--id must not contain \"/\" or nul characters")
#end if

def Debug(Level, Msg) :
    if Verbosity >= Level :
        print Msg
    #end if
#end Debug

# Create a file holding my process ID, work out the names of the two
# lock files, and initialize the state used while acquiring the lock.
PidFileName = "/tmp/OneAtATime-pid-%d" % os.getpid()
PidFile = file(PidFileName, "w")
try :
    PidFile.write("%s\n" % os.getpid())
finally :
    PidFile.close() # don't leave the file open if the write fails
#end try
InstanceLockName = "/tmp/OneAtATime-lock-%d-%s" % (os.geteuid(), ID)
  # presence of this file is used to guard against more than one
  # invocation of a task with the same id at once
InstanceLock2Name = "/tmp/OneAtaTime-lock2-%d-%s" % (os.geteuid(), ID)
  # presence of this file is used to serialize checking for
  # presence of InstanceLockName and of the process whose PID is
  # contained in that file. Like InstanceLockName, it includes the
  # effective user ID, so that different users who happen to use the
  # same id do not wait on, or try to break, each other's secondary
  # lock when their instance locks are in fact independent.
GotLock = False # set once I hold the instance lock
GotLock2 = False # true while I hold the secondary lock
LastLastMod = None
  # modification time of the instance lock file as of the last time
  # OtherPid was read from it
OtherPid = None
  # process ID read from the existing instance lock file, if any
while True :
    # try to get instance lock
    if not GotLock2 :
        while True :
            # grab the secondary lock for checking the instance lock
            try :
                os.symlink(PidFileName, InstanceLock2Name)
                  # guaranteed atomic operation that will fail if
                  # destination name already exists. Note use of
                  # symlink rather than link because my pid file
                  # might be older than secondary lock timeout
                  # if I've been waiting a while.
                GotLock2 = True
                break
            except OSError, (ErrNo, Msg) :
                if ErrNo != errno.EEXIST :
                    # Not bothering to recover from privilege failures
                    raise OSError(ErrNo, Msg)
                #end if
            #end try
          # Somebody else has the lock, check it looks reasonable
            try :
                LastMod = os.lstat(InstanceLock2Name).st_mtime
                  # note use of lstat to find out how long symlink has
                  # been present, not age of pid file
            except OSError, (ErrNo, Msg) :
                if ErrNo == errno.ENOENT :
                    LastMod = None
                else :
                    # Not bothering to recover from privilege failures
                    raise OSError(ErrNo, Msg)
                #end if
            #end try
            if LastMod != None :
                if LastMod + 10 < time.time() :
                    Debug(0, "Breaking secondary lock")
                    try :
                        os.unlink(InstanceLock2Name)
                    except OSError, (ErrNo, Msg) :
                        if ErrNo != errno.ENOENT :
                            # Not bothering to recover from privilege 
failures
                            raise OSError(ErrNo, Msg)
                        #end if
                    #end try
                else :
                    # lock looks valid, wait awhile and try again
                    time.sleep(1)
                #end if
            else :
                pass # disappeared while I was looking at it
            #end if
        #end while
    #end if not GotLock2
    # At this point, I have the secondary lock, so nobody else will be
    # trying the following operations at the same time.
    try :
        LastMod = os.stat(InstanceLockName).st_mtime
    except OSError, (ErrNo, Msg) :
        if ErrNo == errno.ENOENT :
            os.link(PidFileName, InstanceLockName)
            GotLock = True
        else :
            # Not bothering to recover from privilege failures
            raise OSError(ErrNo, Msg)
        #end if
    #end try
    if GotLock :
        Debug(2, "Got the lock")
        break
    #end if
    if LastMod != LastLastMod :
        # instance lock file seems to have changed, (re)fetch OtherPid.
        Debug(2, "Getting existing process ID")
        try :
            PidFile = file(InstanceLockName, "r")
        except OSError, (ErrNo, Msg) :
            if ErrNo == errno.ENOENT :
                PidFile = None
            else :
                # Not bothering to recover from privilege failures
                raise OSError(ErrNo, Msg)
            #end if
        #end try
        if PidFile != None :
            OtherPid = PidFile.readline()
            OtherPid = int(OtherPid.replace("\n", "").replace(" ", ""))
              # not bothering to guard against bad pidfile contents
            PidFile.close()
            Debug(1, "Existing process ID is %d" % OtherPid)
        else :
            OtherPid = None
        #end if
        LastLastMod = LastMod
    #end if
    if OtherPid != None :
      # check the process still exists
        try :
            os.kill(OtherPid, 0)
        except OSError, (ErrNo, Msg) :
            if ErrNo == errno.ESRCH :
                OtherPid = None # nope, it's gone
            else :
                # Not bothering to recover from privilege failures
                raise OSError(ErrNo, Msg)
            #end if
        #end try
    #end if
    BreakLock = \
            OtherPid == None \
        or \
            Timeout != None and time.time() > LastMod + Timeout
    if BreakLock :
        Debug(0, "Breaking lock and killing existing lock owner")
        if OtherPid != None :
            ExtremePrejudice = False
            while True :
                if ExtremePrejudice :
                    Signal = signal.SIGKILL
                else :
                    Signal = signal.SIGTERM # give it a chance to clean 
up
                #end if
                try :
                    os.kill(OtherPid, Signal)
                except OSError, (ErrNo, Msg) :
                    if ErrNo == errno.ESRCH :
                        break # OK, it's gone
                    else :
                        # Not bothering to recover from privilege 
failures
                        raise OSError(ErrNo, Msg)
                    #end if
                #end try
                if ExtremePrejudice :
                    break # it should be gone
                StartWait = time.time()
                while True :
                    # wait up to a certain amount of time for process to
                    # terminate by itself
                    try :
                        os.kill(OtherPid, 0)
                        StillThere = True
                    except OSError, (ErrNo, Msg) :
                        if ErrNo == errno.ESRCH :
                            StillThere = False
                        else :
                            # Not bothering to recover from privilege 
failures
                            raise OSError(ErrNo, Msg)
                        #end if
                    #end try
                    if not StillThere or time.time() - StartWait > 5.0 :
                      # Note this timeout must be less than secondary 
lock timeout,
                      # otherwise another invocation could take 
secondary lock
                      # away from me
                        break
                    time.sleep(1)
                #end while
                if not StillThere :
                    break # OK, it's gone
                ExtremePrejudice = True # out of patience
            #end while
            OtherPid = None
        #end if
        # Lock owner killed, now clean up lock file (if owner
        # didn't already clean it up) so I can grab it for myself
        try :
            os.unlink(InstanceLockName)
        except OSError, (ErrNo, Msg) :
            if ErrNo != errno.ENOENT :
                # Not bothering to recover from privilege failures
                raise OSError(ErrNo, Msg)
            #end if
        #end try
    elif OtherPid != None :
        if GotLock2 :
          # mustn't hang on to secondary lock for too long
            os.unlink(InstanceLock2Name)
            GotLock2 = False
        #end if
        if not Wait :
            Debug(1, "Can't get lock, exiting")
            break
        #end if
        time.sleep(1)
    #end if
#end while
# Release the secondary lock (if still held) and remove my PID file
# name. Then, if I got the instance lock, run Cmd under /bin/bash, wait
# for it (killing it once it has run longer than Timeout seconds, if a
# timeout was given), report how it ended, and release the instance lock.
if GotLock2 :
    os.unlink(InstanceLock2Name)
#end if
os.unlink(PidFileName)
if GotLock :
    try :
        Debug(2, "Spawning \"%s\"" % Cmd)
        ChildPid = os.spawnl \
          (
            os.P_NOWAIT,
            "/bin/bash",
            "/bin/bash", "-c", Cmd
          )
        if Timeout is not None :
            StartWait = time.time()
            while True :
                # poll every 5 seconds for child exit or timeout expiry
                time.sleep(5)
                (ChildPidAgain, ChildStatus) = \
                    os.waitpid(ChildPid, os.WNOHANG)
                if ChildPidAgain != 0 :
                    break # child has terminated
                if time.time() > StartWait + Timeout :
                    Debug(0, "Command taking too long, killing")
                    os.kill(ChildPid, signal.SIGKILL)
                    ChildStatus = os.waitpid(ChildPid, 0)[1]
                    break
                #end if
            #end while
        else :
            ChildStatus = os.waitpid(ChildPid, 0)[1]
        #end if
        if os.WIFEXITED(ChildStatus) :
            if os.WEXITSTATUS(ChildStatus) != 0 :
                Debug \
                  (
                    1,
                    "Child exited with status %d"
                    %
                    os.WEXITSTATUS(ChildStatus)
                  )
            #end if
        elif os.WIFSIGNALED(ChildStatus) :
            SigValue = os.WTERMSIG(ChildStatus)
            SigName = SignalName.get(SigValue, "?")
            Debug(0, "Child died from signal %d (%s)" % (SigValue, SigName))
        else :
            Debug(0, "Child ?terminated? status %d" % ChildStatus)
        #end if
    finally :
        # always give up the instance lock, even if spawning or
        # waiting for the command raised an exception
        os.unlink(InstanceLockName)
    #end try
#end if



More information about the Python-list mailing list