[Tutor] need help parsing multiple log files to create a timeline. what am I doing wrong??

Michael Cole contactmikecole at gmail.com
Tue Feb 18 15:42:29 EST 2020


I am working on parsing a bunch of log files to construct a timeline to
represent changes on a network so that I can view events on different
machines in parallel with matching timestamps. However, my program is
getting bogged down and doesn't seem to be giving the right output.

expected behavior:

each row contains the name of the file it came from, and each cell in the
row either is blank or contains a log with a timestamp in the same column
as the matching timestamp from the timestamp row.

observed behavior:

readlines() reads the lines correctly. The timelines and headers are all
built correctly.

the data in each row is not populated fully, with many entries still left
blank. some entries are filled, but only a tiny fraction.

I am seeing that python is taking 99% of the CPU and that a lot of data is
missing from the csv file generated.

import globimport osimport csvimport sysimport datetime
def createCSV(headers, rows, timestamps, csvFileName):
    with open (csvFileName, 'w+') as csvfile:

        csvwriter = csv.writer(csvfile)
        timestamps.insert(0, "timestamps")
        csvwriter.writerow(timestamps)

        for row in rows:
            row.insert(0, headers[rows.index(row)])
            csvwriter.writerow(row)
# checks to see if beginning of line matches format DD-MM
HH:MM:SS.MSXdef getDateTime(line):
    if(line.strip()):
        if(line != '\n'):
            time = line.split(' ')[1]
            if(len(time.split(":")) == 4):
                hour = time.split(":")[0]
                minute = time.split(":")[1]
                second = time.split(":")[2].split(".")[0]
                microsecond = time.split(":")[2].split(".")[1]
                if(datetime.time(int(hour), int(minute), int(second),
int(microsecond))):
                    stamp = line.split(' ')[0] + " " + line.split(' ')[1]
                    return stamp
                else:
                    return 0
            else:
                return 0
    return 0
def listToString(s):
    str1 = ""
    return (str1.join(s))
def parseLogs(logFilePaths, csvFileName):
    rows = []
    headers = []
    timestamps = []

    # parse files to get headers and timestamps
    for logFilePath in logFilePaths:
        logFile = open(logFilePath, 'r')
        fileName = logFilePath.split('/')[-3:]
        fileNameStr = listToString(fileName).split("-")[2].split("
")[0].split("20")[1]
        headers.append(fileNameStr)
        lines = logFile.readlines()
        for line in lines:
            stamp = getDateTime(line)
            # append all valid timestamps to the array
            if(stamp != 0):
                timestamps.append(stamp)
        logFile.close()

    # remove duplicate timestamps and sort
    timestamps = list(dict.fromkeys(timestamps))
    timestamps.sort()

    # parse files again to get data and put it in the right slot in the row
    for logFilePath in logFilePaths:
        logFile = open(logFilePath, "r")
        lines = logFile.readlines()
        row = []

        # zero fill row, ensuring row array is same size as timestamps array
        for timestamp in timestamps:
            row.append("")

        # find the index of the corresponding timestamp for each line
in the file
        linecount = 0
        for line in lines:
            while linecount < len(timestamps):
                if line.strip() and getDateTime(line) != 0:
                    try:
                        index = timestamps.index(getDateTime(line))
                        row[index] = line
                        linecount += 1
                        break
                    except ValueError:
                        row[linecount] = "!!XX!!"
                        linecount += 1
                        pass

        rows.append(row)
        logFile.close()

    createCSV(headers, rows, timestamps, csvFileName)
def main ():

    # default logfile base path and csv file name
    logFileBasePath = ""
    csvFileName = "Log.csv"

    # parsing command line args
    numArgs = len (sys.argv) - 1
    if(numArgs == 0):
        print ("ERROR: You must specify the path to a directory of logfiles")
        return
    elif(numArgs > 1):
        print ("ERROR: Too many arguments")
        return
    else:
        # appending slash to path if not already there
        arg = sys.argv[1]
        if(arg[len (arg) - 1] != "/"):
            arg = arg + "/"
        logFileBasePath = arg

        # csv file will be placed adjacent to the logs, and will be
named the name of its containing folder
        csvFileName = logFileBasePath + os.path.splitext
(os.path.basename (os.path.dirname (logFileBasePath)))[0] + ".csv"

    logFilePaths = glob.glob (logFileBasePath + "*.log")
    if(len (logFilePaths) == 0):
        print ("ERROR: No logfiles found at: ", logFileBasePath)
        return
    else:
        parseLogs(logFilePaths, csvFileName )

main ()

Example log format:

---------- 2020-02-13 18:06:45 -0600: Logging Started ----------
02-13 18:18:24.370: 00:12:42: INFO [XMOS] Media clock unlocked!
reason: unlocked on zeroing
02-13 18:18:24.421: XMOS clock update. state:0 source:ff, rate:ff
02-13 18:18:24.656: 00:12:43: INFO [XMOS] out of sequence error. this
seq: 16 last seq: 41 timestamp: fceb397f
02-13 18:18:24.709: 00:12:43: INFO [XMOS] out of sequence error. this
seq: 57 last seq: 80 timestamp: fd3a1012
02-13 18:18:31.830: XMOS clock update. state:1 source:ff, rate:ff
02-13 18:46:41.844: 00:41:00: INFO [XMOS] Media clock unlocked!
reason: unlocked on zeroing
02-13 18:46:41.896: XMOS clock update. state:0 source:ff, rate:ff
02-13 18:46:42.131: 00:41:00: INFO [XMOS] out of sequence error. this
seq: 86 last seq: 111 timestamp: 38052b81
02-13 18:46:42.183: 00:41:00: INFO [XMOS] out of sequence error. this
seq: 126 last s


More information about the Tutor mailing list