[SciPy-user] What can be improved ?

Wed May 16 11:54:09 EDT 2007

On Wed, May 16, 2007 at 11:38:53AM +0200, Stef Mientki wrote:
> hello,
> 
> I've just written a function,
> (with a lot of trial and error,
> converting strings to float, reshaping arrays etc)
> to read a tab delimited file, exported from Excel,
> and I'm glad it's working ok now.
> 
> But I've the unpleasant feeling, that this function is written in a very 
> clumsy way,
> so may I ask some guru for some comment about improvements.

First thing I would do would be to look at the csv module included with
Python :) That'll handle most of the parsing of the file, and you just
then have to worry about converting the columns from strings to arrays.

Also,
- "from ... import *" inside functions is deprecated; you'll get a warning.
- Instead of appending to arrays using r_, build a list and convert it to an
  array at the end.

import csv
from datetime import datetime

import numpy as N

def readSenseWearTabFile(filename, print_info=False, sample_reduction=5):
    """Read a tab-delimited SenseWear export into a float array.

    The first line of the file holds the column names.  The first data
    line is used only to determine the start time and is not included in
    the result.  In every following line the timestamp in column 0 is
    replaced by the number of whole minutes elapsed since that start time.

    Parameters
    ----------
    filename : str
        Path of the tab-delimited file (as exported from Excel).
    print_info : bool, optional
        If true, print each column name, one per line.
    sample_reduction : int, optional
        Divisor applied to the size of a time gap (in minutes) to decide
        how many all-zero rows are inserted for it.  Must be >= 1.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(number_of_columns, number_of_rows)``;
        row 0 is the time in minutes since the first data line.

    Raises
    ------
    ValueError
        If ``sample_reduction`` is smaller than 1, if the file lacks a
        header line or a first data line, or if a field cannot be
        converted to float.
    """
    if sample_reduction < 1:
        raise ValueError("sample_reduction must be >= 1")

    time_format = "%Y-%m-%d %H:%M"

    def parse_time(field):
        # Only "YYYY-MM-DD HH:MM" is parsed; anything after it (seconds)
        # is ignored.
        return datetime.strptime(field[:16], time_format)

    def sfloat(s):
        # Excel exports empty cells as empty strings; treat them as 0.
        return 0.0 if s == '' else float(s)

    # newline='' is what the csv module expects for files it reads.
    with open(filename, newline='') as fo:
        reader = csv.reader(fo, dialect='excel-tab')
        column_names = next(reader, None)
        first_row = next(reader, None)
        if not column_names or not first_row:
            raise ValueError(
                "%s must contain a header line and at least one data line"
                % (filename,))

        if print_info:
            for name in column_names:
                print(name)

        n_cols = len(column_names)
        start = parse_time(first_row[0])
        prev_tyd = 0     # time (in minutes) of the previous sample
        data = []
        for vals in reader:
            if not vals:
                # csv yields an empty list for a blank line; skip it.
                continue

            # calculate number of whole minutes from start
            elapsed = parse_time(vals[0]) - start
            tyd = elapsed.seconds // 60 + elapsed.days * 24 * 60

            # if there are sample-sets missing, fill the gap with empty
            # (all-zero) sample-sets, scaled down by the sample reduction
            if tyd - prev_tyd > 1:
                n_missing = (tyd - prev_tyd) // sample_reduction
                data.extend([0.0] * n_cols for _ in range(n_missing))

            prev_tyd = tyd    # remember the time of this sample-set
            vals[0] = tyd     # replace the datetime with number of minutes

            # be sure all lines are of equal length
            # (sometimes Excel omits the last columns if they are empty)
            if len(vals) < n_cols:
                vals.extend([0] * (n_cols - len(vals)))

            data.append([sfloat(v) for v in vals])

    # reshape keeps the (columns, rows) layout even when no rows were read
    return N.array(data, dtype=float).reshape(-1, n_cols).T

> 
> thanks,
> Stef Mientki
> 
> 
> # 
> ******************************************************************************
> # 
> ******************************************************************************
> def Read_SenseWear_Tab_File (filename, Print_Info = False):
>   from scipy import *
>   from time import strptime
> 
>   # open the data file and read the column names (and print if desired)
>   Datafile = open(filename,'r')
>   line = Datafile.readline()
>   column_names = line.rstrip('\n').split('\t')
>   if Print_Info:
>     for items in column_names: print items
> 
>   # initialize Number of columns and an empty sample-set
>   N = len(column_names)
>   zero_vals = N * [0]
>   SR = 5
> 
>   # read the first dataline, to determine the start time
>   # (we forget this first sampleset)
>   line = Datafile.readline()
>   vals = line.rstrip('\n').split('\t')
>   start = datetime(*strptime(vals[0][0:16], "%Y-%m-%d %H:%M")[0:6])
>   prev_tyd = 0     # time of the previous sample
> 
>   # create an empty array
>   data = asarray([])
>   sample_reduction = asarray([])
>  
>   # read and interpretate all lines in file
>   for line in Datafile:
>     # remove EOL, split the line on tabs
>     vals = line.rstrip('\n').split('\t')
> 
>     # calculate number of minutes from start
>     tyd = datetime(*strptime(vals[0][0:16], "%Y-%m-%d %H:%M")[0:6])
>     s = tyd - start
>     tyd = s.seconds/60 + s.days*24*60
>     
>     # if there are sample-sets missing, fill them empty sample-sets
>     # (beware of sample reduction)
>     if tyd - prev_tyd > 1:
>       zero_vals = (( tyd - prev_tyd )/SR) * N * [0]
>       data = r_[data, zero_vals]
> 
>     prev_tyd = tyd    # remember the time of this sample-set
>     vals[0] = tyd     # replace the datetime with number of minutes
> 
>     # be sure all lines are of equal length
>     # (sometimes Excel omits the last columns if they are empty)
>     if len(vals) < N:
>       vals = vals + ( N- len(vals) )*[0]
>       
>     # replace empty strings, otherwise float conversion raises an error
>     for i in range(len(vals)):
>         if vals[i] == '' : vals[i] = '0'
> 
>     # convert the string vector to a float vector
>     # VERY STRANGE: the next 2 operation may not be done at once
>     vals = asarray(vals)
>     vals = vals.astype(float)
> 
>     # append new sampleset, with a sample reduction of 5
>     sample_reduction = r_ [ sample_reduction, vals ]
>     if len(sample_reduction) == SR * N:
>       
>       # reshape sample array, for easy ensemble average
>       sample_reduction = sample_reduction.reshape(SR, N)
>       sample_reduction = sample_reduction.mean(0)
> 
>       # add mean value of SAMPLE_REDUCTION sample-sets to the total array
>       # and clear the averaging sample-set
>       data = r_[data, sample_reduction]
>       sample_reduction = asarray([])
> 
>   # reshape into N signal vectors
>   data = data.reshape(size(data)/N,N)
>   data = transpose(data)
> 
>   return data
> # 
> ******************************************************************************
> 
> 
> Kamer van Koophandel - handelsregister 41055629  / Netherlands Chamber of Commerce - trade register 41055629
> 
> 
> _______________________________________________
> SciPy-user mailing list
> SciPy-user at scipy.org
> http://projects.scipy.org/mailman/listinfo/scipy-user
> 

-- 
|>|\/|<
/--------------------------------------------------------------------------\
|David M. Cooke                      http://arbutus.physics.mcmaster.ca/dmc/
|cookedm at physics.mcmaster.ca