Comments on my first script?

Fri Jun 13 11:24:52 EDT 2008

On Jun 13, 3:19 pm, Bruno Desthuilliers <bruno.
42.desthuilli... at websiteburo.invalid> wrote:
> Phillip B Oldham a écrit :
>
> > I'm keen on learning python, with a heavy lean on doing things the
> > "pythonic" way, so threw the following script together in a few hours
> > as a first-attempt in programming python.
>
> > I'd like the community's thoughts/comments on what I've done;
> > improvements I can make, "don'ts" I should be avoiding, etc. I'm not
> > so much bothered about the resulting data - for the moment it meets my
> > needs. But any comment is welcome!
>
> Ok, since you asked for it, let's go:
>
> > #!/usr/bin/env python
> > ## Open a file containing a list of domains (1 per line),
> > ## request and parse it's whois record and push to a csv
> > ## file.
>
> > import subprocess
> > import re
>
> > src = open('./domains.txt')
>
> > dest = open('./whois.csv', 'w');
>
> Might be better to allow the user to pass source and destination as
> arguments, defaulting to stdin and stdout.
>
> Also, you may want to have a look at the csv module in the stdlib.
>
>
>
> > sep = "|"
> > headers = ["Domain","Registrant","Registrant's
> > Address","Registrar","Registrant Type","Date Registered","Renewal
> > Date","Last Updated","Name Servers"]
> > dest.write(sep.join(headers)+"\n")
> > def trim( txt ):
> >    x = []
> >    for line in txt.split("\n"):
> >            if line.strip() == "":
> >                    continue
> >            if line.strip().startswith('WHOIS'):
> >                    continue
> >            if line.strip().startswith('>>>'):
> >                    continue
> >            if line.strip().startswith('%'):
> >                    continue
> >            if line.startswith("--"):
> >                    return ''.join(x)
> >            x.append(" "+line)
> >    return "\n".join(x)
>
> You're doing way to may calls to line.strip(). Call it once and store
> the result.
>
> def trim_test(line):
>      line = line.strip()
>      if not line:
>          return False
>      for test in ("WHOIS", ">>>", "%",):
>          if line.startswith(test):
>              return False
>      return True
>
> def trim(txt):
>      lines = []
>      for line in txt.split.splitlines():
>          if trim_test(line):
>              if line.starstwith("--"):
>                  return "".join(lines)
>              lines.append(" " + line)
>      return "\n".join(lines)
>
> > def clean( txt ):
> >    x = []
> >    isok = re.compile("^\s?([^:]+): ").match
>
> Would be better to extract the regex compilation out of the function.
>
> >    for line in txt.split("\n"):
> >            match = isok(line)
> >            if not match:
> >                    continue
> >            x.append(line)
>
> If you don't use the match object itself, don't ever bother to bind it:
>
>      for line in txt.split("\n"):
>          if not isok(line):
>              continue
>          x.append(line)
>
> Then, you may find the intent and flow most obvious if you get rid of
> the double negation (the not and the continue):
>
>      for line in txt.splitlines():
>          if isok(line):
>              x.append(line)
>
> which is easy to rewrite as a either a list comprehension:
>
>      x = [line for line in txt.splitlines() if isok(line)]
>
> or in a more lispish/functional style:
>
>     x = filter(isok, txt.splitlines())
>
> In both way, you now can get rid of the binding to 'x' (a very bad name
> for a list of lines BTW - what about something more explicit, like
> 'lines' ?)
>
> >    return "\n".join(x);
>
> isok = re.compile("^\s?([^:]+): ").match
>
> def clean(txt):
>      return "\n".join(filter(isok, txt.splitlines()))
>
> > def clean_co_uk( rec ):
> >    rec = rec.replace('Company number:', 'Company number -')
> >    rec = rec.replace("\n\n", "\n")
>
> Given the following, this above statement is useless.
>
> >    rec = rec.replace("\n", "")
> >    rec = rec.replace(": ", ":\n")
> >    rec = re.sub("([^(][a-zA-Z']+\s?[a-zA-Z]*:\n)", "\n\g<0>", rec)
> >    rec = rec.replace(":\n", ": ")
> >    rec = re.sub("^[ ]+\n", "", rec)
>
> All this could probably be simplified.
>
> >    return rec
>
> > def clean_net( rec ):
> >    rec = rec.replace("\n\n", "\n")
> >    rec = rec.replace("\n", "")
> >    rec = rec.replace(": ", ":\n")
> >    rec = re.sub("([a-zA-Z']+\s?[a-zA-Z]*:\n)", "\n\g<0>", rec)
> >    rec = rec.replace(":\n", ": ")
> >    return rec
>
> Idem.
>
> > def clean_info( rec ):
> >    x = []
> >    for line in rec.split("\n"):
> >            x.append(re.sub("^([^:]+):", "\g<0> ", line))
> >    return "\n".join(x)
>
> > def record(domain, record):
> >    details = ['','','','','','','','','']
>
>      details = [''] * 9
>
>
>
> >    for k, v in record.items():
> >            try:
> >                    details[0] = domain.lower()
> >                    result = {
> >                            "registrant": lambda: 1,
> >                            "registrant name": lambda: 1,
> >                            "registrant type": lambda: 4,
> >                            "registrant's address": lambda: 2,
> >                            "registrant address1": lambda: 2,
> >                            "registrar": lambda: 3,
> >                            "sponsoring registrar": lambda: 3,
> >                            "registered on": lambda: 5,
> >                            "registered": lambda: 5,
> >                            "domain registeration date": lambda: 5,
> >                            "renewal date": lambda: 6,
> >                            "last updated": lambda: 7,
> >                            "domain last updated date": lambda: 7,
> >                            "name servers": lambda: 8,
> >                            "name server": lambda: 8,
> >                            "nameservers": lambda: 8,
> >                            "updated date": lambda: 7,
> >                            "creation date": lambda: 5,
> >                            "expiration date": lambda: 6,
> >                            "domain expiration date": lambda: 6,
> >                            "administrative contact": lambda: 2
> >                    }[k.lower()]()
>
> Ok, let's summarize. On each iteration, you define a dict with the very
> same 21 key:value pairs. Isn't it a bit wasteful ? What about defining
> the dict only once, outside the function ?
>
> Also, the values in the dict are constant functions. Why not just use
> the constant results of the functions then ? I mean : what's wrong with
> just :
>
> {
>    "registrant": 1,
>    "registrant name": 1,
>    "registrant type": 4,
>    (etc...)
>
> }
> >                    if v != '':
> >                            details[result] = v
>
> As an icing on the cake, you build this whole dict, look up a function
> in it, an call the function *before* you even decide if you need that
> result.
>
> >            except:
>
>  >                   continue
>
> Friendly advice : *never* use a bare except clause that discards the
> exception. Never ever do that.
>
> Your except clause here should specifically catch KeyError. But anyway
> you don't ever need to worry about exceptions here, you just have to use
> dict.get(key, default) instead.
>
> FIELDS_POSITIONS = {
>    "registrant": 1,
>    "registrant name": 1,
>    "registrant type": 4,
>    "registrant's address": 2,
>    (etc...)
>
> }
>
> def record(domain, rec):
>      details = [domain.lower()] + [''] * 8
>      for k, v in record.items():
>          if v:
>              pos = FIELDS_POSITIONS.get(k.lower(), None)
>              if pos is not None:
>                  details[pos] = v
>
>      # I'm leaving this here, but I'd personnaly split the
>      # two unrelated concerns of formatting the record and
>      # writing it somewhere.
>
>      dest.write(sep.join(details)+"\n")
>
>
>
> > ## Loop through domains
> > for domain in src:
>
> >    domain = domain.strip()
>
> >    if domain == '':
> >            continue
>
> >    rec = subprocess.Popen(["whois",domain],
> > stdout=subprocess.PIPE).communicate()[0]
>
> >    if rec.startswith("No whois server") == True:
> >            continue
>
> >    if rec.startswith("This TLD has no whois server") == True:
> >            continue
>
> >    rec = trim(rec)
>
> >    if domain.endswith(".net"):
> >            rec = clean_net(rec)
>
> >    if domain.endswith(".com"):
> >            rec = clean_net(rec)
>
> >    if domain.endswith(".tv"):
> >            rec = clean_net(rec)
>
> >    if domain.endswith(".co.uk"):
> >            rec = clean_co_uk(rec)
>
> >    if domain.endswith(".info"):
> >            rec = clean_info(rec)
>
> Since the domain is very unlikely to match more than one test, at least
> use if/elif/.../else to avoid redundant useless tests.
>
> Now *this* would have been a good use of a dict of functions:
>
> REC_CLEANERS = {
>     '.net' : clean_net,
>     '.com' : clean_com,
>     '.tv'  : clean_net,
>     '.uk'  : clean_co_uk,
>     (etc...)
>
> }
>
> for domain in rec:
>     # code here
>     ext = domain.rsplit('.', 1)[1]
>     cleaner = REC_CLEANERS.get(ext, None)
>     if cleaner:
>         rec = cleaner(rec)
>
> >    rec = clean(rec)
>
> >    details = {}
>
> >    try:
> >            for line in rec.split("\n"):
> >                    bits = line.split(': ')
> >                    a = bits.pop(0)
> >                    b = bits.pop(0)
>
> if you expect only one ': ', then:
>                          a, b = line.split(': ')
>
> if you can have many but don't care about the others:
>                         bits = line.split(': ')
>                         a, b = bits[0], bits[1]

or:
a, b = line.split(': ')[:1]

> >                    details[a.strip()] = b.strip().replace("\t", ", ")
> >    except:
>
> cf above. Please, *don't* do that.
>
> >            continue
>
> >    record(domain, details)
>
> > ## Cleanup
> > src.close()
> > dest.close()
>
> There are other possible improvements of course. Like:
>
> - putting the main loop in it's own function taking source and dest (two
> opened (resp in 'r' and 'w' mode) filelike objects)
> - conditionnally call it from the top-level *if* the script has been
> called as a script (vs imported as a module) so you can reuse this code
> from another script.
>
> The test is:
>
> if __name__ == '__main__':
>     # has been called as a script
> else:
>     # has been imported
>
> HTH