Comments on my first script?

Thu Jun 12 11:29:28 EDT 2008

On Jun 12, 4:27 pm, Phillip B Oldham <phillip.old... at gmail.com> wrote:
> I'm keen on learning python, with a heavy lean on doing things the
> "pythonic" way, so threw the following script together in a few hours
> as a first-attempt in programming python.
>
> I'd like the community's thoughts/comments on what I've done;
> improvements I can make, "don'ts" I should be avoiding, etc. I'm not
> so much bothered about the resulting data - for the moment it meets my
> needs. But any comment is welcome!
>
> #!/usr/bin/env python
> ## Open a file containing a list of domains (1 per line),
> ## request and parse its whois record and push to a csv
> ## file.
>
> import subprocess
> import re
>
> src = open('./domains.txt')
>
> dest = open('./whois.csv', 'w');
>
> sep = "|"
> headers = ["Domain","Registrant","Registrant's
> Address","Registrar","Registrant Type","Date Registered","Renewal
> Date","Last Updated","Name Servers"]
>
> dest.write(sep.join(headers)+"\n")
>
> def trim( txt ):
>         x = []
>         for line in txt.split("\n"):
>                 if line.strip() == "":
>                         continue
>                 if line.strip().startswith('WHOIS'):
>                         continue
>                 if line.strip().startswith('>>>'):
>                         continue
>                 if line.strip().startswith('%'):
>                         continue
>                 if line.startswith("--"):
>                         return ''.join(x)
>                 x.append(" "+line)
>         return "\n".join(x)
>
> def clean( txt ):
>         x = []
>         isok = re.compile("^\s?([^:]+): ").match
>         for line in txt.split("\n"):
>                 match = isok(line)
>                 if not match:
>                         continue
>                 x.append(line)
>         return "\n".join(x);
>
> def clean_co_uk( rec ):
>         rec = rec.replace('Company number:', 'Company number -')
>         rec = rec.replace("\n\n", "\n")
>         rec = rec.replace("\n", "")
>         rec = rec.replace(": ", ":\n")
>         rec = re.sub("([^(][a-zA-Z']+\s?[a-zA-Z]*:\n)", "\n\g<0>", rec)
>         rec = rec.replace(":\n", ": ")
>         rec = re.sub("^[ ]+\n", "", rec)
>         return rec
>
> def clean_net( rec ):
>         rec = rec.replace("\n\n", "\n")
>         rec = rec.replace("\n", "")
>         rec = rec.replace(": ", ":\n")
>         rec = re.sub("([a-zA-Z']+\s?[a-zA-Z]*:\n)", "\n\g<0>", rec)
>         rec = rec.replace(":\n", ": ")
>         return rec
>
> def clean_info( rec ):
>         x = []
>         for line in rec.split("\n"):
>                 x.append(re.sub("^([^:]+):", "\g<0> ", line))
>         return "\n".join(x)
>
> def record(domain, record):
>         details = ['','','','','','','','','']
>         for k, v in record.items():
>                 try:
>                         details[0] = domain.lower()
>                         result = {
>                                 "registrant": lambda: 1,
>                                 "registrant name": lambda: 1,
>                                 "registrant type": lambda: 4,
>                                 "registrant's address": lambda: 2,
>                                 "registrant address1": lambda: 2,
>                                 "registrar": lambda: 3,
>                                 "sponsoring registrar": lambda: 3,
>                                 "registered on": lambda: 5,
>                                 "registered": lambda: 5,
>                                 "domain registeration date": lambda: 5,
>                                 "renewal date": lambda: 6,
>                                 "last updated": lambda: 7,
>                                 "domain last updated date": lambda: 7,
>                                 "name servers": lambda: 8,
>                                 "name server": lambda: 8,
>                                 "nameservers": lambda: 8,
>                                 "updated date": lambda: 7,
>                                 "creation date": lambda: 5,
>                                 "expiration date": lambda: 6,
>                                 "domain expiration date": lambda: 6,
>                                 "administrative contact": lambda: 2
>                         }[k.lower()]()
>                         if v != '':
>                                 details[result] = v
>                 except:
>                         continue
>
>         dest.write(sep.join(details)+"\n")
>
> ## Loop through domains
> for domain in src:
>
>         domain = domain.strip()
>
>         if domain == '':
>                 continue
>
>         rec = subprocess.Popen(["whois",domain],
> stdout=subprocess.PIPE).communicate()[0]
>
>         if rec.startswith("No whois server") == True:
>                 continue
>
>         if rec.startswith("This TLD has no whois server") == True:
>                 continue
>
>         rec = trim(rec)
>
>         if domain.endswith(".net"):
>                 rec = clean_net(rec)
>
>         if domain.endswith(".com"):
>                 rec = clean_net(rec)
>
>         if domain.endswith(".tv"):
>                 rec = clean_net(rec)
>
>         if domain.endswith(".co.uk"):
>                 rec = clean_co_uk(rec)
>
>         if domain.endswith(".info"):
>                 rec = clean_info(rec)
>
>         rec = clean(rec)
>
>         details = {}
>
>         try:
>                 for line in rec.split("\n"):
>                         bits = line.split(': ')
>                         a = bits.pop(0)
>                         b = bits.pop(0)
>                         details[a.strip()] = b.strip().replace("\t", ", ")
>         except:
>                 continue
>
>         record(domain, details)
>
> ## Cleanup
> src.close()
> dest.close()

Just a few quick things before I leave work.

#!/usr/bin/env python
"""Open a file containing a list of domains (1 per line),
   request and parse its whois record and push to a csv
   file.
"""  # Rather use docstrings than multiline commenting like that.

def trim(txt):
    """Strip whois boilerplate from raw output and return the useful lines.

    Blank lines, and lines whose first non-blank text is 'WHOIS', '>>>'
    or '%', are dropped.  Every kept line is prefixed with one space.

    A line starting with '--' ends the record early: the lines kept so
    far are returned concatenated with no separator.  Otherwise the kept
    lines are returned joined with newlines.
    """
    kept = []
    for line in txt.splitlines():  # Strings have a built in function
        stripped = line.strip()
        # Test the stripped text so indented boilerplate is skipped too;
        # startswith accepts a tuple, so one call covers every prefix.
        if not stripped or stripped.startswith(('WHOIS', '>>>', '%')):
            continue  # you can do them in one if statement
        if line.startswith('--'):
            return ''.join(kept)
        kept.append(' ' + line)
    return '\n'.join(kept)

for domain in src:
    # Strip once and reuse the result: lines read from the file keep
    # their trailing newline, so endswith('.net') on the raw line would
    # never match.
    domain = domain.strip()
    if not domain: continue  # A line with nothing is False

    rec = subprocess.Popen(["whois", domain],
               stdout=subprocess.PIPE).communicate()[0]
    if rec.startswith('No whois server') \
        or rec.startswith('This TLD has no whois server'):
        continue   # Startswith will return True/False so it is enough

    rec = trim(rec)
    if domain.endswith('.net'):
        rec = clean_net(rec)
    elif domain.endswith('.com'):
        # Rather use if/elif statements unless somehow you think you
        # will match more than one.
        rec = clean_net(rec)
    # .... (remaining TLD branches and setup as in the original script)

    for line in rec.splitlines():
        try:
            a, b = line.split(': ')[:2]
            details[a.strip()] = b.strip().replace('\t', ', ')
        except ValueError:  # Fewer than two parts, so unpacking fails
            continue

Hope that's a start.