Partition Recursive

DevPlayer devplayer at gmail.com
Mon Dec 27 19:55:03 EST 2010


# parse_url11.py

# DevPlayer at gmail.com
# 2010-12 (Dec)-27
# A brute force ugly hack from a novice programmer.

# You're welcome to use the code, clean it up, and make positive
# suggestions for improvement.

"""
Parse a url string into a list using a generator.
"""

# Delimiter tokens; the matcher below grows a slice greedily, so the
# two-character "//" is recognized even though "/" is also listed.
special_item = [";", "?", ":", "@", "=", "&", "#", ".", "/", "//"]

# drop urls with obviously bad formatting - NOTIMPLEMENTED
drop_item = ["|", "localhost", "..", "///"]
ignore_urls_containing = ["php", "cgi"]
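
# How the greedy slice matching below treats the delimiter list:
#   "/" in special_item    -> True     (keep growing the slice)
#   "//" in special_item   -> True     (keep growing)
#   "///" in special_item  -> False    (stop: yield "//", restart at "/")
# so "///" comes out as the two tokens "//" and "/".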

def url_parser_generator(url):
    len_text = len(url)
    index = 0
    start1 = 0    # required here if url contains ONLY specials
    start2 = 0    # required here if url contains ONLY non specials
    while index < len_text:

        # LOOP1 == Get an item in the special_item list; can be any length
        if url[index] in special_item:
            start1 = index
            inloop1 = True
            while inloop1:
                if inloop1:
                    if url[start1:index+1] in special_item:
                        #print "[", start1, ":", index+1, "] = ", url[start1:index+1]
                        inloop1 = True
                    else:    # slice is no longer in special_item, but it was
                        #print "[", start1, ":", index, "] = ", url[start1:index]
                        yield url[start1:index]
                        start1 = index
                        inloop1 = False

                if inloop1:
                    if index < len_text-1:
                        index = index + 1
                    else:
                        #yield url[start1:index]  # NEW
                        inloop1 = False

        elif url[index] in drop_item:
            # not properly implemented at all; note that url[index] is a
            # single character, so multi-character drop_item entries such
            # as "localhost" can never match here
            raise NotImplementedError(
                "Processing items in the drop_item list is not "
                "implemented.", url[index])

        elif url[index] in ignore_urls_containing:
            # not properly implemented at all; same single-character
            # caveat as above, so "php" and "cgi" can never match here
            raise NotImplementedError(
                "Processing items in the ignore_urls_containing list "
                "is not implemented.", url[index])

        # LOOP2 == Get any item not in the special_item list; can be any length
        elif url[index] not in special_item:
            start2 = index
            inloop2 = True
            while inloop2:
                if inloop2:
                    #if not url[start2:index+1] in special_item:  # <- doesn't work
                    if url[index] not in special_item:
                        #print "[", start2, ":", index+1, "] = ", url[start2:index+1]
                        inloop2 = True
                    else:    # hit a special_item, ending the non-special run
                        #print "[", start2, ":", index, "] = ", url[start2:index]
                        yield url[start2:index]
                        start2 = index
                        inloop2 = False

                if inloop2:
                    if index < len_text-1:
                        index = index + 1
                    else:
                        #yield url[start2:index]  # NEW
                        inloop2 = False

        else:
            print url[index], "Not Implemented" # should not get here
            index = index + 1

        if index >= len_text-1:
            break

    # Process any remaining part of the URL and yield it to the caller.
    # We don't know whether the last item in url is special or non-special.
    # Used start1 and start2 instead of start, and inloop1 and inloop2
    # instead of inloop, to help debug, as reusing just "start" and
    # "inloop" can be harder to track in a generator.
    if start1 >= start2:
        start = start1
    else:
        start = start2
    yield url[start: index+1]
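
# A compact alternative sketch, not part of the original approach: a
# single regular expression appears to produce the same token stream
# for the cases traced above. "//" is tried before "/" so the longer
# delimiter wins. Untested beyond a few of the urls below.
import re
_token_re = re.compile(r"//|[;?:@=&#./]|[^;?:@=&#./]+")

def url_parser_re(url):
    # findall returns non-overlapping matches left to right, i.e. the
    # url split into delimiter and non-delimiter tokens
    return _token_re.findall(url)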

def parse(url):
    mylist = []
    words = url_parser_generator(url)
    for word in words:
        mylist.append(word)
        #print word
    return mylist
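
# Hand-traced example of what parse() returns for a short input:
#   parse("http://a.b")  ->  ["http", ":", "//", "a", ".", "b"]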

def test():
    urls = {
        0: (True, "http://docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition"),

        1: (True, "/http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition"),
        2: (True, "//http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition"),
        3: (True, "///http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition"),

        4: (True, "/http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition/"),
        5: (True, "//http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition//"),
        6: (True, "///http:///docs.python.org/dev/library/stdtypes.html?highlight=partition#str.partition///"),

        7: (True, "/#/http:///#docs.python..org/dev//////library/stdtypes./html??highlight=p=partition#str.partition///"),

        8: (True, "httpdocspythonorgdevlibrarystdtypeshtmlhighlightpartitionstrpartition"),
        9: (True, "httpdocs.pythonorgdevlibrarystdtypeshtmlhighlightpartitionstrpartition"),
        10: (True, ":httpdocspythonorgdevlibrarystdtypeshtmlhighlightpartitionstrpartition"),
        11: (True, "httpdocspythonorgdevlibrarystdtypeshtmlhighlightpartitionstrpartition/"),

        12: (True, "///:;#.???"),     # only special_items
        13: (True, "///a:;#.???"),    # only 1 non special_item
        14: (True, "///:;#.???a"),    # only 1 non special_item
        15: (True, "a///:;#.???"),    # only 1 non special_item
        16: (True, "http://docs.python.php"),
        17: (True, "http://php.python.org"),
        18: (True, "http://www.localhost.com"),
        }

    # Test various combinations of special_item characters possible in urls.
    for url_num in range(len(urls)):
        value = urls[url_num]
        test, url = value
        if test: # allow for single testing
            mylist = parse(url)
            print
            print
            print "url:", url_num, " ", url
            print
            print mylist
            print
    return mylist

test()


