[Python-checkins] r56524 - tracker/importer/config.py tracker/importer/xmlexport2handlers.py tracker/importer/xmlexport2toroundup.py

erik.forsberg python-checkins at python.org
Tue Jul 24 16:40:11 CEST 2007


Author: erik.forsberg
Date: Tue Jul 24 16:40:10 2007
New Revision: 56524

Added:
   tracker/importer/config.py
   tracker/importer/xmlexport2handlers.py
   tracker/importer/xmlexport2toroundup.py
      - copied, changed from r56508, tracker/importer/sfxml2roundup.py
Log:
Importer for the "new" format produced by xml_export2.php
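
The handlers all follow the same contract: each one reads a single element
from a <tracker_item> and fills in one property of the roundupdata dictionary
that is later passed to db.issue.import_list(). A minimal sketch of that flow,
using the summary -> title mapping from TextstringHandler (the sample XML is
made up for illustration and no Roundup tracker is involved):

    import xml.etree.ElementTree as ET

    item = ET.fromstring("<tracker_item>"
                         "<summary>Sample summary</summary>"
                         "</tracker_item>")

    roundupdata = {}
    # TextstringHandler(db, 'summary', 'title') boils down to:
    roundupdata['title'] = item.find('summary').text.encode('utf-8')
    print roundupdata

The db handle passed to each handler is only needed by the handlers that look
up or create Roundup nodes (users, components, versions and so on).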

Added: tracker/importer/config.py
==============================================================================
--- (empty file)
+++ tracker/importer/config.py	Tue Jul 24 16:40:10 2007
@@ -0,0 +1,26 @@
+mappings = {'category':
+            {"Demos and tools":"Demos and Tools",
+             "Distutils and setup.py":"Distutils",
+             "Python Interpreter Core":"Interpreter Core",
+             "Core (C code)":"Interpreter Core",
+             "Python Library":"Library (Lib)",
+             "Modules":"Extension Modules",
+             "Parser/Compiler":"Interpreter Core",
+             "Performance":"Interpreter Core",
+             "Threads":"Interpreter Core",
+             "Type/class unification":"Interpreter Core"},
+
+            'priority':
+            {'1':'low',
+              '2':'low',
+              '3':'low',
+              '4':'low',
+              '5':'normal',
+              '6':'high',
+              '7':'high',
+              '8':'immediate',
+              '9':'urgent'
+             },
+            }
+             
+

Added: tracker/importer/xmlexport2handlers.py
==============================================================================
--- (empty file)
+++ tracker/importer/xmlexport2handlers.py	Tue Jul 24 16:40:10 2007
@@ -0,0 +1,429 @@
+import time, os, urllib, socket, mimetools, stat, re
+
+from config import mappings
+
+import BeautifulSoup as BS
+
+# slightly silly
+try:
+    # import xml.etree.cElementTree as ET # may crash in 2.5b2 !?
+    import xml.etree.ElementTree as ET
+except ImportError:
+    try:
+        import cElementTree as ET
+    except ImportError:
+        import elementtree.ElementTree as ET
+
+import htmlentitydefs
+
+from roundup.support import ensureParentsExist
+from roundup.date import Date
+
+class XMLExport2Handler:
+    def __init__(self, db, source, target):
+        self.db = db
+        self.source = source
+        self.target = target
+
+    def handle(self, item, roundupdata):
+        raise NotImplementedError
+
+
+class TextValueHandler(XMLExport2Handler):
+    def handle(self, item, roundupdata):
+        roundupdata[self.target] = item.find(self.source).text.encode('utf-8')
+
+class StatusHandler(XMLExport2Handler):
+    def __init__(self, db, source, target, statuses):
+        XMLExport2Handler.__init__(self, db, source, target)
+        self.statuses = statuses
+    
+    def handle(self, item, roundupdata):
+        status = self.statuses[item.find(self.source).text].lower()
+        
+        if "deleted" == status:
+            status = "closed"
+
+        roundupdata[self.target] = self.db.status.lookup(status)        
+
+class ComponentHandler(XMLExport2Handler):
+    def __init__(self, db, source, target, categories):
+        XMLExport2Handler.__init__(self, db, source, target)
+        self.categories = categories
+
+    def handle(self, item, roundupdata):
+        category = self.categories[item.find(self.source).text]
+        category = mappings['category'].get(category, category)
+
+        try:
+            component_id = self.db.component.lookup(category)
+            roundupdata[self.target] =  [component_id]
+        except KeyError:
+            roundupdata[self.target] = \
+                    [self.db.component.create(name=category)]
+
+class GroupHandler(XMLExport2Handler):
+    def __init__(self, db, source, target, groups):
+        XMLExport2Handler.__init__(self, db, source, target)
+        self.groups = groups
+    
+    def handle(self, item, roundupdata):
+        roundupdata[self.target] = []
+        group = self.groups[item.find(self.source).text]
+
+        if group in ["None", "Irreproducible", "AST", "Not a Bug"]:
+            return
+        elif "Feature Request" == group:
+            roundupdata['type'] = self.db.issue_type.lookup("rfe")
+            return
+        elif "Python 3000" == group:
+            roundupdata['keywords'].append(self.db.keyword.lookup('py3k'))
+        try:
+            # Merge as specified in http://psf.upfronthosting.co.za/roundup/meta/issue101
+            if group.startswith("Python 2.1"):
+                group = "Python 2.1"
+            elif group.startswith("Python 2.2"):
+                group = "Python 2.2"
+            version = self.db.version.lookup(group)
+            roundupdata[self.target] = version
+            return
+        except KeyError:
+            pass        
+
+class ResolutionHandler(XMLExport2Handler):
+    def __init__(self, db, source, target, resolutions):
+        XMLExport2Handler.__init__(self, db, source, target)
+        self.resolutions = resolutions
+
+    def handle(self, item, roundupdata):
+        resolution = self.resolutions[item.find(self.source).text].lower()
+        if "none" == resolution:
+            roundupdata[self.target] = None
+        else:
+            roundupdata[self.target] = self.db.resolution.lookup(resolution)
+
+
+class UserlinkHandler(XMLExport2Handler):
+    def __init__(self, db, source, target, pmembers):
+        XMLExport2Handler.__init__(self, db, source, target)
+        self.pmembers = pmembers
+
+    def handle(self, item, roundupdata):
+        username = item.find(self.source).text
+
+        if "nobody" == username and \
+               "assignee" == self.target :
+            roundupdata[self.target] = None
+            return
+
+        if "nobody" == username:
+            username = "anonymous"
+
+        roundupdata[self.target] = self.getauthor(username)
+
+        # Add user to nosy 
+        if roundupdata[self.target] not in roundupdata['nosy'] and \
+               roundupdata[self.target] != self.getauthor("anonymous"):
+            roundupdata['nosy'].append(roundupdata[self.target])
+
+    def unescape(self, string):
+        # work around oddities in BeautifulSoup's entity handling
+        def unescape_entity(m, defs=htmlentitydefs.entitydefs):
+            try:
+                return defs[m.group(1)]
+            except KeyError:
+                return m.group(0) # use as is
+        pattern = re.compile("&(\w+);")
+        return pattern.sub(unescape_entity, string)    
+            
+
+    def loadauthorfile(self, file):
+        def emit(soup):
+            if isinstance(soup, BS.NavigableString):
+                bob.data(self.unescape(soup))
+            else:
+                bob.start(soup.name, dict((k, self.unescape(v)) for k, v in soup.attrs))
+                for s in soup:
+                    emit(s)
+                bob.end(soup.name)
+        # determine encoding (the document charset is not reliable)
+        text = open(file).read()
+        try:
+            encoding = "utf-8"
+            unicode(text, encoding)
+        except UnicodeError:
+            encoding = "iso-8859-1"
+        soup = BS.BeautifulSoup(
+            text, convertEntities="html", fromEncoding=encoding
+            )
+        # build the tree
+        bob = ET.TreeBuilder()
+        for s in soup:
+            emit(s)
+        return bob.close()            
+
+    def getnonprojectmember(self, username):
+        address = "%s at users.sourceforge.net" % username
+
+        authorfile = os.path.join("authordata", username)
+        if not os.path.exists(authorfile) or 0 == os.stat(authorfile)[stat.ST_SIZE]:
+            print "Fetching user information for %s" % username
+            u = urllib.urlopen("http://sourceforge.net/users/" + username)
+            open(authorfile, 'w').write(u.fp.read())
+
+        realname = None
+        authordata = open(authorfile).read()
+        if -1 != authordata.find("That user does not exist or is not yet active."):
+            return ("anonymous", None, None)
+
+        elif -1 != authordata.find("This user account has been deleted"):
+            realname = "Deleted User %s" % username
+            return (username, realname, address)
+
+        tree = self.loadauthorfile(authorfile)
+        try:
+            table = tree.getiterator('table')[0]
+        except TypeError:
+            table = tree.getiterator('table').next()
+
+        alltds = table.findall('.//td')
+        for i in range(len(alltds)):
+            header = alltds[i].text or ""
+            if -1 != header.find("Publicly Displayed Name:"):
+                realname = alltds[i+1].text
+                break
+
+        return (username, realname, address)
+        
+        
+
+    def getauthor(self, username):
+        try:
+            return self.db.user.lookup(username)
+        except KeyError:
+            print "Creating new user", username
+            roles = ["User"]
+            if not self.pmembers.has_key(username):
+                (username, realname, address) = self.getnonprojectmember(username)
+                if "anonymous" == username:
+                    return self.db.user.lookup(username)
+                if realname is not None:
+                    realname = realname.encode('utf-8')
+            else:
+                realname = self.pmembers[username]['public_name'].encode('utf-8')
+                address = self.pmembers[username]['email']
+                roles.append("Developer")
+                if self.pmembers[username]['admin']:
+                    roles.append('Coordinator')
+            return self.db.user.create(username=username,
+                                       realname=realname,
+                                       address=address,
+                                       roles=",".join(roles))
+
+class AssigneeHandler(UserlinkHandler):
+    def handle(self, item, roundupdata):
+        UserlinkHandler.handle(self, item, roundupdata)
+        if None == roundupdata[self.target]:
+            return
+        user = self.db.user.getnode(roundupdata[self.target])
+        roles = user['roles'].split(',')
+        if not "Developer" in roles:
+            roles.append('Developer')
+            user['roles'] = ",".join(roles)
+
+    
+class DateHandler(XMLExport2Handler):
+    def handle(self, item, roundupdata):
+        roundupdata[self.target] = time.gmtime(int(item.find(self.source).text))
+
+
+class PriorityHandler(XMLExport2Handler):
+    def handle(self, item, roundupdata):
+        priority = item.find(self.source).text
+        roundupdata[self.target] = self.db.priority.lookup(mappings['priority'][priority])
+
+
+class TextstringHandler(XMLExport2Handler):
+    def handle(self, item, roundupdata):
+        roundupdata[self.target] = item.find(self.source).text.encode('utf-8')
+    
+class MessagesHandler(UserlinkHandler):
+    def createmessage(self, roundupdata, author, date, content, recipients):
+        messageprops = ['author', 'date', 'files', 'content', 'recipients']
+        messagevals = [repr(self.getauthor(author)),
+                       repr(time.gmtime(int(date))),
+                       repr([]),
+                       repr(content.encode('utf-8')),
+                       repr([])]
+
+        if not roundupdata.has_key('activity') or \
+           int(date) > time.mktime(roundupdata['activity']):
+            roundupdata['activity'] = time.gmtime(int(date))
+            roundupdata['actor'] = self.getauthor(author)        
+
+        msg_nodeid = int(self.db.msg.import_list(messageprops, messagevals))
+
+        msg_filename = self.db.filename(self.db.msg.classname,
+                                        msg_nodeid, create=1)
+        ensureParentsExist(msg_filename)
+
+        mo = re.search('^Logged In: (YES |NO )\nuser_id=[0-9]+\nOriginator: (YES|NO)\n', content, re.MULTILINE)
+        if mo:
+            content = content[mo.end():]
+        
+        open(msg_filename, 'w').write(content.encode('utf-8'))
+        
+        return msg_nodeid
+    
+    def handle(self, item, roundupdata):
+        # Handle 'details'
+        roundupdata[self.target] = [self.createmessage(roundupdata,
+                                                      item.find('submitter').text,
+                                                      item.find('submit_date').text,
+                                                      item.find(self.source).text,
+                                                      [])]
+
+
+        followups = item.find("followups")
+        for fu in followups.findall("followup"):
+            author = fu.find("submitter").text
+            date = fu.find("date").text
+            content = fu.find("details").text
+            roundupdata[self.target].append(self.createmessage(roundupdata, author, date, content, []))
+
+            authorid = self.getauthor(author)
+            if authorid not in roundupdata['nosy'] and \
+                   authorid != self.getauthor('anonymous'):
+                roundupdata['nosy'].append(authorid)
+
+
+class AttachmentHandler(UserlinkHandler):
+    def __init__(self, db, source, target, pmembers,
+                 project_group_id, tracker):
+        UserlinkHandler.__init__(self, db, source, target, pmembers)
+        self.project_group_id = project_group_id
+        self.tracker = tracker
+    
+    def downloadfile(self, url, cachefilename):
+
+        delay = 0
+        backoff = 30
+        while True:
+            print url, "->", cachefilename
+            try:
+                f = urllib.urlopen(url)
+                data = f.read()
+                if data.find("send-email-to-ipblocked-at-sourceforge-dot-net") >= 0:
+                    delay+=backoff
+                    print "Blocked by Sourceforge. Sleeping %d seconds before trying again" % delay
+
+                out = open(cachefilename + ".tmp", 'w')
+                out.write(str(f.headers))
+                out.write("\n")
+                out.write(data)
+                out.close()
+                try:
+                    os.remove(cachefilename)
+                except OSError:
+                    pass
+                os.rename(cachefilename + ".tmp", cachefilename)
+                break
+
+            except socket.error, e:
+                print "Error fetching file, retrying", e
+                continue
+            except AttributeError, e:
+                print e, "Probably SF weirdness. Trying again after delay.."
+                delay+=backoff
+            except IOError, e:
+                print e, "Probably SF weirdness. Trying again after delay.."
+                delay+=backoff                
+
+            time.sleep(delay)
+
+    
+    def handle(self, item, roundupdata):
+
+        tracker_id = self.tracker.find("tracker_id").text
+        aid = roundupdata["id"]
+        files = []
+        issuefiles = []
+
+        attachments = item.find(self.source)
+        for a in attachments.findall("attachment"):
+            url = a.find("url").text + aid
+            date = a.find("date").text
+            author = a.find("submitter").text
+            filetype = a.find("filetype").text
+            file_id = a.find("id").text
+            filename = a.find("filename").text
+
+            files.append((date, url, author, filetype, file_id, filename))
+
+        files.sort(lambda x, y: cmp(x[0], y[0]))
+
+        backoff = 30
+        for timestamp, url, author, filetype, file_id, filename in files:
+            cachefilename = os.path.join("files", "%s-%s-%s-%s.dat" % (tracker_id,
+                                                                       aid,
+                                                                       file_id,
+                                                                       timestamp))
+            if not os.path.exists(cachefilename):
+                self.downloadfile(url, cachefilename)
+
+            datafile = open(cachefilename, 'rb')
+            message = mimetools.Message(datafile)
+
+            fileprops = ['creator', 'creation', 'activity',
+                         'name', 'type']
+
+            filevals = [repr(self.getauthor(author)),
+                        repr(time.gmtime(int(timestamp))),
+                        repr(time.gmtime(int(timestamp))),
+                        repr(filename),
+                        repr(filetype)
+                        ]
+
+            file_nodeid = int(self.db.file.import_list(fileprops, filevals))
+            file_filename = self.db.filename(self.db.file.classname, file_nodeid,
+                                        create=1)
+            ensureParentsExist(file_filename)
+            open(file_filename, 'wb').write(datafile.read())
+
+            issuefiles.append(file_nodeid)
+
+            if not roundupdata.has_key('activity') or \
+                   int(timestamp) > time.mktime(roundupdata['activity']):
+                roundupdata['activity'] = time.gmtime(int(timestamp))
+                roundupdata['actor'] = self.getauthor(author)        
+
+        roundupdata[self.target] = issuefiles
+
+
+class SeverityHandler(XMLExport2Handler):
+    def handle(self, item, roundupdata):
+        roundupdata[self.target] = self.db.severity.lookup('normal')                
+
+class TypeHandler(XMLExport2Handler):
+    def __init__(self, db, source, target, tracker):    
+        XMLExport2Handler.__init__(self, db, source, target)
+        self.tracker = tracker
+        
+    def handle(self, item, roundupdata):
+        if "Feature Requests" == self.tracker.find("name").text:
+            roundupdata[self.target] = self.db.issue_type.lookup("rfe")
+        elif "Patches" == self.tracker.find("name").text:
+            roundupdata["keywords"].append(self.db.keyword.lookup("patch"))
+                                           
+        
+def handle_journal(db, item, roundupdata, nodeid):
+    journal = []
+    journal.append((nodeid, Date(roundupdata['creation']),
+                    roundupdata['creator'],
+                    'create', {}))
+    db.setjournal("issue", nodeid, journal)
+        
+        
+            
+        
+        
+        
+        
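
xmlexport2toroundup.py (below) walks one XML document per project. The element
names in this sketch are taken from the find() calls in the script; the overall
nesting, the root element name and the sample values are assumptions about what
xml_export2.php emits:

    import xml.etree.ElementTree as ET

    doc = ET.fromstring("""
    <project_export>
      <export_details><project_group_id>12345</project_group_id></export_details>
      <projectsummary>
        <projectmembers>
          <projectmember>
            <user_name>jdoe</user_name>
            <public_name>Jane Doe</public_name>
            <email>jdoe@example.org</email>
            <project_admin>No</project_admin>
          </projectmember>
        </projectmembers>
      </projectsummary>
      <trackers>
        <tracker>
          <name>Bugs</name>
          <tracker_id>100001</tracker_id>
          <!-- groups, categories, resolutions, statuses, tracker_items ... -->
        </tracker>
      </trackers>
    </project_export>
    """)

    print doc.find("export_details").find("project_group_id").text
    for tracker in doc.find("trackers").findall("tracker"):
        print "tracker:", tracker.find("name").text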

Copied: tracker/importer/xmlexport2toroundup.py (from r56508, tracker/importer/sfxml2roundup.py)
==============================================================================
--- tracker/importer/sfxml2roundup.py	(original)
+++ tracker/importer/xmlexport2toroundup.py	Tue Jul 24 16:40:10 2007
@@ -19,62 +19,101 @@
 origin_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
 sys.path = [os.path.join(origin_dir, "sourceforge")] + sys.path
 
-
 import htmlentitydefs, re
 import getopt
-import sfxmlhandlers
+import xmlexport2handlers as x2h
 
 from roundup import instance
 
-def handle_artifact(db, artifact):
+def handle_idmapping(tracker, name, itemname):
+    print "Reading in '%s'" % name
+    mapping = tracker.find(name)
+    ret = {}
+    for g in mapping.findall(itemname):
+        ret[g.find('id').text] = g.find('%s_name' % itemname).text 
+
+    return ret
+
+def handle_namemapping(tracker, name, itemname):
+    # Map numeric SourceForge ids to display names (used for resolutions and statuses).
+    items = tracker.find(name)
+    ret = {}
+    for r in items.findall(itemname):
+        ret[r.find('id').text] = r.find('name').text
+    return ret
+        
+
+def handle_tracker(db, project_group_id, tracker, pmembers):
+    print "Handling tracker \"%s\"" % tracker.find('name').text
 
-    handlers = [sfxmlhandlers.IDHandler(db, "artifact_id", 'id'),
-                sfxmlhandlers.CreationHandler(db, 'open_date', 'creation'),
-                # activity and actor is set by CreationHandler, FileHandler and MessagesHandler
-                sfxmlhandlers.UserlinkHandler(db, 'submitted_by', 'creator'),
-                sfxmlhandlers.TitleHandler(db, 'summary', 'title'),
-                sfxmlhandlers.MessagesHandler(db, None, 'messages'),
-                sfxmlhandlers.FilesHandler(db, None, 'files'),
-                sfxmlhandlers.NosyHandler(db, None, 'nosy'),
-                # No handler for superseder
-                sfxmlhandlers.ComponentHandler(db, 'category', 'components'),
-                sfxmlhandlers.VersionsHandler(db, None, 'versions'),
-                sfxmlhandlers.SeverityHandler(db, None, 'severity'),
-                sfxmlhandlers.PriorityHandler(db, 'priority', 'priority'),
-                sfxmlhandlers.DependencyHandler(db, None, 'dependencies'),
-                sfxmlhandlers.AssigneeHandler(db, 'assigned_to', 'assignee'),
-                sfxmlhandlers.StatusHandler(db, 'status', 'status'),
-                sfxmlhandlers.ResolutionHandler(db, 'resolution', 'resolution'),
-                sfxmlhandlers.TypeHandler(db, "artifact_type", "type"),
-                sfxmlhandlers.GroupHandler(db, "artifact_group_id", "versions"),
-                ]
-
-    roundupdata = {'files':[], 'keywords':[]}
-    fields = {}
-                 
-    for field in artifact.findall("field"):
-        name = field.attrib.get('name')
-        if None == name:
-            print "field has no name", field.attrib
-            continue
-        fields[name] = field
-
-    aid = int(fields['artifact_id'].text)
-
-    for handler in handlers:
-        handler.handle(fields, roundupdata)
-
-    props = []
-    values = []
-
-    for key, value in roundupdata.items():
-        props.append(key)
-        values.append(repr(value))
-
-    nodeid = db.issue.import_list(props, values)
-    sfxmlhandlers.handle_journal(db, fields, roundupdata, nodeid)
-    return nodeid
+    groups = handle_idmapping(tracker, "groups", "group")
+    categories = handle_idmapping(tracker, "categories", "category")
 
+    categories['100100'] = 'None'
+    categories['100'] = 'None'    
+    groups['100100'] = 'None'
+    groups['100'] = 'None'    
+
+    print groups
+    
+    resolutions = handle_namemapping(tracker, 'resolutions', 'resolution')
+    statuses = handle_namemapping(tracker, 'statuses', 'status')
+
+    handlers = [x2h.TextValueHandler(db, "id", "id"),
+                x2h.StatusHandler(db, "status_id", "status", statuses),
+                x2h.ComponentHandler(db, "category_id", "components",
+                                     categories),
+                x2h.GroupHandler(db, "group_id", "versions",
+                                     groups),                
+                x2h.ResolutionHandler(db, "resolution_id", "resolution",
+                                      resolutions),
+                x2h.UserlinkHandler(db, 'submitter', 'creator',
+                                    pmembers),
+                x2h.AssigneeHandler(db, 'assignee', 'assignee', pmembers),
+                # FIXME: Activity
+                x2h.DateHandler(db, 'submit_date', 'creation'),
+                x2h.PriorityHandler(db, 'priority', 'priority'),
+                x2h.TextstringHandler(db, 'summary', 'title'),
+                x2h.MessagesHandler(db, 'details', 'messages', pmembers),
+                x2h.AttachmentHandler(db, 'attachments', 'files',
+                                      pmembers, project_group_id, tracker),
+                x2h.SeverityHandler(db, None, "severity"),
+                x2h.TypeHandler(db, None, "type", tracker),
+                ]    
+
+    for item in tracker.find('tracker_items').findall('tracker_item'):
+        print "Handling \"%s\" item with id %s" % (tracker.find('name').text,
+                                                   item.find('id').text)
+        roundupdata = {'keywords':[], 'files':[],
+                       'messages':[], 'dependencies':[], 'nosy':[]}
+
+        for handler in handlers:
+            handler.handle(item, roundupdata)
+
+        props = []
+        values = []        
+
+        for key, value in roundupdata.items():
+            props.append(key)
+            values.append(repr(value))
+
+        nodeid = db.issue.import_list(props, values)
+        x2h.handle_journal(db, item, roundupdata, nodeid)
+
+        db.commit()
+        
+
+def handle_projectmembers(tree):
+    ps = tree.find('projectsummary').find("projectmembers")
+    ret = {'nobody':{'public_name':'Nobody/Anonymous', 'admin':False,
+                     'email':''}}
+    for pm in ps.findall("projectmember"):
+        user_name = pm.find('user_name').text
+        ret[user_name] = {'public_name':pm.find('public_name').text,
+                          'email':pm.find('email').text,
+                          'admin':False}
+        if 'Yes' == pm.find('project_admin').text:
+            ret[user_name]['admin'] = True
+    return ret
 
         
 if "__main__" == __name__:
@@ -92,26 +131,22 @@
             trackerhome = optarg
         elif "--startat" == opt:
             startat = int(optarg)
+
+    rounduptracker = instance.open(trackerhome)
+    db = rounduptracker.open("admin")            
     
     tree = ET.parse(xmlfile)
-    artifacts = tree.find('artifacts')
 
-    rounduptracker = instance.open(trackerhome)
-    db = rounduptracker.open("admin")
+    project_group_id = tree.find("export_details").find("project_group_id").text
 
-    max_id = 0
-    allartifacts = artifacts.findall('artifact')
-    i=startat
-    totalartifacts = len(allartifacts)
-    for artifact in allartifacts[startat:]:
-        i+=1
-        sys.stdout.write("[%5d/%d]   " % (i, totalartifacts))
-        aid = handle_artifact(db, artifact)
-        if max_id < int(aid):
-            max_id = int(aid)
-        db.commit()
+    pmembers = handle_projectmembers(tree)
+    
+    trackers = tree.find("trackers")
+    for tracker in trackers.findall("tracker"):
+        handle_tracker(db, project_group_id, tracker, pmembers)
 
     db.setid('issue', str(999))
-    db.commit()
+    db.commit()        
+
               
                 

