[Python-checkins] CVS: python/dist/src/Tools/unicode makeunicodedata.py,1.2,1.3

Mon, 25 Sep 2000 01:07:09 -0700

Update of /cvsroot/python/python/dist/src/Tools/unicode
In directory slayer.i.sourceforge.net:/tmp/cvs-serv23556/Tools/unicode

Modified Files:
	makeunicodedata.py 
Log Message:

Unicode database compression, step 2:

- fixed attributions
- moved decomposition data to a separate table, in preparation
  for step 3 (which won't happen before 2.0 final, promise!)
- switched to relative paths in the generator script

I have a lot more stuff in the works for 2.1, but let's leave
that for another day...

Index: makeunicodedata.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Tools/unicode/makeunicodedata.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -r1.2 -r1.3
*** makeunicodedata.py	2000/09/25 07:13:41	1.2
--- makeunicodedata.py	2000/09/25 08:07:06	1.3
***************
*** 1,13 ****
  #
! # makeunidb.py -- generate a compact version of the unicode property
! # database (unicodedatabase.h)
  #

  import sys

  SCRIPT = sys.argv[0]
! VERSION = "1.0"

! UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"

  CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
--- 1,18 ----
  #
! # generate a compact version of the unicode property database
  #
+ # history:
+ # 2000-09-24 fl   created (based on bits and pieces from unidb)
+ # 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
+ #
+ # written by Fredrik Lundh (fredrik@pythonware.com), September 2000
+ #

  import sys

  SCRIPT = sys.argv[0]
! VERSION = "1.1"

! UNICODE_DATA = "../UnicodeData-Latest.txt"

  CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
***************
*** 25,35 ****

      # extract unicode properties
!     dummy = (0, 0, 0, 0, "NULL")
      table = [dummy]
      cache = {0: dummy}
      index = [0] * len(unicode.chars)
- 
-     DECOMPOSITION = [""]

      for char in unicode.chars:
          record = unicode.table[char]
--- 30,39 ----

      # extract unicode properties
!     dummy = (0, 0, 0, 0)
      table = [dummy]
      cache = {0: dummy}
      index = [0] * len(unicode.chars)

+     # 1) database properties
      for char in unicode.chars:
          record = unicode.table[char]
***************
*** 40,49 ****
              bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
              mirrored = record[9] == "Y"
-             if record[5]:
-                 decomposition = '"%s"' % record[5]
-             else:
-                 decomposition = "NULL"
              item = (
!                 category, combining, bidirectional, mirrored, decomposition
                  )
              # add entry to index and item tables
--- 44,49 ----
              bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
              mirrored = record[9] == "Y"
              item = (
!                 category, combining, bidirectional, mirrored
                  )
              # add entry to index and item tables
***************
*** 54,59 ****
              index[char] = i

!     # FIXME: we really should compress the decomposition stuff
!     # (see the unidb utilities for one way to do this)

      FILE = "unicodedata_db.h"
--- 54,77 ----
              index[char] = i

!     # 2) decomposition data
! 
!     # FIXME: <fl> using the encoding stuff from unidb would save
!     # another 50k or so, but I'll leave that for 2.1...
! 
!     decomp_data = [""]
!     decomp_index = [0] * len(unicode.chars)
! 
!     for char in unicode.chars:
!         record = unicode.table[char]
!         if record:
!             if record[5]:
!                 try:
!                     i = decomp_data.index(record[5])
!                 except ValueError:
!                     i = len(decomp_data)
!                     decomp_data.append(record[5])
!             else:
!                 i = 0
!             decomp_index[char] = i

      FILE = "unicodedata_db.h"
***************
*** 66,70 ****
      print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
      for item in table:
!         print "    {%d, %d, %d, %d, %s}," % item
      print "};"
      print
--- 84,88 ----
      print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
      for item in table:
!         print "    {%d, %d, %d, %d}," % item
      print "};"
      print
***************
*** 83,86 ****
--- 101,110 ----
      print "};"

+     print "static const char *decomp_data[] = {"
+     for name in decomp_data:
+         print "    \"%s\"," % name
+     print "    NULL"
+     print "};"
+ 
      # split index table
      index1, index2, shift = splitbins(index)
***************
*** 90,93 ****
--- 114,125 ----
      Array("index1", index1).dump(sys.stdout)
      Array("index2", index2).dump(sys.stdout)
+ 
+     # split index table
+     index1, index2, shift = splitbins(decomp_index)
+ 
+     print "/* same, for the decomposition data */"
+     print "#define DECOMP_SHIFT", shift
+     Array("decomp_index1", index1).dump(sys.stdout)
+     Array("decomp_index2", index2).dump(sys.stdout)

      sys.stdout = sys.__stdout__