[Python-checkins] python/dist/src/Modules unicodedata.c,2.20,2.21

loewis@users.sourceforge.net loewis@users.sourceforge.net
Sat, 23 Nov 2002 04:22:35 -0800


Update of /cvsroot/python/python/dist/src/Modules
In directory sc8-pr-cvs1:/tmp/cvs-serv13615/Modules

Modified Files:
	unicodedata.c 
Log Message:
Patch #626548: Support Hangul syllable names.


Index: unicodedata.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Modules/unicodedata.c,v
retrieving revision 2.20
retrieving revision 2.21
diff -C2 -d -r2.20 -r2.21
*** unicodedata.c	18 Oct 2002 16:11:51 -0000	2.20
--- unicodedata.c	23 Nov 2002 12:22:32 -0000	2.21
***************
*** 1,10 ****
  /* ------------------------------------------------------------------------
  
!    unicodedata -- Provides access to the Unicode 3.0 data base.
  
!    Data was extracted from the Unicode 3.0 UnicodeData.txt file.
  
     Written by Marc-Andre Lemburg (mal@lemburg.com).
     Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
  
     Copyright (c) Corporation for National Research Initiatives.
--- 1,11 ----
  /* ------------------------------------------------------------------------
  
!    unicodedata -- Provides access to the Unicode 3.2 data base.
  
!    Data was extracted from the Unicode 3.2 UnicodeData.txt file.
  
     Written by Marc-Andre Lemburg (mal@lemburg.com).
     Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
+    Modified by Martin v. Löwis (martin@v.loewis.de)
  
     Copyright (c) Corporation for National Research Initiatives.
***************
*** 277,280 ****
--- 278,322 ----
  }
  
+ #define SBase   0xAC00	/* lowest code point treated as a Hangul syllable */
+ #define LBase   0x1100	/* NOTE(review): presumably first leading jamo; unused in the hunks shown */
+ #define VBase   0x1161	/* NOTE(review): presumably first vowel jamo; unused in the hunks shown */
+ #define TBase   0x11A7	/* NOTE(review): presumably one below first trailing jamo; unused in the hunks shown */
+ #define LCount  19	/* rows of column 0 that hold strings */
+ #define VCount  21	/* rows of column 1 that hold strings */
+ #define TCount  28	/* rows of column 2 that hold strings */
+ #define NCount  (VCount*TCount)	/* syllables sharing one L index */
+ #define SCount  (LCount*NCount)	/* total number of syllables */
+ 
+ static char *hangul_syllables[][3] = {	/* [L][0], [V][1], [T][2] name parts; 0 = no entry, never index it */
+     { "G",  "A",   ""   },
+     { "GG", "AE",  "G"  },
+     { "N",  "YA",  "GG" },
+     { "D",  "YAE", "GS" },
+     { "DD", "EO",  "N", },
+     { "R",  "E",   "NJ" },
+     { "M",  "YEO", "NH" },
+     { "B",  "YE",  "D"  },
+     { "BB", "O",   "L"  },
+     { "S",  "WA",  "LG" },
+     { "SS", "WAE", "LM" },
+     { "",   "OE",  "LB" },
+     { "J",  "YO",  "LS" },
+     { "JJ", "U",   "LT" },
+     { "C",  "WEO", "LP" },
+     { "K",  "WE",  "LH" },
+     { "T",  "WI",  "M"  },
+     { "P",  "YU",  "B"  },
+     { "H",  "EU",  "BS" },
+     { 0,    "YI",  "S"  },
+     { 0,    "I",   "SS" },
+     { 0,    0,     "NG" },
+     { 0,    0,     "J"  },
+     { 0,    0,     "C"  },
+     { 0,    0,     "K"  },
+     { 0,    0,     "T"  },
+     { 0,    0,     "P"  },
+     { 0,    0,     "H"  }
+ };
+ 
  static int
  _getucname(Py_UCS4 code, char* buffer, int buflen)
***************
*** 285,288 ****
--- 327,352 ----
      unsigned char* w;
  
+     if (SBase <= code && code < SBase+SCount) {
+ 	/* Hangul syllable.  The upper bound is exclusive: SBase+SCount
+ 	   would yield L == LCount, whose table entry is 0 (not a string). */
+ 	int SIndex = code - SBase;
+ 	int L = SIndex / NCount;
+ 	int V = (SIndex % NCount) / TCount;
+ 	int T = SIndex % TCount;
+ 	if (buflen < 27)
+ 	    /* Worst case: HANGUL SYLLABLE <10chars>. */
+ 	    return 0;
+ 	strcpy(buffer, "HANGUL SYLLABLE ");
+ 	buffer += 16;
+ 	strcpy(buffer, hangul_syllables[L][0]);
+ 	buffer += strlen(hangul_syllables[L][0]);
+ 	strcpy(buffer, hangul_syllables[V][1]);
+ 	buffer += strlen(hangul_syllables[V][1]);
+ 	strcpy(buffer, hangul_syllables[T][2]);
+ 	buffer += strlen(hangul_syllables[T][2]);
+ 	*buffer = '\0';
+ 	return 1;
+     }
+ 
      if (code >= 0x110000)
          return 0;
***************
*** 344,347 ****
--- 408,432 ----
  }
  
+ /* Search rows 0..count-1 of the given hangul_syllables column for the
+    longest entry that is a prefix of str.  On return *len is the matched
+    length and *pos the matching row; with no match they are 0 and -1. */
+ static void
+ find_syllable(const char *str, int *len, int *pos, int count, int column)
+ {
+     int row, best_len = -1, best_row = -1;
+     for (row = 0; row < count; row++) {
+ 	char *cand = hangul_syllables[row][column];
+ 	int cand_len = strlen(cand);
+ 	/* Ties keep the earlier row: only a strictly longer match wins. */
+ 	if (cand_len > best_len && strncmp(str, cand, cand_len) == 0) {
+ 	    best_len = cand_len;
+ 	    best_row = row;
+ 	}
+     }
+     /* best_len is still -1 only when no entry (not even "") matched. */
+     *len = (best_len == -1) ? 0 : best_len;
+     *pos = best_row;
+ }
+ 
  static int
  _getcode(const char* name, int namelen, Py_UCS4* code)
***************
*** 351,354 ****
--- 436,455 ----
      unsigned int i, incr;
  
+     /* Check for hangul syllables: prefix, then L, V and T name parts. */
+     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
+ 	int L, V, T, len;
+ 	const char *pos = name + 16;
+ 	find_syllable(pos, &len, &L, LCount, 0);	/* longest match, column 0 */
+ 	pos += len;
+ 	find_syllable(pos, &len, &V, VCount, 1);	/* longest match, column 1 */
+ 	pos += len;
+ 	find_syllable(pos, &len, &T, TCount, 2);	/* longest match, column 2 */
+ 	pos += len;
+ 	if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
+ 	    *code = SBase + (L*VCount+V)*TCount + T;	/* whole name consumed */
+ 	    return 1;
+ 	}
+     }
+ 
      /* the following is the same as python's dictionary lookup, with
         only minor changes.  see the makeunicodedata script for more
***************
*** 476,477 ****
--- 577,584 ----
          PyModule_AddObject(m, "ucnhash_CAPI", v);
  }
+ 
+ /* 
+ Local variables:
+ c-basic-offset: 4
+ End:
+ */