split string with hieroglyphs

Belize uss.japan at gmail.com
Sun Dec 24 05:20:15 EST 2006


Steven, thanks! Very nice algorithm.
Here is the code:


#!/usr/bin/env python
# -*- coding: utf_8 -*-

# Thanks Steven D'Aprano for hints

import unicodedata
import MySQLdb

# MySQL connection settings
mysql_host = "localhost"
mysql_user = "dict"
mysql_password = "passwd"
mysql_db = "dictionary"

# Open the connection and the module-level cursor ``cur`` that
# translate_hieroglyph() uses for its dictionary lookups.
try:
    mysql_conn = MySQLdb.connect(mysql_host, mysql_user, mysql_password,
                                 mysql_db)
    cur = mysql_conn.cursor()
    # Ask the server to exchange text with this session as UTF-8.
    cur.execute("""SET NAMES UTF8""")
except MySQLdb.Error:
    # Without a cursor nothing below can run, so stop here with a clear
    # message instead of failing later with a NameError on ``cur``.
    raise SystemExit("unable to connect to MySQL, check connection")

# Sample input mixing ASCII and katakana.  A unicode literal is decoded
# using the file's declared source encoding, so no separate decode step
# is needed.
jap_text = u"BZツーリTVツキDVD?"
# Fight the full-width / half-width katakana madness :-)
# NFKC normalization replaces compatibility characters (such as
# half-width katakana) with their canonical equivalents.
jap_text = unicodedata.normalize('NFKC', jap_text)
# The rest of the script works on UTF-8 encoded byte strings.
jap_text = jap_text.encode('utf-8')

def translate_hieroglyph(jap_text):
    """Look up the English translation of a Japanese fragment.

    Queries the ``dictionary`` table through the module-level cursor
    ``cur`` for the first row whose ``Jis`` column matches *jap_text*
    under the ``utf8_unicode_ci`` collation.

    Returns the row's ``Eng`` value, or *jap_text* unchanged when no row
    matches or the stored translation is empty.
    """
    # Let the driver quote the value: interpolating it into the SQL text
    # breaks on fragments containing a quote and allows SQL injection.
    mysql_translate_query = (
        "SELECT Eng FROM dictionary WHERE Jis=%s "
        "COLLATE utf8_unicode_ci LIMIT 1"
    )
    cur.execute(mysql_translate_query, (jap_text,))
    row = cur.fetchone()
    if row and row[0]:
        return row[0]
    return jap_text

def islatin(s):
    """Return True if *s* contains only ASCII characters, else False.

    *s* may be a byte string or a text (unicode) string.  An empty
    string counts as ASCII.
    """
    try:
        if isinstance(s, bytes):
            # Byte strings must decode cleanly as ASCII.
            s.decode('ascii')
        else:
            # Text strings must encode cleanly to ASCII.
            s.encode('ascii')
    except UnicodeError:
        return False
    return True

def split_fragments(s):
    """Split *s* into maximal runs of ASCII and non-ASCII characters.

    Each element of *s* is classified with islatin(); consecutive
    elements with the same classification are joined into one fragment.
    Fragments are returned in their original order, so joining them
    with '' gives back *s*.

    Returns an empty list for empty input.
    """
    fragments = []
    current = []
    current_is_latin = None
    for c in s:
        c_is_latin = bool(islatin(c))
        # A change of classification closes the run collected so far.
        if current and c_is_latin != current_is_latin:
            fragments.append(''.join(current))
            current = []
        current.append(c)
        current_is_latin = c_is_latin
    # Flush the trailing run; without this we lose the last fragment.
    if current:
        fragments.append(''.join(current))
    return fragments

# Break the normalized sample text into alternating ASCII / non-ASCII runs.
fragments = split_fragments(jap_text)

def join_fragments(fragments):
    """Translate the non-ASCII fragments and join everything with spaces.

    Fragments for which islatin() is true are kept as they are; every
    other fragment is replaced by the result of translate_hieroglyph().
    Returns the pieces joined by single spaces.
    """
    def render(piece):
        # ASCII passes through untouched; anything else is looked up.
        if islatin(piece):
            return piece
        return translate_hieroglyph(piece)

    return ' '.join(map(render, fragments))

# Print the fragments joined by single spaces, with each non-ASCII
# fragment passed through translate_hieroglyph().
print join_fragments(fragments)


home at my ~/Src/Code $ python translate.py
BZ navigation TV display DVD?

Works as needed :-) Thanks again!




More information about the Python-list mailing list