How to print first (national) char from unicode string encoded in utf-8?
sniipe at gmail.com
sniipe at gmail.com
Tue Sep 2 04:17:21 EDT 2008
On 2 Wrz, 06:05, "Mark Tolonen" <M8R-yft... at mailinator.com> wrote:
> "Marco Bizzarri" <marco.bizza... at gmail.com> wrote in message
>
> news:mailman.331.1220276398.3487.python-list at python.org...
>
>
>
> > On Mon, Sep 1, 2008 at 3:25 PM, <sni... at gmail.com> wrote:
>
> >> When I do ${urllib.unquote(c.user.firstName)} without encoding to
> >> latin-1 I got different chars than I will get: no Łukasz but Å ukasz
> >> --
> >>http://mail.python.org/mailman/listinfo/python-list
>
> > That's crazy. "string".encode('latin1') gives you a latin1 encoded
> > string; latin1 is a single byte encoding, therefore taking the first
> > byte should be no problem.
>
> > Have you tried:
>
> > urllib.unquote(c.user.firstName)[0].encode('latin1') or
>
> > urllib.unquote(c.user.firstName)[0].encode('utf8')
>
> > I'm assuming here that the urllib.unquote(c.user.firstName) returns an
> > encodable string (which I'm absolutely not sure), but if it does, this
> > should take the first 'character'.
>
> The OP stated that the original string was "encoded in UTF-8 and
> urllib.quote()", so after urllib.unquote the string is in UTF-8 format.
> This must be decoded into a Unicode string before removing the first
> character:
>
> urllib.unquote(c.user.firstName).decode('utf-8')[0]
>
> The next problem is that the character in the OP's example string 'Ł' is not
> present in the latin-1 encoding, but using utf-8 encoding demonstrates that
> the full two-byte UTF-8 encoded character is collected:
>
> >>> import urllib
> >>> name = urllib.quote(u'Łukasz'.encode('utf-8'))
> >>> name
> '%C5%81ukasz'
> >>> urllib.unquote(name).decode('utf-8')[0].encode('utf-8')
> '\xc5\x81'
>
> -Mark
@Mark, when I tried
urllib.unquote(c.user.firstName).decode('utf-8')[0].encode('utf-8'),
I received this message:
>> return render('/reports/create_report_step2.mako')
Module pylons.templating:344 in render
<< **cache_args)
return pylons.buffet.render(template_name=template,
fragment=fragment,
format=format, namespace=kargs,
**cache_args)
>> format=format, namespace=kargs, **cache_args)
Module pylons.templating:229 in render
<< log.debug("Rendering template %s with engine %s",
full_path, engine_name)
return engine_config['engine'].render(namespace,
template=full_path,
**options)>> **options)
Module mako.ext.turbogears:49 in render
<< info.update(self.extra_vars_func())
return template.render(**info)
>> return template.render(**info)
Module mako.template:114 in render
<< declared by this template's internal rendering method are
also pulled from the given *args, **data
members. members."""
return runtime._render(self, self.callable_, args, data)
def render_unicode(self, *args, **data):>> return
runtime._render(self, self.callable_, args, data)
Module mako.runtime:287 in _render
<< context = Context(buf, **data)
context._with_template = template
_render_context(template, callable_, context, *args,
**_kwargs_for_callable(callable_, data))
return context.pop_buffer().getvalue()>>
_render_context(template, callable_, context, *args,
**_kwargs_for_callable(callable_, data))
Module mako.runtime:304 in _render_context
<< # if main render method, call from the base of the
inheritance stack
(inherit, lclcontext) = _populate_self_namespace(context,
tmpl)
_exec_template(inherit, lclcontext, args=args,
kwargs=kwargs)
else:
# otherwise, call the actual rendering method specified>>
_exec_template(inherit, lclcontext, args=args, kwargs=kwargs)
Module mako.runtime:337 in _exec_template
<< error_template.render_context(context,
error=error)
else:
callable_(context, *args, **kwargs)>> callable_(context,
*args, **kwargs)
Module _reports_create_report_step2_mako:57 in render_body
<<
context.write(filters.decode.utf8(urllib.unquote(str(c.period.end))))
context.write(u' + ')
context.write(filters.decode.utf8(urllib.unquote(c.user.firstName).decode('utf-8')
[0].encode('utf-8')))
context.write(filters.decode.utf8(urllib.unquote(str(c.user.secondName)
[0:1])))
context.write(u'</h3>\r\n <input type="hidden"
name="works[]" value="')>>
context.write(filters.decode.utf8(urllib.unquote(c.user.firstName).decode('utf-8')
[0].encode('utf-8')))
Module encodings.utf_8:16 in decode
<<
def decode(input, errors='strict'):
return codecs.utf_8_decode(input, errors, True)
class IncrementalEncoder(codecs.IncrementalEncoder):>> return
codecs.utf_8_decode(input, errors, True)
<type 'exceptions.UnicodeEncodeError'>: 'ascii' codec can't encode
characters in position 0-1: ordinal not in range(128)
More information about the Python-list
mailing list