Source code for n6sdk.encoding_helpers

# -*- coding: utf-8 -*-

# Copyright (c) 2013-2016 NASK. All rights reserved.
#
# For some parts of the source code of the provide_surrogateescape() function:
# Copyright (c) 2011-2013 Victor Stinner. All rights reserved.
# (For more information -- see the provide_surrogateescape()'s docstring.)


class AsciiMixIn(object):

    r"""
    A mix-in that makes the string conversions of a class ASCII-only.

    The :meth:`__str__`, :meth:`__unicode__` and :meth:`__format__`
    special methods defined here delegate to the next class in the MRO
    and pass the obtained result through :func:`ascii_str`.

    >>> class SomeBase(object):
    ...     def __str__(self):
    ...         return 'Cośtam-cośtam'
    ...     def __format__(self, fmt):
    ...         return 'Nó i ' + fmt
    ...
    >>> class MyClass(AsciiMixIn, SomeBase):
    ...     pass
    ...
    >>> obj = MyClass()

    >>> str(obj)
    'Co\\u015btam-co\\u015btam'
    >>> unicode(obj)
    u'Co\\u015btam-co\\u015btam'
    >>> format(obj)
    'N\\xf3 i '
    >>> 'Oto {0:ś}'.format(obj)
    'Oto N\\xf3 i \\u015b'
    >>> u'Oto {0:\\u015b}'.format(obj)  # unicode format string
    u'Oto N\\xf3 i \\u015b'
    >>> 'Oto {0!s}'.format(obj)
    'Oto Co\\u015btam-co\\u015btam'
    >>> 'Oto %s' % obj
    'Oto Co\\u015btam-co\\u015btam'
    >>> u'Oto %s' % obj                 # unicode format string
    u'Oto Co\\u015btam-co\\u015btam'
    """

    def __str__(self):
        # Let the next class in the MRO produce the text, then escape it.
        inherited_text = super(AsciiMixIn, self).__str__()
        return ascii_str(inherited_text)

    def __unicode__(self):
        parent = super(AsciiMixIn, self)
        try:
            convert = parent.__unicode__
        except AttributeError:
            # No __unicode__() further in the MRO -- use __str__() instead.
            convert = parent.__str__
        # The escaped result is pure ASCII, so decoding it is always safe.
        ascii_only = ascii_str(convert())
        return ascii_only.decode('ascii')

    def __format__(self, fmt):
        # The format spec is escaped as well (it may be a non-ASCII str
        # or a unicode string) before it is handed over to the parent.
        ascii_fmt = ascii_str(fmt)
        formatted = super(AsciiMixIn, self).__format__(ascii_fmt)
        return ascii_str(formatted)
def ascii_str(obj):

    r"""
    Convert any object to a pure-ASCII :class:`str`, escaping the rest.

    The text form of the object is looked for in the following order:
    the object itself if it is a :class:`unicode` string, then
    :class:`str`, then :func:`unicode`, and finally :func:`repr` as the
    last resort.  A :exc:`~exceptions.ValueError` (which includes any
    :exc:`~exceptions.UnicodeError`) raised by a conversion just moves
    the function on to the next one.  Byte strings are decoded as UTF-8
    with the ``surrogateescape`` error handler (so the handler needs to
    be registered -- see :func:`provide_surrogateescape`).

    In the resultant :class:`str` every non-ASCII character is replaced
    with its Python-literal-like escape (``\x...``, ``\u...``,
    ``\U...``).

    >>> ascii_str('')
    ''
    >>> ascii_str(u'')
    ''
    >>> ascii_str('Ala ma kota\nA kot?\n2=2 ')   # pure ASCII str => unchanged
    'Ala ma kota\nA kot?\n2=2 '
    >>> ascii_str(u'Ala ma kota\nA kot?\n2=2 ')
    'Ala ma kota\nA kot?\n2=2 '

    >>> ascii_str(ValueError('Ech, ale błąd!'))   # UTF-8 str => decoded
    'Ech, ale b\\u0142\\u0105d!'
    >>> ascii_str(ValueError(u'Ech, ale b\u0142\u0105d!'))
    'Ech, ale b\\u0142\\u0105d!'

    >>> ascii_str('\xee\xdd \t jaźń')   # non-UTF-8 str => using surrogateescape
    '\\udcee\\udcdd \t ja\\u017a\\u0144'
    >>> ascii_str(u'\udcee\udcdd \t ja\u017a\u0144')
    '\\udcee\\udcdd \t ja\\u017a\\u0144'

    >>> class Nasty(object):
    ...     def __str__(self): raise UnicodeError
    ...     def __unicode__(self): raise UnicodeError
    ...     def __repr__(self): return 'really nas\xc5\xa7y! \xaa'
    ...
    >>> ascii_str(Nasty())
    'really nas\\u0167y! \\udcaa'
    """

    # Already a unicode string: only the escaping step is needed.
    if isinstance(obj, unicode):
        return obj.encode('ascii', 'backslashreplace')

    # Preferred path: the str() form, leniently decoded as UTF-8.
    try:
        byte_form = str(obj)
    except ValueError:
        pass
    else:
        text = byte_form.decode('utf-8', 'surrogateescape')
        return text.encode('ascii', 'backslashreplace')

    # str() failed: try unicode(), and fall back to repr() if that fails too.
    try:
        text = unicode(obj)
    except ValueError:
        text = repr(obj).decode('utf-8', 'surrogateescape')
    return text.encode('ascii', 'backslashreplace')
def as_unicode(obj):

    r"""
    Obtain a :class:`unicode` string from the given object.

    In contrast to :func:`ascii_str`, nothing is escaped here and
    decoding errors are *not* suppressed.

    The given object is expected to be one of:

    * a :class:`unicode` string,
    * a :class:`str` string that can be decoded as UTF-8,
    * an object for which :class:`unicode`, :class:`str` or
      :func:`repr` (tried in this order) gives one of the above kinds
      of strings;

    otherwise :exc:`~exceptions.UnicodeDecodeError` is raised.

    >>> as_unicode(u'')
    u''
    >>> as_unicode('')
    u''

    >>> as_unicode(u'O\u0142\xf3wek') == u'O\u0142\xf3wek'
    True
    >>> as_unicode('O\xc5\x82\xc3\xb3wek') == u'O\u0142\xf3wek'
    True
    >>> as_unicode(ValueError(u'O\u0142\xf3wek')) == u'O\u0142\xf3wek'
    True
    >>> as_unicode(ValueError('O\xc5\x82\xc3\xb3wek')) == u'O\u0142\xf3wek'
    True

    >>> class Hard(object):
    ...     def __str__(self): raise UnicodeError
    ...     def __unicode__(self): raise UnicodeError
    ...     def __repr__(self): return 'foo'
    ...
    >>> as_unicode(Hard())
    u'foo'

    >>> as_unicode('\xdd')  # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
      ...
    UnicodeDecodeError: ...
    """

    # A byte string is decoded strictly; a failure propagates to the caller.
    if isinstance(obj, str):
        return obj.decode('utf-8')

    # First choice: the unicode() conversion.
    try:
        return unicode(obj)
    except ValueError:
        pass

    # Second choice: str() decoded as UTF-8; the last one: repr() decoded
    # as UTF-8 (here a decoding error is no longer intercepted).
    try:
        return str(obj).decode('utf-8')
    except ValueError:
        return repr(obj).decode('utf-8')
def provide_surrogateescape():

    r"""
    Provide the ``surrogateescape`` error handler for bytes-to-unicode
    decoding.

    The source code of the function has been copied from
    https://bitbucket.org/haypo/misc/src/d76f4ff5d27c746c883d40160c8b4fb0891e79f2/python/surrogateescape.py?at=default
    and then adjusted, optimized and commented.  Original code was
    created by Victor Stinner and released by him under the Python
    license and the BSD 2-clause license.

    The ``surrogateescape`` error handler is provided out-of-the-box in
    Python 3 but not in Python 2.  It can be used to convert arbitrary
    binary data to Unicode in a practically non-destructive way.

    .. seealso::

       https://www.python.org/dev/peps/pep-0383.

    This implementation (for Python 2) covers only the decoding part of
    the handler, i.e. the :class:`str`-to-:class:`unicode` conversion.
    The encoding (:class:`unicode`-to-:class:`str`) part is not
    implemented.  Note, however, that once we transformed a binary data
    into a *surrogate-escaped* Unicode data we can (in Python 2) freely
    encode/decode it (:class:`unicode`-to/from-:class:`str`), not using
    ``surrogateescape`` anymore, e.g.:

    >>> # We assume that the function has already been called --
    >>> # as it is imported and called in N6SDK/n6sdk/__init__.py
    >>> b = 'ołówek \xee\xdd'     # utf-8 text + some non-utf-8 mess
    >>> b
    'o\xc5\x82\xc3\xb3wek \xee\xdd'
    >>> u = b.decode('utf-8', 'surrogateescape')
    >>> u
    u'o\u0142\xf3wek \udcee\udcdd'
    >>> b2 = u.encode('utf-8')
    >>> b2                        # now all stuff is utf-8 encoded
    'o\xc5\x82\xc3\xb3wek \xed\xb3\xae\xed\xb3\x9d'
    >>> u2 = b2.decode('utf-8')
    >>> u2 == u
    True

    >>> u.encode('latin2',    # doctest: +IGNORE_EXCEPTION_DETAIL
    ...          'surrogateescape')      # does not work for *encoding*
    Traceback (most recent call last):
      ...
    TypeError: don't know how to handle UnicodeEncodeError in error callback

    This function is idempotent (i.e., it can be called safely multiple
    times -- because if the handler is already registered the function
    returns immediately, without even building its own handler) though
    it is not thread-safe (typically it does not matter as the function
    is supposed to be called somewhere at the beginning of program
    execution).  For the same reason, calling it on an interpreter that
    already provides the handler (such as Python 3) is a harmless no-op.

    .. note::

       This function is called automatically on first import of
       :mod:`n6sdk` module or any of its submodules.

    .. warning::

       In Python 3 (if you were using a Python-3-based application or
       script to handle data produced with Python 2), the ``utf-8``
       codec (as well as other ``utf-...`` codecs) does not decode
       *surrogate-escaped* data encoded to bytes with the Python 2's
       ``utf-8`` codec unless the ``surrogatepass`` error handler is
       used for decoding (on the Python 3 side).

    Returns:
        None.
    """

    import codecs

    # Check first whether there is anything to do.  The handler defined
    # below binds Python-2-only builtins (`unichr`) as default arguments,
    # i.e. at definition time -- so it must not even be defined when the
    # interpreter already has a `surrogateescape` handler.
    try:
        codecs.lookup_error('surrogateescape')
    except LookupError:
        pass
    else:
        return

    def surrogateescape(exc,
                        # to avoid namespace dict lookups:
                        isinstance=isinstance,
                        UnicodeDecodeError=UnicodeDecodeError,
                        ord=ord,
                        unichr=unichr,
                        unicode_join=u''.join):
        # Only the decoding direction is supported (see the docstring).
        if not isinstance(exc, UnicodeDecodeError):
            raise TypeError("don't know how to handle {} in error callback"
                            .format(type(exc).__name__))
        decoded = []
        append_to_decoded = decoded.append
        for ch in exc.object[exc.start:exc.end]:
            code = ord(ch)
            if 0x80 <= code <= 0xFF:
                # an undecodable byte 0xNN becomes the lone surrogate U+DCNN
                append_to_decoded(unichr(0xDC00 + code))
            elif code <= 0x7F:
                # an ASCII byte is passed through unchanged
                append_to_decoded(unichr(code))
            else:
                raise exc
        # the second item tells the codec where to resume decoding
        return (unicode_join(decoded), exc.end)

    codecs.register_error('surrogateescape', surrogateescape)