diff options
| author | Flaper Fesp <flaper87@gmail.com> | 2013-01-24 13:33:45 +0100 |
|---|---|---|
| committer | Flaper Fesp <flaper87@gmail.com> | 2013-02-25 17:52:07 +0100 |
| commit | bd5dad97585208ea5e86d636f3dc3b669e361a41 (patch) | |
| tree | fb58c80f702dea421f3b712221875a6924826cd2 | |
| parent | 15377750465b6eb261d2354988b9c90f1f3c1d29 (diff) | |
| download | oslo-bd5dad97585208ea5e86d636f3dc3b669e361a41.tar.gz oslo-bd5dad97585208ea5e86d636f3dc3b669e361a41.tar.xz oslo-bd5dad97585208ea5e86d636f3dc3b669e361a41.zip | |
Decode / Encode string utils for openstack
Currently some clients lack support for non-ASCII characters. This patch
introduces 2 functions (strutils.py) that will help clients and servers
to "safely" encode and decode strings.
About the safe_(decode|encode) functions:
They both first try the encoding used by stdin (or Python's
default encoding if that's None) and fall back to UTF-8 if that
encoding fails to decode a given text.
Neither of them will try to encode / decode non-basestring objects;
both raise a TypeError if one is passed.
Use case:
This is currently being used in glanceclient. I5c3ea93a716edfe284d19f6291d4e36028f91eb2
Needed For:
* Bug 1061156
* Bug 1130572
Change-Id: I78960dfdb6159fd600a6f5e5551ab5d5a3366ab5
| -rw-r--r-- | openstack/common/strutils.py | 74 | ||||
| -rw-r--r-- | tests/unit/test_strutils.py | 34 |
2 files changed, 107 insertions, 1 deletions
diff --git a/openstack/common/strutils.py b/openstack/common/strutils.py index 05f0e9f..7813b64 100644 --- a/openstack/common/strutils.py +++ b/openstack/common/strutils.py @@ -20,6 +20,7 @@ System-level utilities and helper functions. """ import logging +import sys LOG = logging.getLogger(__name__) @@ -57,3 +58,76 @@ def bool_from_string(subject): if subject.strip().lower() in ('true', 'on', 'yes', '1'): return True return False + + +def safe_decode(text, incoming=None, errors='strict'): + """ + Decodes incoming str using `incoming` if they're + not already unicode. + + :param incoming: Text's current encoding + :param errors: Errors handling policy. See here for valid + values http://docs.python.org/2/library/codecs.html + :returns: text or a unicode `incoming` encoded + representation of it. + :raises TypeError: If text is not an isntance of basestring + """ + if not isinstance(text, basestring): + raise TypeError("%s can't be decoded" % type(text)) + + if isinstance(text, unicode): + return text + + if not incoming: + incoming = (sys.stdin.encoding or + sys.getdefaultencoding()) + + try: + return text.decode(incoming, errors) + except UnicodeDecodeError: + # Note(flaper87) If we get here, it means that + # sys.stdin.encoding / sys.getdefaultencoding + # didn't return a suitable encoding to decode + # text. This happens mostly when global LANG + # var is not set correctly and there's no + # default encoding. In this case, most likely + # python will use ASCII or ANSI encoders as + # default encodings but they won't be capable + # of decoding non-ASCII characters. + # + # Also, UTF-8 is being used since it's an ASCII + # extension. + return text.decode('utf-8', errors) + + +def safe_encode(text, incoming=None, + encoding='utf-8', errors='strict'): + """ + Encodes incoming str/unicode using `encoding`. If + incoming is not specified, text is expected to + be encoded with current python's default encoding. 
+ (`sys.getdefaultencoding`) + + :param incoming: Text's current encoding + :param encoding: Expected encoding for text (Default UTF-8) + :param errors: Errors handling policy. See here for valid + values http://docs.python.org/2/library/codecs.html + :returns: text or a bytestring `encoding` encoded + representation of it. + :raises TypeError: If text is not an isntance of basestring + """ + if not isinstance(text, basestring): + raise TypeError("%s can't be encoded" % type(text)) + + if not incoming: + incoming = (sys.stdin.encoding or + sys.getdefaultencoding()) + + if isinstance(text, unicode): + return text.encode(encoding, errors) + elif text and encoding != incoming: + # Decode text before encoding it with `encoding` + text = safe_decode(text, incoming, errors) + return text.encode(encoding, errors) + + return text diff --git a/tests/unit/test_strutils.py b/tests/unit/test_strutils.py index 6995427..891a045 100644 --- a/tests/unit/test_strutils.py +++ b/tests/unit/test_strutils.py @@ -17,7 +17,6 @@ import mock -from openstack.common import exception from openstack.common import strutils from tests import utils @@ -73,3 +72,36 @@ class StrUtilsTest(utils.BaseTestCase): def test_int_from_bool_as_string(self): self.assertEqual(1, strutils.int_from_bool_as_string(True)) self.assertEqual(0, strutils.int_from_bool_as_string(False)) + + def test_safe_decode(self): + safe_decode = strutils.safe_decode + self.assertRaises(TypeError, safe_decode, True) + self.assertEqual(u'ni\xf1o', safe_decode("ni\xc3\xb1o", + incoming="utf-8")) + self.assertEqual(u"test", safe_decode("dGVzdA==", + incoming='base64')) + + self.assertEqual(u"strange", safe_decode('\x80strange', + errors='ignore')) + + self.assertEqual(u'\xc0', safe_decode('\xc0', + incoming='iso-8859-1')) + + # Forcing incoming to ascii so it falls back to utf-8 + self.assertEqual(u'ni\xf1o', safe_decode('ni\xc3\xb1o', + incoming='ascii')) + + def test_safe_encode(self): + safe_encode = strutils.safe_encode + 
self.assertRaises(TypeError, safe_encode, True) + self.assertEqual("ni\xc3\xb1o", safe_encode(u'ni\xf1o', + encoding="utf-8")) + self.assertEqual("dGVzdA==\n", safe_encode("test", + encoding='base64')) + self.assertEqual('ni\xf1o', safe_encode("ni\xc3\xb1o", + encoding="iso-8859-1", + incoming="utf-8")) + + # Forcing incoming to ascii so it falls back to utf-8 + self.assertEqual('ni\xc3\xb1o', safe_encode('ni\xc3\xb1o', + incoming='ascii')) |
