Decode / Encode string utils for openstack

Some clients currently lack support for non-ASCII characters. This patch introduces 2 functions (strutils.py) that will help clients and servers "safely" encode and decode strings. About the safe_decode / safe_encode functions: they both first try the encoding used by stdin (or Python's default encoding if that is None) and fall back to UTF-8 if that encoding fails to decode a given text. Neither of them will try to encode / decode non-basestring objects; both raise a TypeError if one is passed. Use case: This is currently being used in glanceclient. I5c3ea93a716edfe284d19f6291d4e36028f91eb2 Needed For: * Bug 1061156 * Bug 1130572 Change-Id: I78960dfdb6159fd600a6f5e5551ab5d5a3366ab5
author: Flaper Fesp <flaper87@gmail.com> 2013-01-24 13:33:45 +0100
committer: Flaper Fesp <flaper87@gmail.com> 2013-02-25 17:52:07 +0100
commit: bd5dad97585208ea5e86d636f3dc3b669e361a41 (patch)
tree: fb58c80f702dea421f3b712221875a6924826cd2
parent: 15377750465b6eb261d2354988b9c90f1f3c1d29 (diff)
download: oslo-bd5dad97585208ea5e86d636f3dc3b669e361a41.tar.gz
oslo-bd5dad97585208ea5e86d636f3dc3b669e361a41.tar.xz
oslo-bd5dad97585208ea5e86d636f3dc3b669e361a41.zip
2 files changed, 107 insertions, 1 deletions
diff --git a/openstack/common/strutils.py b/openstack/common/strutils.py
index 05f0e9f..7813b64 100644
--- a/openstack/common/strutils.py
+++ b/openstack/common/strutils.py
@@ -20,6 +20,7 @@ System-level utilities and helper functions.
 """
 
 import logging
+import sys
 
 LOG = logging.getLogger(__name__)
 
@@ -57,3 +58,76 @@ def bool_from_string(subject):
         if subject.strip().lower() in ('true', 'on', 'yes', '1'):
             return True
     return False
+
+
+def safe_decode(text, incoming=None, errors='strict'):
+    """
+    Decodes incoming str using `incoming` if it is
+    not already unicode.
+
+    :param incoming: Text's current encoding
+    :param errors: Errors handling policy. See here for valid
+        values http://docs.python.org/2/library/codecs.html
+    :returns: text or a unicode `incoming` encoded
+                representation of it.
+    :raises TypeError: If text is not an instance of basestring
+    """
+    if not isinstance(text, basestring):
+        raise TypeError("%s can't be decoded" % type(text))
+
+    if isinstance(text, unicode):
+        return text
+
+    if not incoming:
+        incoming = (sys.stdin.encoding or
+                    sys.getdefaultencoding())
+
+    try:
+        return text.decode(incoming, errors)
+    except UnicodeDecodeError:
+        # Note(flaper87) If we get here, it means that
+        # sys.stdin.encoding / sys.getdefaultencoding
+        # didn't return a suitable encoding to decode
+        # text. This happens mostly when global LANG
+        # var is not set correctly and there's no
+        # default encoding. In this case, most likely
+        # python will use ASCII or ANSI encoders as
+        # default encodings but they won't be capable
+        # of decoding non-ASCII characters.
+        #
+        # Also, UTF-8 is being used since it's an ASCII
+        # extension.
+        return text.decode('utf-8', errors)
+
+
+def safe_encode(text, incoming=None,
+                encoding='utf-8', errors='strict'):
+    """
+    Encodes incoming str/unicode using `encoding`. If
+    incoming is not specified, text is expected to
+    be encoded with `sys.stdin.encoding`, falling back to
+    python's default encoding (`sys.getdefaultencoding`).
+
+    :param incoming: Text's current encoding
+    :param encoding: Expected encoding for text (Default UTF-8)
+    :param errors: Errors handling policy. See here for valid
+        values http://docs.python.org/2/library/codecs.html
+    :returns: text or a bytestring `encoding` encoded
+                representation of it.
+    :raises TypeError: If text is not an instance of basestring
+    """
+    if not isinstance(text, basestring):
+        raise TypeError("%s can't be encoded" % type(text))
+
+    if not incoming:
+        incoming = (sys.stdin.encoding or
+                    sys.getdefaultencoding())
+
+    if isinstance(text, unicode):
+        return text.encode(encoding, errors)
+    elif text and encoding != incoming:
+        # Decode text before encoding it with `encoding`
+        text = safe_decode(text, incoming, errors)
+        return text.encode(encoding, errors)
+
+    return text
diff --git a/tests/unit/test_strutils.py b/tests/unit/test_strutils.py
index 6995427..891a045 100644
--- a/tests/unit/test_strutils.py
+++ b/tests/unit/test_strutils.py
@@ -17,7 +17,6 @@
 
 import mock
 
-from openstack.common import exception
 from openstack.common import strutils
 from tests import utils
 
@@ -73,3 +72,36 @@ class StrUtilsTest(utils.BaseTestCase):
     def test_int_from_bool_as_string(self):
         self.assertEqual(1, strutils.int_from_bool_as_string(True))
         self.assertEqual(0, strutils.int_from_bool_as_string(False))
+
+    def test_safe_decode(self):
+        safe_decode = strutils.safe_decode
+        self.assertRaises(TypeError, safe_decode, True)
+        self.assertEqual(u'ni\xf1o', safe_decode("ni\xc3\xb1o",
+                                                 incoming="utf-8"))
+        self.assertEqual(u"test", safe_decode("dGVzdA==",
+                                              incoming='base64'))
+
+        self.assertEqual(u"strange", safe_decode('\x80strange',
+                                                 errors='ignore'))
+
+        self.assertEqual(u'\xc0', safe_decode('\xc0',
+                                              incoming='iso-8859-1'))
+
+        # Forcing incoming to ascii so it falls back to utf-8
+        self.assertEqual(u'ni\xf1o', safe_decode('ni\xc3\xb1o',
+                                                 incoming='ascii'))
+
+    def test_safe_encode(self):
+        safe_encode = strutils.safe_encode
+        self.assertRaises(TypeError, safe_encode, True)
+        self.assertEqual("ni\xc3\xb1o", safe_encode(u'ni\xf1o',
+                                                    encoding="utf-8"))
+        self.assertEqual("dGVzdA==\n", safe_encode("test",
+                                                   encoding='base64'))
+        self.assertEqual('ni\xf1o', safe_encode("ni\xc3\xb1o",
+                                                encoding="iso-8859-1",
+                                                incoming="utf-8"))
+
+        # Forcing incoming to ascii so it falls back to utf-8
+        self.assertEqual('ni\xc3\xb1o', safe_encode('ni\xc3\xb1o',
+                                                    incoming='ascii'))
author	Flaper Fesp <flaper87@gmail.com>	2013-01-24 13:33:45 +0100
committer	Flaper Fesp <flaper87@gmail.com>	2013-02-25 17:52:07 +0100
commit	bd5dad97585208ea5e86d636f3dc3b669e361a41 (patch)
tree	fb58c80f702dea421f3b712221875a6924826cd2
parent	15377750465b6eb261d2354988b9c90f1f3c1d29 (diff)
download	oslo-bd5dad97585208ea5e86d636f3dc3b669e361a41.tar.gz oslo-bd5dad97585208ea5e86d636f3dc3b669e361a41.tar.xz oslo-bd5dad97585208ea5e86d636f3dc3b669e361a41.zip