1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
|
# Deliberate guard: the module is disabled for now, so importing it fails
# immediately and none of the statements below are executed.
raise RuntimeError("Don't use me right now!")
import sys, iconv, codecs, errno
# First we need to find out what the Unicode code set name is
# in this iconv implementation.
if sys.platform.startswith("linux"):
    # On Linux the name is "unicode" followed by the host byte order
    # (sys.byteorder is "little" or "big").
    unicodename = "unicode" + sys.byteorder
else:
    # may need to try UCS-2, UCS-2-LE/BE, Unicode, ...
    raise ImportError("cannot establish name of 2-byte Unicode")
class Codec(codecs.Codec):
    """Codec whose encode/decode methods delegate to iconv descriptors.

    The class reads a ``codeset`` attribute that it does not define itself;
    a subclass (or the instance) has to provide the iconv code set name
    before ``__init__`` runs.
    """

    def __init__(self):
        """Open one iconv descriptor for each conversion direction."""
        self.encoder = iconv.open(self.codeset, unicodename)
        self.decoder = iconv.open(unicodename, self.codeset)

    def encode(self, msg, errors='strict'):
        """Convert the Unicode string *msg* to the target code set.

        Returns a tuple ``(output, consumed)`` where *consumed* is the
        number of characters of *msg* that were processed.

        *errors* selects what happens on an unencodable character:
        ``'strict'`` raises UnicodeError, ``'replace'`` substitutes ``"?"``,
        ``'ignore'`` drops the character; any other value raises ValueError.
        iconv errors with an unrecognised code are re-raised unchanged.
        """
        try:
            return self.encoder.iconv(msg), len(msg)
        except iconv.error as e:
            errstring, code, inlen, outres = e.args
            # The code treats inlen as a byte count over 2-byte Unicode
            # units; convert it to a character index into msg.
            assert inlen % 2 == 0
            inlen //= 2
            if code == errno.E2BIG:
                # outbuffer was too small, try to encode rest
                out1, len1 = self.encode(msg[inlen:], errors)
                return outres + out1, inlen + len1
            if code == errno.EINVAL:
                # An incomplete multibyte sequence has been
                # encountered in the input. Should not happen in Unicode
                raise AssertionError("EINVAL in encode")
            if code == errno.EILSEQ:
                # An invalid multibyte sequence has been encountered
                # in the input. Used to indicate that the character is
                # not supported in the target code
                if errors == 'strict':
                    raise UnicodeError(*e.args)
                if errors == 'replace':
                    # "?" stands in for the offending character, so the
                    # recursive count already accounts for it.
                    out1, len1 = self.encode(u"?" + msg[inlen + 1:], errors)
                    return outres + out1, inlen + len1
                if errors == 'ignore':
                    # Skip the offending character and count it as consumed.
                    out1, len1 = self.encode(msg[inlen + 1:], errors)
                    return outres + out1, inlen + 1 + len1
                raise ValueError("unsupported error handling")
            # Unknown iconv failure: let the caller see it.
            raise

    def decode(self, msg, errors='strict'):
        """Convert the byte string *msg* from the code set to Unicode.

        Returns a tuple ``(output, consumed)`` where *consumed* is the
        number of bytes of *msg* that were processed. An incomplete trailing
        multibyte sequence is left unconsumed so that *consumed* may be less
        than ``len(msg)``.

        *errors* selects what happens on an invalid sequence: ``'strict'``
        raises UnicodeError, ``'replace'`` emits U+FFFD, ``'ignore'`` drops
        the byte; any other value raises ValueError. iconv errors with an
        unrecognised code are re-raised unchanged.
        """
        try:
            return self.decoder.iconv(msg, return_unicode=1), len(msg)
        except iconv.error as e:
            errstring, code, inlen, outres = e.args
            if code == errno.E2BIG:
                # buffer too small
                out1, len1 = self.decode(msg[inlen:], errors)
                return outres + out1, inlen + len1
            if code == errno.EINVAL:
                # An incomplete multibyte sequence has been
                # encountered in the input.
                return outres, inlen
            if code == errno.EILSEQ:
                # An invalid multibyte sequence has been encountered
                # in the input. Ignoring or replacing it is hard to
                # achieve, just try one character at a time
                if errors == 'strict':
                    raise UnicodeError(*e.args)
                if errors == 'replace':
                    outres += u'\uFFFD'
                elif errors != 'ignore':
                    raise ValueError("unsupported error handling")
                # Step over the offending byte before retrying; restarting
                # at msg[inlen:] would hit the same byte again and never
                # make progress.
                # NOTE(review): assumes inlen is the number of bytes iconv
                # consumed before the error - confirm against the iconv
                # extension module.
                out1, len1 = self.decode(msg[inlen + 1:], errors)
                return outres + out1, inlen + 1 + len1
            # Unknown iconv failure: let the caller see it instead of
            # silently returning None.
            raise
def lookup(encoding):
    """Codec search function building an iconv-backed codec for *encoding*.

    Returns the 4-tuple ``(encode, decode, Reader, Writer)``, or ``None``
    when constructing the codec raises ValueError.
    """
    class SpecialCodec(Codec):
        pass
    SpecialCodec.codeset = encoding

    # Codec.__init__ takes no arguments but comes first in the lookup order,
    # so the stream classes need their own __init__ that initialises both
    # bases; otherwise Reader(stream, errors) fails with TypeError.
    class Reader(SpecialCodec, codecs.StreamReader):
        def __init__(self, stream, errors='strict'):
            Codec.__init__(self)
            codecs.StreamReader.__init__(self, stream, errors)

    class Writer(SpecialCodec, codecs.StreamWriter):
        def __init__(self, stream, errors='strict'):
            Codec.__init__(self)
            codecs.StreamWriter.__init__(self, stream, errors)

    try:
        # One instance is enough: encode and decode use separate
        # descriptors held by the same object.
        codec = SpecialCodec()
    except ValueError:
        return None
    return codec.encode, codec.decode, Reader, Writer
# Register lookup as a codec search function with the codecs module.
codecs.register(lookup)
|