diff options
Diffstat (limited to 'iconvmodule')
-rw-r--r-- | iconvmodule/.cvsignore | 1 | ||||
-rw-r--r-- | iconvmodule/Makefile | 7 | ||||
-rw-r--r-- | iconvmodule/README.en | 66 | ||||
-rw-r--r-- | iconvmodule/iconvcodec.py | 86 | ||||
-rw-r--r-- | iconvmodule/iconvmodule.c | 190 | ||||
-rw-r--r-- | iconvmodule/setup.py | 19 | ||||
-rw-r--r-- | iconvmodule/test_iconv.py | 8 | ||||
-rw-r--r-- | iconvmodule/test_iconvcodec.py | 4 |
8 files changed, 381 insertions, 0 deletions
diff --git a/iconvmodule/.cvsignore b/iconvmodule/.cvsignore new file mode 100644 index 000000000..0d20b6487 --- /dev/null +++ b/iconvmodule/.cvsignore @@ -0,0 +1 @@ +*.pyc diff --git a/iconvmodule/Makefile b/iconvmodule/Makefile new file mode 100644 index 000000000..70f1198c2 --- /dev/null +++ b/iconvmodule/Makefile @@ -0,0 +1,7 @@ +include ../Makefile.inc + +all: + $(PYTHON) setup.py build --build-platlib=`pwd` + +install: + $(PYTHON) setup.py install --install-lib=$(DESTDIR)/$(PYTHONLIBDIR) diff --git a/iconvmodule/README.en b/iconvmodule/README.en new file mode 100644 index 000000000..83a842033 --- /dev/null +++ b/iconvmodule/README.en @@ -0,0 +1,66 @@ +Iconv-based codec library for Python +==================================== +Written by Martin v. Loewis + +This package provides a set of codecs to Python based on the +underlying iconv library of the operating system, as available on +glibc 2, Solaris, or other Unix variants. It consists of two modules: +iconv and iconvcodec. + +Installation +------------ +To install the module, simply type + +python setup.py install + +This module package requires at least Python 2.0. + +Module iconv +------------ +The iconv module exposes a global function to create iconv objects: + +open(tocode, fromcode) +Return descriptor for character set conversion. If the conversion +of fromcode to tocode is not known to the system, a ValueError is +raised. + +Iconv objects provide a single method to convert a string + +iconv(in[, outlen[, return_unicode[, count_only]]]) +Return the string resulting from the conversion of in. The parameter +in can be a byte or unicode string, or an arbitrary buffer object. +It is the caller's responsibility to guarantee that the internal +representation of the in object indeed uses fromcode of the Iconv +object. The parameter outlen represents an estimate of the resulting +string size (in bytes, or in characters if return_unicode is true). +If the buffer is too small, an exception is thrown. 
If return_unicode +is set, a Unicode object is the result. If count_only is set, +no conversion is attempted, but the number of necessary bytes is +returned. + +In case of an error, the iconv method raises the exception iconv.error. +This exception has four arguments: + - the error string as returned from strerror + - the error number + - the number of input bytes processed + - the output string produced so far + +Module iconvcodec +----------------- +This module encapsulates the iconv module into a set of codecs. To use it, +simply import it. As a result, the C library's codecs will be available: + + unicode("Hello","T.61") + u"World".encode("JOHAB") + +License +------- +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +Author +------ +Martin v. Loewis +loewis@informatik.hu-berlin.de diff --git a/iconvmodule/iconvcodec.py b/iconvmodule/iconvcodec.py new file mode 100644 index 000000000..a887b587e --- /dev/null +++ b/iconvmodule/iconvcodec.py @@ -0,0 +1,86 @@ +import sys, iconv, codecs, errno + +# First we need to find out what the Unicode code set name is +# in this iconv implementation + +if sys.platform.startswith("linux"): + unicodename = "unicode"+sys.byteorder +else: + # may need to try UCS-2, UCS-2-LE/BE, Unicode, ... 
+    raise ImportError,"cannot establish name of 2-byte Unicode"
+
+class Codec(codecs.Codec):
+    def __init__(self):
+        self.encoder = iconv.open(self.codeset,unicodename)
+        self.decoder = iconv.open(unicodename,self.codeset)
+
+    def encode(self, msg, errors = 'strict'):
+        """Encode the Unicode string msg into self.codeset.
+
+        Returns a tuple (encoded string, number of characters of msg
+        consumed).  errors selects the handling of characters that the
+        target code set cannot represent: 'strict' raises UnicodeError,
+        'replace' substitutes "?", 'ignore' drops the character; any
+        other value raises ValueError.
+        """
+        try:
+            return self.encoder.iconv(msg),len(msg)
+        except iconv.error,e:
+            errstring,code,inlen,outres=e.args
+            # inlen is reported in bytes of the 2-byte Unicode input;
+            # turn it into a character index into msg.
+            assert inlen % 2 == 0
+            inlen /= 2
+            if code == errno.E2BIG:
+                # outbuffer was too small, try to encode rest
+                out1,len1 = self.encode(msg[inlen:],errors)
+                return outres+out1, inlen+len1
+            if code == errno.EINVAL:
+                # An incomplete multibyte sequence has been
+                # encountered in the input. Should not happen in Unicode
+                raise AssertionError("EINVAL in encode")
+            if code == errno.EILSEQ:
+                # An invalid multibyte sequence has been encountered
+                # in the input. Used to indicate that the character is
+                # not supported in the target code
+                if errors == 'strict':
+                    # Pass on the arguments of the iconv error, as
+                    # decode does.
+                    raise UnicodeError(*e.args)
+                if errors == 'replace':
+                    out1,len1 = self.encode(u"?"+msg[inlen+1:],errors)
+                    # The "?" stands in for the offending character,
+                    # which is accounted for by the +1 below; do not
+                    # count it a second time.
+                    len1 -= 1
+                elif errors == 'ignore':
+                    out1,len1 = self.encode(msg[inlen+1:],errors)
+                else:
+                    raise ValueError("unsupported error handling")
+                return outres+out1, inlen+1+len1
+            # Any other errno: propagate the iconv error unchanged.
+            raise
+
+    def decode(self, msg, errors = 'strict'):
+        try:
+            return self.decoder.iconv(msg, return_unicode=1),len(msg)
+        except iconv.error,e:
+            errstring,code,inlen,outres = e.args
+            if code == errno.E2BIG:
+                # buffer too small
+                out1,len1 = self.decode(msg[inlen:],errors)
+                return outres+out1, inlen+len1
+            if code == errno.EINVAL:
+                # An incomplete multibyte sequence has been
+                # encountered in the input.
+                return outres,inlen
+            if code == errno.EILSEQ:
+                # An invalid multibyte sequence has been encountered
+                # in the input. 
Ignoring or replacing it is hard to + # achieve, just try one character at a time + if errors == 'strict': + raise UnicodeError(*e.args) + if errors == 'replace': + outres += u'\uFFFD' + out1,len1 = self.decode(msg[inlen:],errors) + elif errors == 'ignore': + out1,len1 = self.decode(msg[inlen:],errors) + else: + raise ValueError("unsupported error handling") + return outres+out1,inlen+len1 + +def lookup(encoding): + class SpecialCodec(Codec):pass + SpecialCodec.codeset = encoding + class Reader(SpecialCodec, codecs.StreamReader):pass + class Writer(SpecialCodec, codecs.StreamWriter):pass + try: + return SpecialCodec().encode,SpecialCodec().decode, Reader, Writer + except ValueError: + return None + +codecs.register(lookup) diff --git a/iconvmodule/iconvmodule.c b/iconvmodule/iconvmodule.c new file mode 100644 index 000000000..6841bc85b --- /dev/null +++ b/iconvmodule/iconvmodule.c @@ -0,0 +1,190 @@ +#include <iconv.h> +#include <Python.h> + +typedef struct { + PyObject_HEAD + iconv_t handle; +} IconvObject; + +static PyObject *error; + +staticforward PyTypeObject Iconv_Type; + +static char iconv_open__doc__[]= +"open(tocode, fromcode) -> iconv handle\n" +"allocate descriptor for character set conversion"; + +static PyObject* +py_iconv_open(PyObject* unused, PyObject* args) +{ + char *tocode, *fromcode; + iconv_t result; + IconvObject *self; + if (!PyArg_ParseTuple(args, "ss", &tocode, &fromcode)) + return NULL; + result = iconv_open(tocode, fromcode); + if (result == (iconv_t)(-1)){ + PyErr_SetFromErrno(PyExc_ValueError); + return NULL; + } + self = PyObject_New(IconvObject, &Iconv_Type); + if (self == NULL){ + iconv_close(result); + return NULL; + } + self->handle = result; + return (PyObject*)self; +} + +static void +Iconv_dealloc(IconvObject *self) +{ + iconv_close(self->handle); + PyObject_Del(self); +} + +static char Iconv_iconv__doc__[]= +"iconv(in[, outlen[, return_unicode[, count_only]]]) -> out\n" +"Convert in to out. 
outlen is the size of the output buffer;\n" +"it defaults to len(in)."; + +static PyObject* +Iconv_iconv(IconvObject *self, PyObject *args, PyObject* kwargs) +{ + PyObject *inbuf_obj; + const char *inbuf; + char *outbuf; + size_t inbuf_size, outbuf_size, iresult; + int inbuf_size_int, outbuf_size_int = -1; + int return_unicode = 0, count_only = 0; + PyObject *result; + static char *kwarg_names[]={ + "s", + "outlen", + "return_unicode", + "count_only", + NULL + }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, + "O|iii:iconv", kwarg_names, + &inbuf_obj, &outbuf_size_int, + &return_unicode, &count_only)) + return NULL; + + if (inbuf_obj == Py_None){ + /* None means to clear the iconv object */ + inbuf = NULL; + inbuf_size_int = 0; + }else if (inbuf_obj->ob_type->tp_as_buffer){ + if (PyObject_AsReadBuffer(inbuf_obj, (const void**)&inbuf, + &inbuf_size_int) == -1) + return NULL; + }else{ + PyErr_SetString(PyExc_TypeError, + "iconv expects string as first argument"); + return NULL; + } + /* If no result size estimate was given, estimate that the result + string is the same size as the input string. */ + if (outbuf_size_int == -1) + outbuf_size_int = inbuf_size_int; + inbuf_size = inbuf_size_int; + if (count_only){ + result = NULL; + outbuf = NULL; + outbuf_size = outbuf_size_int; + }else if(return_unicode){ + /* Allocate the result string. */ + result = PyUnicode_FromUnicode(NULL, outbuf_size_int); + outbuf = (char*)PyUnicode_AS_UNICODE(result); + outbuf_size = outbuf_size_int*2; + }else{ + /* Allocate the result string. */ + result = PyString_FromStringAndSize(NULL, outbuf_size_int); + if (!result) + return NULL; + outbuf = PyString_AS_STRING(result); + outbuf_size = outbuf_size_int; + } + /* Perform the conversion. 
*/ + iresult = iconv(self->handle, &inbuf, &inbuf_size, &outbuf, &outbuf_size); + if (count_only){ + result = PyInt_FromLong(outbuf_size_int-outbuf_size); + }else if (return_unicode) { + /* If the conversion was successful, the result string may be + larger than necessary; outbuf_size will present the extra + bytes. */ + PyUnicode_Resize(&result, outbuf_size_int-outbuf_size/2); + }else{ + _PyString_Resize(&result, outbuf_size_int-outbuf_size); + } + + if (iresult == -1){ + PyObject *exc; + exc = PyObject_CallFunction(error,"siiO", + strerror(errno),errno, + inbuf_size_int - inbuf_size, + result); + Py_DECREF(result); + PyErr_SetObject(error,exc); + return NULL; + } + return result; +} + +static PyMethodDef Iconv_methods[] = { + {"iconv", (PyCFunction)Iconv_iconv, + METH_KEYWORDS|METH_VARARGS, Iconv_iconv__doc__}, + {NULL, NULL} /* sentinel */ +}; + +static PyObject * +Iconv_getattr(PyObject *self, char *name) +{ + return Py_FindMethod(Iconv_methods, self, name); +} + +statichere PyTypeObject Iconv_Type = { + PyObject_HEAD_INIT(NULL) + 0, /*ob_size*/ + "Iconv", /*tp_name*/ + sizeof(IconvObject), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + /* methods */ + (destructor)Iconv_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + (getattrfunc)Iconv_getattr, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ +}; + +static PyMethodDef iconv_methods[] = { + {"open", py_iconv_open, + METH_VARARGS, iconv_open__doc__}, + {NULL, NULL} /* sentinel */ +}; + +static char __doc__[]= +"The iconv module provides an interface to the iconv library."; + +DL_EXPORT(void) +initiconv(void) +{ + PyObject *m, *d; + + Iconv_Type.ob_type = &PyType_Type; + + /* Create the module and add the functions */ + m = Py_InitModule4("iconv", iconv_methods, __doc__, + NULL, PYTHON_API_VERSION); + + /* Add some symbolic constants to the module */ + d = PyModule_GetDict(m); + error = 
PyErr_NewException("iconv.error", PyExc_ValueError, NULL); + PyDict_SetItemString(d, "error", error); +} diff --git a/iconvmodule/setup.py b/iconvmodule/setup.py new file mode 100644 index 000000000..c5ff5d6eb --- /dev/null +++ b/iconvmodule/setup.py @@ -0,0 +1,19 @@ +from distutils.core import setup, Extension + +setup (name = "iconv", + version = "1.0", + description = "iconv-based Unicode converter", + author = "Martin v. Loewis", + author_email = "loewis@informatik.hu-berlin.de", + url = "http://sourceforge.net/projects/python-codecs/", + long_description = +"""The iconv module exposes the operating system's iconv character +conversion routine to Python. This package provides an iconv wrapper +as well as a Python codec to convert between Unicode objects and +all iconv-provided encodings. +""", + + py_modules = ['iconvcodec'], + ext_modules = [Extension("iconv",sources=["iconvmodule.c"])] + ) + diff --git a/iconvmodule/test_iconv.py b/iconvmodule/test_iconv.py new file mode 100644 index 000000000..d8c6f8325 --- /dev/null +++ b/iconvmodule/test_iconv.py @@ -0,0 +1,8 @@ +import iconv +s=iconv.open("unicodelittle","iso-8859-1") +r=s.iconv("Hallo",11,return_unicode=1) +print repr(r),len(r) + +s=iconv.open("iso-8859-1","unicodelittle") +r=s.iconv(u"Hallo",110) +print r diff --git a/iconvmodule/test_iconvcodec.py b/iconvmodule/test_iconvcodec.py new file mode 100644 index 000000000..f4567ec4a --- /dev/null +++ b/iconvmodule/test_iconvcodec.py @@ -0,0 +1,4 @@ +import iconvcodec + +print u"Hallo".encode("T.61") +print repr(unicode("Hallo","T.61")) |