summaryrefslogtreecommitdiffstats
path: root/iconvmodule
diff options
context:
space:
mode:
Diffstat (limited to 'iconvmodule')
-rw-r--r--iconvmodule/.cvsignore1
-rw-r--r--iconvmodule/Makefile7
-rw-r--r--iconvmodule/README.en66
-rw-r--r--iconvmodule/iconvcodec.py86
-rw-r--r--iconvmodule/iconvmodule.c190
-rw-r--r--iconvmodule/setup.py19
-rw-r--r--iconvmodule/test_iconv.py8
-rw-r--r--iconvmodule/test_iconvcodec.py4
8 files changed, 381 insertions, 0 deletions
diff --git a/iconvmodule/.cvsignore b/iconvmodule/.cvsignore
new file mode 100644
index 000000000..0d20b6487
--- /dev/null
+++ b/iconvmodule/.cvsignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/iconvmodule/Makefile b/iconvmodule/Makefile
new file mode 100644
index 000000000..70f1198c2
--- /dev/null
+++ b/iconvmodule/Makefile
@@ -0,0 +1,7 @@
+# Pull in shared settings from the parent directory.
+# NOTE(review): PYTHON, DESTDIR and PYTHONLIBDIR are presumably defined
+# there or by the caller -- confirm against ../Makefile.inc.
+include ../Makefile.inc
+
+# Build the extension, directing platform-specific build output into
+# the current directory.
+all:
+ $(PYTHON) setup.py build --build-platlib=`pwd`
+
+# Install the modules below $(DESTDIR)/$(PYTHONLIBDIR).
+install:
+ $(PYTHON) setup.py install --install-lib=$(DESTDIR)/$(PYTHONLIBDIR)
diff --git a/iconvmodule/README.en b/iconvmodule/README.en
new file mode 100644
index 000000000..83a842033
--- /dev/null
+++ b/iconvmodule/README.en
@@ -0,0 +1,66 @@
+Iconv-based codec library for Python
+====================================
+Written by Martin v. Loewis
+
+This package provides a set of codecs to Python based on the
+underlying iconv library of the operating system, as available on
+glibc 2, Solaris, or other Unix variants. It consists of two modules:
+iconv and iconvcodec.
+
+Installation
+------------
+To install the module, simply type
+
+python setup.py install
+
+This module package requires at least Python 2.0.
+
+Module iconv
+------------
+The iconv module exposes a global function to create iconv objects:
+
+open(tocode, fromcode)
+Return descriptor for character set conversion. If the conversion
+of fromcode to tocode is not known to the system, a ValueError is
+raised.
+
+Iconv objects provide a single method to convert a string:
+
+iconv(in[, outlen[, return_unicode[, count_only]]])
+Return the string resulting from the conversion of in. The parameter
+in can be a byte or unicode string, or an arbitrary buffer object.
+It is the caller's responsibility to guarantee that the internal
+representation of the in object indeed uses fromcode of the Iconv
+object. The parameter outlen represents an estimate of the resulting
+string size (in bytes, or in characters if return_unicode is true).
+If the buffer is too small, an exception is thrown. If return_unicode
+is set, a Unicode object is the result. If count_only is set,
+no conversion is attempted, but the number of necessary bytes is
+returned.
+
+In case of an error, the iconv method raises the exception iconv.error.
+This exception has four arguments:
+ - the error string as returned from strerror
+ - the error number
+ - the number of input bytes processed
+ - the output string produced so far
+
+Module iconvcodec
+-----------------
+This module encapsulates the iconv module into a set of codecs. To use it,
+simply import it. As a result, the C library's codecs will be available:
+
+ unicode("Hello","T.61")
+ u"World".encode("JOHAB")
+
+License
+-------
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+Author
+------
+Martin v. Loewis
+loewis@informatik.hu-berlin.de
diff --git a/iconvmodule/iconvcodec.py b/iconvmodule/iconvcodec.py
new file mode 100644
index 000000000..a887b587e
--- /dev/null
+++ b/iconvmodule/iconvcodec.py
@@ -0,0 +1,86 @@
+import sys, iconv, codecs, errno
+
+# First we need to find out what the Unicode code set name is
+# in this iconv implementation.  The name is used by Codec below as the
+# "Unicode side" of every conversion.
+
+if sys.platform.startswith("linux"):
+ # Built from the interpreter's byte order, e.g. "unicodelittle".
+ unicodename = "unicode"+sys.byteorder
+else:
+ # may need to try UCS-2, UCS-2-LE/BE, Unicode, ...
+ # Importing the module fails on every non-Linux platform.
+ raise ImportError,"cannot establish name of 2-byte Unicode"
+
+class Codec(codecs.Codec):
+    """Codec converting between Unicode and the iconv code set self.codeset.
+
+    Subclasses must provide the attribute ``codeset`` (the iconv name of
+    the byte encoding) before instances are created.
+    """
+
+    def __init__(self):
+        # iconv.open raises ValueError if the conversion is unknown.
+        self.encoder = iconv.open(self.codeset,unicodename)
+        self.decoder = iconv.open(unicodename,self.codeset)
+
+    def encode(self, msg, errors = 'strict'):
+        """Encode the Unicode string msg.
+
+        Returns a tuple (encoded string, number of characters consumed).
+        errors may be 'strict' (raise UnicodeError), 'replace' (emit "?"
+        for an unencodable character) or 'ignore' (drop it); any other
+        value raises ValueError once an unencodable character is met.
+        """
+        try:
+            return self.encoder.iconv(msg),len(msg)
+        except iconv.error,e:
+            errstring,code,inlen,outres=e.args
+            # iconv reports processed input in bytes; the Unicode input
+            # uses two bytes per character.
+            assert inlen % 2 == 0
+            inlen /= 2
+            if code == errno.E2BIG:
+                # outbuffer was too small, try to encode rest
+                # NOTE(review): if inlen is 0 here no progress was made
+                # and this recursion cannot terminate -- confirm whether
+                # a single character can overflow the default buffer.
+                out1,len1 = self.encode(msg[inlen:],errors)
+                return outres+out1, inlen+len1
+            if code == errno.EINVAL:
+                # An incomplete multibyte sequence has been
+                # encountered in the input. Should not happen in Unicode
+                raise AssertionError("EINVAL in encode")
+            if code == errno.EILSEQ:
+                # An invalid multibyte sequence has been encountered
+                # in the input. Used to indicate that the character is
+                # not supported in the target code
+                if errors == 'strict':
+                    raise UnicodeError(*e.args)
+                if errors == 'replace':
+                    # The "?" takes the place of the offending character,
+                    # so the recursive count already includes it.
+                    out1,len1 = self.encode(u"?"+msg[inlen+1:],errors)
+                    return outres+out1, inlen+len1
+                if errors == 'ignore':
+                    out1,len1 = self.encode(msg[inlen+1:],errors)
+                    return outres+out1, inlen+1+len1
+                raise ValueError("unsupported error handling")
+            raise
+
+    def decode(self, msg, errors = 'strict'):
+        """Decode the byte string msg into a Unicode object.
+
+        Returns a tuple (unicode string, number of bytes consumed).  An
+        incomplete trailing multibyte sequence is left unconsumed.
+        errors is interpreted as in encode(), with U+FFFD as the
+        replacement character.
+        """
+        try:
+            return self.decoder.iconv(msg, return_unicode=1),len(msg)
+        except iconv.error,e:
+            errstring,code,inlen,outres = e.args
+            if code == errno.E2BIG:
+                # buffer too small
+                out1,len1 = self.decode(msg[inlen:],errors)
+                return outres+out1, inlen+len1
+            if code == errno.EINVAL:
+                # An incomplete multibyte sequence has been
+                # encountered in the input.
+                return outres,inlen
+            if code == errno.EILSEQ:
+                # An invalid multibyte sequence has been encountered
+                # in the input. Ignoring or replacing it is hard to
+                # achieve, just try one character at a time
+                if errors == 'strict':
+                    raise UnicodeError(*e.args)
+                if errors == 'replace':
+                    outres += u'\uFFFD'
+                elif errors != 'ignore':
+                    raise ValueError("unsupported error handling")
+                # Skip the offending byte; resuming at msg[inlen] would
+                # hit the same invalid sequence again and never finish.
+                out1,len1 = self.decode(msg[inlen+1:],errors)
+                return outres+out1,inlen+1+len1
+            # Unknown error code: propagate instead of returning None.
+            raise
+
+def lookup(encoding):
+    """Codec search function for codecs.register().
+
+    Builds a Codec subclass bound to the iconv code set named encoding
+    and returns the tuple (encode, decode, Reader, Writer).  Returns
+    None when iconv.open rejects the name with ValueError, so that
+    other registered search functions get a chance.
+    """
+    class SpecialCodec(Codec):pass
+    SpecialCodec.codeset = encoding
+    # Codec.__init__ takes no stream argument and would be found before
+    # the stream classes' constructors, so each stream class needs an
+    # __init__ that runs both.  Codec is referenced through the module
+    # global to avoid depending on nested scopes.
+    class Reader(SpecialCodec, codecs.StreamReader):
+        def __init__(self, stream, errors = 'strict'):
+            Codec.__init__(self)
+            codecs.StreamReader.__init__(self, stream, errors)
+    class Writer(SpecialCodec, codecs.StreamWriter):
+        def __init__(self, stream, errors = 'strict'):
+            Codec.__init__(self)
+            codecs.StreamWriter.__init__(self, stream, errors)
+    try:
+        # One instance is enough: encode and decode use separate handles.
+        codec = SpecialCodec()
+    except ValueError:
+        return None
+    return codec.encode, codec.decode, Reader, Writer
+
+codecs.register(lookup)
diff --git a/iconvmodule/iconvmodule.c b/iconvmodule/iconvmodule.c
new file mode 100644
index 000000000..6841bc85b
--- /dev/null
+++ b/iconvmodule/iconvmodule.c
@@ -0,0 +1,190 @@
+#include <iconv.h>
+#include <Python.h>
+
+/* Python object wrapping a single iconv conversion descriptor. */
+typedef struct {
+ PyObject_HEAD
+ iconv_t handle; /* set by py_iconv_open, closed by Iconv_dealloc */
+} IconvObject;
+
+/* The iconv.error exception class; created in initiconv. */
+static PyObject *error;
+
+/* Forward declaration; the type object is defined after its methods. */
+staticforward PyTypeObject Iconv_Type;
+
+static char iconv_open__doc__[]=
+"open(tocode, fromcode) -> iconv handle\n"
+"allocate descriptor for character set conversion";
+
+/* Module function open(tocode, fromcode): obtain a conversion
+   descriptor from iconv_open and wrap it in a new Iconv object.
+   Sets ValueError from errno when iconv_open fails. */
+static PyObject*
+py_iconv_open(PyObject* unused, PyObject* args)
+{
+    char *to_name, *from_name;
+    iconv_t cd;
+    IconvObject *obj;
+
+    if (!PyArg_ParseTuple(args, "ss", &to_name, &from_name))
+        return NULL;
+
+    cd = iconv_open(to_name, from_name);
+    if (cd == (iconv_t)(-1)){
+        PyErr_SetFromErrno(PyExc_ValueError);
+        return NULL;
+    }
+
+    obj = PyObject_New(IconvObject, &Iconv_Type);
+    if (obj != NULL){
+        obj->handle = cd;
+        return (PyObject*)obj;
+    }
+
+    /* Object allocation failed: do not leak the descriptor. */
+    iconv_close(cd);
+    return NULL;
+}
+
+/* Destructor: close the iconv descriptor, then free the object. */
+static void
+Iconv_dealloc(IconvObject *self)
+{
+ iconv_close(self->handle);
+ PyObject_Del(self);
+}
+
+static char Iconv_iconv__doc__[]=
+"iconv(in[, outlen[, return_unicode[, count_only]]]) -> out\n"
+"Convert in to out. outlen is the size of the output buffer;\n"
+"it defaults to len(in).";
+
+/* Method iconv(s[, outlen[, return_unicode[, count_only]]]).
+
+   Runs iconv() on the read buffer of s (or resets the descriptor when
+   s is None) and returns the output as a string, as a Unicode object
+   (return_unicode), or as the number of output bytes (count_only).
+   outlen is the output buffer size in bytes, or in characters when
+   return_unicode is set; it defaults to the input size in bytes.
+
+   On conversion failure iconv.error is raised with the arguments
+   (strerror text, errno, input bytes processed, output so far). */
+static PyObject*
+Iconv_iconv(IconvObject *self, PyObject *args, PyObject* kwargs)
+{
+    PyObject *inbuf_obj;
+    const char *inbuf;
+    char *outbuf;
+    size_t inbuf_size, outbuf_size, iresult;
+    int inbuf_size_int, outbuf_size_int = -1;
+    int return_unicode = 0, count_only = 0;
+    int saved_errno, used;
+    PyObject *result;
+    static char *kwarg_names[]={
+        "s",
+        "outlen",
+        "return_unicode",
+        "count_only",
+        NULL
+    };
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs,
+                                     "O|iii:iconv", kwarg_names,
+                                     &inbuf_obj, &outbuf_size_int,
+                                     &return_unicode, &count_only))
+        return NULL;
+
+    if (inbuf_obj == Py_None){
+        /* None means to clear the iconv object */
+        inbuf = NULL;
+        inbuf_size_int = 0;
+    }else if (inbuf_obj->ob_type->tp_as_buffer){
+        if (PyObject_AsReadBuffer(inbuf_obj, (const void**)&inbuf,
+                                  &inbuf_size_int) == -1)
+            return NULL;
+    }else{
+        PyErr_SetString(PyExc_TypeError,
+                        "iconv expects string as first argument");
+        return NULL;
+    }
+    /* If no result size estimate was given, estimate that the result
+       string is the same size as the input string. */
+    if (outbuf_size_int == -1)
+        outbuf_size_int = inbuf_size_int;
+    /* Any other negative size would turn into a huge size_t below. */
+    if (outbuf_size_int < 0){
+        PyErr_SetString(PyExc_ValueError, "outlen must not be negative");
+        return NULL;
+    }
+    inbuf_size = inbuf_size_int;
+    if (count_only){
+        result = NULL;
+        outbuf = NULL;
+        outbuf_size = outbuf_size_int;
+    }else if(return_unicode){
+        /* Allocate the result string. */
+        result = PyUnicode_FromUnicode(NULL, outbuf_size_int);
+        if (!result)
+            return NULL;
+        outbuf = (char*)PyUnicode_AS_UNICODE(result);
+        /* NOTE(review): assumes Py_UNICODE is two bytes wide -- confirm
+           for the interpreter builds this module targets. */
+        outbuf_size = outbuf_size_int*2;
+    }else{
+        /* Allocate the result string. */
+        result = PyString_FromStringAndSize(NULL, outbuf_size_int);
+        if (!result)
+            return NULL;
+        outbuf = PyString_AS_STRING(result);
+        outbuf_size = outbuf_size_int;
+    }
+    /* Perform the conversion. */
+    iresult = iconv(self->handle, &inbuf, &inbuf_size, &outbuf, &outbuf_size);
+    /* Remember errno now; the calls below may overwrite it. */
+    saved_errno = errno;
+    if (count_only){
+        result = PyInt_FromLong((long)outbuf_size_int-(long)outbuf_size);
+        if (result == NULL)
+            return NULL;
+    }else if (return_unicode) {
+        /* The result may be larger than necessary; outbuf_size holds
+           the unused bytes.  Only shrink when there is something to
+           trim. */
+        used = outbuf_size_int-(int)(outbuf_size/2);
+        if (used != outbuf_size_int &&
+            PyUnicode_Resize(&result, used) == -1){
+            Py_XDECREF(result);
+            return NULL;
+        }
+    }else{
+        /* Skipping the no-op resize also avoids resizing the shared
+           empty string, which _PyString_Resize rejects. */
+        used = outbuf_size_int-(int)outbuf_size;
+        if (used != outbuf_size_int &&
+            _PyString_Resize(&result, used) == -1)
+            return NULL; /* result was released by _PyString_Resize */
+    }
+
+    if (iresult == (size_t)(-1)){
+        PyObject *exc;
+        exc = PyObject_CallFunction(error,"siiO",
+                                    strerror(saved_errno),saved_errno,
+                                    inbuf_size_int - (int)inbuf_size,
+                                    result);
+        Py_DECREF(result);
+        if (exc == NULL)
+            return NULL;
+        PyErr_SetObject(error,exc);
+        /* PyErr_SetObject keeps its own reference. */
+        Py_DECREF(exc);
+        return NULL;
+    }
+    return result;
+}
+
+/* Methods available on Iconv objects. */
+static PyMethodDef Iconv_methods[] = {
+ {"iconv", (PyCFunction)Iconv_iconv,
+ METH_KEYWORDS|METH_VARARGS, Iconv_iconv__doc__},
+ {NULL, NULL} /* sentinel */
+};
+
+/* Attribute lookup: resolves name against Iconv_methods only. */
+static PyObject *
+Iconv_getattr(PyObject *self, char *name)
+{
+ return Py_FindMethod(Iconv_methods, self, name);
+}
+
+/* Type object for Iconv instances.  Only deallocation and attribute
+   lookup are provided; ob_type is filled in by initiconv. */
+statichere PyTypeObject Iconv_Type = {
+ PyObject_HEAD_INIT(NULL)
+ 0, /*ob_size*/
+ "Iconv", /*tp_name*/
+ sizeof(IconvObject), /*tp_basicsize*/
+ 0, /*tp_itemsize*/
+ /* methods */
+ (destructor)Iconv_dealloc, /*tp_dealloc*/
+ 0, /*tp_print*/
+ (getattrfunc)Iconv_getattr, /*tp_getattr*/
+ 0, /*tp_setattr*/
+ 0, /*tp_compare*/
+ 0, /*tp_repr*/
+ 0, /*tp_as_number*/
+ 0, /*tp_as_sequence*/
+ 0, /*tp_as_mapping*/
+ 0, /*tp_hash*/
+};
+
+/* Module-level functions of the iconv module. */
+static PyMethodDef iconv_methods[] = {
+ {"open", py_iconv_open,
+ METH_VARARGS, iconv_open__doc__},
+ {NULL, NULL} /* sentinel */
+};
+
+/* Module docstring passed to Py_InitModule4. */
+static char __doc__[]=
+"The iconv module provides an interface to the iconv library.";
+
+/* Module initialisation: creates the "iconv" module, registers its
+   functions and publishes the iconv.error exception (a ValueError
+   subclass).  On failure it returns with the Python error set. */
+DL_EXPORT(void)
+initiconv(void)
+{
+    PyObject *m, *d;
+
+    Iconv_Type.ob_type = &PyType_Type;
+
+    /* Create the module and add the functions */
+    m = Py_InitModule4("iconv", iconv_methods, __doc__,
+                       NULL, PYTHON_API_VERSION);
+    if (m == NULL)
+        return;
+
+    /* Add some symbolic constants to the module */
+    d = PyModule_GetDict(m);
+    error = PyErr_NewException("iconv.error", PyExc_ValueError, NULL);
+    /* Storing a NULL value in the dictionary would crash. */
+    if (error == NULL)
+        return;
+    PyDict_SetItemString(d, "error", error);
+}
diff --git a/iconvmodule/setup.py b/iconvmodule/setup.py
new file mode 100644
index 000000000..c5ff5d6eb
--- /dev/null
+++ b/iconvmodule/setup.py
@@ -0,0 +1,19 @@
+from distutils.core import setup, Extension
+
+# Distutils description of the package: the pure-Python module
+# iconvcodec plus the C extension "iconv" built from iconvmodule.c.
+setup (name = "iconv",
+ version = "1.0",
+ description = "iconv-based Unicode converter",
+ author = "Martin v. Loewis",
+ author_email = "loewis@informatik.hu-berlin.de",
+ url = "http://sourceforge.net/projects/python-codecs/",
+ long_description =
+"""The iconv module exposes the operating system's iconv character
+conversion routine to Python. This package provides an iconv wrapper
+as well as a Python codec to convert between Unicode objects and
+all iconv-provided encodings.
+""",
+
+ py_modules = ['iconvcodec'],
+ ext_modules = [Extension("iconv",sources=["iconvmodule.c"])]
+ )
+
diff --git a/iconvmodule/test_iconv.py b/iconvmodule/test_iconv.py
new file mode 100644
index 000000000..d8c6f8325
--- /dev/null
+++ b/iconvmodule/test_iconv.py
@@ -0,0 +1,8 @@
+# Manual smoke test for the iconv extension module; prints results
+# instead of asserting them.
+import iconv
+# Latin-1 bytes -> Unicode object; 11 is the output size estimate.
+s=iconv.open("unicodelittle","iso-8859-1")
+r=s.iconv("Hallo",11,return_unicode=1)
+print repr(r),len(r)
+
+# Unicode object -> Latin-1 bytes, with a generous output buffer.
+s=iconv.open("iso-8859-1","unicodelittle")
+r=s.iconv(u"Hallo",110)
+print r
diff --git a/iconvmodule/test_iconvcodec.py b/iconvmodule/test_iconvcodec.py
new file mode 100644
index 000000000..f4567ec4a
--- /dev/null
+++ b/iconvmodule/test_iconvcodec.py
@@ -0,0 +1,4 @@
+# Manual smoke test: importing iconvcodec registers the codec lookup,
+# after which iconv code set names can be used as Python encodings.
+import iconvcodec
+
+print u"Hallo".encode("T.61")
+print repr(unicode("Hallo","T.61"))