summaryrefslogtreecommitdiffstats
path: root/src/sax.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/sax.c')
-rw-r--r--src/sax.c634
1 files changed, 634 insertions, 0 deletions
diff --git a/src/sax.c b/src/sax.c
new file mode 100644
index 0000000..cb3caef
--- /dev/null
+++ b/src/sax.c
@@ -0,0 +1,634 @@
+/* iksemel (XML parser for Jabber)
+** Copyright (C) 2000-2004 Gurer Ozen <madcat@e-kolay.net>
+** This code is free software; you can redistribute it and/or
+** modify it under the terms of GNU Lesser General Public License.
+*/
+
+#include "common.h"
+#include "iksemel.h"
+
+enum cons_e {
+ C_CDATA = 0,
+ C_TAG_START,
+ C_TAG,
+ C_TAG_END,
+ C_ATTRIBUTE,
+ C_ATTRIBUTE_1,
+ C_ATTRIBUTE_2,
+ C_VALUE,
+ C_VALUE_APOS,
+ C_VALUE_QUOT,
+ C_WHITESPACE,
+ C_ENTITY,
+ C_COMMENT,
+ C_COMMENT_1,
+ C_COMMENT_2,
+ C_COMMENT_3,
+ C_MARKUP,
+ C_MARKUP_1,
+ C_SECT,
+ C_SECT_CDATA,
+ C_SECT_CDATA_1,
+ C_SECT_CDATA_2,
+ C_SECT_CDATA_3,
+ C_SECT_CDATA_4,
+ C_SECT_CDATA_C,
+ C_SECT_CDATA_E,
+ C_SECT_CDATA_E2,
+ C_PI
+};
+
+/* if you add a variable here, dont forget changing iks_parser_reset */
+struct iksparser_struct {
+ ikstack *s;
+ void *user_data;
+ iksTagHook *tagHook;
+ iksCDataHook *cdataHook;
+ iksDeleteHook *deleteHook;
+ /* parser context */
+ char *stack;
+ size_t stack_pos;
+ size_t stack_max;
+
+ enum cons_e context;
+ enum cons_e oldcontext;
+
+ char *tag_name;
+ enum ikstagtype tagtype;
+
+ unsigned int attmax;
+ unsigned int attcur;
+ int attflag;
+ char **atts;
+ int valflag;
+
+ unsigned int entpos;
+ char entity[8];
+
+ unsigned long nr_bytes;
+ unsigned long nr_lines;
+
+ int uni_max;
+ int uni_len;
+};
+
+iksparser *
+iks_sax_new (void *user_data, iksTagHook *tagHook, iksCDataHook *cdataHook)
+{
+ iksparser *prs;
+
+ prs = iks_malloc (sizeof (iksparser));
+ if (NULL == prs) return NULL;
+ memset (prs, 0, sizeof (iksparser));
+ prs->user_data = user_data;
+ prs->tagHook = tagHook;
+ prs->cdataHook = cdataHook;
+ return prs;
+}
+
+iksparser *
+iks_sax_extend (ikstack *s, void *user_data, iksTagHook *tagHook, iksCDataHook *cdataHook, iksDeleteHook *deleteHook)
+{
+ iksparser *prs;
+
+ prs = iks_stack_alloc (s, sizeof (iksparser));
+ if (NULL == prs) return NULL;
+ memset (prs, 0, sizeof (iksparser));
+ prs->s = s;
+ prs->user_data = user_data;
+ prs->tagHook = tagHook;
+ prs->cdataHook = cdataHook;
+ prs->deleteHook = deleteHook;
+ return prs;
+}
+
+ikstack *
+iks_parser_stack (iksparser *prs)
+{
+ return prs->s;
+}
+
+void *
+iks_user_data (iksparser *prs)
+{
+ return prs->user_data;
+}
+
+unsigned long
+iks_nr_bytes (iksparser *prs)
+{
+ return prs->nr_bytes;
+}
+
+unsigned long
+iks_nr_lines (iksparser *prs)
+{
+ return prs->nr_lines;
+}
+
+#define IS_WHITESPACE(x) ' ' == (x) || '\t' == (x) || '\r' == (x) || '\n' == (x)
+#define NOT_WHITESPACE(x) ' ' != (x) && '\t' != (x) && '\r' != (x) && '\n' != (x)
+
+static int
+stack_init (iksparser *prs)
+{
+ prs->stack = iks_malloc (128);
+ if (!prs->stack) return 0;
+ prs->stack_max = 128;
+ prs->stack_pos = 0;
+ return 1;
+}
+
+static int
+stack_expand (iksparser *prs, int len)
+{
+ size_t need;
+ off_t diff;
+ char *tmp;
+ need = len - (prs->stack_max - prs->stack_pos);
+ if (need < prs->stack_max) {
+ need = prs->stack_max * 2;
+ } else {
+ /* need x 1.2 for integer only archs like ARM */
+ need = prs->stack_max + ( (need * 6) / 5);
+ }
+ tmp = iks_malloc (need);
+ if (!tmp) return 0;
+ diff = tmp - prs->stack;
+ memcpy (tmp, prs->stack, prs->stack_max);
+ iks_free (prs->stack);
+ prs->stack = tmp;
+ prs->stack_max = need;
+ prs->tag_name += diff;
+ if (prs->attflag != 0) {
+ int i = 0;
+ while (i < (prs->attmax * 2)) {
+ if (prs->atts[i]) prs->atts[i] += diff;
+ i++;
+ }
+ }
+ return 1;
+}
+
+#define STACK_INIT \
+ if (NULL == prs->stack && 0 == stack_init (prs)) return IKS_NOMEM
+
+#define STACK_PUSH_START (prs->stack + prs->stack_pos)
+
+#define STACK_PUSH(buf,len) \
+{ \
+ char *sbuf = (buf); \
+ size_t slen = (len); \
+ if (prs->stack_max - prs->stack_pos <= slen) { \
+ if (0 == stack_expand (prs, slen)) return IKS_NOMEM; \
+ } \
+ memcpy (prs->stack + prs->stack_pos, sbuf, slen); \
+ prs->stack_pos += slen; \
+}
+
+#define STACK_PUSH_END \
+{ \
+ if (prs->stack_pos >= prs->stack_max) { \
+ if (0 == stack_expand (prs, 1)) return IKS_NOMEM; \
+ } \
+ prs->stack[prs->stack_pos] = '\0'; \
+ prs->stack_pos++; \
+}
+
+static enum ikserror
+sax_core (iksparser *prs, char *buf, int len)
+{
+ enum ikserror err;
+ int pos = 0, old = 0, re, stack_old = -1;
+ unsigned char c;
+
+ while (pos < len) {
+ re = 0;
+ c = buf[pos];
+ if (0 == c || 0xFE == c || 0xFF == c) return IKS_BADXML;
+ if (prs->uni_max) {
+ if ((c & 0xC0) != 0x80) return IKS_BADXML;
+ prs->uni_len++;
+ if (prs->uni_len == prs->uni_max) prs->uni_max = 0;
+ goto cont;
+ } else {
+ if (c & 0x80) {
+ unsigned char mask;
+ if ((c & 0x60) == 0x40) {
+ prs->uni_max = 2;
+ mask = 0x1F;
+ } else if ((c & 0x70) == 0x60) {
+ prs->uni_max = 3;
+ mask = 0x0F;
+ } else if ((c & 0x78) == 0x70) {
+ prs->uni_max = 4;
+ mask = 0x07;
+ } else if ((c & 0x7C) == 0x78) {
+ prs->uni_max = 5;
+ mask = 0x03;
+ } else if ((c & 0x7E) == 0x7C) {
+ prs->uni_max = 6;
+ mask = 0x01;
+ } else {
+ return IKS_BADXML;
+ }
+ if ((c & mask) == 0) return IKS_BADXML;
+ prs->uni_len = 1;
+ if (stack_old == -1
+ && (prs->context == C_TAG
+ || prs->context == C_ATTRIBUTE_1
+ || prs->context == C_VALUE_APOS
+ || prs->context == C_VALUE_QUOT)) stack_old = pos;
+ goto cont;
+ }
+ }
+
+ switch (prs->context) {
+ case C_CDATA:
+ if ('&' == c) {
+ if (old < pos && prs->cdataHook) {
+ err = prs->cdataHook (prs->user_data, &buf[old], pos - old);
+ if (IKS_OK != err) return err;
+ }
+ prs->context = C_ENTITY;
+ prs->entpos = 0;
+ break;
+ }
+ if ('<' == c) {
+ if (old < pos && prs->cdataHook) {
+ err = prs->cdataHook (prs->user_data, &buf[old], pos - old);
+ if (IKS_OK != err) return err;
+ }
+ STACK_INIT;
+ prs->tag_name = STACK_PUSH_START;
+ if (!prs->tag_name) return IKS_NOMEM;
+ prs->context = C_TAG_START;
+ }
+ break;
+
+ case C_TAG_START:
+ prs->context = C_TAG;
+ if ('/' == c) {
+ prs->tagtype = IKS_CLOSE;
+ break;
+ }
+ if ('?' == c) {
+ prs->context = C_PI;
+ break;
+ }
+ if ('!' == c) {
+ prs->context = C_MARKUP;
+ break;
+ }
+ prs->tagtype = IKS_OPEN;
+ stack_old = pos;
+ break;
+
+ case C_TAG:
+ if (IS_WHITESPACE(c)) {
+ if (IKS_CLOSE == prs->tagtype)
+ prs->oldcontext = C_TAG_END;
+ else
+ prs->oldcontext = C_ATTRIBUTE;
+ prs->context = C_WHITESPACE;
+ if (stack_old != -1) STACK_PUSH (buf + stack_old, pos - stack_old);
+ stack_old = -1;
+ STACK_PUSH_END;
+ break;
+ }
+ if ('/' == c) {
+ if (IKS_CLOSE == prs->tagtype) return IKS_BADXML;
+ prs->tagtype = IKS_SINGLE;
+ prs->context = C_TAG_END;
+ if (stack_old != -1) STACK_PUSH (buf + stack_old, pos - stack_old);
+ stack_old = -1;
+ STACK_PUSH_END;
+ break;
+ }
+ if ('>' == c) {
+ prs->context = C_TAG_END;
+ if (stack_old != -1) STACK_PUSH (buf + stack_old, pos - stack_old);
+ stack_old = -1;
+ STACK_PUSH_END;
+ re = 1;
+ break;
+ }
+ if (stack_old == -1) stack_old = pos;
+ break;
+
+ case C_TAG_END:
+ if (c != '>') return IKS_BADXML;
+ if (prs->tagHook) {
+ char **tmp;
+ if (prs->attcur == 0) tmp = NULL; else tmp = prs->atts;
+ err = prs->tagHook (prs->user_data, prs->tag_name, tmp, prs->tagtype);
+ if (IKS_OK != err) return err;
+ }
+ prs->stack_pos = 0;
+ stack_old = -1;
+ prs->attcur = 0;
+ prs->attflag = 0;
+ prs->context = C_CDATA;
+ old = pos + 1;
+ break;
+
+ case C_ATTRIBUTE:
+ if ('/' == c) {
+ prs->tagtype = IKS_SINGLE;
+ prs->context = C_TAG_END;
+ break;
+ }
+ if ('>' == c) {
+ prs->context = C_TAG_END;
+ re = 1;
+ break;
+ }
+ if (!prs->atts) {
+ prs->attmax = 12;
+ prs->atts = iks_malloc (sizeof(char *) * 2 * 12);
+ if (!prs->atts) return IKS_NOMEM;
+ memset (prs->atts, 0, sizeof(char *) * 2 * 12);
+ prs->attcur = 0;
+ } else {
+ if (prs->attcur >= (prs->attmax * 2)) {
+ void *tmp;
+ prs->attmax += 12;
+ tmp = iks_malloc (sizeof(char *) * 2 * prs->attmax);
+ if (!tmp) return IKS_NOMEM;
+ memset (tmp, 0, sizeof(char *) * 2 * prs->attmax);
+ memcpy (tmp, prs->atts, sizeof(char *) * prs->attcur);
+ free (prs->atts);
+ prs->atts = tmp;
+ }
+ }
+ prs->attflag = 1;
+ prs->atts[prs->attcur] = STACK_PUSH_START;
+ stack_old = pos;
+ prs->context = C_ATTRIBUTE_1;
+ break;
+
+ case C_ATTRIBUTE_1:
+ if ('=' == c) {
+ if (stack_old != -1) STACK_PUSH (buf + stack_old, pos - stack_old);
+ stack_old = -1;
+ STACK_PUSH_END;
+ prs->context = C_VALUE;
+ break;
+ }
+ if (stack_old == -1) stack_old = pos;
+ break;
+
+ case C_ATTRIBUTE_2:
+ if ('/' == c) {
+ prs->tagtype = IKS_SINGLE;
+ prs->atts[prs->attcur] = NULL;
+ prs->context = C_TAG_END;
+ break;
+ }
+ if ('>' == c) {
+ prs->atts[prs->attcur] = NULL;
+ prs->context = C_TAG_END;
+ re = 1;
+ break;
+ }
+ prs->context = C_ATTRIBUTE;
+ re = 1;
+ break;
+
+ case C_VALUE:
+ prs->atts[prs->attcur + 1] = STACK_PUSH_START;
+ if ('\'' == c) {
+ prs->context = C_VALUE_APOS;
+ break;
+ }
+ if ('"' == c) {
+ prs->context = C_VALUE_QUOT;
+ break;
+ }
+ return IKS_BADXML;
+
+ case C_VALUE_APOS:
+ if ('\'' == c) {
+ if (stack_old != -1) STACK_PUSH (buf + stack_old, pos - stack_old);
+ stack_old = -1;
+ STACK_PUSH_END;
+ prs->oldcontext = C_ATTRIBUTE_2;
+ prs->context = C_WHITESPACE;
+ prs->attcur += 2;
+ }
+ if (stack_old == -1) stack_old = pos;
+ break;
+
+ case C_VALUE_QUOT:
+ if ('"' == c) {
+ if (stack_old != -1) STACK_PUSH (buf + stack_old, pos - stack_old);
+ stack_old = -1;
+ STACK_PUSH_END;
+ prs->oldcontext = C_ATTRIBUTE_2;
+ prs->context = C_WHITESPACE;
+ prs->attcur += 2;
+ }
+ if (stack_old == -1) stack_old = pos;
+ break;
+
+ case C_WHITESPACE:
+ if (NOT_WHITESPACE(c)) {
+ prs->context = prs->oldcontext;
+ re = 1;
+ }
+ break;
+
+ case C_ENTITY:
+ if (';' == c) {
+ char hede[2];
+ char t = '?';
+ prs->entity[prs->entpos] = '\0';
+ if (strcmp(prs->entity, "amp") == 0)
+ t = '&';
+ else if (strcmp(prs->entity, "quot") == 0)
+ t = '"';
+ else if (strcmp(prs->entity, "apos") == 0)
+ t = '\'';
+ else if (strcmp(prs->entity, "lt") == 0)
+ t = '<';
+ else if (strcmp(prs->entity, "gt") == 0)
+ t = '>';
+ old = pos + 1;
+ hede[0] = t;
+ if (prs->cdataHook) {
+ err = prs->cdataHook (prs->user_data, &hede[0], 1);
+ if (IKS_OK != err) return err;
+ }
+ prs->context = C_CDATA;
+ } else {
+ prs->entity[prs->entpos++] = buf[pos];
+ if (prs->entpos > 7) return IKS_BADXML;
+ }
+ break;
+
+ case C_COMMENT:
+ if ('-' != c) return IKS_BADXML;
+ prs->context = C_COMMENT_1;
+ break;
+
+ case C_COMMENT_1:
+ if ('-' == c) prs->context = C_COMMENT_2;
+ break;
+
+ case C_COMMENT_2:
+ if ('-' == c)
+ prs->context = C_COMMENT_3;
+ else
+ prs->context = C_COMMENT_1;
+ break;
+
+ case C_COMMENT_3:
+ if ('>' != c) return IKS_BADXML;
+ prs->context = C_CDATA;
+ old = pos + 1;
+ break;
+
+ case C_MARKUP:
+ if ('[' == c) {
+ prs->context = C_SECT;
+ break;
+ }
+ if ('-' == c) {
+ prs->context = C_COMMENT;
+ break;
+ }
+ prs->context = C_MARKUP_1;
+
+ case C_MARKUP_1:
+ if ('>' == c) {
+ old = pos + 1;
+ prs->context = C_CDATA;
+ }
+ break;
+
+ case C_SECT:
+ if ('C' == c) {
+ prs->context = C_SECT_CDATA;
+ break;
+ }
+ return IKS_BADXML;
+
+ case C_SECT_CDATA:
+ if ('D' != c) return IKS_BADXML;
+ prs->context = C_SECT_CDATA_1;
+ break;
+
+ case C_SECT_CDATA_1:
+ if ('A' != c) return IKS_BADXML;
+ prs->context = C_SECT_CDATA_2;
+ break;
+
+ case C_SECT_CDATA_2:
+ if ('T' != c) return IKS_BADXML;
+ prs->context = C_SECT_CDATA_3;
+ break;
+
+ case C_SECT_CDATA_3:
+ if ('A' != c) return IKS_BADXML;
+ prs->context = C_SECT_CDATA_4;
+ break;
+
+ case C_SECT_CDATA_4:
+ if ('[' != c) return IKS_BADXML;
+ old = pos + 1;
+ prs->context = C_SECT_CDATA_C;
+ break;
+
+ case C_SECT_CDATA_C:
+ if (']' == c) {
+ prs->context = C_SECT_CDATA_E;
+ if (prs->cdataHook && old < pos) {
+ err = prs->cdataHook (prs->user_data, &buf[old], pos - old);
+ if (IKS_OK != err) return err;
+ }
+ }
+ break;
+
+ case C_SECT_CDATA_E:
+ if (']' == c) {
+ prs->context = C_SECT_CDATA_E2;
+ } else {
+ if (prs->cdataHook) {
+ err = prs->cdataHook (prs->user_data, "]", 1);
+ if (IKS_OK != err) return err;
+ }
+ old = pos;
+ prs->context = C_SECT_CDATA_C;
+ }
+ break;
+
+ case C_SECT_CDATA_E2:
+ if ('>' == c) {
+ old = pos + 1;
+ prs->context = C_CDATA;
+ } else {
+ if (prs->cdataHook) {
+ err = prs->cdataHook (prs->user_data, "]]", 2);
+ if (IKS_OK != err) return err;
+ }
+ old = pos;
+ prs->context = C_SECT_CDATA_C;
+ }
+ break;
+
+ case C_PI:
+ old = pos + 1;
+ if ('>' == c) prs->context = C_CDATA;
+ break;
+ }
+cont:
+ if (0 == re) {
+ pos++;
+ prs->nr_bytes++;
+ if ('\n' == c) prs->nr_lines++;
+ }
+ }
+
+ if (stack_old != -1)
+ STACK_PUSH (buf + stack_old, pos - stack_old);
+
+ err = IKS_OK;
+ if (prs->cdataHook && (prs->context == C_CDATA || prs->context == C_SECT_CDATA_C) && old < pos)
+ err = prs->cdataHook (prs->user_data, &buf[old], pos - old);
+ return err;
+}
+
+int
+iks_parse (iksparser *prs, const char *data, size_t len, int finish)
+{
+ if (!data) return IKS_OK;
+ if (len == 0) len = strlen (data);
+ return sax_core (prs, (char *) data, len);
+}
+
+void
+iks_parser_reset (iksparser *prs)
+{
+ if (prs->deleteHook) prs->deleteHook (prs->user_data);
+ prs->stack_pos = 0;
+ prs->context = 0;
+ prs->oldcontext = 0;
+ prs->tagtype = 0;
+ prs->attcur = 0;
+ prs->attflag = 0;
+ prs->valflag = 0;
+ prs->entpos = 0;
+ prs->nr_bytes = 0;
+ prs->nr_lines = 0;
+ prs->uni_max = 0;
+ prs->uni_len = 0;
+}
+
+void
+iks_parser_delete (iksparser *prs)
+{
+ if (prs->deleteHook) prs->deleteHook (prs->user_data);
+ if (prs->stack) iks_free (prs->stack);
+ if (prs->atts) iks_free (prs->atts);
+ if (prs->s) iks_stack_delete (prs->s); else iks_free (prs);
+}