diff options
Diffstat (limited to 'codegen/docextract_to_xml.py')
-rwxr-xr-x | codegen/docextract_to_xml.py | 19 |
1 files changed, 12 insertions, 7 deletions
diff --git a/codegen/docextract_to_xml.py b/codegen/docextract_to_xml.py
index 76ac85d..f8d3bae 100755
--- a/codegen/docextract_to_xml.py
+++ b/codegen/docextract_to_xml.py
@@ -7,21 +7,26 @@
 # # ./docextract_to_xml.py -s /gnome/head/cvs/gtk+/gtk/ -s /gnome/head/cvs/gtk+/docs/reference/gtk/tmpl/ > gtk_docs.xml
 
 import getopt
+import re
 import string
 import sys
 
 import docextract
 
 def escape_text(unescaped_text):
-    escaped_text = unescaped_text
+    # Escape every "&" not part of an entity reference
+    escaped_text = re.sub(r'&(?![A-Za-z]+;)', '&amp;', unescaped_text)
+
+    # These weird entities turn up in the output...
+    escaped_text = string.replace(escaped_text, '&mdash;', '&#8212;')
+    escaped_text = string.replace(escaped_text, '&ast;', '*')
+    escaped_text = string.replace(escaped_text, '&percnt;', '%')
+    escaped_text = string.replace(escaped_text, '&commat;', '@')
+
+    # Escape for both tag contents and attribute values
     escaped_text = string.replace(escaped_text, '<', '&lt;')
     escaped_text = string.replace(escaped_text, '>', '&gt;')
-    escaped_text = string.replace(escaped_text, '&', '&amp;')
-    escaped_text = string.replace(escaped_text, '\'', '&apos;')
-    escaped_text = string.replace(escaped_text, '\"', '&quot;')
-
-    #Apparently this is an undefined symbol:
-    escaped_text = string.replace(escaped_text, '&mdash;', ' mdash ')
+    escaped_text = string.replace(escaped_text, '"', '&quot;')
 
     return escaped_text
 