summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--README4
-rwxr-xr-xbin/tidy-bowl63
-rwxr-xr-xbin/xmlformat1688
-rw-r--r--bin/xmlformat-fdp.conf101
4 files changed, 1856 insertions, 0 deletions
diff --git a/README b/README
index b32f1aa..8aa36af 100644
--- a/README
+++ b/README
@@ -8,6 +8,10 @@ directory. No XML files should be in the root directory.
Four modules comprise the minimal set that must be obtained to view
or edit any document:
+ bin/
+
+ Client-side scripts and tools.
+
common/
DocBook XML files shared by multiple guides.
diff --git a/bin/tidy-bowl b/bin/tidy-bowl
new file mode 100755
index 0000000..aac11a2
--- /dev/null
+++ b/bin/tidy-bowl
@@ -0,0 +1,63 @@
+#!/bin/sh
+########################################################################
+#
+# This script is invoked by "commitinfo" just after the incoming file has
+# been uploaded to the server. It gets passed these arguments:
+#
+# tidy-bowl file ...
+#
+# If we approve of the commit for EVERY file in the list, then we return
+# a zero (0). Any non-zero return aborts the commit, so we must be very
+# careful here.
+#
+# AFAIK, the $PWD is the temporary file where the incoming "file" is located,
+# but that needs to be verified.
+#
+# For every "file" that ends with ".xml", we will attempt to normalize the
+# formatting by using tidy(1) to prettyprint it. We must overwrite "file" so
+# that the rest of CVS knows where to find the file.
+#
+# There doesn't seem to be any way to return error messages, so we'll just bang
+# them out to "/var/log/messages".
+#
+########################################################################
+# exit 0 # Early exit while frobbing around
+########################################################################
+# All ancillary files should be in the same directory as this script
+ME=`/bin/basename $0`
+PREFIX=`/usr/bin/dirname $0`
+########################################################################
+USAGE="Usage: ${ME} [-T tool] [-t tool-args] file..."
+NORMTOOL=${PREFIX}/xmlformat
+NORMARGS="-i -v -f ${PREFIX}/xmlformat-fdp.conf"
+while getopts T:t: c
+do
+ case "${c}" in
+ T) NORMTOOL="${OPTARG}";;
+ t) NORMARGS="${OPTARG}";;
+ *) /bin/echo "${USAGE}" >&2; exit 1;;
+ esac
+done
+shift `/usr/bin/expr "${OPTIND}" - 1`
+########################################################################
+for fn in $@
+do
+ case "${fn}" in
+ *.[Xx][Mm][Ll] )
+ # Make sure we can find the input file
+ if [ ! -f "${fn}" ]; then
+ /bin/echo $0 "Cannot find '${fn}'"
+ exit 1
+ fi
+ # Try to clean it up
+ ${NORMTOOL} ${NORMARGS} "${fn}"
+ if [ $? -ne 0 ]; then
+ /bin/echo $0 "File '${fn}' unclean."
+ exit 1
+ fi
+ ;;
+ * )
+ ;;
+ esac
+done
+exit 0
diff --git a/bin/xmlformat b/bin/xmlformat
new file mode 100755
index 0000000..8a308f9
--- /dev/null
+++ b/bin/xmlformat
@@ -0,0 +1,1688 @@
+#! /usr/bin/perl -w
+# vim:set ts=2 sw=2 expandtab:
+
+# xmlformat - configurable XML file formatter/pretty-printer
+
+# Copyright (c) 2004, Kitebird, LLC. All rights reserved.
+# Some portions are based on the REX shallow XML parser, which
+# is Copyright (c) 1998, Robert D. Cameron. These include the
+# regular expression parsing variables and the shallow_parse()
+# method.
+# This software is licensed as described in the file LICENSE,
+# which you should have received as part of this distribution.
+
+# Syntax: xmlformat [config-file] xml-file
+
+# Default config file is $ENV{XMLFORMAT_CONF} or ./xmlformat.conf, in that
+# order.
+
+# Paul DuBois
+# paul@kitebird.com
+# 2003-12-14
+
+# The input document first is parsed into a list of strings. Each string
+# represents one of the following:
+# - text node
+# - processing instruction (the XML declaration is treated as a PI)
+# - comment
+# - CDATA section
+# - DOCTYPE declaration
+# - element tag (either <abc>, </abc>, or <abc/>), *including attributes*
+
+# Entities are left untouched. They appear in their original form as part
+# of the text node in which they occur.
+
+# The list of strings then is converted to a hierarchical structure.
+# The document top level is represented by a reference to a list.
+# Each list element is a reference to a node -- a hash that has "type"
+# and "content" key/value pairs. The "type" key indicates the node
+# type and has one of the following values:
+
+# "text" - text node
+# "pi" - processing instruction node
+# "comment" - comment node
+# "CDATA" - CDATA section node
+# "DOCTYPE" - DOCTYPE node
+# "elt" - element node
+
+# (For purposes of this program, it's really only necessary to have "text",
+# "elt", and "other". The types other than "text" and "elt" currently are
+# all treated the same way.)
+
+# For all but element nodes, the "content" value is the text of the node.
+
+# For element nodes, the "content" hash is a reference to a list of
+# nodes for the element's children. In addition, an element node has
+# three additional key/value pairs:
+# - The "name" value is the tag name within the opening tag, minus angle
+# brackets or attributes.
+# - The "open_tag" value is the full opening tag, which may also be the
+# closing tag.
+# - The "close_tag" value depends on the opening tag. If the open tag is
+# "<abc>", the close tag is "</abc>". If the open tag is "<abc/>", the
+# close tag is the empty string.
+
+# If the tree structure is converted back into a string with
+# tree_stringify(), the result can be compared to the input file
+# as a regression test. The string should be identical to the original
+# input document.
+
+use strict;
+
+use Getopt::Long;
+$Getopt::Long::ignorecase = 0; # options are case sensitive
+$Getopt::Long::bundling = 1; # allow short options to be bundled
+
+my $XMLFORMAT_VERSION = "1.03";
+my $prog_name = "xmlformat";
+
+# ----------------------------------------------------------------------
+
+package XMLFormat;
+
+use strict;
+
+# ----------------------------------------------------------------------
+
+# Regular expressions for parsing document components. Based on REX.
+
+# SPE = shallow parsing expression
+# SE = scanning expression
+# CE = completion expression
+# RSB = right square brackets
+# QM = question mark
+
+my $TextSE = "[^<]+";
+my $UntilHyphen = "[^-]*-";
+my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
+my $CommentCE = "$Until2Hyphens>?";
+my $UntilRSBs = "[^\\]]*\\](?:[^\\]]+\\])*\\]+";
+my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
+my $S = "[ \\n\\t\\r]+";
+my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
+my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
+my $Name = "(?:$NameStrt)(?:$NameChar)*";
+my $QuoteSE = "\"[^\"]*\"|'[^']*'";
+my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
+my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
+my $S1 = "[\\n\\r\\t ]";
+my $UntilQMs = "[^?]*\\?+";
+my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
+my $DT_ItemSE =
+"<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
+my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*\\](?:$S)?)?>?";
+my $DeclCE =
+"--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
+my $PI_CE = "$Name(?:$PI_Tail)?";
+my $EndTagCE = "$Name(?:$S)?>?";
+my $AttValSE = "\"[^<\"]*\"|'[^<']*'";
+my $ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?";
+my $MarkupSPE =
+"<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
+my $XML_SPE = "$TextSE|$MarkupSPE";
+
+# ----------------------------------------------------------------------
+
+# Allowable options and their possible values:
+# - The keys of this hash are the allowable option names
+# - The value for each key is list of allowable option values
+# - If the value is undef, the option value must be numeric
+# If any new formatting option is added to this program, it
+# must be specified here, *and* a default value for it should
+# be listed in the *DOCUMENT and *DEFAULT pseudo-element
+# option hashes.
+
+my %opt_list = (
+ "format" => [ "block", "inline", "verbatim" ],
+ "normalize" => [ "yes", "no" ],
+ "subindent" => undef,
+ "wrap-length" => undef,
+ "entry-break" => undef,
+ "exit-break" => undef,
+ "element-break" => undef
+);
+
+# Object creation: set up the default formatting configuration
+# and variables for maintaining input and output document.
+
+sub new
+{
+my $type = shift;
+
+ my $self = {};
+
+ # Formatting options for each element.
+
+ $self->{elt_opts} = { };
+
+ # The formatting options for the *DOCUMENT and *DEFAULT pseudo-elements can
+ # be overridden in the configuration file, but the options must also be
+ # built in to make sure they exist if not specified in the configuration
+ # file. Each of the structures must have a value for every option.
+
+ # Options for top-level document children.
+ # - Do not change entry-break: 0 ensures no extra newlines before
+ # first element of output.
+ # - Do not change exit-break: 1 ensures a newline after final element
+ # of output document.
+ # - It's probably best not to change any of the others, except perhaps
+ # if you want to increase the element-break.
+
+ $self->{elt_opts}->{"*DOCUMENT"} = {
+ "format" => "block",
+ "normalize" => "no",
+ "subindent" => 0,
+ "wrap-length" => 0,
+ "entry-break" => 0, # do not change
+ "exit-break" => 1, # do not change
+ "element-break" => 1
+ };
+
+ # Default options. These are used for any elements in the document
+ # that are not specified explicitly in the configuration file.
+
+ $self->{elt_opts}->{"*DEFAULT"} = {
+ "format" => "block",
+ "normalize" => "no",
+ "subindent" => 1,
+ "wrap-length" => 0,
+ "entry-break" => 1,
+ "exit-break" => 1,
+ "element-break" => 1
+ };
+
+ # Run the *DOCUMENT and *DEFAULT options through the option-checker
+ # to verify that the built-in values are legal.
+
+ my $err_count = 0;
+
+ foreach my $elt_name (keys (%{$self->{elt_opts}})) # ... for each element
+ {
+ # Check each option for element
+ while (my ($opt_name, $opt_val) = each (%{$self->{elt_opts}->{$elt_name}}))
+ {
+ my $err_msg;
+
+ ($opt_val, $err_msg) = check_option ($opt_name, $opt_val);
+ if (!defined ($err_msg))
+ {
+ $self->{elt_opts}->{$elt_name}->{$opt_name} = $opt_val;
+ }
+ else
+ {
+ warn "LOGIC ERROR: $elt_name default option is invalid\n";
+ warn "$err_msg\n";
+ ++$err_count;
+ }
+ }
+ }
+
+ # Make sure that the every option is represented in the
+ # *DOCUMENT and *DEFAULT structures.
+
+ foreach my $opt_name (keys (%opt_list))
+ {
+ foreach my $elt_name (keys (%{$self->{elt_opts}}))
+ {
+ if (!exists ($self->{elt_opts}->{$elt_name}->{$opt_name}))
+ {
+ warn "LOGIC ERROR: $elt_name has no default '$opt_name' option\n";
+ ++$err_count;
+ }
+ }
+ }
+
+ die "Cannot continue; internal default formatting options must be fixed\n"
+ if $err_count > 0;
+
+ bless $self, $type; # bless object and return it
+}
+
+# Initialize the variables that are used per-document
+
+sub init_doc_vars
+{
+my $self = shift;
+
+ # Elements that are used in the document but not named explicitly
+ # in the configuration file.
+
+ $self->{unconf_elts} = { };
+
+ # List of tokens for current document.
+
+ $self->{tokens} = [ ];
+
+ # Document node tree (constructed from the token list).
+
+ $self->{tree} = [ ];
+
+ # Variables for formatting operations:
+ # out_doc = resulting output document (constructed from document tree)
+ # pending = array of pending tokens being held until flushed
+
+ $self->{out_doc} = "";
+ $self->{pending} = [ ];
+
+ # Inline elements within block elements are processed using the
+ # text normalization (and possible line-wrapping) values of their
+ # enclosing block. Blocks and inlines may be nested, so we maintain
+ # a stack that allows the normalize/wrap-length values of the current
+ # block to be determined.
+
+ $self->{block_name_stack} = [ ]; # for debugging
+ $self->{block_opts_stack} = [ ];
+
+ # A similar stack for maintaining each block's current break type.
+
+ $self->{block_break_type_stack} = [ ];
+}
+
+# Accessors for token list and resulting output document
+
+sub tokens
+{
+my $self = shift;
+
+ return $self->{tokens};
+}
+
+sub out_doc
+{
+my $self = shift;
+
+ return $self->{out_doc};
+}
+
+
+# Methods for adding strings to output document or
+# to the pending output array
+
+sub add_to_doc
+{
+my ($self, $str) = @_;
+
+ $self->{out_doc} .= $str;
+}
+
+sub add_to_pending
+{
+my ($self, $str) = @_;
+
+ push (@{$self->{pending}}, $str);
+}
+
+
+# Block stack mainenance methods
+
+# Push options onto or pop options off from the stack. When doing
+# this, also push or pop an element onto the break-level stack.
+
+sub begin_block
+{
+my ($self, $name, $opts) = @_;
+
+ push (@{$self->{block_name_stack}}, $name);
+ push (@{$self->{block_opts_stack}}, $opts);
+ push (@{$self->{block_break_type_stack}}, "entry-break");
+}
+
+sub end_block
+{
+my $self = shift;
+
+ pop (@{$self->{block_name_stack}});
+ pop (@{$self->{block_opts_stack}});
+ pop (@{$self->{block_break_type_stack}});
+}
+
+# Return the current block's normalization status or wrap length
+
+sub block_normalize
+{
+my $self = shift;
+
+ my $size = @{$self->{block_opts_stack}};
+ my $opts = $self->{block_opts_stack}->[$size-1];
+ return $opts->{normalize} eq "yes";
+}
+
+sub block_wrap_length
+{
+my $self = shift;
+
+ my $size = @{$self->{block_opts_stack}};
+ my $opts = $self->{block_opts_stack}->[$size-1];
+ return $opts->{"wrap-length"};
+}
+
+# Set the current block's break type, or return the number of newlines
+# for the block's break type
+
+sub set_block_break_type
+{
+my ($self, $type) = @_;
+
+ my $size = @{$self->{block_break_type_stack}};
+ $self->{block_break_type_stack}->[$size-1] = $type;
+}
+
+sub block_break_value
+{
+my $self = shift;
+
+ my $size = @{$self->{block_opts_stack}};
+ my $opts = $self->{block_opts_stack}->[$size-1];
+ $size = @{$self->{block_break_type_stack}};
+ my $type = $self->{block_break_type_stack}->[$size-1];
+ return $opts->{$type};
+}
+
+
+# ----------------------------------------------------------------------
+
+# Read configuration information. For each element, construct a hash
+# containing a hash key and value for each option name and value.
+# After reading the file, fill in missing option values for
+# incomplete option structures using the *DEFAULT options.
+
+sub read_config
+{
+my $self = shift;
+my $conf_file = shift;
+my @elt_names = ();
+my $err_msg;
+my $in_continuation = 0;
+my $saved_line = "";
+
+ open (FH, $conf_file) or die "Cannot read config file $conf_file: $!\n";
+ while (<FH>)
+ {
+ chomp;
+
+ next if /^\s*($|#)/; # skip blank lines, comments
+ if ($in_continuation)
+ {
+ $_ = $saved_line . " " . $_;
+ $saved_line = "";
+ $in_continuation = 0;
+ }
+ if (!/^\s/)
+ {
+ # Line doesn't begin with whitespace, so it lists element names.
+ # Names are separated by whitespace or commas, possibly followed
+ # by a continuation character or a comment.
+ if (/\\$/)
+ {
+ s/\\$//; # remove continuation character
+ $saved_line = $_;
+ $in_continuation = 1;
+ next;
+ }
+ s/\s*#.*$//; # remove any trailing comment
+ @elt_names = split (/[\s,]+/, $_);
+ # make sure each name has an entry in the elt_opts structure
+ foreach my $elt_name (@elt_names)
+ {
+ $self->{elt_opts}->{$elt_name} = { }
+ unless exists ($self->{elt_opts}->{$elt_name});
+ }
+ }
+ else
+ {
+ # Line begins with whitespace, so it contains an option
+ # to apply to the current element list, possibly followed by
+ # a comment. First check that there is a current list.
+ # Then parse the option name/value.
+
+ die "$conf_file:$.: Option setting found before any "
+ . "elements were named.\n"
+ if !@elt_names;
+ s/\s*#.*$//;
+ my ($opt_name, $opt_val) = /^\s+(\S+)(?:\s+|\s*=\s*)(\S+)$/;
+ die "$conf_file:$.: Malformed line: $_\n" unless defined ($opt_val);
+
+ # Check option. If illegal, die with message. Otherwise,
+ # add option to each element in current element list
+
+ ($opt_val, $err_msg) = check_option ($opt_name, $opt_val);
+ die "$conf_file:$.: $err_msg\n" if defined ($err_msg);
+ foreach my $elt_name (@elt_names)
+ {
+ $self->{elt_opts}->{$elt_name}->{$opt_name} = $opt_val;
+ }
+ }
+ }
+ close (FH);
+
+ # For any element that has missing option values, fill in the values
+ # using the options for the *DEFAULT pseudo-element. This speeds up
+ # element option lookups later. It also makes it unnecessary to test
+ # each option to see if it's defined: All element option structures
+ # will have every option defined.
+
+ my $def_opts = $self->{elt_opts}->{"*DEFAULT"};
+
+ foreach my $elt_name (keys (%{$self->{elt_opts}}))
+ {
+ next if $elt_name eq "*DEFAULT";
+ foreach my $opt_name (keys (%{$def_opts}))
+ {
+ next if exists ($self->{elt_opts}->{$elt_name}->{$opt_name}); # already set
+ $self->{elt_opts}->{$elt_name}->{$opt_name} = $def_opts->{$opt_name};
+ }
+ }
+}
+
+
+# Check option name to make sure it's legal. Check the value to make sure
+# that it's legal for the name. Return a two-element array:
+# (value, undef) if the option name and value are legal.
+# (undef, message) if an error was found; message contains error message.
+# For legal values, the returned value should be assigned to the option,
+# because it may get type-converted here.
+
+sub check_option
+{
+my ($opt_name, $opt_val) = @_;
+
+ # - Check option name to make sure it's a legal option
+ # - Then check the value. If there is a list of values
+ # the value must be one of them. Otherwise, the value
+ # must be an integer.
+
+ return (undef, "Unknown option name: $opt_name")
+ unless exists ($opt_list{$opt_name});
+ my $allowable_val = $opt_list{$opt_name};
+ if (defined ($allowable_val))
+ {
+ return (undef, "Unknown '$opt_name' value: $opt_val")
+ unless grep (/^$opt_val$/, @{$allowable_val});
+ }
+ else # other options should be numeric
+ {
+ # "$opt_val" converts $opt_val to string for pattern match
+ return (undef, "'$opt_name' value ($opt_val) should be an integer")
+ unless "$opt_val" =~ /^\d+$/;
+ }
+ return ($opt_val, undef);
+}
+
+
+# Return hash of option values for a given element. If no options are found:
+# - Add the element name to the list of unconfigured options.
+# - Assign the default options to the element. (This way the test for the
+# option fails only once.)
+
+sub get_opts
+{
+my $self = shift;
+my $elt_name = shift;
+
+ my $opts = $self->{elt_opts}->{$elt_name};
+ if (!defined ($opts))
+ {
+ $self->{unconf_elts}->{$elt_name} = 1;
+ $opts = $self->{elt_opts}->{$elt_name} = $self->{elt_opts}->{"*DEFAULT"};
+ }
+ return $opts;
+}
+
+
+# Display contents of configuration options to be used to process document.
+# For each element named in the elt_opts structure, display its format
+# type, and those options that apply to the type.
+
+sub display_config
+{
+my $self = shift;
+# Format types and the additional options that apply to each type
+my $format_opts = {
+ "block" => [
+ "entry-break",
+ "element-break",
+ "exit-break",
+ "subindent",
+ "normalize",
+ "wrap-length"
+ ],
+ "inline" => [ ],
+ "verbatim" => [ ]
+};
+
+ foreach my $elt_name (sort (keys (%{$self->{elt_opts}})))
+ {
+ print "$elt_name\n";
+ my %opts = %{$self->{elt_opts}->{$elt_name}};
+ my $format = $opts{format};
+ # Write out format type, then options that apply to the format type
+ print " format = $format\n";
+ foreach my $opt_name (@{$format_opts->{$format}})
+ {
+ print " $opt_name = $opts{$opt_name}\n";
+ }
+ print "\n";
+ }
+}
+
+
+# Display the list of elements that are used in the document but not
+# configured in the configuration file.
+
+# Then re-unconfigure the elements so that they won't be considered
+# as configured for the next document, if there is one.
+
+sub display_unconfigured_elements
+{
+my $self = shift;
+
+ my @elts = keys (%{$self->{unconf_elts}});
+ if (@elts == 0)
+ {
+ print "The document contains no unconfigured elements.\n";
+ }
+ else
+ {
+ print "The following document elements were assigned no formatting options:\n";
+ foreach my $line ($self->line_wrap ([ join (" ", sort (@elts)) ], 0, 0, 65))
+ {
+ print "$line\n";
+ }
+ }
+
+ foreach my $elt_name (@elts)
+ {
+ delete ($self->{elt_opts}->{$elt_name});
+ }
+}
+
+# ----------------------------------------------------------------------
+
+# Main document processing routine.
+# - Argument is a string representing an input document
+# - Return value is the reformatted document, or undef. An undef return
+# signifies either than an error occurred, or that some option was
+# given that suppresses document output. In either case, don't write
+# any output for the document. Any error messages will already have
+# been printed when this returns.
+
+sub process_doc
+{
+my $self = shift;
+my ($doc, $verbose, $check_parser, $canonize_only, $show_unconf_elts) = @_;
+my $str;
+
+ $self->init_doc_vars ();
+
+ # Perform lexical parse to split document into list of tokens
+ warn "Parsing document...\n" if $verbose;
+ $self->shallow_parse ($doc);
+
+ if ($check_parser)
+ {
+ warn "Checking parser...\n" if $verbose;
+ # concatentation of tokens should be identical to original document
+ if ($doc eq join ("", @{$self->tokens ()}))
+ {
+ print "Parser is okay\n";
+ }
+ else
+ {
+ print "PARSER ERROR: document token concatenation differs from document\n";
+ }
+ return undef;
+ }
+
+ # Look for and report any error tokens returned by parser
+ warn "Checking document for errors...\n" if $verbose;
+ if ($self->report_errors () > 0)
+ {
+ warn "Cannot continue processing document.\n";
+ return undef;
+ }
+
+ # Convert the token list to a tree structure
+ warn "Converting document tokens to tree...\n" if $verbose;
+ if ($self->tokens_to_tree () > 0)
+ {
+ warn "Cannot continue processing document.\n";
+ return undef;
+ }
+
+ # Check: Stringify the tree to convert it back to a single string,
+ # then compare to original document string (should be identical)
+ # (This is an integrity check on the validity of the to-tree and stringify
+ # operations; if one or both do not work properly, a mismatch should occur.)
+ #$str = $self->tree_stringify ();
+ #print $str;
+ #warn "ERROR: mismatch between document and resulting string\n" if $doc ne $str;
+
+ # Canonize tree to remove extraneous whitespace
+ warn "Canonizing document tree...\n" if $verbose;
+ $self->tree_canonize ();
+
+ if ($canonize_only)
+ {
+ print $self->tree_stringify () . "\n";
+ return undef;
+ }
+
+ # One side-effect of canonizing the tree is that the formatting
+ # options are looked up for each element in the document. That
+ # causes the list of elements that have no explicit configuration
+ # to be built. Display the list and return if user requested it.
+
+ if ($show_unconf_elts)
+ {
+ $self->display_unconfigured_elements ();
+ return undef;
+ }
+
+ # Format the tree to produce formatted XML as a single string
+ warn "Formatting document tree...\n" if $verbose;
+ $self->tree_format ();
+
+ # If the document is not empty, add a newline and emit a warning if
+ # reformatting failed to add a trailing newline. This shouldn't
+ # happen if the *DOCUMENT options are set up with exit-break = 1,
+ # which is the reason for the warning rather than just silently
+ # adding the newline.
+
+ $str = $self->out_doc ();
+ if ($str ne "" && $str !~ /\n$/)
+ {
+ warn "LOGIC ERROR: trailing newline had to be added\n";
+ $str .= "\n";
+ }
+
+ return $str;
+}
+
+
+# ----------------------------------------------------------------------
+
+sub shallow_parse
+{
+my ($self, $xml_document) = @_;
+
+ $self->{tokens} = [ $xml_document =~ /$XML_SPE/g ];
+}
+
+# Extract a tag name from a tag and return it.
+
+# Dies if the tag cannot be found, because this is supposed to be
+# called only with a legal tag.
+
+sub extract_tag_name
+{
+my $tag = shift;
+
+ die "Cannot find tag name in tag: $tag\n" unless $tag =~ /^<\/?($Name)/;
+ return $1;
+}
+
+# ----------------------------------------------------------------------
+
+# Check token list for errors and report any that are found. Error
+# tokens are those that begin with "<" but do not end with ">".
+
+# Returns the error count.
+
+# Does not modify the original token list.
+
+sub report_errors
+{
+my $self = shift;
+my $err_count = 0;
+
+ for (my $i = 0; $i < @{$self->{tokens}}; $i++)
+ {
+ my $token = $self->{tokens}->[$i];
+ if ($token =~ /^</ && $token !~ />$/)
+ {
+ warn "Error (token " . ($i+1) . "): $token\n";
+ ++$err_count;
+ }
+ }
+ warn "Number of errors found: $err_count\n" if $err_count > 0;
+ return $err_count;
+}
+
+# ----------------------------------------------------------------------
+
+# Convert the list of XML document tokens to a tree representation.
+# The implementation uses a loop and a stack rather than recursion.
+
+# Does not modify the original token list.
+
+# Returns an error count.
+
+sub tokens_to_tree
+{
+my $self = shift;
+
+ my @tag_stack = (); # stack for element tags
+ my @children_stack = (); # stack for lists of children
+ my $children = [ ]; # current list of children
+ my $err_count = 0;
+
+ for (my $i = 0; $i < @{$self->{tokens}}; $i++)
+ {
+ my $token = $self->{tokens}->[$i];
+ if ($token !~ /^</) # text
+ {
+ push (@{$children}, text_node ($token));
+ }
+ elsif ($token =~ /^<!--/) # comment
+ {
+ push (@{$children}, comment_node ($token));
+ }
+ elsif ($token =~ /^<\?/) # processing instruction
+ {
+ push (@{$children}, pi_node ($token));
+ }
+ elsif ($token =~ /^<!DOCTYPE/) # DOCTYPE
+ {
+ push (@{$children}, doctype_node ($token));
+ }
+ elsif ($token =~ /^<!\[/) # CDATA
+ {
+ push (@{$children}, cdata_node ($token));
+ }
+ elsif ($token =~ /^<\//) # element close tag
+ {
+ if (!@tag_stack)
+ {
+ warn "Empty tag stack; malformed document?\n";
+ ++$err_count;
+ next;
+ }
+ if (!@children_stack)
+ {
+ warn "Empty children stack; malformed document?\n";
+ ++$err_count;
+ next;
+ }
+ my $tag = pop (@tag_stack);
+ my $open_tag_name = extract_tag_name ($tag);
+ my $close_tag_name = extract_tag_name ($token);
+ if ($open_tag_name ne $close_tag_name)
+ {
+ warn "MISMATCH open ($open_tag_name), close ($close_tag_name);",
+ " malformed document?\n";
+ ++$err_count;
+ next;
+ }
+ my $elt = element_node ($tag, $token, $children);
+ $children = pop (@children_stack);
+ push (@{$children}, $elt);
+ }
+ else # element open tag
+ {
+ # If we reach here, we're seeing the open tag for an element:
+ # - If the tag is also the close tag (e.g., <abc/>), close the
+ # element immediately, giving it an empty child list.
+ # - Otherwise, push tag and child list on stacks, begin new child
+ # list for element body.
+ if ($token =~ /\/>$/) # tag is of form <abc/>
+ {
+ push (@{$children}, element_node ($token, "", [ ]));
+ }
+ else # tag is of form <abc>
+ {
+ push (@tag_stack, $token);
+ push (@children_stack, $children);
+ $children = [ ];
+ }
+ }
+ }
+
+ # At this point, the stacks should be empty if the document is
+ # well-formed.
+
+ if (@tag_stack)
+ {
+ warn "Non-empty tag stack; malformed document?\n";
+ ++$err_count;
+ }
+ if (@children_stack)
+ {
+ warn "Non-empty children stack; malformed document?\n";
+ ++$err_count;
+ }
+
+ $self->{tree} = $children;
+ return $err_count;
+}
+
+
+# Node-generating helper methods for tokens_to_tree
+
+# Generic node generator
+
+sub node { return { "type" => $_[0], "content" => $_[1] }; }
+
+# Generators for specific non-element nodes
+
+sub text_node { return node ("text", $_[0]); }
+sub comment_node { return node ("comment", $_[0]); }
+sub pi_node { return node ("pi", $_[0]); }
+sub doctype_node { return node ("DOCTYPE", $_[0]); }
+sub cdata_node { return node ("CDATA", $_[0]); }
+
+# For an element node, create a standard node with the type and content
+# key/value pairs. Then add pairs for the "name", "open_tag", and
+# "close_tag" hash keys.
+
+sub element_node
+{
+my ($open_tag, $close_tag, $children) = @_;
+
+ my $elt = node ("elt", $children);
+ # name is the open tag with angle brackets and attibutes stripped
+ $elt->{name} = extract_tag_name ($open_tag);
+ $elt->{open_tag} = $open_tag;
+ $elt->{close_tag} = $close_tag;
+ return $elt;
+}
+
+# ----------------------------------------------------------------------
+
+# Convert the given XML document tree (or subtree) to string form by
+# concatentating all of its components. Argument is a reference
+# to a list of nodes at a given level of the tree.
+
+# Does not modify the node list.
+
+sub tree_stringify
+{
+my $self = shift;
+my $children = shift || $self->{tree}; # use entire tree if no arg;
+my $str = "";
+
+ for (my $i = 0; $i < @{$children}; $i++)
+ {
+ my $child = $children->[$i];
+
+ # - Elements have list of child nodes as content (process recursively)
+ # - All other node types have text content
+
+ if ($child->{type} eq "elt")
+ {
+ $str .= $child->{open_tag}
+ . $self->tree_stringify ($child->{content})
+ . $child->{close_tag};
+ }
+ else
+ {
+ $str .= $child->{content};
+ }
+ }
+ return $str;
+}
+
+# ----------------------------------------------------------------------
+
+
+# Put tree in "canonical" form by eliminating extraneous whitespace
+# from element text content.
+
+# $children is a list of child nodes
+
+# This function modifies the node list.
+
+# Canonizing occurs as follows:
+# - Comment, PI, DOCTYPE, and CDATA nodes remain untouched
+# - Verbatim elements and their descendants remain untouched
+# - Within non-normalized block elements:
+# - Delete all-whitespace text node children
+# - Leave other text node children untouched
+# - Within normalized block elements:
+# - Convert runs of whitespace (including line-endings) to single spaces
+# - Trim leading whitespace of first text node
+# - Trim trailing whitespace of last text node
+# - Trim whitespace that is adjacent to a verbatim or non-normalized
+# sub-element. (For example, if a <programlisting> is followed by
+# more text, delete any whitespace at beginning of that text.)
+# - Within inline elements:
+# - Normalize the same way as the enclosing block element, with the
+# exception that a space at the beginning or end is not removed.
+# (Otherwise, <para>three<literal> blind </literal>mice</para>
+# would become <para>three<literal>blind</literal>mice</para>.)
+
+sub tree_canonize
+{
+my $self = shift;
+
+ $self->{tree} = $self->tree_canonize2 ($self->{tree}, "*DOCUMENT");
+}
+
+
+sub tree_canonize2
+{
+my $self = shift;
+my $children = shift;
+my $par_name = shift;
+
+ # Formatting options for parent
+ my $par_opts = $self->get_opts ($par_name);
+
+ # If parent is a block element, remember its formatting options on
+ # the block stack so they can be used to control canonization of
+ # inline child elements.
+
+ $self->begin_block ($par_name, $par_opts) if $par_opts->{format} eq "block";
+
+ # Iterate through list of child nodes to preserve, modify, or
+ # discard whitespace. Return resulting list of children.
+
+ # Canonize element and text nodes. Leave everything else (comments,
+ # processing instructions, etc.) untouched.
+
+ my @new_children = ();
+
+ while (@{$children})
+ {
+ my $child = shift (@{$children});
+
+ if ($child->{type} eq "elt")
+ {
+ # Leave verbatim elements untouched. For other element nodes,
+ # canonize child list using options appropriate to element.
+
+ if ($self->get_opts ($child->{name})->{format} ne "verbatim")
+ {
+ $child->{content} = $self->tree_canonize2 ($child->{content},
+ $child->{name});
+ }
+ }
+ elsif ($child->{type} eq "text")
+ {
+ # Delete all-whitespace node or strip whitespace as appropriate.
+
+ # Paranoia check: We should never get here for verbatim elements,
+ # because normalization is irrelevant for them.
+
+ die "LOGIC ERROR: trying to canonize verbatim element $par_name!\n"
+ if $par_opts->{format} eq "verbatim";
+
+ if (!$self->block_normalize ())
+ {
+ # Enclosing block is not normalized:
+ # - Delete child all-whitespace text nodes.
+ # - Leave other text nodes untouched.
+
+ next if $child->{content} =~ /^\s*$/;
+ }
+ else
+ {
+ # Enclosing block is normalized, so normalize this text node:
+ # - Convert runs of whitespace characters (including
+ # line-endings characters) to single spaces.
+ # - Trim leading whitespace if this node is the first child
+ # of a block element or it follows a non-normalized node.
+ # - Trim leading whitespace if this node is the last child
+ # of a block element or it precedes a non-normalized node.
+
+ # These are nil if there is no prev or next child
+ my $prev_child = $new_children[$#new_children];
+ my $next_child = $children->[0];
+
+ $child->{content} =~ s/\s+/ /g;
+ $child->{content} =~ s/^ //
+ if (!defined ($prev_child) && $par_opts->{format} eq "block")
+ || $self->non_normalized_node ($prev_child);
+ $child->{content} =~ s/ $//
+ if (!defined ($next_child) && $par_opts->{format} eq "block")
+ || $self->non_normalized_node ($next_child);
+
+ # If resulting text is empty, discard the node.
+ next if $child->{content} =~ /^$/;
+ }
+ }
+ push (@new_children, $child);
+ }
+
+ # Pop block stack if parent was a block element
+ $self->end_block () if $par_opts->{format} eq "block";
+
+ return \@new_children;
+}
+
+
+# Helper function for tree_canonize().
+
+# Determine whether a node is normalized. This is used to check
+# the node that is adjacent to a given text node (either previous
+# or following).
+# - No is node is nil
+# - No if the node is a verbatim element
+# - If the node is a block element, yes or no according to its
+# normalize option
+# - No if the node is an inline element. Inlines are normalized
+# if the parent block is normalized, but this method is not called
+# except while examinine normalized blocks. So its inline children
+# are also normalized.
+# - No if node is a comment, PI, DOCTYPE, or CDATA section. These are
+# treated like verbatim elements.
+
+sub non_normalized_node
+{
+my $self = shift;
+my $node = shift;
+
+ return 0 if !$node;
+ my $type = $node->{type};
+ if ($type eq "elt")
+ {
+ my $node_opts = $self->get_opts ($node->{name});
+ if ($node_opts->{format} eq "verbatim")
+ {
+ return 1;
+ }
+ if ($node_opts->{format} eq "block")
+ {
+ return $node_opts->{normalize} eq "no";
+ }
+ if ($node_opts->{format} eq "inline")
+ {
+ return 0;
+ }
+ die "LOGIC ERROR: non_normalized_node: unhandled node format.\n";
+ }
+ if ($type eq "comment" || $type eq "pi" || $type eq "DOCTYPE"
+ || $type eq "CDATA")
+ {
+ return 1;
+ }
+ if ($type eq "text")
+ {
+ die "LOGIC ERROR: non_normalized_node: got called for text node.\n";
+ }
+ die "LOGIC ERROR: non_normalized_node: unhandled node type.\n";
+}
+
+# ----------------------------------------------------------------------
+
+# Format (pretty-print) the document tree
+
+# Does not modify the node list.
+
+# The class maintains two variables for storing output:
+# - out_doc stores content that has been seen and "flushed".
+# - pending stores an array of strings (content of text nodes and inline
+# element tags). These are held until they need to be flushed, at
+# which point they are concatenated and possibly wrapped/indented.
+# Flushing occurs when a break needs to be written, which happens
+# when something other than a text node or inline element is seen.
+
+# If parent name and children are not given, format the entire document.
+# Assume prevailing indent = 0 if not given.
+
+sub tree_format
+{
+my $self = shift;
+my $par_name = shift || "*DOCUMENT"; # format entire document if no arg
+my $children = shift || $self->{tree}; # use entire tree if no arg
+my $indent = shift || 0;
+
+ # Formatting options for parent element
+ my $par_opts = $self->get_opts ($par_name);
+
+ # If parent is a block element:
+ # - Remember its formatting options on the block stack so they can
+ # be used to control formatting of inline child elements.
+ # - Set initial break type to entry-break.
+ # - Shift prevailing indent right before generating child content.
+
+ if ($par_opts->{format} eq "block")
+ {
+ $self->begin_block ($par_name, $par_opts);
+ $self->set_block_break_type ("entry-break");
+ $indent += $par_opts->{"subindent"};
+ }
+
+ # Variables for keeping track of whether the previous child
+ # was a text node. Used for controlling break behavior in
+ # non-normalized block elements: No line breaks are added around
+ # text in such elements, nor is indenting added.
+
+ my $prev_child_is_text = 0;
+ my $cur_child_is_text = 0;
+
+ foreach my $child (@{$children})
+ {
+ $prev_child_is_text = $cur_child_is_text;
+
+ # Text nodes: just add text to pending output
+
+ if ($child->{type} eq "text")
+ {
+ $cur_child_is_text = 1;
+ $self->add_to_pending ($child->{content});
+ next;
+ }
+
+ $cur_child_is_text = 0;
+
+ # Element nodes: handle depending on format type
+
+ if ($child->{type} eq "elt")
+ {
+ my $child_opts = $self->get_opts ($child->{name});
+
+ # Verbatim elements:
+ # - Print literally without change (use _stringify).
+ # - Do not line-wrap or add any indent.
+
+ if ($child_opts->{format} eq "verbatim")
+ {
+ $self->flush_pending ($indent);
+ $self->emit_break (0)
+ unless $prev_child_is_text && !$self->block_normalize ();
+ $self->set_block_break_type ("element-break");
+ $self->add_to_doc ($child->{open_tag}
+ . $self->tree_stringify ($child->{content})
+ . $child->{close_tag});
+ next;
+ }
+
+ # Inline elements:
+ # - Do not break or indent.
+ # - Do not line-wrap content; just add content to pending output
+ # and let it be wrapped as part of parent's content.
+
+ if ($child_opts->{format} eq "inline")
+ {
+ $self->add_to_pending ($child->{open_tag});
+ $self->tree_format ($child->{name}, $child->{content}, $indent);
+ $self->add_to_pending ($child->{close_tag});
+ next;
+ }
+
+ # If we get here, node is a block element.
+
+ # - Break and flush any pending output
+ # - Break and indent (no indent if break count is zero)
+ # - Process element itself:
+ # - Put out opening tag
+ # - Put out element content
+ # - Put out any indent needed before closing tag. None needed if:
+ # - Element's exit-break is 0 (closing tag is not on new line,
+ # so don't indent it)
+ # - There is no separate closing tag (it was in <abc/> format)
+ # - Element has no children (tags will be written as
+ # <abc></abc>, so don't indent closing tag)
+ # - Element has children, but the block is not normalized and
+ # the last child is a text node
+ # - Put out closing tag
+
+ $self->flush_pending ($indent);
+ $self->emit_break ($indent)
+ unless $prev_child_is_text && !$self->block_normalize ();
+ $self->set_block_break_type ("element-break");
+ $self->add_to_doc ($child->{open_tag});
+ $self->tree_format ($child->{name}, $child->{content}, $indent);
+ $self->add_to_doc (" " x $indent)
+ unless $child_opts->{"exit-break"} <= 0
+ || $child->{close_tag} eq ""
+ || !@{$child->{content}}
+ || (@{$child->{content}}
+ && $child->{content}->[$#{$child->{content}}]->{type} eq "text"
+ && $child_opts->{normalize} eq "no");
+ $self->add_to_doc ($child->{close_tag});
+ next;
+ }
+
+ # Comments, PIs, etc. (everything other than text and elements),
+ # treat similarly to verbatim block:
+ # - Flush any pending output
+ # - Put out a break
+ # - Add node content to collected output
+
+ $self->flush_pending ($indent);
+ $self->emit_break (0)
+ unless $prev_child_is_text && !$self->block_normalize ();
+ $self->set_block_break_type ("element-break");
+ $self->add_to_doc ($child->{content});
+ }
+
+ $prev_child_is_text = $cur_child_is_text;
+
+ # Done processing current element's children now.
+
+ # If current element is a block element:
+ # - If there were any children, flush any pending output and put
+ # out the exit break.
+ # - Pop the block stack
+
+ if ($par_opts->{format} eq "block")
+ {
+ if (@{$children})
+ {
+ $self->flush_pending ($indent);
+ $self->set_block_break_type ("exit-break");
+ $self->emit_break (0)
+ unless $prev_child_is_text && !$self->block_normalize ();
+ }
+ $self->end_block ();
+ }
+}
+
+
+# Emit a break - the appropriate number of newlines according to the
+# enclosing block's current break type.
+
+# In addition, emit the number of spaces indicated by indent. (indent
+# > 0 when breaking just before emitting an element tag that should
+# be indented within its parent element.)
+
+# Exception: Emit no indent if break count is zero. That indicates
+# any following output will be written on the same output line, not
+# indented on a new line.
+
+# Initially, when processing a node's child list, the break type is
+# set to entry-break. Each subsequent break is an element-break.
+# (After child list has been processed, an exit-break is produced as well.)
+
+sub emit_break
+{
+my ($self, $indent) = @_;
+
+ # number of newlines to emit
+ my $break_value = $self->block_break_value ();
+
+ $self->add_to_doc ("\n" x $break_value);
+ # add indent if there *was* a break
+ $self->add_to_doc (" " x $indent) if $indent > 0 && $break_value > 0;
+}
+
+
+# Flush pending output to output document collected thus far:
+# - Wrap pending contents as necessary, with indent before *each* line.
+# - Add pending text to output document (thus "flushing" it)
+# - Clear pending array.
+
+sub flush_pending
+{
+my ($self, $indent) = @_;
+
+ # Do nothing if nothing to flush
+ return if !@{$self->{pending}};
+
+ # If current block is not normalized:
+ # - Text nodes cannot be modified (no wrapping or indent). Flush
+ # text as is without adding a break or indent.
+ # If current block is normalized:
+ # - Add a break.
+ # - If line wrap is disabled:
+ # - Add indent if there is a break. (If there isn't a break, text
+ # should immediately follow preceding tag, so don't add indent.)
+ # - Add text without wrapping
+ # - If line wrap is enabled:
+ # - First line indent is 0 if there is no break. (Text immediately
+ # follows preceding tag.) Otherwise first line indent is same as
+ # prevailing indent.
+ # - Any subsequent lines get the prevailing indent.
+
+ # After flushing text, advance break type to element-break.
+
+ my $s = "";
+
+ if (!$self->block_normalize ())
+ {
+ $s .= join ("", @{$self->{pending}});
+ }
+ else
+ {
+ $self->emit_break (0);
+ my $wrap_len = $self->block_wrap_length ();
+ my $break_value = $self->block_break_value ();
+ if ($wrap_len <= 0)
+ {
+ $s .= " " x $indent if $break_value > 0;
+ $s .= join ("", @{$self->{pending}});
+ }
+ else
+ {
+ my $first_indent = ($break_value > 0 ? $indent : 0);
+ # Wrap lines, then join by newlines (don't add one at end)
+ my @lines = $self->line_wrap ($self->{pending},
+ $first_indent,
+ $indent,
+ $wrap_len);
+ $s .= join ("\n", @lines);
+ }
+ }
+
+ $self->add_to_doc ($s);
+ $self->{pending} = [ ];
+ $self->set_block_break_type ("element-break");
+}
+
+
+# Perform line-wrapping of string array to lines no longer than given
+# length (including indent).
+# Any word longer than line length appears by itself on line.
+# Return array of lines (not newline-terminated).
+
+# $strs - reference to array of text items to be joined and line-wrapped.
+# Each item may be:
+# - A tag (such as <emphasis role="bold">). This should be treated as
+# an atomic unit, which is important for preserving inline tags intact.
+# - A possibly multi-word string (such as "This is a string"). In this
+# latter case, line-wrapping preserves internal whitespace in the
+# string, with the exception that if whitespace would be placed at
+# the end of a line, it is discarded.
+
+# $first_indent - indent for first line
+# $rest_indent - indent for any remaining lines
+# $max_len - maximum length of output lines (including indent)
+
+sub line_wrap
+{
+my ($self, $strs, $first_indent, $rest_indent, $max_len) = @_;
+
+ # First, tokenize the strings
+
+ my @words = ();
+ foreach my $str (@{$strs})
+ {
+ if ($str =~ /^</)
+ {
+ # String is a tag; treat as atomic unit and don't split
+ push (@words, $str);
+ }
+ else
+ {
+ # String of white and non-white tokens.
+ # Tokenize into white and non-white tokens.
+ push (@words, ($str =~ /\S+|\s+/g));
+ }
+ }
+
+ # Now merge tokens that are not separated by whitespace tokens. For
+ # example, "<i>", "word", "</i>" gets merged to "<i>word</i>". But
+ # "<i>", " ", "word", " ", "</i>" gets left as separate tokens.
+
+ my @words2 = ();
+ foreach my $word (@words)
+ {
+ # If there is a previous word that does not end with whitespace,
+ # and the currrent word does not begin with whitespace, concatenate
+ # current word to previous word. Otherwise append current word to
+ # end of list of words.
+ if (@words2 && $words2[$#words2] !~ /\s$/ && $word !~ /^\s/)
+ {
+ $words2[$#words2] .= $word;
+ }
+ else
+ {
+ push (@words2, $word);
+ }
+ }
+
+ my @lines = ();
+ my $line = "";
+ my $llen = 0;
+ # set the indent for the first line
+ my $indent = $first_indent;
+ # saved-up whitespace to put before next non-white word
+ my $white = "";
+
+ foreach my $word (@words2) # ... while words remain to wrap
+ {
+ # If word is whitespace, save it. It gets added before next
+ # word if no line-break occurs.
+ if ($word =~ /^\s/)
+ {
+ $white .= $word;
+ next;
+ }
+ my $wlen = length ($word);
+ if ($llen == 0)
+ {
+ # New output line; it gets at least one word (discard any
+ # saved whitespace)
+ $line = " " x $indent . $word;
+ $llen = $indent + $wlen;
+ $indent = $rest_indent;
+ $white = "";
+ next;
+ }
+ if ($llen + length ($white) + $wlen > $max_len)
+ {
+ # Word (plus saved whitespace) won't fit on current line.
+ # Begin new line (discard any saved whitespace).
+ push (@lines, $line);
+ $line = " " x $indent . $word;
+ $llen = $indent + $wlen;
+ $indent = $rest_indent;
+ $white = "";
+ next;
+ }
+ # add word to current line with saved whitespace between
+ $line .= $white . $word;
+ $llen += length ($white) + $wlen;
+ $white = "";
+ }
+
+ # push remaining line, if any
+ push (@lines, $line) if $line ne "";
+
+ return @lines;
+}
+
+1;
+
+# ----------------------------------------------------------------------
+
+# Begin main program
+
+package main;
+
+
+my $usage = <<EOF;
+Usage: $prog_name [options] xml-file
+
+Options:
+--help, -h
+ Print this message
+--backup suffix -b suffix
+ Back up the input document, adding suffix to the input
+ filename to create the backup filename.
+--canonized-output
+ Proceed only as far as the document canonization stage,
+ printing the result.
+--check-parser
+ Parse the document into tokens and verify that their
+ concatenation is identical to the original input document.
+ This option suppresses further document processing.
+--config-file file_name, -f file_name
+ Specify the configuration filename. If no file is named,
+ xmlformat uses the file named by the environment variable
+ XMLFORMAT_CONF, if it exists, or ./xmlformat.conf, if it
+ exists. Otherwise, xmlformat uses built-in formatting
+ options.
+--in-place, -i
+ Format the document in place, replacing the contents of
+ the input file with the reformatted document. (It's a
+ good idea to use --backup along with this option.)
+--show-config
+ Show configuration options after reading configuration
+ file. This option suppresses document processing.
+--show-unconfigured-elements
+ Show elements that are used in the document but for
+ which no options were specified in the configuration
+ file. This option suppresses document output.
+--verbose, -v
+ Be verbose about processing stages.
+--version, -V
+ Show version information and exit.
+EOF
+
+# Variables for command line options; most are undefined initially.
+my $help;
+my $backup_suffix;
+my $conf_file;
+my $canonize_only;
+my $check_parser;
+my $in_place;
+my $show_conf;
+my $show_unconf_elts;
+my $show_version;
+my $verbose;
+
+GetOptions (
+ # =i means an integer argument is required after the option
+ # =s means a string argument is required after the option
+ # :s means a string argument is optional after the option
+ "help|h" => \$help, # print help message
+ "backup|b=s" => \$backup_suffix, # make backup using suffix
+ "canonized-output" => \$canonize_only, # print canonized document
+ "check-parser" => \$check_parser, # verify parser integrity
+ "config-file|f=s" => \$conf_file, # config file
+ "in-place|i" => \$in_place, # format in place
+ "show-config" => \$show_conf, # show configuration file
+ # need better name
+ "show-unconfigured-elements" => \$show_unconf_elts, # show unconfigured elements
+ "verbose|v" => \$verbose, # be verbose
+ "version|V" => \$show_version, # show version info
+) or do { print "$usage\n"; exit (1); };
+
+if (defined ($help))
+{
+ print "$usage\n";
+ exit (0);
+}
+
+if (defined ($show_version))
+{
+ print "$prog_name $XMLFORMAT_VERSION\n";
+ exit (0);
+}
+
+# --in-place option requires a named file
+
+warn "WARNING: --in-place/-i option ignored (requires named input files)\n"
+ if defined ($in_place) && @ARGV == 0;
+
+# --backup/-b is meaningless without --in-place
+
+if (defined ($backup_suffix))
+{
+ if (!defined ($in_place))
+ {
+ die "--backup/-b option meaningless without --in-place/-i option\n";
+ }
+}
+
+# Save input filenames
+my @in_file = @ARGV;
+
+my $xf = XMLFormat->new ();
+
+# If a configuration file was named explicitly, use it. An error occurs
+# if the file does not exist.
+
+# If no configuration file was named, fall back to:
+# - The file named by the environment variable XMLFORMAT_CONF, if it exists
+# - ./xmlformat.conf, if it exists
+
+# If no configuration file can be found at all, the built-in default options
+# are used. (These are set up in new().)
+
+my $env_conf_file = $ENV{XMLFORMAT_CONF};
+my $def_conf_file = "./xmlformat.conf";
+
+# If no config file was named, but XMLFORMAT_CONF is set, use its value
+# as the config file name.
+if (!defined ($conf_file))
+{
+ $conf_file = $env_conf_file if defined ($env_conf_file);
+}
+# If config file still isn't defined, use the default file if it exists.
+if (!defined ($conf_file))
+{
+ if (-r $def_conf_file && ! -d $def_conf_file)
+ {
+ $conf_file = $def_conf_file;
+ }
+}
+if (defined ($conf_file))
+{
+ warn "Reading configuration file...\n" if $verbose;
+ die "Configuration file '$conf_file' is not readable.\n" if ! -r $conf_file;
+ die "Configuration file '$conf_file' is a directory.\n" if -d $conf_file;
+ $xf->read_config ($conf_file)
+}
+
+if ($show_conf) # show configuration and exit
+{
+ $xf->display_config ();
+ exit(0);
+}
+
+my ($in_doc, $out_doc);
+
+# Process arguments.
+# - If no files named, read string, write to stdout.
+# - If files named, read and process each one. Write output to stdout
+# unless --in-place option was given. Make backup of original file
+# if --backup option was given.
+
+if (@ARGV == 0)
+{
+ warn "Reading document...\n" if $verbose;
+ {
+ local $/ = undef;
+ $in_doc = <>; # slurp input document as single string
+ }
+
+ $out_doc = $xf->process_doc ($in_doc,
+ $verbose, $check_parser, $canonize_only, $show_unconf_elts);
+ if (defined ($out_doc))
+ {
+ warn "Writing output document...\n" if $verbose;
+ print $out_doc;
+ }
+}
+else
+{
+ foreach my $file (@ARGV)
+ {
+ warn "Reading document $file...\n" if $verbose;
+ open (IN, $file)
+ or die "Cannot read $file: $!\n";
+ {
+ local $/ = undef;
+ $in_doc = <IN>; # slurp input document as single string
+ }
+ close (IN);
+ $out_doc = $xf->process_doc ($in_doc,
+ $verbose, $check_parser, $canonize_only, $show_unconf_elts);
+ next unless defined ($out_doc);
+ if (defined ($in_place))
+ {
+ if (defined ($backup_suffix))
+ {
+ warn "Making backup of $file to $file$backup_suffix...\n" if $verbose;
+ rename ($file, $file . $backup_suffix)
+ or die "Could not rename $file to $file$backup_suffix: $!\n";
+ }
+ warn "Writing output document to $file...\n" if $verbose;
+ open (OUT, ">$file") or die "Cannot write to $file: $!\n";
+ print OUT $out_doc;
+ close (OUT);
+ }
+ else
+ {
+ warn "Writing output document...\n" if $verbose;
+ print $out_doc;
+ }
+ }
+}
+
+warn "Done!\n" if $verbose;
+
+exit (0);
diff --git a/bin/xmlformat-fdp.conf b/bin/xmlformat-fdp.conf
new file mode 100644
index 0000000..b724168
--- /dev/null
+++ b/bin/xmlformat-fdp.conf
@@ -0,0 +1,101 @@
+# Comments are treated as CDATA and not touched. It's best to set
+# them out from other text if possible. A doublespace is nice but
+# code is still code, so it's not really that important. (Down, inner
+# stickler...)
+#
+# ChangeLog:
+# * Jun 26 2005 Paul W. Frields &lt;stickster@gmail.com&gt;
+# - Initial version.
+#
+
+*DEFAULT
+ format = block
+ entry-break = 1
+ element-break = 1
+ exit-break = 1
+ subindent = 2
+ normalize = no
+ wrap-length = 72
+
+*DOCUMENT
+ format = block
+ wrap-length = 72
+ element-break = 2
+
+year holder
+ entry-break = 0
+ exit-break = 0
+ normalize = yes
+
+firstname surname othername
+ entry-break = 0
+ exit-break = 0
+
+revnumber date authorinitials
+ entry-break = 0
+ exit-break = 0
+
+revremark
+ normalize = yes
+
+
+article book
+ element-break = 2
+
+section
+ entry-break = 1
+ normalize = yes
+
+# "Normalize" means make smart whitespace decisions
+para example important note warning caution itemizedlist variablelist varlistentry
+ normalize = yes
+
+title titleabbrev
+ entry-break = 0
+ exit-break = 0
+ normalize = yes
+
+emphasis literal
+ format = inline
+
+trademark
+ format = inline
+
+# Do not fubar <screen> or <programlisting>
+screen programlisting
+ format = verbatim
+
+command application filename option userinput computeroutput replaceable
+ format = inline
+
+# The <primary> and <secondary> subelements of <indexterm> are still block
+firstterm
+ format = inline
+ normalize = yes
+
+indexterm
+ format = block
+# normalize = yes
+
+primary secondary
+ format = block
+ entry-break = 1
+ exit-break = 1
+
+varlistentry
+ element-break = 1
+
+term
+ entry-break = 0
+ exit-break = 0
+
+menuchoice guilabel guimenu guisubmenu guimenuitem
+ format = inline
+
+wordasword systemitem citetitle footnote
+ format = inline
+
+# Make <ulink> and <xref> less goofy in their use of whitespace
+ulink xref
+ format = inline
+