3 files changed, 228 insertions, 147 deletions
diff --git a/install/po/Makefile.in b/install/po/Makefile.in
index 4bee861a8..d65ba0c70 100644
--- a/install/po/Makefile.in
+++ b/install/po/Makefile.in
@@ -15,6 +15,7 @@ MSGINIT = @MSGINIT@
 MSGMERGE = @MSGMERGE@
 MSGCMP = @MSGCMP@
 TX = @TX@
+IPA_TEST_I18N = ../../tests/i18n.py
 
 DOMAIN = @GETTEXT_DOMAIN@
 MSGMERGE_UPDATE = $(MSGMERGE) --update
@@ -129,7 +130,7 @@ update-pot:
 	    echo "$(DOMAIN).pot unmodified" ; \
 	fi || :
 	@rm -f $(DOMAIN).pot.update $(DOMAIN).pot.update.tmp $(DOMAIN).pot.tmp
-	./test_i18n.py --show-strings --validate-pot $(DOMAIN).pot
+	$(IPA_TEST_I18N) --show-strings --validate-pot $(DOMAIN).pot
 
 msg-stats:
 	@pot_count=`$(MSGFMT) --statistics $(DOMAIN).pot 2>&1 | \
@@ -169,38 +170,14 @@ distclean: clean
 
 maintainer-clean: distclean
 
-# We test our translations by taking the original untranslated string
-# (e.g. msgid) and prepend a prefix character and then append a suffix
-# character. The test consists of asserting that the first character in the
-# translated string is the prefix, the last character in the translated string
-# is the suffix and the everything between the first and last character exactly
-# matches the original msgid.
-#
-# We use unicode characters not in the ascii character set for the prefix and
-# suffix to enhance the test. To make reading the translated string easier the
-# prefix is the unicode right pointing arrow and the suffix left pointing arrow,
-# thus the translated string looks like the original string enclosed in
-# arrows. In ASCII art the string "foo" would render as:
-# -->foo<--
-#
-# Unicode right pointing arrow: u'\u2192', utf-8 = '\xe2\x86\x92'
-# Unicode left pointing arrow:  u'\u2190', utf-8 = '\xe2\x86\x90'
-#
-# The sed command below performs the prefix and suffix substitution.
-#
-# When msginit is invoked with an English target locale it copies the msgid
-# into the msgstr. This is an undocumented feature of msginit. Otherwise the
-# msgstr will be set to the empty string (i.e. untranslated). We depend on
-# the msgid being copied to the msgstr.
-
 test:
-	./test_i18n.py --test-gettext
+	$(IPA_TEST_I18N) --test-gettext
 
 validate-pot:
-	./test_i18n.py --show-strings --validate-pot $(DOMAIN).pot
+	$(IPA_TEST_I18N) --show-strings --validate-pot $(DOMAIN).pot
 
 validate-po:
-	./test_i18n.py --show-strings --validate-po $(po_files)
+	$(IPA_TEST_I18N) --show-strings --validate-po $(po_files)
 
 debug:
 	@echo Python potfiles:
diff --git a/install/po/test_i18n.py b/tests/i18n.py
index beb43ccaa..067bc5e39 100755
--- a/install/po/test_i18n.py
+++ b/tests/i18n.py
@@ -19,6 +19,8 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
+# WARNING: Do not import ipa modules, this is also used as a
+# stand-alone script (invoked from install/po Makefile).
 import optparse
 import sys
 import gettext
@@ -86,14 +88,108 @@ _shell_substitution_regexp = re.compile(r'\$(\s*)([({]?)(\s*)\w+(\s*)([)}]?)')
 # group 4: whitespace between variable and ending delimiter
 # group 5: ending delimiter
 
-# We do not permit anonymous substitutions in translation strings
-# (e.g. '%s occurred' % error) because they do not provide the
-# necessary context to translators, they would only see
-# '%s occurred'. Instead a keyword substitution should be used
-# (e.g. '%(error)s occurred' % {'error': error_message})
+printf_fmt_re = re.compile(
+    r"%"                                     # start
+    r"(\d+\$)?"                              # fmt_arg    (group  1)
+    r"(([#0 +'I]|-(?!\d))*)"                 # flags      (group  2)
+    r"(([+-]?([1-9][0-9]*)?)|(\*|\*\d+\$))?" # width      (group  4)
+    r"(\.((-?\d*)|(\*|)|(\*\d+\$)))?"        # precision  (group  8)
+    r"(h|hh|l|ll|L|j|z|t)?"                  # length     (group 13)
+    r"([diouxXeEfFgGaAcspnm%])")             # conversion (group 14)
 
-# Python anonymous format substitutions, e.g. %s, %d, %f, etc.
-python_anonymous_substitutions_regexp = re.compile(r'%[srduoxf]\b') # e.g. %s
+#-------------------------------------------------------------------------------
+
+def get_prog_langs(entry):
+    '''
+    Given an entry in a pot or po file return a set of the
+    programming languages it was found in. It needs to be a set
+    because the same msgid may appear in more than one file which may
+    be in different programming languages.
+
+    Note: One might think you could use the c-format etc. flags
+    attached to entry to make this determination, but you can't. Those
+    flags refer to the style of the string not the programming
+    language it came from. Also the flags are often omitted and/or are
+    inaccurate.
+
+    For now we just look at the file extension. If we knew the path to
+    the file we could use other heuristics such as looking for the
+    shebang interpreter string.
+
+    The set of possible language types which might be returned are:
+
+    * c
+    * python
+
+    '''
+    result = set()
+
+    for location in entry.occurrences:
+        filename = location[0]
+        ext = os.path.splitext(filename)[1]
+
+        if ext in ('.c', '.h', '.cxx', '.cpp', '.hxx'):
+            result.add('c')
+        elif ext in ('.py',):  # one-element tuple, not a substring test on '.py'
+            result.add('python')
+
+    return result
+
+def parse_printf_fmt(s):
+    '''
+    Parse a printf style format string and return a list of format
+    conversions found in the string.
+
+    Each conversion specification is introduced by the character %, and
+    ends with a conversion specifier.  In between there may be (in this
+    order) zero or more flags, an optional minimum field width, an
+    optional precision and an optional length modifier. See "man 3
+    printf" for details.
+
+    Each item in the returned list is a dict whose keys are the
+    sub-parts of a conversion specification. The keys and values are:
+
+    fmt
+        The entire format conversion specification
+    fmt_arg
+        The positional index of the matching argument in the argument
+        list, e.g. %1$ indicates the first argument in the list will
+        be read for this conversion; excludes the leading % but
+        includes the trailing $, so 1$ is the fmt_arg in %1$.
+    flags
+        The flag characters, e.g. 0 is the flag in %08d
+    width
+        The width field, e.g. 20 is the width in %20s
+    precision
+        The precision field, e.g. .2 is the precision in %8.2f
+    length
+        The length modifier field, e.g. l is the length modifier in %ld
+    conversion
+        The conversion specifier character, e.g. d is the conversion
+        specification character in %ld
+
+    If the part is not found in the format its value will be None.
+    '''
+
+    result = []
+
+    # get list of all matches, but skip escaped %
+    matches = [x for x in printf_fmt_re.finditer(s) if x.group(0) != "%%"]
+
+    # build dict of each sub-part of the format, append to result
+    for match in matches:
+        parts = {}
+        parts['fmt']        = match.group(0)
+        parts['fmt_arg']    = match.group(1)
+        parts['flags']      = match.group(2) or None  # '' (no flags) -> None
+        parts['width']      = match.group(4) or None  # '' (no width) -> None
+        parts['precision']  = match.group(8)
+        parts['length']     = match.group(13)
+        parts['conversion'] = match.group(14)
+
+        result.append(parts)
+
+    return result
 
 def validate_substitutions_match(s1, s2, s1_name='string1', s2_name='string2'):
     '''
@@ -233,22 +329,35 @@ def validate_substitution_syntax(s, s_name='string'):
     return errors
 
 
-def validate_anonymous_substitutions(s, s_name='string'):
+def validate_positional_substitutions(s, prog_langs, s_name='string'):
     '''
-    We do not permit multiple anonymous substitutions in translation
+    We do not permit multiple positional substitutions in translation
     strings (e.g. '%s') because they do not allow translators to reorder the
     wording. Instead keyword substitutions should be used when there are
     more than one.
     '''
     errors = []
 
+    fmts = parse_printf_fmt(s)
+    n_fmts = len(fmts)
 
-    matches = list(python_anonymous_substitutions_regexp.finditer(s))
-
-    if len(matches) > 1:
-        for match in python_anonymous_substitutions_regexp.finditer(s):
-            errors.append("%s has anonymous substitution '%s', use keyword substitution instead" %
-                          (s_name, match.group(0)))
+    # A lone conversion cannot be mis-ordered; only check when there are several.
+    if n_fmts > 1:
+        for fmt_parts in fmts:
+            fmt        = fmt_parts['fmt']
+            fmt_arg    = fmt_parts['fmt_arg']
+            width      = fmt_parts['width']
+
+            if width == '*':
+                errors.append("Error: * width arg in format '%s' should be indexed" % fmt)
+
+            if fmt_arg is None:
+                if 'c' in prog_langs:
+                    errors.append("%s format '%s' is positional, should use indexed argument" %
+                                  (s_name, fmt))
+                else:
+                    errors.append("%s format '%s' is positional, should use keyword substitution" %
+                                  (s_name, fmt))
 
     if errors:
         if show_strings:
@@ -265,7 +374,7 @@ def validate_file(file_path, validation_mode):
 
     * validate_substitutions_match()
     * validate_substitution_syntax()
-    * validate_anonymous_substitutions()
+    * validate_positional_substitutions()
 
     Returns the number of entries with errors.
     '''
@@ -290,7 +399,8 @@ def validate_file(file_path, validation_mode):
         have_msgstr = msgstr.strip() != ''
         if validation_mode == 'pot':
             if have_msgid:
-                errors = validate_anonymous_substitutions(msgid, 'msgid')
+                prog_langs = get_prog_langs(entry)
+                errors = validate_positional_substitutions(msgid, prog_langs, 'msgid')
                 entry_errors.extend(errors)
         if validation_mode == 'po':
             if have_msgid and have_msgstr:
@@ -387,23 +497,28 @@ def validate_unicode_edit(msgid, msgstr):
 
 
 def test_translations(po_file, lang, domain, locale_dir):
-    try:
+    # The test installs the test message catalog under the xh_ZA
+    # (i.e. South African Xhosa) language by default. It would be nice
+    # to use a dummy language not associated with any real language,
+    # but the setlocale function demands the locale be a valid
+    # known locale, South African Xhosa is a reasonable choice :)
 
-        # The test installs the test message catalog under the xh_ZA
-        # (e.g. Zambia Xhosa) language by default. It would be nice to
-        # use a dummy language not associated with any real language,
-        # but the setlocale function demands the locale be a valid
-        # known locale, Zambia Xhosa is a reasonable choice :)
+    os.environ['LANG'] = lang
 
-        os.environ['LANG'] = lang
+    # Create a gettext translation object specifying our domain as
+    # 'ipa' and the locale_dir as 'test_locale' (i.e. where to
+    # look for the message catalog). Then use that translation
+    # object to obtain the translation functions.
 
-        # Create a gettext translation object specifying our domain as
-        # 'ipa' and the locale_dir as 'test_locale' (i.e. where to
-        # look for the message catalog). Then use that translation
-        # object to obtain the translation functions.
+    t = gettext.translation(domain, locale_dir)
 
-        t = gettext.translation(domain, locale_dir)
+    get_msgstr = t.ugettext
+    get_msgstr_plural = t.ungettext
 
+    return po_file_iterate(po_file, get_msgstr, get_msgstr_plural)
+
+def po_file_iterate(po_file, get_msgstr, get_msgstr_plural):
+    try:
         # Iterate over the msgid's
         if not os.path.isfile(po_file):
             print >>sys.stderr, 'file does not exist "%s"' % (po_file)
@@ -422,8 +537,8 @@ def test_translations(po_file, lang, domain, locale_dir):
             if entry.msgid_plural:
                 msgid = entry.msgid
                 msgid_plural = entry.msgid_plural
-                msgstr = t.ungettext(msgid, msgid_plural, 1)
-                msgstr_plural = t.ungettext(msgid, msgid_plural, 2)
+                msgstr = get_msgstr_plural(msgid, msgid_plural, 1)
+                msgstr_plural = get_msgstr_plural(msgid, msgid_plural, 2)
 
                 try:
                     n_translations += 1
@@ -448,7 +563,7 @@ def test_translations(po_file, lang, domain, locale_dir):
 
             else:
                 msgid = entry.msgid
-                msgstr = t.ugettext(msgid)
+                msgstr = get_msgstr(msgid)
 
                 try:
                     n_translations += 1
diff --git a/tests/test_ipalib/test_text.py b/tests/test_ipalib/test_text.py
index 1931ca4fe..9f60785ff 100644
--- a/tests/test_ipalib/test_text.py
+++ b/tests/test_ipalib/test_text.py
@@ -22,10 +22,13 @@ Test the `ipalib.text` module.
 """
 
 import os
+import shutil
+import tempfile
 import re
 import nose
 import locale
 from tests.util import raises, assert_equal
+from tests.i18n import create_po, po_file_iterate
 from ipalib.request import context
 from ipalib import request
 from ipalib import text
@@ -35,93 +38,6 @@ singular = '%(count)d goose makes a %(dish)s'
 plural = '%(count)d geese make a %(dish)s'
 
 
-# Unicode right pointing arrow
-prefix = u'\u2192'               # utf-8 == '\xe2\x86\x92'
-# Unicode left pointing arrow
-suffix = u'\u2190'               # utf-8 == '\xe2\x86\x90'
-
-def get_msgid(po_file):
-    'Get the first non-empty msgid from the po file'
-
-    msgid_re = re.compile(r'^\s*msgid\s+"(.+)"\s*$')
-    f = open(po_file)
-    for line in f.readlines():
-        match = msgid_re.search(line)
-        if match:
-            msgid = match.group(1)
-            f.close()
-            return msgid
-    f.close()
-    raise ValueError('No msgid found in %s' % po_file)
-
-def test_gettext():
-    '''
-    Test gettext translation
-
-    We test our translations by taking the original untranslated
-    string (e.g. msgid) and prepend a prefix character and then append
-    a suffix character. The test consists of asserting that the first
-    character in the translated string is the prefix, the last
-    character in the translated string is the suffix and the
-    everything between the first and last character exactly matches
-    the original msgid.
-
-    We use unicode characters not in the ascii character set for the
-    prefix and suffix to enhance the test. To make reading the
-    translated string easier the prefix is the unicode right pointing
-    arrow and the suffix left pointing arrow, thus the translated
-    string looks like the original string enclosed in arrows. In ASCII
-    art the string "foo" would render as: "-->foo<--"
-    '''
-
-    localedir='install/po/test_locale'
-    test_file='install/po/test.po'
-
-    lang = os.environ['LANG']
-    os.environ['LANG'] = 'xh_ZA'
-
-    # Tell gettext that our domain is 'ipa', that locale_dir is
-    # 'test_locale' (i.e. where to look for the message catalog)
-    _ = text.GettextFactory('ipa', localedir)
-
-    # We need a translatable string to test with, read one from the
-    # test po file
-    if not file_exists(test_file):
-        raise nose.SkipTest(
-           'Test language not available, run "make test_lang" in install/po'
-        )
-    msgid = get_msgid(test_file)
-
-    # Get the localized instance of the msgid, it should be a Gettext
-    # instance.
-    localized = _(msgid)
-    assert(isinstance(localized, text.Gettext))
-
-    # Get the translated string from the Gettext instance by invoking
-    # unicode on it.
-    translated = unicode(localized)
-
-    # Perform the verifications on the translated string.
-
-    # Verify the first character is the test prefix
-    assert(translated[0] == prefix)
-
-    # Verify the last character is the test suffix
-    assert(translated[-1] == suffix)
-
-    # Verify everything between the first and last character is the
-    # original untranslated string
-    assert(translated[1:-1] == msgid)
-    
-    # Reset the language and assure we don't get the test values
-    context.__dict__.clear()
-    os.environ['LANG'] = lang
-
-    translated = unicode(localized)
-
-    assert(translated[0] != prefix)
-    assert(translated[-1] != suffix)
-
 def test_create_translation():
     f = text.create_translation
     key = ('foo', None)
@@ -129,6 +45,79 @@ def test_create_translation():
     assert context.__dict__[key] is t
 
 
+class test_TestLang(object):
+    '''
+    Check a test catalog built from ipa.pot through the ipalib.text factories.
+    '''
+    def setUp(self):
+        self.tmp_dir = None
+        self.saved_lang  = None
+
+        self.lang = 'xh_ZA'
+        self.domain = 'ipa'
+        self.ipa_i18n_dir = os.path.join(os.path.dirname(__file__), '../../install/po')
+
+        self.pot_basename = '%s.pot' % self.domain
+        self.po_basename = '%s.po' % self.lang
+        self.mo_basename = '%s.mo' % self.domain
+
+        self.tmp_dir = tempfile.mkdtemp()
+        self.saved_lang = os.environ.get('LANG')  # None when LANG is unset
+
+        self.locale_dir = os.path.join(self.tmp_dir, 'test_locale')
+        self.msg_dir = os.path.join(self.locale_dir, self.lang, 'LC_MESSAGES')
+        os.makedirs(self.msg_dir)
+
+        self.pot_file = os.path.join(self.ipa_i18n_dir, self.pot_basename)
+        self.mo_file = os.path.join(self.msg_dir, self.mo_basename)
+        self.po_file = os.path.join(self.tmp_dir, self.po_basename)
+
+        result = create_po(self.pot_file, self.po_file, self.mo_file)
+        if result:
+            self._skip('Unable to create po file "%s" & mo file "%s" from pot file "%s"' %
+                       (self.po_file, self.mo_file, self.pot_file))
+
+        if not file_exists(self.po_file):
+            self._skip('Test po file unavailable, run "make test" in install/po')
+
+        if not file_exists(self.mo_file):
+            self._skip('Test mo file unavailable, run "make test" in install/po')
+
+        self.po_file_iterate = po_file_iterate
+
+    def _skip(self, reason):
+        # tearDown is not run when setUp raises, so remove the
+        # temporary directory here before skipping the test.
+        shutil.rmtree(self.tmp_dir)
+        self.tmp_dir = None
+        raise nose.SkipTest(reason)
+
+    def tearDown(self):
+        if self.saved_lang is not None:
+            os.environ['LANG'] = self.saved_lang
+        else:
+            os.environ.pop('LANG', None)
+        if self.tmp_dir is not None:
+            shutil.rmtree(self.tmp_dir)
+
+    def test_test_lang(self):
+        # The test catalog is installed under xh_ZA (Xhosa, South
+        # Africa) because setlocale demands a valid, known locale.
+        os.environ['LANG'] = self.lang
+
+        # Look translations up through the ipalib.text factories,
+        # pointed at the temporary locale directory.
+        def get_msgstr(msg):
+            gt = text.GettextFactory(localedir=self.locale_dir)(msg)
+            return unicode(gt)
+
+        def get_msgstr_plural(singular, plural, count):
+            ng = text.NGettextFactory(localedir=self.locale_dir)(singular, plural, count)
+            return ng(count)
+
+        result = self.po_file_iterate(self.po_file, get_msgstr, get_msgstr_plural)
+        assert result == 0
+
 class test_LazyText(object):
 
     klass = text.LazyText