1 files changed, 562 insertions, 0 deletions
diff --git a/silpa/modules/guesslanguages/guess_language.py b/silpa/modules/guesslanguages/guess_language.py
new file mode 100644
index 0000000..158b5f5
--- /dev/null
+++ b/silpa/modules/guesslanguages/guess_language.py
@@ -0,0 +1,562 @@
+''' Guess the language of text.
+
+    Based on guesslanguage.cpp by Jacob R Rideout for KDE
+    http://websvn.kde.org/branches/work/sonnet-refactoring/common/nlp/guesslanguage.cpp?view=markup
+    which itself is based on Language::Guess by Maciej Ceglowski
+    http://languid.cantbedone.org/
+
+    Copyright (c) 2008, Kent S Johnson
+
+    C++ version is Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>
+    Perl version is (c) 2004-6 Maciej Ceglowski
+    
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+    
+    Note: Language::Guess is GPL-licensed. KDE developers received permission
+    from the author to distribute their port under LGPL:
+    http://lists.kde.org/?l=kde-sonnet&m=116910092228811&w=2
+    
+'''
+
+import codecs, os, re, sys, unicodedata
+# collections.defaultdict exists from Python 2.5 on; older interpreters get
+# the pure-Python stand-in below.
+try:
+    from collections import defaultdict
+except ImportError:
+    class defaultdict(dict):
+        '''dict that fills in missing keys by calling default_factory().'''
+        def __init__(self, default_factory=None, *a, **kw):
+            if default_factory is not None and not callable(default_factory):
+                raise TypeError('first argument must be callable')
+            dict.__init__(self, *a, **kw)
+            self.default_factory = default_factory
+        def __getitem__(self, key):
+            try:
+                return dict.__getitem__(self, key)
+            except KeyError:
+                return self.__missing__(key)
+        def __missing__(self, key):
+            if self.default_factory is None:
+                raise KeyError(key)
+            self[key] = value = self.default_factory()
+            return value
+        def __reduce__(self):
+            if self.default_factory is None:
+                args = tuple()
+            else:
+                args = self.default_factory,
+            return type(self), args, None, None, self.items()
+        def copy(self):
+            return self.__copy__()
+        def __copy__(self):
+            return type(self)(self.default_factory, self)
+        def __deepcopy__(self, memo):
+            import copy
+            return type(self)(self.default_factory, copy.deepcopy(self.items()))
+        def __repr__(self):
+            return 'defaultdict(%s, %s)' % (self.default_factory, dict.__repr__(self))
+
+from blocks import unicodeBlock
+from common import SilpaModule
+
+MIN_LENGTH = 20  # check() returns UNKNOWN for samples shorter than this
+# Candidate language codes per script family; check() scores a sample against these
+BASIC_LATIN = "en ceb ha so tlh id haw la sw eu nr nso zu xh ss st tn ts".split()
+EXTENDED_LATIN = "cs af pl hr ro sk sl tr hu az et sq ca es fr de nl it da is nb sv fi lv pt ve lt tl cy".split()
+ALL_LATIN = BASIC_LATIN + EXTENDED_LATIN
+CYRILLIC = "ru uk kk uz mn sr mk bg ky".split()
+ARABIC = "ar fa ps ur".split()
+DEVANAGARI = "hi ne".split()
+# (Unicode block name, language code) pairs that _identify() maps one-to-one
+# NOTE mn appears twice, once for mongolian script and once for CYRILLIC
+SINGLETONS = [
+    ('Armenian', 'hy'),
+    ('Hebrew', 'he'),
+    ('Bengali', 'bn'),
+    ('Gurmukhi', 'pa'),
+    ('Greek', 'el'),
+    ('Gujarati', 'gu'),
+    ('Oriya', 'or'),
+    ('Tamil', 'ta'),
+    ('Telugu', 'te'),
+    ('Kannada', 'kn'),
+    ('Malayalam', 'ml'),
+    ('Sinhala', 'si'),
+    ('Thai', 'th'),
+    ('Lao', 'lo'),
+    ('Tibetan', 'bo'),
+    ('Burmese', 'my'),
+    ('Georgian', 'ka'),
+    ('Mongolian', 'mn-Mong'),
+    ('Khmer', 'km'),
+]
+# Portuguese variants that _identify() re-scores when check() answers "pt"
+PT = "pt_BR pt_PT".split()
+# Value returned by the guess* functions when no language can be determined
+UNKNOWN = 'UNKNOWN'
+# Trigram models keyed by lower-cased model file name; filled by _load_models()
+models = {}
+
+# Language code -> display name; each code appears once (a second "uk" entry used to override the correct spelling)
+NAME_MAP = {
+    "ab" : "Abkhazian",
+    "af" : "Afrikaans",
+    "ar" : "Arabic",
+    "az" : "Azeri",
+    "be" : "Byelorussian",
+    "bg" : "Bulgarian",
+    "bn" : "Bengali",
+    "bo" : "Tibetan",
+    "br" : "Breton",
+    "ca" : "Catalan",
+    "ceb" : "Cebuano",
+    "cs" : "Czech",
+    "cy" : "Welsh",
+    "da" : "Danish",
+    "de" : "German",
+    "el" : "Greek",
+    "en" : "English",
+    "eo" : "Esperanto",
+    "es" : "Spanish",
+    "et" : "Estonian",
+    "eu" : "Basque",
+    "fa" : "Farsi",
+    "fi" : "Finnish",
+    "fo" : "Faroese",
+    "fr" : "French",
+    "fy" : "Frisian",
+    "gd" : "Scots Gaelic",
+    "gl" : "Galician",
+    "gu" : "Gujarati",
+    "ha" : "Hausa",
+    "haw" : "Hawaiian",
+    "he" : "Hebrew",
+    "hi" : "Hindi",
+    "hr" : "Croatian",
+    "hu" : "Hungarian",
+    "hy" : "Armenian",
+    "id" : "Indonesian",
+    "is" : "Icelandic",
+    "it" : "Italian",
+    "ja" : "Japanese",
+    "ka" : "Georgian",
+    "kk" : "Kazakh",
+    "km" : "Cambodian",
+    "ko" : "Korean",
+    "ku" : "Kurdish",
+    "ky" : "Kyrgyz",
+    "la" : "Latin",
+    "lt" : "Lithuanian",
+    "lv" : "Latvian",
+    "mg" : "Malagasy",
+    "mk" : "Macedonian",
+    "ml" : "Malayalam",
+    "mn" : "Mongolian",
+    "mr" : "Marathi",
+    "ms" : "Malay",
+    "nd" : "Ndebele",
+    "ne" : "Nepali",
+    "nl" : "Dutch",
+    "nn" : "Nynorsk",
+    "no" : "Norwegian",
+    "nso" : "Sepedi",
+    "pa" : "Punjabi",
+    "pl" : "Polish",
+    "ps" : "Pashto",
+    "pt" : "Portuguese",
+    "ro" : "Romanian",
+    "ru" : "Russian",
+    "sa" : "Sanskrit",
+    "sh" : "Serbo-Croatian",
+    "sk" : "Slovak",
+    "sl" : "Slovene",
+    "so" : "Somali",
+    "sq" : "Albanian",
+    "sr" : "Serbian",
+    "sv" : "Swedish",
+    "sw" : "Swahili",
+    "ta" : "Tamil",
+    "te" : "Telugu",
+    "th" : "Thai",
+    "tl" : "Tagalog",
+    "tlh" : "Klingon",
+    "tn" : "Setswana",
+    "tr" : "Turkish",
+    "ts" : "Tsonga",
+    "tw" : "Twi",
+    "uk" : "Ukrainian",
+    "ur" : "Urdu",
+    "uz" : "Uzbek",
+    "ve" : "Venda",
+    "vi" : "Vietnamese",
+    "xh" : "Xhosa",
+    "zh" : "Chinese",
+    "zh-tw" : "Traditional Chinese (Taiwan)",
+    "zu" : "Zulu",
+}
+
+IANA_MAP = {  # language code -> numeric language id looked up by _getId()
+    "ab" : 12026,
+    "af" : 40,
+    "ar" : 26020,
+    "az" : 26030,
+    "be" : 11890,
+    "bg" : 26050,
+    "bn" : 26040,
+    "bo" : 26601,
+    "br" : 1361,
+    "ca" : 3,
+    "ceb" : 26060,
+    "cs" : 26080,
+    "cy" : 26560,
+    "da" : 26090,
+    "de" : 26160,
+    "el" : 26165,
+    "en" : 26110,
+    "eo" : 11933,
+    "es" : 26460,
+    "et" : 26120,
+    "eu" : 1232,
+    "fa" : 26130,
+    "fi" : 26140,
+    "fo" : 11817,
+    "fr" : 26150,
+    "fy" : 1353,
+    "gd" : 65555,
+    "gl" : 1252,
+    "gu" : 26599,
+    "ha" : 26170,
+    "haw" : 26180,
+    "he" : 26592,
+    "hi" : 26190,
+    "hr" : 26070,
+    "hu" : 26200,
+    "hy" : 26597,
+    "id" : 26220,
+    "is" : 26210,
+    "it" : 26230,
+    "ja" : 26235,
+    "ka" : 26600,
+    "kk" : 26240,
+    "km" : 1222,
+    "ko" : 26255,
+    "ku" : 11815,
+    "ky" : 26260,
+    "la" : 26280,
+    "lt" : 26300,
+    "lv" : 26290,
+    "mg" : 1362,
+    "mk" : 26310,
+    "ml" : 26598,
+    "mn" : 26320,
+    "mr" : 1201,
+    "ms" : 1147,
+    "ne" : 26330,
+    "nl" : 26100,
+    "nn" : 172,
+    "no" : 26340,
+    "pa" : 65550,
+    "pl" : 26380,
+    "ps" : 26350,
+    "pt" : 26390,
+    "ro" : 26400,
+    "ru" : 26410,
+    "sa" : 1500,
+    "sh" : 1399,
+    "sk" : 26430,
+    "sl" : 26440,
+    "so" : 26450,
+    "sq" : 26010,
+    "sr" : 26420,
+    "sv" : 26480,
+    "sw" : 26470,
+    "ta" : 26595,
+    "te" : 26596,
+    "th" : 26594,
+    "tl" : 26490,
+    "tlh" : 26250,
+    "tn" : 65578,
+    "tr" : 26500,
+    "tw" : 1499,
+    "uk" : 26510,  # NOTE(review): shadowed by the duplicate "uk" key on the next line
+    "uk" : 26520,  # effective value for "uk"; confirm which of the two ids is intended
+    "ur" : 26530,
+    "uz" : 26540,
+    "vi" : 26550,
+    "zh" : 26065,
+    "zh-tw" : 22,
+}
+
+
+def _load_models():
+    ''' Fill models from each file in ./trigrams: lower-cased file name -> {trigram: int}. '''
+    modelsDir = os.path.join(os.path.dirname(__file__), 'trigrams')
+    lineRe = re.compile(r"(.{3})\s+(.*)")
+    for modelFile in os.listdir(modelsDir):
+        modelPath = os.path.join(modelsDir, modelFile)
+        if os.path.isdir(modelPath):
+            continue
+        model = {}  # QHash<QString,int> model
+        f = codecs.open(modelPath, 'r', 'utf-8')
+        try:  # try/finally rather than 'with': the module still caters for Python < 2.5
+            for line in f:
+                m = lineRe.search(line)
+                if m:
+                    model[m.group(1)] = int(m.group(2))
+        finally:
+            f.close()  # the handle used to be left open for every model file
+        models[modelFile.lower()] = model
+
+_load_models()  # populate models once, at import time
+
+def guessLanguage(text):
+    ''' Guess the language of text and return its code, i.e. 'en'.
+        Byte strings are decoded as UTF-8 first; empty input gives UNKNOWN.
+    '''
+    if not text:
+        return UNKNOWN
+    if isinstance(text, str):
+        text = unicode(text, 'utf-8')
+    normalized = normalize(text)
+    scripts = find_runs(normalized)
+    return _identify(normalized, scripts)
+
+
+def guessLanguageInfo(text):
+    """
+        Returns (tag, id, name)  i.e. ('en', 26110, 'English').
+        All three items are UNKNOWN when the language cannot be guessed;
+        id or name alone is UNKNOWN when the tag is missing from its map.
+    """
+    tag = guessLanguage(text)
+    if tag != UNKNOWN:
+        # look the tag up in both tables
+        return tag, _getId(tag), _getName(tag)
+
+    return UNKNOWN, UNKNOWN, UNKNOWN
+
+
+# An alias for guessLanguage, so it also returns the language tag, i.e. 'en'
+guessLanguageTag = guessLanguage
+
+
+def guessLanguageId(text):
+    """
+        Guess the language of text and return its numeric id, i.e. 26110,
+        or UNKNOWN when the guessed tag has no entry in IANA_MAP.
+    """
+    return _getId(guessLanguage(text))
+
+
+def guessLanguageName(text):
+    """
+        Guess the language of text and return its name, i.e. 'English',
+        or UNKNOWN when the guessed tag has no entry in NAME_MAP.
+    """
+    return _getName(guessLanguage(text))
+
+
+def _getId(iana):
+    return IANA_MAP.get(iana, UNKNOWN)  # numeric id for the tag, UNKNOWN if unmapped
+
+def _getName(iana):
+    return NAME_MAP.get(iana, UNKNOWN)  # display name for the tag, UNKNOWN if unmapped
+
+
+def find_runs(text):
+    ''' Return the Unicode blocks that dominate the letters of text.
+
+        Letters are tallied per block as reported by unicodeBlock(). A block
+        is kept when it holds at least 40% of all letters; "Basic Latin" is
+        already kept from 15% and "Latin Extended Additional" (for
+        Vietnamese) from 10%.
+    '''
+    # Lower acceptance thresholds, in percent, for the specially treated blocks
+    special = {"Basic Latin": 15, "Latin Extended Additional": 10}
+
+    # Tally the letters of text per Unicode block
+    counts = defaultdict(int)
+    letters = 0
+    for ch in text:
+        if not ch.isalpha():
+            continue
+        counts[unicodeBlock(ch)] += 1
+        letters += 1
+
+    dominant = []
+    for block, hits in counts.items():
+        # percentage of all letters that fall in this block
+        share = (hits * 100) / letters
+        if share >= special.get(block, 40):
+            dominant.append(block)
+
+    return dominant
+
+
+def _identify(sample, scripts):
+    ''' Pick a language code for sample from its dominant Unicode blocks.
+        scripts holds block names (see find_runs). Tests run in a fixed order
+        and the first match wins: single-language blocks return their code
+        directly; Cyrillic, Arabic, Devanagari and Latin go through check().
+        Returns UNKNOWN for samples under 3 characters or no known block.
+    '''
+    def has(*blocks):
+        # True when any of the named blocks is among scripts
+        for name in blocks:
+            if name in scripts:
+                return True
+        return False
+    if len(sample) < 3:
+        return UNKNOWN
+
+    if has("Hangul Syllables", "Hangul Jamo",
+           "Hangul Compatibility Jamo", "Hangul"):
+        return "ko"
+    if has("Greek and Coptic"):
+        return "el"
+    if has("Katakana", "Hiragana", "Katakana Phonetic Extensions"):
+        return "ja"
+    # Ceglowski and Rideout also test "Arabic Presentation Forms-A" here;
+    # that test is left out of this version.
+    if has("CJK Unified Ideographs", "Bopomofo",
+           "Bopomofo Extended", "KangXi Radicals"):
+        return "zh"
+
+    if has("Cyrillic"):
+        return check(sample, CYRILLIC)
+    if has("Arabic", "Arabic Presentation Forms-A",
+           "Arabic Presentation Forms-B"):
+        return check(sample, ARABIC)
+    if has("Devanagari"):
+        return check(sample, DEVANAGARI)
+
+    # Try languages with unique scripts
+    for blockName, langName in SINGLETONS:
+        if blockName in scripts:
+            return langName
+    if has("Latin Extended Additional"):
+        return "vi"
+    if has("Extended Latin"):
+        latinLang = check(sample, EXTENDED_LATIN)
+        if latinLang == "pt":
+            return check(sample, PT)
+        return latinLang
+    if has("Basic Latin"):
+        return check(sample, ALL_LATIN)
+    return UNKNOWN
+
+
+def check(sample, langs):
+    ''' Return the code from langs whose trigram model is closest to sample.
+
+        Codes without a loaded model are skipped. UNKNOWN is returned when
+        sample is shorter than MIN_LENGTH or none of langs has a model.
+    '''
+    if len(sample) < MIN_LENGTH:
+        return UNKNOWN
+
+    ordered = createOrderedModel(sample)  # QMap<int,QString>
+    ranked = [(distance(ordered, models[lang.lower()]), lang)
+              for lang in langs if lang.lower() in models]
+    if not ranked:
+        return UNKNOWN
+
+    # we want the lowest score, less distance = greater chance of match;
+    # tuples compare by distance first, then by language code
+    best = min(ranked)
+    return best[1]
+
+
+def createOrderedModel(content):
+    ''' Return the distinct trigrams of lower-cased content, most frequent
+        first; trigrams with equal counts are ordered by the trigram itself. '''
+    lowered = content.lower()
+    counts = defaultdict(int)  # QHash<QString,int>
+    for a, b, c in zip(lowered, lowered[1:], lowered[2:]):
+        counts[a + b + c] += 1
+
+    return sorted(counts, key=lambda gram: (-counts[gram], gram))
+
+
+spRe = re.compile(r"\s\s", re.UNICODE)  # two whitespace characters in a row
+MAXGRAMS = 300  # trigrams compared per sample, and the penalty for a miss
+
+def distance(model, knownModel):
+    ''' Score how far model (ordered trigrams) is from knownModel; lower is closer. '''
+    total = 0
+    for rank, gram in enumerate(model[:MAXGRAMS]):
+        if spRe.search(gram):
+            continue  # ignore trigrams with two whitespace chars in a row
+        if gram not in knownModel:
+            total += MAXGRAMS  # unseen trigram: maximum penalty
+        else:
+            total += abs(rank - knownModel[gram])
+    return total
+
+
+def _makeNonAlphaRe():
+    ''' Build a regex matching any single character that is not alphabetic.
+        It is a negated class listing every code point below sys.maxunicode
+        for which isalpha() is true.
+    '''
+    codepoints = range(sys.maxunicode)
+    alphas = [unichr(i) for i in codepoints if unichr(i).isalpha()]
+    return re.compile(u'[^' + u"".join(alphas) + u']')
+
+
+nonAlphaRe = _makeNonAlphaRe()
+spaceRe = re.compile(r'\s+', re.UNICODE)  # raw string: '\s' is not a valid str escape
+
+def normalize(u):
+    ''' Convert to normalized unicode.
+        Remove non-alpha chars and compress runs of spaces.
+        u must be a unicode string; it is NFC-normalized, every non-alphabetic
+        character becomes a space, and each whitespace run becomes one space.
+    '''
+    composed = unicodedata.normalize('NFC', u)
+    return spaceRe.sub(' ', nonAlphaRe.sub(' ', composed))
+class LangGuess(SilpaModule):  # SILPA web module wrapping guessLanguageName()
+	def process(self, form):
+		''' Return the page HTML: the input form plus, when 'input_text' was posted, the guessed language name. '''
+		response = """
+		<h2>Guess the language</h2></hr>
+		<p>Enter the text for guessing the language in the below text area.
+		 You can give the text in any language and even with mixed language
+		</p>
+		<form action="" method="post">
+		<textarea cols='100' rows='25' name='input_text' id='id1'>%s</textarea>
+		<input  type="submit" id="Guess Language" value="Guess Language"  name="action" style="width:12em;"/>
+		<input type="reset" value="Clear" style="width:12em;"/>
+		</br>
+		</form>
+		"""
+		if(form.has_key('input_text')):
+			import cgi  # escape() below stops posted text from injecting markup into the page
+			text = form['input_text'].value.decode('utf-8')
+			response = response % cgi.escape(text)
+			response = response + "<h2>Guess Language Results</h2></hr>" + guessLanguageName(text)
+		else:
+			response = response % ""
+		return response
+	def get_module_name(self):
+		return "Guess Language"
+	def get_info(self):
+		return 	"Guess the language of given text. This module can detect more than 50 languages. Based on Language::Guess by Maciej Ceglowski(http://languid.cantbedone.org/)"
+		
+def getInstance():
+	return LangGuess()	# new LangGuess per call; presumably the hook SILPA uses to load the module - confirm
+
+