summaryrefslogtreecommitdiffstats
path: root/silpa/modules/guesslanguages/guess_language.py
diff options
context:
space:
mode:
Diffstat (limited to 'silpa/modules/guesslanguages/guess_language.py')
-rw-r--r--silpa/modules/guesslanguages/guess_language.py562
1 files changed, 562 insertions, 0 deletions
diff --git a/silpa/modules/guesslanguages/guess_language.py b/silpa/modules/guesslanguages/guess_language.py
new file mode 100644
index 0000000..158b5f5
--- /dev/null
+++ b/silpa/modules/guesslanguages/guess_language.py
@@ -0,0 +1,562 @@
+''' Guess the language of text.
+
+ Based on guesslanguage.cpp by Jacob R Rideout for KDE
+ http://websvn.kde.org/branches/work/sonnet-refactoring/common/nlp/guesslanguage.cpp?view=markup
+ which itself is based on Language::Guess by Maciej Ceglowski
+ http://languid.cantbedone.org/
+
+ Copyright (c) 2008, Kent S Johnson
+
+ C++ version is Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>
+ Perl version is (c) 2004-6 Maciej Ceglowski
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ Note: Language::Guess is GPL-licensed. KDE developers received permission
+ from the author to distribute their port under LGPL:
+ http://lists.kde.org/?l=kde-sonnet&m=116910092228811&w=2
+
+'''
+
+import codecs, os, re, sys, unicodedata
try:
    from collections import defaultdict
except ImportError:
    # Only a failed import should trigger the fallback; any other error
    # must propagate instead of being silently masked.
    class defaultdict(dict):
        """Pure-Python stand-in for ``collections.defaultdict``.

        Behaves like a ``dict`` except that looking up a missing key calls
        ``default_factory()`` (when one was given), stores the result under
        that key and returns it.
        """

        def __init__(self, default_factory=None, *a, **kw):
            """Store *default_factory*; remaining args initialise the dict.

            Raises TypeError if *default_factory* is neither None nor callable.
            """
            if (default_factory is not None and
                    not hasattr(default_factory, '__call__')):
                raise TypeError('first argument must be callable')
            dict.__init__(self, *a, **kw)
            self.default_factory = default_factory

        def __getitem__(self, key):
            """Return self[key], creating a default entry when it is missing."""
            try:
                return dict.__getitem__(self, key)
            except KeyError:
                return self.__missing__(key)

        def __missing__(self, key):
            """Create, store and return the default value for *key*.

            Raises KeyError when no default factory was configured.
            """
            if self.default_factory is None:
                raise KeyError(key)
            self[key] = value = self.default_factory()
            return value

        def __reduce__(self):
            """Pickle support: rebuild from the factory plus the item list."""
            if self.default_factory is None:
                args = tuple()
            else:
                args = self.default_factory,
            return type(self), args, None, None, self.items()

        def copy(self):
            """Return a shallow copy that keeps the same default factory."""
            return self.__copy__()

        def __copy__(self):
            return type(self)(self.default_factory, self)

        def __deepcopy__(self, memo):
            import copy
            return type(self)(self.default_factory,
                              copy.deepcopy(self.items()))

        def __repr__(self):
            return 'defaultdict(%s, %s)' % (self.default_factory,
                                            dict.__repr__(self))
+
+from blocks import unicodeBlock
+from common import SilpaModule
+
# Samples shorter than this are not scored against the trigram models.
MIN_LENGTH = 20

# Candidate language codes, grouped by the script family they are written in.
BASIC_LATIN = [
    "en", "ceb", "ha", "so", "tlh", "id", "haw", "la", "sw",
    "eu", "nr", "nso", "zu", "xh", "ss", "st", "tn", "ts",
]
EXTENDED_LATIN = [
    "cs", "af", "pl", "hr", "ro", "sk", "sl", "tr", "hu", "az",
    "et", "sq", "ca", "es", "fr", "de", "nl", "it", "da", "is",
    "nb", "sv", "fi", "lv", "pt", "ve", "lt", "tl", "cy",
]
ALL_LATIN = BASIC_LATIN + EXTENDED_LATIN
CYRILLIC = ["ru", "uk", "kk", "uz", "mn", "sr", "mk", "bg", "ky"]
ARABIC = ["ar", "fa", "ps", "ur"]
DEVANAGARI = ["hi", "ne"]

# (Unicode block name, language code) pairs for scripts that map to exactly
# one language.
# NOTE: Mongolian appears twice overall: "mn-Mong" here for Mongolian script
# and "mn" in CYRILLIC.
SINGLETONS = [
    ('Armenian', 'hy'),
    ('Hebrew', 'he'),
    ('Bengali', 'bn'),
    ('Gurmukhi', 'pa'),
    ('Greek', 'el'),
    ('Gujarati', 'gu'),
    ('Oriya', 'or'),
    ('Tamil', 'ta'),
    ('Telugu', 'te'),
    ('Kannada', 'kn'),
    ('Malayalam', 'ml'),
    ('Sinhala', 'si'),
    ('Thai', 'th'),
    ('Lao', 'lo'),
    ('Tibetan', 'bo'),
    ('Burmese', 'my'),
    ('Georgian', 'ka'),
    ('Mongolian', 'mn-Mong'),
    ('Khmer', 'km'),
]

# Portuguese variants used to refine a plain "pt" match.
PT = ["pt_BR", "pt_PT"]

# Sentinel returned whenever no language can be determined.
UNKNOWN = 'UNKNOWN'

# Trigram models keyed by lower-cased model file name; filled by _load_models().
models = dict()
+
# Human-readable language name for each language code.
# Each key must appear only once: a repeated key in a dict literal silently
# replaces the earlier value.
NAME_MAP = {
    "ab": "Abkhazian",
    "af": "Afrikaans",
    "ar": "Arabic",
    "az": "Azeri",
    "be": "Byelorussian",
    "bg": "Bulgarian",
    "bn": "Bengali",
    "bo": "Tibetan",
    "br": "Breton",
    "ca": "Catalan",
    "ceb": "Cebuano",
    "cs": "Czech",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "el": "Greek",
    "en": "English",
    "eo": "Esperanto",
    "es": "Spanish",
    "et": "Estonian",
    "eu": "Basque",
    "fa": "Farsi",
    "fi": "Finnish",
    "fo": "Faroese",
    "fr": "French",
    "fy": "Frisian",
    "gd": "Scots Gaelic",
    "gl": "Galician",
    "gu": "Gujarati",
    "ha": "Hausa",
    "haw": "Hawaiian",
    "he": "Hebrew",
    "hi": "Hindi",
    "hr": "Croatian",
    "hu": "Hungarian",
    "hy": "Armenian",
    "id": "Indonesian",
    "is": "Icelandic",
    "it": "Italian",
    "ja": "Japanese",
    "ka": "Georgian",
    "kk": "Kazakh",
    "km": "Cambodian",
    "ko": "Korean",
    "ku": "Kurdish",
    "ky": "Kyrgyz",
    "la": "Latin",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mg": "Malagasy",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mn": "Mongolian",
    "mr": "Marathi",
    "ms": "Malay",
    "nd": "Ndebele",
    "ne": "Nepali",
    "nl": "Dutch",
    "nn": "Nynorsk",
    "no": "Norwegian",
    "nso": "Sepedi",
    "pa": "Punjabi",
    "pl": "Polish",
    "ps": "Pashto",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "sa": "Sanskrit",
    "sh": "Serbo-Croatian",
    "sk": "Slovak",
    "sl": "Slovene",
    "so": "Somali",
    "sq": "Albanian",
    "sr": "Serbian",
    "sv": "Swedish",
    "sw": "Swahili",
    "ta": "Tamil",
    "te": "Telugu",
    "th": "Thai",
    "tl": "Tagalog",
    "tlh": "Klingon",
    "tn": "Setswana",
    "tr": "Turkish",
    "ts": "Tsonga",
    "tw": "Twi",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "uz": "Uzbek",
    "ve": "Venda",
    "vi": "Vietnamese",
    "xh": "Xhosa",
    "zh": "Chinese",
    "zh-tw": "Traditional Chinese (Taiwan)",
    "zu": "Zulu",
}
+
# Numeric language id for each language code, as returned by
# guessLanguageId() and guessLanguageInfo().
IANA_MAP = {
    "ab": 12026,
    "af": 40,
    "ar": 26020,
    "az": 26030,
    "be": 11890,
    "bg": 26050,
    "bn": 26040,
    "bo": 26601,
    "br": 1361,
    "ca": 3,
    "ceb": 26060,
    "cs": 26080,
    "cy": 26560,
    "da": 26090,
    "de": 26160,
    "el": 26165,
    "en": 26110,
    "eo": 11933,
    "es": 26460,
    "et": 26120,
    "eu": 1232,
    "fa": 26130,
    "fi": 26140,
    "fo": 11817,
    "fr": 26150,
    "fy": 1353,
    "gd": 65555,
    "gl": 1252,
    "gu": 26599,
    "ha": 26170,
    "haw": 26180,
    "he": 26592,
    "hi": 26190,
    "hr": 26070,
    "hu": 26200,
    "hy": 26597,
    "id": 26220,
    "is": 26210,
    "it": 26230,
    "ja": 26235,
    "ka": 26600,
    "kk": 26240,
    "km": 1222,
    "ko": 26255,
    "ku": 11815,
    "ky": 26260,
    "la": 26280,
    "lt": 26300,
    "lv": 26290,
    "mg": 1362,
    "mk": 26310,
    "ml": 26598,
    "mn": 26320,
    "mr": 1201,
    "ms": 1147,
    "ne": 26330,
    "nl": 26100,
    "nn": 172,
    "no": 26340,
    "pa": 65550,
    "pl": 26380,
    "ps": 26350,
    "pt": 26390,
    "ro": 26400,
    "ru": 26410,
    "sa": 1500,
    "sh": 1399,
    "sk": 26430,
    "sl": 26440,
    "so": 26450,
    "sq": 26010,
    "sr": 26420,
    "sv": 26480,
    "sw": 26470,
    "ta": 26595,
    "te": 26596,
    "th": 26594,
    "tl": 26490,
    "tlh": 26250,
    "tn": 65578,
    "tr": 26500,
    "tw": 1499,
    # NOTE(review): "uk" used to be listed twice (26510, then 26520). A dict
    # literal keeps the last value, so 26520 was the effective id and is
    # retained here -- confirm it is the intended one.
    "uk": 26520,
    "ur": 26530,
    "uz": 26540,
    "vi": 26550,
    "zh": 26065,
    "zh-tw": 22,
}
+
+
def _load_models():
    """Populate the module-level ``models`` dict from the ``trigrams`` directory.

    Every regular file in the ``trigrams`` directory next to this module is
    read as UTF-8.  Each line matching "<3 characters><whitespace><integer>"
    contributes one ``trigram -> integer`` entry; other lines are ignored.
    The resulting dict is stored in ``models`` under the lower-cased file name.

    Raises whatever ``os.listdir`` / ``codecs.open`` raise when the directory
    or a file cannot be read, and ValueError if the value column of a matching
    line is not an integer.
    """
    modelsDir = os.path.join(os.path.dirname(__file__), 'trigrams')
    lineRe = re.compile(r"(.{3})\s+(.*)")

    for modelFile in os.listdir(modelsDir):
        modelPath = os.path.join(modelsDir, modelFile)
        if os.path.isdir(modelPath):
            continue

        model = {}
        f = codecs.open(modelPath, 'r', 'utf-8')
        try:
            for line in f:
                m = lineRe.search(line)
                if m:
                    model[m.group(1)] = int(m.group(2))
        finally:
            # Always release the handle, even if a line fails to parse.
            f.close()

        models[modelFile.lower()] = model
+
+
+_load_models()
+
def guessLanguage(text):
    """Return the language code guessed for *text*, e.g. 'en'.

    Empty input yields UNKNOWN.  Byte strings are decoded as UTF-8 before
    the text is normalised and classified.
    """
    if not text:
        return UNKNOWN

    if isinstance(text, str):
        # Byte string input: the rest of the pipeline works on unicode.
        text = unicode(text, 'utf-8')

    cleaned = normalize(text)
    scripts = find_runs(cleaned)
    return _identify(cleaned, scripts)
+
+
def guessLanguageInfo(text):
    """Return a ``(tag, id, name)`` triple, e.g. ('en', 26110, 'English').

    When the language cannot be guessed all three elements are UNKNOWN.
    An element is also UNKNOWN when the tag has no entry in the matching
    lookup table.
    """
    tag = guessLanguage(text)

    if tag != UNKNOWN:
        lang_id = _getId(tag)
        lang_name = _getName(tag)
        return tag, lang_id, lang_name

    return UNKNOWN, UNKNOWN, UNKNOWN
+
+
+# An alias for guessLanguage
+guessLanguageTag = guessLanguage
+
+
def guessLanguageId(text):
    """Return the numeric id of the language guessed for *text*, e.g. 26110.

    Returns UNKNOWN when the guessed tag has no id in IANA_MAP.
    """
    return _getId(guessLanguage(text))
+
+
def guessLanguageName(text):
    """Return the name of the language guessed for *text*, e.g. 'English'.

    Returns UNKNOWN when the guessed tag has no name in NAME_MAP.
    """
    return _getName(guessLanguage(text))
+
+
def _getId(iana):
    """Look up the numeric id for language code *iana*; UNKNOWN if absent."""
    try:
        return IANA_MAP[iana]
    except KeyError:
        return UNKNOWN
+
def _getName(iana):
    """Look up the display name for language code *iana*; UNKNOWN if absent."""
    try:
        return NAME_MAP[iana]
    except KeyError:
        return UNKNOWN
+
+
def find_runs(text):
    """Return the names of the Unicode blocks that dominate *text*.

    Only alphabetic characters are counted.  A block is reported when it
    covers at least 40% of them; "Basic Latin" is already reported at 15%
    and "Latin Extended Additional" (for Vietnamese) at 10%.
    """
    block_counts = defaultdict(int)
    alpha_total = 0

    for ch in text:
        if not ch.isalpha():
            continue
        block_counts[unicodeBlock(ch)] += 1
        alpha_total += 1

    def is_relevant(name, count):
        # Share of the alphabetic characters that fall in this block.
        pct = (count * 100) / alpha_total
        if pct >= 40:
            return True
        if name == "Basic Latin":
            return pct >= 15
        if name == "Latin Extended Additional":
            return pct >= 10
        return False

    return [name for name, count in block_counts.items()
            if is_relevant(name, count)]
+
+
def _identify(sample, scripts):
    """Pick a language code for *sample* given its dominant Unicode blocks.

    *scripts* is the collection of block names found in the sample.  Scripts
    that imply a single language are answered directly; scripts shared by
    several languages are passed to check() with the matching candidate
    list.  Returns UNKNOWN for samples shorter than three characters or when
    no rule applies.
    """
    if len(sample) < 3:
        return UNKNOWN

    def seen(block_names):
        # True when at least one of the given block names was detected.
        for block_name in block_names:
            if block_name in scripts:
                return True
        return False

    # Scripts that map straight to one language, in order of precedence.
    # Ceglowski's and Rideout's versions also treated
    # "Arabic Presentation Forms-A" as Chinese; that is not done here.
    direct_rules = (
        (("Hangul Syllables", "Hangul Jamo",
          "Hangul Compatibility Jamo", "Hangul"), "ko"),
        (("Greek and Coptic",), "el"),
        (("Katakana", "Hiragana", "Katakana Phonetic Extensions"), "ja"),
        (("CJK Unified Ideographs", "Bopomofo",
          "Bopomofo Extended", "KangXi Radicals"), "zh"),
    )
    for block_names, lang in direct_rules:
        if seen(block_names):
            return lang

    # Scripts shared by several languages: decide with the trigram models.
    family_rules = (
        (("Cyrillic",), CYRILLIC),
        (("Arabic", "Arabic Presentation Forms-A",
          "Arabic Presentation Forms-B"), ARABIC),
        (("Devanagari",), DEVANAGARI),
    )
    for block_names, candidates in family_rules:
        if seen(block_names):
            return check(sample, candidates)

    # Try languages with unique scripts
    for blockName, langName in SINGLETONS:
        if blockName in scripts:
            return langName

    if "Latin Extended Additional" in scripts:
        return "vi"

    if "Extended Latin" in scripts:
        latinLang = check(sample, EXTENDED_LATIN)
        if latinLang != "pt":
            return latinLang
        # Refine plain Portuguese into one of its variants.
        return check(sample, PT)

    if "Basic Latin" in scripts:
        return check(sample, ALL_LATIN)

    return UNKNOWN
+
+
def check(sample, langs):
    """Return the entry of *langs* whose trigram model is closest to *sample*.

    Candidates without a loaded model are skipped.  Returns UNKNOWN when the
    sample is shorter than MIN_LENGTH or when no candidate has a model.
    """
    if len(sample) < MIN_LENGTH:
        return UNKNOWN

    sample_model = createOrderedModel(sample)

    scores = [
        (distance(sample_model, models[lang.lower()]), lang)
        for lang in langs
        if lang.lower() in models
    ]
    if not scores:
        return UNKNOWN

    # Smallest distance wins: less distance = greater chance of a match.
    best_distance, best_lang = min(scores)
    return best_lang
+
+
+def createOrderedModel(content):
+ ''' Create a list of trigrams in content sorted by frequency '''
+ trigrams = defaultdict(int) # QHash<QString,int>
+ content = content.lower()
+
+ for i in xrange(0, len(content)-2):
+ trigrams[content[i:i+3]]+=1
+
+ return sorted(trigrams.keys(), key=lambda k: (-trigrams[k], k))
+
+
# Matches trigrams containing two consecutive whitespace characters.
spRe = re.compile(r"\s\s", re.UNICODE)
# Number of top-ranked sample trigrams compared, and the penalty added for
# a trigram that the known model does not contain.
MAXGRAMS = 300


def distance(model, knownModel):
    """Return the rank distance between a sample model and a known model.

    *model* is a list of trigrams ordered by rank; *knownModel* maps trigrams
    to their rank.  For each of the first MAXGRAMS sample trigrams the
    absolute rank difference is added, or MAXGRAMS when the trigram is not in
    *knownModel*.  Trigrams matching ``spRe`` are skipped.
    """
    total = 0

    for rank, trigram in enumerate(model[:MAXGRAMS]):
        if spRe.search(trigram):
            continue
        if trigram not in knownModel:
            total += MAXGRAMS
        else:
            total += abs(rank - knownModel[trigram])

    return total
+
+
+def _makeNonAlphaRe():
+ nonAlpha = [u'[^']
+ for i in range(sys.maxunicode):
+ c = unichr(i)
+ if c.isalpha(): nonAlpha.append(c)
+ nonAlpha.append(u']')
+ nonAlpha = u"".join(nonAlpha)
+ return re.compile(nonAlpha)
+
+
# Matches one non-alphabetic character (built once at import time).
nonAlphaRe = _makeNonAlphaRe()
# Matches a run of one or more whitespace characters.
spaceRe = re.compile(r'\s+', re.UNICODE)


def normalize(u):
    """Return *u* as NFC-normalised unicode containing only letters and spaces.

    Every non-alphabetic character is replaced by a space, after which each
    run of whitespace is collapsed into a single space.
    """
    composed = unicodedata.normalize('NFC', u)
    letters_only = nonAlphaRe.sub(' ', composed)
    return spaceRe.sub(' ', letters_only)
class LangGuess(SilpaModule):
    """Silpa module that guesses the language of text submitted via a form."""

    def process(self, form):
        """Render the language-guessing page for *form*.

        *form* must support ``in`` and ``form['input_text'].value``; the value
        is decoded as UTF-8.  When ``input_text`` is present the page shows the
        submitted text again, followed by the guessed language name (or
        UNKNOWN).  Otherwise the empty form is returned.
        """
        # cgi.escape is the Python 2 spelling; html.escape where available.
        try:
            from html import escape
        except ImportError:
            from cgi import escape

        response = """
        <h2>Guess the language</h2></hr>
        <p>Enter the text for guessing the language in the below text area.
        You can give the text in any language and even with mixed language
        </p>
        <form action="" method="post">
        <textarea cols='100' rows='25' name='input_text' id='id1'>%s</textarea>
        <input type="submit" id="Guess Language" value="Guess Language" name="action" style="width:12em;"/>
        <input type="reset" value="Clear" style="width:12em;"/>
        </br>
        </form>
        """
        if 'input_text' in form:
            text = form['input_text'].value.decode('utf-8')
            # The text comes straight from the request: escape it so that it
            # cannot close the <textarea> and inject markup into the page.
            response = response % escape(text)
            result = guessLanguageName(text)
            response = response + "<h2>Guess Language Results</h2></hr>"
            response = response + result
        else:
            response = response % ""
        return response

    def get_module_name(self):
        """Return the display name of this module."""
        return "Guess Language"

    def get_info(self):
        """Return a short description of this module."""
        return "Guess the language of given text. This module can detect more than 50 languages. Based on Language::Guess by Maciej Ceglowski(http://languid.cantbedone.org/)"
+
def getInstance():
    """Create and return a new LangGuess module object."""
    module = LangGuess()
    return module
+
+