1 files changed, 377 insertions, 0 deletions
diff --git a/bots/dictdlib.py b/bots/dictdlib.py
new file mode 100644
index 0000000..e1d9036
--- /dev/null
+++ b/bots/dictdlib.py
@@ -0,0 +1,377 @@
+# Dictionary creation library
+# Copyright (C) 2002 John Goerzen
+# <jgoerzen@complete.org>
+#
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+import sys, string, gzip, os
+
+b64_list = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"  # digit alphabet: value i is b64_list[i]
+url_headword = "00-database-url"        # headword written by DictDB.seturl()
+short_headword = "00-database-short"    # headword written by DictDB.setshortname()
+info_headword = "00-database-info"      # headword written by DictDB.setlonginfo()
+
+
+def b64_encode(val):
+    """Return the non-negative integer val encoded as a string in the
+    base64 positional notation used by dict index files.
+
+    The most significant digit comes first and zero encodes as "A".  The
+    number of digits is not capped, so values of any magnitude survive a
+    round trip through b64_decode().  Raises ValueError if val is
+    negative, because the notation has no way to express a sign."""
+    if val < 0:
+        raise ValueError("cannot encode negative value %r" % (val,))
+    digits = []
+    while val:
+        digits.append(b64_list[val & 63])   # peel off the low 6 bits
+        val >>= 6
+    digits.reverse()                        # collected low digit first
+    return "".join(digits) or b64_list[0]
+    
+def b64_decode(str):
+    """Decode a string written in the base64 positional notation of dict
+    index files and return the integer it represents.
+
+    Digits are read most significant first; the empty string decodes to
+    0.  A character outside b64_list makes b64_list.index() raise ValueError."""
+    # NOTE: the parameter shadows the str builtin; the name is kept so
+    # that existing keyword callers continue to work.
+    total = 0
+    for digit in str:
+        total = (total << 6) | b64_list.index(digit)
+    return total
+
+# Lookup table (used as a set) of the characters sortnormalize() keeps:
+# ASCII letters, digits, space and tab.  Every value is 1.
+validdict = dict.fromkeys(string.ascii_letters + string.digits + " \t", 1)
+
+def sortnormalize(x):
+    """Return a key for x that sorts like "sort -df" under plain string
+    comparison.
+
+    The key is the upper-cased characters of x that appear in validdict,
+    then a NUL separator, then all of x upper-cased as a tie-breaker."""
+    kept = [ch for ch in x if ch in validdict]  # "in", not has_key()
+    return "".join(kept).upper() + "\0" + x.upper()
+
+def sortfunc(x, y):
+    """Emulate sort -df: compare two sortnormalize() keys, returning
+    -1, 0 or 1.
+
+    The text before the NUL separator is compared first; the text after
+    it breaks ties (and counts as empty when there is no separator)."""
+    xl, yl = (x.split("\0") + [""])[:2], (y.split("\0") + [""])[:2]
+    return (xl > yl) - (xl < yl)
+
+class DictDB:
+    """A dictd database: 'basename.index' plus 'basename.dict' (or the
+    gzip-compressed 'basename.dict.dz').  The whole index is kept in
+    self.indexentries, mapping each headword to a list of [start, size]
+    pairs that locate its definitions inside the dict file."""
+
+    def __init__(self, basename, mode = 'read', quiet = 0):
+        """Initialize a DictDB object.
+
+        Mode must be one of:
+
+        read -- read-only access; works with dict or dict.dz files.
+
+        write -- write-only access, truncates existing files, does not work
+        with .dz.  dict created if nonexistent.
+
+        update -- read/write access, dict created if nonexistent.  A .dz
+        dict is opened read-only, so entries cannot be added to it.
+
+        If quiet is nonzero, status messages will be suppressed.
+
+        Raises ValueError for an unknown mode, or for 'write' mode when a
+        .dict.dz file exists.  Both are checked before any file is opened,
+        so a rejected call never truncates an existing index."""
+        if mode not in ('read', 'write', 'update'):
+            raise ValueError("mode must be 'read', 'write', or 'update'")
+
+        self.mode = mode
+        self.quiet = quiet
+        self.indexentries = {}
+        self.count = 0
+        self.basename = basename
+
+        self.indexfilename = self.basename + ".index"
+        if os.path.isfile(self.basename + ".dict.dz"):
+            self.dictfilename = self.basename + ".dict.dz"
+            self.usecompression = 1
+        else:
+            self.dictfilename = self.basename + ".dict"
+            self.usecompression = 0
+
+        if mode == 'write' and self.usecompression:
+            raise ValueError("'write' mode incompatible with .dz files")
+
+        if mode == 'read':
+            self.indexfile = open(self.indexfilename, "rt")
+        elif mode == 'write':
+            self.indexfile = open(self.indexfilename, "wt")
+        else:
+            self.indexfile = self._openupdate(self.indexfilename)
+
+        if self.usecompression:
+            # Always read-only: modifying .dz files is not supported.
+            self.dictfile = gzip.GzipFile(self.dictfilename, "rb")
+        elif mode == 'read':
+            self.dictfile = open(self.dictfilename, "rb")
+        elif mode == 'write':
+            self.dictfile = open(self.dictfilename, "wb")
+        else:
+            self.dictfile = self._openupdate(self.dictfilename)
+
+        if mode != 'write':
+            self._initindex()
+
+    def _openupdate(self, filename):
+        """Open filename for binary read/write, creating it if missing."""
+        try:
+            return open(filename, "r+b")
+        except IOError:
+            return open(filename, "w+b")
+
+    def _initindex(self):
+        """Load the entire index off disk into memory."""
+        self.indexfile.seek(0)
+        for line in self.indexfile:     # xreadlines() is gone in Python 3
+            splits = line.rstrip().split("\t")
+            if splits == [""]:
+                continue                # tolerate blank lines
+            self.addindexentry(splits[0], b64_decode(splits[1]),
+                               b64_decode(splits[2]))
+
+    def addindexentry(self, word, start, size):
+        """Adds an entry to the index.  word is the relevant word.
+        start is the starting position in the dictionary and size is the
+        size of the definition; both are integers.  Only the in-memory
+        index changes here; finish() writes it to the index file."""
+        # setdefault() creates the per-word list on first use.
+        self.indexentries.setdefault(word, []).append([start, size])
+
+    def delindexentry(self, word, start = None, size = None):
+        """Removes an entry from the index; word is the word to search for.
+
+        start and size are optional.  If they are specified, only index
+        entries matching the specified values will be removed.
+
+        For instance, if word is "foo" and start and size are not specified,
+        all index entries for the word foo will be removed.  If start and size
+        are specified, only those entries matching all criteria will be
+        removed.
+
+        This function does not actually remove the data from the .dict file.
+        Therefore, information removed by this function will still
+        exist on-disk in the .dict file, but the dict server will just
+        not "see" it -- there will be no way to get to it anymore.
+
+        Returns a count of the deleted entries."""
+
+        if word not in self.indexentries:
+            return 0
+        entrylist = self.indexentries[word]
+        # An omitted (None) start or size matches every entry.  Entries
+        # matching both criteria are dropped; the rest are kept.
+        kept = [entry for entry in entrylist
+                if not ((start is None or start == entry[0]) and
+                        (size is None or size == entry[1]))]
+        retval = len(entrylist) - len(kept)
+        entrylist[:] = kept             # mutate in place, same list object
+        if not entrylist:               # If we emptied it, del it completely
+            del self.indexentries[word]
+        return retval
+
+    def update(self, string):
+        """Writes string to stdout and flushes it, unless quiet is set."""
+        if not self.quiet:
+            sys.stdout.write(string)
+            sys.stdout.flush()
+
+    def seturl(self, url):
+        """Sets the URL attribute of this database.  If there was
+        already a URL specified, we will use delindexentry() on it
+        first."""
+        self.delindexentry(url_headword)
+        self.addentry(url_headword + "\n     " + url, [url_headword])
+
+    def setshortname(self, shortname):
+        """Sets the shortname for this database.  If there was already
+        a shortname specified, we will use delindexentry() on it first."""
+        self.delindexentry(short_headword)
+        self.addentry(short_headword + "\n     " + shortname,
+                      [short_headword])
+
+    def setlonginfo(self, longinfo):
+        """Sets the extended information for this database.  If there was
+        already long info specified, we will use delindexentry() on it
+        first."""
+        self.delindexentry(info_headword)
+        self.addentry(info_headword + "\n" + longinfo, [info_headword])
+
+    def addentry(self, defstr, headwords):
+        """Writes an entry.  defstr holds the content of the definition.
+        headwords is a list specifying one or more words under which this
+        definition should be indexed.  This function always adds \\n
+        to the end of defstr.  The text is written to the dict file here;
+        the index entries are only written out by finish()."""
+        self.dictfile.seek(0, 2)        # Seek to end of file
+        start = self.dictfile.tell()
+        defstr += "\n"
+        self.dictfile.write(defstr)
+        for word in headwords:
+            self.addindexentry(word, start, len(defstr))
+            self.count += 1
+
+        if self.count % 1000 == 0:
+            self.update("Processed %d records\r" % self.count)
+
+    def finish(self, dosort = 1):
+        """Called to finish the writing process.
+        **REQUIRED IF OPENED WITH 'update' OR 'write' MODES**.
+        This will write the index and close the files.  In 'read' mode
+        nothing is written; the files are only closed.
+
+        dosort is optional and defaults to true.  If set to false,
+        dictlib will not sort the index file.  In this case, you
+        MUST manually sort it through "sort -df" before it can be used."""
+
+        if self.mode == 'read':
+            # The index file is not writable; just release the handles.
+            self.indexfile.close()
+            self.dictfile.close()
+            return
+
+        self.update("Processed %d records.\n" % self.count)
+
+        if dosort:
+            self.update("Sorting index: converting")
+
+        # Built even when dosort is false: the writing loop below needs
+        # the index lines in either case.
+        indexlist = []
+        for word, defs in self.indexentries.items():
+            for thisdef in defs:
+                indexlist.append("%s\t%s\t%s" % (word,
+                                                 b64_encode(thisdef[0]),
+                                                 b64_encode(thisdef[1])))
+
+        if dosort:
+            # Emulate "sort -df": order by the normalized form of each
+            # line and break ties with the raw line.  Comparing
+            # (key, line) tuples also handles duplicate lines, which have
+            # no separate tie-break field to compare.
+            self.update(" mapping")
+            keyed = [(sortnormalize(entry), entry) for entry in indexlist]
+
+            # The " listing" label is kept so the status output is stable.
+            self.update(" listing")
+            self.update(" sorting")
+            keyed.sort()
+
+            self.update(" re-mapping")
+            indexlist = [entry for (norm, entry) in keyed]
+
+            self.update(", done.\n")
+
+        self.update("Writing index...\n")
+
+        try:
+            self.indexfile.seek(0)
+            for entry in indexlist:
+                self.indexfile.write(entry + "\n")
+            if self.mode == 'update':
+                # In case things were deleted
+                self.indexfile.truncate()
+        finally:
+            # Release both files even if writing the index failed.
+            self.indexfile.close()
+            self.dictfile.close()
+
+        self.update("Complete.\n")
+
+    def getdeflist(self):
+        """Returns a list of strings naming all definitions contained
+        in this dictionary."""
+        return list(self.indexentries.keys())
+
+    def hasdef(self, word):
+        """Returns true if word has at least one entry in the index.
+        This is an *exact* match, not a case-insensitive one."""
+        return word in self.indexentries
+
+    def getdef(self, word):
+        """Given a definition name, returns a list of strings with all
+        matching definitions.  This is an *exact* match, not a
+        case-insensitive one.  Returns [] if word is not in the dictionary."""
+        retval = []
+        for start, length in self.indexentries.get(word, []):
+            self.dictfile.seek(start)
+            retval.append(self.dictfile.read(length))
+        return retval
+            
+
+class DictReader:
+    """Deprecated read-only facade kept for code written against earlier
+    versions of dictdlib; it simply forwards to a DictDB."""
+
+    def __init__(self, basename):
+        """Open the dictionary stored under basename for reading."""
+        reader = DictDB(basename, mode = 'read')
+        self.dictdb = reader
+
+    def getdeflist(self):
+        """Return the headwords of every definition in the dictionary."""
+        headwords = self.dictdb.getdeflist()
+        return headwords
+
+    def getdef(self, defname):
+        """Return the list of definition strings indexed under defname."""
+        return self.dictdb.getdef(defname)
+
+class DictWriter:
+    """Deprecated write-only facade kept for code written against earlier
+    versions of dictdlib; every call is forwarded to a DictDB."""
+
+    def __init__(self, basename, url = 'unknown', shortname = 'unknown',
+                 longinfo = 'unknown', quiet = 0):
+        """Create a writer for 'basename.dict' and 'basename.index'.
+
+        url, shortname and longinfo are stored as the database's
+        respective attributes.  A nonzero quiet suppresses status output."""
+        db = self.dictdb = DictDB(basename, 'write', quiet)
+        # Record the metadata entries: URL, then short name, then info.
+        for setter, value in ((db.seturl, url), (db.setshortname, shortname),
+                              (db.setlonginfo, longinfo)):
+            setter(value)
+
+    def writeentry(self, defstr, headwords):
+        """Store one definition.
+
+        defstr is the definition text and headwords is the list of words
+        it is indexed under.  A trailing \\n is always appended to defstr."""
+        self.dictdb.addentry(defstr, headwords)
+
+    def finish(self, dosort = 1):
+        """Write the index and close the files.  **REQUIRED**.
+
+        dosort defaults to true.  When it is false the index file is not
+        sorted by dictlib, and you MUST sort it yourself with "sort -df"
+        before it can be used."""
+        self.dictdb.finish(dosort)
+