Diffstat (limited to 'presto-utils/dumpMetadata.py')
-rw-r--r--  presto-utils/dumpMetadata.py  497
1 files changed, 497 insertions, 0 deletions
diff --git a/presto-utils/dumpMetadata.py b/presto-utils/dumpMetadata.py
new file mode 100644
index 0000000..0ec2c20
--- /dev/null
+++ b/presto-utils/dumpMetadata.py
@@ -0,0 +1,497 @@
+#!/usr/bin/python -t
+# base classes and functions for dumping out package Metadata
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# Copyright 2004 Duke University
+
+# $Id: dumpMetadata.py,v 1.36 2006/02/21 20:10:08 pnasrat Exp $
+
+import os
+import rpm
+import exceptions
+import md5
+import sha
+import types
+import struct
+import re
+import stat
+
+# subclass GzipFile so the header carries a fixed mtime; stock gzip stamps the
+# current time into the header, which changes the checksum of identical data
+import gzip
+from zlib import error as zlibError
+from gzip import write32u, FNAME
+
+__all__ = ["GzipFile","open"]
+
+class GzipFile(gzip.GzipFile):
+    def _write_gzip_header(self):
+        self.fileobj.write('\037\213')   # magic header
+        self.fileobj.write('\010')       # compression method (deflate)
+        fname = self.filename[:-3]       # drop the '.gz' suffix for the stored name
+        flags = 0
+        if fname:
+            flags = FNAME
+        self.fileobj.write(chr(flags))
+        write32u(self.fileobj, long(0))  # mtime of 0 instead of the current time
+        self.fileobj.write('\002')       # XFL: maximum compression
+        self.fileobj.write('\377')       # OS: unknown
+        if fname:
+            self.fileobj.write(fname + '\000')
+
+
+def _gzipOpen(filename, mode="rb", compresslevel=9):
+ return GzipFile(filename, mode, compresslevel)
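+
+# Illustrative sketch, not part of the original module: with the header
+# override above, compressing the same payload twice yields byte-identical
+# files, since the header no longer embeds the current time.  The path is a
+# hypothetical example.
+def _exampleReproducibleGzip(path='/tmp/reproducible.gz'):
+    digests = []
+    for dummy in (1, 2):
+        zfo = _gzipOpen(path, 'wb')
+        zfo.write('same payload every time\n')
+        zfo.close()
+        digests.append(getChecksum('md5', path))
+    assert digests[0] == digests[1]   # both passes produce the same digest
+    return digests[0]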
+
+
+def returnFD(filename):
+ try:
+ fdno = os.open(filename, os.O_RDONLY)
+ except OSError:
+ raise MDError, "Error opening file"
+ return fdno
+
+def returnHdr(ts, package):
+ """hand back the rpm header or raise an Error if the pkg is fubar"""
+ opened_here = 0
+ try:
+ if type(package) is types.StringType:
+ opened_here = 1
+ fdno = os.open(package, os.O_RDONLY)
+ else:
+ fdno = package # let's assume this is an fdno and go with it :)
+ except OSError:
+ raise MDError, "Error opening file"
+ ts.setVSFlags((rpm._RPMVSF_NOSIGNATURES|rpm.RPMVSF_NOMD5|rpm.RPMVSF_NEEDPAYLOAD))
+ try:
+ hdr = ts.hdrFromFdno(fdno)
+ except rpm.error:
+ raise MDError, "Error opening package"
+ if type(hdr) != rpm.hdr:
+ raise MDError, "Error opening package"
+ ts.setVSFlags(0)
+
+ if opened_here:
+ os.close(fdno)
+ del fdno
+
+ return hdr
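+
+# Illustrative usage sketch (hypothetical path): callers hand in a shared
+# transaction set, so the VSFlags juggling above stays in one place.
+def _exampleReadHeader(path='/tmp/some.rpm'):
+    ts = rpm.TransactionSet()
+    hdr = returnHdr(ts, path)
+    return (hdr['name'], hdr['version'], hdr['release'])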
+
+def getChecksum(sumtype, file, CHUNK=2**16):
+ """takes filename, hand back Checksum of it
+ sumtype = md5 or sha
+ filename = /path/to/file
+ CHUNK=65536 by default"""
+
+ # chunking brazenly lifted from Ryan Tomayko
+ opened_here = 0
+ try:
+ if type(file) is not types.StringType:
+ fo = file # assume it's a file-like-object
+ else:
+ opened_here = 1
+ fo = open(file, 'rb', CHUNK)
+
+ if sumtype == 'md5':
+ sum = md5.new()
+ elif sumtype == 'sha':
+ sum = sha.new()
+ else:
+ raise MDError, 'Error Checksumming file, wrong checksum type %s' % sumtype
+        chunk = fo.read(CHUNK)
+        while chunk:
+            sum.update(chunk)
+            chunk = fo.read(CHUNK)
+
+ if opened_here:
+ fo.close()
+ del fo
+
+ return sum.hexdigest()
+    except (IOError, OSError):
+        raise MDError, 'Error opening file for checksum: %s' % file
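+
+# Illustrative sketch (hypothetical path): getChecksum accepts a filename or
+# an open file object, and both routes should agree on the digest.
+def _exampleChecksumBothWays(path='/tmp/some.rpm'):
+    by_name = getChecksum('sha', path)
+    fo = open(path, 'rb')
+    by_fobj = getChecksum('sha', fo)
+    fo.close()
+    assert by_name == by_fobj
+    return by_name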
+
+
+def utf8String(string):
+ """hands back a unicoded string"""
+ if string is None:
+ return ''
+ elif isinstance(string, unicode):
+ return string
+ try:
+        unicode(string, 'ascii')   # plain ascii decodes cleanly,
+        return string              # so hand the original back untouched
+ except UnicodeError:
+ encodings = ['utf-8', 'iso-8859-1', 'iso-8859-15', 'iso-8859-2']
+ for enc in encodings:
+ try:
+ x = unicode(string, enc)
+ except UnicodeError:
+ pass
+ else:
+ if x.encode(enc) == string:
+ return x.encode('utf-8')
+ newstring = ''
+ for char in string:
+ if ord(char) > 127:
+ newstring = newstring + '?'
+ else:
+ newstring = newstring + char
+ return newstring
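+
+# Illustrative sketch of the fallback order above: ascii passes through
+# untouched, and non-ascii input is re-encoded as utf-8 via the first legacy
+# encoding that round-trips.
+def _exampleUtf8String():
+    assert utf8String('plain ascii') == 'plain ascii'
+    assert utf8String(None) == ''
+    assert utf8String('caf\xe9') == 'caf\xc3\xa9'   # iso-8859-1 -> utf-8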
+
+
+def byteranges(file):
+ """takes an rpm file or fileobject and returns byteranges for location of the header"""
+ opened_here = 0
+ if type(file) is not types.StringType:
+ fo = file
+ else:
+ opened_here = 1
+        fo = open(file, 'rb')
+    # seek past the 96-byte lead and the 8 bytes of signature header magic and
+    # reserved data
+ fo.seek(104)
+ # 104 bytes in
+ binindex = fo.read(4)
+ # 108 bytes in
+ (sigindex, ) = struct.unpack('>I', binindex)
+ bindata = fo.read(4)
+ # 112 bytes in
+ (sigdata, ) = struct.unpack('>I', bindata)
+ # each index is 4 32bit segments - so each is 16 bytes
+ sigindexsize = sigindex * 16
+ sigsize = sigdata + sigindexsize
+ # we have to round off to the next 8 byte boundary
+ disttoboundary = (sigsize % 8)
+ if disttoboundary != 0:
+ disttoboundary = 8 - disttoboundary
+    # 112 bytes = 96 (lead) + 8 (magic and reserved) + 8 (sig header counts)
+ hdrstart = 112 + sigsize + disttoboundary
+
+ fo.seek(hdrstart) # go to the start of the header
+    fo.seek(8,1) # skip the header's magic number and reserved bytes
+
+ binindex = fo.read(4)
+ (hdrindex, ) = struct.unpack('>I', binindex)
+ bindata = fo.read(4)
+ (hdrdata, ) = struct.unpack('>I', bindata)
+
+ # each index is 4 32bit segments - so each is 16 bytes
+ hdrindexsize = hdrindex * 16
+    # add 16 to the hdrsize to account for the 16 bytes of misc data between
+    # the end of the sig and the header.
+ hdrsize = hdrdata + hdrindexsize + 16
+
+ # header end is hdrstart + hdrsize
+ hdrend = hdrstart + hdrsize
+ if opened_here:
+ fo.close()
+ del fo
+ return (hdrstart, hdrend)
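+
+# Worked example of the arithmetic above, with illustrative numbers: a
+# signature index of 7 entries and 1152 bytes of data gives
+# sigsize = 1152 + 7*16 = 1264, which is already 8-byte aligned, so the main
+# header starts at 112 + 1264 = 1376.  The returned range slices the header
+# out of the package (hypothetical path):
+def _exampleHeaderBytes(path='/tmp/some.rpm'):
+    (start, end) = byteranges(path)
+    fo = open(path, 'rb')
+    fo.seek(start)
+    hdr_bytes = fo.read(end - start)
+    fo.close()
+    return hdr_bytes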
+
+
+class MDError(exceptions.Exception):
+ def __init__(self, args=None):
+ exceptions.Exception.__init__(self)
+ self.args = args
+
+
+
+class RpmMetaData:
+ """each drpm is one object, you pass it an rpm file
+ it opens the file, and pulls the information out in bite-sized chunks :)
+ """
+
+ mode_cache = {}
+
+ def __init__(self, ts, basedir, filename, options, is_drpm):
+ try:
+ stats = os.stat(os.path.join(basedir, filename))
+            self.size = stats[6]    # st_size
+            self.mtime = stats[8]   # st_mtime
+ del stats
+        except OSError, e:
+            raise MDError, "Error stat'ing file %s" % os.path.join(basedir, filename)
+ self.options = options
+ self.localurl = options['baseurl']
+ self.relativepath = filename
+ fd = returnFD(os.path.join(basedir, filename))
+ self.hdr = returnHdr(ts, fd)
+ os.lseek(fd, 0, 0)
+ fo = os.fdopen(fd, 'rb')
+ self.pkgid = self.doChecksumCache(fo)
+ fo.seek(0)
+ (self.rangestart, self.rangeend) = byteranges(fo)
+ self.is_drpm = False
+ if is_drpm:
+ fo.seek(self.rangeend)
+ self._getOldInfo(fo)
+ self.is_drpm = True
+ del fo
+ del fd
+
+ def arch(self):
+ if self.tagByName('sourcepackage') == 1:
+ return 'src'
+ else:
+ return self.tagByName('arch')
+
+ def _stringToNEVR(self, string):
+ i = string.rfind("-", 0, string.rfind("-")-1)
+ name = string[:i]
+ (epoch, ver, rel) = self._stringToVersion(string[i+1:])
+ return (name, epoch, ver, rel)
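+
+    # Worked example (illustrative): _stringToNEVR('foo-0:1.1-3') returns
+    # ('foo', '0', '1.1', '3'); the nested rfind steps back past the final
+    # dash so the split lands between name and version-release.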
+
+    def _getLength(self, in_data):
+        """interpret a big-endian byte string as an unsigned integer"""
+        length = 0
+ for val in in_data:
+ length = length * 256
+ length += ord(val)
+ return length
+
+ def _getOldInfo(self, fo):
+ try:
+ compobj = gzip.GzipFile("", "rb", 9, fo)
+ except:
+ raise zlibError("Data not stored in gzip format")
+
+ if compobj.read(4)[:3] != "DLT":
+ raise Exception("Not a deltarpm")
+
+ nevr_length = self._getLength(compobj.read(4))
+ nevr = compobj.read(nevr_length).strip("\x00")
+ seq_length = self._getLength(compobj.read(4))
+ seq = compobj.read(seq_length)
+ hex_seq = ""
+ for char in seq:
+ hex_seq += str("%02x" % ord(char))
+ self.oldnevrstring = nevr
+ self.oldnevr = self._stringToNEVR(nevr)
+ self.sequence = hex_seq
+ compobj.close()
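+
+    # Layout of the old-rpm block parsed above, as this code reads it after
+    # gzip decompression (a sketch, not the full deltarpm spec):
+    #   4 bytes  magic: "DLT" plus a version byte
+    #   4 bytes  big-endian length of the NEVR string
+    #   n bytes  NUL-padded NEVR of the rpm the delta applies against
+    #   4 bytes  big-endian length of the sequence blob
+    #   m bytes  sequence blob, hex-encoded into self.sequence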
+
+ def _stringToVersion(self, strng):
+ i = strng.find(':')
+ if i != -1:
+ epoch = strng[:i]
+ else:
+ epoch = '0'
+ j = strng.find('-')
+ if j != -1:
+ if strng[i + 1:j] == '':
+ version = None
+ else:
+ version = strng[i + 1:j]
+ release = strng[j + 1:]
+ else:
+ if strng[i + 1:] == '':
+ version = None
+ else:
+ version = strng[i + 1:]
+ release = None
+ return (epoch, version, release)
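+
+    # Worked examples of the parse above (illustrative):
+    #   '3:1.2-4' -> ('3', '1.2', '4')
+    #   '1.2-4'   -> ('0', '1.2', '4')
+    #   '1.2'     -> ('0', '1.2', None)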
+
+ ###########
+ # Title: Remove duplicates from a sequence
+ # Submitter: Tim Peters
+ # From: http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52560
+
+ def _uniq(self,s):
+ """Return a list of the elements in s, but without duplicates.
+
+ For example, unique([1,2,3,1,2,3]) is some permutation of [1,2,3],
+ unique("abcabc") some permutation of ["a", "b", "c"], and
+ unique(([1, 2], [2, 3], [1, 2])) some permutation of
+ [[2, 3], [1, 2]].
+
+ For best speed, all sequence elements should be hashable. Then
+ unique() will usually work in linear time.
+
+ If not possible, the sequence elements should enjoy a total
+ ordering, and if list(s).sort() doesn't raise TypeError it's
+ assumed that they do enjoy a total ordering. Then unique() will
+ usually work in O(N*log2(N)) time.
+
+ If that's not possible either, the sequence elements must support
+ equality-testing. Then unique() will usually work in quadratic
+ time.
+ """
+
+ n = len(s)
+ if n == 0:
+ return []
+
+ # Try using a dict first, as that's the fastest and will usually
+ # work. If it doesn't work, it will usually fail quickly, so it
+ # usually doesn't cost much to *try* it. It requires that all the
+ # sequence elements be hashable, and support equality comparison.
+ u = {}
+ try:
+ for x in s:
+ u[x] = 1
+ except TypeError:
+ del u # move on to the next method
+ else:
+ return u.keys()
+
+ # We can't hash all the elements. Second fastest is to sort,
+ # which brings the equal elements together; then duplicates are
+ # easy to weed out in a single pass.
+ # NOTE: Python's list.sort() was designed to be efficient in the
+ # presence of many duplicate elements. This isn't true of all
+ # sort functions in all languages or libraries, so this approach
+ # is more effective in Python than it may be elsewhere.
+ try:
+ t = list(s)
+ t.sort()
+ except TypeError:
+ del t # move on to the next method
+ else:
+ assert n > 0
+ last = t[0]
+ lasti = i = 1
+ while i < n:
+ if t[i] != last:
+ t[lasti] = last = t[i]
+ lasti += 1
+ i += 1
+ return t[:lasti]
+
+ # Brute force is all that's left.
+ u = []
+ for x in s:
+ if x not in u:
+ u.append(x)
+ return u
+
+ def tagByName(self, tag):
+ data = self.hdr[tag]
+ if type(data) is types.ListType:
+ if len(data) > 0:
+ return data[0]
+ else:
+ return ''
+ else:
+ return data
+
+ def listTagByName(self, tag):
+ """take a tag that should be a list and make sure it is one"""
+ lst = []
+ data = self.hdr[tag]
+ if data is None:
+ return lst
+
+ if type(data) is types.ListType:
+ lst.extend(data)
+ else:
+ lst.append(data)
+ return lst
+
+ def epoch(self):
+ if self.hdr['epoch'] is None:
+ return 0
+ else:
+ return self.tagByName('epoch')
+
+ def doChecksumCache(self, fo):
+ """return a checksum for a package:
+ - check if the checksum cache is enabled
+ if not - return the checksum
+ if so - check to see if it has a cache file
+ if so, open it and return the first line's contents
+ if not, grab the checksum and write it to a file for this pkg
+ """
+ if not self.options['cache']:
+ return getChecksum(self.options['sumtype'], fo)
+
+ csumtag = os.path.basename(self.relativepath) + ".cache"
+ csumfile = '%s/%s' % (self.options['cachedir'], csumtag)
+ if os.path.exists(csumfile) and self.mtime <= os.stat(csumfile)[8]:
+ csumo = open(csumfile, 'r')
+ checksum = csumo.readline()
+ csumo.close()
+
+ else:
+ checksum = getChecksum(self.options['sumtype'], fo)
+ csumo = open(csumfile, 'w')
+ csumo.write(checksum)
+ csumo.close()
+
+ return checksum
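+
+    # Cache layout sketch (hypothetical names): with options['cachedir'] set
+    # to '/var/cache/presto' and a package 'drpms/foo.drpm', the digest is
+    # memoized in '/var/cache/presto/foo.drpm.cache' and reused for as long
+    # as the cache file's mtime is not older than the package's.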
+
+
+
+def generateXML(doc, node, formatns, drpmObj, sumtype, pkgDeltas):
+ """takes an xml doc object and a package metadata entry node, populates a
+ package node with the md information"""
+ name = drpmObj.tagByName('name')
+ arch = drpmObj.arch()
+ epoch = str(drpmObj.epoch())
+ ver = str(drpmObj.tagByName('version'))
+ rel = str(drpmObj.tagByName('release'))
+ if not pkgDeltas.has_key('%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)):
+ pkgNode = node.newChild(None, "package", None)
+ pkgNode.newProp('type', 'rpm')
+ pkgNode.newChild(None, 'name', name)
+ pkgNode.newChild(None, 'arch', arch)
+ version = pkgNode.newChild(None, 'version', None)
+ version.newProp('epoch', epoch)
+ version.newProp('ver', ver)
+ version.newProp('rel', rel)
+ deltas = pkgNode.newChild(None, 'deltas', None)
+ pkgDeltas['%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)] = deltas
+ else:
+ deltas = pkgDeltas['%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)]
+ (oldname, oldepoch, oldver, oldrel) = drpmObj.oldnevr
+ drpmNode = deltas.newChild(None, "oldrpm", None)
+ if name != oldname:
+ drpmNode.newChild(None, 'name', oldname)
+ # oldrpm arch is not stored in drpm, so we can only work within same arch
+ version = drpmNode.newChild(None, 'version', None)
+ if epoch != oldepoch:
+ version.newProp('epoch', oldepoch)
+ if ver != oldver:
+ version.newProp('ver', oldver)
+ version.newProp('rel', oldrel)
+ drpmNode.newChild(None, 'drpm_filename', drpmObj.relativepath)
+ drpmNode.newChild(None, 'size', str(drpmObj.size))
+ drpmNode.newChild(None, 'sequence', '%s-%s' % (drpmObj.oldnevrstring, drpmObj.sequence))
+ checksum = drpmNode.newChild(None, 'checksum', drpmObj.pkgid)
+ checksum.newProp('type', drpmObj.options['sumtype'])
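+
+# The node tree built above serializes to something like this (all values
+# illustrative; name/epoch on <oldrpm> appear only when they differ):
+#
+#   <package type="rpm">
+#     <name>foo</name>
+#     <arch>i386</arch>
+#     <version epoch="0" ver="1.2" rel="4"/>
+#     <deltas>
+#       <oldrpm>
+#         <version ver="1.1" rel="3"/>
+#         <drpm_filename>drpms/foo-1.1-3_1.2-4.i386.drpm</drpm_filename>
+#         <size>12345</size>
+#         <sequence>foo-0:1.1-3-0123456789abcdef</sequence>
+#         <checksum type="sha">deadbeef</checksum>
+#       </oldrpm>
+#     </deltas>
+#   </package>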
+
+
+def repoXML(node, cmds):
+ """generate the repomd.xml file that stores the info on the other files"""
+ sumtype = cmds['sumtype']
+    workfiles = [(cmds['prestofile'], 'deltas')]
+
+ for (file, ftype) in workfiles:
+ zfo = _gzipOpen(os.path.join(cmds['outputdir'], cmds['tempdir'], file))
+ uncsum = getChecksum(sumtype, zfo)
+ zfo.close()
+ csum = getChecksum(sumtype, os.path.join(cmds['outputdir'], cmds['tempdir'], file))
+ timestamp = os.stat(os.path.join(cmds['outputdir'], cmds['tempdir'], file))[8]
+ data = node.newChild(None, 'data', None)
+ data.newProp('type', ftype)
+ location = data.newChild(None, 'location', None)
+ if cmds['baseurl'] is not None:
+ location.newProp('xml:base', cmds['baseurl'])
+ location.newProp('href', os.path.join(cmds['finaldir'], file))
+ checksum = data.newChild(None, 'checksum', csum)
+ checksum.newProp('type', sumtype)
+ timestamp = data.newChild(None, 'timestamp', str(timestamp))
+ unchecksum = data.newChild(None, 'open-checksum', uncsum)
+ unchecksum.newProp('type', sumtype)
+