From dce0600bc64c793ba6e8f67c56c286d8d97e7c4c Mon Sep 17 00:00:00 2001
From: Jonathan Dieter
Date: Tue, 19 Jun 2007 20:58:07 +0300
Subject: Many bugfixes and a few enhancements

Signed-off-by: Jonathan Dieter
---
 createprestorepo/dumpMetadata.py | 496 ---------------------------------------
 1 file changed, 496 deletions(-)
 delete mode 100644 createprestorepo/dumpMetadata.py

diff --git a/createprestorepo/dumpMetadata.py b/createprestorepo/dumpMetadata.py
deleted file mode 100644
index 6969400..0000000
--- a/createprestorepo/dumpMetadata.py
+++ /dev/null
@@ -1,496 +0,0 @@
-#!/usr/bin/python -t
-# base classes and functions for dumping out package Metadata
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Library General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-# Copyright 2004 Duke University
-
-# $Id: dumpMetadata.py,v 1.36 2006/02/21 20:10:08 pnasrat Exp $
-
-import os
-import rpm
-import exceptions
-import md5
-import sha
-import types
-import struct
-import re
-import stat
-
-# done to fix gzip randomly changing the checksum
-import gzip
-from zlib import error as zlibError
-from gzip import write32u, FNAME
-
-__all__ = ["GzipFile","open"]
-
-class GzipFile(gzip.GzipFile):
-    def _write_gzip_header(self):
-        self.fileobj.write('\037\213')   # magic header
-        self.fileobj.write('\010')       # compression method
-        fname = self.filename[:-3]
-        flags = 0
-        if fname:
-            flags = FNAME
-        self.fileobj.write(chr(flags))
-        write32u(self.fileobj, long(0))
-        self.fileobj.write('\002')
-        self.fileobj.write('\377')
-        if fname:
-            self.fileobj.write(fname + '\000')
-
-
-def _gzipOpen(filename, mode="rb", compresslevel=9):
-    return GzipFile(filename, mode, compresslevel)
-
-
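The GzipFile subclass above exists to make compression deterministic: stock gzip writes the current mtime into every header, so compressing identical metadata twice yields different bytes and a different checksum, while write32u(self.fileobj, long(0)) pins the timestamp to zero. Modern Python exposes this directly, so no subclass is needed; a minimal sketch, assuming only the standard gzip module (deterministic_gzip_write is a hypothetical helper name):

    import gzip

    def deterministic_gzip_write(path, data, level=9):
        # mtime=0 pins the header timestamp, so identical input always
        # produces byte-identical output and a stable checksum
        with gzip.GzipFile(path, 'wb', compresslevel=level, mtime=0) as zfo:
            zfo.write(data)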
-def returnFD(filename):
-    try:
-        fdno = os.open(filename, os.O_RDONLY)
-    except OSError:
-        raise MDError, "Error opening file"
-    return fdno
-
-def returnHdr(ts, package):
-    """hand back the rpm header or raise an Error if the pkg is fubar"""
-    opened_here = 0
-    try:
-        if type(package) is types.StringType:
-            opened_here = 1
-            fdno = os.open(package, os.O_RDONLY)
-        else:
-            fdno = package # let's assume this is an fdno and go with it :)
-    except OSError:
-        raise MDError, "Error opening file"
-    ts.setVSFlags((rpm._RPMVSF_NOSIGNATURES|rpm.RPMVSF_NOMD5|rpm.RPMVSF_NEEDPAYLOAD))
-    try:
-        hdr = ts.hdrFromFdno(fdno)
-    except rpm.error:
-        raise MDError, "Error opening package"
-    if type(hdr) != rpm.hdr:
-        raise MDError, "Error opening package"
-    ts.setVSFlags(0)
-
-    if opened_here:
-        os.close(fdno)
-        del fdno
-
-    return hdr
-
-def getChecksum(sumtype, file, CHUNK=2**16):
-    """takes filename, hand back Checksum of it
-       sumtype = md5 or sha
-       filename = /path/to/file
-       CHUNK=65536 by default"""
-
-    # chunking brazenly lifted from Ryan Tomayko
-    opened_here = 0
-    try:
-        if type(file) is not types.StringType:
-            fo = file # assume it's a file-like-object
-        else:
-            opened_here = 1
-            fo = open(file, 'rb', CHUNK)
-
-        if sumtype == 'md5':
-            sum = md5.new()
-        elif sumtype == 'sha':
-            sum = sha.new()
-        else:
-            raise MDError, 'Error Checksumming file, wrong checksum type %s' % sumtype
-        chunk = fo.read
-        while chunk:
-            chunk = fo.read(CHUNK)
-            sum.update(chunk)
-
-        if opened_here:
-            fo.close()
-            del fo
-
-        return sum.hexdigest()
-    except:
-        raise MDError, 'Error opening file for checksum: %s' % file
-
-
-def utf8String(string):
-    """hands back a unicoded string"""
-    if string is None:
-        return ''
-    elif isinstance(string, unicode):
-        return string
-    try:
-        x = unicode(string, 'ascii')
-        return string
-    except UnicodeError:
-        encodings = ['utf-8', 'iso-8859-1', 'iso-8859-15', 'iso-8859-2']
-        for enc in encodings:
-            try:
-                x = unicode(string, enc)
-            except UnicodeError:
-                pass
-            else:
-                if x.encode(enc) == string:
-                    return x.encode('utf-8')
-        newstring = ''
-        for char in string:
-            if ord(char) > 127:
-                newstring = newstring + '?'
-            else:
-                newstring = newstring + char
-        return newstring
-
-
-def byteranges(file):
-    """takes an rpm file or fileobject and returns byteranges for location of the header"""
-    opened_here = 0
-    if type(file) is not types.StringType:
-        fo = file
-    else:
-        opened_here = 1
-        fo = open(file, 'r')
-    #read in past lead and first 8 bytes of sig header
-    fo.seek(104)
-    # 104 bytes in
-    binindex = fo.read(4)
-    # 108 bytes in
-    (sigindex, ) = struct.unpack('>I', binindex)
-    bindata = fo.read(4)
-    # 112 bytes in
-    (sigdata, ) = struct.unpack('>I', bindata)
-    # each index is 4 32bit segments - so each is 16 bytes
-    sigindexsize = sigindex * 16
-    sigsize = sigdata + sigindexsize
-    # we have to round off to the next 8 byte boundary
-    disttoboundary = (sigsize % 8)
-    if disttoboundary != 0:
-        disttoboundary = 8 - disttoboundary
-    # 112 bytes - 96 == lead, 8 = magic and reserved, 8 == sig header data
-    hdrstart = 112 + sigsize + disttoboundary
-
-    fo.seek(hdrstart) # go to the start of the header
-    fo.seek(8,1) # read past the magic number and reserved bytes
-
-    binindex = fo.read(4)
-    (hdrindex, ) = struct.unpack('>I', binindex)
-    bindata = fo.read(4)
-    (hdrdata, ) = struct.unpack('>I', bindata)
-
-    # each index is 4 32bit segments - so each is 16 bytes
-    hdrindexsize = hdrindex * 16
-    # add 16 to the hdrsize to account for the 16 bytes of misc data b/t the
-    # end of the sig and the header.
-    hdrsize = hdrdata + hdrindexsize + 16
-
-    # header end is hdrstart + hdrsize
-    hdrend = hdrstart + hdrsize
-    if opened_here:
-        fo.close()
-        del fo
-    return (hdrstart, hdrend)
-
-
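byteranges relies on the fixed layout spelled out in its comments: a 96-byte lead, 8 bytes of signature-header magic and reserved space, two big-endian 32-bit counts, then 16 bytes per index entry plus the data blob, padded to an 8-byte boundary, and only then the real header. A worked version of that arithmetic, with invented counts:

    # Suppose the two 32-bit fields read at offsets 104 and 108 decode to:
    sigindex, sigdata = 7, 423             # hypothetical entry and data-byte counts
    sigsize = sigdata + sigindex * 16      # 423 + 112 = 535 bytes of signature
    pad = (8 - sigsize % 8) % 8            # pad to the next 8-byte boundary -> 1
    hdrstart = 96 + 8 + 8 + sigsize + pad  # lead + magic/reserved + counts -> 648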
-class MDError(exceptions.Exception):
-    def __init__(self, args=None):
-        exceptions.Exception.__init__(self)
-        self.args = args
-
-
-
-class RpmMetaData:
-    """each drpm is one object, you pass it an rpm file
-       it opens the file, and pulls the information out in bite-sized chunks :)
-    """
-
-    mode_cache = {}
-
-    def __init__(self, ts, basedir, filename, options, is_drpm):
-        try:
-            stats = os.stat(os.path.join(basedir, filename))
-            self.size = stats[6]
-            self.mtime = stats[8]
-            del stats
-        except OSError, e:
-            raise MDError, "Error Stat'ing file %s %s" % (basedir, filename)
-        self.options = options
-        self.localurl = options['baseurl']
-        self.relativepath = filename
-        fd = returnFD(os.path.join(basedir, filename))
-        self.hdr = returnHdr(ts, fd)
-        os.lseek(fd, 0, 0)
-        fo = os.fdopen(fd, 'rb')
-        self.pkgid = self.doChecksumCache(fo)
-        fo.seek(0)
-        (self.rangestart, self.rangeend) = byteranges(fo)
-        self.is_drpm = False
-        if is_drpm:
-            fo.seek(self.rangeend)
-            self._getOldInfo(fo)
-            self.is_drpm = True
-        del fo
-        del fd
-
-    def arch(self):
-        if self.tagByName('sourcepackage') == 1:
-            return 'src'
-        else:
-            return self.tagByName('arch')
-
-    def _stringToNEVR(self, string):
-        i = string.rfind("-", 0, string.rfind("-")-1)
-        name = string[:i]
-        (epoch, ver, rel) = self._stringToVersion(string[i+1:])
-        return (name, epoch, ver, rel)
-
-    def _getLength(self, in_data):
-        length = 0
-        for val in in_data:
-            length = length * 256
-            length += ord(val)
-        return length
-
-    def _getOldInfo(self, fo):
-        try:
-            compobj = gzip.GzipFile("", "rb", 9, fo)
-        except:
-            raise zlibError("Data not stored in gzip format")
-
-        if compobj.read(4)[:3] != "DLT":
-            raise Exception("Not a deltarpm")
-
-        nevr_length = self._getLength(compobj.read(4))
-        nevr = compobj.read(nevr_length).strip("\x00")
-        seq_length = self._getLength(compobj.read(4))
-        seq = compobj.read(seq_length)
-        hex_seq = ""
-        for char in seq:
-            hex_seq += str("%02x" % ord(char))
-        self.oldnevrstring = nevr
-        self.oldnevr = self._stringToNEVR(nevr)
-        self.sequence = hex_seq
-        compobj.close()
-
-    def _stringToVersion(self, strng):
-        i = strng.find(':')
-        if i != -1:
-            epoch = strng[:i]
-        else:
-            epoch = '0'
-        j = strng.find('-')
-        if j != -1:
-            if strng[i + 1:j] == '':
-                version = None
-            else:
-                version = strng[i + 1:j]
-            release = strng[j + 1:]
-        else:
-            if strng[i + 1:] == '':
-                version = None
-            else:
-                version = strng[i + 1:]
-            release = None
-        return (epoch, version, release)
-
-    ###########
-    # Title: Remove duplicates from a sequence
-    # Submitter: Tim Peters
-    # From: http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52560
-
-    def _uniq(self,s):
-        """Return a list of the elements in s, but without duplicates.
-
-        For example, unique([1,2,3,1,2,3]) is some permutation of [1,2,3],
-        unique("abcabc") some permutation of ["a", "b", "c"], and
-        unique(([1, 2], [2, 3], [1, 2])) some permutation of
-        [[2, 3], [1, 2]].
-
-        For best speed, all sequence elements should be hashable.  Then
-        unique() will usually work in linear time.
-
-        If not possible, the sequence elements should enjoy a total
-        ordering, and if list(s).sort() doesn't raise TypeError it's
-        assumed that they do enjoy a total ordering.  Then unique() will
-        usually work in O(N*log2(N)) time.
-
-        If that's not possible either, the sequence elements must support
-        equality-testing.  Then unique() will usually work in quadratic
-        time.
-        """
-
-        n = len(s)
-        if n == 0:
-            return []
-
-        # Try using a dict first, as that's the fastest and will usually
-        # work.  If it doesn't work, it will usually fail quickly, so it
-        # usually doesn't cost much to *try* it.  It requires that all the
-        # sequence elements be hashable, and support equality comparison.
-        u = {}
-        try:
-            for x in s:
-                u[x] = 1
-        except TypeError:
-            del u  # move on to the next method
-        else:
-            return u.keys()
-
-        # We can't hash all the elements.  Second fastest is to sort,
-        # which brings the equal elements together; then duplicates are
-        # easy to weed out in a single pass.
-        # NOTE:  Python's list.sort() was designed to be efficient in the
-        # presence of many duplicate elements.  This isn't true of all
-        # sort functions in all languages or libraries, so this approach
-        # is more effective in Python than it may be elsewhere.
-        try:
-            t = list(s)
-            t.sort()
-        except TypeError:
-            del t  # move on to the next method
-        else:
-            assert n > 0
-            last = t[0]
-            lasti = i = 1
-            while i < n:
-                if t[i] != last:
-                    t[lasti] = last = t[i]
-                    lasti += 1
-                i += 1
-            return t[:lasti]
-
-        # Brute force is all that's left.
-        u = []
-        for x in s:
-            if x not in u:
-                u.append(x)
-        return u
-
-    def tagByName(self, tag):
-        data = self.hdr[tag]
-        if type(data) is types.ListType:
-            if len(data) > 0:
-                return data[0]
-            else:
-                return ''
-        else:
-            return data
-
-    def listTagByName(self, tag):
-        """take a tag that should be a list and make sure it is one"""
-        lst = []
-        data = self.hdr[tag]
-        if data is None:
-            return lst
-
-        if type(data) is types.ListType:
-            lst.extend(data)
-        else:
-            lst.append(data)
-        return lst
-
-    def epoch(self):
-        if self.hdr['epoch'] is None:
-            return 0
-        else:
-            return self.tagByName('epoch')
-
-    def doChecksumCache(self, fo):
-        """return a checksum for a package:
-           - check if the checksum cache is enabled
-             if not - return the checksum
-             if so - check to see if it has a cache file
-               if so, open it and return the first line's contents
-               if not, grab the checksum and write it to a file for this pkg
-        """
-        if not self.options['cache']:
-            return getChecksum(self.options['sumtype'], fo)
-
-        csumtag = os.path.basename(self.relativepath) + ".cache"
-        csumfile = '%s/%s' % (self.options['cachedir'], csumtag)
-        if os.path.exists(csumfile) and self.mtime <= os.stat(csumfile)[8]:
-            csumo = open(csumfile, 'r')
-            checksum = csumo.readline()
-            csumo.close()
-
-        else:
-            checksum = getChecksum(self.options['sumtype'], fo)
-            csumo = open(csumfile, 'w')
-            csumo.write(checksum)
-            csumo.close()
-
-        return checksum
-
-
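_getOldInfo above doubles as documentation of the deltarpm preamble: past the header byterange sits a gzip stream opening with a "DLT" magic, then a 4-byte big-endian length and the old package's name-epoch:version-release string, then a second length and the binary sequence id. A rough Python 3 rendering of the same read, assuming fo is an open binary file already positioned at the end of the header range (read_delta_info is a hypothetical helper name):

    import gzip
    import struct

    def read_delta_info(fo):
        z = gzip.GzipFile(fileobj=fo, mode='rb')
        if z.read(4)[:3] != b'DLT':                   # 4th byte ignored, as above
            raise ValueError('not a deltarpm')
        (nevr_len,) = struct.unpack('>I', z.read(4))  # what _getLength computes
        nevr = z.read(nevr_len).rstrip(b'\x00').decode()
        (seq_len,) = struct.unpack('>I', z.read(4))
        sequence = z.read(seq_len).hex()              # the hex string _getOldInfo builds by hand
        return nevr, sequence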
-def generateXML(doc, node, formatns, drpmObj, sumtype, pkgDeltas):
-    """takes an xml doc object and a package metadata entry node, populates a
-       package node with the md information"""
-    name = drpmObj.tagByName('name')
-    arch = drpmObj.arch()
-    epoch = str(drpmObj.epoch())
-    ver = str(drpmObj.tagByName('version'))
-    rel = str(drpmObj.tagByName('release'))
-    if not pkgDeltas.has_key('%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)):
-        pkgNode = node.newChild(None, "package", None)
-        pkgNode.newProp('type', 'rpm')
-        pkgNode.newChild(None, 'name', name)
-        pkgNode.newChild(None, 'arch', arch)
-        version = pkgNode.newChild(None, 'version', None)
-        version.newProp('epoch', epoch)
-        version.newProp('ver', ver)
-        version.newProp('rel', rel)
-        deltas = pkgNode.newChild(None, 'deltas', None)
-        pkgDeltas['%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)] = deltas
-    else:
-        deltas = pkgDeltas['%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)]
-    (oldname, oldepoch, oldver, oldrel) = drpmObj.oldnevr
-    drpmNode = deltas.newChild(None, "oldrpm", None)
-    if name != oldname:
-        drpmNode.newChild(None, 'name', oldname)
-    # oldrpm arch is not stored in drpm, so we can only work within same arch
-    version = drpmNode.newChild(None, 'version', None)
-    if epoch != oldepoch:
-        version.newProp('epoch', oldepoch)
-    if ver != oldver:
-        version.newProp('ver', oldver)
-    version.newProp('rel', oldrel)
-    drpmNode.newChild(None, 'drpm_filename', drpmObj.relativepath)
-    drpmNode.newChild(None, 'size', str(drpmObj.size))
-    drpmNode.newChild(None, 'sequence', '%s-%s' % (drpmObj.oldnevrstring, drpmObj.sequence))
-    checksum = drpmNode.newChild(None, 'checksum', drpmObj.pkgid)
-    checksum.newProp('type', drpmObj.options['sumtype'])
-
-
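Traced through the newChild/newProp calls, one entry in the delta metadata comes out shaped roughly like the sketch below (all values invented). Note that <oldrpm> carries a name only when it differs from the new package, and epoch/ver attributes only when they differ, since each delta is keyed to its enclosing <package>:

    <package type="rpm">
      <name>foo</name>
      <arch>i386</arch>
      <version epoch="0" ver="1.1" rel="2.fc7"/>
      <deltas>
        <oldrpm>
          <version ver="1.0" rel="1.fc7"/>
          <drpm_filename>foo-1.0_1.1.i386.drpm</drpm_filename>
          <size>12345</size>
          <sequence>foo-0:1.0-1.fc7-0123456789abcdef</sequence>
          <checksum type="sha">3a1b...</checksum>
        </oldrpm>
      </deltas>
    </package>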
-def repoXML(node, cmds):
-    """generate the repomd.xml file that stores the info on the other files"""
-    sumtype = cmds['sumtype']
-    workfiles = [(cmds['prestofile'], 'deltas')]
-
-
-    for (file, ftype) in workfiles:
-        zfo = _gzipOpen(os.path.join(cmds['outputdir'], cmds['tempdir'], file))
-        uncsum = getChecksum(sumtype, zfo)
-        zfo.close()
-        csum = getChecksum(sumtype, os.path.join(cmds['outputdir'], cmds['tempdir'], file))
-        timestamp = os.stat(os.path.join(cmds['outputdir'], cmds['tempdir'], file))[8]
-        data = node.newChild(None, 'data', None)
-        data.newProp('type', ftype)
-        location = data.newChild(None, 'location', None)
-        if cmds['baseurl'] is not None:
-            location.newProp('xml:base', cmds['baseurl'])
-        location.newProp('href', os.path.join(cmds['finaldir'], file))
-        checksum = data.newChild(None, 'checksum', csum)
-        checksum.newProp('type', sumtype)
-        timestamp = data.newChild(None, 'timestamp', str(timestamp))
-        unchecksum = data.newChild(None, 'open-checksum', uncsum)
-        unchecksum.newProp('type', sumtype)
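repoXML reads the compressed presto file back twice on purpose: once through _gzipOpen for the open-checksum of the uncompressed contents, and once raw for the checksum of the .gz file itself, so clients can verify either form. The resulting repomd.xml entry looks roughly like this (values and filename invented; an xml:base attribute is added to <location> when a baseurl is configured):

    <data type="deltas">
      <location href="repodata/prestodelta.xml.gz"/>
      <checksum type="sha">9f21...</checksum>
      <timestamp>1182275887</timestamp>
      <open-checksum type="sha">77c0...</open-checksum>
    </data>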