# base classes and functions for dumping out package Metadata
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Library General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Copyright 2004 Duke University
# $Id: dumpMetadata.py,v 1.36 2006/02/21 20:10:08 pnasrat Exp $

import os
import rpm
import exceptions
import md5
import sha
import types
import struct
import re
import stat

# done to fix gzip randomly changing the checksum
import gzip
from zlib import error as zlibError
from gzip import write32u, FNAME

__all__ = ["GzipFile", "open"]

class GzipFile(gzip.GzipFile):
    def _write_gzip_header(self):
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, long(0))            # mtime of 0 keeps output reproducible
        self.fileobj.write('\002')
        self.fileobj.write('\377')
        if fname:
            self.fileobj.write(fname + '\000')


def _gzipOpen(filename, mode="rb", compresslevel=9):
    return GzipFile(filename, mode, compresslevel)

def returnFD(filename):
    try:
        fdno = os.open(filename, os.O_RDONLY)
    except OSError:
        raise MDError, "Error opening file"
    return fdno

def returnHdr(ts, package):
    """hand back the rpm header or raise an Error if the pkg is fubar"""
    opened_here = 0
    try:
        if type(package) is types.StringType:
            opened_here = 1
            fdno = os.open(package, os.O_RDONLY)
        else:
            fdno = package # let's assume this is an fdno and go with it :)
    except OSError:
        raise MDError, "Error opening file"
    ts.setVSFlags((rpm._RPMVSF_NOSIGNATURES|rpm.RPMVSF_NOMD5|rpm.RPMVSF_NEEDPAYLOAD))
    try:
        hdr = ts.hdrFromFdno(fdno)
    except rpm.error:
        raise MDError, "Error opening package"
    if type(hdr) != rpm.hdr:
        raise MDError, "Error opening package"
    ts.setVSFlags(0)

    if opened_here:
        os.close(fdno)
        del fdno

    return hdr

def getChecksum(sumtype, file, CHUNK=2**16):
    """takes filename, hand back Checksum of it
       sumtype = md5 or sha
       filename = /path/to/file
       CHUNK=65536 by default"""

    # pick the digest first, so a bad sumtype reports the right error
    # instead of being masked by the bare except below
    if sumtype == 'md5':
        sum = md5.new()
    elif sumtype == 'sha':
        sum = sha.new()
    else:
        raise MDError, 'Error Checksumming file, wrong checksum type %s' % sumtype

    # chunking brazenly lifted from Ryan Tomayko
    opened_here = 0
    try:
        if type(file) is not types.StringType:
            fo = file # assume it's a file-like-object
        else:
            opened_here = 1
            fo = open(file, 'rb', CHUNK)

        chunk = fo.read(CHUNK)
        while chunk:
            sum.update(chunk)
            chunk = fo.read(CHUNK)

        if opened_here:
            fo.close()
            del fo

        return sum.hexdigest()
    except:
        raise MDError, 'Error opening file for checksum: %s' % file
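
# Example (illustrative sketch, never called by this module): getChecksum
# accepts either a path or an open file object; the path below is
# hypothetical.
def _exampleChecksum():
    pathsum = getChecksum('sha', '/tmp/example.drpm')   # checksum from a path
    fo = open('/tmp/example.drpm', 'rb')
    fosum = getChecksum('md5', fo)                      # checksum from a file object
    fo.close()
    return pathsum, fosum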
def utf8String(string):
    """hands back a unicoded string"""
    if string is None:
        return ''
    elif isinstance(string, unicode):
        return string
    try:
        x = unicode(string, 'ascii')  # pure ascii passes through untouched
        return string
    except UnicodeError:
        encodings = ['utf-8', 'iso-8859-1', 'iso-8859-15', 'iso-8859-2']
        for enc in encodings:
            try:
                x = unicode(string, enc)
            except UnicodeError:
                pass
            else:
                if x.encode(enc) == string:
                    return x.encode('utf-8')
    # nothing round-tripped cleanly; replace high bytes with '?'
    newstring = ''
    for char in string:
        if ord(char) > 127:
            newstring = newstring + '?'
        else:
            newstring = newstring + char
    return newstring

def byteranges(file):
    """takes an rpm file or fileobject and returns byteranges for location of the header"""
    opened_here = 0
    if type(file) is not types.StringType:
        fo = file
    else:
        opened_here = 1
        fo = open(file, 'r')
    # read in past lead and first 8 bytes of sig header
    fo.seek(104)
    # 104 bytes in
    binindex = fo.read(4)
    # 108 bytes in
    (sigindex, ) = struct.unpack('>I', binindex)
    bindata = fo.read(4)
    # 112 bytes in
    (sigdata, ) = struct.unpack('>I', bindata)
    # each index is 4 32bit segments - so each is 16 bytes
    sigindexsize = sigindex * 16
    sigsize = sigdata + sigindexsize
    # we have to round off to the next 8 byte boundary
    disttoboundary = (sigsize % 8)
    if disttoboundary != 0:
        disttoboundary = 8 - disttoboundary
    # 112 bytes - 96 == lead, 8 = magic and reserved, 8 == sig header data
    hdrstart = 112 + sigsize + disttoboundary

    fo.seek(hdrstart) # go to the start of the header
    fo.seek(8, 1) # read past the magic number and reserved bytes

    binindex = fo.read(4)
    (hdrindex, ) = struct.unpack('>I', binindex)
    bindata = fo.read(4)
    (hdrdata, ) = struct.unpack('>I', bindata)

    # each index is 4 32bit segments - so each is 16 bytes
    hdrindexsize = hdrindex * 16
    # add 16 to the hdrsize to account for the 16 bytes of misc data b/t the
    # end of the sig and the header.
    hdrsize = hdrdata + hdrindexsize + 16

    # header end is hdrstart + hdrsize
    hdrend = hdrstart + hdrsize
    if opened_here:
        fo.close()
        del fo
    return (hdrstart, hdrend)

class MDError(exceptions.Exception):
    def __init__(self, args=None):
        exceptions.Exception.__init__(self)
        self.args = args

class RpmMetaData:
    """each drpm is one object, you pass it an rpm file
       it opens the file, and pulls the information out in bite-sized chunks :)
    """

    mode_cache = {}

    def __init__(self, ts, basedir, filename, options, is_drpm):
        try:
            stats = os.stat(os.path.join(basedir, filename))
            self.size = stats[stat.ST_SIZE]
            self.mtime = stats[stat.ST_MTIME]
            del stats
        except OSError, e:
            raise MDError, "Error Stat'ing file %s %s" % (basedir, filename)
        self.options = options
        self.localurl = options['baseurl']
        self.relativepath = filename
        fd = returnFD(os.path.join(basedir, filename))
        self.hdr = returnHdr(ts, fd)
        os.lseek(fd, 0, 0)
        fo = os.fdopen(fd, 'rb')
        self.pkgid = self.doChecksumCache(fo)
        fo.seek(0)
        (self.rangestart, self.rangeend) = byteranges(fo)
        self.is_drpm = False
        if is_drpm:
            fo.seek(self.rangeend)
            self._getOldInfo(fo)
            self.is_drpm = True
        del fo
        del fd

    def arch(self):
        if self.tagByName('sourcepackage') == 1:
            return 'src'
        else:
            return self.tagByName('arch')

    def _stringToNEVR(self, string):
        i = string.rfind("-", 0, string.rfind("-")-1)
        name = string[:i]
        (epoch, ver, rel) = self._stringToVersion(string[i+1:])
        return (name, epoch, ver, rel)

    def _getLength(self, in_data):
        length = 0
        for val in in_data:
            length = length * 256
            length += ord(val)
        return length

    def _getOldInfo(self, fo):
        try:
            compobj = gzip.GzipFile("", "rb", 9, fo)
        except:
            raise zlibError("Data not stored in gzip format")
        if compobj.read(4)[:3] != "DLT":
            raise Exception("Not a deltarpm")
        nevr_length = self._getLength(compobj.read(4))
        nevr = compobj.read(nevr_length).strip("\x00")
        seq_length = self._getLength(compobj.read(4))
        seq = compobj.read(seq_length)
        hex_seq = ""
        for char in seq:
            hex_seq += str("%02x" % ord(char))
        self.oldnevrstring = nevr
        self.oldnevr = self._stringToNEVR(nevr)
        self.sequence = hex_seq
        compobj.close()
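
    # Example (illustrative): _getLength folds big-endian bytes into an
    # int, so self._getLength("\x00\x00\x01\x02") is 258 (1 * 256 + 2);
    # struct.unpack('>I', ...) would decode the same four-byte field.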
    def _stringToVersion(self, strng):
        i = strng.find(':')
        if i != -1:
            epoch = strng[:i]
        else:
            epoch = '0'
        j = strng.find('-')
        if j != -1:
            if strng[i + 1:j] == '':
                version = None
            else:
                version = strng[i + 1:j]
            release = strng[j + 1:]
        else:
            if strng[i + 1:] == '':
                version = None
            else:
                version = strng[i + 1:]
            release = None
        return (epoch, version, release)

    ###########
    # Title: Remove duplicates from a sequence
    # Submitter: Tim Peters
    # From: http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52560

    def _uniq(self, s):
        """Return a list of the elements in s, but without duplicates.

        For example, unique([1,2,3,1,2,3]) is some permutation of [1,2,3],
        unique("abcabc") some permutation of ["a", "b", "c"], and
        unique(([1, 2], [2, 3], [1, 2])) some permutation of
        [[2, 3], [1, 2]].

        For best speed, all sequence elements should be hashable.  Then
        unique() will usually work in linear time.

        If not possible, the sequence elements should enjoy a total
        ordering, and if list(s).sort() doesn't raise TypeError it's
        assumed that they do enjoy a total ordering.  Then unique() will
        usually work in O(N*log2(N)) time.

        If that's not possible either, the sequence elements must support
        equality-testing.  Then unique() will usually work in quadratic
        time.
        """

        n = len(s)
        if n == 0:
            return []

        # Try using a dict first, as that's the fastest and will usually
        # work.  If it doesn't work, it will usually fail quickly, so it
        # usually doesn't cost much to *try* it.  It requires that all the
        # sequence elements be hashable, and support equality comparison.
        u = {}
        try:
            for x in s:
                u[x] = 1
        except TypeError:
            del u  # move on to the next method
        else:
            return u.keys()

        # We can't hash all the elements.  Second fastest is to sort,
        # which brings the equal elements together; then duplicates are
        # easy to weed out in a single pass.
        # NOTE:  Python's list.sort() was designed to be efficient in the
        # presence of many duplicate elements.  This isn't true of all
        # sort functions in all languages or libraries, so this approach
        # is more effective in Python than it may be elsewhere.
        try:
            t = list(s)
            t.sort()
        except TypeError:
            del t  # move on to the next method
        else:
            assert n > 0
            last = t[0]
            lasti = i = 1
            while i < n:
                if t[i] != last:
                    t[lasti] = last = t[i]
                    lasti += 1
                i += 1
            return t[:lasti]

        # Brute force is all that's left.
        u = []
        for x in s:
            if x not in u:
                u.append(x)
        return u
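
    # Example (illustrative, not exercised by the module): the split that
    # _stringToVersion and _stringToNEVR perform, on hypothetical inputs:
    #     self._stringToVersion('0:1.2-3')    ->  ('0', '1.2', '3')
    #     self._stringToVersion('1.2-3')      ->  ('0', '1.2', '3')   # epoch defaults to '0'
    #     self._stringToNEVR('foo-0:1.2-3')   ->  ('foo', '0', '1.2', '3')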
    def tagByName(self, tag):
        data = self.hdr[tag]
        if type(data) is types.ListType:
            if len(data) > 0:
                return data[0]
            else:
                return ''
        else:
            return data

    def listTagByName(self, tag):
        """take a tag that should be a list and make sure it is one"""
        lst = []
        data = self.hdr[tag]
        if data is None:
            return lst

        if type(data) is types.ListType:
            lst.extend(data)
        else:
            lst.append(data)
        return lst

    def epoch(self):
        if self.hdr['epoch'] is None:
            return 0
        else:
            return self.tagByName('epoch')

    def doChecksumCache(self, fo):
        """return a checksum for a package:
           - check if the checksum cache is enabled
             if not - return the checksum
             if so - check to see if it has a cache file
               if so, open it and return the first line's contents
               if not, grab the checksum and write it to a file for this pkg
        """
        if not self.options['cache']:
            return getChecksum(self.options['sumtype'], fo)

        csumtag = os.path.basename(self.relativepath) + ".cache"
        csumfile = '%s/%s' % (self.options['cachedir'], csumtag)
        if os.path.exists(csumfile) and self.mtime <= os.stat(csumfile)[stat.ST_MTIME]:
            csumo = open(csumfile, 'r')
            checksum = csumo.readline()
            csumo.close()
        else:
            checksum = getChecksum(self.options['sumtype'], fo)
            csumo = open(csumfile, 'w')
            csumo.write(checksum)
            csumo.close()

        return checksum


def generateXML(doc, node, formatns, drpmObj, sumtype, pkgDeltas):
    """takes an xml doc object and a package metadata entry node, populates a
       package node with the md information"""
    name = drpmObj.tagByName('name')
    arch = drpmObj.arch()
    epoch = str(drpmObj.epoch())
    ver = str(drpmObj.tagByName('version'))
    rel = str(drpmObj.tagByName('release'))
    if not pkgDeltas.has_key('%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)):
        pkgNode = node.newChild(None, "package", None)
        pkgNode.newProp('type', 'rpm')
        pkgNode.newChild(None, 'name', name)
        pkgNode.newChild(None, 'arch', arch)
        version = pkgNode.newChild(None, 'version', None)
        version.newProp('epoch', epoch)
        version.newProp('ver', ver)
        version.newProp('rel', rel)
        deltas = pkgNode.newChild(None, 'deltas', None)
        pkgDeltas['%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)] = deltas
    else:
        deltas = pkgDeltas['%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)]
    (oldname, oldepoch, oldver, oldrel) = drpmObj.oldnevr
    drpmNode = deltas.newChild(None, "oldrpm", None)
    if name != oldname:
        drpmNode.newChild(None, 'name', oldname)
    # oldrpm arch is not stored in drpm, so we can only work within same arch
    version = drpmNode.newChild(None, 'version', None)
    if epoch != oldepoch:
        version.newProp('epoch', oldepoch)
    if ver != oldver:
        version.newProp('ver', oldver)
    version.newProp('rel', oldrel)
    drpmNode.newChild(None, 'drpm_filename', drpmObj.relativepath)
    drpmNode.newChild(None, 'size', str(drpmObj.size))
    drpmNode.newChild(None, 'sequence', '%s-%s' % (drpmObj.oldnevrstring, drpmObj.sequence))
    checksum = drpmNode.newChild(None, 'checksum', drpmObj.pkgid)
    checksum.newProp('type', drpmObj.options['sumtype'])
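
# Example (illustrative sketch, never called by this module): driving
# generateXML for one deltarpm.  doc, root and drpmObj are assumed to be a
# libxml2 document, a metadata node and an RpmMetaData built with
# is_drpm=True, respectively; formatns is unused and may be None.
def _exampleGenerateXML(doc, root, drpmObj):
    pkgDeltas = {}   # shared across packages, keyed by 'name-epoch:ver-rel.arch'
    generateXML(doc, root, None, drpmObj, 'sha', pkgDeltas)
    return pkgDeltas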
def repoXML(node, cmds):
    """generate the repomd.xml file that stores the info on the other files"""
    sumtype = cmds['sumtype']
    workfiles = [(cmds['prestofile'], 'deltas')]

    for (file, ftype) in workfiles:
        zfo = _gzipOpen(os.path.join(cmds['outputdir'], cmds['tempdir'], file))
        uncsum = getChecksum(sumtype, zfo)
        zfo.close()
        csum = getChecksum(sumtype, os.path.join(cmds['outputdir'], cmds['tempdir'], file))
        timestamp = os.stat(os.path.join(cmds['outputdir'], cmds['tempdir'], file))[stat.ST_MTIME]
        data = node.newChild(None, 'data', None)
        data.newProp('type', ftype)
        location = data.newChild(None, 'location', None)
        if cmds['baseurl'] is not None:
            location.newProp('xml:base', cmds['baseurl'])
        location.newProp('href', os.path.join(cmds['finaldir'], file))
        checksum = data.newChild(None, 'checksum', csum)
        checksum.newProp('type', sumtype)
        timestamp = data.newChild(None, 'timestamp', str(timestamp))
        unchecksum = data.newChild(None, 'open-checksum', uncsum)
        unchecksum.newProp('type', sumtype)
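
# Example (illustrative): the subset of the cmds dict that repoXML reads;
# every value below is hypothetical.
#     cmds = {'sumtype'   : 'sha',
#             'prestofile': 'prestodelta.xml.gz',
#             'outputdir' : '/srv/repo',
#             'tempdir'   : '.repodata',
#             'finaldir'  : 'repodata',
#             'baseurl'   : None}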