Diffstat (limited to 'presto-utils/dumpMetadata.py')
-rw-r--r--  presto-utils/dumpMetadata.py  497
1 files changed, 497 insertions, 0 deletions
diff --git a/presto-utils/dumpMetadata.py b/presto-utils/dumpMetadata.py
new file mode 100644
index 0000000..0ec2c20
--- /dev/null
+++ b/presto-utils/dumpMetadata.py
@@ -0,0 +1,497 @@
+#!/usr/bin/python -t
+# base classes and functions for dumping out package Metadata
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# Copyright 2004 Duke University
+
+# $Id: dumpMetadata.py,v 1.36 2006/02/21 20:10:08 pnasrat Exp $
+
+import os
+import rpm
+import exceptions
+import md5
+import sha
+import types
+import struct
+import re
+import stat
+
+# subclass GzipFile so the header carries a fixed mtime; stock gzip stamps the
+# current time into the header, which changes the checksum of identical data
+import gzip
+from zlib import error as zlibError
+from gzip import write32u, FNAME
+
+__all__ = ["GzipFile","open"]
+
+class GzipFile(gzip.GzipFile):
+    def _write_gzip_header(self):
+        self.fileobj.write('\037\213')   # magic header
+        self.fileobj.write('\010')       # compression method (deflate)
+        fname = self.filename[:-3]       # drop the '.gz' suffix for the stored name
+        flags = 0
+        if fname:
+            flags = FNAME
+        self.fileobj.write(chr(flags))
+        write32u(self.fileobj, long(0))  # mtime of 0 instead of the current time
+        self.fileobj.write('\002')       # XFL: maximum compression
+        self.fileobj.write('\377')       # OS: unknown
+        if fname:
+            self.fileobj.write(fname + '\000')
+
+
+def _gzipOpen(filename, mode="rb", compresslevel=9):
+ return GzipFile(filename, mode, compresslevel)
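+
+# Illustrative sketch, not part of the original module: with the header
+# override above, compressing the same payload twice yields byte-identical
+# files, since the header no longer embeds the current time.  The path is a
+# hypothetical example.
+def _exampleReproducibleGzip(path='/tmp/reproducible.gz'):
+    digests = []
+    for dummy in (1, 2):
+        zfo = _gzipOpen(path, 'wb')
+        zfo.write('same payload every time\n')
+        zfo.close()
+        digests.append(getChecksum('md5', path))
+    assert digests[0] == digests[1]   # both passes produce the same digest
+    return digests[0]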
+
+
+def returnFD(filename):
+ try:
+ fdno = os.open(filename, os.O_RDONLY)
+ except OSError:
+ raise MDError, "Error opening file"
+ return fdno
+
+def returnHdr(ts, package):
+ """hand back the rpm header or raise an Error if the pkg is fubar"""
+ opened_here = 0
+ try:
+ if type(package) is types.StringType:
+ opened_here = 1
+ fdno = os.open(package, os.O_RDONLY)
+ else:
+ fdno = package # let's assume this is an fdno and go with it :)
+ except OSError:
+ raise MDError, "Error opening file"
+ ts.setVSFlags((rpm._RPMVSF_NOSIGNATURES|rpm.RPMVSF_NOMD5|rpm.RPMVSF_NEEDPAYLOAD))
+ try:
+ hdr = ts.hdrFromFdno(fdno)
+ except rpm.error:
+ raise MDError, "Error opening package"
+ if type(hdr) != rpm.hdr:
+ raise MDError, "Error opening package"
+ ts.setVSFlags(0)
+
+ if opened_here:
+ os.close(fdno)
+ del fdno
+
+ return hdr
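+
+# Illustrative usage sketch (hypothetical path): callers hand in a shared
+# transaction set, so the VSFlags juggling above stays in one place.
+def _exampleReadHeader(path='/tmp/some.rpm'):
+    ts = rpm.TransactionSet()
+    hdr = returnHdr(ts, path)
+    return (hdr['name'], hdr['version'], hdr['release'])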
+
+def getChecksum(sumtype, file, CHUNK=2**16):
+ """takes filename, hand back Checksum of it
+ sumtype = md5 or sha
+ filename = /path/to/file
+ CHUNK=65536 by default"""
+
+ # chunking brazenly lifted from Ryan Tomayko
+ opened_here = 0
+ try:
+ if type(file) is not types.StringType:
+ fo = file # assume it's a file-like-object
+ else:
+ opened_here = 1
+ fo = open(file, 'rb', CHUNK)
+
+ if sumtype == 'md5':
+ sum = md5.new()
+ elif sumtype == 'sha':
+ sum = sha.new()
+ else:
+ raise MDError, 'Error Checksumming file, wrong checksum type %s' % sumtype
+        chunk = fo.read(CHUNK)
+        while chunk:
+            sum.update(chunk)
+            chunk = fo.read(CHUNK)
+
+ if opened_here:
+ fo.close()
+ del fo
+
+ return sum.hexdigest()
+    except (IOError, OSError):
+        raise MDError, 'Error opening file for checksum: %s' % file
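+
+# Illustrative sketch (hypothetical path): getChecksum accepts a filename or
+# an open file object, and both routes should agree on the digest.
+def _exampleChecksumBothWays(path='/tmp/some.rpm'):
+    by_name = getChecksum('sha', path)
+    fo = open(path, 'rb')
+    by_fobj = getChecksum('sha', fo)
+    fo.close()
+    assert by_name == by_fobj
+    return by_name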
+
+
+def utf8String(string):
+ """hands back a unicoded string"""
+ if string is None:
+ return ''
+ elif isinstance(string, unicode):
+ return string
+ try:
+        unicode(string, 'ascii')   # plain ascii decodes cleanly,
+        return string              # so hand the original back untouched
+ except UnicodeError:
+ encodings = ['utf-8', 'iso-8859-1', 'iso-8859-15', 'iso-8859-2']
+ for enc in encodings:
+ try:
+ x = unicode(string, enc)
+ except UnicodeError:
+ pass
+ else:
+ if x.encode(enc) == string:
+ return x.encode('utf-8')
+ newstring = ''
+ for char in string:
+ if ord(char) > 127:
+ newstring = newstring + '?'
+ else:
+ newstring = newstring + char
+ return newstring
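+
+# Illustrative sketch of the fallback order above: ascii passes through
+# untouched, and non-ascii input is re-encoded as utf-8 via the first legacy
+# encoding that round-trips.
+def _exampleUtf8String():
+    assert utf8String('plain ascii') == 'plain ascii'
+    assert utf8String(None) == ''
+    assert utf8String('caf\xe9') == 'caf\xc3\xa9'   # iso-8859-1 -> utf-8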
+
+
+def byteranges(file):
+ """takes an rpm file or fileobject and returns byteranges for location of the header"""
+ opened_here = 0
+ if type(file) is not types.StringType:
+ fo = file
+ else:
+ opened_here = 1
+        fo = open(file, 'rb')
+    # seek past the 96-byte lead and the 8 bytes of signature header magic and
+    # reserved data
+ fo.seek(104)
+ # 104 bytes in
+ binindex = fo.read(4)
+ # 108 bytes in
+ (sigindex, ) = struct.unpack('>I', binindex)
+ bindata = fo.read(4)
+ # 112 bytes in
+ (sigdata, ) = struct.unpack('>I', bindata)
+ # each index is 4 32bit segments - so each is 16 bytes
+ sigindexsize = sigindex * 16
+ sigsize = sigdata + sigindexsize
+ # we have to round off to the next 8 byte boundary
+ disttoboundary = (sigsize % 8)
+ if disttoboundary != 0:
+ disttoboundary = 8 - disttoboundary
+    # 112 bytes = 96 (lead) + 8 (magic and reserved) + 8 (sig header counts)
+ hdrstart = 112 + sigsize + disttoboundary
+
+ fo.seek(hdrstart) # go to the start of the header
+    fo.seek(8,1) # skip the header's magic number and reserved bytes
+
+ binindex = fo.read(4)
+ (hdrindex, ) = struct.unpack('>I', binindex)
+ bindata = fo.read(4)
+ (hdrdata, ) = struct.unpack('>I', bindata)
+
+ # each index is 4 32bit segments - so each is 16 bytes
+ hdrindexsize = hdrindex * 16
+    # add 16 to the hdrsize to account for the 16 bytes of misc data between
+    # the end of the sig and the header.
+ hdrsize = hdrdata + hdrindexsize + 16
+
+ # header end is hdrstart + hdrsize
+ hdrend = hdrstart + hdrsize
+ if opened_here:
+ fo.close()
+ del fo
+ return (hdrstart, hdrend)
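+
+# Worked example of the arithmetic above, with illustrative numbers: a
+# signature index of 7 entries and 1152 bytes of data gives
+# sigsize = 1152 + 7*16 = 1264, which is already 8-byte aligned, so the main
+# header starts at 112 + 1264 = 1376.  The returned range slices the header
+# out of the package (hypothetical path):
+def _exampleHeaderBytes(path='/tmp/some.rpm'):
+    (start, end) = byteranges(path)
+    fo = open(path, 'rb')
+    fo.seek(start)
+    hdr_bytes = fo.read(end - start)
+    fo.close()
+    return hdr_bytes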
+
+
+class MDError(exceptions.Exception):
+ def __init__(self, args=None):
+ exceptions.Exception.__init__(self)
+ self.args = args
+
+
+
+class RpmMetaData:
+ """each drpm is one object, you pass it an rpm file
+ it opens the file, and pulls the information out in bite-sized chunks :)
+ """
+
+ mode_cache = {}
+
+ def __init__(self, ts, basedir, filename, options, is_drpm):
+ try:
+ stats = os.stat(os.path.join(basedir, filename))
+            self.size = stats[6]    # st_size
+            self.mtime = stats[8]   # st_mtime
+ del stats
+        except OSError, e:
+            raise MDError, "Error stat'ing file %s" % os.path.join(basedir, filename)
+ self.options = options
+ self.localurl = options['baseurl']
+ self.relativepath = filename
+ fd = returnFD(os.path.join(basedir, filename))
+ self.hdr = returnHdr(ts, fd)
+ os.lseek(fd, 0, 0)
+ fo = os.fdopen(fd, 'rb')
+ self.pkgid = self.doChecksumCache(fo)
+ fo.seek(0)
+ (self.rangestart, self.rangeend) = byteranges(fo)
+ self.is_drpm = False
+ if is_drpm:
+ fo.seek(self.rangeend)
+ self._getOldInfo(fo)
+ self.is_drpm = True
+ del fo
+ del fd
+
+ def arch(self):
+ if self.tagByName('sourcepackage') == 1:
+ return 'src'
+ else:
+ return self.tagByName('arch')
+
+ def _stringToNEVR(self, string):
+ i = string.rfind("-", 0, string.rfind("-")-1)
+ name = string[:i]
+ (epoch, ver, rel) = self._stringToVersion(string[i+1:])
+ return (name, epoch, ver, rel)
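+
+    # Worked example (illustrative): _stringToNEVR('foo-0:1.1-3') returns
+    # ('foo', '0', '1.1', '3'); the nested rfind steps back past the final
+    # dash so the split lands between name and version-release.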
+
+    def _getLength(self, in_data):
+        """interpret a big-endian byte string as an unsigned integer"""
+        length = 0
+ for val in in_data:
+ length = length * 256
+ length += ord(val)
+ return length
+
+ def _getOldInfo(self, fo):
+ try:
+ compobj = gzip.GzipFile("", "rb", 9, fo)
+ except:
+ raise zlibError("Data not stored in gzip format")
+
+ if compobj.read(4)[:3] != "DLT":
+ raise Exception("Not a deltarpm")
+
+ nevr_length = self._getLength(compobj.read(4))
+ nevr = compobj.read(nevr_length).strip("\x00")
+ seq_length = self._getLength(compobj.read(4))
+ seq = compobj.read(seq_length)
+ hex_seq = ""
+ for char in seq:
+ hex_seq += str("%02x" % ord(char))
+ self.oldnevrstring = nevr
+ self.oldnevr = self._stringToNEVR(nevr)
+ self.sequence = hex_seq
+ compobj.close()
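+
+    # Layout of the old-rpm block parsed above, as this code reads it after
+    # gzip decompression (a sketch, not the full deltarpm spec):
+    #   4 bytes  magic: "DLT" plus a version byte
+    #   4 bytes  big-endian length of the NEVR string
+    #   n bytes  NUL-padded NEVR of the rpm the delta applies against
+    #   4 bytes  big-endian length of the sequence blob
+    #   m bytes  sequence blob, hex-encoded into self.sequence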
+
+ def _stringToVersion(self, strng):
+ i = strng.find(':')
+ if i != -1:
+ epoch = strng[:i]
+ else:
+ epoch = '0'
+ j = strng.find('-')
+ if j != -1:
+ if strng[i + 1:j] == '':
+ version = None
+ else:
+ version = strng[i + 1:j]
+ release = strng[j + 1:]
+ else:
+ if strng[i + 1:] == '':
+ version = None
+ else:
+ version = strng[i + 1:]
+ release = None
+ return (epoch, version, release)
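+
+    # Worked examples of the parse above (illustrative):
+    #   '3:1.2-4' -> ('3', '1.2', '4')
+    #   '1.2-4'   -> ('0', '1.2', '4')
+    #   '1.2'     -> ('0', '1.2', None)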
+
+ ###########
+ # Title: Remove duplicates from a sequence
+ # Submitter: Tim Peters
+ # From: http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52560
+
+ def _uniq(self,s):
+ """Return a list of the elements in s, but without duplicates.
+
+ For example, unique([1,2,3,1,2,3]) is some permutation of [1,2,3],
+ unique("abcabc") some permutation of ["a", "b", "c"], and
+ unique(([1, 2], [2, 3], [1, 2])) some permutation of
+ [[2, 3], [1, 2]].
+
+ For best speed, all sequence elements should be hashable. Then
+ unique() will usually work in linear time.
+
+ If not possible, the sequence elements should enjoy a total
+ ordering, and if list(s).sort() doesn't raise TypeError it's
+ assumed that they do enjoy a total ordering. Then unique() will
+ usually work in O(N*log2(N)) time.
+
+ If that's not possible either, the sequence elements must support
+ equality-testing. Then unique() will usually work in quadratic
+ time.
+ """
+
+ n = len(s)
+ if n == 0:
+ return []
+
+ # Try using a dict first, as that's the fastest and will usually
+ # work. If it doesn't work, it will usually fail quickly, so it
+ # usually doesn't cost much to *try* it. It requires that all the
+ # sequence elements be hashable, and support equality comparison.
+ u = {}
+ try:
+ for x in s:
+ u[x] = 1
+ except TypeError:
+ del u # move on to the next method
+ else:
+ return u.keys()
+
+ # We can't hash all the elements. Second fastest is to sort,
+ # which brings the equal elements together; then duplicates are
+ # easy to weed out in a single pass.
+ # NOTE: Python's list.sort() was designed to be efficient in the
+ # presence of many duplicate elements. This isn't true of all
+ # sort functions in all languages or libraries, so this approach
+ # is more effective in Python than it may be elsewhere.
+ try:
+ t = list(s)
+ t.sort()
+ except TypeError:
+ del t # move on to the next method
+ else:
+ assert n > 0
+ last = t[0]
+ lasti = i = 1
+ while i < n:
+ if t[i] != last:
+ t[lasti] = last = t[i]
+ lasti += 1
+ i += 1
+ return t[:lasti]
+
+ # Brute force is all that's left.
+ u = []
+ for x in s:
+ if x not in u:
+ u.append(x)
+ return u
+
+ def tagByName(self, tag):
+ data = self.hdr[tag]
+ if type(data) is types.ListType:
+ if len(data) > 0:
+ return data[0]
+ else:
+ return ''
+ else:
+ return data
+
+ def listTagByName(self, tag):
+ """take a tag that should be a list and make sure it is one"""
+ lst = []
+ data = self.hdr[tag]
+ if data is None:
+ return lst
+
+ if type(data) is types.ListType:
+ lst.extend(data)
+ else:
+ lst.append(data)
+ return lst
+
+ def epoch(self):
+ if self.hdr['epoch'] is None:
+ return 0
+ else:
+ return self.tagByName('epoch')
+
+ def doChecksumCache(self, fo):
+ """return a checksum for a package:
+ - check if the checksum cache is enabled
+ if not - return the checksum
+ if so - check to see if it has a cache file
+ if so, open it and return the first line's contents
+ if not, grab the checksum and write it to a file for this pkg
+ """
+ if not self.options['cache']:
+ return getChecksum(self.options['sumtype'], fo)
+
+ csumtag = os.path.basename(self.relativepath) + ".cache"
+ csumfile = '%s/%s' % (self.options['cachedir'], csumtag)
+ if os.path.exists(csumfile) and self.mtime <= os.stat(csumfile)[8]:
+ csumo = open(csumfile, 'r')
+ checksum = csumo.readline()
+ csumo.close()
+
+ else:
+ checksum = getChecksum(self.options['sumtype'], fo)
+ csumo = open(csumfile, 'w')
+ csumo.write(checksum)
+ csumo.close()
+
+ return checksum
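+
+    # Cache layout sketch (hypothetical names): with options['cachedir'] set
+    # to '/var/cache/presto' and a package 'drpms/foo.drpm', the digest is
+    # memoized in '/var/cache/presto/foo.drpm.cache' and reused for as long
+    # as the cache file's mtime is not older than the package's.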
+
+
+
+def generateXML(doc, node, formatns, drpmObj, sumtype, pkgDeltas):
+ """takes an xml doc object and a package metadata entry node, populates a
+ package node with the md information"""
+ name = drpmObj.tagByName('name')
+ arch = drpmObj.arch()
+ epoch = str(drpmObj.epoch())
+ ver = str(drpmObj.tagByName('version'))
+ rel = str(drpmObj.tagByName('release'))
+ if not pkgDeltas.has_key('%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)):
+ pkgNode = node.newChild(None, "package", None)
+ pkgNode.newProp('type', 'rpm')
+ pkgNode.newChild(None, 'name', name)
+ pkgNode.newChild(None, 'arch', arch)
+ version = pkgNode.newChild(None, 'version', None)
+ version.newProp('epoch', epoch)
+ version.newProp('ver', ver)
+ version.newProp('rel', rel)
+ deltas = pkgNode.newChild(None, 'deltas', None)
+ pkgDeltas['%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)] = deltas
+ else:
+ deltas = pkgDeltas['%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)]
+ (oldname, oldepoch, oldver, oldrel) = drpmObj.oldnevr
+ drpmNode = deltas.newChild(None, "oldrpm", None)
+ if name != oldname:
+ drpmNode.newChild(None, 'name', oldname)
+ # oldrpm arch is not stored in drpm, so we can only work within same arch
+ version = drpmNode.newChild(None, 'version', None)
+ if epoch != oldepoch:
+ version.newProp('epoch', oldepoch)
+ if ver != oldver:
+ version.newProp('ver', oldver)
+ version.newProp('rel', oldrel)
+ drpmNode.newChild(None, 'drpm_filename', drpmObj.relativepath)
+ drpmNode.newChild(None, 'size', str(drpmObj.size))
+ drpmNode.newChild(None, 'sequence', '%s-%s' % (drpmObj.oldnevrstring, drpmObj.sequence))
+ checksum = drpmNode.newChild(None, 'checksum', drpmObj.pkgid)
+ checksum.newProp('type', drpmObj.options['sumtype'])
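+
+# The node tree built above serializes to something like this (all values
+# illustrative; name/epoch on <oldrpm> appear only when they differ):
+#
+#   <package type="rpm">
+#     <name>foo</name>
+#     <arch>i386</arch>
+#     <version epoch="0" ver="1.2" rel="4"/>
+#     <deltas>
+#       <oldrpm>
+#         <version ver="1.1" rel="3"/>
+#         <drpm_filename>drpms/foo-1.1-3_1.2-4.i386.drpm</drpm_filename>
+#         <size>12345</size>
+#         <sequence>foo-0:1.1-3-0123456789abcdef</sequence>
+#         <checksum type="sha">deadbeef</checksum>
+#       </oldrpm>
+#     </deltas>
+#   </package>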
+
+
+def repoXML(node, cmds):
+ """generate the repomd.xml file that stores the info on the other files"""
+ sumtype = cmds['sumtype']
+    workfiles = [(cmds['prestofile'], 'deltas')]
+
+ for (file, ftype) in workfiles:
+ zfo = _gzipOpen(os.path.join(cmds['outputdir'], cmds['tempdir'], file))
+ uncsum = getChecksum(sumtype, zfo)
+ zfo.close()
+ csum = getChecksum(sumtype, os.path.join(cmds['outputdir'], cmds['tempdir'], file))
+ timestamp = os.stat(os.path.join(cmds['outputdir'], cmds['tempdir'], file))[8]
+ data = node.newChild(None, 'data', None)
+ data.newProp('type', ftype)
+ location = data.newChild(None, 'location', None)
+ if cmds['baseurl'] is not None:
+ location.newProp('xml:base', cmds['baseurl'])
+ location.newProp('href', os.path.join(cmds['finaldir'], file))
+ checksum = data.newChild(None, 'checksum', csum)
+ checksum.newProp('type', sumtype)
+ timestamp = data.newChild(None, 'timestamp', str(timestamp))
+ unchecksum = data.newChild(None, 'open-checksum', uncsum)
+ unchecksum.newProp('type', sumtype)
+