summary | refs | log | tree | commit | diff | stats
path: root/createprestorepo/dumpMetadata.py
diff options
context:
space:
mode:
author: Jonathan Dieter <jdieter@gmail.com> 2007-06-19 20:58:07 +0300
committer: Jonathan Dieter <jdieter@gmail.com> 2007-06-19 20:58:07 +0300
commit: dce0600bc64c793ba6e8f67c56c286d8d97e7c4c (patch)
tree: 71c559e031b3c10ba56a187e0a017f09d4d25137 /createprestorepo/dumpMetadata.py
parent: 93b2295180471308e969640472bdc601d1f10015 (diff)
download: presto-dce0600bc64c793ba6e8f67c56c286d8d97e7c4c.zip
presto-dce0600bc64c793ba6e8f67c56c286d8d97e7c4c.tar.gz
presto-dce0600bc64c793ba6e8f67c56c286d8d97e7c4c.tar.xz
Many bugfixes and a few enhancements
Signed-off-by: Jonathan Dieter <jdieter@gmail.com>
Diffstat (limited to 'createprestorepo/dumpMetadata.py')
-rw-r--r-- createprestorepo/dumpMetadata.py 496
1 files changed, 0 insertions, 496 deletions
diff --git a/createprestorepo/dumpMetadata.py b/createprestorepo/dumpMetadata.py
deleted file mode 100644
index 6969400..0000000
--- a/createprestorepo/dumpMetadata.py
+++ /dev/null
@@ -1,496 +0,0 @@
-#!/usr/bin/python -t
-# base classes and functions for dumping out package Metadata
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-# Copyright 2004 Duke University
-
-# $Id: dumpMetadata.py,v 1.36 2006/02/21 20:10:08 pnasrat Exp $
-
-import os
-import rpm
-import exceptions
-import md5
-import sha
-import types
-import struct
-import re
-import stat
-
-# done to fix gzip randomly changing the checksum
-import gzip
-from zlib import error as zlibError
-from gzip import write32u, FNAME
-
# Names exported by ``from dumpMetadata import *``.  The list used to
# include "open", a name this module never defines, which made any
# star-import fail with AttributeError.
__all__ = ["GzipFile"]
-
class GzipFile(gzip.GzipFile):
    """gzip.GzipFile whose header carries a zero timestamp.

    Overriding the header writer keeps the compressed output, and thus
    its checksum, from changing between runs over the same data.
    """

    def _write_gzip_header(self):
        """Write the gzip member header with the mtime field forced to 0."""
        out = self.fileobj
        out.write('\037\213')             # magic header
        out.write('\010')                 # compression method
        # Name stored in the header: the file name without its last three
        # characters (the ".gz" suffix).
        stored_name = self.filename[:-3]
        if stored_name:
            flag_bits = FNAME
        else:
            flag_bits = 0
        out.write(chr(flag_bits))
        write32u(out, long(0))            # mtime, always zero
        out.write('\002')                 # extra flags (RFC 1952: max compression)
        out.write('\377')                 # OS field (RFC 1952: unknown)
        if stored_name:
            out.write(stored_name + '\000')
-
-
-def _gzipOpen(filename, mode="rb", compresslevel=9):
- return GzipFile(filename, mode, compresslevel)
-
-
-
def returnFD(filename):
    """Open *filename* read-only and return the OS-level file descriptor.

    Raises MDError when the file cannot be opened.  The descriptor is not
    closed here; that is left to the caller.
    """
    try:
        return os.open(filename, os.O_RDONLY)
    except OSError:
        raise MDError("Error opening file")
-
def returnHdr(ts, package):
    """hand back the rpm header or raise an Error if the pkg is fubar

    ts      -- rpm transaction set used to read the header
    package -- path of an rpm file, or an already-open file descriptor

    Raises MDError when the file cannot be opened or no rpm header can be
    read from it.  A descriptor opened here is closed again and the
    transaction set's VS flags are put back to 0 on every exit path,
    including failures; a descriptor passed in is left open.
    """
    opened_here = isinstance(package, str)
    if opened_here:
        try:
            fdno = os.open(package, os.O_RDONLY)
        except OSError:
            raise MDError("Error opening file")
    else:
        fdno = package  # let's assume this is an fdno and go with it :)

    try:
        # Going by the flag names, signature and MD5 checks are switched
        # off while the header is read.
        ts.setVSFlags(rpm._RPMVSF_NOSIGNATURES
                      | rpm.RPMVSF_NOMD5
                      | rpm.RPMVSF_NEEDPAYLOAD)
        try:
            hdr = ts.hdrFromFdno(fdno)
        except rpm.error:
            raise MDError("Error opening package")
        if type(hdr) != rpm.hdr:
            raise MDError("Error opening package")
    finally:
        ts.setVSFlags(0)
        if opened_here:
            os.close(fdno)

    return hdr
-
def getChecksum(sumtype, file, CHUNK=2**16):
    """takes filename, hand back Checksum of it

    sumtype -- 'md5' or 'sha'
    file    -- /path/to/file, or an already-open file-like object
    CHUNK   -- read size in bytes, 65536 by default

    Returns the hex digest as a string.  Raises MDError for an unknown
    sumtype, or when the file cannot be opened or read.  A file opened
    here is always closed; a file object passed in is left open.
    """
    try:
        import hashlib
        digests = {'md5': hashlib.md5, 'sha': hashlib.sha1}
    except ImportError:
        # Interpreters without hashlib only have the standalone modules.
        digests = {'md5': md5.new, 'sha': sha.new}

    # Checked outside the try block below so this message is not replaced
    # by the generic "Error opening file" one.
    if sumtype not in digests:
        raise MDError('Error Checksumming file, wrong checksum type %s' % sumtype)
    digest = digests[sumtype]()

    opened_here = isinstance(file, str)
    try:
        if opened_here:
            fo = open(file, 'rb', CHUNK)
        else:
            fo = file  # assume it's a file-like-object
        try:
            # chunking brazenly lifted from Ryan Tomayko
            chunk = fo.read(CHUNK)
            while chunk:
                digest.update(chunk)
                chunk = fo.read(CHUNK)
        finally:
            if opened_here:
                fo.close()
    except (KeyboardInterrupt, SystemExit):
        # Never disguise an interrupt or exit request as a metadata error.
        raise
    except Exception:
        raise MDError('Error opening file for checksum: %s' % file)

    return digest.hexdigest()
-
-
def utf8String(string):
    """Return *string* in a form that is safe to write out as UTF-8.

    None becomes ''.  unicode objects and pure-ASCII byte strings are
    handed back untouched.  Any other byte string is decoded with the
    first candidate encoding that round-trips and returned re-encoded as
    UTF-8.  If no candidate fits, every character above 127 is replaced
    by '?'.
    """
    if string is None:
        return ''
    if isinstance(string, unicode):
        return string

    try:
        unicode(string, 'ascii')
    except UnicodeError:
        pass
    else:
        return string

    for candidate in ('utf-8', 'iso-8859-1', 'iso-8859-15', 'iso-8859-2'):
        try:
            decoded = unicode(string, candidate)
        except UnicodeError:
            continue
        if decoded.encode(candidate) == string:
            return decoded.encode('utf-8')

    # No candidate encoding matched: keep ASCII, mask everything else.
    masked = []
    for char in string:
        if ord(char) > 127:
            masked.append('?')
        else:
            masked.append(char)
    return ''.join(masked)
-
-
def byteranges(file):
    """takes an rpm file or fileobject and returns byteranges for location of the header

    file -- path of an rpm file, or an open binary file object

    Returns (hdrstart, hdrend) as byte offsets.  A file opened here is
    opened in binary mode and closed on every exit path; a file object
    passed in is left open.
    """
    opened_here = isinstance(file, str)
    if opened_here:
        fo = open(file, 'rb')
    else:
        fo = file

    try:
        # read in past lead and first 8 bytes of sig header
        fo.seek(104)
        # two big-endian 32-bit counts: index entries, then data bytes
        (sigindex, sigdata) = struct.unpack('>II', fo.read(8))
        # 112 bytes in
        # each index is 4 32bit segments - so each is 16 bytes
        sigsize = sigdata + sigindex * 16
        # we have to round off to the next 8 byte boundary
        disttoboundary = (8 - sigsize % 8) % 8
        # 112 bytes - 96 == lead, 8 = magic and reserved, 8 == sig header data
        hdrstart = 112 + sigsize + disttoboundary

        # go to the start of the header, past the magic number and
        # reserved bytes
        fo.seek(hdrstart + 8)
        (hdrindex, hdrdata) = struct.unpack('>II', fo.read(8))

        # each index is 4 32bit segments - so each is 16 bytes; the extra
        # 16 covers the bytes just consumed at hdrstart (8 skipped plus
        # the two 4-byte counts)
        hdrsize = hdrdata + hdrindex * 16 + 16

        # header end is hdrstart + hdrsize
        hdrend = hdrstart + hdrsize
    finally:
        if opened_here:
            fo.close()

    return (hdrstart, hdrend)
-
-
class MDError(Exception):
    """Error raised when package metadata cannot be read or generated.

    args -- optional message; str(err) gives it back unchanged.
    """

    def __init__(self, args=None):
        # The message goes through Exception.__init__ instead of a direct
        # ``self.args = args`` assignment.  Where ``args`` is a
        # tuple-coercing property, assigning a string splits it into
        # single characters and assigning None raises TypeError.
        if args is None:
            Exception.__init__(self)
        else:
            Exception.__init__(self, args)
-
-
-
class RpmMetaData:
    """each drpm is one object, you pass it an rpm file
    it opens the file, and pulls the information out in bite-sized chunks :)
    """

    mode_cache = {}

    def __init__(self, ts, basedir, filename, options, is_drpm):
        """Read size, mtime, header, checksum and header byte range of a package.

        ts       -- rpm transaction set, handed to returnHdr
        basedir  -- directory that filename is relative to
        filename -- package path relative to basedir
        options  -- dict; 'baseurl' is read here, 'cache', 'sumtype' and
                    'cachedir' are read by doChecksumCache
        is_drpm  -- when true, the old package's NEVR and sequence are
                    also read from the data after the header

        Raises MDError when the file cannot be stat'ed, opened or parsed.
        The file is closed again before returning, on failure too.
        """
        fullpath = os.path.join(basedir, filename)
        try:
            stats = os.stat(fullpath)
        except OSError:
            raise MDError("Error Stat'ing file %s %s" % (basedir, filename))
        self.size = stats[stat.ST_SIZE]
        self.mtime = stats[stat.ST_MTIME]
        self.options = options
        self.localurl = options['baseurl']
        self.relativepath = filename
        self.is_drpm = False

        fd = returnFD(fullpath)
        fo = None
        try:
            self.hdr = returnHdr(ts, fd)
            os.lseek(fd, 0, 0)
            fo = os.fdopen(fd, 'rb')
            self.pkgid = self.doChecksumCache(fo)
            fo.seek(0)
            (self.rangestart, self.rangeend) = byteranges(fo)
            if is_drpm:
                fo.seek(self.rangeend)
                self._getOldInfo(fo)
                self.is_drpm = True
        finally:
            # Once the file object exists it owns the descriptor; before
            # that, the raw descriptor has to be closed directly.
            if fo is not None:
                fo.close()
            else:
                os.close(fd)

    def arch(self):
        """Return 'src' when the sourcepackage tag is 1, else the arch tag."""
        if self.tagByName('sourcepackage') == 1:
            return 'src'
        return self.tagByName('arch')

    def _stringToNEVR(self, string):
        """Split 'name-[epoch:]version-release' into (name, epoch, ver, rel).

        The split point is the second-to-last dash, so the name itself may
        contain dashes.
        """
        i = string.rfind("-", 0, string.rfind("-") - 1)
        name = string[:i]
        (epoch, ver, rel) = self._stringToVersion(string[i + 1:])
        return (name, epoch, ver, rel)

    def _getLength(self, in_data):
        """Decode a big-endian unsigned integer from a string of bytes."""
        length = 0
        for val in in_data:
            length = length * 256 + ord(val)
        return length

    def _getOldInfo(self, fo):
        """Read the old package's NEVR and sequence from deltarpm data.

        fo is read as a gzip stream from its current position.  Sets
        self.oldnevrstring, self.oldnevr and self.sequence (the sequence
        as lowercase hex).  Raises Exception("Not a deltarpm") when the
        stream does not start with "DLT".
        """
        try:
            compobj = gzip.GzipFile("", "rb", 9, fo)
        except Exception:
            raise zlibError("Data not stored in gzip format")

        try:
            if compobj.read(4)[:3] != "DLT":
                raise Exception("Not a deltarpm")

            # Each field is a 4-byte big-endian length followed by data.
            nevr_length = self._getLength(compobj.read(4))
            nevr = compobj.read(nevr_length).strip("\x00")
            seq_length = self._getLength(compobj.read(4))
            seq = compobj.read(seq_length)
        finally:
            compobj.close()

        self.oldnevrstring = nevr
        self.oldnevr = self._stringToNEVR(nevr)
        self.sequence = "".join(["%02x" % ord(char) for char in seq])

    def _stringToVersion(self, strng):
        """Split '[epoch:]version[-release]' into (epoch, version, release).

        A missing epoch becomes '0', an empty version becomes None, and
        release is None when the string contains no dash.
        """
        i = strng.find(':')
        if i != -1:
            epoch = strng[:i]
        else:
            epoch = '0'
        j = strng.find('-')
        if j != -1:
            version = strng[i + 1:j] or None
            release = strng[j + 1:]
        else:
            version = strng[i + 1:] or None
            release = None
        return (epoch, version, release)

    ###########
    # Title: Remove duplicates from a sequence
    # Submitter: Tim Peters
    # From: http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52560

    def _uniq(self, s):
        """Return a list of the elements in s, but without duplicates.

        For example, unique([1,2,3,1,2,3]) is some permutation of [1,2,3],
        unique("abcabc") some permutation of ["a", "b", "c"], and
        unique(([1, 2], [2, 3], [1, 2])) some permutation of
        [[2, 3], [1, 2]].

        For best speed, all sequence elements should be hashable.  Then
        unique() will usually work in linear time.

        If not possible, the sequence elements should enjoy a total
        ordering, and if list(s).sort() doesn't raise TypeError it's
        assumed that they do enjoy a total ordering.  Then unique() will
        usually work in O(N*log2(N)) time.

        If that's not possible either, the sequence elements must support
        equality-testing.  Then unique() will usually work in quadratic
        time.
        """
        n = len(s)
        if n == 0:
            return []

        # Try using a dict first, as that's the fastest and will usually
        # work.  If it doesn't work, it will usually fail quickly, so it
        # usually doesn't cost much to *try* it.  It requires that all the
        # sequence elements be hashable, and support equality comparison.
        u = {}
        try:
            for x in s:
                u[x] = 1
        except TypeError:
            pass  # move on to the next method
        else:
            return list(u)

        # We can't hash all the elements.  Second fastest is to sort,
        # which brings the equal elements together; then duplicates are
        # easy to weed out in a single pass.
        try:
            t = list(s)
            t.sort()
        except TypeError:
            pass  # move on to the next method
        else:
            last = t[0]
            lasti = i = 1
            while i < n:
                if t[i] != last:
                    t[lasti] = last = t[i]
                    lasti += 1
                i += 1
            return t[:lasti]

        # Brute force is all that's left.
        u = []
        for x in s:
            if x not in u:
                u.append(x)
        return u

    def tagByName(self, tag):
        """Return the header value for tag; for a list, its first item or ''."""
        data = self.hdr[tag]
        if isinstance(data, list):
            if data:
                return data[0]
            return ''
        return data

    def listTagByName(self, tag):
        """take a tag that should be a list and make sure it is one"""
        data = self.hdr[tag]
        if data is None:
            return []
        if isinstance(data, list):
            return list(data)
        return [data]

    def epoch(self):
        """Return the header's epoch, or 0 when the header has none."""
        if self.hdr['epoch'] is None:
            return 0
        return self.tagByName('epoch')

    def doChecksumCache(self, fo):
        """return a checksum for a package:
           - check if the checksum cache is enabled
              if not - return the checksum
              if so - check to see if it has a cache file
                if so, open it and return the first line's contents
                if not, grab the checksum and write it to a file for this pkg

        A cache file is only used when it is at least as new as the
        package (compared by mtime).
        """
        if not self.options['cache']:
            return getChecksum(self.options['sumtype'], fo)

        csumtag = os.path.basename(self.relativepath) + ".cache"
        csumfile = '%s/%s' % (self.options['cachedir'], csumtag)
        if os.path.exists(csumfile) and self.mtime <= os.stat(csumfile)[stat.ST_MTIME]:
            csumo = open(csumfile, 'r')
            try:
                checksum = csumo.readline()
            finally:
                csumo.close()
        else:
            checksum = getChecksum(self.options['sumtype'], fo)
            csumo = open(csumfile, 'w')
            try:
                csumo.write(checksum)
            finally:
                csumo.close()

        return checksum
-
-
-
def generateXML(doc, node, formatns, drpmObj, sumtype, pkgDeltas):
    """Add one delta rpm to the package metadata tree under *node*.

    A <package> element, with its <deltas> child, is created the first
    time a given name-epoch:ver-rel.arch is seen and remembered in
    pkgDeltas; later deltas for the same package reuse that <deltas>
    node.  An <oldrpm> entry describing drpmObj is then appended to it.
    Only the parts of the old version that differ from the new one are
    written, except the release, which is always written.

    doc, formatns and sumtype are accepted but not used here; the
    checksum type is taken from drpmObj.options['sumtype'].
    """
    name = drpmObj.tagByName('name')
    arch = drpmObj.arch()
    epoch = str(drpmObj.epoch())
    ver = str(drpmObj.tagByName('version'))
    rel = str(drpmObj.tagByName('release'))
    pkg_key = '%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)

    if pkg_key in pkgDeltas:
        deltas = pkgDeltas[pkg_key]
    else:
        pkg_node = node.newChild(None, "package", None)
        pkg_node.newProp('type', 'rpm')
        pkg_node.newChild(None, 'name', name)
        pkg_node.newChild(None, 'arch', arch)
        new_version = pkg_node.newChild(None, 'version', None)
        new_version.newProp('epoch', epoch)
        new_version.newProp('ver', ver)
        new_version.newProp('rel', rel)
        deltas = pkg_node.newChild(None, 'deltas', None)
        pkgDeltas[pkg_key] = deltas

    (oldname, oldepoch, oldver, oldrel) = drpmObj.oldnevr
    old_node = deltas.newChild(None, "oldrpm", None)
    if name != oldname:
        old_node.newChild(None, 'name', oldname)
    # oldrpm arch is not stored in drpm, so we can only work within same arch
    old_version = old_node.newChild(None, 'version', None)
    if epoch != oldepoch:
        old_version.newProp('epoch', oldepoch)
    if ver != oldver:
        old_version.newProp('ver', oldver)
    old_version.newProp('rel', oldrel)
    old_node.newChild(None, 'drpm_filename', drpmObj.relativepath)
    old_node.newChild(None, 'size', str(drpmObj.size))
    sequence = '%s-%s' % (drpmObj.oldnevrstring, drpmObj.sequence)
    old_node.newChild(None, 'sequence', sequence)
    csum_node = old_node.newChild(None, 'checksum', drpmObj.pkgid)
    csum_node.newProp('type', drpmObj.options['sumtype'])
-
-
def repoXML(node, cmds):
    """generate the repomd.xml file that stores the info on the other files

    node -- XML node that receives one <data> child per metadata file
    cmds -- dict; the keys read are 'sumtype', 'prestofile', 'outputdir',
            'tempdir', 'finaldir' and 'baseurl'

    Each <data> entry records the file's location, the checksum of the
    file as stored, its mtime, and the checksum of its content as read
    back through gzip (the "open-checksum").
    """
    sumtype = cmds['sumtype']
    workfiles = [(cmds['prestofile'], 'deltas')]

    for (fname, ftype) in workfiles:
        fullpath = os.path.join(cmds['outputdir'], cmds['tempdir'], fname)

        zfo = _gzipOpen(fullpath)
        try:
            uncsum = getChecksum(sumtype, zfo)
        finally:
            # close the gzip handle even when checksumming fails
            zfo.close()
        csum = getChecksum(sumtype, fullpath)
        timestamp = os.stat(fullpath)[8]  # index 8 is st_mtime

        data = node.newChild(None, 'data', None)
        data.newProp('type', ftype)
        location = data.newChild(None, 'location', None)
        if cmds['baseurl'] is not None:
            location.newProp('xml:base', cmds['baseurl'])
        location.newProp('href', os.path.join(cmds['finaldir'], fname))
        checksum = data.newChild(None, 'checksum', csum)
        checksum.newProp('type', sumtype)
        data.newChild(None, 'timestamp', str(timestamp))
        unchecksum = data.newChild(None, 'open-checksum', uncsum)
        unchecksum.newProp('type', sumtype)