# base classes and functions for dumping out package Metadata
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Library General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Copyright 2004 Duke University
# $Id: dumpMetadata.py,v 1.36 2006/02/21 20:10:08 pnasrat Exp $

import os
import rpm
import exceptions
import md5
import sha
import types
import struct
import re
import stat

# done to fix gzip randomly changing the checksum
import gzip
from zlib import error as zlibError
from gzip import write32u, FNAME

__all__ = ["GzipFile", "open"]

class GzipFile(gzip.GzipFile):
    def _write_gzip_header(self):
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, long(0))            # mtime of 0 keeps output reproducible
        self.fileobj.write('\002')
        self.fileobj.write('\377')
        if fname:
            self.fileobj.write(fname + '\000')


def _gzipOpen(filename, mode="rb", compresslevel=9):
    return GzipFile(filename, mode, compresslevel)

def returnFD(filename):
    try:
        fdno = os.open(filename, os.O_RDONLY)
    except OSError:
        raise MDError, "Error opening file"
    return fdno

def returnHdr(ts, package):
    """hand back the rpm header or raise an Error if the pkg is fubar"""
    opened_here = 0
    try:
        if type(package) is types.StringType:
            opened_here = 1
            fdno = os.open(package, os.O_RDONLY)
        else:
            fdno = package # let's assume this is an fdno and go with it :)
    except OSError:
        raise MDError, "Error opening file"
    ts.setVSFlags((rpm._RPMVSF_NOSIGNATURES|rpm.RPMVSF_NOMD5|rpm.RPMVSF_NEEDPAYLOAD))
    try:
        hdr = ts.hdrFromFdno(fdno)
    except rpm.error:
        raise MDError, "Error opening package"
    if type(hdr) != rpm.hdr:
        raise MDError, "Error opening package"
    ts.setVSFlags(0)

    if opened_here:
        os.close(fdno)
        del fdno

    return hdr

def getChecksum(sumtype, file, CHUNK=2**16):
    """takes filename, hand back Checksum of it
       sumtype = md5 or sha
       filename = /path/to/file
       CHUNK=65536 by default"""

    # pick the digest first, so a bad sumtype reports the right error
    # instead of being masked by the bare except below
    if sumtype == 'md5':
        sum = md5.new()
    elif sumtype == 'sha':
        sum = sha.new()
    else:
        raise MDError, 'Error Checksumming file, wrong checksum type %s' % sumtype

    # chunking brazenly lifted from Ryan Tomayko
    opened_here = 0
    try:
        if type(file) is not types.StringType:
            fo = file # assume it's a file-like-object
        else:
            opened_here = 1
            fo = open(file, 'rb', CHUNK)

        chunk = fo.read(CHUNK)
        while chunk:
            sum.update(chunk)
            chunk = fo.read(CHUNK)

        if opened_here:
            fo.close()
            del fo

        return sum.hexdigest()
    except:
        raise MDError, 'Error opening file for checksum: %s' % file
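
# Example (illustrative sketch, never called by this module): getChecksum
# accepts either a path or an open file object; the path below is
# hypothetical.
def _exampleChecksum():
    pathsum = getChecksum('sha', '/tmp/example.drpm')   # checksum from a path
    fo = open('/tmp/example.drpm', 'rb')
    fosum = getChecksum('md5', fo)                      # checksum from a file object
    fo.close()
    return pathsum, fosum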
def utf8String(string):
    """hands back a unicoded string"""
    if string is None:
        return ''
    elif isinstance(string, unicode):
        return string
    try:
        x = unicode(string, 'ascii')  # pure ascii passes through untouched
        return string
    except UnicodeError:
        encodings = ['utf-8', 'iso-8859-1', 'iso-8859-15', 'iso-8859-2']
        for enc in encodings:
            try:
                x = unicode(string, enc)
            except UnicodeError:
                pass
            else:
                if x.encode(enc) == string:
                    return x.encode('utf-8')
    # nothing round-tripped cleanly; replace high bytes with '?'
    newstring = ''
    for char in string:
        if ord(char) > 127:
            newstring = newstring + '?'
        else:
            newstring = newstring + char
    return newstring

def byteranges(file):
    """takes an rpm file or fileobject and returns byteranges for location of the header"""
    opened_here = 0
    if type(file) is not types.StringType:
        fo = file
    else:
        opened_here = 1
        fo = open(file, 'r')
    # read in past lead and first 8 bytes of sig header
    fo.seek(104)
    # 104 bytes in
    binindex = fo.read(4)
    # 108 bytes in
    (sigindex, ) = struct.unpack('>I', binindex)
    bindata = fo.read(4)
    # 112 bytes in
    (sigdata, ) = struct.unpack('>I', bindata)
    # each index is 4 32bit segments - so each is 16 bytes
    sigindexsize = sigindex * 16
    sigsize = sigdata + sigindexsize
    # we have to round off to the next 8 byte boundary
    disttoboundary = (sigsize % 8)
    if disttoboundary != 0:
        disttoboundary = 8 - disttoboundary
    # 112 bytes - 96 == lead, 8 = magic and reserved, 8 == sig header data
    hdrstart = 112 + sigsize + disttoboundary

    fo.seek(hdrstart) # go to the start of the header
    fo.seek(8, 1) # read past the magic number and reserved bytes

    binindex = fo.read(4)
    (hdrindex, ) = struct.unpack('>I', binindex)
    bindata = fo.read(4)
    (hdrdata, ) = struct.unpack('>I', bindata)

    # each index is 4 32bit segments - so each is 16 bytes
    hdrindexsize = hdrindex * 16
    # add 16 to the hdrsize to account for the 16 bytes of misc data b/t the
    # end of the sig and the header.
    hdrsize = hdrdata + hdrindexsize + 16

    # header end is hdrstart + hdrsize
    hdrend = hdrstart + hdrsize
    if opened_here:
        fo.close()
        del fo
    return (hdrstart, hdrend)

class MDError(exceptions.Exception):
    def __init__(self, args=None):
        exceptions.Exception.__init__(self)
        self.args = args

class RpmMetaData:
    """each drpm is one object, you pass it an rpm file
       it opens the file, and pulls the information out in bite-sized chunks :)
    """

    mode_cache = {}

    def __init__(self, ts, basedir, filename, options, is_drpm):
        try:
            stats = os.stat(os.path.join(basedir, filename))
            self.size = stats[stat.ST_SIZE]
            self.mtime = stats[stat.ST_MTIME]
            del stats
        except OSError, e:
            raise MDError, "Error Stat'ing file %s %s" % (basedir, filename)
        self.options = options
        self.localurl = options['baseurl']
        self.relativepath = filename
        fd = returnFD(os.path.join(basedir, filename))
        self.hdr = returnHdr(ts, fd)
        os.lseek(fd, 0, 0)
        fo = os.fdopen(fd, 'rb')
        self.pkgid = self.doChecksumCache(fo)
        fo.seek(0)
        (self.rangestart, self.rangeend) = byteranges(fo)
        self.is_drpm = False
        if is_drpm:
            fo.seek(self.rangeend)
            self._getOldInfo(fo)
            self.is_drpm = True
        del fo
        del fd

    def arch(self):
        if self.tagByName('sourcepackage') == 1:
            return 'src'
        else:
            return self.tagByName('arch')

    def _stringToNEVR(self, string):
        i = string.rfind("-", 0, string.rfind("-")-1)
        name = string[:i]
        (epoch, ver, rel) = self._stringToVersion(string[i+1:])
        return (name, epoch, ver, rel)

    def _getLength(self, in_data):
        length = 0
        for val in in_data:
            length = length * 256
            length += ord(val)
        return length

    def _getOldInfo(self, fo):
        try:
            compobj = gzip.GzipFile("", "rb", 9, fo)
        except:
            raise zlibError("Data not stored in gzip format")
        if compobj.read(4)[:3] != "DLT":
            raise Exception("Not a deltarpm")
        nevr_length = self._getLength(compobj.read(4))
        nevr = compobj.read(nevr_length).strip("\x00")
        seq_length = self._getLength(compobj.read(4))
        seq = compobj.read(seq_length)
        hex_seq = ""
        for char in seq:
            hex_seq += str("%02x" % ord(char))
        self.oldnevrstring = nevr
        self.oldnevr = self._stringToNEVR(nevr)
        self.sequence = hex_seq
        compobj.close()
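
    # Example (illustrative): _getLength folds big-endian bytes into an
    # int, so self._getLength("\x00\x00\x01\x02") is 258 (1 * 256 + 2);
    # struct.unpack('>I', ...) would decode the same four-byte field.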
    def _stringToVersion(self, strng):
        i = strng.find(':')
        if i != -1:
            epoch = strng[:i]
        else:
            epoch = '0'
        j = strng.find('-')
        if j != -1:
            if strng[i + 1:j] == '':
                version = None
            else:
                version = strng[i + 1:j]
            release = strng[j + 1:]
        else:
            if strng[i + 1:] == '':
                version = None
            else:
                version = strng[i + 1:]
            release = None
        return (epoch, version, release)

    ###########
    # Title: Remove duplicates from a sequence
    # Submitter: Tim Peters
    # From: http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52560

    def _uniq(self, s):
        """Return a list of the elements in s, but without duplicates.

        For example, unique([1,2,3,1,2,3]) is some permutation of [1,2,3],
        unique("abcabc") some permutation of ["a", "b", "c"], and
        unique(([1, 2], [2, 3], [1, 2])) some permutation of
        [[2, 3], [1, 2]].

        For best speed, all sequence elements should be hashable.  Then
        unique() will usually work in linear time.

        If not possible, the sequence elements should enjoy a total
        ordering, and if list(s).sort() doesn't raise TypeError it's
        assumed that they do enjoy a total ordering.  Then unique() will
        usually work in O(N*log2(N)) time.

        If that's not possible either, the sequence elements must support
        equality-testing.  Then unique() will usually work in quadratic
        time.
        """

        n = len(s)
        if n == 0:
            return []

        # Try using a dict first, as that's the fastest and will usually
        # work.  If it doesn't work, it will usually fail quickly, so it
        # usually doesn't cost much to *try* it.  It requires that all the
        # sequence elements be hashable, and support equality comparison.
        u = {}
        try:
            for x in s:
                u[x] = 1
        except TypeError:
            del u  # move on to the next method
        else:
            return u.keys()

        # We can't hash all the elements.  Second fastest is to sort,
        # which brings the equal elements together; then duplicates are
        # easy to weed out in a single pass.
        # NOTE:  Python's list.sort() was designed to be efficient in the
        # presence of many duplicate elements.  This isn't true of all
        # sort functions in all languages or libraries, so this approach
        # is more effective in Python than it may be elsewhere.
        try:
            t = list(s)
            t.sort()
        except TypeError:
            del t  # move on to the next method
        else:
            assert n > 0
            last = t[0]
            lasti = i = 1
            while i < n:
                if t[i] != last:
                    t[lasti] = last = t[i]
                    lasti += 1
                i += 1
            return t[:lasti]

        # Brute force is all that's left.
        u = []
        for x in s:
            if x not in u:
                u.append(x)
        return u
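
    # Example (illustrative, not exercised by the module): the split that
    # _stringToVersion and _stringToNEVR perform, on hypothetical inputs:
    #     self._stringToVersion('0:1.2-3')    ->  ('0', '1.2', '3')
    #     self._stringToVersion('1.2-3')      ->  ('0', '1.2', '3')   # epoch defaults to '0'
    #     self._stringToNEVR('foo-0:1.2-3')   ->  ('foo', '0', '1.2', '3')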
    def tagByName(self, tag):
        data = self.hdr[tag]
        if type(data) is types.ListType:
            if len(data) > 0:
                return data[0]
            else:
                return ''
        else:
            return data

    def listTagByName(self, tag):
        """take a tag that should be a list and make sure it is one"""
        lst = []
        data = self.hdr[tag]
        if data is None:
            return lst

        if type(data) is types.ListType:
            lst.extend(data)
        else:
            lst.append(data)
        return lst

    def epoch(self):
        if self.hdr['epoch'] is None:
            return 0
        else:
            return self.tagByName('epoch')

    def doChecksumCache(self, fo):
        """return a checksum for a package:
           - check if the checksum cache is enabled
             if not - return the checksum
             if so - check to see if it has a cache file
               if so, open it and return the first line's contents
               if not, grab the checksum and write it to a file for this pkg
        """
        if not self.options['cache']:
            return getChecksum(self.options['sumtype'], fo)

        csumtag = os.path.basename(self.relativepath) + ".cache"
        csumfile = '%s/%s' % (self.options['cachedir'], csumtag)
        if os.path.exists(csumfile) and self.mtime <= os.stat(csumfile)[stat.ST_MTIME]:
            csumo = open(csumfile, 'r')
            checksum = csumo.readline()
            csumo.close()
        else:
            checksum = getChecksum(self.options['sumtype'], fo)
            csumo = open(csumfile, 'w')
            csumo.write(checksum)
            csumo.close()

        return checksum


def generateXML(doc, node, formatns, drpmObj, sumtype, pkgDeltas):
    """takes an xml doc object and a package metadata entry node, populates a
       package node with the md information"""
    name = drpmObj.tagByName('name')
    arch = drpmObj.arch()
    epoch = str(drpmObj.epoch())
    ver = str(drpmObj.tagByName('version'))
    rel = str(drpmObj.tagByName('release'))
    if not pkgDeltas.has_key('%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)):
        pkgNode = node.newChild(None, "package", None)
        pkgNode.newProp('type', 'rpm')
        pkgNode.newChild(None, 'name', name)
        pkgNode.newChild(None, 'arch', arch)
        version = pkgNode.newChild(None, 'version', None)
        version.newProp('epoch', epoch)
        version.newProp('ver', ver)
        version.newProp('rel', rel)
        deltas = pkgNode.newChild(None, 'deltas', None)
        pkgDeltas['%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)] = deltas
    else:
        deltas = pkgDeltas['%s-%s:%s-%s.%s' % (name, epoch, ver, rel, arch)]
    (oldname, oldepoch, oldver, oldrel) = drpmObj.oldnevr
    drpmNode = deltas.newChild(None, "oldrpm", None)
    if name != oldname:
        drpmNode.newChild(None, 'name', oldname)
    # oldrpm arch is not stored in drpm, so we can only work within same arch
    version = drpmNode.newChild(None, 'version', None)
    if epoch != oldepoch:
        version.newProp('epoch', oldepoch)
    if ver != oldver:
        version.newProp('ver', oldver)
    version.newProp('rel', oldrel)
    drpmNode.newChild(None, 'drpm_filename', drpmObj.relativepath)
    drpmNode.newChild(None, 'size', str(drpmObj.size))
    drpmNode.newChild(None, 'sequence', '%s-%s' % (drpmObj.oldnevrstring, drpmObj.sequence))
    checksum = drpmNode.newChild(None, 'checksum', drpmObj.pkgid)
    checksum.newProp('type', drpmObj.options['sumtype'])
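
# Example (illustrative sketch, never called by this module): driving
# generateXML for one deltarpm.  doc, root and drpmObj are assumed to be a
# libxml2 document, a metadata node and an RpmMetaData built with
# is_drpm=True, respectively; formatns is unused and may be None.
def _exampleGenerateXML(doc, root, drpmObj):
    pkgDeltas = {}   # shared across packages, keyed by 'name-epoch:ver-rel.arch'
    generateXML(doc, root, None, drpmObj, 'sha', pkgDeltas)
    return pkgDeltas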
def repoXML(node, cmds):
    """generate the repomd.xml file that stores the info on the other files"""
    sumtype = cmds['sumtype']
    workfiles = [(cmds['prestofile'], 'deltas')]

    for (file, ftype) in workfiles:
        zfo = _gzipOpen(os.path.join(cmds['outputdir'], cmds['tempdir'], file))
        uncsum = getChecksum(sumtype, zfo)
        zfo.close()
        csum = getChecksum(sumtype, os.path.join(cmds['outputdir'], cmds['tempdir'], file))
        timestamp = os.stat(os.path.join(cmds['outputdir'], cmds['tempdir'], file))[stat.ST_MTIME]
        data = node.newChild(None, 'data', None)
        data.newProp('type', ftype)
        location = data.newChild(None, 'location', None)
        if cmds['baseurl'] is not None:
            location.newProp('xml:base', cmds['baseurl'])
        location.newProp('href', os.path.join(cmds['finaldir'], file))
        checksum = data.newChild(None, 'checksum', csum)
        checksum.newProp('type', sumtype)
        timestamp = data.newChild(None, 'timestamp', str(timestamp))
        unchecksum = data.newChild(None, 'open-checksum', uncsum)
        unchecksum.newProp('type', sumtype)
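
# Example (illustrative): the subset of the cmds dict that repoXML reads;
# every value below is hypothetical.
#     cmds = {'sumtype'   : 'sha',
#             'prestofile': 'prestodelta.xml.gz',
#             'outputdir' : '/srv/repo',
#             'tempdir'   : '.repodata',
#             'finaldir'  : 'repodata',
#             'baseurl'   : None}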