summaryrefslogtreecommitdiffstats
path: root/libpython.py
blob: dc931774472d3a44907093c5a206ab7268271afb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright © 2010 Red Hat, Inc.
#
# This software is licensed to you under the GNU Lesser General Public
# License, version 2.1 (LGPLv2.1). There is NO WARRANTY for this software,
# express or implied, including the implied warranties of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. You should have received a copy of
# LGPLv2.1 along with this software; if not, see
# http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt
#
# Red Hat trademarks are not licensed under LGPLv2.1. No permission is
# granted to use or replicate Red Hat trademarks that are incorporated in
# this software or its documentation.
# 
# Red Hat Author(s): David Hugh Malcolm <dmalcolm@redhat.com>
'''
From gdb 7 onwards, gdb's build can be configured --with-python, allowing gdb
to be extended with Python code e.g. for library-specific data visualizations,
such as for the C++ STL types.

This python module deals with the case when the process being debugged (the
"inferior process" in gdb parlance) is itself python, or more specifically,
linked against libpython.  In this situation, almost every item of data is a
(PyObject*), and having the debugger merely print their addresses is not very
enlightening.

This module embeds knowledge about the implementation details of libpython so
that we can emit useful visualizations e.g. a string, a list, a dict, a frame
giving file/line information and the state of local variables

In particular, given a gdb.Value corresponding to a PyObject* in the inferior
process, we can generate a "proxy value" within the gdb process.  For example,
given a PyObject* in the inferior process that is in fact a PyListObject*
holding three PyObject* that turn out to be PyStringObject* instances, we can
generate a proxy value within the gdb process that is a list of strings:
  ["foo", "bar", "baz"]

We try to defer all gdb.lookup_type() invocations until as late as possible:
when the /usr/bin/python process starts in the debugger, the libpython.so
hasn't been dynamically loaded yet, so none of the type names are known to
the debugger

Tested with both libpython2.6 and libpython3.1

TODO: better handling of "instance"
'''

import gdb

class NullPyObjectPtr(RuntimeError):
    """Raised when a field is requested through a NULL object pointer."""

def is_py3k():
    """
    Report whether the inferior looks like libpython3.* rather than
    libpython2.*

    The test is simply whether gdb can resolve the symbol 'PyBytes_Type'.
    This assumes that a libpython's DWARF data has actually been loaded by
    the point that this function is called.
    """
    symbol = gdb.lookup_symbol('PyBytes_Type')[0]
    # A symbol was found: assume libpython3.*; nothing found: assume 2.*
    return bool(symbol)

class PyObjectPtr(object):
    """
    Class wrapping a gdb.Value that's either a (PyObject*) within the
    inferior process, or some subclass pointer e.g. (PyStringObject*)

    There will be a subclass for every refined PyObject type that we care
    about.

    Note that at every stage the underlying pointer could be NULL, point
    to corrupt data, etc; this is the debugger, after all.
    """
    # Name of the C struct type; subclasses override this so that
    # get_gdb_type() yields the matching pointer type.
    _typename = 'PyObject'

    def __init__(self, gdbval, cast_to = None):
        '''
        Wrap the given gdb.Value, first casting it to the gdb.Type "cast_to"
        if one is supplied.
        '''
        if cast_to:
            self._gdbval = gdbval.cast(cast_to)
        else:
            self._gdbval = gdbval

    def field(self, name):
        '''
        Get the gdb.Value for the given field within the PyObject, coping with
        some python 2 versus python 3 differences.

        Various libpython types are defined using the "PyObject_HEAD" and
        "PyObject_VAR_HEAD" macros.

        In Python 2, these are defined so that "ob_type" and (for a var
        object) "ob_size" are fields of the type in question.

        In Python 3, this is defined as an embedded PyVarObject type thus:
           PyVarObject ob_base;
        so that the "ob_size" field is located inside the "ob_base" field, and
        the "ob_type" is most easily accessed by casting back to a (PyObject*).

        Raises NullPyObjectPtr if the wrapped pointer is NULL.
        '''
        if self.is_null():
            raise NullPyObjectPtr(self)

        if name == 'ob_type':
            # Always go via (PyObject*), which works for both layouts:
            return self._gdbval.cast(PyObjectPtr.get_gdb_type()).dereference()[name]

        if name == 'ob_size':
            try:
                # Python 2:
                return self._gdbval.dereference()[name]
            except RuntimeError:
                # Python 3:
                return self._gdbval.dereference()['ob_base'][name]

        # General case: look it up inside the object:
        return self._gdbval.dereference()[name]

    def type(self):
        '''
        Return a PyTypeObjectPtr wrapping this object's "ob_type" field.
        '''
        return PyTypeObjectPtr(self.field('ob_type'))

    def is_null(self):
        '''
        Return True if the wrapped pointer has the value 0.
        '''
        return 0 == long(self._gdbval)

    def proxyval(self):
        '''
        Scrape a value from the inferior process, and try to represent it
        within the gdb process, whilst (hopefully) avoiding crashes when
        the remote data is corrupt.

        Derived classes will override this.

        For example, a PyIntObject* with ob_ival 42 in the inferior process
        should result in an int(42) in this process.

        This base implementation returns an object whose repr() has the form
        '<TPNAME at remote 0xADDRESS>'.
        '''
        class FakeRepr(object):
            """
            Class representing a non-descript PyObject* value in the inferior
            process for when we don't have a custom scraper, intended to have
            a sane repr().
            """
            def __init__(self, tp_name, address):
                self.tp_name = tp_name
                self.address = address

            def __repr__(self):
                return '<%s at remote 0x%x>' % (self.tp_name, self.address)

        try:
            tp_name = self.type().field('tp_name').string()
        except NullPyObjectPtr:
            # Either this pointer or its ob_type pointer is NULL, so no
            # type name can be read:
            tp_name = 'unknown'
        return FakeRepr(tp_name,
                        long(self._gdbval))

    @classmethod
    def subclass_for_tp_name(cls, tp_name):
        '''
        Map a "tp_name" string onto the wrapper class to use for it, falling
        back to "cls" itself when the name is not recognised.

        'str' and 'int' mean different C types in Python 2 and Python 3, so
        those two are resolved via is_py3k().
        '''
        if tp_name == 'str':
            if is_py3k():
                return PyUnicodeObjectPtr
            else:
                return PyStringObjectPtr
        if tp_name == 'int':
            if is_py3k():
                return PyLongObjectPtr
            else:
                return PyIntObjectPtr

        name_map = {'dict': PyDictObjectPtr,
                    'list': PyListObjectPtr,
                    'long': PyLongObjectPtr,
                    'tuple': PyTupleObjectPtr,
                    'frame': PyFrameObjectPtr,
                    'unicode': PyUnicodeObjectPtr,
                    }
        if tp_name in name_map:
            return name_map[tp_name]
        # Use the base class:
        return cls

    @classmethod
    def from_pyobject_ptr(cls, gdbval):
        '''
        Try to locate the appropriate derived class dynamically, and cast
        the pointer accordingly:
        For now, we just do string comparison on the tp_name
        Other approaches:
        (i) look up the symbols for the global types, but that isn't working yet:
            (gdb) python print gdb.lookup_symbol('PyList_Type')[0].value
            Traceback (most recent call last):
              File "<string>", line 1, in <module>
            NotImplementedError: Symbol type not yet supported in Python scripts.
            Error while executing Python code.
        (ii) look at tp_flags, looking e.g. for Py_TPFLAGS_LIST_SUBCLASS however
        this would rely on the values of those flags.

        So we go with the simple approach of looking at tp_name

        If anything in that process raises RuntimeError (which includes
        NullPyObjectPtr), the result is an uncast instance of the class this
        method was invoked on.
        '''
        try:
            p = PyObjectPtr(gdbval)
            t = p.type()
            tp_name = t.field('tp_name').string()
            # Keep the refined class in its own local: rebinding "cls" here
            # would make the fallback below wrap an *uncast* pointer in the
            # refined class if the type lookup or the cast then failed.
            subclass = cls.subclass_for_tp_name(tp_name)
            return subclass(gdbval, cast_to=subclass.get_gdb_type())
        except RuntimeError:
            # Handle any kind of error e.g. NULL ptrs by simply using the
            # class we were invoked on, without casting
            pass
        return cls(gdbval)

    @classmethod
    def get_gdb_type(cls):
        '''
        Return the gdb.Type for a pointer to this class's C struct
        (as named by "_typename").
        '''
        return gdb.lookup_type(cls._typename).pointer()

class PyCodeObjectPtr(PyObjectPtr):
    """
    Wrapper around a gdb.Value holding a (PyCodeObject*), i.e. a pointer to
    a <code> instance living in the process being debugged.
    """
    _typename = 'PyCodeObject'

class PyDictObjectPtr(PyObjectPtr):
    """
    Class wrapping a gdb.Value that's a PyDictObject* i.e. a dict instance
    within the process being debugged.
    """
    _typename = 'PyDictObject'

    def proxyval(self):
        '''
        Build a local dict from the inferior's hash table: every slot of
        "ma_table" whose "me_value" is non-NULL contributes one entry,
        mapping the proxy value of its key to the proxy value of its value.
        '''
        result = {}
        ma_table = self.field('ma_table')
        # Per CPython's dictobject.h the table contains ma_mask + 1 slots,
        # so the final slot (index ma_mask) has to be visited as well.
        num_slots = int_from_int(self.field('ma_mask')) + 1
        for i in xrange(num_slots):
            ep = ma_table + i
            pvalue = PyObjectPtr.from_pyobject_ptr(ep['me_value'])
            # Slots with a NULL me_value hold no entry; skip them:
            if not pvalue.is_null():
                pkey = PyObjectPtr.from_pyobject_ptr(ep['me_key'])
                result[pkey.proxyval()] = pvalue.proxyval()
        return result

class PyIntObjectPtr(PyObjectPtr):
    """
    Wrapper around a gdb.Value holding a (PyIntObject*).
    """
    _typename = 'PyIntObject'

    def proxyval(self):
        '''
        Return the object's "ob_ival" field converted to a local int.
        '''
        return int_from_int(self.field('ob_ival'))

class PyListObjectPtr(PyObjectPtr):
    """
    Wrapper around a gdb.Value holding a (PyListObject*).
    """
    _typename = 'PyListObject'

    def __getitem__(self, i):
        '''
        Return the gdb.Value for the (PyObject*) stored at index i of the
        "ob_item" array.
        '''
        return self.field('ob_item')[i]

    def proxyval(self):
        '''
        Return a local list holding the proxy value of each of the first
        "ob_size" elements.
        '''
        length = int_from_int(self.field('ob_size'))
        items = []
        for index in range(length):
            element = PyObjectPtr.from_pyobject_ptr(self[index])
            items.append(element.proxyval())
        return items

class PyLongObjectPtr(PyObjectPtr):
    """
    Wrapper around a gdb.Value holding a (PyLongObject*).
    """
    _typename = 'PyLongObject'

    def proxyval(self):
        '''
        Reconstruct the integer value as a local long.

        Python's Include/longobjrep.h has this declaration:
           struct _longobject {
               PyObject_VAR_HEAD
               digit ob_digit[1];
           };

        with this description:
            The absolute value of a number is equal to
                 SUM(for i=0 through abs(ob_size)-1) ob_digit[i] * 2**(SHIFT*i)
            Negative numbers are represented with ob_size < 0;
            zero is represented by ob_size == 0.

        where SHIFT can be either:
            #define PyLong_SHIFT        30
            #define PyLong_SHIFT        15
        '''
        size = long(self.field('ob_size'))
        if size == 0:
            return long(0)

        digit_array = self.field('ob_digit')

        # The width of the inferior's "digit" type selects the shift.
        # FIXME: the 30-bit case hasn't yet been tested
        if gdb.lookup_type('digit').sizeof == 2:
            shift = long(15)
        else:
            shift = long(30)

        magnitude = long(0)
        for i in xrange(abs(size)):
            magnitude += long(digit_array[i]) * 2 ** (shift * i)

        # The sign of ob_size carries the sign of the number:
        if size < 0:
            return -magnitude
        return magnitude

class PyFrameObjectPtr(PyObjectPtr):
    """
    Wrapper around a gdb.Value holding a (PyFrameObject*).
    """
    _typename = 'PyFrameObject'

    def __str__(self):
        '''
        Return the str() of a FrameInfo built from this frame.
        '''
        return str(FrameInfo(self))

class PyStringObjectPtr(PyObjectPtr):
    """
    Wrapper around a gdb.Value holding a (PyStringObject*).
    """
    _typename = 'PyStringObject'

    def __str__(self):
        '''
        Read the string stored in "ob_sval" by viewing the address of that
        field as a (char*) and asking gdb for the string there.
        '''
        # Lookup the gdb.Type for "char*"
        char_ptr_type = gdb.lookup_type('char').pointer()

        sval_address = self.field('ob_sval').address
        return sval_address.cast(char_ptr_type).string()

    def proxyval(self):
        '''
        The proxy value is simply str() of this wrapper.
        '''
        return str(self)

class PyTupleObjectPtr(PyObjectPtr):
    """
    Wrapper around a gdb.Value holding a (PyTupleObject*).
    """
    _typename = 'PyTupleObject'

    def __getitem__(self, i):
        '''
        Return the gdb.Value for the (PyObject*) stored at index i of the
        "ob_item" array.
        '''
        return self.field('ob_item')[i]

    def proxyval(self):
        '''
        Return a local tuple holding the proxy value of each of the first
        "ob_size" elements.
        '''
        length = int_from_int(self.field('ob_size'))
        items = []
        for index in range(length):
            element = PyObjectPtr.from_pyobject_ptr(self[index])
            items.append(element.proxyval())
        return tuple(items)

class PyTypeObjectPtr(PyObjectPtr):
    """
    Wrapper around a gdb.Value holding a (PyTypeObject*).
    """
    _typename = 'PyTypeObject'

class PyUnicodeObjectPtr(PyObjectPtr):
    """
    Wrapper around a gdb.Value holding a (PyUnicodeObject*).
    """
    _typename = 'PyUnicodeObject'

    def proxyval(self):
        '''
        Return a local unicode instance built from the inferior's buffer.

        From unicodeobject.h:
            Py_ssize_t length;  /* Length of raw Unicode data in buffer */
            Py_UNICODE *str;    /* Raw Unicode buffer */
        '''
        length = long(self.field('length'))
        buf = self.field('str')

        # First read every Py_UNICODE out of the array as an int; these are
        # either UCS-2 or UCS-4 code points:
        code_points = []
        for i in xrange(length):
            code_points.append(int(buf[i]))

        # Only then turn the code points into characters and join them:
        return u''.join(map(unichr, code_points))

def int_from_int(gdbval):
    """Convert gdbval to a local int by parsing its str() representation."""
    text = str(gdbval)
    return int(text)

def stringify(val):
    """Return the text used when displaying a scraped proxy value."""
    # TODO: repr() puts everything on one line; pprint.pformat can be nicer,
    # but can lead to very long results; this function isolates the choice.
    # For now the choice is always repr().
    return repr(val)

class FrameInfo:
    '''
    Everything we can scrape about a PyFrameObject*: its code object, the
    function name and filename from that code object, the line number, and
    the (name, proxy value) pairs of its non-NULL local variables.
    '''
    def __init__(self, fval):
        '''
        Scrape the frame wrapped by "fval" (a PyObjectPtr for the frame).
        '''
        self.fval = fval
        code = PyCodeObjectPtr.from_pyobject_ptr(fval.field('f_code'))
        self.co = code
        self.co_name = PyObjectPtr.from_pyobject_ptr(code.field('co_name'))
        self.co_filename = PyObjectPtr.from_pyobject_ptr(code.field('co_filename'))
        self.f_lineno = int_from_int(fval.field('f_lineno'))
        self.co_nlocals = int_from_int(code.field('co_nlocals'))
        self.co_varnames = PyTupleObjectPtr.from_pyobject_ptr(code.field('co_varnames'))

        # list of (name, value) pairs
        self.locals = []
        slots = self.fval.field('f_localsplus')
        # arbitrary upper sanity limit in case co_nlocals is corrupt
        limit = min(self.co_nlocals, 200)
        for index in xrange(limit):
            slot = PyObjectPtr.from_pyobject_ptr(slots[index])
            if slot.is_null():
                # No value is stored in this slot; nothing to report
                continue
            varname = PyObjectPtr.from_pyobject_ptr(self.co_varnames[index])
            proxy = slot.proxyval()
            self.locals.append((str(varname), proxy))

    def __str__(self):
        '''
        Render as: File FILENAME, line LINENO, in NAME (k1=v1, k2=v2, ...)
        '''
        rendered = []
        for key, val in self.locals:
            rendered.append('%s=%s' % (key, stringify(val)))
        fields = (self.co_filename,
                  self.f_lineno,
                  self.co_name,
                  ', '.join(rendered))
        return 'File %s, line %i, in %s (%s)' % fields

class PyObjectPtrPrinter:
    "Prints a (PyObject*)"

    def __init__ (self, gdbval):
        # The gdb.Value to be printed
        self.gdbval = gdbval

    def to_string (self):
        # Wrap the pointer in the most specific class we can find, scrape a
        # proxy value from it, and render that proxy as text.
        pyop = PyObjectPtr.from_pyobject_ptr(self.gdbval)
        return stringify(pyop.proxyval())

class PyFrameObjectPtrPrinter(PyObjectPtrPrinter):
    "Prints a (PyFrameObject*)"

    def to_string (self):
        # Render the frame through FrameInfo rather than a proxy value
        frame_ptr = PyObjectPtr.from_pyobject_ptr(self.gdbval)
        return str(FrameInfo(frame_ptr))

def pretty_printer_lookup(gdbval):
    """
    Choose a printer for the given gdb.Value.

    Returns a PyObjectPtrPrinter for a pointer to "PyObject", a
    PyFrameObjectPtrPrinter for a pointer to "PyFrameObject", and None for
    anything else.
    """
    value_type = gdbval.type.unqualified()
    if value_type.code != gdb.TYPE_CODE_PTR:
        return None

    target_name = str(value_type.target().unqualified())
    if target_name == "PyObject":
        return PyObjectPtrPrinter(gdbval)
    if target_name == "PyFrameObject":
        return PyFrameObjectPtrPrinter(gdbval)
    return None


# Wire up the pretty-printer by appending the lookup function to gdb's list
# of pretty-printers.  NOTE: this happens as a side effect of importing (or
# reloading) this module.
# FIXME: is there a better way to do this than to simply do it on import?
gdb.pretty_printers.append(pretty_printer_lookup)

"""
Here's how I've been invoking this code:
(gdb) python

import sys
sys.path.append('/home/david/coding/python-gdb')
import libpython
reload(libpython)

end
"""


# Remainder of the file is work-in-progress; don't use:
def pyframe():
    '''
    Work towards a reimplementation of "pyframe" from the gdbinit file:
    read the variable "f" from the selected gdb frame, treat it as a frame
    object pointer, and print the FrameInfo scraped from it.
    '''
    f = gdb.selected_frame().read_var('f')
    frame_ptr = PyFrameObjectPtr.from_pyobject_ptr(f)
    print(FrameInfo(frame_ptr))

def mybt():
    '''
    Attempt at writing a replacement backtrace hook
    Ought to be integrated into pluggable gdb backtrace hook

    For each frame yielded by gdb.selected_thread().frames(), prints the
    frame index, then (for frames named 'PyEval_EvalFrameEx') the python
    frame information, then the frame's pc and name.

    NOTE(review): assumes the object returned by gdb.selected_thread() has a
    frames() method -- confirm against the gdb build in use.
    '''
    def print_EvalFrameEx(gdbframe):
        '''
        Print the FrameInfo for the variable "f" of the given gdb frame, or a
        parenthesised explanation if it cannot be read or scraped.
        '''
        try:
            f = gdbframe.read_var('f')
        except ValueError:
            # The trailing comma keeps the output on the current line
            print '(unable to print python frame, could not access "f")',
            return

        try:
            fval = PyFrameObjectPtr.from_pyobject_ptr(f) #.dereference()
            fi = FrameInfo(fval)
            print fi, 
        except RuntimeError:
            # Scraping the frame failed part-way
            print '(unable to print python frame; corrupt data?)',
                    
        
    for i, gdbframe in enumerate(gdb.selected_thread().frames()):
        #print dir(gdbframe), gdbframe.name()
        print '#%i' % i, 
        # Only frames with this function name get python-level information:
        if 'PyEval_EvalFrameEx' == gdbframe.name():
            print_EvalFrameEx(gdbframe)
        # No trailing comma here, so this ends the line for the frame:
        print '  0x%x in %s' % (gdbframe.pc(), gdbframe.name())

#pyframe()
#if gdb.selected_thread():
#    mybt()