diff options
33 files changed, 5743 insertions, 552 deletions
@@ -1,4 +1,4 @@ -2008-02-21 Dave Brolley <brolley@redhat.com> +2008-02-27 Dave Brolley <brolley@redhat.com> PR5189 * staptree.h (print_format::conv_memory): New enumerator. diff --git a/doc/ChangeLog b/doc/ChangeLog new file mode 100644 index 00000000..d9e609d2 --- /dev/null +++ b/doc/ChangeLog @@ -0,0 +1,7 @@ +2008-02-27 Frank Ch. Eigler <fche@redhat.com> + + * langref.tex, tutorial.tex: Copied over & aggregated + from former comfy digs under /cvs/doc. + * tutorial/*: Samples scripts from tutorial. + * Makefile.am: New build instructions. + * Makefile.in: New generated file. diff --git a/doc/Makefile.am b/doc/Makefile.am new file mode 100644 index 00000000..bf80fbd3 --- /dev/null +++ b/doc/Makefile.am @@ -0,0 +1,28 @@ +# Makefile.am --- automake input file for systemtap docs +## process this file with automake to produce Makefile.in + +if BUILD_DOCS +all-local: tutorial.pdf langref.pdf + +clean-local: + rm -f *.pdf *.out *.log *.aux *.toc *.lot *.idx *.glo +endif + +SUFFIXES = ps pdf dvi ps tex + +.ps.pdf: + ps2pdf -r600 $< + +.dvi.ps: + dvips -t letter -o $@ $< + +.tex.dvi: + pwd=`pwd`; cd $(srcdir); \ + latex -output-directory=$$pwd $<; \ + touch $*.glo; \ + makeindex $*.glo -s nomencl.ist -o $*.gls; \ + latex -output-directory=$$pwd $<; \ + latex -output-directory=$$pwd $<; \ + latex -output-directory=$$pwd $< + +EXTRA_DIST = tutorial.tex langref.tex tutorial diff --git a/doc/Makefile.in b/doc/Makefile.in new file mode 100644 index 00000000..27a4ab8b --- /dev/null +++ b/doc/Makefile.in @@ -0,0 +1,336 @@ +# Makefile.in generated by automake 1.10 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004, 2005, 2006 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# Makefile.am --- automake input file for systemtap docs +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +subdir = doc +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +SOURCES = +DIST_SOURCES = +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DATE = @DATE@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +GREP = @GREP@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAINT = @MAINT@ +MAKEINFO 
= @MAKEINFO@ +MKDIR_P = @MKDIR_P@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PROCFLAGS = @PROCFLAGS@ +RANLIB = @RANLIB@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +U = @U@ +VERSION = @VERSION@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build_alias = @build_alias@ +builddir = @builddir@ +cap_LIBS = @cap_LIBS@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +elfutils_abs_srcdir = @elfutils_abs_srcdir@ +exec_prefix = @exec_prefix@ +have_dvips = @have_dvips@ +have_latex = @have_latex@ +have_ps2pdf = @have_ps2pdf@ +host_alias = @host_alias@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +sqlite3_LIBS = @sqlite3_LIBS@ +srcdir = @srcdir@ +stap_LIBS = @stap_LIBS@ +staplog_CPPFLAGS = @staplog_CPPFLAGS@ +subdirs = @subdirs@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +SUFFIXES = ps pdf dvi ps tex +EXTRA_DIST = tutorial.tex langref.tex tutorial +all: all-am + +.SUFFIXES: +.SUFFIXES: ps pdf dvi ps tex .dvi .pdf .ps .tex +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ 
$(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \ + && exit 0; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu doc/Makefile'; \ + cd $(top_srcdir) && \ + $(AUTOMAKE) --gnu doc/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +tags: TAGS +TAGS: + +ctags: CTAGS +CTAGS: + + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ + fi; \ + cp -pR $$d/$$file 
$(distdir)$$dir || exit 1; \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +@BUILD_DOCS_FALSE@all-local: +all-am: Makefile all-local +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +@BUILD_DOCS_FALSE@clean-local: +clean: clean-am + +clean-am: clean-generic clean-local mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-generic + +dvi: dvi-am + +dvi-am: + +html: html-am + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-exec-am: + +install-html: install-html-am + +install-info: install-info-am + +install-man: + +install-pdf: install-pdf-am + +install-ps: install-ps-am + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-generic + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: all all-am all-local check check-am clean clean-generic \ + clean-local distclean distclean-generic distdir dvi dvi-am \ + html html-am info info-am install install-am install-data \ + install-data-am 
+ install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-generic pdf \ + pdf-am ps ps-am uninstall uninstall-am + + +@BUILD_DOCS_TRUE@all-local: tutorial.pdf langref.pdf + +@BUILD_DOCS_TRUE@clean-local: +@BUILD_DOCS_TRUE@ rm -f *.pdf *.out *.log *.aux *.toc *.lot *.idx *.glo + +.ps.pdf: + ps2pdf -r600 $< + +.dvi.ps: + dvips -t letter -o $@ $< + +.tex.dvi: + pwd=`pwd`; cd $(srcdir); \ + latex -output-directory=$$pwd $<; \ + touch $*.glo; \ + makeindex $*.glo -s nomencl.ist -o $*.gls; \ + latex -output-directory=$$pwd $<; \ + latex -output-directory=$$pwd $<; \ + latex -output-directory=$$pwd $< +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/doc/langref.tex b/doc/langref.tex new file mode 100644 index 00000000..5b91d01d --- /dev/null +++ b/doc/langref.tex @@ -0,0 +1,3285 @@ +% SystemTap Language Reference +\documentclass[twoside,english]{article} +\usepackage{geometry} +\geometry{verbose,letterpaper,tmargin=1.5in,bmargin=1.5in,lmargin=1in,rmargin=1in} +\usepackage{fancyhdr} +\pagestyle{fancy} +\usepackage{array} +\usepackage{varioref} +\usepackage{float} +\usepackage{makeidx} +\usepackage{verbatim} +\usepackage{url} +\makeindex + +\makeatletter + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% LyX specific LaTeX commands. +\newcommand{\noun}[1]{\textsc{#1}} +%% Bold symbol macro for standard LaTeX users +%\providecommand{\boldsymbol}[1]{\mbox{\boldmath $#1$}} + +%% Because html converters don't know tabularnewline +\providecommand{\tabularnewline}{\\} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% User specified LaTeX commands. 
+\setlength{\parindent}{0pt} +%\setlength{\parskip}{3pt plus 2pt minus 1pt} +\setlength{\parskip}{5pt} + +% +% this makes list spacing much better. +% +\newenvironment{my_itemize}{ +\begin{itemize} + \setlength{\itemsep}{1pt} + \setlength{\parskip}{0pt} + \setlength{\parsep}{0pt}}{\end{itemize} +} + +\newenvironment{vindent} +{\begin{list}{}{\setlength{\listparindent}{6pt}} +\item[]} +{\end{list}} + +\usepackage{babel} +\makeatother +\begin{document} + +\title{SystemTap Language Reference} + +\maketitle +\newpage{} +This document was derived from other documents contributed to the SystemTap project by employees of Red Hat, IBM and Intel.\newline + +Copyright \copyright\space 2007 Red Hat Inc.\newline +Copyright \copyright\space 2007 IBM Corp.\newline +Copyright \copyright\space 2007 Intel Corporation.\newline + +Permission is granted to copy, distribute and/or modify this document +under the terms of the GNU Free Documentation License, Version 1.2 +or any later version published by the Free Software Foundation; +with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.\newline + +The GNU Free Documentation License is available from +\url{http://www.gnu.org/licenses/fdl.html} or by writing to +the Free Software Foundation, Inc., 51 Franklin Street, +Fifth Floor, Boston, MA 02110-1301, USA. +\newpage{} +\tableofcontents{} +\listoftables +\newpage{} + +\section{SystemTap overview\label{sec:SystemTap-Overview}} + +\subsection{About this guide} + +This guide is a comprehensive reference of SystemTap's language constructs +and syntax. The contents borrow heavily from existing SystemTap documentation +found in manual pages and the tutorial. The presentation of information here +provides the reader with a single place to find language syntax and recommended +usage. In order to successfully use this guide, you should be familiar with +the general theory and operation of SystemTap. 
If you are new to SystemTap, +you will find the tutorial to be an excellent place to start learning. For +detailed information about tapsets, see the manual pages provided with the +distribution. For information about the entire collection of SystemTap reference +material, see Section~\ref{sec:For-Further-Reference}. + +\subsection{Reasons to use SystemTap} + +SystemTap provides infrastructure to simplify the gathering of information +about a running Linux kernel so that it may be further analyzed. This analysis +assists in identifying the underlying cause of a performance or functional +problem. SystemTap was designed to eliminate the need for a developer to +go through the tedious instrument, recompile, install, and reboot sequence +normally required to collect this kind of data. To do this, it provides a +simple command-line interface and scripting language for writing kernel instrumentation. +With SystemTap, developers, system administrators, and users can easily write +scripts that gather and manipulate kernel data that is not otherwise available +using standard Linux tools. Users of SystemTap will find it to be a significant +improvement over older methods. + +\subsection{Event-action language} +\index{language} +SystemTap's language is strictly typed, declaration free, procedural, and +inspired by dtrace and awk. Source code points or events in the kernel are +associated with handlers, which are subroutines that are executed synchronously. +These probes are conceptually similar to \char`\"{}breakpoint command lists\char`\"{} +in the GDB debugger. + +There are two main outermost constructs: probes and functions. Within these, +statements and expressions use C-like operator syntax and precedence. + +\subsection{Sample SystemTap scripts} +\index{example scripts} +Following are some example scripts that illustrate the basic operation of +SystemTap. 
For more examples, see the examples/small\_demos/ directory in +the source directory, the SystemTap wiki at \url{http://sourceware.org/systemtap/wiki/HomePage}, +or the SystemTap War Stories page at \url{http://sourceware.org/systemtap/wiki/WarStories}. + +\subsubsection{Basic SystemTap syntax and control structures} + +The following code examples demonstrate SystemTap syntax and control structures. + +\begin{vindent} +\begin{verbatim} +global odds, evens + +probe begin { + # "no" and "ne" are local integers + for (i = 0; i < 10; i++) { + if (i % 2) odds [no++] = i + else evens [ne++] = i + } + + delete odds[2] + delete evens[3] + exit() +} + +probe end { + foreach (x+ in odds) + printf ("odds[%d] = %d\n", x, odds[x]) + + foreach (x in evens-) + printf ("evens[%d] = %d\n", x, evens[x]) +} +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} +odds[0] = 1 +odds[1] = 3 +odds[3] = 7 +odds[4] = 9 +evens[4] = 8 +evens[2] = 4 +evens[1] = 2 +evens[0] = 0 +\end{verbatim} +\end{vindent} +Note that all variable types are inferred, and that all locals and globals +are initialized. 
+ +\subsubsection{Primes between 0 and 49} + +\begin{vindent} +\begin{verbatim} +function isprime (x) { + if (x < 2) return 0 + for (i = 2; i < x; i++) { + if (x % i == 0) return 0 + if (i * i > x) break + } + return 1 +} + +probe begin { + for (i = 0; i < 50; i++) + if (isprime (i)) printf("%d\n", i) + exit() +} +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} +2 +3 +5 +7 +11 +13 +17 +19 +23 +29 +31 +37 +41 +43 +47 +\end{verbatim} +\end{vindent} + +\subsubsection{Recursive functions} +\index{recursion} +\begin{vindent} +\begin{verbatim} +function fibonacci(i) { + if (i < 1) error ("bad number") + if (i == 1) return 1 + if (i == 2) return 2 + return fibonacci (i-1) + fibonacci (i-2) +} + +probe begin { + printf ("11th fibonacci number: %d", fibonacci (11)) + exit () +} +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} +11th fibonacci number: 144 +\end{verbatim} +\end{vindent} +Any larger number input to the function may exceed the MAXACTION or MAXNESTING +limits, which will be detected at run time and result in an error. For more +about limits see Section~\ref{sub:SystemTap-safety}. +\newpage{} +\subsection{The stap command} +\index{stap} +The stap program is the front-end to the SystemTap tool. It accepts probing +instructions written in its scripting language, translates those instructions +into C code, compiles this C code, and loads the resulting kernel module +into a running Linux kernel to perform the requested system trace or probe +functions. You can supply the script in a named file, from standard input, +or from the command line. The program runs until it is interrupted by the +user, until a sufficient number of soft errors occur, or until the script voluntarily +invokes the exit() function. 
+ +The stap command does the following: + +\begin{itemize} +\item Translates the script +\item Generates and compiles a kernel module +\item Inserts the module; output to stap's stdout +\item CTRL-C unloads the module and terminates stap +\end{itemize} +For a full list of options to the stap command, see the stap(1) manual page. + +\subsection{Safety and security\label{sub:SystemTap-safety}} +\index{limits} +SystemTap is an administrative tool. It exposes kernel internal data structures +and potentially private user information. It requires root privileges to +actually run the kernel objects it builds using the \textbf{sudo} command, +applied to the \textbf{staprun} program. + +staprun is a part of the SystemTap package, dedicated to module loading and +unloading and kernel-to-user data transfer. Since staprun does not perform +any additional security checks on the kernel objects it is given, do not +give elevated privileges via sudo to untrusted users. + +The translator asserts certain safety constraints. \index{constraints}It +ensures that no handler routine can run for too long, allocate memory, perform +unsafe operations, or unintentionally interfere with the kernel. Use of script +global variables is locked to protect against manipulation by concurrent +probe handlers. Use of \emph{guru mode} constructs such as embedded C (see +Section~\ref{sub:Embedded-C}) can violate these constraints, leading to +a kernel crash or data corruption. + +The resource use limits are set by macros in the generated C code. These +may be overridden with the -D flag. The following list describes a selection +of these macros: + +\textbf{MAXNESTING} -- The maximum number of recursive function call levels. The default is 10. + +\textbf{MAXSTRINGLEN} -- The maximum length of strings. The default is 128. + +\textbf{MAXTRYLOCK} -- The maximum number of iterations to wait for locks on global variables before +declaring possible deadlock and skipping the probe. The default is 1000. 
+ +\textbf{MAXACTION} -- The maximum number of statements to execute during any single probe hit. The default is 1000. + +\textbf{MAXMAPENTRIES} -- The maximum number of rows in an array if the array size is not specified +explicitly when declared. The default is 2048. + +\textbf{MAXERRORS} -- The maximum number of soft errors before an exit is triggered. The default is 0. + +\textbf{MAXSKIPPED} -- The maximum number of skipped reentrant probes before an exit is triggered. The default is 100. + +\textbf{MINSTACKSPACE} -- The minimum number of free kernel stack bytes required in order to run a +probe handler. This number should be large enough for the probe handler's +own needs, plus a safety margin. The default is 1024. + +If something goes wrong with stap or staprun after a probe has started running, +you may safely kill both user processes, and remove the active probe kernel +module with the rmmod command. Any pending trace messages may be lost. + +\section{Types of SystemTap scripts\label{sec:Types-of-SystemTap}} + +\subsection{Probe scripts} + +Probe scripts are analogous to programs; these scripts identify probe points +and associated handlers. + +\subsection{Tapset scripts} + +Tapset scripts are libraries of probe aliases and auxiliary functions. + +The /usr/share/systemtap/tapset directory contains tapset scripts. While +these scripts look like regular SystemTap scripts, they cannot be run directly. + +\section{Components of a SystemTap script} + +The main construct in the scripting language identifies probes. Probes associate +abstract events with a statement block, or probe handler, that is to be executed +when any of those events occur. + +The following example shows how to trace entry and exit from a function using +two probes. 
+ +\begin{vindent} +\begin{verbatim} +probe kernel.function("sys_mkdir") { log ("enter") } +probe kernel.function("sys_mkdir").return { log ("exit") } +\end{verbatim} +\end{vindent} + +To list the probe-able functions in the kernel, use the last-pass option +to the translator. The output needs to be filtered because each inlined function +instance is listed separately. The following statement is an example. + +\begin{vindent} +\begin{verbatim} +# stap -p2 -e 'probe kernel.function("*") {}' | sort | uniq +\end{verbatim} +\end{vindent} + +\subsection{Probe definitions} + +The general syntax is as follows. + +\begin{vindent} +\begin{verbatim} +probe PROBEPOINT [, PROBEPOINT] { [STMT ...] } +\end{verbatim} +\end{vindent} +Events are specified in a special syntax called \emph{probe points}. There +are several varieties of probe points defined by the translator, and tapset +scripts may define others using aliases. The provided probe points are listed +in the stapprobes(5) man pages. + +The probe handler is interpreted relative to the context of each event. For +events associated with kernel code, this context may include variables defined +in the source code at that location. These \emph{target variables}\index{target variables} +are presented to the script as variables whose names are prefixed with a +dollar sign (\$). They may be accessed only if the compiler used to compile +the kernel preserved them, despite optimization. This is the same constraint +imposed by a debugger when working with optimized code. Other events may +have very little context. + + +\subsection{Probe aliases\label{sub:Probe-aliases}} +\index{probe aliases} +The general syntax is as follows. + +\begin{vindent} +\begin{verbatim} +probe <alias> = <probepoint> { <prologue_stmts> } +probe <alias> += <probepoint> { <epilogue_stmts> } +\end{verbatim} +\end{vindent} +New probe points may be defined using \emph{aliases}. 
A probe point alias +looks similar to probe definitions, but instead of activating a probe at +the given point, it defines a new probe point name as an alias to an existing +one. New probe aliases may refer to one or more existing probe aliases. The +following is an example. + +\begin{vindent} +\begin{verbatim} +probe socket.sendmsg = kernel.function ("sock_sendmsg") { ... } +probe socket.do_write = kernel.function ("do_sock_write") { ... } +probe socket.send = socket.sendmsg, socket.do_write { ... } +\end{verbatim} +\end{vindent} +There are two types of aliases, the prologue style and the epilogue style +which are identified by the equal sign (\texttt{\textbf{=}}) and \char`\"{}\texttt{\textbf{+=}}\char`\"{} +respectively. + +A probe that names the new probe point will create an actual probe, with +the handler of the alias \emph{pre-pended}. + +This pre-pending behavior serves several purposes. It allows the alias definition +to pre-process the context of the probe before passing control to the handler +specified by the user. This has several possible uses, demonstrated as follows. + +\begin{vindent} +\begin{verbatim} +# Skip probe unless given condition is met: +if ($flag1 != $flag2) next + +# Supply values describing probes: +name = "foo" + +# Extract the target variable to a plain local variable: +var = $var +\end{verbatim} +\end{vindent} + +\subsubsection{Prologue-style aliases (=)} +\index{prologue-style aliases} +\index{=} +For a prologue style alias, the statement block that follows an alias definition +is implicitly added as a prologue to any probe that refers to the alias. +The following is an example. + +\begin{vindent} +\begin{verbatim} +# Defines a new probe point syscall.read, which expands to +# kernel.function("sys_read"), with the given statement as +# a prologue. 
+# +probe syscall.read = kernel.function("sys_read") { + fildes = $fd +} +\end{verbatim} +\end{vindent} + +\subsubsection{Epilogue-style aliases (+=)} +\index{epilogue-style aliases} +\index{+=} +The statement block that follows an alias definition is implicitly added +as an epilogue to any probe that refers to the alias. The following is an +example: + +\begin{vindent} +\begin{verbatim} +# Defines a new probe point with the given statement as an +# epilogue. +# +probe syscall.read += kernel.function("sys_read") { + fildes = $fd +} +\end{verbatim} +\end{vindent} + +\subsubsection{Probe alias usage} + +Another probe definition may use a previously defined alias. The following +is an example. + +\begin{vindent} +\begin{verbatim} +probe syscall.read { + printf("reading fd=%d\n", fildes) +} +\end{verbatim} +\end{vindent} + +\subsubsection{Unused alias variables} +\index{unused variables} +An unused alias variable is a variable defined in a probe alias, usually +as one of a group of \texttt{var = \$var} assignments, which is not actually +used by the script probe that instantiates the alias. These variables are +discarded. + +\subsection{Variables\label{sub:Variables}} +\index{variables} +Identifiers for variables and functions are alphanumeric sequences, and may +include the underscore (\_) and the dollar sign (\$) characters. They may +not start with a plain digit. Each variable is by default local to the probe +or function statement block where it is mentioned, and therefore its scope +and lifetime is limited to a particular probe or function invocation. Scalar +variables are implicitly typed as either string or integer. Associative arrays +also have a string or integer value, and a tuple of strings or integers serves +as a key. Arrays must be declared as global. Local arrays\index{local arrays} +are not allowed. + +The translator performs \emph{type inference} on all identifiers, including +array indexes and function parameters. 
Inconsistent type-related use of identifiers +results in an error. + +Variables may be declared global. Global variables are shared among all probes +and remain instantiated as long as the SystemTap session. There is one namespace +for all global variables, regardless of the script file in which they are +found. Because of possible concurrency limits, such as multiple probe handlers, +each global variable used by a probe is automatically read- or write-locked +while the handler is running. A global declaration may be written at the +outermost level anywhere in a script file, not just within a block of code. +The following declaration marks \texttt{var1} and \texttt{var2} as global. +The translator will infer a value type for each, and if the variable is used +as an array, its key types. + +\begin{vindent} +\begin{verbatim} +global var1[=<value>], var2[=<value>] +\end{verbatim} +\end{vindent} + +\subsection{Auxiliary functions\label{sub:Auxiliary-functions}} +\index{auxiliary functions} +General syntax: + +\begin{vindent} +\begin{verbatim} +function <name>[:<type>] ( <arg1>[:<type>], ... ) { <stmts> } +\end{verbatim} +\end{vindent} +SystemTap scripts may define subroutines to factor out common work. Functions +may take any number of scalar arguments, and must return a single scalar +value. Scalars in this context are integers or strings. For more information +on scalars, see Section~\ref{sub:Variables} and Section~\ref{sub:Data-types}\texttt{.} +The following is an example function declaration. + +\begin{vindent} +\begin{verbatim} +function thisfn (arg1, arg2) { + return arg1 + arg2 +} +\end{verbatim} +\end{vindent} +Note the general absence of type declarations, which are inferred by the +translator. If desired, a function definition may include explicit type declarations +for its return value, its arguments, or both. This is helpful for embedded-C +functions. In the following example, the type inference engine need only +infer the type of arg2, a string. 
+ +\begin{vindent} +\begin{verbatim} +function thatfn:string(arg1:long, arg2) { + return sprintf("%d%s", arg1, arg2) +} +\end{verbatim} +\end{vindent} +Functions may call others or themselves recursively, up to a fixed nesting +limit. See Section~\ref{sub:SystemTap-safety}. + + +\subsection{Embedded C\label{sub:Embedded-C}} +\index{embedded C} +SystemTap supports a \emph{guru\index{guru mode} mode} where script safety +features such as code and data memory reference protection are removed. Guru +mode is set by passing the ''-g'' flag to the stap command. When in guru +mode, the translator accepts embedded code enclosed between {}``\%\{'' +and {}``\%\}'' markers in the script file. Embedded code is transcribed +verbatim, without analysis, in sequence, into generated C code. At the outermost +level of a script, guru mode may be useful to add \#include instructions, +or any auxiliary definitions for use by other embedded code. + + +\subsection{Embedded C functions} + +General syntax: + +\begin{vindent} +\begin{verbatim} +function <name>:<type> ( <arg1>:<type>, ... ) %{ <C_stmts> %} +\end{verbatim} +\end{vindent} +Embedded code is permitted in a function body. In that case, the script language +body is replaced entirely by a piece of C code enclosed between \%\{ and +\%\} markers. The enclosed code may do anything reasonable and safe as allowed +by the parser. + +There are a number of undocumented but complex safety constraints on concurrency, +resource consumption and runtime limits that are applied to code written +in the SystemTap language. These constraints are not applied to embedded +C code, so use such code with caution as it is used verbatim. Be especially +careful when dereferencing pointers. Use the kread() macro to dereference +any pointers that could potentially be invalid or dangerous. If you are unsure, +err on the side of caution and use kread(). The kread() macro is one of the +safety mechanisms used in code generated by embedded C. 
It protects against +pointer accesses that could crash the system. + +For example, to access the pointer chain \texttt{name = skb->dev->name} in +embedded C, use the following code. + +\begin{vindent} +\begin{verbatim} +struct net_device *dev; +char *name; +dev = kread(&(skb->dev)); +name = kread(&(dev->name)); +\end{verbatim} +\end{vindent} +The memory locations reserved for input and output values are provided to +a function using a macro named \texttt{THIS}\index{THIS}. The following +are examples. + +\begin{vindent} +\begin{verbatim} +function add_one (val) %{ + THIS->__retvalue = THIS->val + 1; +%} +function add_one_str (val) %{ + strlcpy (THIS->__retvalue, THIS->val, MAXSTRINGLEN); + strlcat (THIS->__retvalue, "one", MAXSTRINGLEN); +%} +\end{verbatim} +\end{vindent} +The function argument and return value types must be inferred by the translator +from the call sites in order for this method to work. You should examine +C code generated for ordinary script language functions to write compatible +embedded-C. Note that all SystemTap functions and probes run with interrupts +disabled, thus you cannot call functions that might sleep from within embedded +C. + +\section{Probe points\label{sec:Probe-Points}} +\index{probe points} +\subsection{General syntax} +\index{probe syntax} +The general probe point syntax is a dotted-symbol sequence. This divides +the event namespace into parts, analogous to the style of the Domain Name +System. Each component identifier is parameterized by a string or number +literal, with a syntax analogous to a function call. + +The following are all syntactically valid probe points. + +\begin{vindent} +\begin{verbatim} +kernel.function("foo") +kernel.function("foo").return +module("ext3").function("ext3_*") +kernel.function("no_such_function") ?
+syscall.* +end +timer.ms(5000) +\end{verbatim} +\end{vindent} +Probes may be broadly classified into \emph{synchronous}\index{synchronous} +or \emph{asynchronous}.\index{asynchronous} A synchronous event occurs when +any processor executes an instruction matched by the specification. This +gives these probes a reference point (instruction address) from which more +contextual data may be available. Other families of probe points refer to +asynchronous events such as timers, where no fixed reference point is related. +Each probe point specification may match multiple locations, such as by using +wildcards or aliases, and all are probed. A probe declaration may contain +several specifications separated by commas, which are all probed. + +\subsubsection{Prefixes} +\index{prefixes} +Prefixes specify the probe target, such as \textbf{kernel}, \textbf{module}, +\textbf{timer}, and so on. + +\subsubsection{Suffixes} +\index{suffixes} +Suffixes further qualify the point to probe, such as \textbf{.return} for the +exit point of a probed function. The absence of a suffix implies the function +entry point. + +\subsubsection{Wildcarded file names, function names} +\index{wildcards} +A component may include an asterisk ({*}) character, which expands to other +matching probe points. An example follows. + +\begin{vindent} +\begin{verbatim} +kernel.syscall.* +kernel.function("sys_*") +\end{verbatim} +\end{vindent} + +\subsubsection{Optional probe points\label{sub:Optional-probe-points}} +\index{?} +A probe point may be followed by a question mark (?) character, to indicate +that it is optional, and that no error should result if it fails to expand. +This effect passes down through all levels of alias or wildcard expansion. + +The following is the general syntax. + +\begin{vindent} +\begin{verbatim} +kernel.function("no_such_function") ?
+\end{verbatim} +\end{vindent} + +\subsection{Built-in probe point types (DWARF probes)} +\index{built-in probes} +\index{dwarf probes} +This family of probe points uses symbolic debugging information for the target +kernel or module, as may be found in executables that have not +been stripped, or in the separate \textbf{debuginfo} packages. They allow +logical placement of probes into the execution path of the target +by specifying a set of points in the source or object code. When a matching +statement executes on any processor, the probe handler is run in that context. + +Points in a kernel are identified by module, source file, line number, function +name or some combination of these. + +Here is a list of probe point specifications currently supported: + +\begin{vindent} +\begin{verbatim} +kernel.function(PATTERN) +kernel.function(PATTERN).call +kernel.function(PATTERN).return +kernel.function(PATTERN).return.maxactive(VALUE) +kernel.function(PATTERN).inline +module(MPATTERN).function(PATTERN) +module(MPATTERN).function(PATTERN).call +module(MPATTERN).function(PATTERN).return.maxactive(VALUE) +module(MPATTERN).function(PATTERN).inline +kernel.statement(PATTERN) +kernel.statement(ADDRESS).absolute +module(MPATTERN).statement(PATTERN) +\end{verbatim} +\end{vindent} + +The \textbf{.function} variant places a probe near the beginning of the named +function, so that parameters are available as context variables. + +The \textbf{.return} variant places a probe at the moment of return from the named +function, so the return value is available as the \$return context variable. +The entry parameters are also available, though the function may have changed +their values. Return probes may be further qualified with \textbf{.maxactive}, +which specifies how many instances of the specified function can be probed simultaneously. +You can leave off \textbf{.maxactive} in most cases, as the default should be sufficient. 
+However, if you notice an excessive number of skipped probes, try setting \textbf{.maxactive} +to incrementally higher values to see if the number of skipped probes decreases. + +The \textbf{.inline} modifier for \textbf{.function} filters the results to include only +instances of inlined functions. The \textbf{.call} modifier selects the opposite subset. +Inline functions do not have an identifiable return point, so \textbf{.return} +is not supported on \textbf{.inline} probes. + +The \textbf{.statement} variant places a probe at the exact spot, exposing those local +variables that are visible there. + +In the above probe descriptions, MPATTERN stands for a string literal +that identifies the loaded kernel module of interest. It may include asterisk +({*}), square brackets \char`\"{}{[}]\char`\"{}, and question mark (?) wildcards. +PATTERN stands for a string literal that identifies a point in the program. +It is composed of three parts: + +\begin{enumerate} +\item The first part is the name of a function, as would appear in the nm program's +output. This part may use the asterisk and question mark wildcard operators +to match multiple names. +\item The second part is optional, and begins with the at sign (@) character. +It is followed by the path to the source file containing the function, +which may include a wildcard pattern, such as mm/slab{*}. +In most cases, the path should be relative to the top of the +Linux source directory, although an absolute path may be necessary for some kernels. +If a relative pathname doesn't work, try an absolute one. +\item The third part is optional if the file name part was given. It identifies +the line number in the source file, preceded by a colon. +\end{enumerate} +Alternatively, specify PATTERN as a numeric constant to indicate a relative +module address or an absolute kernel address.
+ +Some of the source-level variables, such as function parameters, locals, +or globals visible in the compilation unit, are visible to probe handlers. +Refer to these variables by prefixing their name with a dollar sign within +the scripts. In addition, a special syntax allows limited traversal of structures, +pointers, and arrays. + +\texttt{\$var} refers to an in-scope variable var. If it is a type similar +to an integer, it will be cast to a 64-bit integer for script use. Pointers +similar to a string (char {*}) are copied to SystemTap string values by the +kernel\_string() or user\_string() functions. + +\texttt{\$var->field} traverses a structure's field. The indirection operator +may be repeated to follow additional levels of pointers. + +\texttt{\$var{[}N]} indexes into an array. The index is given with a literal +number. + +\subsubsection{kernel.function, module().function} +\index{kernel.function} +\index{module().function} +The \textbf{.function} variant places a probe near the beginning of the named function, +so that parameters are available as context variables. + +General syntax: + +\begin{vindent} +\begin{verbatim} +kernel.function("func[@file]") +module("modname").function("func[@file]") +\end{verbatim} +\end{vindent} +Examples: + +\begin{vindent} +\begin{verbatim} +# Refers to all kernel functions with "init" or "exit" +# in the name: +kernel.function("*init*"), kernel.function("*exit*") + +# Refers to any functions within the "kernel/sched.c" +# file that span line 240: +kernel.function("*@kernel/sched.c:240") + +# Refers to all functions in the ext3 module: +module("ext3").function("*") +\end{verbatim} +\end{vindent} + +\subsubsection{kernel.statement, module().statement} +\index{kernel.statement} +\index{module().statement} +The \textbf{.statement} variant places a probe at the exact spot, exposing those local +variables that are visible there.
+ +General syntax: + +\begin{vindent} +\begin{verbatim} +kernel.statement("func@file:linenumber") +module("modname").statement("func@file:linenumber") +\end{verbatim} +\end{vindent} +Example: + +\begin{vindent} +\begin{verbatim} +# Refers to the statement at line 2917 within the +# kernel/sched.c file: +kernel.statement("*@kernel/sched.c:2917") +\end{verbatim} +\end{vindent} + +\begin{comment} +\subsection{Marker probes} + +This family of probe points connects to static probe markers inserted into +the kernel or a module. These markers are special macro calls in the kernel +that make probing faster and more reliable than with DWARF-based probes. +DWARF debugging information is not required to use probe markers. + +Marker probe points begin with a kernel or module(\char`\"{}\emph{name}\char`\"{}) +prefix, the same as DWARF probes. This prefix identifies the source of the +symbol table used for finding markers. The suffix names the marker itself: +mark(\char`\"{}\emph{name}\char`\"{}). The marker name string, which may +contain wildcard characters, is matched against the names given to the marker +macros when the kernel or module was compiled. + +The handler associated with a marker probe reads any optional parameters +specified at the macro call site named \$arg1 through \$argNN, where NN is +the number of parameters supplied by the macro. Number and string parameters +are passed in a type-safe manner. +\end{comment} + +\subsection{Timer probes} +\index{timer probes} +You can use intervals defined by the standard kernel jiffies\index{jiffies} +timer to trigger probe handlers asynchronously. A \emph{jiffy} is a kernel-defined +unit of time typically between 1 and 60 msec. Two probe point variants are +supported by the translator: + +\begin{vindent} +\begin{verbatim} +timer.jiffies(N) +timer.jiffies(N).randomize(M) +\end{verbatim} +\end{vindent} +The probe handler runs every N jiffies. 
If the \texttt{randomize}\index{randomize} +component is given, a linearly distributed random value in the range {[}-M +\ldots{} +M] is added to N every time the handler executes. N is restricted +to a reasonable range (1 to approximately 1,000,000), and M is restricted +to be less than N. There are no target variables provided in either context. +Probes can be run concurrently on multiple processors. + +Intervals may be specified in units of time. There are two probe point variants +similar to the jiffies timer: + +\begin{vindent} +\begin{verbatim} +timer.ms(N) +timer.ms(N).randomize(M) +\end{verbatim} +\end{vindent} +Here, N and M are specified in milliseconds\index{milliseconds}, but the +full options for units are seconds (s or sec), milliseconds (ms or msec), +microseconds (us or usec), nanoseconds (ns or nsec), and hertz (hz). Randomization +is not supported for hertz timers. + +The resolution of the timers depends on the target kernel. For kernels prior +to 2.6.17, timers are limited to jiffies resolution, so intervals are rounded +up to the nearest jiffies interval. After 2.6.17, the implementation uses +hrtimers for tighter precision, though the resulting resolution will be dependent +upon architecture. In either case, if the randomize component is given, then +the random value will be added to the interval before any rounding occurs. + +Profiling timers are available to provide probes that execute on all CPUs +at each system tick. This probe takes no parameters, as follows. + +\begin{vindent} +\begin{verbatim} +timer.profile +\end{verbatim} +\end{vindent} +Full context information of the interrupted process is available, making +this probe suitable for implementing a time-based sampling profiler. + +The following is an example of timer usage. 
+ +\begin{vindent} +\begin{verbatim} +# Refers to a periodic interrupt, every 1000 jiffies: +timer.jiffies(1000) + +# Fires every 5 seconds: +timer.sec(5) + +# Refers to a periodic interrupt, every 1000 +/- 200 jiffies: +timer.jiffies(1000).randomize(200) +\end{verbatim} +\end{vindent} + +\subsection{Return probes} +\index{return probes} +The \texttt{.return} variant places a probe at the moment of return from +the named function, so that the return value is available as the \$return +context variable. The entry parameters are also accessible in the context +of the return probe, though their values may have been changed by the function. +Inline functions do not have an identifiable return point, so \texttt{.return} +is not supported on \texttt{.inline} probes. + + +\subsection{Special probe points} + +The probe points \texttt{begin} and \texttt{end} are defined by the translator +to refer to the time of session startup and shutdown. There are no target +variables available in either context. + + +\subsubsection{begin} +\index{begin} +The \texttt{begin} probe is the start of the SystemTap session. All \texttt{begin} +probe handlers are run during the startup of the session. All global variables +must be declared prior to this point. + + +\subsubsection{end} +\index{end} +The \texttt{end} probe is the end of the SystemTap session. All \texttt{end} +probes are run during the normal shutdown of a session, such as in the aftermath +of an \texttt{exit} function call, or an interruption from the user. In the +case of a shutdown triggered by error, \texttt{end} probes are not run. + + +\subsubsection{begin and end probe sequence} +\index{sequence} +\texttt{begin} and \texttt{end} probes are specified with an optional sequence +number that controls the order in which they are run. If no sequence number +is provided, the sequence number defaults to zero and probes are run in the +order that they occur in the script file.
Sequence numbers may be either +positive or negative, and are especially useful for tapset writers who want +to do initialization in a \texttt{begin} probe. The following are examples. + +\begin{vindent} +\begin{verbatim} +# In a tapset file: +probe begin(-1000) { ... } + +# In a user script: +probe begin { ... } +\end{verbatim} +\end{vindent} +The user script \texttt{begin} probe defaults to sequence number zero, so +the tapset \texttt{begin} probe will run first. + + +\subsubsection{never} +\index{never} +The \texttt{never} probe point is defined by the translator to mean \emph{never}. +Its statements are analyzed for symbol and type correctness, but its probe +handler is never run. This probe point may be useful in conjunction with +optional probes. See Section~\ref{sub:Optional-probe-points}. + + +\begin{comment} % Comment out until perfmon code is reactivated +\subsection{Probes to monitor performance} + +The perfmon family of probe points is used to access the performance monitoring +hardware available in modern processors. These probe points require perfmon2 +support in the kernel to access the hardware. + +Performance monitor hardware points have a \texttt{perfmon} prefix. The suffix +names the event being counted, for example \texttt{counter(event)}. The event +names are specific to the processor implementation, except for generic cycle +and instructions events, which are available on all processors. The probe +\texttt{perfmon.counter(event)} starts a counter on the processor which counts +the number of events that occur on that processor. For more details about +the performance monitoring events available on a specific processor, see +the help text returned by typing the perfmon2 command \texttt{pfmon -l.} + +\subsubsection{\$counter} + +\$counter is a handle used in the body of a probe for operations involving +the counter associated with the probe. + +\subsubsection{read\_counter} + +read\_counter is a function passed to the handle for a perfmon probe. 
It +returns the current count for the event. +\end{comment} + +\section{Language elements\label{sec:Language-Elements}} + + +\subsection{Identifiers} +\index{identifiers} +\emph{Identifiers} are used to name variables and functions. They are an +alphanumeric sequence that may include the underscore (\_) and dollar sign +(\$) characters. They have the same syntax as C identifiers, except that +the dollar sign is also a legal character. Identifiers that begin with a +dollar sign are interpreted as references to variables in the target software, +rather than to SystemTap script variables. Identifiers may not start with +a plain digit. + + +\subsection{Data types\label{sub:Data-types}} +\index{data types} +The SystemTap language includes a small number of data types, but no type +declarations. A variable's type is inferred\index{inference} from its use. +To support this inference, the translator enforces consistent typing of function +arguments and return values, array indices and values. There are no implicit +type conversions between strings and numbers. Inconsistent type-related use +of identifiers signals an error. + + +\subsubsection{Numbers} +\index{numbers} +Numbers are 64-bit signed integers. The parser will also accept (and wrap +around) values above positive $2^{63}$. + + +\subsubsection{Literals} +\index{literals} +Literals are either strings or integers. Literals can be expressed as decimal, +octal, or hexadecimal, using C notation. Type suffixes (e.g., \emph{L} or +\emph{U}) are not used. + + +\subsubsection{Integers\label{sub:Integers}} +\index{integers} +Integers are decimal, hexadecimal, or octal, and use the same notation as +in C. Integers are 64-bit signed quantities, although the parser also accepts +(and wraps around) values above positive $2^{63}$. + + +\subsubsection{Strings\label{sub:Strings}} +\index{strings} +Strings are enclosed in quotation marks ({}``string''), and pass through +standard C escape codes with backslashes. 
Strings are limited in length to +MAXSTRINGLEN. For more information about this and other limits, see Section~\ref{sub:SystemTap-safety}. + + +\subsubsection{Associative arrays} + +See Section~\ref{sec:Associative-Arrays}. + + +\subsubsection{Statistics} + +See Section~\ref{sec:Statistics}. + + +\subsection{Semicolons} +\index{;} +The semicolon is the null statement, or do-nothing statement. It is optional, +and useful as a separator between statements to improve detection of syntax +errors and to reduce ambiguities in grammar. + + +\subsection{Comments} +\index{comments} +Three forms of comments are supported, as follows. + +\begin{vindent} +\begin{verbatim} +# ... shell style, to the end of line +// ... C++ style, to the end of line +/* ... C style ... */ +\end{verbatim} +\end{vindent} + +\subsection{Whitespace} +\index{whitespace} +As in C, spaces, tabs, returns, newlines, and comments are treated as whitespace. +Whitespace is ignored by the parser. + + +\subsection{Expressions} +\index{expressions} +SystemTap supports a number of operators that use the same general syntax, +semantics, and precedence as in C and awk. Arithmetic is performed per C +rules for signed integers. If the parser detects division by zero or an overflow, +it generates an error. The following subsections list these operators. + + +\subsubsection{Binary numeric operators} +\index{binary} +\texttt{{*} / \% + - >\,{}> <\,{}< \& \textasciicircum{} +| \&\& ||} + + +\subsubsection{Binary string operators} +\index{binary} +\texttt{\textbf{.}} (string concatenation) + + +\subsubsection{Numeric assignment operators} +\index{numeric} +\texttt{= {*}= /= \%= += -= >\,{}>= <\,{}<= +\&= \textasciicircum{}= |=} + + +\subsubsection{String assignment operators} + +\texttt{= .=} + + +\subsubsection{Unary numeric operators} +\index{unary} +\texttt{+ - !
\textasciitilde{} ++ -{}-} + + +\subsubsection{Binary numeric or string comparison operators} +\index{comparison} +\texttt{< > <= >= == !=} + + +\subsubsection{Ternary operator\label{sub:Ternary-operator}} +\index{?} +\texttt{cond ? exp1 : exp2} + + +\subsubsection{Grouping operator} +\index{grouping} +\texttt{( exp )} + + +\subsubsection{Function call} +\index{fn} +General syntax: + +\texttt{fn ({[} arg1, arg2, ... ])} + + +\subsubsection{\$ptr-\textgreater member} +\index{pointer} +\texttt{ptr} is a kernel pointer available in a probed context. + + +\subsubsection{\textless value\textgreater\ in \textless array\_name\textgreater} +\index{index} +This expression evaluates to true if the array contains an element with the +specified index. + + +\subsubsection{{[} \textless value\textgreater, ... ] in \textless array\_name\textgreater} + +The number of index values must match the number of indexes previously specified. + + +\subsection{Literals passed in from the stap command line\label{sub:Literals-passed-in}} +\index{literals} +\emph{Literals} are either strings enclosed in double quotes ('' '') or +integers. For information about integers, see Section~\ref{sub:Integers}. +For information about strings, see Section~\ref{sub:Strings}. + +Script arguments at the end of a command line are expanded as literals. You +can use these in all contexts where literals are accepted. A reference to +a nonexistent argument number is an error. + + +\subsubsection{\$1 \ldots{} \$\textless NN\textgreater\ for integers} +\index{\$} +Use \texttt{\$1 \ldots{} \$<NN>} for casting as a numeric literal. + + +\subsubsection{@1 \ldots{} @\textless NN\textgreater\ for strings} + +Use \texttt{@1 \ldots{} @<NN>} for casting as a string literal. 
+ + +\subsubsection{Examples} + +For example, if the following script named example.stp + +\begin{vindent} +\begin{verbatim} +probe begin { printf("%d, %s\n", $1, @2) } +\end{verbatim} +\end{vindent} +is invoked as follows + +\begin{vindent} +\begin{verbatim} +# stap example.stp 10 mystring +\end{verbatim} +\end{vindent} +then 10 is substituted for \$1 and \char`\"{}mystring\char`\"{} for @2. The +output will be + +\begin{vindent} +\begin{verbatim} +10, mystring +\end{verbatim} +\end{vindent} + +\subsection{Conditional compilation} + + +\subsubsection{Conditions} +\index{conditions} +One of the steps of parsing is a simple conditional preprocessing stage. +The general form of this is similar to the ternary operator (Section~\ref{sub:Ternary-operator}). + +\begin{vindent} +\begin{verbatim} +%( CONDITION %? TRUE-TOKENS %) +%( CONDITION %? TRUE-TOKENS %: FALSE-TOKENS %) +\end{verbatim} +\end{vindent} +The CONDITION is a limited expression whose format is determined by its first +keyword. The following is the general syntax. + +\begin{vindent} +\begin{verbatim} +%( <condition> %? <code> [ %: <code> ] %) +\end{verbatim} +\end{vindent} + +\subsubsection{Conditions based on kernel version: kernel\_v, kernel\_vr} +\index{kernel version} +\index{kernel\_vr} +\index{kernel\_v} +If the first part of a conditional expression is the identifier \texttt{kernel\_v} +or \texttt{kernel\_vr}, the second part must be one of the six standard numeric +comparison operators {}``\textless'', {}``\textless ='', {}``=='', {}``!='', {}``\textgreater'', +or {}``\textgreater ='', +and the third part must be a string literal that contains an RPM-style version-release +value. The condition is true if the version of the target kernel (as +optionally overridden by the \textbf{-r} option) compares to the given version +string as the operator specifies. The comparison is performed by the glibc function strverscmp. + +\texttt{kernel\_v} refers to the kernel version number only, such as {}``2.6.13''.
+ +\texttt{kernel\_vr} refers to the kernel version number including the release +code suffix, such as {}``2.6.13-1.322FC3smp''. + + +\subsubsection{Conditions based on architecture: arch} +\index{arch} +If the first part of the conditional expression is the identifier \texttt{arch} +which refers to the processor architecture, then the second part is a string +comparison operator ''=='' or ''!='', and the third part is a string +literal for matching it. This comparison is a simple string equality or inequality. +The currently supported architecture strings are i386, i686, x86\_64, ia64, +s390x and ppc64. + + +\subsubsection{True and False Tokens} +\index{tokens} +TRUE-TOKENS and FALSE-TOKENS are zero or more general parser tokens, possibly +including nested preprocessor conditionals, that are pasted into the input +stream if the condition is true or false. For example, the following code +induces a parse error unless the target kernel version is newer than 2.6.5. + +\begin{vindent} +\begin{verbatim} +%( kernel_v <= "2.6.5" %? **ERROR** %) # invalid token sequence +\end{verbatim} +\end{vindent} +The following code adapts to hypothetical kernel version drift. + +\begin{vindent} +\begin{verbatim} +probe kernel.function ( + %( kernel_v <= "2.6.12" %? "__mm_do_fault" %: + %( kernel_vr == "2.6.13-1.8273FC3smp" %? "do_page_fault" %: UNSUPPORTED %) + %)) { /* ... */ } + +%( arch == "ia64" %? + probe syscall.vliw = kernel.function("vliw_widget") {} +%) +\end{verbatim} +\end{vindent} + +\section{Statement types\label{sec:Statement-Types}} + +Statements enable procedural control flow within functions and probe handlers. +The total number of statements executed in response to any single probe event +is limited to MAXACTION, which defaults to 1000. See Section~\ref{sub:SystemTap-safety}. 
+ + +\subsection{break and continue} +\index{break} +\index{continue} +Use \texttt{break} or \texttt{continue} to exit or iterate the innermost +nesting loop statement, such as within a \texttt{while, for,} or \texttt{foreach} +statement. The syntax and semantics are the same as those used in C. + + +\subsection{delete} +\index{delete} +\texttt{delete} removes an element. + +The following statement removes from ARRAY the element specified by the index +tuple. The value will no longer be available, and subsequent iterations will +not report the element. It is not an error to delete an element that does +not exist. + +\begin{vindent} +\begin{verbatim} +delete ARRAY[INDEX1, INDEX2, ...] +\end{verbatim} +\end{vindent} +The following syntax removes all elements from ARRAY: + +\begin{vindent} +\begin{verbatim} +delete ARRAY +\end{verbatim} +\end{vindent} +The following statement removes the value of SCALAR. Integers and strings +are cleared to zero and null (\char`\"{}\char`\"{}) respectively, while statistics +are reset to their initial empty state. + +\begin{vindent} +\begin{verbatim} +delete SCALAR +\end{verbatim} +\end{vindent} + +\subsection{do} +\index{do} +The \texttt{do} statement has the same syntax and semantics as in C. + +\begin{vindent} +\begin{verbatim} +do STMT while (EXP) +\end{verbatim} +\end{vindent} + +\subsection{EXP (expression)} +\index{expression} +An \texttt{expression} executes a string- or integer-valued expression and +discards the value. + + +\subsection{for} +\index{for} +General syntax: +\begin{vindent} +\begin{verbatim} +for (EXP1; EXP2; EXP3) STMT +\end{verbatim} +\end{vindent} +The \texttt{for} statement is similar to the \texttt{for} statement in C. +The \texttt{for} expression executes EXP1 as initialization. While EXP2 is +non-zero, it executes STMT, then the iteration expression EXP3. 
+ +\subsection{foreach\label{sub:foreach}} +\index{foreach} +General syntax: +\begin{vindent} +\begin{verbatim} +foreach (VAR in ARRAY) STMT +\end{verbatim} +\end{vindent} +The \texttt{foreach} statement loops over each element of a named global array, assigning +the current key to VAR. The array must not be modified within the statement. +If you add a single plus (+) or minus (-) operator after the VAR or the ARRAY +identifier, the iteration order will be sorted by the ascending or descending +index or value. + +The following statement behaves the same as the first example, except it +is used when an array is indexed with a tuple of keys. Use a sorting suffix +on at most one VAR or ARRAY identifier. + +\begin{vindent} +\begin{verbatim} +foreach ([VAR1, VAR2, ...] in ARRAY) STMT +\end{verbatim} +\end{vindent} +The following statement is the same as the first example, except that the +\texttt{limit} keyword limits the number of loop iterations to EXP times. +EXP is evaluated once at the beginning of the loop. + +\begin{vindent} +\begin{verbatim} +foreach (VAR in ARRAY limit EXP) STMT +\end{verbatim} +\end{vindent} + +\subsection{if} +\index{if} +General syntax: + +\begin{vindent} +\begin{verbatim} +if (EXP) STMT1 [ else STMT2 ] +\end{verbatim} +\end{vindent} +The \texttt{if} statement compares an integer-valued EXP to zero. It executes +the first STMT if non-zero, or the second STMT if zero. + +The \texttt{if} command has the same syntax and semantics as used in C. + + +\subsection{next} +\index{next} +The \texttt{next} statement returns immediately from the enclosing probe +handler. + + +\subsection{; (null statement)} +\index{;} +\index{null statement} +General syntax: + +\begin{vindent} +\begin{verbatim} +statement1 +; +statement2 +\end{verbatim} +\end{vindent} +The semicolon represents the null statement, or do nothing. It is useful +as an optional separator between statements to improve syntax error detection +and to handle certain grammar ambiguities. 
+ + +\subsection{return} +\index{return} +General syntax: + +\begin{vindent} +\begin{verbatim} +return EXP +\end{verbatim} +\end{vindent} +The \texttt{return} statement returns the EXP value from the enclosing function. +If the value of the function is not returned, then a return statement is +not needed, and the function will have a special \emph{unknown} type with +no return value. + +\subsection{\{ \} (statement block)} +\index{\{ \}} +\index{statement block} +This is the statement block with zero or more statements enclosed within +braces. The following is the general syntax: + +\begin{vindent} +\begin{verbatim} +{ STMT1 STMT2 ... } +\end{verbatim} +\end{vindent} +The statement block executes each statement in sequence in the block. Separators +or terminators are generally not necessary between statements. The statement +block uses the same syntax and semantics as in C. + + +\subsection{while} +\index{while} +General syntax: + +\begin{vindent} +\begin{verbatim} +while (EXP) STMT +\end{verbatim} +\end{vindent} +The \texttt{while} statement uses the same syntax and semantics as in C. +In the statement above, while the integer-valued EXP evaluates to non-zero, +STMT is executed repeatedly. + + +\section{Associative arrays\label{sec:Associative-Arrays}} +\index{associative arrays} +Associative arrays are implemented as hash tables with a maximum size set +at startup. Associative arrays are too large to be created dynamically for +individual probe handler runs, so they must be declared as global. The basic +operations for arrays are setting and looking up elements. These operations +are expressed in awk syntax: the array name followed by an opening bracket +({[}), a comma-separated list of up to five index expressions, and +a closing bracket (]). Each index expression may be a string or a number, +as long as it is consistently typed throughout the script.
+ + +\subsection{Examples} + +\begin{vindent} +\begin{verbatim} +# Increment the named array slot: +foo [4,"hello"] ++ + +# Update a statistic: +processusage [uid(),execname()] ++ + +# Set a timestamp reference point: +times [tid()] = get_cycles() + +# Compute a timestamp delta: +delta = get_cycles() - times [tid()] +\end{verbatim} +\end{vindent} + +\subsection{Types of values} + +Array elements may be set to a number or a string. The type must be consistent +throughout the use of the array. The first assignment to the array defines +the type of the elements. Unset array elements may be fetched and return +a null value (zero or empty string) as appropriate, but they are not seen +by a membership test. + + +\subsection{Array capacity} + +Array sizes can be specified explicitly or allowed to default to the maximum +size as defined by MAXMAPENTRIES. See Section~\ref{sub:SystemTap-safety} +for details on changing MAXMAPENTRIES. + +You can explicitly specify the size of an array as follows: + +\begin{vindent} +\begin{verbatim} +global ARRAY[<size>] +\end{verbatim} +\end{vindent} +If you do not specify the size parameter, then the array is created to hold +MAXMAPENTRIES number of elements. + + +\subsection{Iteration, foreach} +\index{foreach} +Like awk, SystemTap's foreach creates a loop that iterates over key tuples +of an array, not only values. The iteration may be sorted by any single key +or a value by adding an extra plus symbol (+) or minus symbol (-) to the +code. The following are examples. + +\begin{vindent} +\begin{verbatim} +# Simple loop in arbitrary sequence: +foreach ([a,b] in foo) + fuss_with(foo[a,b]) + +# Loop in increasing sequence of value: +foreach ([a,b] in foo+) { ... } + +# Loop in decreasing sequence of first key: +foreach ([a-,b] in foo) { ... } +\end{verbatim} +\end{vindent} +The \texttt{break} and \texttt{continue} statements also work inside foreach +loops.
Since arrays can be large but probe handlers must execute quickly, +you should write scripts that exit iteration early, if possible. For simplicity, +SystemTap forbids any modification of an array during iteration with a foreach. + + +\section{Statistics (aggregates)\label{sec:Statistics}} +\index{aggregates} +Aggregate instances are used to collect statistics on numerical values, when +it is important to accumulate new data quickly and in large volume. These +instances operate without exclusive locks, and store only aggregated stream +statistics. Aggregates make sense only for global variables. They are stored +individually or as elements of an array. + +\subsection{The aggregation (\textless\hspace{1 sp}\textless\hspace{1 sp}\textless) operator} +\index{\textless\hspace{1 sp}\textless\hspace{1 sp}\textless} +The aggregation operator is {}``\textless\hspace{1 sp}\textless\hspace{1 sp}\textless'', +and its effect is similar to an assignment or a C++ output streaming operation. +The left operand specifies a scalar or array-index \emph{l-value}, which +must be declared global. The right operand is a numeric expression. The meaning +is intuitive: add the given number to the set of numbers to compute their +statistics. The specific list of statistics to gather is given separately +by the extraction functions. The following is an example. + +\begin{vindent} +\begin{verbatim} +a <<< delta_timestamp +writes[execname()] <<< count +\end{verbatim} +\end{vindent} + +\subsection{Extraction functions} +\index{extraction} +For each instance of a distinct extraction function operating on a given +identifier, the translator computes a set of statistics. With each execution +of an extraction function, the aggregation is computed for that moment across +all processors. The first argument of each function is the same style of +l-value as used on the left side of the aggregation operation. 
+ + +\subsection{Integer extractors} + +The following functions provide methods to extract information about integer +values. + + +\subsubsection{@count(s)} +\index{count} +This statement returns the number of all values accumulated into s. + + +\subsubsection{@sum(s)} +\index{sum} +This statement returns the total of all values accumulated into s. + + +\subsubsection{@min(s)} +\index{min} +This statement returns the minimum of all values accumulated into s. + + +\subsubsection{@max(s)} +\index{max} +This statement returns the maximum of all values accumulated into s. + + +\subsubsection{@avg(s)} +\index{avg} +This statement returns the average of all values accumulated into s. + + +\subsection{Histogram extractors} +\index{histograms} +The following functions provide methods to extract histogram information. +Printing a histogram with the print family of functions renders a histogram +object as a tabular "ASCII art" bar chart. + +\subsubsection{@hist\_linear} +\index{hist\_linear} +The statement \texttt{@hist\_linear(v,L,H,W)} represents a linear histogram +of \texttt{v}, where \emph{L} and \emph{H} represent the lower and upper end of +a range of values and \emph{W} represents the width (or size) of each bucket +within the range. The low and high values can be negative, but the overall +difference (high minus low) must be positive. The width parameter must also +be positive. + +In the output, a range of consecutive empty buckets may be replaced with a tilde +(\textasciitilde{}) character. This can be controlled on the command line +with -DHIST\_ELISION=\textless\hspace{1 sp}num\textgreater\hspace{1 sp}, +where \textless\hspace{1 sp}num\textgreater\hspace{1 sp} specifies how many +empty buckets at the top and bottom of the range to print. +The default is 2. A \textless\hspace{1 sp}num\textgreater\hspace{1 sp} of 0 +removes all empty buckets. A negative \textless\hspace{1 sp}num\textgreater\hspace{1 sp} +turns off bucket removal altogether. 
+ +For example, if you specify -DHIST\_ELISION=3 and the histogram has 10 +consecutive empty buckets, the first 3 and last 3 empty buckets will +be printed and the middle 4 empty buckets will be represented by a +tilde (\textasciitilde{}). + +The following is an example. + +\begin{vindent} +\begin{verbatim} +global reads +probe netdev.receive { + reads <<< length +} +probe end { + print(@hist_linear(reads, 0, 10240, 200)) +} +\end{verbatim} +\end{vindent} +This generates the following output. + +\pagebreak +\begin{vindent} +\begin{verbatim} +value |-------------------------------------------------- count + 0 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1650 + 200 | 8 + 400 | 0 + 600 | 0 + ~ + 1000 | 0 + 1200 | 0 + 1400 | 1 + 1600 | 0 + 1800 | 0 +\end{verbatim} +\end{vindent} +This shows that 1650 network reads were of a size between 0 and 200 bytes, +8 reads were between 200 and 400 bytes, and 1 read was between +1400 and 1600 bytes. The tilde (\textasciitilde{}) character indicates +that the bucket for 800 was removed because it was empty. +Empty buckets at the upper end were also removed. + +\subsubsection{@hist\_log} +\index{hist\_log} +The statement \texttt{@hist\_log(v)} represents a base-2 logarithmic +histogram. Empty buckets are replaced with a tilde (\textasciitilde{}) +character in the same way as \texttt{@hist\_linear()} (see above). + +The following is an example. + +\begin{vindent} +\begin{verbatim} +global reads +probe netdev.receive { + reads <<< length +} +probe end { + print(@hist_log(reads)) +} +\end{verbatim} +\end{vindent} +This generates the following output. 
+ +\begin{vindent} +\begin{verbatim} +value |-------------------------------------------------- count + 8 | 0 + 16 | 0 + 32 | 254 + 64 | 3 + 128 | 2 + 256 | 2 + 512 | 4 + 1024 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 16689 + 2048 | 0 + 4096 | 0 +\end{verbatim} +\end{vindent} + +\section{Predefined functions\label{sec:Predefined-Functions}} + +Unlike built-in functions, predefined functions are implemented in tapsets. + + +\subsection{Output functions} + +The following sections describe the functions you can use to output data. + + +\subsubsection{error} +\index{error} +General syntax: + +\begin{vindent} +\begin{verbatim} +error:unknown (msg:string) +\end{verbatim} +\end{vindent} +This function logs the given string to the error stream. It appends an implicit +end-of-line. It blocks any further execution of statements in this probe. +If the number of errors exceeds the MAXERRORS parameter, it triggers an \texttt{exit}. + + +\subsubsection{log} +\index{log} +General syntax: + +\begin{vindent} +\begin{verbatim} +log:unknown (msg:string) +log (const char *fmt, ) +\end{verbatim} +\end{vindent} +This function logs data. \texttt{log} sends the message immediately to staprun +and to the bulk transport (relayfs) if it is being used. If the last character +given is not a newline, then one is added. + +This function is not as efficient as printf and should only be used for urgent +messages. + +\subsubsection{print} +\index{print} +General syntax: + +\begin{vindent} +\begin{verbatim} +print:unknown () +\end{verbatim} +\end{vindent} +This function prints a single value of any type. + + +\subsubsection{printf} +\index{printf} +General syntax: + +\begin{vindent} +\begin{verbatim} +printf:unknown (fmt:string, ) +\end{verbatim} +\end{vindent} +The printf function takes a formatting string as an argument, and a number +of values of corresponding types, and prints them all. The format must be a +literal string constant. 
The printf formatting directives are similar to those +of C, except that they are fully checked for type by the translator. + +The formatting string can contain tags that are defined as follows: + +\begin{vindent} +\begin{verbatim} +%[flags][width][.precision][length]specifier +\end{verbatim} +\end{vindent} +Where \texttt{specifier} is required and defines the type and the interpretation +of the value of the corresponding argument. The following table shows the +details of the specifier parameter: + +\begin{table}[H] +\caption{printf specifier values} +\begin{tabular}{|>{\raggedright}p{1in}|>{\raggedright}p{3.5in}|>{\raggedright}p{1.25in}|} +\hline +\textbf{Specifier}& +\textbf{Output}& +\textbf{Example}\tabularnewline +\hline +\hline +d or i& +Signed decimal& +392\tabularnewline +\hline +o& +Unsigned octal& +610\tabularnewline +\hline +s& +String& +sample\tabularnewline +\hline +u& +Unsigned decimal& +7235\tabularnewline +\hline +x& +Unsigned hexadecimal (lowercase letters)& +7fa\tabularnewline +\hline +X& +Unsigned hexadecimal (uppercase letters)& +7FA\tabularnewline +\hline +p& +Pointer address& +0x0000000000bc614e\tabularnewline +\hline +n& +Writes a binary value that is the total length of the string written by printf. +The field width specifies the number of bytes to write. Valid specifications +are \%n, \%1n, \%2n and \%4n. The default is 2.& +See below\tabularnewline +\hline +b& +Writes a binary value as text. The field width specifies the number of bytes +to write. Valid specifications are \%b, \%1b, \%2b, \%4b and \%8b. 
The default +width is 4 (32-bits).& +See below\tabularnewline +\hline +\%& +A \% followed by another \% character will write \% to stdout.& +\%\tabularnewline +\hline +\end{tabular} +\end{table} +The tag can also contain \texttt{flags}, \texttt{width}, \texttt{.precision} +and \texttt{modifiers} sub-specifiers, which are optional and follow these +specifications: + +\begin{table}[H] +\caption{printf flag values} +\begin{tabular}{|>{\raggedright}p{1.5in}|>{\raggedright}p{4.5in}|} +\hline +\textbf{Flags}& +\textbf{Description}\tabularnewline +\hline +\hline +- (minus sign)& +Left-justify within the given field width. Right justification is the default +(see \texttt{width} sub-specifier).\tabularnewline +\hline ++ (plus sign)& +Precede the result with a plus or minus sign even for positive numbers. By +default, only negative numbers are preceded with a minus sign.\tabularnewline +\hline +(space)& +If no sign is going to be written, a blank space is inserted before the value.\tabularnewline +\hline +\#& +Used with \texttt{o}, \texttt{x} or \texttt{X} specifiers the value is preceded +with \texttt{0}, \texttt{0x} or \texttt{0X} respectively for non-zero values.\tabularnewline +\hline +0& +Left-pads the number with zeroes instead of spaces, where padding is specified +(see \texttt{width} sub-specifier).\tabularnewline +\hline +\end{tabular} +\end{table} + +\begin{table}[H] +\caption{printf width values} +\begin{tabular}{|>{\raggedright}p{1.5in}|>{\raggedright}p{4.5in}|} +\hline +\textbf{Width}& +\textbf{Description}\tabularnewline +\hline +\hline +(number)& +Minimum number of characters to be printed. If the value to be printed is +shorter than this number, the result is padded with blank spaces. 
The value +is not truncated even if the result is larger.\tabularnewline +\hline +\end{tabular} +\end{table} + +% +\begin{table}[H] + +\caption{printf precision values} + +\begin{tabular}{|>{\raggedright}p{1.5in}|>{\raggedright}p{4.5in}|} +\hline +\textbf{Precision}& +\textbf{Description}\tabularnewline +\hline +\hline +.number& +For integer specifiers (\texttt{d, i, o, u, x, X}): \texttt{precision} specifies +the minimum number of digits to be written. If the value to be written is +shorter than this number, the result is padded with leading zeros. The value +is not truncated even if the result is longer. A precision of 0 means that +no character is written for the value 0. For s: this is the maximum number +of characters to be printed. By default all characters are printed until +the ending null character is encountered. When no \texttt{precision} is specified, +the default is 1. If the period is specified without an explicit value for +\texttt{precision}, 0 is assumed.\tabularnewline +\hline +\end{tabular} +\end{table} + +\textbf{Binary Write Examples} + +The following is an example of using the binary write functions: + +\begin{vindent} +\begin{verbatim} +probe begin { + for (i = 97; i < 110; i++) + printf("%3d: %1b%1b%1b\n", i, i, i-32, i-64) + exit() +} +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} + 97: aA! 
+ 98: bB" + 99: cC# +100: dD$ +101: eE% +102: fF& +103: gG' +104: hH( +105: iI) +106: jJ* +107: kK+ +108: lL, +109: mM- +\end{verbatim} +\end{vindent} +Another example: + +\begin{vindent} +\begin{verbatim} +stap -e 'probe begin{printf("%1n%b%b", 0xc0dedbad, \ +0x12345678);exit()}' | hexdump -C + +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} +00000000 08 ad db de c0 78 56 34 12 |.....xV4.| +00000009 +\end{verbatim} +\end{vindent} +Another example: + +\begin{vindent} +\begin{verbatim} +probe begin{ + printf("%1b%1b%1blo %1b%1brld\n", 72,101,108,87,111) + exit() +} +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} +Hello World +\end{verbatim} +\end{vindent} + +\subsubsection{printd} +\index{printd} +General syntax: + +\begin{vindent} +\begin{verbatim} +printd:unknown (delimiter:string, ) +\end{verbatim} +\end{vindent} +This function takes a string delimiter and two or more values of any type, then +prints the values with the delimiter interposed. The delimiter must be a +literal string constant. + +For example: +\begin{vindent} +\begin{verbatim} +printd("/", "one", "two", "three", 4, 5, 6) +\end{verbatim} +\end{vindent} +prints: +\begin{vindent} +\begin{verbatim} +one/two/three/4/5/6 +\end{verbatim} +\end{vindent} + +\subsubsection{printdln} +\index{printdln} +General syntax: + +\begin{vindent} +\begin{verbatim} +printdln:unknown () +\end{verbatim} +\end{vindent} +This function operates like \texttt{printd}, but also appends a newline. + +\subsubsection{println} +\index{println} +General syntax: + +\begin{vindent} +\begin{verbatim} +println:unknown () +\end{verbatim} +\end{vindent} +This function operates like \texttt{print}, but also appends a newline. + +\subsubsection{sprint} +\index{sprint} +General syntax: + +\begin{vindent} +\begin{verbatim} +sprint:unknown () +\end{verbatim} +\end{vindent} +This function operates like \texttt{print}, but returns the string rather +than printing it. 
+ +\subsubsection{sprintf} +\index{sprintf} +General syntax: + +\begin{vindent} +\begin{verbatim} +sprintf:unknown (fmt:string, ) +\end{verbatim} +\end{vindent} +This function operates like \texttt{printf}, but returns the formatted string +rather than printing it. + + +\subsubsection{system} +\index{system} +General syntax: + +\begin{vindent} +\begin{verbatim} +system (cmd:string) +\end{verbatim} +\end{vindent} +The system function runs a command on the system. The specified command runs +in the background once the current probe completes. + + +\subsubsection{warn} +\index{warn} +General syntax: + +\begin{vindent} +\begin{verbatim} +warn:unknown (msg:string) +\end{verbatim} +\end{vindent} +This function sends a warning message immediately to staprun. It is also +sent over the bulk transport (relayfs) if it is being used. If the last character +is not a newline, then one is added. + +\subsection{Context at the probe point} + +The following functions provide ways to access the current task context +at a probe point. Note that these may not return correct values when +a probe is hit in interrupt context. + +\subsubsection{backtrace} +\index{backtrace} +General syntax: + +\begin{vindent} +\begin{verbatim} +backtrace:string () +\end{verbatim} +\end{vindent} +Returns a string of hex addresses that are a backtrace of the +stack. The output is truncated to MAXSTRINGLEN. + +\subsubsection{caller} +\index{caller} +General syntax: + +\begin{vindent} +\begin{verbatim} +caller:string() +\end{verbatim} +\end{vindent} +Returns the address and name of the calling function. It works +only for return probes. + +\subsubsection{caller\_addr} +\index{caller\_addr} +General syntax: + +\begin{vindent} +\begin{verbatim} +caller_addr:long () +\end{verbatim} +\end{vindent} +Returns the address of the calling function. It works only +for return probes. 
+ + +\subsubsection{cpu} +\index{cpu} +General syntax: + +\begin{vindent} +\begin{verbatim} +cpu:long () +\end{verbatim} +\end{vindent} +Returns the current cpu number. + + +\subsubsection{egid} +\index{egid} +General syntax: + +\begin{vindent} +\begin{verbatim} +egid:long () +\end{verbatim} +\end{vindent} +Returns the effective group ID of the current process. + + +\subsubsection{euid} +\index{euid} +General syntax: + +\begin{vindent} +\begin{verbatim} +euid:long () +\end{verbatim} +\end{vindent} +Returns the effective user ID of the current process. + + +\subsubsection{execname} +\index{execname} +General syntax: + +\begin{vindent} +\begin{verbatim} +execname:string () +\end{verbatim} +\end{vindent} +Returns the name of the current process. + + +\subsubsection{gid} +\index{gid} +General syntax: + +\begin{vindent} +\begin{verbatim} +gid:long () +\end{verbatim} +\end{vindent} +Returns the group ID of the current process. + + +\subsubsection{is\_return} +\index{is\_return} +General syntax: + +\begin{vindent} +\begin{verbatim} +is_return:long () +\end{verbatim} +\end{vindent} +Returns 1 if the probe point is a return probe, else it returns +zero. + +\noun{Deprecated}. + + +\subsubsection{pexecname} +\index{pexecname} +General syntax: + +\begin{vindent} +\begin{verbatim} +pexecname:string () +\end{verbatim} +\end{vindent} +Returns the name of the parent process. + + +\subsubsection{pid} +\index{pid} +General syntax: + +\begin{vindent} +\begin{verbatim} +pid:long () +\end{verbatim} +\end{vindent} +Returns the process ID of the current process. + + +\subsubsection{ppid} +\index{ppid} +General syntax: + +\begin{vindent} +\begin{verbatim} +ppid:long () +\end{verbatim} +\end{vindent} +Returns the process ID of the parent process. + + +\subsubsection{tid} +\index{tid} +General syntax: + +\begin{vindent} +\begin{verbatim} +tid:long () +\end{verbatim} +\end{vindent} +Returns the ID of the current thread. 
+ + +\subsubsection{uid} +\index{uid} +General syntax: + +\begin{vindent} +\begin{verbatim} +uid:long () +\end{verbatim} +\end{vindent} +Returns the user ID of the current task. + + +\subsubsection{print\_backtrace} +\index{print\_backtrace} +General syntax: + +\begin{vindent} +\begin{verbatim} +print_backtrace:unknown () +\end{verbatim} +\end{vindent} +This function is equivalent to \texttt{print\_stack(backtrace())}, except +that deeper stack nesting is supported. The function does not return a value. + + +\subsubsection{print\_regs} +\index{print\_regs} +General syntax: + +\begin{vindent} +\begin{verbatim} +print_regs:unknown () +\end{verbatim} +\end{vindent} +This function prints a register dump. + + +\subsubsection{print\_stack} +\index{print\_stack} +General syntax: + +\begin{vindent} +\begin{verbatim} +print_stack:unknown (stk:string) +\end{verbatim} +\end{vindent} +This function performs a symbolic lookup of the addresses in the given string, +which is assumed to be the result of a prior call to \texttt{backtrace()}. +It prints one line per address. Each printed line includes the address, the +name of the function containing the address, and an estimate of its position +within that function. The function does not return a value. + + +\subsubsection{stack\_size} +\index{stack\_size} +General syntax: + +\begin{vindent} +\begin{verbatim} +stack_size:long () +\end{verbatim} +\end{vindent} +Returns the size of the stack. + + +\subsubsection{stack\_unused} +\index{stack\_unused} +General syntax: + +\begin{vindent} +\begin{verbatim} +stack_unused:long () +\end{verbatim} +\end{vindent} +Returns how many bytes are currently unused in the stack. + + +\subsubsection{stack\_used} +\index{stack\_used} +General syntax: + +\begin{vindent} +\begin{verbatim} +stack_used:long () +\end{verbatim} +\end{vindent} +Returns how many bytes are currently used in the stack. 
+ + +\subsubsection{stp\_pid} +\index{stp\_pid} +\begin{vindent} +\begin{verbatim} +stp_pid:long () +\end{verbatim} +\end{vindent} +Returns the process ID of the staprun process. + + +\subsubsection{target} +\index{target} +General syntax: + +\begin{vindent} +\begin{verbatim} +target:long () +\end{verbatim} +\end{vindent} +Returns the process ID of the target process. This is useful +in conjunction with the -x PID or -c CMD command-line options to stap. An +example of its use is to create scripts that filter on a specific process. + +\begin{verbatim} +-x <pid> +\end{verbatim} +target() returns the pid specified by -x. + +\begin{verbatim} +-c <command> +\end{verbatim} +target() returns the pid for the executed command specified +by -c. + +\subsection{Task data} + +These functions return data about a task. They all require a task handle as +input, such as the value returned by task\_current() or the variables +prev\_task and next\_task in the scheduler.ctxswitch probe alias. + +\subsubsection{task\_cpu} +\index{task\_cpu} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_cpu:long (task:long) +\end{verbatim} +\end{vindent} +Returns the scheduled cpu for the given task. + + +\subsubsection{task\_current} +\index{task\_current} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_current:long () +\end{verbatim} +\end{vindent} +Returns the address of the task\_struct representing +the current process. This address can be passed to the various task\_{*}() +functions to extract more task-specific data. + + +\subsubsection{task\_egid} +\index{task\_egid} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_egid:long (task:long) +\end{verbatim} +\end{vindent} +Returns the effective group ID of the given task. + + +\subsubsection{task\_execname} +\index{task\_execname} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_execname:string (task:long) +\end{verbatim} +\end{vindent} +Returns the name of the given task. 
+ + +\subsubsection{task\_euid} +\index{task\_euid} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_euid:long (task:long) +\end{verbatim} +\end{vindent} +Returns the effective user ID of the given task. + + +\subsubsection{task\_gid} +\index{task\_gid} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_gid:long (task:long) +\end{verbatim} +\end{vindent} +Returns the group ID of the given task. + + +\subsubsection{task\_nice} +\index{task\_nice} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_nice:long (task:long) +\end{verbatim} +\end{vindent} +Returns the nice value of the given task. + + +\subsubsection{task\_parent} +\index{task\_parent} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_parent:long (task:long) +\end{verbatim} +\end{vindent} +Returns the address of the parent task\_struct of the given +task. This address can be passed to the various task\_{*}() functions to +extract more task-specific data. + + +\subsubsection{task\_pid} +\index{task\_pid} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_pid:long (task:long) +\end{verbatim} +\end{vindent} +Returns the process ID of the given task. + + +\subsubsection{task\_prio} +\index{task\_prio} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_prio:long (task:long) +\end{verbatim} +\end{vindent} +Returns the priority value of the given task. + + +\subsubsection{task\_state} +\index{task\_state} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_state:long (task:long) +\end{verbatim} +\end{vindent} +Returns the state of the given task. 
Possible states are: + +\begin{vindent} +\begin{verbatim} +TASK_RUNNING 0 +TASK_INTERRUPTIBLE 1 +TASK_UNINTERRUPTIBLE 2 +TASK_STOPPED 4 +TASK_TRACED 8 +EXIT_ZOMBIE 16 +EXIT_DEAD 32 +\end{verbatim} +\end{vindent} + +\subsubsection{task\_tid} +\index{task\_tid} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_tid:long (task:long) +\end{verbatim} +\end{vindent} +Returns the thread ID of the given task. + + +\subsubsection{task\_uid} +\index{task\_uid} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_uid:long (task:long) +\end{verbatim} +\end{vindent} +Returns the user ID of the given task. + + +\subsubsection{task\_open\_file\_handles} +\index{task\_open\_file\_handles} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_open_file_handles:long(task:long) +\end{verbatim} +\end{vindent} +Returns the number of open file handles for the given task. + + +\subsubsection{task\_max\_file\_handles} +\index{task\_max\_file\_handles} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_max_file_handles:long(task:long) +\end{verbatim} +\end{vindent} +Returns the maximum number of file handles for the given task. + + +\subsection{Accessing string data at a probe point} + +The following functions provide methods to access string data at a probe +point. + + +\subsubsection{kernel\_string} +\index{kernel\_string} +General syntax: + +\begin{vindent} +\begin{verbatim} +kernel_string:string (addr:long) +\end{verbatim} +\end{vindent} +Copies a string from kernel space at a given address. The validation of this +address is only partial. + + +\subsubsection{user\_string\label{sub:user_string}} +\index{user\_string} +General syntax: + +\begin{vindent} +\begin{verbatim} +user_string:string (addr:long) +\end{verbatim} +\end{vindent} +This function copies a string from user space at a given address. The validation +of this address is only partial. 
In rare cases when userspace data is not +accessible, this function returns the string \texttt{<unknown>}. + + +\subsubsection{user\_string2} +\index{user\_string2} +General syntax: + +\begin{vindent} +\begin{verbatim} +user_string2:string (addr:long, err_msg:string) +\end{verbatim} +\end{vindent} +This function is similar to \texttt{user\_string} (Section~\ref{sub:user_string}), +but allows passing an error message as an argument to be returned if userspace +data is not available. + + +\subsubsection{user\_string\_warn} +\index{user\_string\_warn} +General syntax: + +\begin{vindent} +\begin{verbatim} +user_string_warn:string (addr:long) +\end{verbatim} +\end{vindent} +This function copies a string from userspace at a given address. It prints +a verbose error message on failure. + + +\subsubsection{user\_string\_quoted} +\index{user\_string\_quoted} +General syntax: + +\begin{vindent} +\begin{verbatim} +user_string_quoted:string (addr:long) +\end{verbatim} +\end{vindent} +This function copies a string from userspace at a given address. Any ASCII +characters that are not printable are replaced by the corresponding escape +sequence in the returned string. + + +\subsection{Initializing queue statistics} +\index{queue statistics} +The queue\_stats tapset provides functions that, when given notification +of queuing events like wait, run, or done, track averages such as queue length, +service and wait times, and utilization. Call the following three functions +from appropriate probes, in sequence. + + +\subsubsection{qs\_wait} +\index{qs\_wait} +General syntax: + +\begin{vindent} +\begin{verbatim} +qs_wait:unknown (qname:string) +\end{verbatim} +\end{vindent} +This function records that a new request was enqueued for the given queue +name. 
+ + +\subsubsection{qs\_run} +\index{qs\_run} +General syntax: + +\begin{vindent} +\begin{verbatim} +qs_run:unknown (qname:string) +\end{verbatim} +\end{vindent} +This function records that a previously enqueued request was removed from +the given wait queue and is now being serviced. + + +\subsubsection{qs\_done} +\index{qs\_done} +General syntax: + +\begin{vindent} +\begin{verbatim} +qs_done:unknown (qname:string) +\end{verbatim} +\end{vindent} +This function records that a request originally from the given queue has +completed being serviced. + + +\subsection{Using queue statistics} + +Functions with the qsq\_ prefix query the statistics averaged since the first +queue operation or when qsq\_start was called. Since statistics are often +fractional, a scale parameter multiplies the result to a more useful scale. +For some fractions, a scale of 100 returns percentage numbers. + + +\subsubsection{qsq\_blocked} +\index{qsq\_blocked} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_blocked:long (qname:string, scale:long) +\end{verbatim} +\end{vindent} +This function returns the fraction of elapsed time during which one or more +requests were on the wait queue. 
+ + +\subsubsection{qsq\_print} +\index{qsq\_print} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_print:unknown (qname:string) +\end{verbatim} +\end{vindent} +This function prints a line containing the following statistics for the given +queue: + +\begin{itemize} +\item queue name +\item average rate of requests per second +\item average wait queue length +\item average time on the wait queue +\item average time to service a request +\item percentage of time the wait queue was used +\item percentage of time any request was being serviced +\end{itemize} + +\subsubsection{qsq\_service\_time} +\index{qsq\_service\_time} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_service_time:long (qname:string, scale:long) +\end{verbatim} +\end{vindent} +This function returns the average time in microseconds required to service +a request once it is removed from the wait queue. + + +\subsubsection{qsq\_start} +\index{qsq\_start} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_start:unknown (qname:string) +\end{verbatim} +\end{vindent} +This function resets the statistics counters for the given queue, and restarts +tracking from the moment the function was called. This command is used to +create a queue. + + +\subsubsection{qsq\_throughput} +\index{qsq\_throughput} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_throughput:long (qname:string, scale:long) +\end{verbatim} +\end{vindent} +This function returns the average number of requests served per microsecond. + + +\subsubsection{qsq\_utilization} +\index{qsq\_utilization} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_utilization:long (qname:string, scale:long) +\end{verbatim} +\end{vindent} +This function returns the average time in microseconds that at least one +request was being serviced. 
+ + +\subsubsection{qsq\_wait\_queue\_length} +\index{qsq\_wait\_queue\_length} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_wait_queue_length:long (qname:string, scale:long) +\end{verbatim} +\end{vindent} +This function returns the average length of the wait queue. + + +\subsubsection{qsq\_wait\_time} +\index{qsq\_wait\_time} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_wait_time:long (qname:string, scale:long) +\end{verbatim} +\end{vindent} +This function returns the average time in microseconds that it took for a +request to be serviced (qs\_wait() to qs\_done()). + + +\subsubsection{A queue example} + +What follows is an example from src/testsuite/systemtap.samples/queue\_demo.stp. +It uses the randomize feature of the timer probe to simulate queuing activity. + +\begin{vindent} +\begin{verbatim} +probe begin { + qsq_start ("block-read") + qsq_start ("block-write") +} + +probe timer.ms(3500), end { + qsq_print ("block-read") + qsq_start ("block-read") + qsq_print ("block-write") + qsq_start ("block-write") +} + +probe timer.ms(10000) { + exit () +} + +# synthesize queue work/service using three randomized "threads" for each queue. 
+global tc + +function qs_doit (thread, name) { + n = tc[thread] = (tc[thread]+1) % 3 # per-thread state counter + if (n==1) qs_wait (name) + else if (n==2) qs_run (name) + else if (n==0) qs_done (name) +} + +probe timer.ms(100).randomize(100) { qs_doit (0, "block-read") } +probe timer.ms(100).randomize(100) { qs_doit (1, "block-read") } +probe timer.ms(100).randomize(100) { qs_doit (2, "block-read") } +probe timer.ms(100).randomize(100) { qs_doit (3, "block-write") } +probe timer.ms(100).randomize(100) { qs_doit (4, "block-write") } +probe timer.ms(100).randomize(100) { qs_doit (5, "block-write") } +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} +block-read: 9 ops/s, 1.090 qlen, 215749 await, 96382 svctm, 69% wait, 64% util +block-write: 9 ops/s, 0.992 qlen, 208485 await, 103150 svctm, 69% wait, 61% util +block-read: 9 ops/s, 0.968 qlen, 197411 await, 97762 svctm, 63% wait, 63% util +block-write: 8 ops/s, 0.930 qlen, 202414 await, 93870 svctm, 60% wait, 56% util +block-read: 8 ops/s, 0.774 qlen, 192957 await, 99995 svctm, 58% wait, 62% util +block-write: 9 ops/s, 0.861 qlen, 193857 await, 101573 svctm, 56% wait, 64% util +\end{verbatim} +\end{vindent} + +\subsection{Probe point identification} + +The following functions help you identify probe points. + + +\subsubsection{pp} +\index{pp} +General syntax: + +\begin{vindent} +\begin{verbatim} +pp:string () +\end{verbatim} +\end{vindent} +This function returns the probe point associated with a currently running +probe handler, including alias and wild-card expansion effects. + + +\subsubsection{probefunc} +\index{probefunc} +General syntax: + +\begin{vindent} +\begin{verbatim} +probefunc:string () +\end{verbatim} +\end{vindent} +This function returns the name of the function being probed. 
+ + +\subsubsection{probemod} +\index{probemod} +General syntax: + +\begin{vindent} +\begin{verbatim} +probemod:string () +\end{verbatim} +\end{vindent} +This function returns the name of the module containing the probe point. + + +\subsection{Formatting functions} +\index{formatting} +The following functions help you format output. + + +\subsubsection{ctime} +\index{ctime} +General syntax: + +\begin{vindent} +\begin{verbatim} +ctime:string(epochsecs:long) +\end{verbatim} +\end{vindent} +This function accepts an argument of seconds since the epoch as returned +by \texttt{gettimeofday\_s()}. It returns a date string in UTC of the form: + +\begin{vindent} +\begin{verbatim} +"Wed Jun 30 21:49:08 2006" +\end{verbatim} +\end{vindent} +This function does not adjust for timezones. The returned time is always +in GMT. Your script must manually adjust epochsecs before passing it to ctime() +if you want to print local time. + + +\subsubsection{errno\_str} +\index{errno\_str} +General syntax: + +\begin{vindent} +\begin{verbatim} +errno_str:string (err:long) +\end{verbatim} +\end{vindent} +This function returns the symbolic string associated with the given error +code, such as ENOENT for the number 2, or E\#3333 for an out-of-range value +such as 3333. + + +\subsubsection{returnstr} +\index{returnstr} +General syntax: + +\begin{vindent} +\begin{verbatim} +returnstr:string (returnp:long) +\end{verbatim} +\end{vindent} +This function is used by the syscall tapset, and returns a string. Set \texttt{returnp} +equal to 1 for decimal, or 2 for hex. + + +\subsubsection{thread\_indent} +\index{thread\_indent} +General syntax: + +\begin{vindent} +\begin{verbatim} +thread_indent:string (delta:long) +\end{verbatim} +\end{vindent} +This function returns a string with appropriate indentation for a thread. +Call it with a small positive or matching negative delta. If this is the +outermost, initial level of indentation, then the function resets the relative +timestamp base to zero.
+ +The following example uses thread\_indent() to trace the functions called +in the drivers/usb/core kernel source. It prints a relative timestamp and +the name and ID of the current process, followed by the appropriate indent +and the function name. Note that \char`\"{}swapper(0)\char`\"{} indicates +the kernel is running in interrupt context and there is no valid current +process. + +\begin{vindent} +\begin{verbatim} +probe kernel.function("*@drivers/usb/core/*") { + printf ("%s -> %s\n", thread_indent(1), probefunc()) +} +probe kernel.function("*@drivers/usb/core/*").return { + printf ("%s <- %s\n", thread_indent(-1), probefunc()) +} +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} + 0 swapper(0): -> usb_hcd_irq + 8 swapper(0): <- usb_hcd_irq + 0 swapper(0): -> usb_hcd_irq +10 swapper(0): -> usb_hcd_giveback_urb +16 swapper(0): -> urb_unlink +22 swapper(0): <- urb_unlink +29 swapper(0): -> usb_free_urb +35 swapper(0): <- usb_free_urb +39 swapper(0): <- usb_hcd_giveback_urb +45 swapper(0): <- usb_hcd_irq + 0 usb-storage(1338): -> usb_submit_urb + 6 usb-storage(1338): -> usb_hcd_submit_urb +12 usb-storage(1338): -> usb_get_urb +18 usb-storage(1338): <- usb_get_urb +25 usb-storage(1338): <- usb_hcd_submit_urb +29 usb-storage(1338): <- usb_submit_urb + 0 swapper(0): -> usb_hcd_irq + 7 swapper(0): <- usb_hcd_irq +\end{verbatim} +\end{vindent} + +\subsubsection{thread\_timestamp} +\index{thread\_timestamp} + +General syntax: + +\begin{vindent} +\begin{verbatim} +thread_timestamp:long () +\end{verbatim} +\end{vindent} +This function returns an absolute timestamp value for use by the indentation +function. The default function uses \texttt{gettimeofday\_us.} + + +\subsection{String functions} +\index{string} +The following are string functions you can use. 
+ + +\subsubsection{isinstr} +\index{isinstr} +General syntax: + +\begin{vindent} +\begin{verbatim} +isinstr:long (s1:string, s2:string) +\end{verbatim} +\end{vindent} +This function returns 1 if string s1 contains string s2, otherwise zero. + + +\subsubsection{strlen} +\index{strlen} +General syntax: + +\begin{vindent} +\begin{verbatim} +strlen:long (str:string) +\end{verbatim} +\end{vindent} +This function returns the number of characters in str. + + +\subsubsection{strtol} +\index{strtol} +General syntax: + +\begin{vindent} +\begin{verbatim} +strtol:long (str:string, base:long) +\end{verbatim} +\end{vindent} +This function converts the string representation of a number to an integer. +The base parameter indicates the number base to assume for the string (e.g. +16 for hex, 8 for octal, 2 for binary). + + +\subsubsection{substr} +\index{substr} +General syntax: + +\begin{vindent} +\begin{verbatim} +substr:string (str:string, start:long, stop:long) +\end{verbatim} +\end{vindent} +This function returns the substring of \texttt{str} starting from character +position \texttt{start} and ending at character position \texttt{stop}. + + +\subsubsection{text\_str} +\index{text\_str} +General syntax: + +\begin{vindent} +\begin{verbatim} +text_str:string (input:string) +\end{verbatim} +\end{vindent} +This function accepts a string argument. Any ASCII characters in the string +that are not printable are replaced by a corresponding escape sequence in +the returned string. + + +\subsubsection{text\_strn} +\index{text\_strn} +General syntax: + +\begin{vindent} +\begin{verbatim} +text_strn:string (input:string, len:long, quoted:long) +\end{verbatim} +\end{vindent} +This function accepts a string of length \texttt{len}. Any ASCII characters +that are not printable are replaced by a corresponding escape sequence in +the returned string. If \texttt{quoted} is not null, the function adds a +backslash character to the output.
+ + +\subsubsection{tokenize} +\index{tokenize} +General syntax: + +\begin{vindent} +\begin{verbatim} +tokenize:string (input:string, delim:string) +\end{verbatim} +\end{vindent} +This function returns the next token in the given input string, where +the tokens are delimited by one of the characters in the delim string. +If the input string is non-NULL, it returns the first token. If the input string +is NULL, it returns the next token in the string passed in the previous call +to tokenize. If no delimiter is found, the entire remaining input string +is returned. It returns NULL when no more tokens are available. + + +\subsection{Timestamps} +\index{timestamps} +The following functions provide methods to extract time data. + + +\subsubsection{get\_cycles} +\index{get\_cycles} +General syntax: + +\begin{vindent} +\begin{verbatim} +get_cycles:long () +\end{verbatim} +\end{vindent} +This function returns the processor cycle counter value if available, else +it returns zero. + + +\subsubsection{gettimeofday\_ms} +\index{gettimeofday\_ms} +General syntax: + +\begin{vindent} +\begin{verbatim} +gettimeofday_ms:long () +\end{verbatim} +\end{vindent} +This function returns the number of milliseconds since the UNIX epoch. + + +\subsubsection{gettimeofday\_ns} +\index{gettimeofday\_ns} +General syntax: + +\begin{vindent} +\begin{verbatim} +gettimeofday_ns:long () +\end{verbatim} +\end{vindent} +This function returns the number of nanoseconds since the UNIX epoch. + + +\subsubsection{gettimeofday\_s} +\index{gettimeofday\_s} +General syntax: + +\begin{vindent} +\begin{verbatim} +gettimeofday_s:long () +\end{verbatim} +\end{vindent} +This function returns the number of seconds since the UNIX epoch. + + +\subsubsection{gettimeofday\_us} +\index{gettimeofday\_us} +General syntax: + +\begin{vindent} +\begin{verbatim} +gettimeofday_us:long () +\end{verbatim} +\end{vindent} +This function returns the number of microseconds since the UNIX epoch.
+ + +\subsection{Miscellaneous tapset functions} + +The following are miscellaneous functions. + + +\subsubsection{addr\_to\_node} +\index{addr\_to\_node} +General syntax: + +\begin{vindent} +\begin{verbatim} +addr_to_node:long (addr:long) +\end{verbatim} +\end{vindent} +This function accepts an address, and returns the node that the given address +belongs to in a NUMA system. + + +\subsubsection{exit} +\index{exit} +General syntax: + +\begin{vindent} +\begin{verbatim} +exit:unknown () +\end{verbatim} +\end{vindent} +This function enqueues a request to shut down the SystemTap session. It does +not unwind the current probe handler, nor block new probe handlers. The stap +daemon will respond to the request and initiate an ordered shutdown. + + +\subsubsection{system} +\index{system} +General syntax: + +\begin{vindent} +\begin{verbatim} +system (cmd:string) +\end{verbatim} +\end{vindent} +This function runs a command on the system. The command will run in the background +when the current probe completes. + + +\section{For Further Reference\label{sec:For-Further-Reference}} + +For more information, see: +\begin{itemize} +\item The SystemTap tutorial at \url{http://sourceware.org/systemtap/tutorial/} +\item The SystemTap wiki at \url{http://sourceware.org/systemtap/wiki} +\item The SystemTap documentation page at \url{http://sourceware.org/systemtap/documentation.html} +\item From an unpacked source tarball or CVS directory, the examples in the +src/examples directory, the tapsets in the src/tapset directory, and the +test scripts in the src/testsuite directory. +\item The man pages for tapsets. For a list, run the command ``\texttt{man -k +stapprobes}''.
+\end {itemize} + +\setcounter{secnumdepth}{0} +\newpage{} +\addcontentsline{toc}{section}{Index} +\printindex{} +\end{document} diff --git a/doc/tutorial.tex b/doc/tutorial.tex new file mode 100644 index 00000000..d465bf0b --- /dev/null +++ b/doc/tutorial.tex @@ -0,0 +1,1210 @@ +% Copyright (C) 2005-2007 Red Hat Inc. +% This file is part of systemtap, and is free software. You can +% redistribute it and/or modify it under the terms of the GNU General +% Public License (GPL); either version 2, or (at your option) any +% later version. + +\documentclass{article} +\usepackage{html} +\usepackage{graphicx} +% \usepackage{moreverb} +\usepackage{fancyvrb} +\usepackage{listings} +\usepackage{fullpage} +\usepackage{fancybox} +\usepackage[compatible]{nomencl} +% \usepackage{geometry} +% \geometry{letterpaper,text={7in,8.5in}} +\usepackage{charter} + +\newenvironment{boxedminipage}%% Boxed minipage + {\begin{makeimage}\begin{center}\begin{Sbox}\begin{minipage}}% + {\end{minipage}\end{Sbox}\fbox{\TheSbox}\end{center}\end{makeimage}} + +\begin{htmlonly} +\renewcommand{\nomenclature}[2]{} +\end{htmlonly} + +% \usepackage{draftcopy} % ugly +\bibliographystyle{plain} +\makeglossary +\parindent0.0cm +\parskip0.2cm + +\begin{document} + +\begin{center} +\LARGE {\bf Systemtap tutorial} +\end{center} + +\hfill \begin{minipage}{2.5in} +% contributors please add your names to the list +Frank Ch. Eigler {\tt \small <fche@redhat.com>} \\ + +\hfill \today +\end{minipage} + +\tableofcontents + +\section{Introduction} + +Systemtap is a tool that allows developers and administrators to write +and reuse simple scripts to deeply examine the activities of a live +Linux system. Data may be extracted, filtered, and summarized quickly +and safely, to enable diagnoses of complex performance or functional +problems. 
+ +\nomenclature{script}{A simple programming language understood by systemtap.} + +The essential idea behind a systemtap script is to name {\em events}, +and to give them {\em handlers}. Whenever a specified event occurs, +the Linux kernel runs the handler as if it were a quick subroutine, +then resumes. There are several kinds of events, such as entering or +exiting a function, a timer expiring, or the entire systemtap session +starting or stopping. A handler is a series of script language +statements that specify the work to be done whenever the event occurs. +This work normally includes extracting data from the event context, +storing them into internal variables, or printing results. + +\nomenclature{event}{An identifiable instant in the operating system's +execution state, such as entry to a function, or expiry of a timer.} +\nomenclature{session}{A complete run of a systemtap script program.} +\nomenclature{handler}{A series of statements, written in script, which +is to be performed whenever an event occurs.} +\nomenclature{\tt .stp}{The standard file name extension for systemtap +scripts.} + +Systemtap works by translating the script to C, then running the system C +compiler to create a kernel module from that. When the module is +loaded, it activates all the probed events by hooking into the kernel. +Then, as events occur on any processor, the compiled handlers run. +Eventually, the session stops, the hooks are disconnected, and the +module removed. This entire process is driven from a single +command-line program, \verb+stap+. + +\begin{figure}[h!]
+\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat hello-world.stp +probe begin +{ + print ("hello world\n") + exit () +} + +# stap hello-world.stp +hello world +\end{verbatim} +\end{boxedminipage} +\caption{A systemtap smoke test.} +\label{fig:hello-world} +\end{figure} + +This paper assumes that you have installed systemtap and its +prerequisite kernel development tools and debugging data, so that you +can run scripts such as the simple one in +Figure~\ref{fig:hello-world}. Log on as \verb+root+, or even better, +as a user authorized to \verb+sudo+, before running systemtap. + +\begin{figure}[h] +\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat strace-open.stp +probe syscall.open +{ + printf ("%s(%d) open (%s)\n", execname(), pid(), argstr) +} +probe timer.ms(4000) # after 4 seconds +{ + exit () +} + +# stap strace-open.stp +vmware-guestd(2206) open ("/etc/redhat-release", O_RDONLY) +hald(2360) open ("/dev/hdc", O_RDONLY|O_EXCL|O_NONBLOCK) +hald(2360) open ("/dev/hdc", O_RDONLY|O_EXCL|O_NONBLOCK) +hald(2360) open ("/dev/hdc", O_RDONLY|O_EXCL|O_NONBLOCK) +df(3433) open ("/etc/ld.so.cache", O_RDONLY) +df(3433) open ("/lib/tls/libc.so.6", O_RDONLY) +df(3433) open ("/etc/mtab", O_RDONLY) +hald(2360) open ("/dev/hdc", O_RDONLY|O_EXCL|O_NONBLOCK) +\end{verbatim} +\end{boxedminipage} +\caption{A taste of systemtap: a system-wide {\tt strace}, just for +the {\tt open} system call.} +\label{fig:strace-open} +\end{figure} +\nomenclature{strace}{A standard ptrace-based command line tool to trace system call activity of a process.} + +\section{Tracing} + +The simplest kind of probe is simply to {\em trace} an event. +\nomenclature{trace}{A compact textual record of an event occurrence.} +This is the effect of inserting strategically located \verb+print+ +statements into a program. This is often the first step of problem +solving: explore by seeing a history of what has happened. + +This style of instrumentation is the simplest.
It just asks systemtap +to print something at each event. To express this in the script +language, you need to say where to probe and what to print there. + +\subsection{Where to probe} + +Systemtap supports a number of built-in events. The library of +scripts that comes with systemtap, each called a ``tapset'', may +define additional ones defined in terms of the built-in family. See +the \verb+stapprobes+ man page for details. \nomenclature{tapset}{A +reusable script forming part of the automatically searched tapset +library.} All these events are named using a unified syntax that +looks like dot-separated parameterized identifiers: + +\begin{tabular}{rl} +\verb+begin+ & The startup of the systemtap session. \\ +\verb+end+ & The end of the systemtap session. \\ +\verb+kernel.function("sys_open")+ & The entry to the function named +\verb+sys_open+ in the kernel. \\ +\verb+syscall.close.return+ & The return from the \verb+close+ system +call. \\ +\verb+module("ext3").statement(0xdeadbeef)+ & The addressed instruction +in the \verb+ext3+ filesystem driver. \\ +\verb+timer.ms(200)+ & A timer that fires every 200 milliseconds. \\ +\end{tabular} + +Let's say that you would like to trace all function entries and exits +in a source file, say \verb+net/socket.c+ in the kernel. The +\verb+kernel.function+ probe point lets you express that easily, since +systemtap examines the kernel's debugging information to relate object +code to source code. It works like a debugger: if you can name or +place it, you can probe it. Use +\verb+kernel.function("*@net/socket.c")+ for the function entries, and +\verb+kernel.function("*@net/socket.c").return+ for the exits. Note +the use of wildcards in the function name part, and the subsequent +\verb+@FILENAME+ part. You can also put wildcards into the file name, +and even add a colon (\verb+:+) and a line number, if you want to +restrict the search that precisely. 
Since systemtap will put a +separate probe in every place that matches a probe point, a few +wildcards can expand to hundreds or thousands of probes, so be careful +what you ask for. \nomenclature{debug information}{Data created by the +compiler when the kernel or application was built, sometimes packaged into +{\tt debuginfo} files, for use by a symbolic debugger.} +\nomenclature{wildcard}{Presence of \verb+*+ globbing patterns in probe points.} + +Once you identify the probe points, the skeleton of the systemtap +script appears. The \verb+probe+ keyword introduces a probe point, or +a comma-separated list of them. The following \verb+{+ and \verb+}+ +braces enclose the handler for all listed probe points. +\begin{verbatim} +probe kernel.function("*@net/socket.c") { } +probe kernel.function("*@net/socket.c").return { } +\end{verbatim} +You can run this script as is, though with empty handlers there will +be no output. Put the two lines into a new file. Run +\verb+stap -v FILE+. Terminate it any time with \verb+^C+. (The +\verb+-v+ option tells systemtap to print more verbose messages during +its processing. Try the \verb+-h+ option to see more options.) + +\subsection{What to print} + +Since you are interested in each function that was entered and exited, +a line should be printed for each, containing the function name. In +order to make that list easy to read, systemtap should indent the +lines so that functions called by other traced functions are nested +deeper. To tell each single process apart from any others that may be +running concurrently, systemtap should also print the process ID in +the line. + +Systemtap provides a variety of such contextual data, ready for +formatting. They usually appear as function calls within the handler, +like you already saw in Figure~\ref{fig:strace-open}. 
See the +\verb+stapfuncs+ man page for those functions and more defined in the +tapset library, but here's a sampling: + +\begin{tabular}{rl} +\verb+tid()+ & The id of the current thread. \\ +\verb+pid()+ & The process (task group) id of the current thread. \\ +\verb+uid()+ & The id of the current user. \\ +\verb+execname()+ & The name of the current process. \\ +\verb+cpu()+ & The current cpu number. \\ +\verb+gettimeofday_s()+ & Number of seconds since epoch. \\ +\verb+get_cycles()+ & Snapshot of hardware cycle counter. \\ +\verb+pp()+ & A string describing the probe point being currently handled. \\ +\verb+probefunc()+ & If known, the name of the function in which + this probe was placed. \\ +\end{tabular} + +The values returned may be strings or numbers. The \verb+print()+ +built-in function accepts either as its sole argument. Or, you can +use the C-style \verb+printf()+ built-in, whose formatting argument +may include \verb+%s+ for a string, \verb+%d+ for a number. +\verb+printf+ and other functions take comma-separated arguments. +Don't forget a \verb+"\n"+ at the end. + +A particularly handy function in the tapset library is +\verb+thread_indent+. Given an indentation delta parameter, it stores +internally an indentation counter for each thread (\verb+tid()+), and +returns a string with some generic trace data plus an appropriate +number of indentation spaces. That generic data includes a timestamp +(number of microseconds since the most recent initial indentation), a +process name and the thread id itself. It therefore gives an idea not +only about what functions were called, but who called them, and how +long they took. Figure~\ref{fig:socket-trace} shows the finished +script. It lacks a call to the \verb+exit()+ function, so you need to +interrupt it with \verb+^C+ when you want the tracing to stop. + +\begin{figure}[h!] 
+\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat socket-trace.stp +probe kernel.function("*@net/socket.c") { + printf ("%s -> %s\n", thread_indent(1), probefunc()) +} +probe kernel.function("*@net/socket.c").return { + printf ("%s <- %s\n", thread_indent(-1), probefunc()) +} + +# stap socket-trace.stp + 0 hald(2632): -> sock_poll + 28 hald(2632): <- sock_poll +[...] + 0 ftp(7223): -> sys_socketcall + 1159 ftp(7223): -> sys_socket + 2173 ftp(7223): -> __sock_create + 2286 ftp(7223): -> sock_alloc_inode + 2737 ftp(7223): <- sock_alloc_inode + 3349 ftp(7223): -> sock_alloc + 3389 ftp(7223): <- sock_alloc + 3417 ftp(7223): <- __sock_create + 4117 ftp(7223): -> sock_create + 4160 ftp(7223): <- sock_create + 4301 ftp(7223): -> sock_map_fd + 4644 ftp(7223): -> sock_map_file + 4699 ftp(7223): <- sock_map_file + 4715 ftp(7223): <- sock_map_fd + 4732 ftp(7223): <- sys_socket + 4775 ftp(7223): <- sys_socketcall +[...] +\end{verbatim} +\end{boxedminipage} +\caption{Tracing and timing functions in {\tt net/socket.c}.} +\label{fig:socket-trace} +\end{figure} + +\subsection{Exercises} + +\begin{enumerate} +\item Use the \verb+-p2+ option to systemtap to list all the kernel +functions named with the word ``nit'' in them. The probe handlers +might as well be empty. + +\item Trace some system calls (use \verb+syscall.NAME+ and \verb+.return+ +probe points), with the same \verb+thread_indent+ probe handler as in +Figure~\ref{fig:socket-trace}. Interpret the results. + +\end{enumerate} + +\section{Analysis} + +Pages of generic tracing text may give you enough information for +exploring a system. With systemtap, it is possible to analyze that +data, to filter, aggregate, transform, and summarize it. Different +probes can work together to share data. Probe handlers can use a rich +set of control constructs to describe algorithms, with a syntax taken +roughly from \verb+awk+.
With these tools, systemtap scripts can +focus on a specific question and provide a compact response: no +\verb+grep+ needed. +\nomenclature{awk}{A classic UNIX stream processing language.} + +\subsection{Basic constructs} + +Most systemtap scripts include conditionals, to limit tracing or other +logic to those processes or users or {\em whatever} of interest. The +syntax is simple: + +\begin{tabular}{rl} +\verb+if (+{\em EXPR}\verb+)+ {\em STATEMENT} [\verb+else+ {\em STATEMENT}\verb+]+ & if/else statement \\ +\verb+while (+{\em EXPR}\verb+)+ {\em STATEMENT} & while loop \\ +\verb+for (+{\em A}\verb+;+ {\em B}\verb+;+ {\em C}\verb+)+ {\em STATEMENT} & for loop \\ +\end{tabular} + +Scripts may use \verb+break+/\verb+continue+ as in C. +Probe handlers can return early using \verb+next+ as in \verb+awk+. +Blocks of statements are enclosed in \verb+{+ and \verb+}+. In +systemtap, the semicolon (\verb+;+) is accepted as a null statement +rather than as a statement terminator, so is only rarely\footnote{Use +them between consecutive expressions that place unary {\tt +},{\tt -} +or mixed pre/post {\tt ++},{\tt --} in an ambiguous manner.} +necessary. Shell-style (\verb+#+), C-style (\verb+/* */+), and +C++-style (\verb+//+) comments are all accepted. + +Expressions look like C or \verb+awk+, and support the usual +operators, precedences, and numeric literals. Strings are treated as +atomic values rather than arrays of characters. String concatenation +is done with the dot (\verb+"a" . "b"+). Some examples: + +\begin{tabular}{rl} +\verb+(uid() > 100)+ & probably an ordinary user \\ +\verb+(execname() == "sed")+ & current process is sed \\ +\verb+(cpu() == 0 && gettimeofday_s() > 1140498000)+ & after Feb. 21, 2006, on CPU 0 \\ +\verb+"hello" . " " . "world"+ & a string in three easy pieces \\ +\end{tabular} + +Variables may be used as well. Just pick a name, assign to it, and +use it in expressions. They are automatically initialized and +declared. 
The type of each identifier -- string vs. number -- is +automatically inferred by systemtap from the kinds of operators and +literals used on it. Any inconsistencies will be reported as errors. +Conversion between string and number types is done through explicit +function calls. + +\nomenclature{type}{A designation of each identifier such as a +variable, or function, or array value or index, as containing a string +or number.} \nomenclature{string}{A \verb+\0+-terminated character +string of up to a fixed limit in length.} \nomenclature{number}{A +64-bit signed integer.} \nomenclature{type inference}{The automatic +determination of the type of each variable, function parameter, array +value and index, based on their use.} + +\begin{tabular}{rl} +\verb+foo = gettimeofday_s()+ & foo is a number \\ +\verb+bar = "/usr/bin/" . execname()+ & bar is a string \\ +\verb|c++| & c is a number \\ +\verb+s = sprint(2345)+ & s becomes the string "2345" \\ +\end{tabular} + +By default, variables are local to the probe they are used in. That +is, they are initialized, used, and disposed of at each probe handler +invocation. To share variables between probes, declare them global +anywhere in the script. Because of possible concurrency (multiple +probe handlers running on different CPUs), each global variable used +by a probe is automatically read- or write-locked while the handler is +running. \nomenclature{global variable}{A scalar, array, or aggregate that was +named in a \verb+global+ declaration, sharing that object amongst all +probe handlers and functions executed during a systemtap session.} +\nomenclature{locking}{An automated facility used by systemtap to +protect global variables against concurrent modification and/or +access.} + +\begin{figure}[h!] 
+\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat timer-jiffies.stp +global count_jiffies, count_ms +probe timer.jiffies(100) { count_jiffies ++ } +probe timer.ms(100) { count_ms ++ } +probe timer.ms(12345) +{ + hz=(1000*count_jiffies) / count_ms + printf ("jiffies:ms ratio %d:%d => CONFIG_HZ=%d\n", + count_jiffies, count_ms, hz) + exit () +} + +# stap timer-jiffies.stp +jiffies:ms ratio 30:123 => CONFIG_HZ=243 +\end{verbatim} +\end{boxedminipage} +\caption{Experimentally measuring {\tt CONFIG\_HZ}.} +\label{fig:timer-jiffies} +\end{figure} + +\subsection{Target variables} + +A class of special ``target variables'' allows access to the probe +point context. \nomenclature{target variable}{A value that may be +extracted from the kernel context of the probe point, such as a +parameter or local variable within a probed function.} In a symbolic +debugger, when you're stopped at a breakpoint, you can print values +from the program's context. In systemtap scripts, for those probe +points that match a specific executable point (rather than an +asynchronous event like a timer), you can do the same. To know which +variables are likely to be available, you will need to be familiar +with the kernel source you are probing. In addition, you will need to +check that the compiler has not optimized those values into +unreachable nonexistence. + +Let's say that you are trying to trace filesystem reads/writes to a +particular device/inode. From your knowledge of the kernel, you know +that two functions of interest could be \verb+vfs_read+ and +\verb+vfs_write+. Each takes a \verb+struct file *+ argument, inside +which there is a \verb+struct dentry *+, a \verb+struct inode *+, and +so on. Systemtap allows limited dereferencing of such pointer chains. +Two functions, \verb+user_string+ and \verb+kernel_string+, can copy +\verb+char *+ target variables into systemtap strings.
+Figure~\ref{fig:inode-watch} demonstrates one way to monitor a +particular file (identified by device number and inode number). This +example also demonstrates pasting numeric command-line arguments +(\verb+$1+ etc.) into scripts. +%$ + +\begin{figure}[h!] +\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat inode-watch.stp +probe kernel.function ("vfs_write"), + kernel.function ("vfs_read") +{ + dev_nr = $file->f_dentry->d_inode->i_sb->s_dev + inode_nr = $file->f_dentry->d_inode->i_ino + + if (dev_nr == ($1 << 20 | $2) # major/minor device + && inode_nr == $3) + printf ("%s(%d) %s 0x%x/%u\n", + execname(), pid(), probefunc(), dev_nr, inode_nr) +} +# stat -c '%D %i' /etc/crontab +803 988136 +# stap inode-watch.stp 8 3 988136 +crond(2419) vfs_read 0x800003/988136 +crond(2419) vfs_read 0x800003/988136 +crond(2419) vfs_read 0x800003/988136 +\end{verbatim} +% $ +\end{boxedminipage} +\caption{Watching for reads/writes to a particular file.} +\label{fig:inode-watch} +\end{figure} + +\subsection{Functions} + +Functions are conveniently packaged reusable software: it would be a +shame to have to duplicate a complex condition expression or logging +directive in every place it's used. So, systemtap lets you define +functions of your own. Like global variables, systemtap functions may +be defined anywhere in the script. They may take any number of string +or numeric arguments (by value), and may return a single string or +number. The parameter types are inferred as for ordinary variables, +and must be consistent throughout the program. Local and global +script variables are available, but target variables are {\em not}. +That's because there is no specific debugging-level context associated +with a function. +\nomenclature{function}{A clump of parametrized script statements that +may be repeatedly and recursively called from probe handlers and other +functions.} + +A function is defined with the keyword \verb+function+ followed by a +name.
Then comes a comma-separated formal argument list (just a list +of variable names). The \verb+{ }+-enclosed body consists of any list +of statements, including expressions that call functions. Recursion +is possible, up to a nesting depth limit. Figure~\ref{fig:functions} +displays function syntax. + + +\begin{figure}[h!] +\begin{boxedminipage}{4.5in} +\begin{verbatim} +# Red Hat convention +function system_uid_p (u) { return u < 500 } + +# kernel device number assembly macro +function makedev (major,minor) { return major << 20 | minor } + +function trace_common () +{ + printf("%d %s(%d)", gettimeofday_s(), execname(), pid()) + # no return value necessary +} + +function fibonacci (i) +{ + if (i < 1) return 0 + else if (i < 2) return 1 + else return fibonacci(i-1) + fibonacci(i-2) +} +\end{verbatim} +\end{boxedminipage} +\caption{Some functions of dubious utility.} +\label{fig:functions} +\end{figure} + +\subsection{Arrays} + +Often, probes will want to share data that cannot be represented as a +simple scalar value. Much data is naturally tabular in nature, +indexed by some tuple of thread numbers, processor ids, names, time, +and so on. Systemtap offers associative arrays for this purpose. +These arrays are implemented as hash tables with a maximum size that +is fixed at startup. Because they are too large to be created +dynamically for individual probe handler runs, they must be declared +as global. \nomenclature{array}{A global +\verb+[+$k_1,k_2,\ldots,k_n\verb+]+\rightarrow value$ +associative lookup table, with a string or +number for each index; the value may be a string, number, or an aggregate.} + +\begin{tabular}{rl} +\verb|global a| & declare global scalar or array variable \\ +\verb|global b[400]| & declare array, reserving space for up to 400 tuples \\ +\end{tabular} + +The basic operations for arrays are setting and looking up elements.
+These are expressed in \verb+awk+ syntax: the array name followed by +an opening \verb+[+ bracket, a comma-separated list of index +expressions, and a closing \verb+]+ bracket. Each index expression +may be string or numeric, as long as it is consistently typed +throughout the script. +\nomenclature{arity}{Number of indexes to an array, or number of parameters +to a function.} + +\begin{tabular}{rl} +\verb|foo [4,"hello"] ++ | & increment the named array slot \\ +\verb|processusage [uid(),execname()] ++| & update a statistic \\ +\verb|times [tid()] = get_cycles()| & set a timestamp reference point \\ +\verb|delta = get_cycles() - times [tid()]| & compute a timestamp delta \\ +\end{tabular} + +Array elements that have not been set {\em may} be fetched, and return +a dummy null value (zero or an empty string) as appropriate. However, +assigning a null value does not delete the element: an explicit +\verb|delete| statement is required. \nomenclature{null value}{A +default initialized value for globals and array elements: a zero or an +empty string, depending on type.} Systemtap provides syntactic sugar +for these operations, in the form of explicit membership testing and +deletion. + +\begin{tabular}{rl} +\verb|if ([4,"hello"] in foo) { }| & membership test \\ +\verb|delete times[tid()]| & deletion of a single element \\ +\verb|delete times| & deletion of all elements \\ +\end{tabular} + +One final and important operation is iteration over arrays. This uses +the keyword \verb+foreach+. Like \verb+awk+, this creates a loop that +{\em iterates over key tuples} of an array, not just {\em values}. In +addition, the iteration may be {\em sorted} by any single key or the +value by adding an extra \verb|+| or \verb|-| code. + +The \verb+break+ and \verb+continue+ statements work inside +\verb+foreach+ loops, too. Since arrays can be large but probe +handlers must not run for long, it is a good idea to exit iteration +early if possible. 
The \verb+limit+ option in the \verb+foreach+ +expression is one way. For simplicity, systemtap forbids any {\em +modification} of an array while it is being iterated using a +\verb+foreach+. + +\begin{tabular}{rl} +\verb|foreach ([a,b] in foo) { fuss_with(foo[a,b]) }| & simple loop in arbitrary sequence \\ +\verb|foreach ([a,b] in foo+ limit 5) { }| & loop in increasing sequence of value, stop after 5 \\ +\verb|foreach ([a-,b] in foo) { }| & loop in decreasing sequence of first key \\ +\end{tabular} + +\subsection{Aggregates} + +When we said above that values can only be strings or numbers, we lied +a little. There is a third type: statistics aggregates, or aggregates +for short. Instances of this type are used to collect statistics on +numerical values, where it is important to accumulate new data quickly +({\em without} exclusive locks) and in large volume (storing only +aggregated stream statistics). This type only makes sense for global +variables, and may be stored individually or as elements of an array. +\nomenclature{aggregate}{A special ``write-mostly'' data type used to +efficiently store aggregated statistical values of a potentially huge +data stream.} + +To add a value to a statistics aggregate, systemtap uses the special +operator \verb+<<<+. Think of it like C++'s \verb+<<+ output +streamer: the left hand side object accumulates the data sample given +on the right hand side. This operation is efficient (taking a shared +lock) because the aggregate values are kept separately on each +processor, and are only aggregated across processors on request. + +\begin{verbatim} +a <<< delta_timestamp +writes[execname()] <<< count +\end{verbatim} + +To read the aggregate value, special functions are available to +extract a selected statistical function. {\em The aggregate value +cannot be read by simply naming it as if it were an ordinary +variable.} These operations take an exclusive lock on the respective +globals, and should therefore be relatively rare. 
The simple ones +are: \verb+@min+, \verb+@max+, \verb+@count+, \verb+@avg+, and +\verb+@sum+, and evaluate to a single number. In addition, histograms +of the data stream may be extracted using the \verb+@hist_log+ and +\verb+@hist_linear+. These evaluate to a special sort of array that +may at present\footnote{We anticipate support for indexing and looping +using {\tt foreach} shortly.} only be printed. +\nomenclature{extractor}{A function-like expression in a script that +computes a single statistic for a given aggregate.} + +\begin{tabular}{rl} +\verb+@avg(a)+ & the average of all the values accumulated + into \verb+a+ \\ +\verb+print(@hist_linear(a,0,100,10))+ & print an ``ascii art'' linear + histogram of the same data stream, \\ + & bounds $0 \ldots 100$, bucket width is $10$ \\ +\verb|@count(writes["zsh"])| & the number of times ``zsh'' + ran the probe handler \\ +\verb+print(@hist_log(writes["zsh"]))+ & print an ``ascii art'' logarithmic + histogram of the same data stream \\ +\end{tabular} + +\subsection{Safety} +\label{sec:safety} + +The full expressivity of the scripting language raises good questions +of safety. Here is a set of Q\&A: + +\begin{description} +\item{\bf What about infinite loops? recursion?} A probe handler is +bounded in time. The C code generated by systemtap includes explicit +checks that limit the total number of statements executed to a small +number. A similar limit is imposed on the nesting depth of function +calls. When either limit is exceeded, that probe handler cleanly +aborts and signals an error. The systemtap session is normally +configured to abort as a whole at that time. + +\item{\bf What about running out of memory?} No dynamic memory +allocation whatsoever takes place during the execution of probe +handlers. Arrays, function contexts, and buffers are allocated during +initialization. These resources may run out during a session, and +generally result in errors. 
+ +\item{\bf What about locking?} If multiple probes seek conflicting +locks on the same global variables, one or more of them will time out, +and be aborted. Such events are tallied as ``skipped'' probes, and a +count is displayed at session end. A configurable number of skipped +probes can trigger an abort of the session. + +\item{\bf What about null pointers? division by zero?} The C code +generated by systemtap translates potentially dangerous operations to +routines that check their arguments at run time. These signal errors +if they are invalid. Many arithmetic and string operations silently +overflow if the results exceed representation limits. + +\item{\bf What about bugs in the translator? compiler?} While bugs +in the translator, or the runtime layer certainly exist\footnote{See +\tt http://sources.redhat.com/bugzilla}, our test suite gives some +assurance. Plus, the entire generated C code may be inspected (try +the \verb+-p3+ option). Compiler bugs are unlikely to be of any +greater concern for systemtap than for the kernel as a whole. In +other words, if it was reliable enough to build the kernel, it will +build the systemtap modules properly too. + +\item{\bf Is that the whole truth?} In practice, there are several +weak points in systemtap and the underlying kprobes system at the time +of writing. Putting probes indiscriminately into unusually sensitive +parts of the kernel (low level context switching, interrupt +dispatching) has reportedly caused crashes in the past. We are +fixing these bugs as they are found, and +constructing a probe point ``blacklist'', but it is not complete. 
+\nomenclature{blacklist}{A list of probe point patterns encoded into +the translator or the kernel, where probing is prohibited for safety +reasons.} \nomenclature{kprobes}{A breakpoint dispatching system for +dynamic kernel probes, used by systemtap to implement some families of +probe points.} + +\end{description} + + +\subsection{Exercises} +\begin{enumerate} +\item Alter the last probe in \verb+timer-jiffies.stp+ to reset the +counters and continue reporting instead of exiting. + +\item Write a script that, every ten seconds, displays the top five +most frequent users of the \verb+open+ system call during that interval. + +\item Write a script that experimentally measures the speed of the +\verb+get_cycles()+ counter on each processor. + +\item Use any suitable probe point to get an approximate profile of +process CPU usage: which processes/users use how much of each CPU. +\end{enumerate} + +\section{Tapsets} + +After writing enough analysis scripts for yourself, you may become +known as an expert to your colleagues, who will want to use your +scripts. Systemtap makes it possible to share in a controlled manner; +to build libraries of scripts that build on each other. In fact, all +of the functions (\verb+pid()+, etc.) used in the scripts above come +from tapset scripts like that. A ``tapset'' is just a script that is +designed for reuse by installation into a special directory. + +\subsection{Automatic selection} + +Systemtap attempts to resolve references to global symbols (probes, +functions, variables) that are not defined within the script by a +systematic search through the tapset library for scripts that define +those symbols. Tapset scripts are installed under the default +directory named \verb+/usr/share/systemtap/tapset+. A user may give +additional directories with the \verb+-I DIR+ option. Systemtap +searches these directories for script (\verb+.stp+) files. 
+ +The search process includes subdirectories that are specialized for a +particular kernel version and/or architecture, and ones that name only +larger kernel families. Naturally, the search is ordered from +specific to general, as shown in Figure~\ref{fig:tapset-search}. +\nomenclature{tapset search path}{A list of subdirectories searched by +systemtap for tapset scripts, allowing specialization by version +architecture.} + +\begin{figure}[h!] +\begin{boxedminipage}{6in} +\begin{verbatim} +# stap -p1 -vv -e 'probe begin { }' > /dev/null +Created temporary directory "/tmp/staplnEBh7" +Searched '/usr/share/systemtap/tapset/2.6.15/i686/*.stp', match count 0 +Searched '/usr/share/systemtap/tapset/2.6.15/*.stp', match count 0 +Searched '/usr/share/systemtap/tapset/2.6/i686/*.stp', match count 0 +Searched '/usr/share/systemtap/tapset/2.6/*.stp', match count 0 +Searched '/usr/share/systemtap/tapset/i686/*.stp', match count 1 +Searched '/usr/share/systemtap/tapset/*.stp', match count 12 +Pass 1: parsed user script and 13 library script(s) in 350usr/10sys/375real ms. +Running rm -rf /tmp/staplnEBh7 +\end{verbatim} +\end{boxedminipage} +\caption{Listing the tapset search path.} +\label{fig:tapset-search} +\end{figure} + +When a script file is found that {\em defines} one of the undefined +symbols, that {\em entire file} is added to the probing session being +analyzed. This search is repeated until no more references can become +satisfied. Systemtap signals an error if any are still unresolved. + +This mechanism enables several programming idioms. First, it allows +some global symbols to be defined only for applicable kernel +version/architecture pairs, and cause an error if their use is +attempted on an inapplicable host. Similarly, the same symbol can be +defined differently depending on kernels, in much the same way that +different kernel \verb+include/asm/ARCH/+ files contain macros that +provide a porting layer. 
+ +Another use is to separate the default parameters of a tapset routine +from its implementation. For example, consider a tapset that defines +code for relating elapsed time intervals to process scheduling +activities. The data collection code can be generic with respect to +which time unit (jiffies, wall-clock seconds, cycle counts) it can +use. It should have a default, but should not require additional +run-time checks to let a user choose another. +Figure~\ref{fig:tapset-default} shows a way. + +\begin{figure}[h!] +\begin{boxedminipage}{6in} +\begin{verbatim} +# cat tapset/time-common.stp +global __time_vars +function timer_begin (name) { __time_vars[name] = __time_value () } +function timer_end (name) { return __time_value() - __time_vars[name] } + +# cat tapset/time-default.stp +function __time_value () { return gettimeofday_us () } + +# cat tapset-time-user.stp +probe begin +{ + timer_begin ("bench") + for (i=0; i<100; i++) ; + printf ("%d cycles\n", timer_end ("bench")) + exit () +} +function __time_value () { return get_ticks () } # override for greater precision + +\end{verbatim} +\end{boxedminipage} +\caption{Providing an overrideable default.} +\label{fig:tapset-default} +\end{figure} + +A tapset that exports only {\em data} may be as useful as ones that +export functions or probe point aliases (see below). Such global +data can be computed and kept up-to-date using probes internal to the +tapset. Any outside reference to the global variable would +incidentally activate all the required probes. + +\subsection{Probe point aliases} + +\nomenclature{probe point alias}{A probe point that is defined in +terms of another probe point.} Probe point aliases allow creation of +new probe points from existing ones. This is useful if the new probe +points are named to provide a higher level of abstraction. 
For +example, the system-calls tapset defines probe point aliases of the +form \verb+syscall.open+ etc., in terms of lower level ones like +\verb+kernel.function("sys_open")+. Even if some future kernel +renames \verb+sys_open+, the aliased name can remain valid. + +A probe point alias definition looks like a normal probe. Both start +with the keyword \verb+probe+ and have a probe handler statement block +at the end. But where a normal probe just lists its probe points, an +alias creates a new name using the assignment (\verb+=+) operator. +Another probe that names the new probe point will create an actual +probe, with the handler of the alias {\em prepended}. + +This prepending behavior serves several purposes. It allows the alias +definition to ``preprocess'' the context of the probe before passing +control to the user-specified handler. This has several possible uses: +\begin{tabular}{rl} +\verb+if ($flag1 != $flag2) next+ & skip probe unless given condition is met \\ +\verb+name = "foo"+ & supply probe-describing values \\ +\verb+var = $var+ & extract target variable to plain local variable \\ %$ +\end{tabular} + +Figure~\ref{fig:probe-alias} demonstrates a probe point alias +definition as well as its use. It demonstrates how a single probe +point alias can expand to multiple probe points, even to other +aliases. It also includes probe point wildcarding. These functions +are designed to compose sensibly. + +\begin{figure}[h!] +\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat probe-alias.stp +probe syscallgroup.io = syscall.open, syscall.close, + syscall.read, syscall.write +{ groupname = "io" } + +probe syscallgroup.process = syscall.fork, syscall.execve +{ groupname = "process" } + +probe syscallgroup.* +{ groups [execname() . "/" . 
groupname] ++ } + +probe end +{ + foreach (eg+ in groups) + printf ("%s: %d\n", eg, groups[eg]) +} + +global groups + +# stap probe-alias.stp +05-wait_for_sys/io: 19 +10-udev.hotplug/io: 17 +20-hal.hotplug/io: 12 +X/io: 73 +apcsmart/io: 59 +[...] +make/io: 515 +make/process: 16 +[...] +xfce-mcs-manage/io: 3 +xfdesktop/io: 5 +[...] +xmms/io: 7070 +zsh/io: 78 +zsh/process: 5 +\end{verbatim} +\end{boxedminipage} +\caption{Classified system call activity.} +\label{fig:probe-alias} +\end{figure} + +\subsection{Embedded C} +\label{embedded-c} + +Sometimes, a tapset needs to provide data values from the kernel that +cannot be extracted using ordinary target variables (\verb+$var+). %$ +This may be because the values are in complicated data structures, may +require lock awareness, or are defined by layers of macros. Systemtap +provides an ``escape hatch'' to go beyond what the language can safely +offer. In certain contexts, you may embed plain raw C in tapsets, +exchanging power for the safety guarantees listed in +section~\ref{sec:safety}. End-user scripts {\em may not} include +embedded C code, unless systemtap is run with the \verb+-g+ (``guru'' +mode) option. Tapset scripts get guru mode privileges automatically. +\nomenclature{embedded C}{Special syntax permitting tapsets to include +literal C code.} + +Embedded C can be the body of a script function. Instead of enclosing +the function body statements in \verb+{+ and \verb+}+, use \verb+%{+ +and \verb+%}+. Any enclosed C code is literally transcribed into the +kernel module: it is up to you to make it safe and correct. In order +to take parameters and return a value, a pointer macro \verb+THIS+ is +available. Function parameters and a place for the return value are +available as fields of that pointer. The familiar data-gathering +functions \verb+pid()+, \verb+execname()+, and their neighbours are +all embedded C functions. Figure~\ref{fig:embedded-C} contains +another example. 
+ +Since systemtap cannot examine the C code to infer these types, an +optional\footnote{This is only necessary if the types cannot be +inferred from other sources, such as the call sites.} annotation +syntax is available to assist the type inference process. Simply +suffix parameter names and/or the function name with \verb+:string+ or +\verb+:long+ to designate the string or numeric type. In addition, +the script may include a \verb+%{+ \verb+%}+ block at the outermost +level of the script, in order to transcribe declarative code like +\verb+#include <linux/foo.h>+. These enable the embedded C functions +to refer to general kernel types. + +There are a number of safety-related constraints that should be +observed by developers of embedded C code. +\begin{enumerate} +\item Do not dereference pointers that are not known or testable valid. +\item Do not call any kernel routine that may cause a sleep or fault. +\item Consider possible undesirable recursion, where your embedded C +function calls a routine that may be the subject of a probe. If that +probe handler calls your embedded C function, you may suffer infinite +regress. Similar problems may arise with respect to non-reentrant +locks. +\item If locking of a data structure is necessary, use a +\verb+trylock+ type call to attempt to take the lock. If that fails, +give up, do not block. +\end{enumerate} + +\begin{figure}[h!] +\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat embedded-C.stp +%{ +#include <linux/utsname.h> +%} + +function utsname:string (field:long) +%{ + if (down_read_trylock (& uts_sem)) + { + const char *f = + (THIS->field == 0 ? system_utsname.sysname : + THIS->field == 1 ? system_utsname.nodename : + THIS->field == 2 ? system_utsname.release : + THIS->field == 3 ? system_utsname.version : + THIS->field == 4 ? system_utsname.machine : + THIS->field == 5 ? 
system_utsname.domainname : ""); + strlcpy (THIS->__retvalue, f, MAXSTRINGLEN); + up_read (& uts_sem); + } +%} + +probe begin +{ + printf ("%s %s\n", utsname(0), utsname(2)) + exit () +} + +# stap -g embedded-C.stp +Linux 2.6.15 +\end{verbatim} +\end{boxedminipage} +\caption{Embedded C function.} +\label{fig:embedded-C} +\end{figure} + +\subsection{Naming conventions} + +Using the tapset search mechanism just described, potentially many +script files can become selected for inclusion in a single session. +This raises the problem of name collisions, where different tapsets +accidentally use the same names for functions/globals. This can +result in errors at translate or run time. + +To control this problem, systemtap tapset developers are advised to +follow naming conventions. Here is some of the guidance. +\nomenclature{naming convention}{Guidelines for naming variables and +functions to prevent unintentional duplication.} +\begin{enumerate} +\item Pick a unique name for your tapset, and substitute it for +{\em TAPSET} below. +\item Separate identifiers meant to be used by tapset users from +those that are internal implementation artifacts. +\item Document the first set in the appropriate \verb+man+ pages. +\item Prefix the names of external identifiers with {\em TAPSET}\_ if +there is any likelihood of collision with other tapsets or end-user +scripts. +\item Prefix any probe point aliases with an appropriate prefix. +\item Prefix the names of internal identifiers with \_\_{\em TAPSET}\_. +\end{enumerate} + +\subsection{Exercises} + +\begin{enumerate} +\item Write a tapset that implements deferred and ``cancelable'' +logging. Export a function that enqueues a text string (into some +private array), returning an id token. Include a timer-based probe +that periodically flushes the array to the standard log output. +Export another function that, if the entry was not already flushed, +allows a text string to be cancelled from the queue. 
+ +\item Create a ``relative timestamp'' tapset with functions that return all +the same values as the ones in the timestamp tapset, except that they +are made relative to the start time of the script. + +\item Create a tapset that exports a global array that contains a +mapping of recently seen process ID numbers to process names. +Intercept key system calls (\verb+execve+?) to update the list +incrementally. + +\item Send your tapset ideas to the mailing list! +\end{enumerate} + +\section{Further information} + +For further information about systemtap, several sources are available. + +There are \verb+man+ pages: + +\begin{tabular}{rl} +\verb+stap+ & systemtap program usage, language summary \\ +\verb+stapfuncs+ & functions provided by tapsets \\ +\verb+stapprobes+ & probes / probe aliases provided by tapsets \\ +\verb+stapex+ & some example scripts \\ +\end{tabular} + +Then, there is the source code itself. Since systemtap is {\em free +software}, you should have available the entire source code. The +source files in the \verb+tapset/+ directory are also packaged along +with the systemtap binary. Since systemtap reads these files rather +than their documentation, they are the most reliable way to see what's +inside all the tapsets. Use the \verb+-v+ (verbose) command line +option, several times if you like, to show inner workings. +\nomenclature{free software}{Software licensed under terms such as the +GNU GPL, which aims to enforce certain specified user freedoms such +as study, modification, and sharing.} + +Finally, there is the project web site +(\verb+http://sources.redhat.com/systemtap/+) with several articles, +an archived public mailing list for users and developers +(\verb+systemtap@sources.redhat.com+), and a live CVS source +repository. Come join us! 
+ + +\appendix + +\section{Glossary} +\renewcommand{\nomname}{} +\printglossary +\begin{htmlonly} +{\em Sorry, not available in HTML.} +\end{htmlonly} + +\section{Errors} + +We explain some common systemtap error messages in this section. Most +error messages include line/character numbers with which one can +locate the precise location of the error in the script code. There is +sometimes a subsequent or prior line that elaborates. + +{\large {\em error} {\tt at:} {\em filename}:{\em line}:{\em column}: {\em details}} + +\subsection{Parse errors} + +\begin{description} +\item{\bf parse error: expected {\em foo}, saw {\em bar} $\ldots$} \\ +The script contained a grammar error. A different type of construct +was expected in the given context. + +\item{\bf parse error: embedded code in unprivileged script} \\ The +script contained unsafe constructs such as embedded C (section +\ref{embedded-c}), but was run without the \verb+-g+ (guru mode) +option. Confirm that the constructs are used safely, then try +again with \verb+-g+. +\end{description} + +\subsection{Type errors} + +\begin{description} +\item{\bf semantic error: type mismatch for identifier '{\em foo}' +$\ldots$ string vs. long} \\ In this case, the identifier {\em foo} +was previously inferred as a numeric type (``long''), but at the given +point is being used as a string. Similar messages appear if an array +index or function parameter slot is used with conflicting types. + +\item{\bf semantic error: unresolved type for identifier '{\em foo}'} +\\ The identifier {\em foo} was used, for example in a \verb+print+, +but without any operations that could assign it a type. Similar +messages may appear if a symbol is misspelled by a typo. + +\item{\bf semantic error: Expecting symbol or array index expression} +\\ Something other than an assignable lvalue was on the left hand side +of an assignment. 
+\end{description} + +\subsection{Symbol errors} + +\begin{description} +\item{\bf while searching for arity {\em N} function, semantic error: +unresolved function call} \\ The script calls a function with {\em N} +arguments that does not exist. The function may exist with a different +arity. + +\item{\bf semantic error: array locals not supported: $\ldots$} \\ An +array operation is present for which no matching global declaration +was found. Similar messages appear if an array is used with +inconsistent arities. + +\item{\bf semantic error: variable '{\em foo}' modified during 'foreach'} \\ +The array {\em foo} is being modified (being assigned to or deleted from) +within an active \verb+foreach+ loop. This invalid operation is also +detected within a function called from within the loop. +\end{description} + +\subsection{Probing errors} + +\begin{description} +\item{\bf semantic error: probe point mismatch at position {\em N}, +while resolving probe point {\em foo}} \\ A probe point was named that is +neither directly understood by systemtap, nor defined as an alias by a +tapset script. The divergence from the ``tree'' of probe point +namespace is at position {\em N} (starting with zero at left). + +\item{\bf semantic error: no match for probe point, while resolving +probe point {\em foo}} \\ A probe point cannot be resolved for any of +a variety of reasons. It may be a debuginfo-based probe point such as +\verb+kernel.function("foobar")+ where no \verb+foobar+ function was +found. This can occur if the script specifies a wildcard on function +names, or an invalid file name or source line number. + +\item{\bf semantic error: unresolved target-symbol expression} \\ A +target variable was referred to in a probe handler that was not +resolvable. Or, a target variable is not valid at all in a context +such as a script function. 
This variable may have been elided by an +optimizing compiler, or may not have a suitable type, or there might +just be an annoying bug somewhere. Try again with a slightly +different probe point (use \verb+statement()+ instead of +\verb+function()+) to search for a more cooperative neighbour in the +same area. + +\item{\bf semantic error: libdwfl failure $\ldots$} \\ There was a +problem processing the debugging information. It may simply be +missing, or may have some consistency / correctness problems. Later +compilers tend to produce better debugging information, so if you can +upgrade and recompile your kernel/application, it may help. + +\item{\bf semantic error: cannot find {\em foo} debuginfo} \\ Similarly, +suitable debugging information was not found. Check that your kernel +build/installation includes a matching version of debugging data. +\end{description} + +\subsection{Runtime errors} + +\begin{description} + +\item{\bf WARNING: Number of errors: {\em N}, skipped probes: {\em M}} \\ +Errors and/or skipped probes occurred during this run. +\nomenclature{skipped probe}{A probe handler that should have run but +couldn't, due to contention or temporary resource problems.} + +\item{\bf division by 0} \\ The script code performed an invalid +division. + +\item{\bf aggregate element not found} \\ A statistics extractor +function other than \verb+@count+ was invoked on an aggregate that has +not had any values accumulated yet. This is similar to a division by +zero. + +\item{\bf aggregation overflow} \\ An array containing aggregate +values contains too many distinct key tuples at this time. + +\item{\bf MAXNESTING exceeded} \\ Too many levels of function call nesting +were attempted. + +\item{\bf MAXACTION exceeded} \\ The probe handler attempted to execute +too many statements. + +\item{\bf kernel/user string copy fault at {\em 0xaddr}} \\ +The probe handler attempted to copy a string from kernel or user space +at an invalid address. 
+ +\item{\bf pointer dereference fault} \\ +There was a fault encountered during a pointer dereference operation such +as a target variable evaluation. + +\end{description} + + +\section{Acknowledgments} + +The author thanks Martin Hunt, Will Cohen, and Jim Keniston for +improvement advice for this paper. + +\end{document} diff --git a/doc/tutorial/embedded-C.stp b/doc/tutorial/embedded-C.stp new file mode 100644 index 00000000..6834d728 --- /dev/null +++ b/doc/tutorial/embedded-C.stp @@ -0,0 +1,25 @@ +%{ +#include <linux/utsname.h> +%} + +function utsname:string (field:long) +%{ + if (down_read_trylock (& uts_sem)) + { + const char *f = + (THIS->field == 0 ? system_utsname.sysname : + THIS->field == 1 ? system_utsname.nodename : + THIS->field == 2 ? system_utsname.release : + THIS->field == 3 ? system_utsname.version : + THIS->field == 4 ? system_utsname.machine : + THIS->field == 5 ? system_utsname.domainname : ""); + strlcpy (THIS->__retvalue, f, MAXSTRINGLEN); + up_read (& uts_sem); + } +%} + +probe begin +{ + printf ("%s %s\n", utsname(0), utsname(2)) + exit () +} diff --git a/doc/tutorial/functions.stp b/doc/tutorial/functions.stp new file mode 100644 index 00000000..6a825722 --- /dev/null +++ b/doc/tutorial/functions.stp @@ -0,0 +1,18 @@ +# Red Hat convention +function system_uid_p (u) { return u < 500 } + +# kernel device number assembly macro +function makedev (major,minor) { return major << 20 | minor } + +function trace_common () +{ + printf("%d %s(%d)", gettimeofday_s(), execname(), pid()) + # no return value +} + +function fibonacci (i) +{ + if (i < 1) return 0 + else if (i < 2) return 1 + else return fibonacci(i-1) + fibonacci(i-2) +} diff --git a/doc/tutorial/hello-world.stp b/doc/tutorial/hello-world.stp new file mode 100644 index 00000000..6a9037a7 --- /dev/null +++ b/doc/tutorial/hello-world.stp @@ -0,0 +1,5 @@ +probe begin +{ + print ("hello world\n") + exit () +} diff --git a/doc/tutorial/inode-watch.stp b/doc/tutorial/inode-watch.stp new file 
mode 100644 index 00000000..caf04b9a --- /dev/null +++ b/doc/tutorial/inode-watch.stp @@ -0,0 +1,13 @@ +probe kernel.function ("vfs_write"), + kernel.function ("vfs_read") +{ + dev_nr = $file->f_dentry->d_inode->i_sb->s_dev + inode_nr = $file->f_dentry->d_inode->i_ino + + if (dev_nr == ($1 << 20 | $2) # major/minor device + && inode_nr == $3) + printf ("%s(%d) %s 0x%x/%u\n", + execname(), pid(), probefunc(), dev_nr, inode_nr) +} + +# dev_name = kernel_string ($file->f_dentry->d_inode->i_sb->s_id) diff --git a/doc/tutorial/probe-alias.stp b/doc/tutorial/probe-alias.stp new file mode 100644 index 00000000..aa5feb1b --- /dev/null +++ b/doc/tutorial/probe-alias.stp @@ -0,0 +1,17 @@ +probe syscallgroup.io = syscall.open, syscall.close, + syscall.read, syscall.write +{ groupname = "io" } + +probe syscallgroup.process = syscall.fork, syscall.execve +{ groupname = "process" } + +probe syscallgroup.* +{ groups [execname() . "/" . groupname] ++ } + +probe end +{ + foreach (eg+ in groups) + printf ("%s: %d\n", eg, groups[eg]) +} + +global groups diff --git a/doc/tutorial/socket-trace.stp b/doc/tutorial/socket-trace.stp new file mode 100644 index 00000000..53b69ecc --- /dev/null +++ b/doc/tutorial/socket-trace.stp @@ -0,0 +1,6 @@ +probe kernel.function("*@net/socket.c") { + printf ("%s -> %s\n", thread_indent(1), probefunc()) +} +probe kernel.function("*@net/socket.c").return { + printf ("%s <- %s\n", thread_indent(-1), probefunc()) +} diff --git a/doc/tutorial/strace-open.stp b/doc/tutorial/strace-open.stp new file mode 100644 index 00000000..fb87cec1 --- /dev/null +++ b/doc/tutorial/strace-open.stp @@ -0,0 +1,8 @@ +probe syscall.open +{ + printf ("%s(%d) open (%s)\n", execname(), pid(), argstr) +} +probe timer.ms(4000) # after 4 seconds +{ + exit () +} diff --git a/doc/tutorial/tapset-time-user.stp b/doc/tutorial/tapset-time-user.stp new file mode 100644 index 00000000..32069b03 --- /dev/null +++ b/doc/tutorial/tapset-time-user.stp @@ -0,0 +1,8 @@ +probe begin +{ + 
timer_begin ("bench") + for (i=0; i<100; i++) ; + printf ("%d cycles\n", timer_end ("bench")) + exit () +} +function __time_value () { return get_cycles () } # override diff --git a/doc/tutorial/tapset/time-common.stp b/doc/tutorial/tapset/time-common.stp new file mode 100644 index 00000000..cec5a4ea --- /dev/null +++ b/doc/tutorial/tapset/time-common.stp @@ -0,0 +1,4 @@ +global __time_vars +function timer_begin (name) { __time_vars[name] = __time_value () } +function timer_end (name) { return __time_value() - __time_vars[name] } + diff --git a/doc/tutorial/tapset/time-default.stp b/doc/tutorial/tapset/time-default.stp new file mode 100644 index 00000000..614ff506 --- /dev/null +++ b/doc/tutorial/tapset/time-default.stp @@ -0,0 +1,2 @@ +function __time_value () { return gettimeofday_us () } + diff --git a/doc/tutorial/timer-jiffies.stp b/doc/tutorial/timer-jiffies.stp new file mode 100644 index 00000000..d5e92e4a --- /dev/null +++ b/doc/tutorial/timer-jiffies.stp @@ -0,0 +1,10 @@ +global count_jiffies, count_ms +probe timer.jiffies(100) { count_jiffies ++ } +probe timer.ms(100) { count_ms ++ } +probe timer.ms(12345) +{ + hz=(1000*count_jiffies) / count_ms + printf ("jiffies:ms ratio %d:%d => CONFIG_HZ=%d\n", + count_jiffies, count_ms, hz) + exit () +} diff --git a/runtime/ChangeLog b/runtime/ChangeLog index 512fa061..497b9d5b 100644 --- a/runtime/ChangeLog +++ b/runtime/ChangeLog @@ -1,3 +1,19 @@ +2008-02-27 Martin Hunt <hunt@redhat.com> + + * sym.h (_stp_module): Add text_size, lock, and unwind data + pointer. + * sym.c (_stp_find_module_by_addr): New function. + (_stp_kallsyms_lookup): Call _stp_find_module_by_addr(). + (_stp_get_unwind_info): New. + + * runtime.h: Move debug macros to debug.h. Include it. + * debug.h: New file. + * map.c: Update debug calls. + * map-gen.c: Update debug calls. + * pmap-gen.c: Update debug calls. + + * mempool.c: New file. 
+ 2008-02-27 Dave Brolley <brolley@redhat.com> PR5189 diff --git a/runtime/debug.h b/runtime/debug.h new file mode 100644 index 00000000..8f877ede --- /dev/null +++ b/runtime/debug.h @@ -0,0 +1,66 @@ +/* Systemtap Debug Macros + * Copyright (C) 2008 Red Hat Inc. + * + * This file is part of systemtap, and is free software. You can + * redistribute it and/or modify it under the terms of the GNU General + * Public License (GPL); either version 2, or (at your option) any + * later version. + */ + +#ifndef _STP_DEBUG_H_ +#define _STP_DEBUG_H_ + +/* These are always on. + * _dbug() writes to systemtap stderr. + * errk() writes to the system log. + */ +#define _dbug(args...) _stp_dbug(__FUNCTION__, __LINE__, args) + +#define errk(args...) do { \ + printk("Systemtap Error at %s:%d ",__FUNCTION__, __LINE__); \ + printk(args); \ + } while (0) + +#ifdef DEBUG_TRANSPORT +#undef DEBUG_TRANSPORT +#define DEBUG_TRANSPORT 1 +#else +#define DEBUG_TRANSPORT 0 +#endif + +#ifdef DEBUG_UNWIND +#undef DEBUG_UNWIND +#define DEBUG_UNWIND 2 +#else +#define DEBUG_UNWIND 0 +#endif + +#ifdef DEBUG_SYMBOLS +#undef DEBUG_SYMBOLS +#define DEBUG_SYMBOLS 4 +#else +#define DEBUG_SYMBOLS 0 +#endif + +#define DEBUG_TYPE (DEBUG_TRANSPORT|DEBUG_UNWIND|DEBUG_SYMBOLS) + +#if DEBUG_TYPE > 0 + +#define dbug(type, args...) do { \ + if ((type) & DEBUG_TYPE) \ + _stp_dbug(__FUNCTION__, __LINE__, args); \ + } while (0) + +#define kbug(type, args...) do { \ + if ((type) & DEBUG_TYPE) { \ + printk("%s:%d ",__FUNCTION__, __LINE__); \ + printk(args); \ + } \ + } while (0) + +#else +#define dbug(type, args...) ; +#define kbug(type, args...) 
; +#endif /* DEBUG_TYPE > 0 */ + +#endif /* _STP_DEBUG_H_ */ diff --git a/runtime/map-gen.c b/runtime/map-gen.c index a17f7e34..ce6e8742 100644 --- a/runtime/map-gen.c +++ b/runtime/map-gen.c @@ -229,7 +229,6 @@ static key_data KEYSYM(map_get_key) (struct map_node *mn, int n, int *type) key_data ptr; struct KEYSYM(map_node) *m = (struct KEYSYM(map_node) *)mn; - dbug ("n = %d type=%lx\n", n, type); if (n > KEY_ARITY || n < 1) { if (type) *type = END; @@ -359,7 +358,6 @@ MAP KEYSYM(_stp_map_new) (unsigned max_entries, int htype, ...) start = va_arg(ap, int); stop = va_arg(ap, int); interval = va_arg(ap, int); - // dbug ("start=%d stop=%d interval=%d\n", start, stop, interval); va_end (ap); } @@ -404,7 +402,6 @@ int KEYSYM(__stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val, int add) hlist_for_each(e, head) { n = (struct KEYSYM(map_node) *)((long)e - sizeof(struct list_head)); - //dbug ("n=%lx key1=%ld n->key1=%ld\n", (long)n, key1, n->key1); if (KEY1_EQ_P(n->key1, key1) #if KEY_ARITY > 1 && KEY2_EQ_P(n->key2, key2) @@ -423,8 +420,6 @@ int KEYSYM(__stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val, int add) } } /* key not found */ - dbug("key not found\n"); - n = (struct KEYSYM(map_node)*)_new_map_create (map, head); if (n == NULL) return -1; diff --git a/runtime/map.c b/runtime/map.c index 70990876..513e27df 100644 --- a/runtime/map.c +++ b/runtime/map.c @@ -138,7 +138,6 @@ int64_t _stp_key_get_int64 (struct map_node *mn, int n) if (mn) { res = (*mn->map->get_key)(mn, n, &type).val; - dbug("type=%d\n", type); if (type != INT64) res = 0; } @@ -159,7 +158,6 @@ char *_stp_key_get_str (struct map_node *mn, int n) if (mn) { str = (*mn->map->get_key)(mn, n, &type).strp; - dbug("type=%d\n", type); if (type != STRING) str = "bad type"; } @@ -716,7 +714,6 @@ void _stp_map_printn (MAP map, int n, const char *fmt) struct map_node *ptr; int type, num; key_data kd; - dbug ("print map %lx fmt=%s\n", (long)map, fmt); if (n < 0) return; @@ -763,7 +760,6 @@ static struct map_node 
*_stp_new_agg(MAP agg, struct hlist_head *ahead, struct m { struct map_node *aptr; /* copy keys and aggregate */ - dbug("creating new entry in %lx\n", (long)agg); aptr = _new_map_create(agg, ahead); if (aptr == NULL) return NULL; @@ -952,12 +948,10 @@ static struct map_node *_new_map_create (MAP map, struct hlist_head *head) return NULL; } m = (struct map_node *)map->head.next; - dbug ("got %lx off head\n", (long)m); hlist_del_init(&m->hnode); } else { m = (struct map_node *)map->pool.next; map->num++; - dbug ("got %lx off pool\n", (long)m); } list_move_tail(&m->lnode, &map->head); diff --git a/runtime/mempool.c b/runtime/mempool.c new file mode 100644 index 00000000..0fbb4326 --- /dev/null +++ b/runtime/mempool.c @@ -0,0 +1,135 @@ +/* -*- linux-c -*- + * Preallocated memory pools + * Copyright (C) 2008 Red Hat Inc. + * + * This file is part of systemtap, and is free software. You can + * redistribute it and/or modify it under the terms of the GNU General + * Public License (GPL); either version 2, or (at your option) any + * later version. + */ + +#ifndef _STP_MEMPOOL_C_ +#define _STP_MEMPOOL_C_ + +/* An opaque struct identifying the memory pool. 
*/ +typedef struct { + struct list_head free_list; + unsigned num; + unsigned size; + spinlock_t lock; +} _stp_mempool_t; + +/* for internal use only */ +struct _stp_mem_buffer { + struct list_head list; + _stp_mempool_t *pool; + void *buf; +}; + +/* Delete a memory pool */ +static void _stp_mempool_destroy(_stp_mempool_t *pool) +{ + struct list_head *p, *tmp; + if (pool) { + list_for_each_safe(p, tmp, &pool->free_list) { + list_del(p); + _stp_kfree(p); + } + _stp_kfree(pool); + } +} + +/* Create a new memory pool */ +static _stp_mempool_t *_stp_mempool_init(size_t size, size_t num) +{ + int i, alloc_size; + struct _stp_mem_buffer *m; + + _stp_mempool_t *pool = (_stp_mempool_t *)_stp_kmalloc(sizeof(_stp_mempool_t)); + if (unlikely(pool == NULL)) { + errk("Memory allocation failed.\n"); + return NULL; + } + + INIT_LIST_HEAD(&pool->free_list); + spin_lock_init(&pool->lock); + + alloc_size = size + sizeof(struct _stp_mem_buffer) - sizeof(void *); + + for (i = 0; i < num; i++) { + m = (struct _stp_mem_buffer *)_stp_kmalloc(alloc_size); + if (unlikely(m == NULL)) + goto err; + m->pool = pool; + list_add((struct list_head *)m, &pool->free_list); + } + pool->num = num; + pool->size = alloc_size; + return pool; + +err: + _stp_mempool_destroy(pool); + return NULL; +} + +/* Resize a memory pool */ +static int _stp_mempool_resize(_stp_mempool_t *pool, size_t num) +{ + int i; + unsigned long flags; + struct _stp_mem_buffer *m; + + if (unlikely(num == 0 || num == pool->num)) + return pool->num; + + if (num > pool->num) { + for (i = 0; i < num - pool->num; i++) { + m = (struct _stp_mem_buffer *)_stp_kmalloc(pool->size); + if (unlikely(m == NULL)) + goto done; + m->pool = pool; + pool->num++; + spin_lock_irqsave(&pool->lock, flags); + list_add((struct list_head *)m, &pool->free_list); + spin_unlock_irqrestore(&pool->lock, flags); + } + } else { + for (i = 0; i < pool->num - num; i++) { + spin_lock_irqsave(&pool->lock, flags); + m = (struct _stp_mem_buffer *)pool->free_list.next; 
+ list_del(&m->list); + spin_unlock_irqrestore(&pool->lock, flags); + _stp_kfree(m); + } + pool->num = num; + } +done: + return num; +} + +/* allocate a buffer from a memory pool */ +static void *_stp_mempool_alloc(_stp_mempool_t *pool) +{ + unsigned long flags; + struct _stp_mem_buffer *ptr = NULL; + spin_lock_irqsave(&pool->lock, flags); + if (likely(!list_empty(&pool->free_list))) { + ptr = (struct _stp_mem_buffer *)pool->free_list.next; + list_del_init(&ptr->list); + spin_unlock_irqrestore(&pool->lock, flags); + return &ptr->buf; + } + spin_unlock_irqrestore(&pool->lock, flags); + return NULL; +} + +/* return a buffer to its memory pool */ +static void _stp_mempool_free(void *buf) +{ + unsigned long flags; + struct _stp_mem_buffer *m = container_of(buf, struct _stp_mem_buffer, buf); + spin_lock_irqsave(&m->pool->lock, flags); + list_add(&m->list, &m->pool->free_list); + spin_unlock_irqrestore(&m->pool->lock, flags); +} +#endif /* _STP_MEMPOOL_C_ */ diff --git a/runtime/pmap-gen.c b/runtime/pmap-gen.c index 0efffdb6..8666549b 100644 --- a/runtime/pmap-gen.c +++ b/runtime/pmap-gen.c @@ -437,7 +437,6 @@ PMAP KEYSYM(_stp_pmap_new) (unsigned max_entries, int htype, ...) 
start = va_arg(ap, int); stop = va_arg(ap, int); interval = va_arg(ap, int); - // dbug ("start=%d stop=%d interval=%d\n", start, stop, interval); va_end (ap); } @@ -515,8 +514,6 @@ int KEYSYM(__stp_pmap_set) (MAP map, ALLKEYSD(key), VSTYPE val, int add) } /* key not found */ - dbug("key not found\n"); - n = (struct KEYSYM(pmap_node)*)_new_map_create (map, head); if (n == NULL) return -1; @@ -678,7 +675,6 @@ VALTYPE KEYSYM(_stp_pmap_get) (PMAP pmap, ALLKEYSD(key)) #endif ) { if (anode == NULL) { - // dbug("agg=%lx ahead=%lx\n", (long)agg, (long)ahead); anode = _stp_new_agg(agg, ahead, (struct map_node *)n); } else { if (clear_agg) { @@ -738,7 +734,6 @@ int KEYSYM(__stp_pmap_del) (MAP map, ALLKEYSD(key)) } /* key not found */ - dbug("key not found\n"); return 0; } diff --git a/runtime/runtime.h b/runtime/runtime.h index d951833d..318d3038 100644 --- a/runtime/runtime.h +++ b/runtime/runtime.h @@ -1,5 +1,5 @@ /* main header file - * Copyright (C) 2005-2007 Red Hat Inc. + * Copyright (C) 2005-2008 Red Hat Inc. * Copyright (C) 2005, 2006 Intel Corporation. * * This file is part of systemtap, and is free software. You can @@ -46,21 +46,7 @@ static void _stp_dbug (const char *func, int line, const char *fmt, ...); void _stp_error (const char *fmt, ...); -#ifdef DEBUG -/** Prints debug line. - * This function prints a debug message immediately to staprun. - * If the last character is not a newline, then one is added. - * @param args A variable number of args in a format like printf. - * @ingroup io - */ -#define dbug(args...) _stp_dbug(__FUNCTION__, __LINE__, args) -#define kbug(args...) {printk("%s:%d ",__FUNCTION__, __LINE__); printk(args); } -#else -#define dbug(args...) ; -#define kbug(args...) ; -#endif /* DEBUG */ -#define _dbug(args...) _stp_dbug(__FUNCTION__, __LINE__, args) -#define errk(args...) 
{printk("Systemtap Error at %s:%d ",__FUNCTION__, __LINE__); printk(args); } +#include "debug.h" /* atomic globals */ static atomic_t _stp_transport_failures = ATOMIC_INIT (0); diff --git a/runtime/staprun/symbols.c b/runtime/staprun/symbols.c index e33ee624..c7362d9e 100644 --- a/runtime/staprun/symbols.c +++ b/runtime/staprun/symbols.c @@ -19,9 +19,10 @@ static int send_data(int32_t type, void *data, int len) return write(control_channel, data, len); } + /* Get the sections for a module. Put them in the supplied buffer */ /* in the following order: */ -/* [struct _stp_msg_module][struct _stp_symbol sections ...][string data]*/ +/* [struct _stp_msg_module][struct _stp_symbol sections ...][string data][unwind data] */ /* Return the total length of all the data. */ #define SECDIR "/sys/module/%s/sections" @@ -31,8 +32,9 @@ static int get_sections(char *name, char *data_start, int datalen) char filename[STP_MODULE_NAME_LEN + 256]; char buf[32], strdata_start[32768]; char *strdata=strdata_start, *data=data_start; - int fd, len, res; + int fd, len, res, unwind_data_len=0; struct _stp_msg_module *mod = (struct _stp_msg_module *)data_start; + struct dirent *d; DIR *secdir; void *sec; @@ -63,6 +65,9 @@ static int get_sections(char *name, char *data_start, int datalen) return -1; } + /* FIXME: optionally fill in unwind data here */ + mod->unwind_len = unwind_data_len; + while ((d = readdir(secdir))) { char *secname = d->d_name; @@ -138,6 +143,14 @@ static int get_sections(char *name, char *data_start, int datalen) while (len--) *data++ = *strdata++; +#if 0 + if (unwind_data_len) { + if ((unwind_data_len + data - data_start) > datalen) + goto err0; + memcpy(data, unwind_data, unwind_data_len); + data += unwind_data_len; + } +#endif return data - data_start; err1: @@ -211,7 +224,7 @@ int do_kernel_symbols(void) int ret, num_syms, i = 0, struct_symbol_size; int max_syms= MAX_SYMBOLS, data_basesize = MAX_SYMBOLS*32; - if (kernel_ptr_size == 8) + if (kernel_ptr_size == 8) 
struct_symbol_size = sizeof(struct _stp_symbol64); else struct_symbol_size = sizeof(struct _stp_symbol32); @@ -285,10 +298,12 @@ int do_kernel_symbols(void) if (num_syms <= 0) goto err; + /* send header */ struct _stp_msg_symbol_hdr smsh; smsh.num_syms = num_syms; smsh.sym_size = (uint32_t)(dataptr - data_base); + smsh.unwind_size = (uint32_t)0; if (send_request(STP_SYMBOLS, &smsh, sizeof(smsh)) <= 0) goto err; diff --git a/runtime/sym.c b/runtime/sym.c index 56c93064..3c2f859a 100644 --- a/runtime/sym.c +++ b/runtime/sym.c @@ -1,6 +1,6 @@ /* -*- linux-c -*- * Symbolic Lookup Functions - * Copyright (C) 2005, 2006, 2007 Red Hat Inc. + * Copyright (C) 2005-2008 Red Hat Inc. * Copyright (C) 2006 Intel Corporation. * * This file is part of systemtap, and is free software. You can @@ -9,8 +9,8 @@ * later version. */ -#ifndef _SYM_C_ -#define _SYM_C_ +#ifndef _STP_SYM_C_ +#define _STP_SYM_C_ #include "string.c" @@ -20,11 +20,12 @@ * @{ */ -unsigned long _stp_module_relocate (const char *module, const char *section, unsigned long offset) { +unsigned long _stp_module_relocate(const char *module, const char *section, unsigned long offset) +{ static struct _stp_module *last = NULL; static struct _stp_symbol *last_sec; unsigned long flags; - int i,j; + int i, j; /* if module is -1, we invalidate last. _stp_del_module calls this when modules are deleted. */ if ((long)module == -1) { @@ -32,53 +33,52 @@ unsigned long _stp_module_relocate (const char *module, const char *section, uns return 0; } - dbug("%s, %s, %lx\n", module, section, offset); + dbug(DEBUG_SYMBOLS, "%s, %s, %lx\n", module, section, offset); - STP_LOCK_MODULES; - if (! 
module - || !strcmp (section, "") /* absolute, unrelocated address */ - || _stp_num_modules == 0) { - STP_UNLOCK_MODULES; - return offset; + STP_RLOCK_MODULES; + if (!module || !strcmp(section, "") /* absolute, unrelocated address */ + ||_stp_num_modules == 0) { + STP_RUNLOCK_MODULES; + return offset; } /* Most likely our relocation is in the same section of the same module as the last. */ if (last) { - if (!strcmp (module, last->name) && !strcmp (section, last_sec->symbol)) { + if (!strcmp(module, last->name) && !strcmp(section, last_sec->symbol)) { offset += last_sec->addr; - STP_UNLOCK_MODULES; - dbug("offset = %lx\n", offset); + STP_RUNLOCK_MODULES; + dbug(DEBUG_SYMBOLS, "offset = %lx\n", offset); return offset; } } /* not cached. need to scan all modules */ - if (! strcmp (module, "kernel")) { - STP_UNLOCK_MODULES; + if (!strcmp(module, "kernel")) { + STP_RUNLOCK_MODULES; /* See also transport/symbols.c (_stp_do_symbols). */ - if (strcmp (section, "_stext")) + if (strcmp(section, "_stext")) return 0; else return offset + _stp_modules[0]->text; } else { /* relocatable module */ - for (i = 1; i < _stp_num_modules; i++) { /* skip over [0]=kernel */ + for (i = 1; i < _stp_num_modules; i++) { /* skip over [0]=kernel */ last = _stp_modules[i]; if (strcmp(module, last->name)) continue; for (j = 0; j < (int)last->num_sections; j++) { last_sec = &last->sections[j]; - if (!strcmp (section, last_sec->symbol)) { + if (!strcmp(section, last_sec->symbol)) { offset += last_sec->addr; - STP_UNLOCK_MODULES; - dbug("offset = %lx\n", offset); + STP_RUNLOCK_MODULES; + dbug(DEBUG_SYMBOLS, "offset = %lx\n", offset); return offset; } } } } - STP_UNLOCK_MODULES; + STP_RUNLOCK_MODULES; last = NULL; return 0; } @@ -97,24 +97,15 @@ static unsigned long _stp_kallsyms_lookup_name(const char *name) return 0; } -static const char * _stp_kallsyms_lookup ( - unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname, - char *namebuf) +static struct _stp_module 
*_stp_find_module_by_addr(unsigned long addr) { - struct _stp_module *m; - struct _stp_symbol *s; - unsigned long flags; - unsigned end, begin = 0; + unsigned begin = 0; + unsigned end = _stp_num_modules; - if (STP_TRYLOCK_MODULES) + if (unlikely(addr < _stp_modules_by_addr[0]->text)) return NULL; - end = _stp_num_modules; - - if (_stp_num_modules >= 2 && addr > _stp_modules_by_addr[1]->text) { + if (_stp_num_modules > 1 && addr > _stp_modules_by_addr[0]->data) { /* binary search on index [begin,end) */ do { unsigned mid = (begin + end) / 2; @@ -125,18 +116,51 @@ static const char * _stp_kallsyms_lookup ( } while (begin + 1 < end); /* result index in $begin, guaranteed between [0,_stp_num_modules) */ } - m = _stp_modules_by_addr[begin]; - begin = 0; - end = m->num_symbols; + /* check if addr is past the last module */ + if (unlikely(begin == _stp_num_modules - 1 + && (addr > _stp_modules_by_addr[begin]->text + _stp_modules_by_addr[begin]->text_size))) + return NULL; + + return _stp_modules_by_addr[begin]; +} - /* m->data is the lowest address of a data section. It should be */ - /* after the text section. */ - /* If our address is in the data section, then return now. */ - if (m->data > m->text && addr >= m->data) { - STP_UNLOCK_MODULES; +static struct _stp_module *_stp_get_unwind_info(unsigned long addr) +{ + struct _stp_module *m; + struct _stp_symbol *s; + unsigned long flags; + + STP_RLOCK_MODULES; + m = _stp_find_module_by_addr(addr); + if (unlikely(m == NULL)) { + STP_RUNLOCK_MODULES; return NULL; } - + /* Lock the module struct so it doesn't go away while being used. */ + /* Probably could never happen, but lock it to be sure for now. 
*/ + read_lock(&m->lock); + + STP_RUNLOCK_MODULES; + return m; +} + +static const char *_stp_kallsyms_lookup(unsigned long addr, + unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf) +{ + struct _stp_module *m; + struct _stp_symbol *s; + unsigned long flags; + unsigned end, begin = 0; + + STP_RLOCK_MODULES; + m = _stp_find_module_by_addr(addr); + if (unlikely(m == NULL)) { + STP_RUNLOCK_MODULES; + return NULL; + } + + end = m->num_symbols; + /* binary search for symbols within the module */ do { unsigned mid = (begin + end) / 2; @@ -148,31 +172,29 @@ static const char * _stp_kallsyms_lookup ( /* result index in $begin */ s = &m->symbols[begin]; - if (addr < s->addr) { - STP_UNLOCK_MODULES; - return NULL; - } else { - if (offset) *offset = addr - s->addr; - if (modname) *modname = m->name; + if (likely(addr >= s->addr)) { + if (offset) + *offset = addr - s->addr; + if (modname) + *modname = m->name; if (symbolsize) { if ((begin + 1) < m->num_symbols) - *symbolsize = m->symbols[begin+1].addr - s->addr; + *symbolsize = m->symbols[begin + 1].addr - s->addr; else *symbolsize = 0; // NB: This is only a heuristic. Sometimes there are large // gaps between text areas of modules. } if (namebuf) { - strlcpy (namebuf, s->symbol, KSYM_NAME_LEN+1); - STP_UNLOCK_MODULES; + strlcpy(namebuf, s->symbol, KSYM_NAME_LEN + 1); + STP_RUNLOCK_MODULES; return namebuf; - } - else { - STP_UNLOCK_MODULES; + } else { + STP_RUNLOCK_MODULES; return s->symbol; } } - STP_UNLOCK_MODULES; + STP_RUNLOCK_MODULES; return NULL; } @@ -182,31 +204,31 @@ static const char * _stp_kallsyms_lookup ( * a probe because it is too time-consuming. Use at module exit time. 
*/ -void _stp_symbol_print (unsigned long address) -{ +void _stp_symbol_print(unsigned long address) +{ char *modname; - const char *name; - unsigned long offset, size; + const char *name; + unsigned long offset, size; - name = _stp_kallsyms_lookup(address, &size, &offset, &modname, NULL); + name = _stp_kallsyms_lookup(address, &size, &offset, &modname, NULL); - _stp_printf ("%p", (int64_t)address); + _stp_printf("%p", (int64_t) address); - if (name) { + if (name) { if (modname && *modname) - _stp_printf (" : %s+%#lx/%#lx [%s]", name, offset, size, modname); + _stp_printf(" : %s+%#lx/%#lx [%s]", name, offset, size, modname); else - _stp_printf (" : %s+%#lx/%#lx", name, offset, size); + _stp_printf(" : %s+%#lx/%#lx", name, offset, size); } } /* Like _stp_symbol_print, except only print if the address is a valid function address */ -void _stp_func_print (unsigned long address, int verbose, int exact) -{ +void _stp_func_print(unsigned long address, int verbose, int exact) +{ char *modname; - const char *name; - unsigned long offset, size; + const char *name; + unsigned long offset, size; char *exstr; if (exact) @@ -214,33 +236,32 @@ void _stp_func_print (unsigned long address, int verbose, int exact) else exstr = " (inexact)"; - name = _stp_kallsyms_lookup(address, &size, &offset, &modname, NULL); + name = _stp_kallsyms_lookup(address, &size, &offset, &modname, NULL); if (name) { if (verbose) { if (modname && *modname) - _stp_printf (" %p : %s+%#lx/%#lx [%s]%s\n", - (int64_t)address, name, offset, size, modname, exstr); + _stp_printf(" %p : %s+%#lx/%#lx [%s]%s\n", + (int64_t) address, name, offset, size, modname, exstr); else - _stp_printf (" %p : %s+%#lx/%#lx%s\n", - (int64_t)address, name, offset, size, exstr); - } else - _stp_printf ("%p ", (int64_t)address); + _stp_printf(" %p : %s+%#lx/%#lx%s\n", (int64_t) address, name, offset, size, exstr); + } else + _stp_printf("%p ", (int64_t) address); } } -void _stp_symbol_snprint (char *str, size_t len, unsigned long 
address) -{ - char *modname; - const char *name; - unsigned long offset, size; - - name = _stp_kallsyms_lookup(address, &size, &offset, &modname, NULL); - if (name) - strlcpy(str, name, len); - else - _stp_snprintf(str, len, "%p", (int64_t)address); +void _stp_symbol_snprint(char *str, size_t len, unsigned long address) +{ + char *modname; + const char *name; + unsigned long offset, size; + + name = _stp_kallsyms_lookup(address, &size, &offset, &modname, NULL); + if (name) + strlcpy(str, name, len); + else + _stp_snprintf(str, len, "%p", (int64_t) address); } /** @} */ -#endif /* _SYM_C_ */ +#endif /* _STP_SYM_C_ */ diff --git a/runtime/sym.h b/runtime/sym.h index 6a55a22e..b124882a 100644 --- a/runtime/sym.h +++ b/runtime/sym.h @@ -1,5 +1,5 @@ -/* - * Copyright (C) 2005, 2006 Red Hat Inc. +/* -*- linux-c -*- + * Copyright (C) 2005-2008 Red Hat Inc. * * This file is part of systemtap, and is free software. You can * redistribute it and/or modify it under the terms of the GNU General @@ -13,39 +13,59 @@ #define STP_MODULE_NAME_LEN 64 struct _stp_symbol { - unsigned long addr; - const char *symbol; + unsigned long addr; + const char *symbol; +}; +struct stap_symbol { + unsigned long addr; + const char *symbol; + const char *module; }; -struct _stp_module { - /* the module name, or "" for kernel */ - char name[STP_MODULE_NAME_LEN]; - - /* A pointer to the struct module. Note that we cannot */ - /* trust this because as of 2.6.19, there are not yet */ - /* any notifier hooks that will tell us when a module */ - /* is unloading. 
*/ - unsigned long module; - - /* the start of the module's text and data sections */ - unsigned long text; - unsigned long data; - - /* how many symbols this module has that we are interested in */ - uint32_t num_symbols; - - /* how many sections this module has */ - uint32_t num_sections; - struct _stp_symbol *sections; - - /* how the symbol_data below was allocated */ - int32_t allocated; /* 0 = kmalloc, 1 = vmalloc */ - - /* an array of num_symbols _stp_symbol structs */ - struct _stp_symbol *symbols; /* ordered by address */ +DEFINE_RWLOCK(_stp_module_lock); +#define STP_RLOCK_MODULES read_lock_irqsave(&_stp_module_lock, flags) +#define STP_WLOCK_MODULES write_lock_irqsave(&_stp_module_lock, flags) +#define STP_RUNLOCK_MODULES read_unlock_irqrestore(&_stp_module_lock, flags) +#define STP_WUNLOCK_MODULES write_unlock_irqrestore(&_stp_module_lock, flags) - /* where we stash our copy of the strtab */ - void *symbol_data; /* private */ +struct _stp_module { + /* the module name, or "" for kernel */ + char name[STP_MODULE_NAME_LEN]; + + /* A pointer to the struct module. Note that we cannot */ + /* trust this because as of 2.6.19, there are not yet */ + /* any notifier hooks that will tell us when a module */ + /* is unloading. 
*/ + unsigned long module; + + /* the start of the module's text and data sections */ + unsigned long text; + unsigned long data; + + uint32_t text_size; + + /* how many symbols this module has that we are interested in */ + uint32_t num_symbols; + + /* how many sections this module has */ + uint32_t num_sections; + + /* how the symbol_data below was allocated */ + int32_t allocated; /* 0 = kmalloc, 1 = vmalloc */ + + struct _stp_symbol *sections; + + /* an array of num_symbols _stp_symbol structs */ + struct _stp_symbol *symbols; /* ordered by address */ + + /* where we stash our copy of the strtab */ + void *symbol_data; + + /* the stack unwind data for this module */ + void *unwind_data; + uint32_t unwind_data_len; + rwlock_t lock; /* lock while unwinding is happening */ + }; #ifndef STP_MAX_MODULES @@ -62,4 +82,5 @@ struct _stp_module *_stp_modules_by_addr[STP_MAX_MODULES]; int _stp_num_modules = 0; unsigned long _stp_module_relocate (const char *module, const char *section, unsigned long offset); +static struct _stp_module *_stp_get_unwind_info (unsigned long addr); #endif /* _STAP_SYMBOLS_H_ */ diff --git a/runtime/transport/ChangeLog b/runtime/transport/ChangeLog index 764e3579..c3837f86 100644 --- a/runtime/transport/ChangeLog +++ b/runtime/transport/ChangeLog @@ -1,3 +1,16 @@ +2008-02-27 Martin Hunt <hunt@redhat.com> + + * symbols.c: Use rwlocks. Use new dbug macros. Handle + unwind info if present. + + * transport.c: Include mempool.c. Update dbug and kbug calls + to new macros. + * transport_msgs.h (_stp_command_name): Add + struct containing message names for debugging. + + * control.c, procfs.c: Use new dbug macros. Use + new mempool functions. 
+ 2008-01-28 Martin Hunt <hunt@redhat.com> * control.c, procfs.c, symbols.c: Use DEFINE_SPINLOCK diff --git a/runtime/transport/control.c b/runtime/transport/control.c index 0bf99fc8..6a5b272d 100644 --- a/runtime/transport/control.c +++ b/runtime/transport/control.c @@ -12,15 +12,13 @@ #define STP_DEFAULT_BUFFERS 50 static int _stp_current_buffers = STP_DEFAULT_BUFFERS; +static _stp_mempool_t *_stp_pool_q; static struct list_head _stp_ctl_ready_q; static struct list_head _stp_sym_ready_q; -static struct list_head _stp_pool_q; -DEFINE_SPINLOCK(_stp_pool_lock); DEFINE_SPINLOCK(_stp_ctl_ready_lock); DEFINE_SPINLOCK(_stp_sym_ready_lock); -static ssize_t _stp_sym_write_cmd (struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t _stp_sym_write_cmd(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { static int saved_type = 0; int type; @@ -28,7 +26,7 @@ static ssize_t _stp_sym_write_cmd (struct file *file, const char __user *buf, if (count < sizeof(int32_t)) return 0; - /* Allow sending of packet type followed by data in the next packet.*/ + /* Allow sending of packet type followed by data in the next packet. */ if (count == sizeof(int32_t)) { if (get_user(saved_type, (int __user *)buf)) return -EFAULT; @@ -42,11 +40,14 @@ static ssize_t _stp_sym_write_cmd (struct file *file, const char __user *buf, count -= sizeof(int); buf += sizeof(int); } - - kbug ("count:%d type:%d\n", (int)count, type); + +#if DEBUG_TRANSPORT > 0 + if (type < STP_MAX_CMD) + _dbug("Got %s. 
len=%d\n", _stp_command_name[type], (int)count); +#endif switch (type) { - case STP_SYMBOLS: + case STP_SYMBOLS: count = _stp_do_symbols(buf, count); break; case STP_MODULE: @@ -54,21 +55,20 @@ static ssize_t _stp_sym_write_cmd (struct file *file, const char __user *buf, count = _stp_do_module(buf, count); else { /* count == 1 indicates end of initial modules list */ - _stp_ctl_send(STP_TRANSPORT, NULL, 0); + _stp_ctl_send(STP_TRANSPORT, NULL, 0); } break; case STP_EXIT: _stp_exit_flag = 1; break; default: - errk ("invalid symbol command type %d\n", type); + errk("invalid symbol command type %d\n", type); return -EINVAL; } return count; } -static ssize_t _stp_ctl_write_cmd (struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t _stp_ctl_write_cmd(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { int type; static int started = 0; @@ -79,7 +79,10 @@ static ssize_t _stp_ctl_write_cmd (struct file *file, const char __user *buf, if (get_user(type, (int __user *)buf)) return -EFAULT; - kbug ("count:%d type:%d\n", (int)count, type); +#if DEBUG_TRANSPORT > 0 + if (type < STP_MAX_CMD) + _dbug("Got %s. 
len=%d\n", _stp_command_name[type], (int)count); +#endif count -= sizeof(int); buf += sizeof(int); @@ -90,9 +93,9 @@ static ssize_t _stp_ctl_write_cmd (struct file *file, const char __user *buf, struct _stp_msg_start st; if (count < sizeof(st)) return 0; - if (copy_from_user (&st, buf, sizeof(st))) + if (copy_from_user(&st, buf, sizeof(st))) return -EFAULT; - _stp_handle_start (&st); + _stp_handle_start(&st); started = 1; } break; @@ -107,11 +110,11 @@ static ssize_t _stp_ctl_write_cmd (struct file *file, const char __user *buf, #endif case STP_READY: /* request symbolic information */ - _stp_ask_for_symbols(); + _stp_ask_for_symbols(); break; - + default: - errk ("invalid command type %d\n", type); + errk("invalid command type %d\n", type); return -EINVAL; } @@ -130,55 +133,55 @@ struct _stp_buffer { static DECLARE_WAIT_QUEUE_HEAD(_stp_ctl_wq); static DECLARE_WAIT_QUEUE_HEAD(_stp_sym_wq); -#ifdef DEBUG -static void _stp_ctl_write_dbug (int type, void *data, int len) +#if DEBUG_TRANSPORT > 0 +static void _stp_ctl_write_dbug(int type, void *data, int len) { char buf[64]; switch (type) { case STP_START: - printk("_stp_ctl_write: sending STP_START\n"); + _dbug("sending STP_START\n"); break; case STP_EXIT: - printk("_stp_ctl_write: sending STP_EXIT\n"); + _dbug("sending STP_EXIT\n"); break; case STP_OOB_DATA: - snprintf(buf, sizeof(buf), "%s", (char *)data); - printk("_stp_ctl_write: sending %d bytes of STP_OOB_DATA: %s\n", len, buf); + snprintf(buf, sizeof(buf), "%s", (char *)data); + _dbug("sending %d bytes of STP_OOB_DATA: %s\n", len, buf); break; case STP_SYSTEM: - snprintf(buf, sizeof(buf), "%s", (char *)data); - printk("_stp_ctl_write: sending STP_SYSTEM: %s\n", buf); + snprintf(buf, sizeof(buf), "%s", (char *)data); + _dbug("sending STP_SYSTEM: %s\n", buf); break; case STP_TRANSPORT: - printk("_stp_ctl_write: sending STP_TRANSPORT\n"); + _dbug("sending STP_TRANSPORT\n"); break; default: - printk("_stp_ctl_write: ERROR: unknown message type: %d\n", type); + 
_dbug("ERROR: unknown message type: %d\n", type); break; } } -static void _stp_sym_write_dbug (int type, void *data, int len) +static void _stp_sym_write_dbug(int type, void *data, int len) { switch (type) { case STP_SYMBOLS: - printk("_stp_sym_write: sending STP_SYMBOLS\n"); + _dbug("sending STP_SYMBOLS\n"); break; case STP_MODULE: - printk("_stp_sym_write: sending STP_MODULE\n"); + _dbug("sending STP_MODULE\n"); break; default: - printk("_stp_sym_write: ERROR: unknown message type: %d\n", type); + _dbug("ERROR: unknown message type: %d\n", type); break; } } #endif -static int _stp_ctl_write (int type, void *data, unsigned len) +static int _stp_ctl_write(int type, void *data, unsigned len) { struct _stp_buffer *bptr; unsigned long flags; - unsigned numtrylock; -#ifdef DEBUG + +#if DEBUG_TRANSPORT > 0 _stp_ctl_write_dbug(type, data, len); #endif @@ -186,47 +189,29 @@ static int _stp_ctl_write (int type, void *data, unsigned len) if (unlikely(len > STP_CTL_BUFFER_SIZE)) return 0; - numtrylock = 0; - while (!spin_trylock_irqsave (&_stp_pool_lock, flags) && (++numtrylock < MAXTRYLOCK)) - ndelay (TRYLOCKDELAY); - if (unlikely (numtrylock >= MAXTRYLOCK)) - return 0; - - if (unlikely(list_empty(&_stp_pool_q))) { - spin_unlock_irqrestore(&_stp_pool_lock, flags); - dbug("_stp_pool_q empty\n"); + /* get a buffer from the free pool */ + bptr = _stp_mempool_alloc(_stp_pool_q); + if (unlikely(bptr == NULL)) return -1; - } - - /* get the next buffer from the pool */ - bptr = (struct _stp_buffer *)_stp_pool_q.next; - list_del_init(&bptr->list); - spin_unlock_irqrestore(&_stp_pool_lock, flags); bptr->type = type; memcpy(bptr->buf, data, len); bptr->len = len; - - /* put it on the pool of ready buffers */ - numtrylock = 0; - while (!spin_trylock_irqsave (&_stp_ctl_ready_lock, flags) && (++numtrylock < MAXTRYLOCK)) - ndelay (TRYLOCKDELAY); - - if (unlikely (numtrylock >= MAXTRYLOCK)) - return 0; + /* put it on the pool of ready buffers */ + spin_lock_irqsave(&_stp_ctl_ready_lock, 
flags); list_add_tail(&bptr->list, &_stp_ctl_ready_q); spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags); return len; } -static int _stp_sym_write (int type, void *data, unsigned len) +static int _stp_sym_write(int type, void *data, unsigned len) { struct _stp_buffer *bptr; unsigned long flags; -#ifdef DEBUG +#if DEBUG_TRANSPORT > 0 _stp_sym_write_dbug(type, data, len); #endif @@ -234,24 +219,17 @@ static int _stp_sym_write (int type, void *data, unsigned len) if (unlikely(len > STP_CTL_BUFFER_SIZE)) return 0; - spin_lock_irqsave (&_stp_pool_lock, flags); - if (unlikely(list_empty(&_stp_pool_q))) { - spin_unlock_irqrestore(&_stp_pool_lock, flags); - dbug("_stp_pool_q empty\n"); + /* get a buffer from the free pool */ + bptr = _stp_mempool_alloc(_stp_pool_q); + if (unlikely(bptr == NULL)) return -1; - } - - /* get the next buffer from the pool */ - bptr = (struct _stp_buffer *)_stp_pool_q.next; - list_del_init(&bptr->list); - spin_unlock_irqrestore(&_stp_pool_lock, flags); bptr->type = type; memcpy(bptr->buf, data, len); bptr->len = len; - + /* put it on the pool of ready buffers */ - spin_lock_irqsave (&_stp_sym_ready_lock, flags); + spin_lock_irqsave(&_stp_sym_ready_lock, flags); list_add_tail(&bptr->list, &_stp_sym_ready_q); spin_unlock_irqrestore(&_stp_sym_ready_lock, flags); @@ -262,25 +240,24 @@ static int _stp_sym_write (int type, void *data, unsigned len) } /* send commands with timeout and retry */ -static int _stp_ctl_send (int type, void *data, int len) +static int _stp_ctl_send(int type, void *data, int len) { int err, trylimit = 50; - kbug("ctl_send: type=%d len=%d\n", type, len); + kbug(DEBUG_TRANSPORT, "ctl_send: type=%d len=%d\n", type, len); if (unlikely(type == STP_SYMBOLS || type == STP_MODULE)) { while ((err = _stp_sym_write(type, data, len)) < 0 && trylimit--) - msleep (5); + msleep(5); } else { while ((err = _stp_ctl_write(type, data, len)) < 0 && trylimit--) - msleep (5); + msleep(5); if (err > 0) wake_up_interruptible(&_stp_ctl_wq); } - 
kbug("returning %d\n", err); + kbug(DEBUG_TRANSPORT, "returning %d\n", err); return err; } -static ssize_t -_stp_sym_read_cmd (struct file *file, char __user *buf, size_t count, loff_t *ppos) +static ssize_t _stp_sym_read_cmd(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct _stp_buffer *bptr; int len; @@ -296,7 +273,7 @@ _stp_sym_read_cmd (struct file *file, char __user *buf, size_t count, loff_t *pp return -ERESTARTSYS; spin_lock_irqsave(&_stp_sym_ready_lock, flags); } - + /* get the next buffer off the ready list */ bptr = (struct _stp_buffer *)_stp_sym_ready_q.next; list_del_init(&bptr->list); @@ -314,15 +291,12 @@ _stp_sym_read_cmd (struct file *file, char __user *buf, size_t count, loff_t *pp } /* put it on the pool of free buffers */ - spin_lock_irqsave(&_stp_pool_lock, flags); - list_add_tail(&bptr->list, &_stp_pool_q); - spin_unlock_irqrestore(&_stp_pool_lock, flags); + _stp_mempool_free(bptr); return len; } -static ssize_t -_stp_ctl_read_cmd (struct file *file, char __user *buf, size_t count, loff_t *ppos) +static ssize_t _stp_ctl_read_cmd(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct _stp_buffer *bptr; int len; @@ -338,7 +312,7 @@ _stp_ctl_read_cmd (struct file *file, char __user *buf, size_t count, loff_t *pp return -ERESTARTSYS; spin_lock_irqsave(&_stp_ctl_ready_lock, flags); } - + /* get the next buffer off the ready list */ bptr = (struct _stp_buffer *)_stp_ctl_ready_q.next; list_del_init(&bptr->list); @@ -356,15 +330,13 @@ _stp_ctl_read_cmd (struct file *file, char __user *buf, size_t count, loff_t *pp } /* put it on the pool of free buffers */ - spin_lock_irqsave(&_stp_pool_lock, flags); - list_add_tail(&bptr->list, &_stp_pool_q); - spin_unlock_irqrestore(&_stp_pool_lock, flags); + _stp_mempool_free(bptr); return len; } static int _stp_sym_opens = 0; -static int _stp_sym_open_cmd (struct inode *inode, struct file *file) +static int _stp_sym_open_cmd(struct inode *inode, struct file *file) { /* 
only allow one reader */ if (_stp_sym_opens) @@ -374,14 +346,14 @@ static int _stp_sym_open_cmd (struct inode *inode, struct file *file) return 0; } -static int _stp_sym_close_cmd (struct inode *inode, struct file *file) +static int _stp_sym_close_cmd(struct inode *inode, struct file *file) { if (_stp_sym_opens) _stp_sym_opens--; return 0; } -static int _stp_ctl_open_cmd (struct inode *inode, struct file *file) +static int _stp_ctl_open_cmd(struct inode *inode, struct file *file) { if (_stp_attached) return -1; @@ -390,7 +362,7 @@ static int _stp_ctl_open_cmd (struct inode *inode, struct file *file) return 0; } -static int _stp_ctl_close_cmd (struct inode *inode, struct file *file) +static int _stp_ctl_close_cmd(struct inode *inode, struct file *file) { if (_stp_attached) _stp_detach(); @@ -416,12 +388,12 @@ static struct file_operations _stp_sym_fops_cmd = { static struct dentry *_stp_cmd_file = NULL; static struct dentry *_stp_sym_file = NULL; -static int _stp_register_ctl_channel (void) +static int _stp_register_ctl_channel(void) { int i; struct list_head *p, *tmp; char buf[32]; - + if (_stp_utt == NULL) { errk("_expected _stp_utt to be set.\n"); return -1; @@ -429,21 +401,16 @@ static int _stp_register_ctl_channel (void) INIT_LIST_HEAD(&_stp_ctl_ready_q); INIT_LIST_HEAD(&_stp_sym_ready_q); - INIT_LIST_HEAD(&_stp_pool_q); /* allocate buffers */ - for (i = 0; i < STP_DEFAULT_BUFFERS; i++) { - p = (struct list_head *)_stp_kmalloc(sizeof(struct _stp_buffer)); - // printk("allocated buffer at %lx\n", (long)p); - if (!p) - goto err0; - _stp_allocated_net_memory += sizeof(struct _stp_buffer); - list_add (p, &_stp_pool_q); - } + _stp_pool_q = _stp_mempool_init(sizeof(struct _stp_buffer), STP_DEFAULT_BUFFERS); + if (unlikely(_stp_pool_q == NULL)) + goto err0; + _stp_allocated_net_memory += sizeof(struct _stp_buffer) * STP_DEFAULT_BUFFERS; /* create [debugfs]/systemtap/module_name/.cmd */ _stp_cmd_file = debugfs_create_file(".cmd", 0600, _stp_utt->dir, NULL, 
&_stp_ctl_fops_cmd); - if (_stp_cmd_file == NULL) + if (_stp_cmd_file == NULL) goto err0; _stp_cmd_file->d_inode->i_uid = _stp_uid; _stp_cmd_file->d_inode->i_gid = _stp_gid; @@ -455,35 +422,29 @@ static int _stp_register_ctl_channel (void) return 0; err0: - if (_stp_cmd_file) debugfs_remove(_stp_cmd_file); - - list_for_each_safe(p, tmp, &_stp_pool_q) { - list_del(p); - _stp_kfree(p); - } - errk ("Error creating systemtap debugfs entries.\n"); + if (_stp_cmd_file) + debugfs_remove(_stp_cmd_file); + _stp_mempool_destroy(_stp_pool_q); + errk("Error creating systemtap debugfs entries.\n"); return -1; } - -static void _stp_unregister_ctl_channel (void) +static void _stp_unregister_ctl_channel(void) { struct list_head *p, *tmp; - if (_stp_sym_file) debugfs_remove(_stp_sym_file); - if (_stp_cmd_file) debugfs_remove(_stp_cmd_file); + if (_stp_sym_file) + debugfs_remove(_stp_sym_file); + if (_stp_cmd_file) + debugfs_remove(_stp_cmd_file); - /* free memory pools */ - list_for_each_safe(p, tmp, &_stp_pool_q) { - list_del(p); - _stp_kfree(p); - } + /* Return memory to pool and free it. 
*/ list_for_each_safe(p, tmp, &_stp_sym_ready_q) { list_del(p); - _stp_kfree(p); + _stp_mempool_free(p); } list_for_each_safe(p, tmp, &_stp_ctl_ready_q) { list_del(p); - _stp_kfree(p); + _stp_mempool_free(p); } + _stp_mempool_destroy(_stp_pool_q); } - diff --git a/runtime/transport/procfs.c b/runtime/transport/procfs.c index 33f6db33..2afea1c9 100644 --- a/runtime/transport/procfs.c +++ b/runtime/transport/procfs.c @@ -12,18 +12,16 @@ #define STP_DEFAULT_BUFFERS 256 static int _stp_current_buffers = STP_DEFAULT_BUFFERS; +static _stp_mempool_t *_stp_pool_q; static struct list_head _stp_ctl_ready_q; static struct list_head _stp_sym_ready_q; -static struct list_head _stp_pool_q; -DEFINE_SPINLOCK(_stp_pool_lock); DEFINE_SPINLOCK(_stp_ctl_ready_lock); DEFINE_SPINLOCK(_stp_sym_ready_lock); #ifdef STP_BULKMODE extern int _stp_relay_flushing; /* handle the per-cpu subbuf info read for relayfs */ -static ssize_t -_stp_proc_read (struct file *file, char __user *buf, size_t count, loff_t *ppos) +static ssize_t _stp_proc_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { int num; struct _stp_buf_info out; @@ -46,8 +44,7 @@ _stp_proc_read (struct file *file, char __user *buf, size_t count, loff_t *ppos) } /* handle the per-cpu subbuf info write for relayfs */ -static ssize_t _stp_proc_write (struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t _stp_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct _stp_consumed_info info; int cpu = *(int *)(PDE(file->f_dentry->d_inode)->data); @@ -65,8 +62,7 @@ static struct file_operations _stp_proc_fops = { }; #endif /* STP_BULKMODE */ -static ssize_t _stp_sym_write_cmd (struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t _stp_sym_write_cmd(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { static int saved_type = 0; int type; @@ -74,7 +70,7 @@ static ssize_t _stp_sym_write_cmd (struct 
file *file, const char __user *buf, if (count < sizeof(int32_t)) return 0; - /* Allow sending of packet type followed by data in the next packet.*/ + /* Allow sending of packet type followed by data in the next packet. */ if (count == sizeof(int32_t)) { if (get_user(saved_type, (int __user *)buf)) return -EFAULT; @@ -88,11 +84,14 @@ static ssize_t _stp_sym_write_cmd (struct file *file, const char __user *buf, count -= sizeof(int); buf += sizeof(int); } - - // kbug ("count:%d type:%d\n", (int)count, type); + +#if DEBUG_TRANSPORT > 0 + if (type < STP_MAX_CMD) + _dbug("Got %s. len=%d\n", _stp_command_name[type], (int)count); +#endif switch (type) { - case STP_SYMBOLS: + case STP_SYMBOLS: count = _stp_do_symbols(buf, count); break; case STP_MODULE: @@ -100,19 +99,19 @@ static ssize_t _stp_sym_write_cmd (struct file *file, const char __user *buf, count = _stp_do_module(buf, count); else { /* count == 1 indicates end of initial modules list */ - _stp_ctl_send(STP_TRANSPORT, NULL, 0); + _stp_ctl_send(STP_TRANSPORT, NULL, 0); } break; default: - errk ("invalid symbol command type %d\n", type); + errk("invalid symbol command type %d\n", type); return -EINVAL; } return count; } -static ssize_t _stp_ctl_write_cmd (struct file *file, const char __user *buf, - size_t count, loff_t *ppos) + +static ssize_t _stp_ctl_write_cmd(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { int type; static int started = 0; @@ -123,7 +122,10 @@ static ssize_t _stp_ctl_write_cmd (struct file *file, const char __user *buf, if (get_user(type, (int __user *)buf)) return -EFAULT; - // kbug ("count:%d type:%d\n", count, type); +#if DEBUG_TRANSPORT > 0 + if (type < STP_MAX_CMD) + _dbug("Got %s. 
len=%d\n", _stp_command_name[type], (int)count); +#endif count -= sizeof(int); buf += sizeof(int); @@ -134,9 +136,9 @@ static ssize_t _stp_ctl_write_cmd (struct file *file, const char __user *buf, struct _stp_msg_start st; if (count < sizeof(st)) return 0; - if (copy_from_user (&st, buf, sizeof(st))) + if (copy_from_user(&st, buf, sizeof(st))) return -EFAULT; - _stp_handle_start (&st); + _stp_handle_start(&st); started = 1; } break; @@ -145,10 +147,10 @@ static ssize_t _stp_ctl_write_cmd (struct file *file, const char __user *buf, break; case STP_READY: /* request symbolic information */ - _stp_ask_for_symbols(); + _stp_ask_for_symbols(); break; default: - errk ("invalid command type %d\n", type); + errk("invalid command type %d\n", type); return -EINVAL; } @@ -165,76 +167,66 @@ struct _stp_buffer { static DECLARE_WAIT_QUEUE_HEAD(_stp_ctl_wq); static DECLARE_WAIT_QUEUE_HEAD(_stp_sym_wq); -#ifdef DEBUG -static void _stp_ctl_write_dbug (int type, void *data, int len) +#if DEBUG_TRANSPORT > 0 +static void _stp_ctl_write_dbug(int type, void *data, int len) { char buf[64]; switch (type) { - case STP_REALTIME_DATA: - break; case STP_START: - printk("_stp_ctl_write: sending STP_START\n"); + _dbug("sending STP_START\n"); break; case STP_EXIT: - printk("_stp_ctl_write: sending STP_EXIT\n"); + _dbug("sending STP_EXIT\n"); break; case STP_OOB_DATA: - snprintf(buf, sizeof(buf), "%s", (char *)data); - printk("_stp_ctl_write: sending %d bytes of STP_OOB_DATA: %s\n", len, buf); + snprintf(buf, sizeof(buf), "%s", (char *)data); + _dbug("sending %d bytes of STP_OOB_DATA: %s\n", len, buf); break; case STP_SYSTEM: - snprintf(buf, sizeof(buf), "%s", (char *)data); - printk("_stp_ctl_write: sending STP_SYSTEM: %s\n", buf); + snprintf(buf, sizeof(buf), "%s", (char *)data); + _dbug("sending STP_SYSTEM: %s\n", buf); break; case STP_TRANSPORT: - printk("_stp_ctl_write: sending STP_TRANSPORT\n"); + _dbug("sending STP_TRANSPORT\n"); break; default: - printk("_stp_ctl_write: ERROR: unknown 
message type: %d\n", type); + _dbug("ERROR: unknown message type: %d\n", type); break; } } -static void _stp_sym_write_dbug (int type, void *data, int len) +static void _stp_sym_write_dbug(int type, void *data, int len) { switch (type) { case STP_SYMBOLS: - printk("_stp_sym_write: sending STP_SYMBOLS\n"); + _dbug("sending STP_SYMBOLS\n"); break; case STP_MODULE: - printk("_stp_sym_write: sending STP_MODULE\n"); + _dbug("sending STP_MODULE\n"); break; default: - printk("_stp_sym_write: ERROR: unknown message type: %d\n", type); + _dbug("ERROR: unknown message type: %d\n", type); break; } } #endif -static int _stp_ctl_write (int type, void *data, int len) +static int _stp_ctl_write(int type, void *data, int len) { struct _stp_buffer *bptr; unsigned long flags; - unsigned numtrylock; -#ifdef DEBUG +#if DEBUG_TRANSPORT > 0 _stp_ctl_write_dbug(type, data, len); #endif #define WRITE_AGG #ifdef WRITE_AGG - numtrylock = 0; - while (!spin_trylock_irqsave (&_stp_ctl_ready_lock, flags) && (++numtrylock < MAXTRYLOCK)) - ndelay (TRYLOCKDELAY); - if (unlikely (numtrylock >= MAXTRYLOCK)) - return 0; - + spin_lock_irqsave(&_stp_ctl_ready_lock, flags); if (!list_empty(&_stp_ctl_ready_q)) { bptr = (struct _stp_buffer *)_stp_ctl_ready_q.prev; - if (bptr->len + len <= STP_BUFFER_SIZE - && type == STP_REALTIME_DATA - && bptr->type == STP_REALTIME_DATA) { - memcpy (bptr->buf + bptr->len, data, len); + if (bptr->len + len <= STP_BUFFER_SIZE && type == STP_REALTIME_DATA && bptr->type == STP_REALTIME_DATA) { + memcpy(bptr->buf + bptr->len, data, len); bptr->len += len; spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags); return len; @@ -247,47 +239,29 @@ static int _stp_ctl_write (int type, void *data, int len) if (unlikely(len > STP_BUFFER_SIZE)) return 0; - numtrylock = 0; - while (!spin_trylock_irqsave (&_stp_pool_lock, flags) && (++numtrylock < MAXTRYLOCK)) - ndelay (TRYLOCKDELAY); - if (unlikely (numtrylock >= MAXTRYLOCK)) - return 0; - - if (unlikely(list_empty(&_stp_pool_q))) { - 
spin_unlock_irqrestore(&_stp_pool_lock, flags); - dbug("_stp_pool_q empty\n"); + /* get a buffer from the free pool */ + bptr = _stp_mempool_alloc(_stp_pool_q); + if (unlikely(bptr == NULL)) return -1; - } - - /* get the next buffer from the pool */ - bptr = (struct _stp_buffer *)_stp_pool_q.next; - list_del_init(&bptr->list); - spin_unlock_irqrestore(&_stp_pool_lock, flags); bptr->type = type; - memcpy (bptr->buf, data, len); + memcpy(bptr->buf, data, len); bptr->len = len; - - /* put it on the pool of ready buffers */ - numtrylock = 0; - while (!spin_trylock_irqsave (&_stp_ctl_ready_lock, flags) && (++numtrylock < MAXTRYLOCK)) - ndelay (TRYLOCKDELAY); - - if (unlikely (numtrylock >= MAXTRYLOCK)) - return 0; + /* put it on the pool of ready buffers */ + spin_lock_irqsave(&_stp_ctl_ready_lock, flags); list_add_tail(&bptr->list, &_stp_ctl_ready_q); spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags); return len; } -static int _stp_sym_write (int type, void *data, unsigned len) +static int _stp_sym_write(int type, void *data, unsigned len) { struct _stp_buffer *bptr; unsigned long flags; -#ifdef DEBUG +#if DEBUG_TRANSPORT > 0 _stp_sym_write_dbug(type, data, len); #endif @@ -295,24 +269,17 @@ static int _stp_sym_write (int type, void *data, unsigned len) if (unlikely(len > STP_BUFFER_SIZE)) return 0; - spin_lock_irqsave (&_stp_pool_lock, flags); - if (unlikely(list_empty(&_stp_pool_q))) { - spin_unlock_irqrestore(&_stp_pool_lock, flags); - dbug("_stp_pool_q empty\n"); + /* get a buffer from the free pool */ + bptr = _stp_mempool_alloc(_stp_pool_q); + if (unlikely(bptr == NULL)) return -1; - } - - /* get the next buffer from the pool */ - bptr = (struct _stp_buffer *)_stp_pool_q.next; - list_del_init(&bptr->list); - spin_unlock_irqrestore(&_stp_pool_lock, flags); bptr->type = type; memcpy(bptr->buf, data, len); bptr->len = len; /* put it on the pool of ready buffers */ - spin_lock_irqsave (&_stp_sym_ready_lock, flags); + spin_lock_irqsave(&_stp_sym_ready_lock, flags); 
list_add_tail(&bptr->list, &_stp_sym_ready_q); spin_unlock_irqrestore(&_stp_sym_ready_lock, flags); @@ -323,23 +290,24 @@ static int _stp_sym_write (int type, void *data, unsigned len) } /* send commands with timeout and retry */ -static int _stp_ctl_send (int type, void *data, int len) +static int _stp_ctl_send(int type, void *data, int len) { int err, trylimit = 50; + kbug(DEBUG_TRANSPORT, "ctl_send: type=%d len=%d\n", type, len); if (unlikely(type == STP_SYMBOLS || type == STP_MODULE)) { while ((err = _stp_sym_write(type, data, len)) < 0 && trylimit--) - msleep (5); + msleep(5); } else { while ((err = _stp_ctl_write(type, data, len)) < 0 && trylimit--) - msleep (5); + msleep(5); if (err > 0) wake_up_interruptible(&_stp_ctl_wq); } + kbug(DEBUG_TRANSPORT, "returning %d\n", err); return err; } -static ssize_t -_stp_sym_read_cmd (struct file *file, char __user *buf, size_t count, loff_t *ppos) +static ssize_t _stp_sym_read_cmd(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct _stp_buffer *bptr; int len; @@ -355,7 +323,7 @@ _stp_sym_read_cmd (struct file *file, char __user *buf, size_t count, loff_t *pp return -ERESTARTSYS; spin_lock_irqsave(&_stp_sym_ready_lock, flags); } - + /* get the next buffer off the ready list */ bptr = (struct _stp_buffer *)_stp_sym_ready_q.next; list_del_init(&bptr->list); @@ -373,15 +341,12 @@ _stp_sym_read_cmd (struct file *file, char __user *buf, size_t count, loff_t *pp } /* put it on the pool of free buffers */ - spin_lock_irqsave(&_stp_pool_lock, flags); - list_add_tail(&bptr->list, &_stp_pool_q); - spin_unlock_irqrestore(&_stp_pool_lock, flags); + _stp_mempool_free(bptr); return len; } -static ssize_t -_stp_ctl_read_cmd (struct file *file, char __user *buf, size_t count, loff_t *ppos) +static ssize_t _stp_ctl_read_cmd(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct _stp_buffer *bptr; int len; @@ -397,7 +362,7 @@ _stp_ctl_read_cmd (struct file *file, char __user *buf, size_t count, 
loff_t *pp return -ERESTARTSYS; spin_lock_irqsave(&_stp_ctl_ready_lock, flags); } - + /* get the next buffer off the ready list */ bptr = (struct _stp_buffer *)_stp_ctl_ready_q.next; list_del_init(&bptr->list); @@ -415,15 +380,13 @@ _stp_ctl_read_cmd (struct file *file, char __user *buf, size_t count, loff_t *pp } /* put it on the pool of free buffers */ - spin_lock_irqsave(&_stp_pool_lock, flags); - list_add_tail(&bptr->list, &_stp_pool_q); - spin_unlock_irqrestore(&_stp_pool_lock, flags); + _stp_mempool_free(bptr); return len; } static int _stp_sym_opens = 0; -static int _stp_sym_open_cmd (struct inode *inode, struct file *file) +static int _stp_sym_open_cmd(struct inode *inode, struct file *file) { /* only allow one reader */ if (_stp_sym_opens) @@ -433,14 +396,14 @@ static int _stp_sym_open_cmd (struct inode *inode, struct file *file) return 0; } -static int _stp_sym_close_cmd (struct inode *inode, struct file *file) +static int _stp_sym_close_cmd(struct inode *inode, struct file *file) { if (_stp_sym_opens) _stp_sym_opens--; return 0; } -static int _stp_ctl_open_cmd (struct inode *inode, struct file *file) +static int _stp_ctl_open_cmd(struct inode *inode, struct file *file) { if (_stp_attached) return -1; @@ -449,7 +412,7 @@ static int _stp_ctl_open_cmd (struct inode *inode, struct file *file) return 0; } -static int _stp_ctl_close_cmd (struct inode *inode, struct file *file) +static int _stp_ctl_close_cmd(struct inode *inode, struct file *file) { if (_stp_attached) _stp_detach(); @@ -482,46 +445,14 @@ static int my_proc_match(int len, const char *name, struct proc_dir_entry *de) /* set the number of buffers to use to 'num' */ static int _stp_set_buffers(int num) { - int i; - struct list_head *p; - unsigned long flags; - - //printk("stp_set_buffers %d\n", num); - - if (num == 0 || num == _stp_current_buffers) - return _stp_current_buffers; - - if (num > _stp_current_buffers) { - for (i = 0; i < num - _stp_current_buffers; i++) { - p = (struct list_head 
*)_stp_kmalloc(sizeof(struct _stp_buffer)); - if (!p) { - _stp_current_buffers += i; - goto err; - } - _stp_allocated_net_memory += sizeof(struct _stp_buffer); - spin_lock_irqsave(&_stp_pool_lock, flags); - list_add (p, &_stp_pool_q); - spin_unlock_irqrestore(&_stp_pool_lock, flags); - } - } else { - for (i = 0; i < _stp_current_buffers - num; i++) { - spin_lock_irqsave(&_stp_pool_lock, flags); - p = _stp_pool_q.next; - list_del(p); - spin_unlock_irqrestore(&_stp_pool_lock, flags); - _stp_kfree(p); - } - } - _stp_current_buffers = num; -err: - return _stp_current_buffers; + kbug(DEBUG_TRANSPORT, "stp_set_buffers %d\n", num); + return _stp_mempool_resize(_stp_pool_q, num); } -static int _stp_ctl_read_bufsize (char *page, char **start, off_t off, - int count, int *eof, void *data) +static int _stp_ctl_read_bufsize(char *page, char **start, off_t off, int count, int *eof, void *data) { int len = sprintf(page, "%d,%d\n", _stp_nsubbufs, _stp_subbuf_size); - if (len <= off+count) + if (len <= off + count) *eof = 1; *start = page + off; len -= off; @@ -532,7 +463,7 @@ static int _stp_ctl_read_bufsize (char *page, char **start, off_t off, return len; } -static int _stp_register_ctl_channel (void) +static int _stp_register_ctl_channel(void) { int i; const char *dirname = "systemtap"; @@ -546,17 +477,12 @@ static int _stp_register_ctl_channel (void) INIT_LIST_HEAD(&_stp_ctl_ready_q); INIT_LIST_HEAD(&_stp_sym_ready_q); - INIT_LIST_HEAD(&_stp_pool_q); /* allocate buffers */ - for (i = 0; i < STP_DEFAULT_BUFFERS; i++) { - p = (struct list_head *)_stp_kmalloc(sizeof(struct _stp_buffer)); - // printk("allocated buffer at %lx\n", (long)p); - if (!p) - goto err0; - _stp_allocated_net_memory += sizeof(struct _stp_buffer); - list_add (p, &_stp_pool_q); - } + _stp_pool_q = _stp_mempool_init(sizeof(struct _stp_buffer), STP_DEFAULT_BUFFERS); + if (unlikely(_stp_pool_q == NULL)) + goto err0; + _stp_allocated_net_memory += sizeof(struct _stp_buffer) * STP_DEFAULT_BUFFERS; if 
(!_stp_mkdir_proc_module()) goto err0; @@ -565,15 +491,15 @@ static int _stp_register_ctl_channel (void) /* now for each cpu "n", create /proc/systemtap/module_name/n */ for_each_cpu(i) { sprintf(buf, "%d", i); - de = create_proc_entry (buf, 0600, _stp_proc_root); - if (de == NULL) + de = create_proc_entry(buf, 0600, _stp_proc_root); + if (de == NULL) goto err1; de->uid = _stp_uid; de->gid = _stp_gid; de->proc_fops = &_stp_proc_fops; de->data = _stp_kmalloc(sizeof(int)); if (de->data == NULL) { - remove_proc_entry (buf, _stp_proc_root); + remove_proc_entry(buf, _stp_proc_root); goto err1; } *(int *)de->data = i; @@ -582,48 +508,44 @@ static int _stp_register_ctl_channel (void) #endif /* STP_BULKMODE */ /* create /proc/systemtap/module_name/.cmd */ - de = create_proc_entry (".cmd", 0600, _stp_proc_root); - if (de == NULL) + de = create_proc_entry(".cmd", 0600, _stp_proc_root); + if (de == NULL) goto err1; de->uid = _stp_uid; de->gid = _stp_gid; de->proc_fops = &_stp_proc_fops_cmd; /* create /proc/systemtap/module_name/.symbols */ - de = create_proc_entry (".symbols", 0600, _stp_proc_root); - if (de == NULL) + de = create_proc_entry(".symbols", 0600, _stp_proc_root); + if (de == NULL) goto err2; de->proc_fops = &_stp_sym_fops_cmd; return 0; err2: - remove_proc_entry (".cmd", _stp_proc_root); + remove_proc_entry(".cmd", _stp_proc_root); err1: #ifdef STP_BULKMODE for (de = _stp_proc_root->subdir; de; de = de->next) - _stp_kfree (de->data); + _stp_kfree(de->data); for_each_cpu(j) { if (j == i) break; sprintf(buf, "%d", j); - remove_proc_entry (buf, _stp_proc_root); - + remove_proc_entry(buf, _stp_proc_root); + } - if (bs) remove_proc_entry ("bufsize", _stp_proc_root); + if (bs) + remove_proc_entry("bufsize", _stp_proc_root); #endif /* STP_BULKMODE */ _stp_rmdir_proc_module(); err0: - list_for_each_safe(p, tmp, &_stp_pool_q) { - list_del(p); - _stp_kfree(p); - } - - errk ("Error creating systemtap /proc entries.\n"); + _stp_mempool_destroy(_stp_pool_q); + errk("Error 
creating systemtap /proc entries.\n"); return -1; } - -static void _stp_unregister_ctl_channel (void) +static void _stp_unregister_ctl_channel(void) { struct list_head *p, *tmp; char buf[32]; @@ -632,31 +554,27 @@ static void _stp_unregister_ctl_channel (void) struct proc_dir_entry *de; kbug("unregistering procfs\n"); for (de = _stp_proc_root->subdir; de; de = de->next) - _stp_kfree (de->data); + _stp_kfree(de->data); for_each_cpu(i) { sprintf(buf, "%d", i); - remove_proc_entry (buf, _stp_proc_root); + remove_proc_entry(buf, _stp_proc_root); } - remove_proc_entry ("bufsize", _stp_proc_root); + remove_proc_entry("bufsize", _stp_proc_root); #endif /* STP_BULKMODE */ - remove_proc_entry (".symbols", _stp_proc_root); - remove_proc_entry (".cmd", _stp_proc_root); + remove_proc_entry(".symbols", _stp_proc_root); + remove_proc_entry(".cmd", _stp_proc_root); _stp_rmdir_proc_module(); - /* free memory pools */ - list_for_each_safe(p, tmp, &_stp_pool_q) { - list_del(p); - _stp_kfree(p); - } + /* Return memory to pool and free it. 
*/ list_for_each_safe(p, tmp, &_stp_sym_ready_q) { list_del(p); - _stp_kfree(p); + _stp_mempool_free(p); } list_for_each_safe(p, tmp, &_stp_ctl_ready_q) { list_del(p); - _stp_kfree(p); + _stp_mempool_free(p); } + _stp_mempool_destroy(_stp_pool_q); } - diff --git a/runtime/transport/symbols.c b/runtime/transport/symbols.c index e740dde8..8c453a55 100644 --- a/runtime/transport/symbols.c +++ b/runtime/transport/symbols.c @@ -16,16 +16,6 @@ #define _SYMBOLS_C_ #include "../sym.h" -DEFINE_SPINLOCK(_stp_module_lock); -#define STP_TRYLOCK_MODULES ({ \ - int numtrylock = 0; \ - while (!spin_trylock_irqsave (&_stp_module_lock, flags) && (++numtrylock < MAXTRYLOCK)) \ - ndelay (TRYLOCKDELAY); \ - (numtrylock >= MAXTRYLOCK); \ - }) -#define STP_LOCK_MODULES spin_lock_irqsave(&_stp_module_lock, flags) -#define STP_UNLOCK_MODULES spin_unlock_irqrestore(&_stp_module_lock, flags) - static char *_stp_symbol_data = NULL; static int _stp_symbol_state = 0; static char *_stp_module_data = NULL; @@ -63,7 +53,7 @@ static unsigned _stp_get_sym_sizes(struct module *m, unsigned *dsize) } /* allocate space for a module and symbols */ -static struct _stp_module * _stp_alloc_module(unsigned num, unsigned datasize) +static struct _stp_module * _stp_alloc_module(unsigned num, unsigned datasize, unsigned unwindsize) { struct _stp_module *mod = (struct _stp_module *)_stp_kzalloc(sizeof(struct _stp_module)); if (mod == NULL) @@ -85,6 +75,14 @@ static struct _stp_module * _stp_alloc_module(unsigned num, unsigned datasize) mod->allocated |= 2; } + mod->unwind_data = _stp_kmalloc(unwindsize); + if (mod->unwind_data == NULL) { + mod->unwind_data = _stp_vmalloc(unwindsize); + if (mod->unwind_data == NULL) + goto bad; + mod->allocated |= 4; + } + mod->num_symbols = num; return mod; @@ -97,19 +95,40 @@ bad: _stp_kfree(mod->symbols); mod->symbols = NULL; } + if (mod->symbol_data) { + if (mod->allocated & 2) + _stp_vfree(mod->symbol_data); + else + _stp_kfree(mod->symbol_data); + mod->symbol_data = NULL; 
+ } + _stp_kfree(mod); + if (mod->symbols) { + if (mod->allocated & 1) + _stp_vfree(mod->symbols); + else + _stp_kfree(mod->symbols); + mod->symbols = NULL; + } _stp_kfree(mod); } return NULL; } -static struct _stp_module * _stp_alloc_module_from_module (struct module *m) +static struct _stp_module * _stp_alloc_module_from_module (struct module *m, uint32_t unwind_len) { unsigned datasize, num = _stp_get_sym_sizes(m, &datasize); - return _stp_alloc_module(num, datasize); + return _stp_alloc_module(num, datasize, unwind_len); } static void _stp_free_module(struct _stp_module *mod) { + /* The module write lock is held. Any prior readers of this */ + /* module's data will have read locks and need to finish before */ + /* the memory is freed. */ + write_lock(&mod->lock); + write_unlock(&mod->lock); /* there will be no more readers */ + /* free symbol memory */ if (mod->symbols) { if (mod->allocated & 1) @@ -126,21 +145,30 @@ static void _stp_free_module(struct _stp_module *mod) mod->symbol_data = NULL; } + if (mod->unwind_data) { + if (mod->allocated & 4) + _stp_vfree(mod->unwind_data); + else + _stp_kfree(mod->unwind_data); + mod->unwind_data = NULL; + + } if (mod->sections) { _stp_kfree(mod->sections); mod->sections = NULL; } + /* free module memory */ _stp_kfree(mod); } /* Delete a module and free its memory. */ -/* The lock should already be held before calling this. */ +/* The module lock should already be held before calling this. 
*/ static void _stp_del_module(struct _stp_module *mod) { int i, num; - // kbug("deleting %s\n", mod->name); + // kbug(DEBUG_SYMBOLS, "deleting %s\n", mod->name); /* signal relocation code to clear its cache */ _stp_module_relocate((char *)-1, NULL, 0); @@ -185,7 +213,7 @@ static unsigned long _stp_kallsyms_lookup_name(const char *name); static int _stp_do_symbols(const char __user *buf, int count) { struct _stp_symbol *s; - unsigned datasize, num; + unsigned datasize, num, unwindsize; int i; switch (_stp_symbol_state) { @@ -198,23 +226,26 @@ static int _stp_do_symbols(const char __user *buf, int count) return -EFAULT; if (get_user(datasize, (unsigned __user *)(buf+4))) return -EFAULT; - //kbug("num=%d datasize=%d\n", num, datasize); + if (get_user(unwindsize, (unsigned __user *)(buf+8))) + return -EFAULT; + dbug(DEBUG_UNWIND, "num=%d datasize=%d unwindsize=%d\n", num, datasize, unwindsize); - _stp_modules[0] = _stp_alloc_module(num, datasize); + _stp_modules[0] = _stp_alloc_module(num, datasize, unwindsize); if (_stp_modules[0] == NULL) { errk("cannot allocate memory\n"); return -EFAULT; } + rwlock_init(&_stp_modules[0]->lock); _stp_symbol_state = 1; break; case 1: - //kbug("got stap_symbols, count=%d\n", count); + dbug(DEBUG_SYMBOLS, "got stap_symbols, count=%d\n", count); if (copy_from_user ((char *)_stp_modules[0]->symbols, buf, count)) return -EFAULT; _stp_symbol_state = 2; break; case 2: - //kbug("got symbol data, count=%d buf=%p\n", count, buf); + dbug(DEBUG_SYMBOLS, "got symbol data, count=%d buf=%p\n", count, buf); if (copy_from_user (_stp_modules[0]->symbol_data, buf, count)) return -EFAULT; _stp_num_modules = 1; @@ -227,8 +258,19 @@ static int _stp_do_symbols(const char __user *buf, int count) /* NB: this mapping is used by kernel/_stext pseudo-relocations. 
*/ _stp_modules[0]->text = _stp_kallsyms_lookup_name("_stext"); _stp_modules[0]->data = _stp_kallsyms_lookup_name("_etext"); + _stp_modules[0]->text_size = _stp_modules[0]->data - _stp_modules[0]->text; _stp_modules_by_addr[0] = _stp_modules[0]; - //kbug("done with symbol data\n"); + dbug(DEBUG_SYMBOLS, "Got kernel symbols. text=%p len=%u\n", + (int64_t)_stp_modules[0]->text, _stp_modules[0]->text_size); + break; + case 3: + dbug(DEBUG_UNWIND, "got unwind data, count=%d\n", count); + _stp_symbol_state = 4; + if (copy_from_user (_stp_modules[0]->unwind_data, buf, count)) { + _dbug("cfu failed\n"); + return -EFAULT; + } + _stp_modules[0]->unwind_data_len = count; break; default: errk("unexpected symbol data of size %d.\n", count); @@ -266,10 +308,8 @@ static void u32_swap(void *a, void *b, int size) static void generic_swap(void *a, void *b, int size) { - char t; - do { - t = *(char *)a; + char t = *(char *)a; *(char *)a++ = *(char *)b; *(char *)b++ = t; } while (--size > 0); @@ -328,7 +368,7 @@ void _stp_sort(void *base, size_t num, size_t size, } /* Create a new _stp_module and load the symbols */ -static struct _stp_module *_stp_load_module_symbols (struct _stp_module *imod) +static struct _stp_module *_stp_load_module_symbols (struct _stp_module *imod, uint32_t unwind_len) { unsigned i, num=0; struct module *m = (struct module *)imod->module; @@ -336,12 +376,12 @@ static struct _stp_module *_stp_load_module_symbols (struct _stp_module *imod) char *dataptr; if (m == NULL) { - kbug("imod->module is NULL\n"); + kbug(DEBUG_SYMBOLS, "imod->module is NULL\n"); return NULL; } if (try_module_get(m)) { - mod = _stp_alloc_module_from_module(m); + mod = _stp_alloc_module_from_module(m, unwind_len); if (mod == NULL) { module_put(m); errk("failed to allocate memory for module.\n"); @@ -354,6 +394,8 @@ static struct _stp_module *_stp_load_module_symbols (struct _stp_module *imod) mod->data = imod->data; mod->num_sections = imod->num_sections; mod->sections = imod->sections; + 
mod->text_size = m->core_text_size; + rwlock_init(&mod->lock); /* now copy all the symbols we are interested in */ dataptr = mod->symbol_data; @@ -375,24 +417,32 @@ static struct _stp_module *_stp_load_module_symbols (struct _stp_module *imod) return mod; } -/* Do we already have this module? */ -static int _stp_module_exists(struct _stp_module *mod) +/* Remove any old module info from our database */ +static void _stp_module_exists_delete (struct _stp_module *mod) { - int i, res; - unsigned long flags; - // kbug("exists? %s\n", mod->name); - STP_LOCK_MODULES; - for (i = 1; i < _stp_num_modules; i++) { - res = strcmp(_stp_modules[i]->name, mod->name); - if (res > 0) + int i, num; + + /* remove any old modules with the same name */ + for (num = 1; num < _stp_num_modules; num++) { + if (strcmp(_stp_modules[num]->name, mod->name) == 0) { + dbug(DEBUG_SYMBOLS, "found existing module with name %s. Deleting.\n", mod->name); + _stp_del_module(_stp_modules[num]); break; - if (res == 0 && _stp_modules[i]->module == mod->module) { - STP_UNLOCK_MODULES; - return 1; } } - STP_UNLOCK_MODULES; - return 0; + + /* remove modules with overlapping addresses */ + for (num = 1; num < _stp_num_modules; num++) { + if (mod->text + mod->text_size < _stp_modules_by_addr[num]->text) + continue; + if (mod->text < _stp_modules_by_addr[num]->text + + _stp_modules_by_addr[num]->text_size) { + dbug(DEBUG_SYMBOLS, "New module %s overlaps with old module %s. 
Deleting old.\n", + mod->name, _stp_modules_by_addr[num]->name); + _stp_del_module(_stp_modules_by_addr[num]); + } + } + } static int _stp_ins_module(struct _stp_module *mod) @@ -400,9 +450,11 @@ static int _stp_ins_module(struct _stp_module *mod) int i, num, res, ret = 0; unsigned long flags; - // kbug("insert %s\n", mod->name); + // kbug(DEBUG_SYMBOLS, "insert %s\n", mod->name); - STP_LOCK_MODULES; + STP_WLOCK_MODULES; + + _stp_module_exists_delete(mod); /* check for overflow */ if (_stp_num_modules == STP_MAX_MODULES) { @@ -412,32 +464,25 @@ static int _stp_ins_module(struct _stp_module *mod) } /* insert alphabetically in _stp_modules[] */ - for (num = 1; num < _stp_num_modules; num++) { - res = strcmp(_stp_modules[num]->name, mod->name); - if (res < 0) - continue; - if (res > 0) + for (num = 1; num < _stp_num_modules; num++) + if (strcmp(_stp_modules[num]->name, mod->name) > 0) break; - _stp_del_module(_stp_modules[num]); - break; - } for (i = _stp_num_modules; i > num; i--) _stp_modules[i] = _stp_modules[i-1]; _stp_modules[num] = mod; /* insert by text address in _stp_modules_by_addr[] */ - for (num = 1; num < _stp_num_modules; num++) { - if (_stp_modules_by_addr[num]->text > mod->text) + for (num = 1; num < _stp_num_modules; num++) + if (mod->text < _stp_modules_by_addr[num]->text) break; - } for (i = _stp_num_modules; i > num; i--) _stp_modules_by_addr[i] = _stp_modules_by_addr[i-1]; _stp_modules_by_addr[num] = mod; - + _stp_num_modules++; done: - STP_UNLOCK_MODULES; + STP_WUNLOCK_MODULES; return ret; } @@ -456,13 +501,13 @@ static int _stp_do_module(const char __user *buf, int count) if (copy_from_user ((char *)&tmpmod, buf, sizeof(tmpmod))) return -EFAULT; - section_len = count - sizeof(tmpmod); + section_len = count - sizeof(tmpmod) - tmpmod.unwind_len; if (section_len <= 0) { errk("section_len = %d\n", section_len); return -EFAULT; } - kbug("Got module %s, count=%d section_len=%d\n", - tmpmod.name, count, section_len); + dbug(DEBUG_SYMBOLS, "Got module 
%s, count=%d section_len=%d unwind_len=%d\n", + tmpmod.name, count, section_len, tmpmod.unwind_len); strcpy(mod.name, tmpmod.name); mod.module = tmpmod.module; @@ -470,9 +515,6 @@ static int _stp_do_module(const char __user *buf, int count) mod.data = tmpmod.data; mod.num_sections = tmpmod.num_sections; - if (_stp_module_exists(&mod)) - return count; - /* copy in section data */ mod.sections = _stp_kmalloc(section_len); if (mod.sections == NULL) { @@ -489,18 +531,27 @@ static int _stp_do_module(const char __user *buf, int count) + (long)((long)mod.sections + mod.num_sections * sizeof(struct _stp_symbol))); } - #ifdef DEBUG_SYMBOLS + #if 0 for (i = 0; i < mod.num_sections; i++) - printk("section %d (stored at %p): %s %lx\n", i, &mod.sections[i], mod.sections[i].symbol, mod.sections[i].addr); + _dbug("section %d (stored at %p): %s %lx\n", i, &mod.sections[i], mod.sections[i].symbol, mod.sections[i].addr); #endif /* load symbols from tmpmod.module to mod */ - m = _stp_load_module_symbols(&mod); + m = _stp_load_module_symbols(&mod, tmpmod.unwind_len); if (m == NULL) { _stp_kfree(mod.sections); return 0; } + dbug(DEBUG_SYMBOLS, "module %s loaded. 
Text=%p text_size=%u\n", m->name, (int64_t)m->text, m->text_size); + /* finally copy unwind info */ + if (copy_from_user (m->unwind_data, buf+sizeof(tmpmod)+section_len, tmpmod.unwind_len)) { + _stp_free_module(m); + _stp_kfree(mod.sections); + return -EFAULT; + } + m->unwind_data_len = tmpmod.unwind_len; + if (_stp_ins_module(m) < 0) { _stp_free_module(m); return -ENOMEM; @@ -513,20 +564,18 @@ static int _stp_ctl_send (int type, void *data, int len); static int _stp_module_load_notify(struct notifier_block * self, unsigned long val, void * data) { -#ifdef CONFIG_MODULES struct module *mod = (struct module *)data; struct _stp_module rmod; switch (val) { case MODULE_STATE_COMING: - dbug("module %s loaded\n", mod->name); + dbug(DEBUG_SYMBOLS, "module %s load notify\n", mod->name); strlcpy(rmod.name, mod->name, STP_MODULE_NAME_LEN); _stp_ctl_send(STP_MODULE, &rmod, sizeof(struct _stp_module)); break; default: errk("module loaded? val=%ld\n", val); } -#endif return 0; } diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c index 6b90ee64..8335e44b 100644 --- a/runtime/transport/transport.c +++ b/runtime/transport/transport.c @@ -18,6 +18,7 @@ #include <linux/namei.h> #include "transport.h" #include "time.c" +#include "../mempool.c" #include "symbols.c" #include "../procfs.c" @@ -76,7 +77,7 @@ static void _stp_ask_for_symbols(void) if (sent_symbols == 0) { /* ask for symbols and modules */ - kbug("AFS\n"); + kbug(DEBUG_SYMBOLS|DEBUG_TRANSPORT, "AFS\n"); req.endian = 0x1234; req.ptr_size = sizeof(char *); @@ -94,7 +95,7 @@ static void _stp_ask_for_symbols(void) void _stp_handle_start (struct _stp_msg_start *st) { - kbug ("stp_handle_start\n"); + kbug (DEBUG_TRANSPORT, "stp_handle_start\n"); if (register_module_notifier(&_stp_module_load_nb)) errk("failed to load module notifier\n"); @@ -116,7 +117,7 @@ void _stp_handle_start (struct _stp_msg_start *st) /* when someone does /sbin/rmmod on a loaded systemtap module. 
*/ static void _stp_cleanup_and_exit (int dont_rmmod) { - kbug("cleanup_and_exit (%d)\n", dont_rmmod); + kbug(DEBUG_TRANSPORT, "cleanup_and_exit (%d)\n", dont_rmmod); if (!_stp_exit_called) { int failures; @@ -127,23 +128,23 @@ static void _stp_cleanup_and_exit (int dont_rmmod) _stp_exit_called = 1; if (_stp_probes_started) { - kbug("calling probe_exit\n"); + kbug(DEBUG_TRANSPORT, "calling probe_exit\n"); /* tell the stap-generated code to unload its probes, etc */ probe_exit(); - kbug("done with probe_exit\n"); + kbug(DEBUG_TRANSPORT, "done with probe_exit\n"); } failures = atomic_read(&_stp_transport_failures); if (failures) _stp_warn ("There were %d transport failures.\n", failures); - kbug("************** calling startstop 0 *************\n"); + kbug(DEBUG_TRANSPORT, "************** calling startstop 0 *************\n"); if (_stp_utt) utt_trace_startstop(_stp_utt, 0, &utt_seq); - kbug("ctl_send STP_EXIT\n"); + kbug(DEBUG_TRANSPORT, "ctl_send STP_EXIT\n"); /* tell staprun to exit (if it is still there) */ _stp_ctl_send(STP_EXIT, &dont_rmmod, sizeof(int)); - kbug("done with ctl_send STP_EXIT\n"); + kbug(DEBUG_TRANSPORT, "done with ctl_send STP_EXIT\n"); } } @@ -152,7 +153,7 @@ static void _stp_cleanup_and_exit (int dont_rmmod) */ static void _stp_detach(void) { - kbug("detach\n"); + kbug(DEBUG_TRANSPORT, "detach\n"); _stp_attached = 0; _stp_pid = 0; @@ -168,7 +169,7 @@ static void _stp_detach(void) */ static void _stp_attach(void) { - kbug("attach\n"); + kbug(DEBUG_TRANSPORT, "attach\n"); _stp_attached = 1; _stp_pid = current->pid; utt_set_overwrite(0); @@ -210,7 +211,7 @@ static void _stp_work_queue (void *data) */ void _stp_transport_close() { - kbug("%d: ************** transport_close *************\n", current->pid); + kbug(DEBUG_TRANSPORT, "%d: ************** transport_close *************\n", current->pid); _stp_cleanup_and_exit(1); destroy_workqueue(_stp_wq); _stp_unregister_ctl_channel(); @@ -219,7 +220,7 @@ void _stp_transport_close() _stp_kill_time(); 
_stp_print_cleanup(); /* free print buffers */ _stp_mem_debug_done(); - kbug("---- CLOSED ----\n"); + kbug(DEBUG_TRANSPORT, "---- CLOSED ----\n"); } @@ -248,7 +249,7 @@ int _stp_transport_init(void) { int ret; - kbug("transport_init\n"); + kbug(DEBUG_TRANSPORT, "transport_init\n"); _stp_init_pid = current->pid; _stp_uid = current->uid; _stp_gid = current->gid; @@ -263,7 +264,7 @@ int _stp_transport_init(void) unsigned size = _stp_bufsize * 1024 * 1024; _stp_subbuf_size = ((size >> 2) + 1) * 65536; _stp_nsubbufs = size / _stp_subbuf_size; - kbug("Using %d subbufs of size %d\n", _stp_nsubbufs, _stp_subbuf_size); + kbug(DEBUG_TRANSPORT, "Using %d subbufs of size %d\n", _stp_nsubbufs, _stp_subbuf_size); } /* initialize timer code */ @@ -388,12 +389,12 @@ static struct dentry *_stp_get_root_dir(const char *name) { _stp_lock_inode(sb->s_root->d_inode); root = lookup_one_len(name, sb->s_root, strlen(name)); _stp_unlock_inode(sb->s_root->d_inode); - kbug("root=%p\n", root); + kbug(DEBUG_TRANSPORT, "root=%p\n", root); if (!IS_ERR(root)) dput(root); else { root = NULL; - kbug("Could not create or find transport directory.\n"); + kbug(DEBUG_TRANSPORT, "Could not create or find transport directory.\n"); } } _stp_unlock_debugfs(); diff --git a/runtime/transport/transport_msgs.h b/runtime/transport/transport_msgs.h index b2187cd5..55de2d4a 100644 --- a/runtime/transport/transport_msgs.h +++ b/runtime/transport/transport_msgs.h @@ -36,7 +36,30 @@ enum STP_SUBBUFS_CONSUMED, STP_REALTIME_DATA, #endif + + STP_MAX_CMD +}; + +#ifdef DEBUG_TRANSPORT +static const char *_stp_command_name[] = { + "STP_START", + "STP_EXIT", + "STP_OOB_DATA", + "STP_SYSTEM", + "STP_SYMBOLS", + "STP_MODULE", + "STP_TRANSPORT", + "STP_CONNECT", + "STP_DISCONNECT", + "STP_BULK", + "STP_READY", +#ifdef STP_OLD_TRANSPORT + "STP_BUF_INFO", + "STP_SUBBUFS_CONSUMED", + "STP_REALTIME_DATA", +#endif }; +#endif /* DEBUG_TRANSPORT */ /* control channel messages */ |