From 825fd305f03b726665edca34963978ce27448182 Mon Sep 17 00:00:00 2001 From: José Alburquerque Date: Sun, 11 Apr 2010 15:45:09 -0400 Subject: Update doc extraction tool to handle GObjectIntrospection annotations. * codegen/docextract.py (FunctionDoc): Renamed class to GtkDoc. (GtkDoc::annotations): Added a list field to store annotations which are 2-tuples of (name, value). (GtkDoc::ret): Modified field to store the return description along with a list of annotations as described above. (GtkDoc::params): Now holds a list of 3-tupples: name, description and annotations (as described above). (GtkDoc::block_type): Add a field to tell if the comment block is a function block, signal block or property block. (GtkDoc::set_type): (GtkDoc::get_type): Add methods for setting/getting the block type. (GtkDoc::add_param): Modified to also accept a list of annotations to be added with the parameter. (GtkDoc::add_annotation): (GtkDoc::get_annotations): Added methods to add/get annotations for the comment block. (GtkDoc::append_description): Renamed to append_to_description(). (GtkDoc::get_param_description): Removed unused method. (GtkDoc::get_description): Added method to get block description. (GtkDoc::add_return): Added method to add a return accepting the first line of the description and its annotations. (GtkDoc::append_return): Renamed to append_to_return(). (Regular expressions): - Made the names of the variables un-abbreviated. - Added 'since', 'deprecated' and 'rename to' regular expressions. - Modified the return matching regular expression so that it doesn't match descriptions that begin with 'Returns ...'. This improves the docs of many function. - Added signal and property comment block identifier matching regular expressions in case those are useful. - Modified existing identifier matching regular expressions (function, signal, and property regular expressions) to properly parse annotations. Also added a regular expression for extracting annotations from the parameter and return descriptions. - Refined the function name matching regular expression to only accept identifiers that begin with a lowercase letter. This eliminates 'SECTION:' matches. - Finally, grouped commonly related expressions like return_pattern, since_pattern, etc. into groups (in lists) so that matching those sections can be done using loops. (Parsing algorithm): Modified the algorithm to use a functional approach to parsing. Extra methods like skip_to_comment() and processs_params() have been added and used in the parse_file() function to now process the comment blocks. (parse_dir): Added file processing output to stderr. * codegen/docextract_to_xml.py (usage): Added function to print out the usage. (print_annotations): Added function to print the given list of annotations. (options): Added --with-signals (-i), with-properties (-p) and --with-annotation (-a) to the existing --source-dir (-s) option. (algorithm): Now prints annotations, if specified. Also, prints signals and properties correctly (using names like Class::signal-one for signals and Classs:property) with xml such as .... The return xml is slightly modified with annotations but this would only be exhibited if annotation xml is requested. --- codegen/docextract.py | 439 ++++++++++++++++++++++++++++++++++--------- codegen/docextract_to_xml.py | 87 ++++++--- 2 files changed, 414 insertions(+), 112 deletions(-) (limited to 'codegen') diff --git a/codegen/docextract.py b/codegen/docextract.py index e6c6505..aef0393 100644 --- a/codegen/docextract.py +++ b/codegen/docextract.py @@ -6,122 +6,380 @@ import sys, os, string, re __all__ = ['extract'] -class FunctionDoc: +class GtkDoc: def __init__(self): self.name = None + self.block_type = '' # The block type ('function', 'signal', 'property') self.params = [] + self.annotations = [] self.description = '' - self.ret = '' + self.ret = ('', []) # (return, annotations) def set_name(self, name): self.name = name - def add_param(self, name, description): + def set_type(self, block_type): + self.block_type = block_type + def get_type(self): + return self.block_type + def add_param(self, name, description, annotations=[]): if name == '...': name = 'Varargs' - self.params.append((name, description)) + self.params.append((name, description, annotations)) def append_to_last_param(self, extra): - self.params[-1] = (self.params[-1][0], self.params[-1][1] + extra) + self.params[-1] = (self.params[-1][0], self.params[-1][1] + extra, + self.params[-1][2]) def append_to_named_param(self, name, extra): for i in range(len(self.params)): if self.params[i][0] == name: - self.params[i] = (name, self.params[i][1] + extra) + self.params[i] = (name, self.params[i][1] + extra, + self.params[i][2]) return # fall through to adding extra parameter ... self.add_param(name, extra) - def append_description(self, extra): + def add_annotation(self, annotation): + self.annotations.append(annotation) + def get_annotations(self): + return self.annotations + def append_to_description(self, extra): self.description = self.description + extra - def append_return(self, extra): - self.ret = self.ret + extra + def get_description(self): + return self.description + def add_return(self, first_line, annotations=[]): + self.ret = (first_line, annotations) + def append_to_return(self, extra): + self.ret = (self.ret[0] + extra, self.ret[1]) - def get_param_description(self, name): - for param, description in self.params: - if param == name: - return description - else: - return '' +comment_start_pattern = re.compile(r'^\s*/\*\*\s') +comment_end_pattern = re.compile(r'^\s*\*+/') +comment_line_lead_pattern = re.compile(r'^\s*\*\s*') +comment_empty_line_pattern = re.compile(r'^\s*\**\s*$') +function_name_pattern = re.compile(r'^([a-z]\w*)\s*:?(\s*\(.*\)\s*){0,2}\s*$') +signal_name_pattern = re.compile(r'^([A-Z]\w+::[a-z0-9-]+)\s*:?(\s*\(.*\)\s*){0,2}\s*$') +property_name_pattern = re.compile(r'^([A-Z]\w+:[a-z0-9-]+)\s*:?(\s*\(.*\)\s*){0,2}\s*$') +return_pattern = re.compile(r'^@?(returns:|return\s+value:)(.*\n?)$', re.IGNORECASE) +deprecated_pattern = re.compile(r'^(deprecated\s*:\s*.*\n?)$', re.IGNORECASE) +rename_to_pattern = re.compile(r'^(rename\s+to)\s*:\s*(.*\n?)$', re.IGNORECASE) +param_pattern = re.compile(r'^@(\S+)\s*:(.*\n?)$') +# Used to extract the annotations in the parameter and return descriptions +# extracted using above [param|return]_pattern patterns. +annotations_pattern = re.compile(r'^(?:(\s*\(.*\)\s*)*:)') +# Used to construct the annotation lists. +annotation_lead_pattern = re.compile(r'^\s*\(\s*(.*?)\s*\)\s*') + +# These patterns determine the identifier of the current comment block. They +# are grouped in a list for easy determination of block identifiers (in +# skip_to_identifier). The function_name_pattern should be tested for last +# because it always matches signal and property identifiers. +identifier_patterns = [ signal_name_pattern, property_name_pattern, function_name_pattern ] -comment_start_pat = re.compile(r'^\s*/\*\*\s') -comment_end_pat = re.compile(r'^\s*\*+/') -comment_line_lead = re.compile(r'^\s*\*\s*') -funcname_pat = re.compile(r'^(\w+)\s*:?') -return_pat = re.compile(r'^(returns:|return\s+value:|returns\s*)(.*\n?)$', - re.IGNORECASE) -param_pat = re.compile(r'^@(\S+)\s*:(.*\n?)$') +# This pattern is to match return sections that forget to have a colon (':') +# after the initial 'Return' phrase. It is not included by default in the list +# of final sections below because a lot of function descriptions begin with +# 'Returns ...' and the process_description() function would stop right at that +# first line, thinking it is a return section. +no_colon_return_pattern = re.compile(r'^@?(returns|return\s+value)\s*(.*\n?)$', re.IGNORECASE) +since_pattern = re.compile(r'^(since\s*:\s*.*\n?)$', re.IGNORECASE) + +# These patterns normally will be encountered after the description. Knowing +# the order of their appearance is difficult so this list is used to test when +# one begins and the other ends when processing the rest of the sections after +# the description. +final_section_patterns = [ return_pattern, since_pattern, deprecated_pattern, rename_to_pattern ] def parse_file(fp, doc_dict): line = fp.readline() - in_comment_block = 0 while line: - if not in_comment_block: - if comment_start_pat.match(line): - in_comment_block = 1 - cur_doc = FunctionDoc() - in_description = 0 - in_return = 0 - line = fp.readline() - continue + cur_doc = GtkDoc() + line = skip_to_comment_block(fp, line) + line = skip_to_identifier(fp, line, cur_doc) + # See if the identifier is found (stored in the current GtkDoc by + # skip_to_identifier). If so, continue reading the rest of the comment + # block. + if cur_doc.name: + line = process_params(fp, line, cur_doc) + line = process_description(fp, line, cur_doc) + line = process_final_sections(fp, line, cur_doc) + # Add the current doc block to the dictionary of doc blocks. + doc_dict[cur_doc.name] = cur_doc - # we are inside a comment block ... - if comment_end_pat.match(line): - if not cur_doc.name: - sys.stderr.write("no function name found in doc comment\n") - else: - doc_dict[cur_doc.name] = cur_doc - in_comment_block = 0 - line = fp.readline() - continue +# Given a list of annotations as string of the form +# '(annotation1) (annotation2) ...' return a list of annotations of the form +# [ (name1, value1), (name2, value2) ... ]. Not all annotations have values so +# the values in the list of tuples could be empty (''). +def get_annotation_list(annotations): + annotation_list = [] + while annotations: + match = annotation_lead_pattern.match(annotations) + if match: + annotation_contents = match.group(1) + name, split, value = annotation_contents.strip().partition(' ') + annotation_list.append((name, value)) + # Remove first occurrence to continue processing. + annotations = annotation_lead_pattern.sub('', annotations) + else: + break + return annotation_list - # inside a comment block, and not the end of the block ... - line = comment_line_lead.sub('', line) - if not line: line = '\n' +# Given a currently read line, test that line and continue reading until the +# beginning of a comment block is found or eof is reached. Return the last +# read line. +def skip_to_comment_block(fp, line): + while line: + if comment_start_pattern.match(line): + break + line = fp.readline() + return line + +# Given the current line in a comment block, continue skipping lines until a +# non-blank line in the comment block is found or until the end of the block +# (or eof) is reached. Returns the line where reading stopped. +def skip_to_nonblank(fp, line): + while line: + if not comment_empty_line_pattern.match(line): + break + line = fp.readline() + # Stop processing if eof or end of comment block is reached. + if not line or comment_end_pattern.match(line): + break + return line + +# Given the first line of a comment block (the '/**'), see if the next +# non-blank line is the identifier of the comment block. Stop processing if +# the end of the block or eof is reached. Store the identifier (if there is +# one) and its type ('function', 'signal' or 'property') in the given GtkDoc. +# Return the line where the identifier is found or the line that stops the +# processing (if eof or the end of the comment block is found first). +def skip_to_identifier(fp, line, cur_doc): + # Skip the initial comment block line ('/**') if not eof. + if line: line = fp.readline() - if not cur_doc.name: - match = funcname_pat.match(line) + # Now skip empty lines. + line = skip_to_nonblank(fp, line) + + # See if the first non-blank line is the identifier. + if line and not comment_end_pattern.match(line): + # Remove the initial ' * ' in comment block line and see if there is an + # identifier. + line = comment_line_lead_pattern.sub('', line) + for pattern in identifier_patterns: + match = pattern.match(line) if match: + # Set the GtkDoc name. cur_doc.set_name(match.group(1)) - elif in_return: - match = return_pat.match(line) - if match: - # assume the last return statement was really part of the - # description - return_start = match.group(1) - cur_doc.ret = match.group(2) - cur_doc.description = cur_doc.description + return_start + \ - cur_doc.ret - else: - cur_doc.append_return(line) - elif in_description: - if line[:12] == 'Description:': - line = line[12:] - match = return_pat.match(line) - if match: - in_return = 1 - return_start = match.group(1) - cur_doc.append_return(match.group(2)) - else: - cur_doc.append_description(line) - elif line == '\n': - # end of parameters - in_description = 1 + # Get annotations and add them to the GtkDoc. + annotations = get_annotation_list(match.group(2)) + for annotation in annotations: + cur_doc.add_annotation(annotation) + # Set the GtkDoc type. + if pattern == signal_name_pattern: + cur_doc.set_type('signal') + elif pattern == property_name_pattern: + cur_doc.set_type('property') + elif pattern == function_name_pattern: + cur_doc.set_type('function') + return line + return line + +# Given a currently read line (presumably the identifier line), read the next +# lines, testing to see if the lines are part of parameter descriptions. If +# so, store the parameter descriptions in the given doc block. Stop on eof and +# return the last line that stops the processing. +def process_params(fp, line, cur_doc): + # Skip the identifier line if not eof. Also skip any blank lines in the + # comment block. Return if eof or the end of the comment block are + # encountered. + if line: line = fp.readline() + line = skip_to_nonblank(fp, line) + if not line or comment_end_pattern.match(line): + return line + + # Remove initial ' * ' in first non-empty comment block line. + line = comment_line_lead_pattern.sub('', line) + + # Now process possible parameters as long as no eof or the end of the + # param section is not reached (which could be triggered by anything that + # doesn't match a '@param:..." line, even the end of the comment block). + match = param_pattern.match(line) + while line and match: + description = match.group(2) + + # First extract the annotations from the description and save them. + annotations = [] + annotation_match = annotations_pattern.match(description) + if annotation_match: + annotations = get_annotation_list(annotation_match.group(1)) + # Remove the annotations from the description + description = annotations_pattern.sub('', description) + + # Default to appending lines to current parameter. + append_func = cur_doc.append_to_last_param + + # See if the return has been included as part of the parameter + # section and make sure that lines are added to the GtkDoc return if + # so. + if match.group(1).lower() == "returns": + cur_doc.add_return(description, annotations) + append_func = cur_doc.append_to_return + # If not, just add it as a regular parameter. else: - match = param_pat.match(line) + cur_doc.add_param(match.group(1), description, annotations) + + # Now read lines and append them until next parameter, beginning of + # description (an empty line), the end of the comment block or eof. + line = fp.readline() + while line: + # Stop processing if end of comment block or a blank comment line + # is encountered. + if comment_empty_line_pattern.match(line) or \ + comment_end_pattern.match(line): + break + + # Remove initial ' * ' in comment block line. + line = comment_line_lead_pattern.sub('', line) + + # Break from current param processing if a new one is + # encountered. + if param_pattern.match(line): break; + + # Otherwise, just append the current line and get the next line. + append_func(line) + line = fp.readline() + + # Re-evaluate match for while condition + match = param_pattern.match(line) + + # End by returning the current line. + return line + +# Having processed parameters, read the following lines into the description of +# the current doc block until the end of the comment block, the end of file or +# a return section is encountered. +def process_description(fp, line, cur_doc): + # First skip empty lines returning on eof or end of comment block. + line = skip_to_nonblank(fp, line) + if not line or comment_end_pattern.match(line): + return line + + # Remove initial ' * ' in non-empty comment block line. + line = comment_line_lead_pattern.sub('', line) + + # Also remove possible 'Description:' prefix. + if line[:12] == 'Description:': line = line[12:] + + # Used to tell if the previous line was blank and a return section + # uncommonly marked with 'Returns ...' instead of 'Returns: ...' has + # started (assume it is non-empty to begin with). + prev_line = 'non-empty' + + # Now read lines until a new section (like a return or a since section) is + # encountered. + while line: + # See if the description section has ended (if the line begins with + # 'Returns ...' and the previous line was empty -- this loop replaces + # empty lines with a newline). + if no_colon_return_pattern.match(line) and prev_line == '\n': + return line + # Or if one of the patterns of the final sections match + for pattern in final_section_patterns: + if pattern.match(line): + return line + + # If not, append lines to description in the doc comment block. + cur_doc.append_to_description(line) + + prev_line = line + line = fp.readline() + + # Stop processing on eof or at the end of comment block. + if not line or comment_end_pattern.match(line): + return line + + # Remove initial ' * ' in line so that the text can be appended to the + # description of the comment block and make sure that if the line is + # empty it be interpreted as a newline. + line = comment_line_lead_pattern.sub('', line) + if not line: line = '\n' + +# Given the line that ended the description (the first line of one of the final +# sections) process the final sections ('Returns:', 'Since:', etc.) until the +# end of the comment block or eof. Return the line that ends the processing. +def process_final_sections(fp, line, cur_doc): + while line and not comment_end_pattern.match(line): + # Remove leading ' * ' from current non-empty comment line. + line = comment_line_lead_pattern.sub('', line) + # Temporarily append the no colon return pattern to the final section + # patterns now that the description has been processed. It will be + # removed after the for loop below executes so that future descriptions + # that begin with 'Returns ...' are not interpreted as a return + # section. + final_section_patterns.append(no_colon_return_pattern) + for pattern in final_section_patterns: + match = pattern.match(line) if match: - param = match.group(1) - desc = match.group(2) - if param == 'returns': - cur_doc.ret = desc + if pattern == return_pattern or \ + pattern == no_colon_return_pattern: + # Dealing with a 'Returns:' so first extract the + # annotations from the description and save them. + description = match.group(2) + annotations = [] + annotation_match = \ + annotations_pattern.match(description) + if annotation_match: + annotations = \ + get_annotation_list(annotation_match.group(1)) + # Remove the annotations from the description + description = annotations_pattern.sub('', description) + + # Now add the return. + cur_doc.add_return(description, annotations) + # In case more lines need to be appended. + append_func = cur_doc.append_to_return + elif pattern == rename_to_pattern: + # Dealing with a 'Rename to:' section (GObjectIntrospection + # annotation) so no further lines will be appended but this + # single one (and only to the annotations). + append_func = None + cur_doc.add_annotation((match.group(1), + match.group(2))) else: - cur_doc.add_param(param, desc) - else: - # must be continuation - try: - if param == 'returns': - cur_doc.append_return(line) - else: - cur_doc.append_to_last_param(line) - except: - sys.stderr.write('something weird while reading param\n') + # For all others ('Since:' and 'Deprecated:') just append + # the line to the description for now. + cur_doc.append_to_description(line) + # In case more lines need to be appended. + append_func = cur_doc.append_to_description + + # Remove the no colon return pattern (which was temporarily added in + # the just executed loop) from the list of final section patterns. + final_section_patterns.pop() + line = fp.readline() + # Now continue appending lines to current section until a new one is + # found or an eof or the end of the comment block is encountered. + finished = False + while not finished and line and \ + not comment_end_pattern.match(line): + # Remove leading ' * ' from line and make sure that if it is empty, + # it be interpreted as a newline. + line = comment_line_lead_pattern.sub('', line) + if not line: line = '\n' + + for pattern in final_section_patterns: + if pattern.match(line): + finished = True + break + + # Break out of loop if a new section is found (determined in above + # inner loop). + if finished: break + + # Now it's safe to append line. + if append_func: append_func(line) + + # Get the next line to continue processing. + line = fp.readline() + + return line + def parse_dir(dir, doc_dict): for file in os.listdir(dir): if file in ('.', '..'): continue @@ -129,6 +387,7 @@ def parse_dir(dir, doc_dict): if os.path.isdir(path): parse_dir(path, doc_dict) if len(file) > 2 and file[-2:] == '.c': + sys.stderr.write("Processing " + path + '\n') parse_file(open(path, 'r'), doc_dict) def extract(dirs, doc_dict=None): @@ -137,13 +396,13 @@ def extract(dirs, doc_dict=None): parse_dir(dir, doc_dict) return doc_dict -tmpl_section_pat = re.compile(r'^$') +tmpl_section_pattern = re.compile(r'^$') def parse_tmpl(fp, doc_dict): cur_doc = None line = fp.readline() while line: - match = tmpl_section_pat.match(line) + match = tmpl_section_pattern.match(line) if match: cur_doc = None # new input shouldn't affect the old doc dict sect_type = match.group(1) @@ -152,7 +411,7 @@ def parse_tmpl(fp, doc_dict): if sect_type == 'FUNCTION': cur_doc = doc_dict.get(sect_name) if not cur_doc: - cur_doc = FunctionDoc() + cur_doc = GtkDoc() cur_doc.set_name(sect_name) doc_dict[sect_name] = cur_doc elif line == '\n': @@ -160,15 +419,15 @@ def parse_tmpl(fp, doc_dict): elif cur_doc: if line[:10] == '@Returns: ': if string.strip(line[10:]): - cur_doc.append_return(line[10:]) + cur_doc.append_to_return(line[10:]) elif line[0] == '@': pos = string.find(line, ':') if pos >= 0: cur_doc.append_to_named_param(line[1:pos], line[pos+1:]) else: - cur_doc.append_description(line) + cur_doc.append_to_description(line) else: - cur_doc.append_description(line) + cur_doc.append_to_description(line) line = fp.readline() diff --git a/codegen/docextract_to_xml.py b/codegen/docextract_to_xml.py index f8d3bae..638ac94 100755 --- a/codegen/docextract_to_xml.py +++ b/codegen/docextract_to_xml.py @@ -13,6 +13,13 @@ import sys import docextract +def usage(): + sys.stderr.write('usage: docextract_to_xml.py ' + + '[-s /src/dir | --source-dir=/src/dir] ' + + '[-a | --with-annotations] [-p | --with-properties] ' + + '[-i | --with-signals ]\n') + sys.exit(1) + def escape_text(unescaped_text): # Escape every "&" not part of an entity reference escaped_text = re.sub(r'&(?![A-Za-z]+;)', '&', unescaped_text) @@ -30,23 +37,34 @@ def escape_text(unescaped_text): return escaped_text +def print_annotations(annotations): + for annotation in annotations: + print "" + \ + escape_text(annotation[1]) + "" + if __name__ == '__main__': try: - opts, args = getopt.getopt(sys.argv[1:], "d:s:o:", - ["source-dir="]) + opts, args = getopt.getopt(sys.argv[1:], "d:s:o:api", + ["source-dir=", "with-annotations", + "with-properties", "with-signals"]) except getopt.error, e: - sys.stderr.write('docgen.py: %s\n' % e) - sys.stderr.write( - 'usage: docgen.py [-s /src/dir]\n') - sys.exit(1) + sys.stderr.write('docextract_to_xml.py: %s\n' % e) + usage() source_dirs = [] + with_annotations = False + with_signals = False + with_properties = False for opt, arg in opts: if opt in ('-s', '--source-dir'): source_dirs.append(arg) + if opt in ('-a', '--with-annotations'): + with_annotations = True + if opt in ('-p', '--with-properties'): + with_properties = True + if opt in ('-i', '--with-signals'): + with_signals = True if len(args) != 0: - sys.stderr.write( - 'usage: docgen.py [-s /src/dir]\n') - sys.exit(1) + usage() docs = docextract.extract(source_dirs); docextract.extract_tmpl(source_dirs, docs); #Try the tmpl sgml files too. @@ -58,25 +76,50 @@ if __name__ == '__main__': print "" for name, value in docs.items(): - print "" + # Get the type of comment block ('function', 'signal' or + # 'property') (the value is a GtkDoc). + block_type = value.get_type() + + # Skip signals if the option was not specified. + if block_type == 'signal' and not with_signals: + continue + # Likewise for properties. + elif block_type == 'property' and not with_properties: + continue + + print "<" + block_type + " name=\"" + escape_text(name) + "\">" print "" - #The value is a docextract.FunctionDoc - print escape_text(value.description) + print escape_text(value.get_description()) print "" - # Loop through the parameters: - print "" - for name, description in value.params: - print "" - print "" + escape_text(description) + "" - print "" + # Loop through the parameters if not dealing with a property: + if block_type != 'property': + print "" + for name, description, annotations in value.params: + print "" + print "" + escape_text(description) + "" + + if with_annotations: + print_annotations(annotations) + + print "" + + print "" - print "" + # Show the return-type (also if not dealing with a property): + if with_annotations: + print "" + print "" + escape_text(value.ret[0]) + \ + "" + print_annotations(value.ret[1]) + print "" + else: + print "" + escape_text(value.ret[0]) + "" - # Show the return-type: - print "" + escape_text(value.ret) + "" + if with_annotations: + print_annotations(value.get_annotations()) - print "\n" + print "\n" print "" -- cgit