#!/usr/bin/python

"""Query a MediaWiki instance (default: the Fedora Project wiki).

Actions (first positional argument):
    categorymembers    list pages in one or more categories (-c)
    usercontribs       list pages edited by one or more users (-u)
    usercontribstable  tabulate per-user edit counts by namespace (-u)
    parse              extract test-case fields from a page (-t)
    get                print raw (template-expanded) wikitext of a page (-t)
    recentchanges      list recently changed pages (experimental)
"""

import os
import sys
import optparse
import re
import time
import datetime

# simplemediawiki is a third-party dependency.  Defer the hard failure to
# script startup so the module itself stays importable (e.g. for testing).
try:
    from simplemediawiki import MediaWiki
except ImportError:
    MediaWiki = None

# Timestamp format used by the MediaWiki API (ISO 8601, UTC).
time_fmt = "%Y-%m-%dT%H:%M:%SZ"


def parse_args():
    '''Build the option parser, parse sys.argv and validate per-action input.

    Returns:
        (opts, action) -- parsed options and the first positional argument.
    Exits via parser.error() when required per-action options are missing.
    '''
    parser = optparse.OptionParser(usage="%prog [options] [options]")
    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='Enable more verbose output')
    parser.add_option('-d', '--debug', action='store_true', default=False,
                      help='Enable debugging output')
    parser.add_option('--url', action='store',
                      default='https://fedoraproject.org/w/api.php',
                      help='API URL')

    # NOTE: the MediaWiki API walks history newest-first, so "start" defaults
    # to now and "end" to 30 days in the past.
    def default_start():
        return time.strftime(time_fmt, time.localtime())

    def default_end():
        # default to the last 30 days
        today = datetime.datetime.today()
        one_month = datetime.timedelta(days=30)
        return (today - one_month).strftime(time_fmt)

    optgrp = optparse.OptionGroup(parser, "General options")
    optgrp.add_option('--start', action='store', default=default_start(),
                      help='Limit results using start date')
    optgrp.add_option('--end', action='store', default=default_end(),
                      help='Limit results using end date')
    optgrp.add_option('-n', '--namespace', dest="namespaces",
                      action='append', default=[],
                      help='Limit results to a specific namespace (accepts multiple values)')
    optgrp.add_option('-l', '--limit', action='store', default=5, type="int",
                      help='Limit recursion depth (%default)')
    parser.add_option_group(optgrp)

    # list_categorymembers
    optgrp = optparse.OptionGroup(parser, "Options for 'categorymembers' command:")
    optgrp.add_option('-c', '--category', dest="categories",
                      default=[], action="append",
                      help='Wiki category name to query (accepts multiple values)')
    parser.add_option_group(optgrp)

    # list_usercontribs
    optgrp = optparse.OptionGroup(parser, "Options for 'usercontribs' and 'usercontribstable'")
    optgrp.add_option('-u', '--user', dest="users",
                      default=[], action="append",
                      help='FAS username to query (accepts multiple values)')
    parser.add_option_group(optgrp)

    optgrp = optparse.OptionGroup(parser, "Options for 'parse' and 'get'")
    optgrp.add_option('-t', '--title',
                      default='', action='store',
                      help='Page title to parse')
    parser.add_option_group(optgrp)

    optgrp = optparse.OptionGroup(parser, "Options for 'recentchanges'")
    parser.add_option_group(optgrp)

    (opts, args) = parser.parse_args()

    if len(args) == 0:
        parser.error("No action specified")
    action = args[0]

    # Validate per-action required options.
    if action == 'categorymembers':
        if not opts.categories:
            parser.error("Must specify at least one category (-c|--category)")
    elif action in ('usercontribs', 'usercontribstable'):
        if not opts.users:
            parser.error("Must specify at least one user (-u|--user)")
        # Expand space- or comma-delimited user arguments into single names.
        expanded = []
        for entry in opts.users:
            expanded.extend(re.split('[ ,]', entry))
        opts.users = sorted(expanded)
    elif action in ('parse', 'get'):
        if opts.title == '':
            parser.error("Must specify a page (-t|--title)")

    return (opts, action)


def list_namespaces(wiki):
    '''Return a dict mapping canonical namespace name -> namespace id.'''
    query = dict(action='query',
                 meta='siteinfo',
                 siprop='namespaces')

    if opts.debug:
        print(query)
    response = wiki.call(query)
    if opts.debug:
        print(response)

    namespaces = dict()
    for ns_id, entry in response.get('query', {}).get('namespaces', {}).items():
        # Entries without a 'canonical' key (e.g. the main namespace) are skipped.
        if 'canonical' in entry:
            namespaces[entry['canonical']] = ns_id

    return namespaces


def recentchanges(wiki, date_start='', date_end='', namespaces=""):
    '''Return a sorted list of page titles changed between the given dates.

    date_start/date_end use time_fmt; namespaces is a '|'-joined string of
    namespace ids ('' means all namespaces).
    '''
    # https://fedoraproject.org/w/api.php?action=query&list=recentchanges&rcprop="title|timestamp|ids|user
    query = dict(action='query',
                 list='recentchanges',
                 rcprop="title|timestamp|ids|user",
                 rclimit=50,
                 )
    # FIXME - validate date input (expected format "%Y-%m-%dT%H:%M:%S")
    if date_start != '':
        query['rcstart'] = date_start
    if date_end != '':
        query['rcend'] = date_end
    if namespaces != '':
        query['rcnamespace'] = namespaces

    if opts.debug:
        print(query)
    response = wiki.call(query)
    if opts.debug:
        print(response)

    changes = list()
    # Follow the API continuation protocol to drain all result pages.
    while 'query-continue' in response:
        changes.extend(entry.get('title')
                       for entry in response.get('query', {}).get('recentchanges', [])
                       if 'title' in entry)
        query['rccontinue'] = True
        query['rcstart'] = response['query-continue']['recentchanges']['rcstart']
        if opts.debug:
            print(query)
        response = wiki.call(query)
        if opts.debug:
            print(response)

    # Extract any remaining data from the final response.
    changes.extend(entry.get('title')
                   for entry in response.get('query', {}).get('recentchanges', [])
                   if 'title' in entry)

    changes.sort()
    return changes


def list_usercontribs(wiki, user, date_start='', date_end='', namespaces=""):
    '''Return a sorted list of page titles edited by *user* (one entry per edit).'''
    # "https://fedoraproject.org/w/api.php?action=query&list=usercontribs&uclimit=100&ucuser=jlaska&ucnamespacestart=2010-11-11T00:00:00Z&ucend=2010-11-01T23:59:59Z"
    query = dict(action='query',
                 list='usercontribs',
                 uclimit=50,
                 ucuser=user)
    # FIXME - validate date input (expected format "%Y-%m-%dT%H:%M:%S")
    if date_start != '':
        query['ucstart'] = date_start
    if date_end != '':
        query['ucend'] = date_end
    if namespaces != '':
        query['ucnamespace'] = namespaces

    if opts.debug:
        print(query)
    response = wiki.call(query)
    if opts.debug:
        print(response)

    contribs = list()
    # Follow the API continuation protocol to drain all result pages.
    while 'query-continue' in response:
        contribs.extend(entry.get('title')
                        for entry in response.get('query', {}).get('usercontribs', [])
                        if 'title' in entry)
        query['uccontinue'] = True
        query['ucstart'] = response['query-continue']['usercontribs']['ucstart']
        if opts.debug:
            print(query)
        response = wiki.call(query)
        if opts.debug:
            print(response)

    # Extract any remaining data from the final response.
    contribs.extend(entry.get('title')
                    for entry in response.get('query', {}).get('usercontribs', [])
                    if 'title' in entry)

    contribs.sort()
    return contribs


def parse(wiki, page):
    '''Return the wikitext content of the most recent revision of *page*.'''
    query = dict(action='query',
                 prop='revisions',
                 titles=page,
                 rvprop='content')
    if opts.debug:
        print(query)
    response = wiki.call(query)
    # XXX - only returns the first revision of the first page ... do we care?
    for pg in response.get('query', {}).get('pages', {}).values():
        for rev in pg.get('revisions', []):
            return rev.get('*', '')
    return ''


def getraw(wiki, titles):
    '''Return raw (template-expanded) wikitext of the specified page(s).'''
    query = dict(action='query',
                 prop='revisions',
                 titles=titles,
                 rvlimit=1,
                 rvexpandtemplates=1,
                 rvprop='content')
    if opts.debug:
        print(query)
    response = wiki.call(query)
    # XXX - only returns the first revision of the first page ... do we care?
    for page in response.get('query', {}).get('pages', {}).values():
        # These diagnostics used to print unconditionally and polluted the
        # raw output of the 'get' action; now gated on --debug.
        if opts.debug:
            print("page='%s'" % page)
        revs = page.get('revisions', [])
        if opts.debug:
            print("revs='%s'" % revs)
        for rev in revs:
            if opts.debug:
                print("rev='%s'" % rev)
            return rev.get('*', '')
    return ''


def list_categorymembers(wiki, cat_page, limit=5):
    '''Return a list of page titles in *cat_page*, recursing into
    sub-categories up to *limit* levels deep.'''
    # Add 'Category:' prefix if not given
    if not cat_page.startswith("Category:"):
        cat_page = "Category:%s" % cat_page

    query = dict(action='query',
                 list='categorymembers',
                 cmtitle=cat_page)
    if opts.debug:
        print(query)
    response = wiki.call(query)

    members = [entry.get('title')
               for entry in response.get('query', {}).get('categorymembers', [])
               if 'title' in entry]

    # Replace sub-category entries in-place with their own members.
    idx = 0
    while idx < len(members) and limit > 0:
        if members[idx].startswith('Category:'):
            members.extend(list_categorymembers(wiki, members[idx], limit - 1))
            members.pop(idx)  # remove the Category entry itself
        else:
            idx += 1

    return members


def extract(s, titles):
    '''Extract test-case fields from wikitext *s*.

    Expects a template body of the form
    "{{...|description=...|setup=...|actions=...|results=...}}" optionally
    followed by "[[Category:...]]" links.  Returns a dict with keys
    title, description, setup, actions, results and tag (list of
    category names; the '|setup=' section may be absent).
    '''
    w1 = s.find('|description=')
    w2 = s.find('|setup=')
    w3 = s.find('|actions=')
    w4 = s.find('|results=')
    w5 = s.find('}}')

    # Collect every "[[...]]" link after the closing braces; the slice
    # skips a leading "[[Category:" prefix.
    start = [w5 + 1, w5 + 1]
    tag = []
    while True:
        open_pos = s.find('[[', start[0])
        close_pos = s.find(']]', start[1])
        if open_pos == -1:
            break
        tag.append(s[(open_pos + len('[[Category:')):close_pos])
        start[0] = open_pos + 1
        start[1] = close_pos + 1

    table = {}
    table['title'] = titles
    if w2 == -1:
        # Page has no |setup= section; description runs up to |actions=.
        table['description'] = s[(w1 + len('|description=')):w3]
        table['setup'] = ''
    else:
        table['description'] = s[(w1 + len('|description=')):w2]
        table['setup'] = s[(w2 + len('|setup=')):w3]
    table['actions'] = s[(w3 + len('|actions=')):w4]
    table['results'] = s[(w4 + len('|results=')):w5]
    table['tag'] = tag
    return table


if __name__ == "__main__":
    if MediaWiki is None:
        print("Unable to import simplemediawiki. Is python-simplemediawiki installed?")
        sys.exit(1)

    (opts, action) = parse_args()

    # Create mediawiki handle
    wiki = MediaWiki(opts.url)

    if action == 'categorymembers':
        for cat_page in opts.categories:
            pages = list_categorymembers(wiki, cat_page, opts.limit)
            if pages:
                print("\n".join(pages))
            else:
                print("No data found for '%s'" % cat_page)

    elif action in ('usercontribs', 'usercontribstable'):
        # Translate namespace names into a '|'-joined id string.
        ns_ids = ''
        if opts.namespaces:
            all_ns = list_namespaces(wiki)
            ns_ids = "|".join(all_ns[ns] for ns in opts.namespaces
                              if ns in all_ns)

        # Gather contributions for every requested user.
        user_edits = dict()
        for user in opts.users:
            user_edits[user] = list_usercontribs(wiki, user,
                                                 date_start=opts.start,
                                                 date_end=opts.end,
                                                 namespaces=ns_ids)

        # Display list of user contributions
        if action == 'usercontribs':
            for user in opts.users:
                # Count edits per unique page.
                page_counts = dict()
                for page in user_edits[user]:
                    page_counts[page] = page_counts.get(page, 0) + 1

                print("= Wiki contributions for %s between %s and %s ="
                      % (user, opts.start[:10], opts.end[:10]))
                for page, edits in page_counts.items():
                    print(" * %s (%s edits)" % (page, edits))
                print("")

        # Display table of contributions where x/y axis == user / namespace
        elif action == 'usercontribstable':
            # namespaces[ns][user] = dict(pages=<dict of unique pages>, edits=<count>)
            namespaces = dict()
            for user, pages in user_edits.items():
                for p in pages:
                    if ':' in p:
                        ns = p.split(':', 1)[0] + ':'
                    else:
                        ns = "Main:"

                    stats = namespaces.setdefault(ns, dict()).setdefault(
                        user, dict(pages={}, edits=0))
                    stats['edits'] += 1
                    stats['pages'][p] = 0

            # Collapse the per-page dicts into unique-page counts.
            for ns, users in namespaces.items():
                for user in users:
                    namespaces[ns][user]['pages'] = len(namespaces[ns][user]['pages'])

            # Print banner
            print("= Wiki contributions between %s and %s ="
                  % (opts.start[:10], opts.end[:10]))
            # Banner row#1 -- label roughly centered over the user columns
            # (// keeps the original Python 2 integer-division layout).
            for idx, u in enumerate(opts.users):
                if idx >= len(opts.users) // 2:
                    sys.stdout.write("Number of pages (number of edits)")
                    break
                sys.stdout.write("%10s " % (" " * 10))
            sys.stdout.write("\n")
            # Banner row#2 -- column headings
            sys.stdout.write("%-20s " % "Namespace")
            for u in opts.users:
                sys.stdout.write("%10s " % u)
            sys.stdout.write("%10s " % "Total")
            sys.stdout.write("\n")
            # Banner row#3 -- separator
            sys.stdout.write("%-20s-" % ("-" * 20))
            for u in opts.users:
                sys.stdout.write("%10s-" % ("-" * 10))
            sys.stdout.write("%10s-" % ("-" * 10))
            sys.stdout.write("\n")

            # Data rows: one per namespace, with a per-row total column.
            for ns in namespaces.keys():
                sys.stdout.write("%-20s " % ns)
                ttl_pages = 0
                ttl_edits = 0
                for u in opts.users:
                    num_pages = namespaces[ns].get(u, {}).get('pages', 0)
                    num_edits = namespaces[ns].get(u, {}).get('edits', 0)
                    ttl_pages += num_pages
                    ttl_edits += num_edits
                    sys.stdout.write("%10s " % ("%s (%s)" % (num_pages, num_edits)))
                sys.stdout.write("%10s " % ("%s (%s)" % (ttl_pages, ttl_edits)))
                sys.stdout.write("\n")

            sys.stdout.write("%-20s-" % ("-" * 20))
            for u in opts.users:
                sys.stdout.write("%10s-" % ("-" * 10))
            sys.stdout.write("%10s-" % ("-" * 10))
            sys.stdout.write("\n")

            # Bottom totals row (per-user totals across all namespaces).
            ttl_pages = 0
            ttl_edits = 0
            sys.stdout.write("%-20s " % "Total")
            for u in opts.users:
                num_pages = sum(namespaces[ns].get(u, {}).get('pages', 0)
                                for ns in namespaces)
                num_edits = sum(namespaces[ns].get(u, {}).get('edits', 0)
                                for ns in namespaces)
                ttl_pages += num_pages
                ttl_edits += num_edits
                sys.stdout.write("%10s " % ("%s (%s)" % (num_pages, num_edits)))
            sys.stdout.write("%10s " % ("%s (%s)" % (ttl_pages, ttl_edits)))
            sys.stdout.write("\n")

    elif action == 'parse':
        s = parse(wiki, opts.title)
        table = extract(s, opts.title)
        print("====================================")
        for key in table.keys():
            print(key, '\t', table[key])

    elif action == 'get':
        print(getraw(wiki, opts.title))

    elif action == 'recentchanges':
        # FIXME
        # ianweller explained that mediawiki only retains the 500 most recent
        # changes for inspection
        print("Experimental!!")
        # Gather namespace arguments
        ns_ids = ''
        if opts.namespaces:
            all_ns = list_namespaces(wiki)
            ns_ids = "|".join(all_ns[ns] for ns in opts.namespaces
                              if ns in all_ns)

        print(recentchanges(wiki, opts.start, opts.end, ns_ids))

    else:
        print("Unknown action requested '%s'" % action)
        sys.exit(1)