author     James Laska <jlaska@redhat.com>  2011-04-19 08:34:32 -0400
committer  James Laska <jlaska@redhat.com>  2011-04-19 08:34:32 -0400
commit     5623a7202766d27dbd99d9af5c82715b37c8d92d (patch)
tree       fd0251d0c7c6f8504e9990595d092b5cf4421b6b
parent     3d26b2a6f8d8b7fdc10d65a6b6923d99e482b7af (diff)
download   scripts-5623a7202766d27dbd99d9af5c82715b37c8d92d.tar.gz
           scripts-5623a7202766d27dbd99d9af5c82715b37c8d92d.tar.xz
           scripts-5623a7202766d27dbd99d9af5c82715b37c8d92d.zip
New mediawiki metrics script
-rwxr-xr-x  get-mediawiki-data  408
1 file changed, 408 insertions, 0 deletions
diff --git a/get-mediawiki-data b/get-mediawiki-data
new file mode 100755
index 0000000..54a50cb
--- /dev/null
+++ b/get-mediawiki-data
@@ -0,0 +1,408 @@
+#!/usr/bin/python
+
+import os
+import sys
+import optparse
+import re
+import time, datetime
+
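+# Example invocations (the usernames, categories and titles below are
+# hypothetical; any valid wiki values work):
+#   get-mediawiki-data categorymembers -c QA
+#   get-mediawiki-data usercontribs -u jlaska -u kparal -n Talk
+#   get-mediawiki-data usercontribstable -u jlaska --start 2011-04-19T00:00:00Z --end 2011-03-20T00:00:00Z
+#   get-mediawiki-data get -t 'QA/Test_Plan'
+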
+try:
+    from simplemediawiki import MediaWiki
+except ImportError:
+    print "Unable to import simplemediawiki. Is python-simplemediawiki installed?"
+    sys.exit(1)
+
+time_fmt = "%Y-%m-%dT%H:%M:%SZ"
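+# e.g. time.strftime(time_fmt, time.gmtime()) -> '2011-04-19T12:34:32Z',
+# the timestamp format the MediaWiki API expects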
+
+def parse_args():
+    '''Set up the option parser'''
+    parser = optparse.OptionParser(usage="%prog [options] <action> [options]")
+    parser.add_option('-v', '--verbose', action='store_true', default=False,
+        help='Enable more verbose output')
+    parser.add_option('-d', '--debug', action='store_true', default=False,
+        help='Enable debugging output')
+    parser.add_option('--url', action='store', default='https://fedoraproject.org/w/api.php',
+        help='API URL')
+
+    # general
+    # Note: the MediaWiki API lists results newest-first, so 'start' defaults
+    # to now and 'end' to the older timestamp, 30 days ago
+    def default_start():
+        return time.strftime(time_fmt, time.localtime())
+    def default_end():
+        # default to the last 30 days
+        today = datetime.datetime.today()
+        one_month = datetime.timedelta(days=30)
+        return (today - one_month).strftime(time_fmt)
+
+    optgrp = optparse.OptionGroup(parser, "General options")
+    optgrp.add_option('--start', action='store', default=default_start(),
+        help='Limit results using start date (newest; defaults to now)')
+    optgrp.add_option('--end', action='store', default=default_end(),
+        help='Limit results using end date (oldest; defaults to 30 days ago)')
+    optgrp.add_option('-n', '--namespace', dest="namespaces",
+        action='append', default=[],
+        help='Limit results to a specific namespace (accepts multiple values)')
+    optgrp.add_option('-l', '--limit', action='store', default=5, type="int",
+        help='Limit recursion depth (%default)')
+    parser.add_option_group(optgrp)
+
+    # list_categorymembers
+    optgrp = optparse.OptionGroup(parser, "Options for 'categorymembers' command:")
+    optgrp.add_option('-c', '--category', dest="categories",
+        default=[], action="append",
+        help='Wiki category name to query (accepts multiple values)')
+    parser.add_option_group(optgrp)
+
+    # list_usercontribs
+    optgrp = optparse.OptionGroup(parser, "Options for 'usercontribs' and 'usercontribstable'")
+    optgrp.add_option('-u', '--user', dest="users",
+        default=[], action="append",
+        help='FAS username to query (accepts multiple values)')
+    parser.add_option_group(optgrp)
+
+    optgrp = optparse.OptionGroup(parser, "Options for 'parse' and 'get'")
+    optgrp.add_option('-t', '--title',
+        default='', action='store',
+        help='Page title to parse')
+    parser.add_option_group(optgrp)
+
+    optgrp = optparse.OptionGroup(parser, "Options for 'recentchanges'")
+    parser.add_option_group(optgrp)
+
+    (opts, args) = parser.parse_args()
+
+    if len(args) == 0:
+        parser.error("No action specified")
+    else:
+        action = args[0]
+
+    # Validate inputs
+    if action == 'categorymembers':
+        if len(opts.categories) == 0:
+            parser.error("Must specify at least one category (-c|--category)")
+    elif action in ['usercontribs', 'usercontribstable']:
+        if len(opts.users) == 0:
+            parser.error("Must specify at least one user (-u|--user)")
+        else:
+            # expand space- or comma-delimited values
+            idx = 0
+            split_re = "[ ,]"
+            while idx < len(opts.users):
+                u = opts.users[idx]
+                if re.search(split_re, u):
+                    opts.users[idx] = re.split(split_re, u)[0]
+                    opts.users.extend(re.split(split_re, u)[1:])
+                idx += 1
+            opts.users.sort()  # sort list
+    elif action in ['parse', 'get']:
+        if opts.title == '':
+            parser.error("Must specify a page (-t|--title)")
+
+    return (opts, action)
+
+def list_namespaces(wiki):
+    '''Return a dict mapping canonical namespace names to numeric ids'''
+    query = dict(action='query',
+                 meta='siteinfo',
+                 siprop='namespaces')
+
+    if opts.debug: print query
+    response = wiki.call(query)
+    if opts.debug: print response
+
+    namespaces = dict()
+    for id, entry in response.get('query', {}).get('namespaces', {}).items():
+        if entry.has_key('canonical'):
+            namespaces[entry['canonical']] = id
+
+    return namespaces
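+# A trimmed siteinfo response, for illustration, might look like:
+#   {'query': {'namespaces': {'0': {'*': ''},
+#                             '1': {'*': 'Talk', 'canonical': 'Talk'}}}}
+# (the main namespace, id 0, carries no 'canonical' name and is skipped above)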
+
+def recentchanges(wiki, date_start='', date_end='', namespaces=""):
+    '''Return a sorted list of recently changed page titles'''
+    # https://fedoraproject.org/w/api.php?action=query&list=recentchanges&rcprop=title|timestamp|ids|user
+    # Build query arguments and call wiki
+    query = dict(action='query',
+                 list='recentchanges',
+                 rcprop="title|timestamp|ids|user",
+                 rclimit=50,
+                )
+    # FIXME - validate date input (expected format "%Y-%m-%dT%H:%M:%S")
+    if date_start != '':
+        query['rcstart'] = date_start
+    if date_end != '':
+        query['rcend'] = date_end
+    if namespaces != '':
+        query['rcnamespace'] = namespaces
+
+    if opts.debug: print query
+    response = wiki.call(query)
+    if opts.debug: print response
+
+    changes = list()
+    # If necessary, repeatedly call the server to get more data
+    while response.has_key('query-continue'):
+        changes.extend([entry.get('title') for entry in response.get('query', {}).get('recentchanges', []) if entry.has_key('title')])
+        query['rccontinue'] = True
+        query['rcstart'] = response['query-continue']['recentchanges']['rcstart']
+        if opts.debug: print query
+        response = wiki.call(query)
+        if opts.debug: print response
+
+    # Extract any remaining data from the response
+    changes.extend([entry.get('title') for entry in response.get('query', {}).get('recentchanges', []) if entry.has_key('title')])
+
+    # Sort results
+    changes.sort()
+    return changes
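+# Pagination is driven by the 'query-continue' element; a response with more
+# data available includes a resume timestamp, e.g. (trimmed):
+#   {'query-continue': {'recentchanges': {'rcstart': '2011-04-01T00:00:00Z'}}}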
+
+def list_usercontribs(wiki, user, date_start='', date_end='', namespaces=""):
+    '''Return a list of page titles the user edited (one entry per edit)'''
+    # https://fedoraproject.org/w/api.php?action=query&list=usercontribs&uclimit=100&ucuser=jlaska&ucstart=2010-11-11T00:00:00Z&ucend=2010-11-01T23:59:59Z
+    # Build query arguments and call wiki
+    query = dict(action='query',
+                 list='usercontribs',
+                 uclimit=50,
+                 ucuser=user)
+    # FIXME - validate date input (expected format "%Y-%m-%dT%H:%M:%S")
+    if date_start != '':
+        query['ucstart'] = date_start
+    if date_end != '':
+        query['ucend'] = date_end
+    if namespaces != '':
+        query['ucnamespace'] = namespaces
+
+    if opts.debug: print query
+    response = wiki.call(query)
+    if opts.debug: print response
+
+    contribs = list()
+    # If necessary, repeatedly call the server to get more data
+    while response.has_key('query-continue'):
+        contribs.extend([entry.get('title') for entry in response.get('query', {}).get('usercontribs', []) if entry.has_key('title')])
+        query['uccontinue'] = True
+        query['ucstart'] = response['query-continue']['usercontribs']['ucstart']
+        if opts.debug: print query
+        response = wiki.call(query)
+        if opts.debug: print response
+
+    # Extract any remaining data from the response
+    contribs.extend([entry.get('title') for entry in response.get('query', {}).get('usercontribs', []) if entry.has_key('title')])
+
+    # Sort results
+    contribs.sort()
+    return contribs
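+# e.g. list_usercontribs(wiki, 'jlaska',
+#                        date_start='2010-12-31T00:00:00Z',
+#                        date_end='2010-10-01T00:00:00Z')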
+
+def parse(wiki, page):
+    '''Parse a page and return its rendered HTML content'''
+
+    # Build query arguments and call wiki
+    query = dict(action='parse',
+                 page=page)
+    if opts.debug: print query
+    response = wiki.call(query)
+    return response.get('parse', {}).get('text', {}).get('*', '')
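+# The rendered markup lives under the '*' key, e.g. (trimmed):
+#   {'parse': {'text': {'*': '<p>...</p>'}}}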
+
+def getraw(wiki, titles):
+    '''Return raw mediawiki content of a specified page'''
+
+    # Build query arguments and call wiki
+    query = dict(action='query',
+                 prop='revisions',
+                 titles=titles,
+                 rvlimit=1,
+                 rvexpandtemplates=1,
+                 rvprop='content')
+    if opts.debug: print query
+    response = wiki.call(query)
+    # XXX - only returns the first rev of the first page ... do we care?
+    for page in response.get('query', {}).get('pages', {}).values():
+        revs = page.get('revisions', [])
+        for rev in revs:
+            return rev.get('*', '')
+    return ''
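+# e.g. getraw(wiki, 'QA/Test_Plan') -- any existing page title works; this
+# one is only a hypothetical example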
+
+def list_categorymembers(wiki, cat_page, limit=5):
+    '''Return a list of pages belonging to category page'''
+    # Add 'Category:' prefix if not given
+    if not cat_page.startswith("Category:"):
+        cat_page = "Category:%s" % cat_page
+
+    # Build query arguments and call wiki
+    query = dict(action='query',
+                 list='categorymembers',
+                 cmtitle=cat_page)
+    if opts.debug: print query
+    response = wiki.call(query)
+
+    members = [entry.get('title') for entry in response.get('query', {}).get('categorymembers', []) if entry.has_key('title')]
+
+    # Determine whether we need to recurse
+    idx = 0
+    while True:
+        if idx >= len(members) or limit <= 0:
+            break
+        # Recurse into sub-categories, replacing each 'Category:' entry with its members
+        if members[idx].startswith('Category:') and limit > 0:
+            members.extend(list_categorymembers(wiki, members[idx], limit - 1))
+            members.remove(members[idx])  # remove Category from list
+        else:
+            idx += 1
+
+    return members
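+# e.g. list_categorymembers(wiki, 'QA', limit=2) expands sub-categories at
+# most two levels deep ('QA' is a hypothetical category name)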
+
+if __name__ == "__main__":
+    (opts, action) = parse_args()
+
+    # Create mediawiki handle
+    wiki = MediaWiki(opts.url)
+
+    if action == 'categorymembers':
+        for cat_page in opts.categories:
+            pages = list_categorymembers(wiki, cat_page, opts.limit)
+            if pages:
+                print "\n".join(pages)
+            else:
+                print "No data found for '%s'" % cat_page
+
+    elif action in ['usercontribs', 'usercontribstable']:
+        # Map the requested namespace names to their numeric ids
+        ns_ids = ''
+        if len(opts.namespaces) > 0:
+            ns_ids = list()
+            namespaces = list_namespaces(wiki)
+            for ns in opts.namespaces:
+                if namespaces.has_key(ns):
+                    ns_ids.append(namespaces[ns])
+            ns_ids = "|".join(ns_ids)
+
+        # Gather data
+        user_edits = dict()
+        for user in opts.users:
+            user_edits[user] = list_usercontribs(wiki, user, date_start=opts.start, date_end=opts.end, namespaces=ns_ids)
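+        # user_edits now maps each username to a list of titles, one entry per
+        # edit, e.g. (hypothetical): {'jlaska': ['QA/Networking', 'QA/Networking', 'Talk:QA']}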
+
+        # Display list of user contributions
+        if action == 'usercontribs':
+            user_pages = dict()
+            for user in opts.users:
+                # Count edits per unique page for this user
+                if not user_pages.has_key(user):
+                    user_pages[user] = dict()
+                for page in user_edits[user]:
+                    if not user_pages[user].has_key(page):
+                        user_pages[user][page] = 0
+                    user_pages[user][page] += 1
+
+                print "= Wiki contributions for %s between %s and %s =" % (user, opts.start[:10], opts.end[:10])
+                for page, edits in user_pages[user].items():
+                    print " * %s (%s edits)" % (page, edits)
+                print ""
+
+        # Display table of contributions where x/y axis == user / namespace
+        elif action == 'usercontribstable':
+            # Count pages/edits for each namespace
+            namespaces = dict()
+            for user, pages in user_edits.items():
+                for p in pages:
+                    if p.count(':') > 0:
+                        ns = p.split(':', 1)[0] + ':'
+                    else:
+                        ns = "Main:"
+
+                    # Initialize per-ns dict
+                    if not namespaces.has_key(ns):
+                        namespaces[ns] = dict()
+
+                    # Initialize per-user dict
+                    if not namespaces[ns].has_key(user):
+                        namespaces[ns][user] = dict(pages={}, edits=0)
+
+                    # Increment count of edits; record the page so unique
+                    # pages can be totaled below
+                    namespaces[ns][user]['edits'] += 1
+                    namespaces[ns][user]['pages'][p] = 0
+
+            # Collapse each per-page dict into a count of unique pages
+            for ns, users in namespaces.items():
+                for user in users:
+                    namespaces[ns][user]['pages'] = len(namespaces[ns][user]['pages'].keys())
+
+            # Print banner
+            print "= Wiki contributions between %s and %s =" % (opts.start[:10], opts.end[:10])
+            # Banner row #1 - center the legend over the per-user columns
+            idx = 0
+            for u in opts.users:
+                # if halfway through list ...
+                if idx >= len(opts.users) / 2:
+                    sys.stdout.write("Number of pages (number of edits)")
+                    break
+                else:
+                    sys.stdout.write("%10s " % (" " * 10))
+                    idx += 1
+            sys.stdout.write("\n")
+            # Banner row #2
+            sys.stdout.write("%-20s " % "Namespace")
+            for u in opts.users:
+                sys.stdout.write("%10s " % u)
+            sys.stdout.write("%10s " % "Total")
+            sys.stdout.write("\n")
+            # Banner row #3
+            sys.stdout.write("%-20s-" % ("-" * 20))
+            for u in opts.users:
+                sys.stdout.write("%10s-" % ("-" * 10))
+            sys.stdout.write("%10s-" % ("-" * 10))
+            sys.stdout.write("\n")
+
+            # Display data
+            for ns in namespaces.keys():
+                sys.stdout.write("%-20s " % ns)
+                ttl_pages = 0
+                ttl_edits = 0
+                for u in opts.users:
+                    num_pages = namespaces[ns].get(u, {}).get('pages', 0)
+                    num_edits = namespaces[ns].get(u, {}).get('edits', 0)
+                    ttl_pages += num_pages
+                    ttl_edits += num_edits
+                    sys.stdout.write("%10s " % ("%s (%s)" % (num_pages, num_edits)))
+                sys.stdout.write("%10s " % ("%s (%s)" % (ttl_pages, ttl_edits)))
+                sys.stdout.write("\n")
+
+            sys.stdout.write("%-20s-" % ("-" * 20))
+            for u in opts.users:
+                sys.stdout.write("%10s-" % ("-" * 10))
+            sys.stdout.write("%10s-" % ("-" * 10))
+            sys.stdout.write("\n")
+
+            # Display bottom total
+            ttl_pages = 0
+            ttl_edits = 0
+            sys.stdout.write("%-20s " % "Total")
+            for u in opts.users:
+                num_pages = sum([namespaces[ns].get(u, {}).get('pages', 0) for ns in namespaces.keys()])
+                num_edits = sum([namespaces[ns].get(u, {}).get('edits', 0) for ns in namespaces.keys()])
+                ttl_pages += num_pages
+                ttl_edits += num_edits
+                sys.stdout.write("%10s " % ("%s (%s)" % (num_pages, num_edits)))
+            sys.stdout.write("%10s " % ("%s (%s)" % (ttl_pages, ttl_edits)))
+            sys.stdout.write("\n")
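+            # Example table (hypothetical data; column widths approximate):
+            #   Namespace                jlaska      Total
+            #   -------------------- ---------- ----------
+            #   Main:                     2 (3)      2 (3)
+            #   Total                     2 (3)      2 (3)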
+
+    elif action == 'parse':
+        print parse(wiki, opts.title)
+
+    elif action == 'get':
+        print getraw(wiki, opts.title)
+
+    elif action == 'recentchanges':
+        # FIXME
+        # ianweller explained that mediawiki only retains the 500 most recent
+        # changes for inspection
+        print "Experimental!!"
+        # Map the requested namespace names to their numeric ids
+        ns_ids = ''
+        if len(opts.namespaces) > 0:
+            ns_ids = list()
+            namespaces = list_namespaces(wiki)
+            for ns in opts.namespaces:
+                if namespaces.has_key(ns):
+                    ns_ids.append(namespaces[ns])
+            ns_ids = "|".join(ns_ids)
+
+        print recentchanges(wiki, opts.start, opts.end, ns_ids)
+
+    else:
+        print "Unknown action requested '%s'" % action
+        sys.exit(1)