#!/usr/bin/python
'''Query a MediaWiki instance (the Fedora Project wiki by default) for
category members, user contributions, recent changes, or page content.'''

import os
import sys
import optparse
import re
import time, datetime

try:
    from simplemediawiki import MediaWiki
except ImportError:
    print "Unable to import simplemediawiki. Is python-simplemediawiki installed?"
    sys.exit(1)

time_fmt = "%Y-%m-%dT%H:%M:%SZ"


def parse_args():
    '''Set up the option parser'''

    parser = optparse.OptionParser(usage="%prog [options] <action>")
    parser.add_option('-v', '--verbose', action='store_true',
        default=False, help='Enable more verbose output')
    parser.add_option('-d', '--debug', action='store_true',
        default=False, help='Enable debugging output')
    parser.add_option('--url', action='store',
        default='https://fedoraproject.org/w/api.php', help='API URL')

    # general
    # MediaWiki walks results from newest to oldest by default, so --start
    # defaults to "now" and --end defaults to 30 days ago.
    def default_start():
        return time.strftime(time_fmt, time.localtime())

    def default_end():
        # default to the last 30 days
        today = datetime.datetime.today()
        one_month = datetime.timedelta(days=30)
        return (today - one_month).strftime(time_fmt)

    optgrp = optparse.OptionGroup(parser, "General options")
    optgrp.add_option('--start', action='store', default=default_start(),
        help='Limit results using start date')
    optgrp.add_option('--end', action='store', default=default_end(),
        help='Limit results using end date')
    optgrp.add_option('-n', '--namespace', dest="namespaces", action='append',
        default=[],
        help='Limit results to a specific namespace (accepts multiple values)')
    optgrp.add_option('-l', '--limit', action='store', default=5, type="int",
        help='Limit recursion depth (%default)')
    parser.add_option_group(optgrp)

    # list_categorymembers
    optgrp = optparse.OptionGroup(parser, "Options for 'categorymembers' command:")
    optgrp.add_option('-c', '--category', dest="categories", default=[],
        action="append",
        help='Wiki category name to query (accepts multiple values)')
    parser.add_option_group(optgrp)

    # list_usercontribs
    optgrp = optparse.OptionGroup(parser, "Options for 'usercontribs' and 'usercontribstable'")
    optgrp.add_option('-u', '--user', dest="users", default=[], action="append",
        help='FAS username to query (accepts multiple values)')
    parser.add_option_group(optgrp)

    optgrp = optparse.OptionGroup(parser, "Options for 'parse' and 'get'")
    optgrp.add_option('-t', '--title', default='', action='store',
        help='Page title to parse')
    parser.add_option_group(optgrp)

    # (no extra options yet)
    optgrp = optparse.OptionGroup(parser, "Options for 'recentchanges'")
    parser.add_option_group(optgrp)

    (opts, args) = parser.parse_args()

    if len(args) == 0:
        parser.error("No action specified")
    else:
        action = args[0]

    # Validate inputs
    if action == 'categorymembers':
        if len(opts.categories) == 0:
            parser.error("Must specify at least one category (-c|--category)")
    elif action in ['usercontribs', 'usercontribstable']:
        if len(opts.users) == 0:
            parser.error("Must specify at least one user (-u|--user)")
        else:
            # expand space- or comma-delimited values
            idx = 0
            split_re = "[ ,]"
            while idx < len(opts.users):
                u = opts.users[idx]
                if re.search(split_re, u):
                    opts.users[idx] = re.split(split_re, u)[0]
                    opts.users.extend(re.split(split_re, u)[1:])
                idx += 1
            # sort list
            opts.users.sort()
    elif action in ['parse', 'get']:
        if opts.title == '':
            parser.error("Must specify a page (-t|--title)")

    return (opts, action)


def list_namespaces(wiki):
    '''Return a dict mapping canonical namespace names to namespace ids'''
    query = dict(action='query', meta='siteinfo', siprop='namespaces')
    if opts.debug: print query
    response = wiki.call(query)
    if opts.debug: print response

    namespaces = dict()
    for id, entry in response.get('query', {}).get('namespaces', {}).items():
        if entry.has_key('canonical'):
            namespaces[entry['canonical']] = id
    return namespaces
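

# Illustrative note (assumed shape, based on a typical MediaWiki 'siteinfo'
# response): list_namespaces() returns a mapping of canonical names to
# stringified namespace ids, e.g.
#   {'Talk': '1', 'User': '2', 'Category': '14', ...}
# The selected ids are later '|'-joined and passed to the API as
# rcnamespace/ucnamespace/cmnamespace.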


def recentchanges(wiki, date_start='', date_end='', namespaces=""):
    '''Return a sorted list of recently changed page titles'''
    # https://fedoraproject.org/w/api.php?action=query&list=recentchanges&rcprop=title|timestamp|ids|user

    # Build query arguments and call wiki
    query = dict(action='query',
                 list='recentchanges',
                 rcprop="title|timestamp|ids|user",
                 rclimit=50,
                )
    # FIXME - validate date input (expected format "%Y-%m-%dT%H:%M:%S")
    if date_start != '':
        query['rcstart'] = date_start
    if date_end != '':
        query['rcend'] = date_end
    if namespaces != '':
        query['rcnamespace'] = namespaces

    if opts.debug: print query
    response = wiki.call(query)
    if opts.debug: print response

    changes = list()

    # If necessary, repeatedly call the server to get more data
    while response.has_key('query-continue'):
        changes.extend([entry.get('title') for entry in
            response.get('query', {}).get('recentchanges', [])
            if entry.has_key('title')])
        query['rccontinue'] = True
        query['rcstart'] = response['query-continue']['recentchanges']['rcstart']
        if opts.debug: print query
        response = wiki.call(query)
        if opts.debug: print response

    # Extract any remaining data from the response
    changes.extend([entry.get('title') for entry in
        response.get('query', {}).get('recentchanges', [])
        if entry.has_key('title')])

    # Sort results
    changes.sort()
    return changes


def list_usercontribs(wiki, user, date_start='', date_end='', namespaces=""):
    '''Return a sorted list of page titles edited by the given user'''
    # https://fedoraproject.org/w/api.php?action=query&list=usercontribs&uclimit=100&ucuser=jlaska&ucstart=2010-11-11T00:00:00Z&ucend=2010-11-01T23:59:59Z

    # Build query arguments and call wiki
    query = dict(action='query', list='usercontribs', uclimit=50, ucuser=user)

    # FIXME - validate date input (expected format "%Y-%m-%dT%H:%M:%S")
    # FIXME - move this to parse_args()
    def recognize_date(d):
        '''Convert a "YYYY-MM-DD [HH:MM:SS]" (dash- or space-delimited) value
        into the MediaWiki timestamp format'''
        # Values already in MediaWiki timestamp form (e.g. the --start/--end
        # defaults) pass through unchanged
        try:
            datetime.datetime.strptime(d, time_fmt)
            return d
        except ValueError:
            pass
        yyyymmdd = re.match(r'^(\d{2,4})([ -])(\d{1,2})([ -])(\d{1,2})\s*(.*)$', d)
        if yyyymmdd:
            hhmmss = re.match(r'(\d{1,2})[ :.-](\d{1,2})[ :.-](\d{1,2})', yyyymmdd.group(6))
            if hhmmss:
                date = datetime.datetime.strptime(d,
                    "%%Y%s%%m%s%%d %%H:%%M:%%S" % (yyyymmdd.group(2), yyyymmdd.group(4)))
            else:
                date = datetime.datetime.strptime(d,
                    "%%Y%s%%m%s%%d" % (yyyymmdd.group(2), yyyymmdd.group(4)))
            date_str = date.strftime('%Y-%m-%dT%H:%M:%SZ')
            return date_str
        else:
            # FIXME - return error for unexpected date format
            raise Exception("Unrecognized date format: %s" % d)

    if date_start != '':
        # Convert to expected format
        date_start = recognize_date(date_start)
        query['ucstart'] = date_start
    if date_end != '':
        date_end = recognize_date(date_end)
        query['ucend'] = date_end
    if namespaces != '':
        query['ucnamespace'] = namespaces

    if date_start != '' and date_end != '':
        # if we can recognize the date format, see if we need to add ucdir
        try:
            date_start = re.sub('[TZ]', ' ', date_start).strip()
            date_end = re.sub('[TZ]', ' ', date_end).strip()
            ds = datetime.datetime.strptime(date_start, '%Y-%m-%d %H:%M:%S')
            de = datetime.datetime.strptime(date_end, '%Y-%m-%d %H:%M:%S')
            if ds < de:
                query['ucdir'] = 'newer'
        except ValueError:
            pass

    if opts.debug: print query
    response = wiki.call(query)
    if opts.debug: print response

    contribs = list()

    # If necessary, repeatedly call the server to get more data
    while response.has_key('query-continue'):
        contribs.extend([entry.get('title') for entry in
            response.get('query', {}).get('usercontribs', [])
            if entry.has_key('title')])
        query['uccontinue'] = True
        query['ucstart'] = response['query-continue']['usercontribs']['ucstart']
        if opts.debug: print query
        response = wiki.call(query)
        if opts.debug: print response

    # Extract any remaining data from the response
    contribs.extend([entry.get('title') for entry in
        response.get('query', {}).get('usercontribs', [])
        if entry.has_key('title')])

    # Sort results
    contribs.sort()
    return contribs


def parse(wiki, page):
    '''Parse a page and return its rendered content'''

    # Build query arguments and call wiki
    query = dict(action='parse', page=page)
    if opts.debug: print query
    response = wiki.call(query)
    return response.get('parse', {}).get('text', {}).get('*', '')


def getraw(wiki, titles):
    '''Return raw mediawiki content of a specified page'''

    # Build query arguments and call wiki
    query = dict(action='query', prop='revisions', titles=titles, rvlimit=1,
                 rvexpandtemplates=1, rvprop='content')
    if opts.debug: print query
    response = wiki.call(query)

    # XXX - only returns the first rev ... do we care?
    for page in response.get('query', {}).get('pages', {}).values():
        revs = page.get('revisions', [])
        for rev in revs:
            return rev.get('*', '')
    return ''


def list_categorymembers(wiki, cat_page, limit=5, namespaces=''):
    '''Return a list of pages belonging to category page'''

    # Add 'Category:' prefix if not given
    if not cat_page.startswith("Category:"):
        cat_page = "Category:%s" % cat_page

    # Build query arguments and call wiki
    query = dict(action='query', list='categorymembers', cmlimit=50,
                 cmtitle=cat_page)
    if namespaces != '':
        query['cmnamespace'] = namespaces

    if opts.debug: print query
    response = wiki.call(query)
    if opts.debug: print response

    # Are more results available?
    members = list()

    # If necessary, repeatedly call the server to get more data
    while response.has_key('query-continue'):
        # get category member page names (limit to sub-categories if requested)
        members.extend([entry.get('title') for entry in
            response.get('query', {}).get('categorymembers', {})
            if entry.has_key('title')])
        query['cmcontinue'] = response['query-continue']['categorymembers']['cmcontinue']
        if opts.debug: print query
        response = wiki.call(query)
        if opts.debug: print response

    # Extract any remaining data from the response
    members.extend([entry.get('title') for entry in
        response.get('query', {}).get('categorymembers', {})
        if entry.has_key('title')])

    # Determine whether we need to recurse
    idx = 0
    while True:
        if idx >= len(members) or limit <= 0:
            break
        # Recurse?
        if members[idx].startswith('Category:') and limit > 0:
            members.extend(list_categorymembers(wiki, members[idx], limit - 1, namespaces))
        idx += 1

    return members


if __name__ == "__main__":
    (opts, action) = parse_args()

    # Create mediawiki handle
    wiki = MediaWiki(opts.url)

    if action == 'categorymembers':
        # Gather namespace arguments
        ns_ids = ''
        if len(opts.namespaces) > 0:
            namespaces = list_namespaces(wiki)
            ns_ids = '|'.join([namespaces[ns] for ns in opts.namespaces
                               if namespaces.has_key(ns)])

        for cat_page in opts.categories:
            pages = list_categorymembers(wiki, cat_page, opts.limit, ns_ids)
            if pages:
                print "\n".join(pages)
            else:
                print "No data found for '%s'" % cat_page

    elif action in ['usercontribs', 'usercontribstable']:
        # Gather namespace arguments
        ns_ids = ''
        if len(opts.namespaces) > 0:
            namespaces = list_namespaces(wiki)
            ns_ids = '|'.join([namespaces[ns] for ns in opts.namespaces
                               if namespaces.has_key(ns)])

        # Gather data
        user_edits = dict()
        for user in opts.users:
            user_edits[user] = list_usercontribs(wiki, user,
                date_start=opts.start, date_end=opts.end, namespaces=ns_ids)

        # Display list of user contributions
        if action == 'usercontribs':
            import operator
            user_pages = dict()
            for user in opts.users:
                # Create unique list of pages each user edited
                if not user_pages.has_key(user):
                    user_pages[user] = dict()
                for page in user_edits[user]:
                    if not user_pages[user].has_key(page):
                        user_pages[user][page] = 0
                    user_pages[user][page] += 1

                print "= Wiki contributions for %s between %s and %s =" % \
                    (user, opts.start[:10], opts.end[:10])
                for item in sorted(user_pages[user].iteritems(),
                                   key=operator.itemgetter(0)):
                    print " * %s (%s edits)" % item
                print ""

        # Display table of contributions where x/y axis == user / namespace
        elif action == 'usercontribstable':
            # Count pages/edits for each namespace
            namespaces = dict()
            for user, pages in user_edits.items():
                for p in pages:
                    if p.count(':') > 0:
                        ns = p.split(':', 1)[0] + ':'
                    else:
                        ns = "Main:"
                    # Initialize per-ns dict
                    if not namespaces.has_key(ns):
                        namespaces[ns] = dict()
                    # Initialize per-user dict
                    if not namespaces[ns].has_key(user):
                        namespaces[ns][user] = dict(pages={}, edits=0)
                    # Increment count of edits
                    namespaces[ns][user]['edits'] += 1
                    namespaces[ns][user]['pages'][p] = 0

            # Total the number of unique pages
            for ns, users in namespaces.items():
                for user in users:
                    namespaces[ns][user]['pages'] = len(namespaces[ns][user]['pages'].keys())

            # Print banner
            print "= Wiki contributions between %s and %s =" % \
                (opts.start[:10], opts.end[:10])

            # Banner row#1
            idx = 0
            for u in opts.users:
                # if halfway through list ...
                if idx >= len(opts.users) / 2:
                    sys.stdout.write("Number of pages (number of edits)")
                    break
                else:
                    sys.stdout.write("%10s " % (" " * 10))
                idx += 1
            sys.stdout.write("\n")

            # Banner row#2
            sys.stdout.write("%-20s " % "Namespace")
            garbage = [sys.stdout.write("%10s " % u) for u in opts.users]
            sys.stdout.write("%10s " % "Total")
            sys.stdout.write("\n")

            # Banner row#3
            sys.stdout.write("%-20s-" % ("-" * 20))
            garbage = [sys.stdout.write("%10s-" % ("-" * 10)) for u in opts.users]
            sys.stdout.write("%10s-" % ("-" * 10))
            sys.stdout.write("\n")

            # Display data
            for ns in namespaces.keys():
                sys.stdout.write("%-20s " % ns)
                ttl_pages = 0
                ttl_edits = 0
                for u in opts.users:
                    num_pages = namespaces[ns].get(u, {}).get('pages', 0)
                    num_edits = namespaces[ns].get(u, {}).get('edits', 0)
                    ttl_pages += num_pages
                    ttl_edits += num_edits
                    sys.stdout.write("%10s " % ("%s (%s)" % (num_pages, num_edits)))
                sys.stdout.write("%10s " % ("%s (%s)" % (ttl_pages, ttl_edits)))
                sys.stdout.write("\n")

            sys.stdout.write("%-20s-" % ("-" * 20))
            garbage = [sys.stdout.write("%10s-" % ("-" * 10)) for u in opts.users]
            sys.stdout.write("%10s-" % ("-" * 10))
            sys.stdout.write("\n")

            # Display bottom total
            ttl_pages = 0
            ttl_edits = 0
            sys.stdout.write("%-20s " % "Total")
            for u in opts.users:
                num_pages = reduce(lambda x, y: x + y,
                    [namespaces[ns].get(u, {}).get('pages', 0) for ns in namespaces.keys()])
                num_edits = reduce(lambda x, y: x + y,
                    [namespaces[ns].get(u, {}).get('edits', 0) for ns in namespaces.keys()])
                ttl_pages += num_pages
                ttl_edits += num_edits
                sys.stdout.write("%10s " % ("%s (%s)" % (num_pages, num_edits)))
            sys.stdout.write("%10s " % ("%s (%s)" % (ttl_pages, ttl_edits)))
            sys.stdout.write("\n")

    elif action == 'parse':
        print parse(wiki, opts.title)

    elif action == 'get':
        print getraw(wiki, opts.title)

    elif action == 'recentchanges':
        # FIXME
        # ianweller explained that mediawiki only retains the 500 most recent
        # changes for inspection
        print "Experimental!!"

        # Gather namespace arguments
        ns_ids = ''
        if len(opts.namespaces) > 0:
            ns_ids = list()
            namespaces = list_namespaces(wiki)
            for ns in opts.namespaces:
                if namespaces.has_key(ns):
                    ns_ids.append(namespaces[ns])
            ns_ids = "|".join(ns_ids)

        print recentchanges(wiki, opts.start, opts.end, ns_ids)

    else:
        print "Unknown action requested '%s'" % action
        sys.exit(1)
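
# Example invocations (illustrative only; the script filename below is a
# placeholder, but the actions and flags are the ones defined above; the
# 'jlaska' username and 2010 date range come from the comments in
# list_usercontribs):
#
#   # List pages in one or more categories, recursing into sub-categories
#   python wikistats.py categorymembers -c SomeCategory -l 3
#
#   # Show pages a user edited during the default 30-day window
#   python wikistats.py usercontribs -u jlaska
#
#   # Per-namespace summary table for several users over an explicit range
#   python wikistats.py usercontribstable -u jlaska -u otheruser \
#       --start 2010-12-31 --end 2010-10-01
#
#   # Fetch the rendered HTML or the raw wikitext of a single page
#   python wikistats.py parse -t 'Some Page'
#   python wikistats.py get -t 'Some Page'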