From 6f95223feffe43b14e1be4f70ef77bb5f0590f7e Mon Sep 17 00:00:00 2001 From: Aurélien Bompard Date: Tue, 27 Nov 2012 17:28:24 +0100 Subject: Package the get_mbox script as a proper generic script --- MANIFEST.in | 2 +- README.rst | 2 +- get_mbox.py | 51 ------------------------------------ kittystore/scripts.py | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 1 + 5 files changed, 75 insertions(+), 53 deletions(-) delete mode 100644 get_mbox.py diff --git a/MANIFEST.in b/MANIFEST.in index 2822522..2ca8ea4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ include AUTHORS.txt COPYING.txt pylintrc distribute_setup.py requirements.txt -include get_mbox.py +graft kittystore/test/testdata diff --git a/README.rst b/README.rst index f10e594..c02a49f 100644 --- a/README.rst +++ b/README.rst @@ -15,7 +15,7 @@ The code is available from: https://github.com/pypingou/kittystore Populating the database ======================= -- Retrieve the archives using the ``get_mbox.py`` script +- Retrieve the archives by calling ``kittystore-download21``, - Load the archives by calling ``kittystore-import``. This might be memory intensive, so you may want to do 2 or 3 years per run and diff --git a/get_mbox.py b/get_mbox.py deleted file mode 100644 index 4c611d8..0000000 --- a/get_mbox.py +++ /dev/null @@ -1,51 +0,0 @@ -#! /bin/python - -import itertools -import urlgrabber -import gzip -import sys -import os -from multiprocessing import Pool - -years = range(2002, 2013) -months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', - 'August', 'September', 'October', 'November', 'December'] - - -def archive_downloader(i): - """ Retrieve the archive for all the year and month defined. """ - list_name = i[0] - year = i[1] - month = i[2] - if not list_name or not year or not month: - return - basename = "{0}-{1}.txt.gz".format(year, month) - if os.path.exists(basename): - print "{0} already downloaded, skipping".format(basename) - return - filename = "http://lists.fedoraproject.org/pipermail/{0}/{1}".format( - list_name, basename) - try: - urlgrabber.urlgrab(filename) - pos = str(months.index(month) + 1) - if len(pos) == 1: - pos = '0{0}'.format(pos) - newname = '{0}-{1}-{2}-{3}.txt'.format(list_name, year, pos, month) - with open(newname, "w") as f: - f.write(gzip.open(basename).read()) - print "== {0} downloaded ==".format(filename) - except urlgrabber.grabber.URLGrabError, e: - print e - if e.errno == 14: # 404 - os.remove(basename) - - -if __name__ == "__main__": - if len(sys.argv) < 2 or '-h' in sys.argv or '--help' in sys.argv: - print '''USAGE: -python get_mbox.py list_name''' - else: - list_name = sys.argv[1:] - p = Pool(5) - p.map(archive_downloader, itertools.product(list_name, years, - months)) diff --git a/kittystore/scripts.py b/kittystore/scripts.py index d882d9a..0a21792 100644 --- a/kittystore/scripts.py +++ b/kittystore/scripts.py @@ -29,6 +29,10 @@ from optparse import OptionParser from kittystore import get_store +# +# Manual database update +# + def updatedb(): parser = OptionParser(usage="%prog -s store_url") parser.add_option("-s", "--store", help="the URL to the store database") @@ -47,3 +51,71 @@ def updatedb(): "ORDER BY version DESC LIMIT 1" ))[0][0] print "Done, the current schema version is %d." % version + + +# +# Mailman 2 archives downloader +# + +import os +import urllib2 +import gzip +import itertools +from multiprocessing import Pool +from datetime import date + +MONTHS = ['January', 'February', 'March', 'April', 'May', 'June', 'July', + 'August', 'September', 'October', 'November', 'December'] + +def dl_archives(): + parser = OptionParser(usage="%prog -u URL -l LIST_NAME [-d destdir]") + parser.add_option("-u", "--url", help="URL to the mailman installation") + parser.add_option("-l", "--list-name", help="mailing-list name") + parser.add_option("-d", "--destination", default=os.getcwd(), + help="directory to download the archives to. Defaults " + "to the current directory (%default)") + parser.add_option("-s", "--start", default="2002", + help="first year to start looking for archives") + parser.add_option("-v", "--verbose", action="store_true", + help="show more information") + opts, args = parser.parse_args() + if not opts.url: + parser.error("an URL must be provided") + if not opts.list_name: + parser.error("a list name must be provided") + if "@" in opts.list_name: + opts.list_name = opts.list_name[:opts.list_name.index("@")] + years = range(int(opts.start), date.today().year + 1) + p = Pool(5) + p.map(_archive_downloader, itertools.product([opts], years, MONTHS)) + +def _archive_downloader(args): + opts, year, month = args + if not year or not month: + return + basename = "{0}-{1}.txt.gz".format(year, month) + filepath = os.path.join(opts.destination, basename) + if os.path.exists(filepath): + if opts.verbose: + print "{0} already downloaded, skipping".format(basename) + return + url = "{0}/pipermail/{1}/{2}".format( + opts.url, opts.list_name, basename) + if opts.verbose: + print "Downloading from {0}".format(url) + try: + request = urllib2.urlopen(url) + with open(filepath, "w") as f: + f.write(request.read()) + except urllib2.URLError, e: + if e.code == 404: + print ("This archive hasn't been created on the server yet: " + + basename) + else: + print e + return + pos = str(MONTHS.index(month) + 1).rjust(2, "0") + newname = '{0}-{1}-{2}-{3}.txt'.format(opts.list_name, year, pos, month) + with open(os.path.join(opts.destination, newname), "w") as f: + f.write(gzip.open(filepath).read()) + print "Downloaded archive for {0} {1} from {2}".format(month, year, url) diff --git a/setup.py b/setup.py index 1408fe8..eae13d7 100755 --- a/setup.py +++ b/setup.py @@ -44,6 +44,7 @@ setup( 'console_scripts': [ 'kittystore-import = kittystore.import:main', 'kittystore-updatedb = kittystore.scripts:updatedb', + 'kittystore-download21 = kittystore.scripts:dl_archives', ], }, ) -- cgit