summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Aurélien Bompard <aurelien@bompard.org> 2012-11-27 17:28:24 +0100
committer Aurélien Bompard <aurelien@bompard.org> 2012-11-28 09:58:31 +0100
commit 6f95223feffe43b14e1be4f70ef77bb5f0590f7e (patch)
tree 56a7f591302a7f7797fdb7bfac101d4bdfaa4212
parent 35204016d043c9d2623f163ba7b7f37d6da207ab (diff)
download kittystore-6f95223feffe43b14e1be4f70ef77bb5f0590f7e.tar.gz
kittystore-6f95223feffe43b14e1be4f70ef77bb5f0590f7e.tar.xz
kittystore-6f95223feffe43b14e1be4f70ef77bb5f0590f7e.zip
Package the get_mbox script as a proper generic script
-rw-r--r-- MANIFEST.in 2
-rw-r--r-- README.rst 2
-rw-r--r-- get_mbox.py 51
-rw-r--r-- kittystore/scripts.py 72
-rwxr-xr-x setup.py 1
5 files changed, 75 insertions, 53 deletions
diff --git a/MANIFEST.in b/MANIFEST.in
index 2822522..2ca8ea4 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,2 +1,2 @@
include AUTHORS.txt COPYING.txt pylintrc distribute_setup.py requirements.txt
-include get_mbox.py
+graft kittystore/test/testdata
diff --git a/README.rst b/README.rst
index f10e594..c02a49f 100644
--- a/README.rst
+++ b/README.rst
@@ -15,7 +15,7 @@ The code is available from: https://github.com/pypingou/kittystore
Populating the database
=======================
-- Retrieve the archives using the ``get_mbox.py`` script
+- Retrieve the archives by calling ``kittystore-download21``,
- Load the archives by calling ``kittystore-import``.
This might be memory intensive, so you may want to do 2 or 3 years per run and
diff --git a/get_mbox.py b/get_mbox.py
deleted file mode 100644
index 4c611d8..0000000
--- a/get_mbox.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#! /bin/python
-
-import itertools
-import urlgrabber
-import gzip
-import sys
-import os
-from multiprocessing import Pool
-
-years = range(2002, 2013)
-months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
- 'August', 'September', 'October', 'November', 'December']
-
-
-def archive_downloader(i):
- """ Retrieve the archive for all the year and month defined. """
- list_name = i[0]
- year = i[1]
- month = i[2]
- if not list_name or not year or not month:
- return
- basename = "{0}-{1}.txt.gz".format(year, month)
- if os.path.exists(basename):
- print "{0} already downloaded, skipping".format(basename)
- return
- filename = "http://lists.fedoraproject.org/pipermail/{0}/{1}".format(
- list_name, basename)
- try:
- urlgrabber.urlgrab(filename)
- pos = str(months.index(month) + 1)
- if len(pos) == 1:
- pos = '0{0}'.format(pos)
- newname = '{0}-{1}-{2}-{3}.txt'.format(list_name, year, pos, month)
- with open(newname, "w") as f:
- f.write(gzip.open(basename).read())
- print "== {0} downloaded ==".format(filename)
- except urlgrabber.grabber.URLGrabError, e:
- print e
- if e.errno == 14: # 404
- os.remove(basename)
-
-
-if __name__ == "__main__":
- if len(sys.argv) < 2 or '-h' in sys.argv or '--help' in sys.argv:
- print '''USAGE:
-python get_mbox.py list_name'''
- else:
- list_name = sys.argv[1:]
- p = Pool(5)
- p.map(archive_downloader, itertools.product(list_name, years,
- months))
diff --git a/kittystore/scripts.py b/kittystore/scripts.py
index d882d9a..0a21792 100644
--- a/kittystore/scripts.py
+++ b/kittystore/scripts.py
@@ -29,6 +29,10 @@ from optparse import OptionParser
from kittystore import get_store
+#
+# Manual database update
+#
+
def updatedb():
parser = OptionParser(usage="%prog -s store_url")
parser.add_option("-s", "--store", help="the URL to the store database")
@@ -47,3 +51,71 @@ def updatedb():
"ORDER BY version DESC LIMIT 1"
))[0][0]
print "Done, the current schema version is %d." % version
+
+
+#
+# Mailman 2 archives downloader
+#
+
+import os
+import urllib2
+import gzip
+import itertools
+from multiprocessing import Pool
+from datetime import date
+
+MONTHS = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
+ 'August', 'September', 'October', 'November', 'December']
+
+def dl_archives():
+ parser = OptionParser(usage="%prog -u URL -l LIST_NAME [-d destdir]")
+ parser.add_option("-u", "--url", help="URL to the mailman installation")
+ parser.add_option("-l", "--list-name", help="mailing-list name")
+ parser.add_option("-d", "--destination", default=os.getcwd(),
+ help="directory to download the archives to. Defaults "
+ "to the current directory (%default)")
+ parser.add_option("-s", "--start", default="2002",
+ help="first year to start looking for archives")
+ parser.add_option("-v", "--verbose", action="store_true",
+ help="show more information")
+ opts, args = parser.parse_args()
+ if not opts.url:
+ parser.error("an URL must be provided")
+ if not opts.list_name:
+ parser.error("a list name must be provided")
+ if "@" in opts.list_name:
+ opts.list_name = opts.list_name[:opts.list_name.index("@")]
+ years = range(int(opts.start), date.today().year + 1)
+ p = Pool(5)
+ p.map(_archive_downloader, itertools.product([opts], years, MONTHS))
+
+def _archive_downloader(args):
+ opts, year, month = args
+ if not year or not month:
+ return
+ basename = "{0}-{1}.txt.gz".format(year, month)
+ filepath = os.path.join(opts.destination, basename)
+ if os.path.exists(filepath):
+ if opts.verbose:
+ print "{0} already downloaded, skipping".format(basename)
+ return
+ url = "{0}/pipermail/{1}/{2}".format(
+ opts.url, opts.list_name, basename)
+ if opts.verbose:
+ print "Downloading from {0}".format(url)
+ try:
+ request = urllib2.urlopen(url)
+ with open(filepath, "w") as f:
+ f.write(request.read())
+ except urllib2.URLError, e:
+ if e.code == 404:
+ print ("This archive hasn't been created on the server yet: "
+ + basename)
+ else:
+ print e
+ return
+ pos = str(MONTHS.index(month) + 1).rjust(2, "0")
+ newname = '{0}-{1}-{2}-{3}.txt'.format(opts.list_name, year, pos, month)
+ with open(os.path.join(opts.destination, newname), "w") as f:
+ f.write(gzip.open(filepath).read())
+ print "Downloaded archive for {0} {1} from {2}".format(month, year, url)
diff --git a/setup.py b/setup.py
index 1408fe8..eae13d7 100755
--- a/setup.py
+++ b/setup.py
@@ -44,6 +44,7 @@ setup(
'console_scripts': [
'kittystore-import = kittystore.import:main',
'kittystore-updatedb = kittystore.scripts:updatedb',
+ 'kittystore-download21 = kittystore.scripts:dl_archives',
],
},
)