summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Aurélien Bompard <aurelien@bompard.org> 2012-09-03 16:19:26 +0200
committer: Aurélien Bompard <aurelien@bompard.org> 2012-09-07 10:41:51 +0200
commit: d7b70ee351cb7a26b7c7ee9400d8ef3491166373 (patch)
tree: be72470c2677eedae02ec30dedf7129c0e26988d
parent: 4deab0e4779217dd0f82ba9beaea18b40ed31933 (diff)
download: kittystore-d7b70ee351cb7a26b7c7ee9400d8ef3491166373.tar.gz
kittystore-d7b70ee351cb7a26b7c7ee9400d8ef3491166373.tar.xz
kittystore-d7b70ee351cb7a26b7c7ee9400d8ef3491166373.zip
Improve message loading
-rw-r--r-- get_mbox.py 12
-rw-r--r-- to_sqldb.py 3
2 files changed, 11 insertions, 4 deletions
diff --git a/get_mbox.py b/get_mbox.py
index 0576e31..4c611d8 100644
--- a/get_mbox.py
+++ b/get_mbox.py
@@ -4,9 +4,10 @@ import itertools
import urlgrabber
import gzip
import sys
+import os
from multiprocessing import Pool
-years = [2010, 2011, 2012, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002]
+years = range(2002, 2013)
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
'August', 'September', 'October', 'November', 'December']
@@ -19,6 +20,9 @@ def archive_downloader(i):
if not list_name or not year or not month:
return
basename = "{0}-{1}.txt.gz".format(year, month)
+ if os.path.exists(basename):
+ print "{0} already downloaded, skipping".format(basename)
+ return
filename = "http://lists.fedoraproject.org/pipermail/{0}/{1}".format(
list_name, basename)
try:
@@ -30,8 +34,10 @@ def archive_downloader(i):
with open(newname, "w") as f:
f.write(gzip.open(basename).read())
print "== {0} downloaded ==".format(filename)
- except urlgrabber.grabber.URLGrabError:
- pass
+ except urlgrabber.grabber.URLGrabError, e:
+ print e
+ if e.errno == 14: # 404
+ os.remove(basename)
if __name__ == "__main__":
diff --git a/to_sqldb.py b/to_sqldb.py
index 73ce615..add297e 100644
--- a/to_sqldb.py
+++ b/to_sqldb.py
@@ -19,6 +19,7 @@ from kittystore import get_store
TOTALCNT = 0
#DB_URL = 'postgres://mm3:mm3@localhost/mm3'
+#DB_URL = 'postgres://kittystore:kittystore@localhost/kittystore'
DB_URL = 'sqlite:///' + os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "kittystore.sqlite"))
@@ -71,7 +72,7 @@ python to_sqldb.py list_name mbox_file [mbox_file]'''
else:
print 'Adding to database list: %s' % sys.argv[1]
- store = get_store(DB_URL)
+ store = get_store(DB_URL, debug=False)
for mbfile in sys.argv[2:]:
print mbfile
if os.path.exists(mbfile):