summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Aurélien Bompard <aurelien@bompard.org> 2012-11-09 11:55:23 +0100
committer: Aurélien Bompard <aurelien@bompard.org> 2012-11-09 11:55:23 +0100
commit: ff7b371366aaf51302c5f021af5d5e2bda02565c (patch)
tree: efe4c50b2e640b36e3630ff933e1fa18645e47cc
parent: f509608d40c08fce20f5f836ad764bcdc71ed5cb (diff)
download: kittystore-ff7b371366aaf51302c5f021af5d5e2bda02565c.tar.gz
kittystore-ff7b371366aaf51302c5f021af5d5e2bda02565c.tar.xz
kittystore-ff7b371366aaf51302c5f021af5d5e2bda02565c.zip
Improve the import script
- fix regexes to parse attachments
- move the store URL to a command-line option

Also include the test suite in the package.
-rw-r--r-- kittystore/import.py 70
-rw-r--r-- setup.py 3
2 files changed, 47 insertions, 26 deletions
diff --git a/kittystore/import.py b/kittystore/import.py
index c820a8a..71e933f 100644
--- a/kittystore/import.py
+++ b/kittystore/import.py
@@ -36,11 +36,6 @@ from email.utils import unquote
from kittystore import get_store
-#KITTYSTORE_URL = 'postgres://mm3:mm3@localhost/mm3'
-#KITTYSTORE_URL = 'postgres://kittystore:kittystore@localhost/kittystore'
-KITTYSTORE_URL = 'sqlite:///' + os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "kittystore.sqlite"))
-
-
PREFIX_RE = re.compile("^\[([\w\s_-]+)\] ")
ATTACHMENT_RE = re.compile(r"""
@@ -50,7 +45,7 @@ Name:[ ]([^\n]+)\n
Type:[ ]([^\n]+)\n
Size:[ ]\d+[ ]bytes\n
Desc:[ ].+?\n
-Url[ ]:[ ]([^\s]+)\s*\n
+U(?:rl|RL)[ ]?:[ ]([^\s]+)\s*\n
""", re.X | re.S)
EMBEDDED_MSG_RE = re.compile(r"""
@@ -60,7 +55,7 @@ From:[ ].+?\n
Subject:[ ](.+?)\n
Date:[ ][^\n]+\n
Size:[ ]\d+\n
-Url:[ ]([^\s]+)\s*\n
+U(?:rl|RL)[ ]?:[ ]([^\s]+)\s*\n
""", re.X | re.S)
HTML_ATTACH_RE = re.compile(r"""
@@ -73,9 +68,11 @@ TEXT_NO_CHARSET_RE = re.compile(r"""
--------------[ ]next[ ]part[ ]--------------\n
An[ ]embedded[ ]and[ ]charset-unspecified[ ]text[ ]was[ ]scrubbed\.\.\.\n
Name:[ ]([^\n]+)\n
-Url:[ ]([^\s]+)\s*\n
+U(?:rl|RL)[ ]?:[ ]([^\s]+)\s*\n
""", re.X | re.S)
+TEXTWRAP_RE = re.compile("\n\s*")
+
def convert_date(date_string):
""" Convert the string of the date to a datetime object. """
@@ -104,6 +101,7 @@ class DbImporter(object):
self.total_imported = 0
self.force_import = opts.duplicates
self.no_download = opts.no_download
+ self.verbose = opts.verbose
def from_mbox(self, mbfile):
""" Upload all the emails in a mbox file into the database using
@@ -118,6 +116,9 @@ class DbImporter(object):
for message in mailbox.mbox(mbfile):
cnt_read = cnt_read + 1
self.total_imported += 1
+ # Un-wrap the subject line if necessary
+ message.replace_header("subject",
+ TEXTWRAP_RE.sub(" ", message["subject"]))
# Try to find the mailing-list subject prefix in the first email
if cnt_read == 1:
subject_prefix = PREFIX_RE.search(message["subject"])
@@ -125,12 +126,14 @@ class DbImporter(object):
self.mlist.display_name = unicode(subject_prefix.group(1))
if self.force_import:
while self.store.is_message_in_list(
- self.mlist.fqdn_listname, unquote(message["Message-Id"])):
- print "Found duplicate, changing message id from", message["Message-Id"], "to",
+ self.mlist.fqdn_listname,
+ unquote(message["Message-Id"])):
+ oldmsgid = message["Message-Id"]
message.replace_header("Message-Id",
"<%s-%s>" % (unquote(message["Message-Id"]),
str(randint(0, 100))))
- print message["Message-Id"]
+ print("Found duplicate, changing message id from %s to %s"
+ % (oldmsgid, message["Message-Id"]))
try:
self.store.add_to_list(self.mlist, message)
except ValueError, e:
@@ -145,33 +148,40 @@ class DbImporter(object):
self.store.flush()
cnt_imported += 1
self.store.commit()
- print ' %s email read' % cnt_read
- print ' %s email added to the database' % cnt_imported
+ if self.verbose:
+ print ' %s email read' % cnt_read
+ print ' %s email added to the database' % cnt_imported
def extract_attachments(self, message):
"""Parse message to search for attachments"""
message_text = message.as_string()
+ counter = 0
#has_attach = False
#if "-------------- next part --------------" in message_text:
# has_attach = True
# Regular attachments
attachments = ATTACHMENT_RE.findall(message_text)
- for counter, att in enumerate(attachments):
+ for att in attachments:
+ counter += 1
self.download_attachment(message["Message-Id"], counter,
att[0], att[1], att[2])
# Embedded messages
embedded = EMBEDDED_MSG_RE.findall(message_text)
- for counter, att in enumerate(embedded):
+ for att in embedded:
+ counter += 1
self.download_attachment(message["Message-Id"], counter,
att[0], 'message/rfc822', att[1])
# HTML attachments
html_attachments = HTML_ATTACH_RE.findall(message_text)
- for counter, att in enumerate(html_attachments):
+ for att in html_attachments:
+ counter += 1
+ url = att.strip("<>")
self.download_attachment(message["Message-Id"], counter,
- os.path.basename(att), 'text/html', att)
+ os.path.basename(url), 'text/html', url)
# Text without charset
text_no_charset = TEXT_NO_CHARSET_RE.findall(message_text)
- for counter, att in enumerate(text_no_charset):
+ for att in text_no_charset:
+ counter += 1
self.download_attachment(message["Message-Id"], counter,
att[0], 'text/plain', att[1])
## Other, probably inline text/plain
@@ -180,18 +190,24 @@ class DbImporter(object):
# print message_text
def download_attachment(self, message_id, counter, name, ctype, url):
- #print "Downloading attachment from", url
+ url = url.strip(" <>")
+ message_id = message_id.strip(" <>")
if self.no_download:
+ if self.verbose:
+ print "NOT downloading attachment from %s" % url
content = ""
else:
+ if self.verbose:
+ print "Downloading attachment from %s" % url
content = urllib.urlopen(url).read()
- self.store.add_attachment(self.mlist, message_id, counter, name,
- ctype, None, content)
+ self.store.add_attachment(self.mlist.fqdn_listname, message_id,
+ counter, name, ctype, None, content)
def parse_args():
- usage = "%prog -l list_name mbox_file [mbox_file ...]"
+ usage = "%prog -s store_url -l list_name mbox_file [mbox_file ...]"
parser = OptionParser(usage=usage)
+ parser.add_option("-s", "--store", help="the URL to the store database")
parser.add_option("-l", "--list-name", help="the fully-qualified list "
"name (including the '@' symbol and the domain name")
parser.add_option("-v", "--verbose", action="store_true",
@@ -204,6 +220,9 @@ def parse_args():
help="do not skip duplicate emails (same Message-ID header), "
"import them with a different Message-ID")
opts, args = parser.parse_args()
+ if opts.store is None:
+ parser.error("the store URL is missing (eg: "
+ "sqlite:///kittystore.sqlite)")
if opts.list_name is None:
parser.error("the list name must be given on the command-line.")
if not args:
@@ -220,11 +239,12 @@ def parse_args():
def main():
opts, args = parse_args()
print 'Importing messages from %s to database...' % opts.list_name
- store = get_store(KITTYSTORE_URL, debug=opts.debug)
+ store = get_store(opts.store, debug=opts.debug)
mlist = DummyMailingList(opts.list_name)
importer = DbImporter(mlist, store, opts)
for mbfile in args:
print "Importing from mbox file %s" % mbfile
importer.from_mbox(mbfile)
- print ' %s emails are stored into the database' \
- % store.get_list_size(opts.list_name)
+ if opts.verbose:
+ print ' %s emails are stored into the database' \
+ % store.get_list_size(opts.list_name)
diff --git a/setup.py b/setup.py
index bacfbf1..b9dd7dd 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,8 @@ setup(
description="A storage engine for GNU Mailman v3 archives",
long_description=open('README.rst').read(),
url="https://fedorahosted.org/hyperkitty/",
- packages=find_packages(exclude=["*.test", "test", "*.test.*"]),
+ #packages=find_packages(exclude=["*.test", "test", "*.test.*"]),
+ packages=find_packages(),
include_package_data=True,
install_requires=[
'mailman',