Code:
'''
Created on 02.11.2011
@version 0.9
@author: Fabrice Bongartz (fabrice (at) fabrice d.o.t. me)
@copyright: (C) 2011 Fabrice Bongartz
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details. You should have received a copy of the GNU General Public License
along with this program; if not, see <http://www.gnu.org/licenses/>.
'''
import sys, os, os.path, time, email.utils, re, argparse, tarfile
from argparse import ArgumentTypeError
# Pre-compiled patterns shared by the mbox parsing functions.
# Raw strings are used so that regex escapes such as "\(" and "\S" are not
# treated as (invalid) Python string escapes.

# Matches the mbox message separator line ("From " at the start of a line).
re_from = re.compile(r"^From ")
# Matches the name of a "Date:" header line plus the spaces that follow it.
re_date = re.compile(r"^Date: +")
# Matches the "From <sender> " prefix of a separator line, where the sender
# is either a parenthesised text or a run of non-blank characters, optionally
# preceded by one or more "...@" parts. Removing it leaves the date.
re_strip_from = re.compile(r"From ([^@]+@)*(\(.+\)|\S+) *")
def get_date_from_fromline(from_line):
    """
    Return the given mbox "From " separator line with everything matched by
    re_strip_from (the "From <sender> " part) removed, i.e. the date text.
    """
    date_text = re_strip_from.sub("", from_line)
    return date_text
def get_date_from_dateline(date_line):
    """
    Return the given "Date: ..." header line with the leading header name
    (as matched by re_date) removed, i.e. only the header's value.
    """
    date_text = re_date.sub("", date_line)
    return date_text
def _finish_eml(eml_file, eml_path, change_mtime, prefer_dateheader,
                date_from, date_date):
    """
    Close a completed eml file and, if requested, set its atime/mtime.

    date_from and date_date are tuples as returned by
    email.utils.parsedate_tz() (or None if the date could not be parsed).
    The Date: header value wins if prefer_dateheader is set and it was
    parsed successfully; otherwise the "From " line date is used.
    """
    if not eml_file.closed:
        eml_file.close()
    if not change_mtime:
        return
    if prefer_dateheader and date_date is not None:
        parsed = date_date
    else:
        parsed = date_from
    if parsed is None:
        # No usable date at all: keep the timestamps the file system assigned
        # instead of aborting the whole conversion.
        return
    try:
        # mktime_tz honours the timezone offset of the parsed date and falls
        # back to local time if the date carried no timezone.
        mtime = email.utils.mktime_tz(parsed)
    except (ValueError, OverflowError):
        # Date is syntactically valid but not representable as a timestamp.
        return
    os.utime(eml_path, (mtime, mtime))


def mbox2eml(mbox_file, dest_dir, change_mtime, prefer_dateheader):
    """
    Split an mbox file into one eml file per message.

    A new message starts at every line beginning with "From " that is either
    the first line of the file or directly follows an empty line. Messages
    are written to dest_dir as "0.eml", "1.eml", ... in the order they appear;
    the "From " separator line itself is written as the first line of each
    eml file. Lines found before the first "From " line are skipped.

    mbox_file         -- file object opened for reading (rewound to offset 0)
    dest_dir          -- existing directory that receives the eml files
    change_mtime      -- if true, set atime and mtime of each eml file to the
                         date of the message
    prefer_dateheader -- if true, use the Date: header for the timestamp and
                         only fall back to the "From " line date if no Date:
                         header could be parsed

    Always returns True.
    """
    mbox_file.seek(0) # jump back to the beginning of the file
    previous_was_newline = True # the start of the file counts as a boundary
    nr = 0
    in_headers = False
    date_from = None # parsed date from the "From " line
    date_date = None # parsed date from the "Date: " header
    cur_path = None
    cur_file = None
    try:
        for line in mbox_file:
            if previous_was_newline:
                if re_from.match(line) is not None:
                    # we're at the beginning of a new message
                    if cur_file is not None:
                        # finish the previous message
                        _finish_eml(cur_file, cur_path, change_mtime,
                                    prefer_dateheader, date_from, date_date)
                    # prepare for a new message
                    cur_path = os.path.join(dest_dir, str(nr) + ".eml")
                    cur_file = open(cur_path, "w")
                    date_from = email.utils.parsedate_tz(
                        get_date_from_fromline(line))
                    date_date = None
                    in_headers = True
                    nr += 1
                elif in_headers:
                    # first line after the empty line that ends the headers
                    in_headers = False
            if (prefer_dateheader and in_headers
                    and re_date.match(line) is not None):
                date_date = email.utils.parsedate_tz(
                    get_date_from_dateline(line))
            # write the current line (nothing to write to before the first
            # "From " line has been seen)
            if cur_file is not None:
                cur_file.write(line)
            # remember whether this line was empty
            previous_was_newline = line in ("\n", "\r\n")
        # treat the last remaining message (none if the file was empty)
        if cur_file is not None:
            _finish_eml(cur_file, cur_path, change_mtime,
                        prefer_dateheader, date_from, date_date)
    finally:
        # make sure the current output file is closed even on errors
        if cur_file is not None and not cur_file.closed:
            cur_file.close()
    return True
def is_mbox_file(f):
    """
    Tell whether the given file object looks like an mbox file.

    Reads one line from f and reports whether that line starts with the
    string "From ". Note that this advances the read position of f.
    """
    first_line = f.readline()
    return re_from.match(first_line) is not None
def recurse_mbox(mbox_start_dir, tmp_dir, change_mtime = True,
                 prefer_dateheader = True):
    """
    Walk mbox_start_dir and convert every mbox file found into a directory
    of eml files below tmp_dir.

    Files ending in ".msf" are ignored; every other file is opened and
    converted if is_mbox_file() accepts it. The destination directory for an
    mbox is tmp_dir joined with the mbox's path relative to mbox_start_dir,
    with every occurrence of ".sbd" removed from that relative path. The
    removal is applied to the relative part only, so tmp_dir itself is never
    altered. Missing destination directories are created.

    change_mtime and prefer_dateheader are passed through to mbox2eml().
    """
    for root, dirs, files in os.walk(mbox_start_dir):
        rel_path = os.path.relpath(root, mbox_start_dir)
        if rel_path == ".":
            rel_path = ""
        for f_str in files:
            if f_str.endswith(".msf"):
                continue
            src_path = os.path.join(root, f_str)
            # "with" guarantees the mbox is closed even if conversion fails
            with open(src_path, "r") as f:
                if not is_mbox_file(f):
                    continue
                print("Treating mbox " + src_path)
                rel_dest = os.path.join(rel_path, f_str).replace(".sbd", "")
                dest_dir = os.path.join(tmp_dir, rel_dest)
                if not os.path.isdir(dest_dir):
                    os.makedirs(dest_dir)
                mbox2eml(f, dest_dir, change_mtime, prefer_dateheader)
def create_tgz(dir, tgz_path):
    """
    Create a gzip-compressed tar archive at tgz_path that contains every
    file found below the directory dir.

    Members are stored under their path relative to dir. The name of each
    file is printed as it is added. The archive is closed even if adding a
    file fails.
    """
    with tarfile.open(tgz_path, "w:gz") as tgz:
        for root, dirnames, filenames in os.walk(dir):
            for f in filenames:
                filepath = os.path.join(root, f)
                relpath = os.path.relpath(filepath, dir)
                print("Adding to targz: " + filepath)
                tgz.add(filepath, relpath)
def initialize_options(argv = None):
    """
    Build the command line parser and parse the arguments.

    argv -- optional list of argument strings to parse; if None (the
            default), argparse reads the arguments from sys.argv[1:].

    Returns the argparse namespace with the attributes mbox_start_dir,
    dest_dir, tgz, dont_change_mtimes and ignore_date_header. argparse exits
    the program with a usage message if a required option is missing.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("-s", "--mbox-start-dir", required = True,
                    dest = "mbox_start_dir", help = "A source directory "
                    + "that contains a tree of mbox files.")
    ap.add_argument("-d", "--destination", required = True,
                    dest = "dest_dir", help = "Directory where the "
                    + "eml-file tree should be created.")
    ap.add_argument("-z", "--tgz", dest = "tgz", help = "Create a gzipped "
                    + "tar archive that contains the directory tree "
                    + "specified with -d/--destination at the given "
                    + "path. This is optional. Note that this might be "
                    + "faster using an optimized commandline tool like "
                    + "gnu tar.")
    ap.add_argument("-M", "--dont-change-mtimes", dest = "dont_change_mtimes",
                    action = "store_true", default = False, help = "By "
                    + "default, the mtime and atime of the created eml "
                    + "files will be changed to a date found in each email "
                    + "header. This option disabled changing mtime/atime.")
    ap.add_argument("-I", "--ignore-date-header", default = False,
                    action = "store_true", dest = "ignore_date_header",
                    help = "By default, and if -M/--dont-change-mtimes "
                    + "wasn't specified, in order to change the "
                    + "atime+mtime of each eml file, the program will look "
                    + "for a Date: line in the email headers. If no Date: "
                    + "line was found, the date from the \"From \" line at "
                    + "the beginning of the message will be used. This "
                    + "option disables looking for the Date: line so that "
                    + "the \"From \" line will always be used.")
    return ap.parse_args(argv)
if __name__ == '__main__':
    opts = initialize_options()
    # check args; sys.exit() prints the message to stderr and exits with a
    # non-zero status instead of showing a traceback
    if not os.path.isdir(opts.mbox_start_dir):
        sys.exit("The given mbox start dir is not a directory")
    if not os.path.isdir(opts.dest_dir):
        sys.exit("The given destination dir is not a directory")
    # walk the mboxes and create the destination eml file structure
    recurse_mbox(os.path.abspath(opts.mbox_start_dir),
                 os.path.abspath(opts.dest_dir),
                 not opts.dont_change_mtimes,
                 not opts.ignore_date_header)
    # optionally create a gzipped tar archive
    if opts.tgz:
        create_tgz(opts.dest_dir, opts.tgz)

# Save the script above somewhere as mbox2zeml.py.
# For example: /home/user/migrate/mbox2zeml.py