andoc/import_maildir.py at master · endpnt/andoc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import redis
from email import message_from_file
from email.utils import mktime_tz, parsedate_tz
from os import path, walk
from sys import exit, argv
from doc import *
from selection import *
from rediskeys import *
from triple import *

def usage():
    print "import raw emails from maildir"
    print 'Usage: %s MAILDIR' % argv[0]

def main():
    if path.exists(argv[1]) and path.isdir(argv[1]):
        search_dir = argv[1]
    else:
        print "Error: invalid directory"
        exit(1)

    valid_emails = []
    for root, dirs, files in walk(search_dir):
        for name in files:
            valid_emails.append(path.join(root, name))

    if len(valid_emails) == 0:
        print "Error: no files found"
        exit(1)

    r = redis.Redis()
    for email in valid_emails:
        msg = message_from_file(open(email))

        has_plaintext = False
        if msg.is_multipart():
            for part in msg.walk():
                if part.get_content_type() == 'text/plain':
                    has_plaintext = True
                    plaintext = part.get_payload(decode=True)
        else:
            if msg.get_content_type() == 'text/plain':
                has_plaintext = True
                plaintext = msg.get_payload(decode=True)


        if has_plaintext:
            destfile = open('data/%s.txt' % path.basename(email), 'w')
            selections = []
            dates = []
            for k,v in msg.items():
                selection_start = destfile.tell()
                # web browser counts one char for \r\n
                destfile.write('%s: %s\n' % (
                    k.replace('\r','').strip(),
                    v.replace('\r','').strip())
                    )
                selection_end = destfile.tell()
                selections.append((selection_start, selection_end,
                    'http://www.w3.org/1999/xhtml/#div'))

                if k == 'Date':
                    ts = mktime_tz(parsedate_tz(v))
                    ts_start = len('%s: ' %k )
                    ts_end = selection_end - selection_start
                    dates.append(
                        (selection_start, selection_end, ts, ts_start, ts_end))

            destfile.write('\n')
            bstart = destfile.tell()
            destfile.write(plaintext.replace('\r','').strip())
            bend = destfile.tell()
            destfile.close()
            selections.append((bstart, bend+1,
                'http://www.w3.org/1999/xhtml/#div'))

            doc = Document(r)
            if doc.add('data/%s.txt' % path.basename(email)):
                for start,end,ref in selections:
                    text_selection = TextSelection(doc.id, start, end, ref)
                    text_selection.save(r)

                for s_start, s_end, ts, ts_start, ts_end in dates:
                    pre = 'date'
                    sub = '%s%s#%s.s%se%s' %  (
                        'http://127.0.0.1:8080/doc/struc/',
                         doc.id, 'div', s_start, s_end)
                    # http://127.0.0.1:8080/doc/struc/1#div.s1086e1124/t6e37
                    trsub = '%s/t%se%s' % (sub, ts_start, ts_end)
                    trip = Triple(sub, pre, str(ts))
                    tid = trip.save(r)

                    h = HtmlSelection(doc.id, sub, ts_start, ts_end, tid)
                    h.save(r)

                   # save the object relation to this document
                    doc.add_relation(pre, str(ts))


if __name__ == "__main__":
    if len(argv) < 2:
        usage()
        exit(0)
    main()