Monday, February 7, 2011

Python code for making a histogram of your e-mail volume

Here is the source code for the example in my last post. I haven't had time to clean it up, and some parts are not very elegant. But if you want to try it out with your own inbox, all you need to do is change the e-mail address and run it.

Three caveats:
1. If you need to download a large number of e-mail headers, it will take some time (maybe several minutes).
2. It sometimes gets the dates wrong. However, this seems to occur only in a statistically insignificant fraction of cases.
3. Running this will mark all the messages it accesses as read. I'm sure there's a way to avoid this, but I haven't had time to track it down.

import dateutil.parser
import matplotlib.dates as mdates
import imaplib, getpass, re
from email.parser import HeaderParser
import matplotlib.pyplot as pl
import numpy as np
import sys
def connect(email, host="imap.gmail.com"):
    """Open an SSL IMAP connection and log in, prompting for the password.

    Args:
        email: Account name passed to the server's LOGIN command.
        host: IMAP server host name. Defaults to Gmail's server, which is
            the one this script was originally hard-wired to.

    Returns:
        The logged-in ``imaplib.IMAP4_SSL`` connection.

    Raises:
        imaplib.IMAP4.error: If the server rejects the login.
    """
    imap = imaplib.IMAP4_SSL(host)
    logged_in = False
    try:
        password = getpass.getpass("Enter your password: ")
        imap.login(email, password)
        logged_in = True
    finally:
        # Do not leave the socket open when the prompt is interrupted or
        # the login is rejected; the caller never receives the object.
        if not logged_in:
            imap.shutdown()
    return imap
def disconnect(imap):
    """Log out of the IMAP session held by ``imap``."""
    imap.logout()
# Compiled once at import time; raw string so the backslashes reach the
# regex engine untouched.
_UID_PATTERN = re.compile(r'\d+ \(UID (?P<uid>\d+)\)')


def parse_uid(data):
    """Extract the UID from a fetch response item such as ``'1 (UID 42)'``.

    Args:
        data: The response text. Python 3 ``bytes`` are accepted as well
            and are decoded as ASCII before matching.

    Returns:
        The UID digits as a string.

    Raises:
        ValueError: If ``data`` does not start with ``<seq> (UID <uid>)``.
    """
    # True only on Python 3, where bytes and str are distinct types.
    if isinstance(data, bytes) and not isinstance(data, str):
        data = data.decode('ascii', 'replace')
    match = _UID_PATTERN.match(data)
    if match is None:
        raise ValueError('not a UID fetch response: %r' % (data,))
    return match.group('uid')
def plotdates(dnums):
    """Plot a per-month bar histogram of dates and return the binned data.

    Args:
        dnums: Dates as matplotlib date numbers, in an array that supports
            ``.min()`` and ``.max()`` (e.g. the result of
            ``mdates.date2num``).

    Returns:
        A ``(counts, months)`` tuple of equal-length lists. ``months`` holds
        a ``datetime`` for the first day of every month from the earliest to
        the latest date inclusive, and ``counts`` holds the number of dates
        falling in each of those months (zero for empty months).

    Raises:
        ValueError: If ``dnums`` is empty.
    """
    if len(dnums) == 0:
        raise ValueError('plotdates() needs at least one date')

    ddates = mdates.num2date(dnums)
    mindate = mdates.num2date(dnums.min())
    maxdate = mdates.num2date(dnums.max())

    # Single pass over the dates instead of rescanning them for every month.
    tally = {}
    for dd in ddates:
        key = (dd.year, dd.month)
        tally[key] = tally.get(key, 0) + 1

    # Number the months linearly (year * 12 + zero-based month) so that one
    # range covers the whole span, including year boundaries.
    first_index = mindate.year * 12 + mindate.month - 1
    last_index = maxdate.year * 12 + maxdate.month - 1
    months = []
    counts = []
    for index in range(first_index, last_index + 1):
        year, month0 = divmod(index, 12)
        month = month0 + 1
        months.append(mdates.datetime.datetime(year, month, 1))
        counts.append(tally.get((year, month), 0))

    fig = pl.figure()
    ax = fig.add_subplot(111)
    ax.bar(months, counts, width=20.0)
    for xlabel_i in ax.get_xticklabels():
        xlabel_i.set_fontsize(30)
    for ylabel_i in ax.get_yticklabels():
        ylabel_i.set_fontsize(30)
    ax.xaxis_date()
    fig.autofmt_xdate()
    pl.draw()
    return counts, months
def _report(message):
    """Print a progress line and flush so it appears immediately."""
    print(message)
    sys.stdout.flush()


def getdates(imap, mailboxname):
    """Fetch the Date header of every message in a mailbox as date numbers.

    The mailbox is opened read-only and the headers are fetched with
    ``BODY.PEEK``, so messages are not flagged as read. The connection is
    always logged out before this function returns or raises.

    Args:
        imap: A logged-in ``imaplib`` connection.
        mailboxname: Name of the mailbox to scan.

    Returns:
        The result of ``mdates.date2num`` for the parsed dates. Only the
        first four whitespace-separated tokens after ``Date:`` are parsed
        (normally weekday, day, month and year), so time of day and UTC
        offset are dropped. Messages whose Date header is missing or cannot
        be parsed are skipped and reported.

    Raises:
        imaplib.IMAP4.error: If the mailbox cannot be selected or a command
            fails.
    """
    try:
        status, _ = imap.select(mailbox=mailboxname, readonly=True)
        if status != 'OK':
            # select() reports failure through its status instead of raising.
            raise imaplib.IMAP4.error(
                'cannot select mailbox %r' % (mailboxname,))
        _report('searching...')
        resp, items = imap.search(None, 'All')
        # items[0] may be None or empty when nothing matched.
        id_tokens = (items[0] or b'').split()
        _report('%d messages in  %s' % (len(id_tokens), mailboxname))
        _report('fetching headers...')
        raw_headers = []
        if id_tokens:  # an empty message set is not a valid FETCH argument
            resp, data = imap.fetch(b','.join(id_tokens),
                                    "(BODY.PEEK[HEADER.FIELDS (DATE)])")
            # Header literals arrive as (envelope, payload) tuples; the
            # closing ')' items between them are plain strings.
            raw_headers = [part[1] for part in data
                           if isinstance(part, tuple)]
        _report('%d  headers fetched from  %s'
                % (len(raw_headers), mailboxname))
        _report('reformatting dates...')
        dates = []
        for raw in raw_headers:
            # True only on Python 3, where imaplib hands back bytes.
            if isinstance(raw, bytes) and not isinstance(raw, str):
                raw = raw.decode('ascii', 'replace')
            dates.append(' '.join(raw.split()[1:5]))
    finally:
        disconnect(imap)

    _report('converting dates...')
    dtimes = []
    skipped = 0
    for text in dates:
        if not text:
            # No Date header at all.
            skipped += 1
            continue
        try:
            parsed = dateutil.parser.parse(text)
        except (ValueError, OverflowError):
            skipped += 1
        else:
            dtimes.append(parsed)
    if skipped:
        _report('%d messages skipped (missing or unparseable Date header)'
                % skipped)
    dnums = mdates.date2num(dtimes)
    return dnums
if __name__ == '__main__':
    # Mailbox to scan and account to log in as; edit both for your setup.
    mailboxname = 'everything'
    imap = connect('<your.email.here>')
    dnums = getdates(imap, mailboxname)
    print('plotting...'); sys.stdout.flush()
    plotdates(dnums)
    # plotdates() only calls draw(); without show() a plain
    # "python script.py" run exits before the figure is displayed.
    pl.show()
view raw gistfile1.py hosted with ❤ by GitHub

1 comment:

  1. I finally pushed my version to a gist! Can you see that bimodal hourly distribution?

    https://gist.github.com/3613794

    Best regards.

    ReplyDelete