# boiscrape.py

#!/usr/bin/python

"""Bank of Ireland statement scraper. Version 20110824.

Copyright (C) 2011 Wilmer van der Gaast <wilmer@gaast.net>

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
"""

import csv
import datetime
import getopt
import getpass
import mechanize
import os
import re
import subprocess
import sys
import tempfile
import time
import xml.sax.saxutils

HTML_CONV = ["w3m", "-dump"]

def get_flag(flag, default=None):
	"""Look up a single command-line option in sys.argv.

	Args:
		flag: Option letter without the leading dash (e.g. "u").
		default: Value returned when the option is absent. When it is an
			int, a supplied option value is converted to int as well.

	Returns:
		The option's argument (a string, or an int when `default` is an
		int), True for an option given without an argument (such as -l),
		`default` when the option is absent and a default was supplied,
		or False otherwise.

	Raises:
		getopt.GetoptError: If sys.argv contains an unknown option.
		ValueError: If `default` is an int and the supplied value is not.
	"""
	# "a:" is part of the spec so the documented -a ACCOUNT option is
	# accepted instead of being rejected as unknown.
	opts = getopt.getopt(sys.argv[1:], "u:b:p:o:f:t:a:l")[0]
	wanted = "-%s" % flag
	values = [value for name, value in opts if name == wanted]
	if not values:
		return default if default is not None else False
	if values[0] == "":
		return True
	# Command-line values are strings; hand numeric options back as numbers
	# so callers can use them with %d-style formats.
	if isinstance(default, int) and not isinstance(default, bool):
		return int(values[0])
	return values[0]

def login(uid, dob, phone):
	br = mechanize.Browser()
	br.open("https://www.365online.com/servlet/Dispatcher/login.htm")
	br.select_form("frm_login")
	br["USER"] = uid
	if "date of birth" in br.response().read():
		br["Pass_Val_1"] = dob
	else:
		br["Pass_Val_1"] = phone
	br.submit()

	m = re.search(r'<tr>\s*<td class="(pin|astrix)">.*?</tr>',
	              br.response().read(), re.S)

	assert m is not None, "Authentication failure"

	digits = re.findall(r"<td.*?</td>", m.group(0), re.S)

	print "Please give the following password digits: ",
	for digit in digits:
		if "astrix" in digit:
			print "X",
		else:
			print "_",
	print

	passwd = getpass.getpass("Password digits: ")
	n = 0
	br.select_form("frm_login")
	for digit in passwd:
		n += 1
		br["PIN_Val_%d" % n] = digit

	br.submit()
	html = br.response().read()
	assert "Customer Login" not in html, "PIN failure"
	m = re.search(r'top\.location\.href="(.*365OnlineSecurityInformation.*)?"', html)
	if m:
		security_ack(br, m.group(1))
	return br

def security_ack(br, url):
	"""Show the bank's security notice to the user and acknowledge it.

	Opens `url`, locates the frame whose tag mentions "content_frame",
	renders that frame's page with the HTML_CONV command, waits for the
	user to press <Enter> and then follows the first link whose text is
	"[IMG]".

	Args:
		br: The mechanize browser holding the session.
		url: Address of the security information frameset.

	Returns:
		True once the acknowledgement link has been followed.

	Raises:
		AssertionError: If the content frame URL or the acknowledgement
			link cannot be found.
	"""
	br.open(url)
	html = br.response().read()
	fetch = None
	for frame in re.findall("<frame\\b.*?>", html):
		if "content_frame" in frame:
			m = re.search("src=\"(.*?)\"", frame)
			fetch = m and m.group(1)

	# Raised explicitly (not via assert) so the check survives python -O.
	if not fetch:
		raise AssertionError("Could not find security URL to be fetched")
	br.open(fetch)

	# mkstemp creates the file itself, which avoids the name race of
	# mktemp. The file is closed before the viewer reads it and removed
	# even if the viewer cannot be started.
	fd, fn = tempfile.mkstemp(suffix=".html")
	try:
		with os.fdopen(fd, "w") as tmp:
			tmp.write(br.response().read())
		p = subprocess.Popen(HTML_CONV + [fn])
		p.communicate()
	finally:
		os.unlink(fn)

	print
	print
	raw_input("The bank wants you to read the above. Please press <Enter> to continue.")

	def findimglink(link):
		# There's an easier way to find this one, but ideally I'd do something
		# better than searching for text=[IMG]
		return link.text == "[IMG]"

	# Only the first matching link is followed.
	for l in br.links(predicate=findimglink):
		br.follow_link(l)
		return True

	raise AssertionError("Could not automatically acknowledge security message.")

if not get_flag("u"):
	print "boiscrape - Scrape bank statements from 365online.com"
	print
	print "Usage: boiscrape.py -u UID -b DATE OF BIRTH -p PHONE# 4 DIGITS"
	print "                    [-o OUTFILE] [-a ACCOUNT (1, 2, 3, ...)]"
	print "               then -l  # for the standard since-last-statement overview"
	print "               or   -f FROM_DATE -t TO_DATE"
	print
	print "It will ask you for the three PIN number digits of the day, for security"
	print "reasons this info can't be given on the command line. Account number is"
	print "not the full number but 1 for the first account in your list, 2 for the"
	print "second, etc. (Functionality untested!)"
	print
	print "Dates to be formatted in Irish way (DD/MM/YYYY)."
	print
	print "For custom statements, BoI is currently NOT filling in the balance column"
	print "correctly! This is not a bug in this scraper."
	print
	print "Every now and then (monthly?) BoI wants you to read a security message"
	print "before showing your account data. This script tries to handle this but may"
	print "fail, so if it's not working try logging in from your browser once."
	sys.exit(1)

# -a selects the account by its position in the list (1 = first). Option
# values arrive as strings, so convert before using the %03d format below;
# doing it before logging in makes a bad value fail early.
account = int(get_flag("a", 1))

br = login(get_flag("u"), get_flag("b"), get_flag("p"))

# base_url keeps a literal %d placeholder (written %%d here) that is filled
# in with the page number when the pages are fetched.
if get_flag("l"):
	base_url = (
		"https://www.365online.com/servlet/Dispatcher/txlist.htm?"
		"page=%%d&row=%03d" % account)
else:
	d = datetime.datetime.now()
	# Default range: from the same day one year ago up to yesterday.
	try:
		year_ago = d.replace(year=d.year - 1)
	except ValueError:
		# 29 February does not exist in the previous year; use the 28th.
		year_ago = d.replace(year=d.year - 1, day=28)
	deffrom = "%02d/%02d/%04d" % (year_ago.day, year_ago.month, year_ago.year)
	d -= datetime.timedelta(days=1)
	defto = "%02d/%02d/%04d" % (d.day, d.month, d.year)
	base_url = (
		"https://www.365online.com/servlet/Dispatcher/cs_statement.htm?"
		"page=%%d&row=%03d&to=%s&from=%s&filt=All." %
		(account, get_flag("t", defto), get_flag("f", deffrom)))

def __num(s):
	"""Convert a scraped table cell to a float; an empty cell counts as 0."""
	if s:
		return float(s)
	return 0

# Fetch the statement page by page and collect the table rows in `out`.
pg = 1
out = []
dehtml = re.compile(r"<.*?>", re.S)
bal = None
while True:
	url = base_url % pg
	br.open(url)

	html = br.response().read()
	# Seriously. On one of the pages Transactions is mistyped.
	m = re.search(r'<table summary="Trans?actions.*?</table>', html, re.S)
	if m is None:
		# Stop with a readable message instead of failing on m.group().
		sys.exit("Could not find the transactions table on page %d." % pg)
	rows = re.findall(r"<tr>\s*<td.*?</tr>", m.group(0), re.S)
	for row in rows:
		# Strip markup and entities from every cell of the row.
		cells = []
		for cell in re.findall(r"<td.*?>(.*?)</td>", row):
			cell = dehtml.sub("", cell)
			cell = xml.sax.saxutils.unescape(cell, {"&nbsp;": " "})
			cells.append(cell.strip())
		if not cells or not "".join(cells):
			continue
		if cells[1]:
			lastdate = cells[1]
		else:
			# Don't leave any blank as we're reversing the table.
			cells[1] = lastdate
		# Seed the running balance from column 5 of the first row, then
		# adjust it by columns 3 and 4 of every row (presumably the two
		# amount columns and the balance column -- verify against the page).
		if bal is None:
			bal = __num(cells[5])
		bal = bal + __num(cells[3]) - __num(cells[4])
		# First and last column are empty. (Rounded corners or sth?)
		out.append(cells[1:6])

	# The page contains "cont_but_next" while more pages follow.
	if "cont_but_next" not in html:
		break
	pg += 1

if not out:
	print "No output. :-("
	sys.exit(1)

if get_flag("l"):
	# The since-last-statement overview is in reverse order.
	out.reverse()
	out.insert(0, [lastdate, "Balance forward", None, None, "%.2f" % bal])
else:
	# The "custom statement" is not, so just add a first row.
	bal = __num(out[0][4]) - __num(out[0][3]) + __num(out[0][2])
	out.insert(0, [out[0][0], "Balance forward", None, None, "%.2f" % bal])

if get_flag("o"):
	outfile = open(get_flag("o"), "w")
else:
	outfile = sys.stdout
cw = csv.writer(outfile, delimiter=";")
for row in out:
	cw.writerow(row)

# Generated by GNU Enscript 1.6.5.90.