diff options
Diffstat (limited to 'module/network/Keepalive.py')
-rw-r--r-- | module/network/Keepalive.py | 617 |
1 files changed, 617 insertions, 0 deletions
diff --git a/module/network/Keepalive.py b/module/network/Keepalive.py new file mode 100644 index 000000000..68abe087d --- /dev/null +++ b/module/network/Keepalive.py @@ -0,0 +1,617 @@ +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +# This file is part of urlgrabber, a high-level cross-protocol url-grabber +# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko + +"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive. + +>>> import urllib2 +>>> from keepalive import HTTPHandler +>>> keepalive_handler = HTTPHandler() +>>> opener = urllib2.build_opener(keepalive_handler) +>>> urllib2.install_opener(opener) +>>> +>>> fo = urllib2.urlopen('http://www.python.org') + +If a connection to a given host is requested, and all of the existing +connections are still in use, another connection will be opened. If +the handler tries to use an existing connection but it fails in some +way, it will be closed and removed from the pool. + +To remove the handler, simply re-run build_opener with no arguments, and +install that opener. + +You can explicitly close connections by using the close_connection() +method of the returned file-like object (described below) or you can +use the handler methods: + + close_connection(host) + close_all() + open_connections() + +NOTE: using the close_connection and close_all methods of the handler +should be done with care when using multiple threads. + * there is nothing that prevents another thread from creating new + connections immediately after connections are closed + * no checks are done to prevent in-use connections from being closed + +>>> keepalive_handler.close_all() + +EXTRA ATTRIBUTES AND METHODS + + Upon a status of 200, the object returned has a few additional + attributes and methods, which should not be used if you want to + remain consistent with the normal urllib2-returned objects: + + close_connection() - close the connection to the host + readlines() - you know, readlines() + status - the return status (ie 404) + reason - english translation of status (ie 'File not found') + + If you want the best of both worlds, use this inside an + AttributeError-catching try: + + >>> try: status = fo.status + >>> except AttributeError: status = None + + Unfortunately, these are ONLY there if status == 200, so it's not + easy to distinguish between non-200 responses. The reason is that + urllib2 tries to do clever things with error codes 301, 302, 401, + and 407, and it wraps the object upon return. + + For python versions earlier than 2.4, you can avoid this fancy error + handling by setting the module-level global HANDLE_ERRORS to zero. + You see, prior to 2.4, it's the HTTP Handler's job to determine what + to handle specially, and what to just pass up. HANDLE_ERRORS == 0 + means "pass everything up". In python 2.4, however, this job no + longer belongs to the HTTP Handler and is now done by a NEW handler, + HTTPErrorProcessor. Here's the bottom line: + + python version < 2.4 + HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as + errors + HANDLE_ERRORS == 0 pass everything up, error processing is + left to the calling code + python version >= 2.4 + HANDLE_ERRORS == 1 pass up 200, treat the rest as errors + HANDLE_ERRORS == 0 (default) pass everything up, let the + other handlers (specifically, + HTTPErrorProcessor) decide what to do + + In practice, setting the variable either way makes little difference + in python 2.4, so for the most consistent behavior across versions, + you probably just want to use the defaults, which will give you + exceptions on errors. + +""" + +# $Id: keepalive.py,v 1.16 2006/09/22 00:58:05 mstenner Exp $ + +import urllib2 +import httplib +import socket +import thread + +DEBUG = None + +import sslfactory + +import sys +if sys.version_info < (2, 4): HANDLE_ERRORS = 1 +else: HANDLE_ERRORS = 0 + +class ConnectionManager: + """ + The connection manager must be able to: + * keep track of all existing + """ + def __init__(self): + self._lock = thread.allocate_lock() + self._hostmap = {} # map hosts to a list of connections + self._connmap = {} # map connections to host + self._readymap = {} # map connection to ready state + + def add(self, host, connection, ready): + self._lock.acquire() + try: + if not self._hostmap.has_key(host): self._hostmap[host] = [] + self._hostmap[host].append(connection) + self._connmap[connection] = host + self._readymap[connection] = ready + finally: + self._lock.release() + + def remove(self, connection): + self._lock.acquire() + try: + try: + host = self._connmap[connection] + except KeyError: + pass + else: + del self._connmap[connection] + del self._readymap[connection] + self._hostmap[host].remove(connection) + if not self._hostmap[host]: del self._hostmap[host] + finally: + self._lock.release() + + def set_ready(self, connection, ready): + try: self._readymap[connection] = ready + except KeyError: pass + + def get_ready_conn(self, host): + conn = None + self._lock.acquire() + try: + if self._hostmap.has_key(host): + for c in self._hostmap[host]: + if self._readymap[c]: + self._readymap[c] = 0 + conn = c + break + finally: + self._lock.release() + return conn + + def get_all(self, host=None): + if host: + return list(self._hostmap.get(host, [])) + else: + return dict(self._hostmap) + +class KeepAliveHandler: + def __init__(self): + self._cm = ConnectionManager() + + #### Connection Management + def open_connections(self): + """return a list of connected hosts and the number of connections + to each. [('foo.com:80', 2), ('bar.org', 1)]""" + return [(host, len(li)) for (host, li) in self._cm.get_all().items()] + + def close_connection(self, host): + """close connection(s) to <host> + host is the host:port spec, as in 'www.cnn.com:8080' as passed in. + no error occurs if there is no connection to that host.""" + for h in self._cm.get_all(host): + self._cm.remove(h) + h.close() + + def close_all(self): + """close all open connections""" + for host, conns in self._cm.get_all().items(): + for h in conns: + self._cm.remove(h) + h.close() + + def _request_closed(self, request, host, connection): + """tells us that this request is now closed and the the + connection is ready for another request""" + self._cm.set_ready(connection, 1) + + def _remove_connection(self, host, connection, close=0): + if close: connection.close() + self._cm.remove(connection) + + #### Transaction Execution + def do_open(self, req): + host = req.get_host() + if not host: + raise urllib2.URLError('no host given') + + try: + h = self._cm.get_ready_conn(host) + while h: + r = self._reuse_connection(h, req, host) + + # if this response is non-None, then it worked and we're + # done. Break out, skipping the else block. + if r: break + + # connection is bad - possibly closed by server + # discard it and ask for the next free connection + h.close() + self._cm.remove(h) + h = self._cm.get_ready_conn(host) + else: + # no (working) free connections were found. Create a new one. + h = self._get_connection(host) + if DEBUG: DEBUG.info("creating new connection to %s (%d)", + host, id(h)) + self._cm.add(host, h, 0) + self._start_transaction(h, req) + r = h.getresponse() + except (socket.error, httplib.HTTPException), err: + raise urllib2.URLError(err) + + # if not a persistent connection, don't try to reuse it + if r.will_close: self._cm.remove(h) + + if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason) + r._handler = self + r._host = host + r._url = req.get_full_url() + r._connection = h + r.code = r.status + r.headers = r.msg + r.msg = r.reason + + if r.status == 200 or not HANDLE_ERRORS: + return r + else: + return self.parent.error('http', req, r, + r.status, r.msg, r.headers) + + def _reuse_connection(self, h, req, host): + """start the transaction with a re-used connection + return a response object (r) upon success or None on failure. + This DOES not close or remove bad connections in cases where + it returns. However, if an unexpected exception occurs, it + will close and remove the connection before re-raising. + """ + try: + self._start_transaction(h, req) + r = h.getresponse() + # note: just because we got something back doesn't mean it + # worked. We'll check the version below, too. + except (socket.error, httplib.HTTPException): + r = None + except: + # adding this block just in case we've missed + # something we will still raise the exception, but + # lets try and close the connection and remove it + # first. We previously got into a nasty loop + # where an exception was uncaught, and so the + # connection stayed open. On the next try, the + # same exception was raised, etc. The tradeoff is + # that it's now possible this call will raise + # a DIFFERENT exception + if DEBUG: DEBUG.error("unexpected exception - closing " + \ + "connection to %s (%d)", host, id(h)) + self._cm.remove(h) + h.close() + raise + + if r is None or r.version == 9: + # httplib falls back to assuming HTTP 0.9 if it gets a + # bad header back. This is most likely to happen if + # the socket has been closed by the server since we + # last used the connection. + if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)", + host, id(h)) + r = None + else: + if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h)) + + return r + + def _start_transaction(self, h, req): + try: + if req.has_data(): + data = req.get_data() + h.putrequest('POST', req.get_selector(), skip_accept_encoding=1) + if not req.headers.has_key('Content-type'): + h.putheader('Content-type', + 'application/x-www-form-urlencoded') + if not req.headers.has_key('Content-length'): + h.putheader('Content-length', '%d' % len(data)) + else: + h.putrequest('GET', req.get_selector(), skip_accept_encoding=1) + except (socket.error, httplib.HTTPException), err: + raise urllib2.URLError(err) + + for args in self.parent.addheaders: + h.putheader(*args) + for k, v in req.headers.items(): + h.putheader(k, v) + h.endheaders() + if req.has_data(): + h.send(data) + + def _get_connection(self, host): + return NotImplementedError + +class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler): + def __init__(self): + KeepAliveHandler.__init__(self) + + def http_open(self, req): + return self.do_open(req) + + def _get_connection(self, host): + return HTTPConnection(host) + +class HTTPSHandler(KeepAliveHandler, urllib2.HTTPSHandler): + def __init__(self, ssl_factory=None): + KeepAliveHandler.__init__(self) + if not ssl_factory: + ssl_factory = sslfactory.get_factory() + self._ssl_factory = ssl_factory + + def https_open(self, req): + return self.do_open(req) + + def _get_connection(self, host): + return self._ssl_factory.get_https_connection(host) + +class HTTPResponse(httplib.HTTPResponse): + # we need to subclass HTTPResponse in order to + # 1) add readline() and readlines() methods + # 2) add close_connection() methods + # 3) add info() and geturl() methods + + # in order to add readline(), read must be modified to deal with a + # buffer. example: readline must read a buffer and then spit back + # one line at a time. The only real alternative is to read one + # BYTE at a time (ick). Once something has been read, it can't be + # put back (ok, maybe it can, but that's even uglier than this), + # so if you THEN do a normal read, you must first take stuff from + # the buffer. + + # the read method wraps the original to accomodate buffering, + # although read() never adds to the buffer. + # Both readline and readlines have been stolen with almost no + # modification from socket.py + + + def __init__(self, sock, debuglevel=0, strict=0, method=None): + if method: # the httplib in python 2.3 uses the method arg + httplib.HTTPResponse.__init__(self, sock, debuglevel, method) + else: # 2.2 doesn't + httplib.HTTPResponse.__init__(self, sock, debuglevel) + self.fileno = sock.fileno + self.code = None + self._rbuf = '' + self._rbufsize = 8096 + self._handler = None # inserted by the handler later + self._host = None # (same) + self._url = None # (same) + self._connection = None # (same) + + _raw_read = httplib.HTTPResponse.read + + def close(self): + if self.fp: + self.fp.close() + self.fp = None + if self._handler: + self._handler._request_closed(self, self._host, + self._connection) + + def close_connection(self): + self._handler._remove_connection(self._host, self._connection, close=1) + self.close() + + def info(self): + return self.headers + + def geturl(self): + return self._url + + def read(self, amt=None): + # the _rbuf test is only in this first if for speed. It's not + # logically necessary + if self._rbuf and not amt is None: + L = len(self._rbuf) + if amt > L: + amt -= L + else: + s = self._rbuf[:amt] + self._rbuf = self._rbuf[amt:] + return s + + s = self._rbuf + self._raw_read(amt) + self._rbuf = '' + return s + + def readline(self, limit=-1): + data = "" + i = self._rbuf.find('\n') + while i < 0 and not (0 < limit <= len(self._rbuf)): + new = self._raw_read(self._rbufsize) + if not new: break + i = new.find('\n') + if i >= 0: i = i + len(self._rbuf) + self._rbuf = self._rbuf + new + if i < 0: i = len(self._rbuf) + else: i = i+1 + if 0 <= limit < len(self._rbuf): i = limit + data, self._rbuf = self._rbuf[:i], self._rbuf[i:] + return data + + def readlines(self, sizehint = 0): + total = 0 + list = [] + while 1: + line = self.readline() + if not line: break + list.append(line) + total += len(line) + if sizehint and total >= sizehint: + break + return list + + +class HTTPConnection(httplib.HTTPConnection): + # use the modified response class + response_class = HTTPResponse + +class HTTPSConnection(httplib.HTTPSConnection): + response_class = HTTPResponse + +######################################################################### +##### TEST FUNCTIONS +######################################################################### + +def error_handler(url): + global HANDLE_ERRORS + orig = HANDLE_ERRORS + keepalive_handler = HTTPHandler() + opener = urllib2.build_opener(keepalive_handler) + urllib2.install_opener(opener) + pos = {0: 'off', 1: 'on'} + for i in (0, 1): + print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i) + HANDLE_ERRORS = i + try: + fo = urllib2.urlopen(url) + foo = fo.read() + fo.close() + try: status, reason = fo.status, fo.reason + except AttributeError: status, reason = None, None + except IOError, e: + print " EXCEPTION: %s" % e + raise + else: + print " status = %s, reason = %s" % (status, reason) + HANDLE_ERRORS = orig + hosts = keepalive_handler.open_connections() + print "open connections:", hosts + keepalive_handler.close_all() + +def continuity(url): + import md5 + format = '%25s: %s' + + # first fetch the file with the normal http handler + opener = urllib2.build_opener() + urllib2.install_opener(opener) + fo = urllib2.urlopen(url) + foo = fo.read() + fo.close() + m = md5.new(foo) + print format % ('normal urllib', m.hexdigest()) + + # now install the keepalive handler and try again + opener = urllib2.build_opener(HTTPHandler()) + urllib2.install_opener(opener) + + fo = urllib2.urlopen(url) + foo = fo.read() + fo.close() + m = md5.new(foo) + print format % ('keepalive read', m.hexdigest()) + + fo = urllib2.urlopen(url) + foo = '' + while 1: + f = fo.readline() + if f: foo = foo + f + else: break + fo.close() + m = md5.new(foo) + print format % ('keepalive readline', m.hexdigest()) + +def comp(N, url): + print ' making %i connections to:\n %s' % (N, url) + + sys.stdout.write(' first using the normal urllib handlers') + # first use normal opener + opener = urllib2.build_opener() + urllib2.install_opener(opener) + t1 = fetch(N, url) + print ' TIME: %.3f s' % t1 + + sys.stdout.write(' now using the keepalive handler ') + # now install the keepalive handler and try again + opener = urllib2.build_opener(HTTPHandler()) + urllib2.install_opener(opener) + t2 = fetch(N, url) + print ' TIME: %.3f s' % t2 + print ' improvement factor: %.2f' % (t1/t2, ) + +def fetch(N, url, delay=0): + import time + lens = [] + starttime = time.time() + for i in range(N): + if delay and i > 0: time.sleep(delay) + fo = urllib2.urlopen(url) + foo = fo.read() + fo.close() + lens.append(len(foo)) + diff = time.time() - starttime + + j = 0 + for i in lens[1:]: + j = j + 1 + if not i == lens[0]: + print "WARNING: inconsistent length on read %i: %i" % (j, i) + + return diff + +def test_timeout(url): + global DEBUG + dbbackup = DEBUG + class FakeLogger: + def debug(self, msg, *args): print msg % args + info = warning = error = debug + DEBUG = FakeLogger() + print " fetching the file to establish a connection" + fo = urllib2.urlopen(url) + data1 = fo.read() + fo.close() + + i = 20 + print " waiting %i seconds for the server to close the connection" % i + while i > 0: + sys.stdout.write('\r %2i' % i) + sys.stdout.flush() + time.sleep(1) + i -= 1 + sys.stderr.write('\r') + + print " fetching the file a second time" + fo = urllib2.urlopen(url) + data2 = fo.read() + fo.close() + + if data1 == data2: + print ' data are identical' + else: + print ' ERROR: DATA DIFFER' + + DEBUG = dbbackup + + +def test(url, N=10): + print "checking error hander (do this on a non-200)" + try: error_handler(url) + except IOError, e: + print "exiting - exception will prevent further tests" + sys.exit() + print + print "performing continuity test (making sure stuff isn't corrupted)" + continuity(url) + print + print "performing speed comparison" + comp(N, url) + print + print "performing dropped-connection check" + test_timeout(url) + +if __name__ == '__main__': + import time + import sys + try: + N = int(sys.argv[1]) + url = sys.argv[2] + except: + print "%s <integer> <url>" % sys.argv[0] + else: + test(url, N) |