#!/usr/bin/env python # -*- coding: utf-8 -*- """ This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, see . @author: mkaay """ from urllib import urlencode from urllib2 import Request from urllib2 import OpenerDirector from urllib2 import HTTPHandler from urllib2 import HTTPSHandler from urllib2 import HTTPDefaultErrorHandler from urllib2 import HTTPErrorProcessor from urllib2 import ProxyHandler from urllib2 import URLError from urllib2 import _parse_proxy from httplib import HTTPConnection from httplib import HTTPResponse from httplib import responses as HTTPStatusCodes from httplib import ResponseNotReady from httplib import BadStatusLine from httplib import CannotSendRequest from CookieJar import CookieJar from CookieRedirectHandler import CookieRedirectHandler import socket import socks from MultipartPostHandler import MultipartPostHandler DEBUG = 0 HANDLE_ERRORS = 1 class PyLoadHTTPResponse(HTTPResponse): def __init__(self, sock, debuglevel=0, strict=0, method=None): if method: # the httplib in python 2.3 uses the method arg HTTPResponse.__init__(self, sock, debuglevel, method) else: # 2.2 doesn't HTTPResponse.__init__(self, sock, debuglevel) self.fileno = sock.fileno self._rbuf = '' self._rbufsize = 8096 self._handler = None # inserted by the handler later self._host = None # (same) self._url = None # (same) _raw_read = HTTPResponse.read def close_connection(self): self.close() self._handler._remove_connection(self._host, close=1) def info(self): return self.msg def geturl(self): return self._url def read(self, amt=None): # the _rbuf test is only in this first if for speed. It's not # logically necessary if self._rbuf and not amt is None: L = len(self._rbuf) if amt > L: amt -= L else: s = self._rbuf[:amt] self._rbuf = self._rbuf[amt:] return s s = self._rbuf + self._raw_read(amt) self._rbuf = '' return s def readline(self, limit=-1): i = self._rbuf.find('\n') while i < 0 and not (0 < limit <= len(self._rbuf)): new = self._raw_read(self._rbufsize) if not new: break i = new.find('\n') if i >= 0: i = i + len(self._rbuf) self._rbuf = self._rbuf + new if i < 0: i = len(self._rbuf) else: i += 1 if 0 <= limit < len(self._rbuf): i = limit data, self._rbuf = self._rbuf[:i], self._rbuf[i:] return data def readlines(self, sizehint = 0): total = 0 list = [] while 1: line = self.readline() if not line: break list.append(line) total += len(line) if sizehint and total >= sizehint: break return list @property def code(self): return self.status def getcode(self): return self.status class PyLoadHTTPConnection(HTTPConnection): sourceAddress = ('', 0) socksProxy = None response_class = PyLoadHTTPResponse def connect(self): if self.socksProxy: self.sock = socks.socksocket() t = _parse_proxy(self.socksProxy[1]) self.sock.setproxy(self.socksProxy[0], addr=t[3].split(":")[0], port=int(t[3].split(":")[1]), username=t[1], password=t[2]) else: self.sock = socket.socket() self.sock.settimeout(30) self.sock.bind(self.sourceAddress) self.sock.connect((self.host, self.port)) try: if self._tunnel_host: self._tunnel() except: #python2.5 pass class PyLoadHTTPHandler(HTTPHandler): sourceAddress = ('', 0) socksProxy = None def __init__(self): self._connections = {} def setInterface(self, interface): if interface is None: interface = "" self.sourceAddress = (interface, 0) def setSocksProxy(self, *t): self.socksProxy = t def close_connection(self, host): """close connection to host is the host:port spec, as in 'www.cnn.com:8080' as passed in. no error occurs if there is no connection to that host.""" self._remove_connection(host, close=1) def open_connections(self): """return a list of connected hosts""" return self._connections.keys() def close_all(self): """close all open connections""" for host, conn in self._connections.items(): conn.close() self._connections = {} def _remove_connection(self, host, close=0): if self._connections.has_key(host): if close: self._connections[host].close() del self._connections[host] def _start_connection(self, h, req): data = "" if req.has_data(): data = req.get_data() h.putrequest('POST', req.get_selector(), skip_accept_encoding=1) if not req.headers.has_key('Content-type'): h.putheader('Content-type', 'application/x-www-form-urlencoded') if not req.headers.has_key('Content-length'): h.putheader('Content-length', '%d' % len(data)) else: h.putrequest('GET', req.get_selector(), skip_accept_encoding=1) for args in self.parent.addheaders: h.putheader(*args) for k, v in req.headers.items(): h.putheader(k, v) h.endheaders() if req.has_data(): h.send(data) def do_open(self, http_class, req): host = req.get_host() if not host: raise URLError('no host given') need_new_connection = 1 h = self._connections.get(host) if not h is None: try: self._start_connection(h, req) except socket.error: r = None except BadStatusLine: r = None except CannotSendRequest: r = None else: try: r = h.getresponse() except ResponseNotReady: r = None except BadStatusLine: r = None if r is None or r.version == 9: # httplib falls back to assuming HTTP 0.9 if it gets a # bad header back. This is most likely to happen if # the socket has been closed by the server since we # last used the connection. if DEBUG: print "failed to re-use connection to %s" % host h.close() else: if DEBUG: print "re-using connection to %s" % host need_new_connection = 0 if need_new_connection: if DEBUG: print "creating new connection to %s" % host h = http_class(host) h.sourceAddress = self.sourceAddress h.socksProxy = self.socksProxy self._connections[host] = h self._start_connection(h, req) r = h.getresponse() # if not a persistent connection, don't try to reuse it if r.will_close: self._remove_connection(host) if DEBUG: print "STATUS: %s, %s" % (r.status, r.reason) r._handler = self r._host = host r._url = req.get_full_url() if r.status in (200, 206) or not HANDLE_ERRORS: return r else: return self.parent.error('http', req, r, r.status, r.reason, r.msg) def http_open(self, req): return self.do_open(PyLoadHTTPConnection, req) class HTTPBase(): def __init__(self, interface=None, proxies={}): self.followRedirect = True self.interface = interface self.proxies = proxies self.size = None self.referer = None self.userAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.10" self.handler = PyLoadHTTPHandler() self.handler.setInterface(interface) if proxies.has_key("socks5"): self.handler.setSocksProxy(socks.PROXY_TYPE_SOCKS5, proxies["socks5"]) elif proxies.has_key("socks4"): self.handler.setSocksProxy(socks.PROXY_TYPE_SOCKS4, proxies["socks4"]) self.cookieJar = CookieJar() self.opener = None self.debug = DEBUG def getOpener(self, cookies=True): if not self.opener: self.opener = self.createOpener(cookies) return self.opener def createOpener(self, cookies=True): opener = OpenerDirector() opener.add_handler(self.handler) opener.add_handler(MultipartPostHandler()) opener.add_handler(HTTPSHandler()) opener.add_handler(HTTPDefaultErrorHandler()) opener.add_handler(HTTPErrorProcessor()) opener.add_handler(CookieRedirectHandler(self.cookieJar, self.followRedirect)) if self.proxies.has_key("http") or self.proxies.has_key("https"): opener.add_handler(ProxyHandler(self.proxies)) opener.version = self.userAgent opener.addheaders[0] = ("User-Agent", self.userAgent) opener.addheaders.append(("Accept", "*/*")) opener.addheaders.append(("Accept-Language", "en-US,en")) opener.addheaders.append(("Accept-Encoding", "gzip, deflate")) opener.addheaders.append(("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7")) return opener def createRequest(self, url, get={}, post={}, referer=None, customHeaders={}): if get: if isinstance(get, dict): get = urlencode(get) url = "%s?%s" % (url, get) req = Request(url) if post: if isinstance(post, dict): post = urlencode(post) req.add_data(post) if referer: req.add_header("Referer", referer) for key, val in customHeaders.iteritems(): req.add_header(key, val) return req def getResponse(self, url, get={}, post={}, referer=None, cookies=True, customHeaders={}): req = self.createRequest(url, get, post, referer, customHeaders) opener = self.getOpener(cookies) if self.debug: print "[HTTP] ----" print "[HTTP] creating request" print "[HTTP] URL:", url print "[HTTP] GET" if get: for key, value in get.iteritems(): print "[HTTP] \t", key, ":", value if post: print "[HTTP] POST" for key, value in post.iteritems(): print "[HTTP] \t", key, ":", value print "[HTTP] headers" for key, value in opener.addheaders: print "[HTTP] \t", key, ":", value for key, value in req.headers.iteritems(): print "[HTTP] \t", key, ":", value print "[HTTP] cookies" if self.cookieJar: from pprint import pprint pprint(self.cookieJar._cookies) print "[HTTP] ----" resp = opener.open(req) resp.getcode = lambda: resp.code if self.debug: print "[HTTP] ----" print "[HTTP] got response" print "[HTTP] status:", resp.getcode() print "[HTTP] headers" for key, value in resp.info().dict.iteritems(): print "[HTTP] \t", key, ":", value print "[HTTP] cookies" if self.cookieJar: from pprint import pprint pprint(self.cookieJar._cookies) print "[HTTP] ----" try: self.size = int(resp.info()["Content-Length"]) except: #chunked transfer pass return resp def closeAll(self): """ closes all connections """ if hasattr(self, "handler"): self.handler.close_all() def clean(self): """ cleanup """ self.closeAll() if hasattr(self, "opener"): del self.opener if hasattr(self, "handler"): del self.handler if __name__ == "__main__": base = HTTPBase() resp = base.getResponse("http://python.org/") print resp.read()