Diffstat (limited to 'pyload/network/HTTPRequest.py')
-rw-r--r--  pyload/network/HTTPRequest.py  324
1 file changed, 324 insertions(+), 0 deletions(-)
diff --git a/pyload/network/HTTPRequest.py b/pyload/network/HTTPRequest.py
new file mode 100644
index 000000000..ebf2c3132
--- /dev/null
+++ b/pyload/network/HTTPRequest.py
@@ -0,0 +1,324 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+ @author: RaNaN
+"""
+
+import pycurl
+
+from codecs import getincrementaldecoder, lookup, BOM_UTF8
+from urllib import quote, urlencode
+from httplib import responses
+from logging import getLogger
+from cStringIO import StringIO
+
+from pyload.plugins.Base import Abort
+
+def myquote(url):
+ return quote(url.encode('utf8') if isinstance(url, unicode) else url, safe="%/:=&?~#+!$,;'@()*[]")
+
+def myurlencode(data):
+ data = dict(data)
+ return urlencode(dict((x.encode('utf8') if isinstance(x, unicode) else x, \
+ y.encode('utf8') if isinstance(y, unicode) else y ) for x, y in data.iteritems()))
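+# Illustrative sketch of what the two helpers produce (example values assumed,
+# not part of the module):
+#   myquote(u"http://example.com/p\xe4th")  ->  "http://example.com/p%C3%A4th"
+#   myurlencode({"key": u"v\xe4lue"})       ->  "key=v%C3%A4lue"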
+
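+# status codes treated as failures; note that 404 is deliberately excluded
+# (see the comment in verifyHeader)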
+bad_headers = range(400, 404) + range(405, 418) + range(500, 506)
+
+class BadHeader(Exception):
+ def __init__(self, code, content=""):
+ Exception.__init__(self, "Bad server response: %s %s" % (code, responses.get(int(code), "Unknown Header")))
+ self.code = code
+ self.content = content
+
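+# Hypothetical caller-side sketch: code using this class would typically catch
+# BadHeader around load() and branch on the code (retryLater() is made up here):
+#   try:
+#       html = req.load(url)
+#   except BadHeader, e:
+#       if e.code == 503:
+#           retryLater()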
+
+class HTTPRequest():
+ def __init__(self, cookies=None, options=None):
+ self.c = pycurl.Curl()
+ self.rep = StringIO()
+
+ self.cj = cookies #cookiejar
+
+ self.lastURL = None
+ self.lastEffectiveURL = None
+ self.abort = False
+ self.code = 0 # last http code
+
+ self.header = ""
+
+ self.headers = [] #temporary request header
+
+ self.initHandle()
+ self.setInterface(options)
+ self.setOptions(options)
+
+ self.c.setopt(pycurl.WRITEFUNCTION, self.write)
+ self.c.setopt(pycurl.HEADERFUNCTION, self.writeHeader)
+
+ self.log = getLogger("log")
+
+
+ def initHandle(self):
+        """ sets common options on the curl handle """
+ self.c.setopt(pycurl.FOLLOWLOCATION, 1)
+ self.c.setopt(pycurl.MAXREDIRS, 5)
+ self.c.setopt(pycurl.CONNECTTIMEOUT, 30)
+ self.c.setopt(pycurl.NOSIGNAL, 1)
+ self.c.setopt(pycurl.NOPROGRESS, 1)
+ if hasattr(pycurl, "AUTOREFERER"):
+ self.c.setopt(pycurl.AUTOREFERER, 1)
+ self.c.setopt(pycurl.SSL_VERIFYPEER, 0)
+        # low-speed interval: detects a dead connection, but can also abort the download if the hoster stalls it
+ self.c.setopt(pycurl.LOW_SPEED_TIME, 45)
+ self.c.setopt(pycurl.LOW_SPEED_LIMIT, 5)
+
+ #self.c.setopt(pycurl.VERBOSE, 1)
+
+ self.c.setopt(pycurl.USERAGENT,
+ "Mozilla/5.0 (Windows NT 6.1; Win64; x64;en; rv:5.0) Gecko/20110619 Firefox/5.0")
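+        # version_info()[7] is libcurl's zlib version string; when it is
+        # non-empty the build can decompress, so compression may be requested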
+ if pycurl.version_info()[7]:
+ self.c.setopt(pycurl.ENCODING, "gzip, deflate")
+ self.c.setopt(pycurl.HTTPHEADER, ["Accept: */*",
+ "Accept-Language: en-US,en",
+ "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
+ "Connection: keep-alive",
+ "Keep-Alive: 300",
+ "Expect:"])
+
+ def setInterface(self, options):
+
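+        # expected shape of `options`, inferred from the accesses below:
+        #   {"interface": str or None, "proxies": dict or None, "ipv6": bool}
+        # plus optional "auth" ("user:pass") and "timeout" (seconds) entries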
+ interface, proxy, ipv6 = options["interface"], options["proxies"], options["ipv6"]
+
+ if interface and interface.lower() != "none":
+ self.c.setopt(pycurl.INTERFACE, str(interface))
+
+ if proxy:
+ if proxy["type"] == "socks4":
+ self.c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS4)
+ elif proxy["type"] == "socks5":
+ self.c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
+ else:
+ self.c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_HTTP)
+
+ self.c.setopt(pycurl.PROXY, str(proxy["address"]))
+ self.c.setopt(pycurl.PROXYPORT, proxy["port"])
+
+ if proxy["username"]:
+ self.c.setopt(pycurl.PROXYUSERPWD, str("%s:%s" % (proxy["username"], proxy["password"])))
+
+ if ipv6:
+ self.c.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_WHATEVER)
+ else:
+ self.c.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
+
+ if "auth" in options:
+ self.c.setopt(pycurl.USERPWD, str(options["auth"]))
+
+ if "timeout" in options:
+ self.c.setopt(pycurl.LOW_SPEED_TIME, options["timeout"])
+
+ def setOptions(self, options):
+        """ sets the same options as available in pycurl """
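+        # e.g. an entry {"TIMEOUT": 60} becomes c.setopt(pycurl.TIMEOUT, 60);
+        # keys without a matching pycurl constant are silently ignored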
+ for k, v in options.iteritems():
+ if hasattr(pycurl, k):
+ self.c.setopt(getattr(pycurl, k), v)
+
+ def addCookies(self):
+        """ copy cookies from the curl handle into cj """
+ if self.cj:
+ self.cj.addCookies(self.c.getinfo(pycurl.INFO_COOKIELIST))
+
+ def getCookies(self):
+        """ load cookies from cj into the curl handle """
+        if self.cj:
+            for c in self.cj.getCookies():
+                self.c.setopt(pycurl.COOKIELIST, c)
+
+ def clearCookies(self):
+ self.c.setopt(pycurl.COOKIELIST, "")
+
+ def setRequestContext(self, url, get, post, referer, cookies, multipart=False):
+ """ sets everything needed for the request """
+
+ url = myquote(url)
+
+ if get:
+ get = urlencode(get)
+ url = "%s?%s" % (url, get)
+
+ self.c.setopt(pycurl.URL, url)
+ self.lastURL = url
+
+ if post:
+ self.c.setopt(pycurl.POST, 1)
+ if not multipart:
+ if type(post) == unicode:
+ post = str(post) #unicode not allowed
+ elif type(post) == str:
+ pass
+ else:
+ post = myurlencode(post)
+
+ self.c.setopt(pycurl.POSTFIELDS, post)
+ else:
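+                # multipart/form-data: pycurl.HTTPPOST expects a list of
+                # (name, value) tuples with str values, hence the utf8 encode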
+ post = [(x, y.encode('utf8') if type(y) == unicode else y ) for x, y in post.iteritems()]
+ self.c.setopt(pycurl.HTTPPOST, post)
+ else:
+ self.c.setopt(pycurl.POST, 0)
+
+ if referer and self.lastURL:
+ self.c.setopt(pycurl.REFERER, str(self.lastURL))
+
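+        # an empty COOKIEFILE activates libcurl's in-memory cookie engine
+        # without reading or writing any file on disk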
+ if cookies:
+ self.c.setopt(pycurl.COOKIEFILE, "")
+ self.c.setopt(pycurl.COOKIEJAR, "")
+ self.getCookies()
+
+
+ def load(self, url, get={}, post={}, referer=True, cookies=True, just_header=False, multipart=False, decode=False):
+        """ loads and returns a given page """
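+        # usage sketch (assumes an options dict shaped as described in
+        # setInterface):
+        #   req = HTTPRequest(options=opts)
+        #   html = req.load("http://example.com/", get={"q": "test"}, decode=True)
+        #   headers = req.load("http://example.com/", just_header=True)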
+
+ self.setRequestContext(url, get, post, referer, cookies, multipart)
+
+ # TODO: use http/rfc message instead
+ self.header = ""
+
+ self.c.setopt(pycurl.HTTPHEADER, self.headers)
+
+ if just_header:
+ self.c.setopt(pycurl.FOLLOWLOCATION, 0)
+ self.c.setopt(pycurl.NOBODY, 1) #TODO: nobody= no post?
+
+            # override the implicit HEAD request; we want the usual GET/POST verb
+ if post:
+ self.c.setopt(pycurl.CUSTOMREQUEST, "POST")
+ else:
+ self.c.setopt(pycurl.CUSTOMREQUEST, "GET")
+
+ try:
+ self.c.perform()
+ rep = self.header
+ finally:
+ self.c.setopt(pycurl.FOLLOWLOCATION, 1)
+ self.c.setopt(pycurl.NOBODY, 0)
+ self.c.unsetopt(pycurl.CUSTOMREQUEST)
+
+ else:
+ self.c.perform()
+ rep = self.getResponse()
+
+ self.c.setopt(pycurl.POSTFIELDS, "")
+ self.lastEffectiveURL = self.c.getinfo(pycurl.EFFECTIVE_URL)
+ self.code = self.verifyHeader()
+
+ self.addCookies()
+
+ if decode:
+ rep = self.decodeResponse(rep)
+
+ return rep
+
+ def verifyHeader(self):
+        """ raise an exception on bad headers """
+ code = int(self.c.getinfo(pycurl.RESPONSE_CODE))
+ # TODO: raise anyway to be consistent, also rename exception
+ if code in bad_headers:
+            # note: 404 is absent from bad_headers and thus will NOT raise
+ raise BadHeader(code, self.getResponse())
+ return code
+
+ def checkHeader(self):
+        """ check if the header indicates failure """
+ return int(self.c.getinfo(pycurl.RESPONSE_CODE)) not in bad_headers
+
+ def getResponse(self):
+ """ retrieve response from string io """
+ if self.rep is None: return ""
+ value = self.rep.getvalue()
+ self.rep.close()
+ self.rep = StringIO()
+ return value
+
+ def decodeResponse(self, rep):
+ """ decode with correct encoding, relies on header """
+ header = self.header.splitlines()
+ encoding = "utf8" # default encoding
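+        # e.g. "Content-Type: text/html; charset=ISO-8859-1" is normalized
+        # below to "content-type:text/html;charset=iso-8859-1" -> "iso-8859-1"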
+
+ for line in header:
+ line = line.lower().replace(" ", "")
+ if not line.startswith("content-type:") or\
+ ("text" not in line and "application" not in line):
+ continue
+
+            _, delimiter, charset = line.rpartition("charset=")
+            if delimiter:
+ charset = charset.split(";")
+ if charset:
+ encoding = charset[0]
+
+ try:
+ #self.log.debug("Decoded %s" % encoding )
+ if lookup(encoding).name == 'utf-8' and rep.startswith(BOM_UTF8):
+ encoding = 'utf-8-sig'
+
+ decoder = getincrementaldecoder(encoding)("replace")
+ rep = decoder.decode(rep, True)
+
+ #TODO: html_unescape as default
+
+ except LookupError:
+ self.log.debug("No Decoder found for %s" % encoding)
+ except Exception:
+ self.log.debug("Error when decoding string from %s." % encoding)
+
+ return rep
+
+ def write(self, buf):
+ """ writes response """
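+        # keep at most ~1 MB in memory; a larger response is dumped to
+        # response.dump and the transfer is aborted with an exception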
+ if self.rep.tell() > 1000000 or self.abort:
+ rep = self.getResponse()
+ if self.abort: raise Abort()
+ f = open("response.dump", "wb")
+ f.write(rep)
+ f.close()
+ raise Exception("Loaded Url exceeded limit")
+
+ self.rep.write(buf)
+
+ def writeHeader(self, buf):
+ """ writes header """
+ self.header += buf
+
+ def putHeader(self, name, value):
+ self.headers.append("%s: %s" % (name, value))
+
+ def clearHeaders(self):
+ self.headers = []
+
+ def close(self):
+ """ cleanup, unusable after this """
+ self.rep.close()
+ if hasattr(self, "cj"):
+ del self.cj
+ if hasattr(self, "c"):
+ self.c.close()
+ del self.c
+
+if __name__ == "__main__":
+    url = "http://pyload.org"
+    # setInterface/setOptions expect a dict; HTTPRequest() without options
+    # would fail on options["interface"], so pass a minimal one
+    c = HTTPRequest(options={"interface": None, "proxies": None, "ipv6": False})
+    print c.load(url)
+