diff options
Diffstat (limited to 'pyload/network/HTTPRequest.py')
| -rw-r--r-- | pyload/network/HTTPRequest.py | 315 | 
1 file changed, 315 insertions, 0 deletions
# -*- coding: utf-8 -*-
"""
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License,
    or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
    See the GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, see <http://www.gnu.org/licenses/>.

    @author: RaNaN
"""

import pycurl

from codecs import getincrementaldecoder, lookup, BOM_UTF8
from urllib import quote, urlencode
from httplib import responses
from logging import getLogger
from cStringIO import StringIO

from pyload.plugins.Plugin import Abort, Fail

from pyload.utils import encode


def myquote(url):
    """Percent-quote *url* after byte-encoding it, keeping URL metacharacters intact."""
    return quote(encode(url), safe="%/:=&?~#+!$,;'@()*[]")


def myurlencode(data):
    """urlencode a mapping, byte-encoding every key and value first."""
    data = dict(data)
    # Build explicit (key, value) 2-tuples: the original passed two bare
    # expressions plus a generator to dict(), which is a SyntaxError.
    return urlencode(dict((encode(x), encode(y)) for x, y in data.iteritems()))

# Status codes that make verifyHeader() raise BadHeader.
# 404 is deliberately excluded and will NOT raise.
bad_headers = range(400, 404) + range(405, 418) + range(500, 506)


class BadHeader(Exception):
    """Server answered with a status code from `bad_headers`."""

    def __init__(self, code, content=""):
        # responses.get guards against codes missing from httplib.responses,
        # which would otherwise turn the error report into a KeyError.
        Exception.__init__(self, "Bad server response: %s %s"
                           % (code, responses.get(int(code), "Unknown")))
        self.code = code
        self.content = content


class HTTPRequest(object):
    """Thin wrapper around a single pycurl handle.

    Manages an external cookie jar, extra request headers, proxy/interface
    options and response decoding.  One instance drives exactly one curl
    handle; it is not safe to share between threads.
    """

    def __init__(self, cookies=None, options=None):
        self.c = pycurl.Curl()
        self.rep = StringIO()         # response body buffer

        self.cj = cookies             # cookie jar (may be None)

        self.lastURL = None           # previous URL, used as referer
        self.lastEffectiveURL = None  # final URL after redirects
        self.abort = False

        self.code = 0                 # last HTTP status code

        self.header = ""              # raw response header text

        self.headers = []             # temporary extra request headers

        self.initHandle()
        self.setInterface(options)

        self.c.setopt(pycurl.WRITEFUNCTION, self.write)
        self.c.setopt(pycurl.HEADERFUNCTION, self.writeHeader)

        self.log = getLogger("log")

    def initHandle(self):
        """Set common options on the curl handle."""
        self.c.setopt(pycurl.FOLLOWLOCATION, 1)
        self.c.setopt(pycurl.MAXREDIRS, 5)
        self.c.setopt(pycurl.CONNECTTIMEOUT, 30)
        self.c.setopt(pycurl.NOSIGNAL, 1)
        self.c.setopt(pycurl.NOPROGRESS, 1)
        if hasattr(pycurl, "AUTOREFERER"):
            self.c.setopt(pycurl.AUTOREFERER, 1)
        self.c.setopt(pycurl.SSL_VERIFYPEER, 0)
        # Give up on transfers slower than 5 B/s for 60 seconds.
        self.c.setopt(pycurl.LOW_SPEED_TIME, 60)
        self.c.setopt(pycurl.LOW_SPEED_LIMIT, 5)
        self.c.setopt(pycurl.USE_SSL, pycurl.CURLUSESSL_TRY)

        #self.c.setopt(pycurl.VERBOSE, 1)

        self.c.setopt(pycurl.USERAGENT,
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64;en; rv:5.0) Gecko/20110619 Firefox/5.0")
        if pycurl.version_info()[7]:  # libcurl compiled with compression support
            self.c.setopt(pycurl.ENCODING, "gzip, deflate")
        self.c.setopt(pycurl.HTTPHEADER, ["Accept: */*",
                                          "Accept-Language: en-US, en",
                                          "Accept-Charset: ISO-8859-1, utf-8;q=0.7,*;q=0.7",
                                          "Connection: keep-alive",
                                          "Keep-Alive: 300",
                                          "Expect:"])

    def setInterface(self, options):
        """Apply network options: interface, proxy, IP version, auth and timeout."""
        interface, proxy, ipv6 = options["interface"], options["proxies"], options["ipv6"]

        if interface and interface.lower() != "none":
            self.c.setopt(pycurl.INTERFACE, str(interface))

        if proxy:
            if proxy["type"] == "socks4":
                self.c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS4)
            elif proxy["type"] == "socks5":
                self.c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
            else:
                self.c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_HTTP)

            self.c.setopt(pycurl.PROXY, str(proxy["address"]))
            self.c.setopt(pycurl.PROXYPORT, proxy["port"])

            if proxy["username"]:
                self.c.setopt(pycurl.PROXYUSERPWD,
                              str("%s:%s" % (proxy["username"], proxy["password"])))

        if ipv6:
            self.c.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_WHATEVER)
        else:
            self.c.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)

        if "auth" in options:
            self.c.setopt(pycurl.USERPWD, str(options["auth"]))

        if "timeout" in options:
            self.c.setopt(pycurl.LOW_SPEED_TIME, options["timeout"])

    def addCookies(self):
        """Copy cookies from the curl handle into the cookie jar."""
        if self.cj:
            self.cj.addCookies(self.c.getinfo(pycurl.INFO_COOKIELIST))

    def getCookies(self):
        """Copy cookies from the cookie jar into the curl handle."""
        if self.cj:
            for c in self.cj.getCookies():
                self.c.setopt(pycurl.COOKIELIST, c)
        return

    def clearCookies(self):
        """Wipe all cookies stored in the curl handle."""
        self.c.setopt(pycurl.COOKIELIST, "")

    def setRequestContext(self, url, get, post, referer, cookies, multipart=False):
        """Prepare the curl handle for one request: URL, body, referer, cookies."""
        url = myquote(url)

        if get:
            get = urlencode(get)
            url = "%s?%s" % (url, get)

        self.c.setopt(pycurl.URL, url)
        # NOTE(review): this stores the URL on the curl object, while the
        # referer logic below reads self.lastURL (never assigned in this
        # file) -- presumably updated by an external caller; confirm before
        # changing either side.
        self.c.lastUrl = url

        if post:
            self.c.setopt(pycurl.POST, 1)
            if not multipart:
                if type(post) == unicode:
                    post = str(post) #unicode not allowed
                elif type(post) == str:
                    pass
                else:
                    post = myurlencode(post)

                self.c.setopt(pycurl.POSTFIELDS, post)
            else:
                # Original was missing the tuple's closing paren (SyntaxError);
                # the intent is a list of (field, encoded value) pairs.
                post = [(x, encode(y)) for x, y in post.iteritems()]
                self.c.setopt(pycurl.HTTPPOST, post)
        else:
            self.c.setopt(pycurl.POST, 0)

        if referer and self.lastURL:
            self.c.setopt(pycurl.REFERER, str(self.lastURL))

        if cookies:
            # Empty COOKIEFILE/COOKIEJAR enables curl's in-memory cookie
            # engine; then seed it from the jar.
            self.c.setopt(pycurl.COOKIEFILE, "")
            self.c.setopt(pycurl.COOKIEJAR, "")
            self.getCookies()

    def load(self, url, get=None, post=None, referer=True, cookies=True,
             just_header=False, multipart=False, decode=False,
             follow_location=True, save_cookies=True):
        """Perform the request and return the page body (or raw header).

        :param url: target URL (quoted via myquote)
        :param get: query parameters, mapping or None
        :param post: POST payload (str/unicode/mapping), or None for GET
        :param just_header: perform a body-less request and return the header
        :param decode: decode the body using the charset from the header
        :raises BadHeader: on status codes listed in bad_headers
        """
        # None sentinels avoid the shared mutable-default pitfall; behavior
        # is unchanged because {} and None are both falsy here.
        get = get or {}
        post = post or {}

        self.setRequestContext(url, get, post, referer, cookies, multipart)

        self.header = ""

        self.c.setopt(pycurl.HTTPHEADER, self.headers)

        if post:
            self.c.setopt(pycurl.POST, 1)
        else:
            self.c.setopt(pycurl.HTTPGET, 1)

        if not follow_location:
            self.c.setopt(pycurl.FOLLOWLOCATION, 0)

        if just_header:
            self.c.setopt(pycurl.NOBODY, 1)

        self.c.perform()
        rep = self.header if just_header else self.getResponse()

        # Restore handle defaults for subsequent requests.
        if not follow_location:
            self.c.setopt(pycurl.FOLLOWLOCATION, 1)

        if just_header:
            self.c.setopt(pycurl.NOBODY, 0)

        self.c.setopt(pycurl.POSTFIELDS, "")
        self.lastEffectiveURL = self.c.getinfo(pycurl.EFFECTIVE_URL)
        self.code = self.verifyHeader()

        if save_cookies:
            self.addCookies()

        if decode:
            rep = self.decodeResponse(rep)

        return rep

    def verifyHeader(self):
        """Raise BadHeader on fatal status codes; return the status code.

        404 is not in bad_headers and will NOT raise an exception.
        """
        code = int(self.c.getinfo(pycurl.RESPONSE_CODE))
        if code in bad_headers:
            raise BadHeader(code, self.getResponse())
        return code

    def checkHeader(self):
        """Return True unless the last response code indicates failure."""
        return int(self.c.getinfo(pycurl.RESPONSE_CODE)) not in bad_headers

    def getResponse(self):
        """Return the buffered response body and reset the buffer."""
        if self.rep is None:
            return ""
        else:
            value = self.rep.getvalue()
            self.rep.close()
            self.rep = StringIO()
            return value

    def decodeResponse(self, rep):
        """Decode *rep* using the charset announced in the response header."""
        header = self.header.splitlines()
        encoding = "utf8" # default encoding

        for line in header:
            line = line.lower().replace(" ", "")
            # Only trust charset information on textual content types.
            if not line.startswith("content-type:") or\
               ("text" not in line and "application" not in line):
                continue

            none, delimiter, charset = line.rpartition("charset=")
            if delimiter:
                charset = charset.split(";")
                if charset:
                    encoding = charset[0]

        try:
            # Strip a UTF-8 BOM transparently via the -sig codec.
            if lookup(encoding).name == 'utf-8' and rep.startswith(BOM_UTF8):
                encoding = 'utf-8-sig'

            decoder = getincrementaldecoder(encoding)("replace")
            rep = decoder.decode(rep, True)

            #TODO: html_unescape as default

        except LookupError:
            self.log.debug("No decoder found for %s" % encoding)

        except Exception:
            self.log.debug("Error when decoding string from %s." % encoding)

        return rep

    def write(self, buf):
        """curl WRITEFUNCTION: buffer *buf*, honoring abort flag and size cap."""
        if self.rep.tell() > 1000000 or self.abort:
            rep = self.getResponse()

            if self.abort:
                raise Abort

            # Keep the oversized payload on disk for debugging.
            with open("response.dump", "wb") as f:
                f.write(rep)
            raise Fail("Loaded url exceeded size limit")
        else:
            self.rep.write(buf)

    def writeHeader(self, buf):
        """curl HEADERFUNCTION: accumulate raw header lines."""
        self.header += buf

    def putHeader(self, name, value):
        """Queue an extra request header for subsequent load() calls."""
        self.headers.append("%s: %s" % (name, value))

    def clearHeaders(self):
        """Drop all queued extra request headers."""
        self.headers = []

    def close(self):
        """Cleanup; the instance is unusable after this."""
        self.rep.close()
        if hasattr(self, "cj"):
            del self.cj
        if hasattr(self, "c"):
            self.c.close()
            del self.c
