[YoutubeCom] fix #2390

author: Nitzo <nitzo2001@yahoo.com> 2016-04-17 22:01:38 +0200
committer: Nitzo <nitzo2001@yahoo.com> 2016-04-17 22:01:38 +0200
commit: 1382946ed80105e8e6615142d73a35a9f7efc80c (patch)
tree: af59294a752f378a8660560e687811963ee052d6 /module
parent: [misc] Fix DB.store() bug that causes a dictionary to be converted to an array (diff)
download: pyload-1382946ed80105e8e6615142d73a35a9f7efc80c.tar.xz
1 files changed, 363 insertions, 18 deletions
diff --git a/module/plugins/hoster/YoutubeCom.py b/module/plugins/hoster/YoutubeCom.py
index 6c423b84a..972a86000 100644
--- a/module/plugins/hoster/YoutubeCom.py
+++ b/module/plugins/hoster/YoutubeCom.py
@@ -1,18 +1,46 @@
 # -*- coding: utf-8 -*-
 
+import operator
 import os
 import re
 import subprocess
+import time
 import urllib
 
+from module.plugins.Plugin import Abort
+from module.network.HTTPRequest import HTTPRequest
+from module.network.CookieJar import CookieJar
 from module.plugins.internal.Hoster import Hoster
-from module.plugins.internal.misc import html_unescape, replace_patterns, which
+from module.plugins.internal.misc import html_unescape, json, replace_patterns, which
+
+
+class BIGHTTPRequest(HTTPRequest):
+    """Overcome HTTPRequest's load() size limit to allow loading very big
+    web pages by overriding HTTPRequest's write() function"""
+    def __init__(self, cookies=None, options=None, limit=1000000):  #@TODO: Add 'limit' parameter to HTTPRequest in v0.4.10
+        self.limit = limit  #: Compared against self.rep.tell() in write(), a falsy value disables the size check
+        HTTPRequest.__init__(self, cookies=cookies, options=options)
+
+    def write(self, buf):
+        """ writes response, raises Abort when aborted or Exception once the size limit is exceeded """
+        if (self.limit and self.rep.tell() > self.limit) or self.abort:
+            rep = self.getResponse()
+            if self.abort: raise Abort()
+            f = open("response.dump", "wb")  #: Dump what was loaded so far before giving up
+            try:
+                f.write(rep)
+            finally:
+                f.close()  #: Release the file handle even when writing the dump fails
+            raise Exception("Loaded Url exceeded limit")
+
+        self.rep.write(buf)
+
 
 
 class YoutubeCom(Hoster):
     __name__    = "YoutubeCom"
     __type__    = "hoster"
-    __version__ = "0.50"
+    __version__ = "0.51"
     __status__  = "testing"
 
     __pattern__ = r'https?://(?:[^/]*\.)?(youtu\.be/|youtube\.com/watch\?(?:.*&)?v=)\w+'
@@ -27,8 +55,9 @@ class YoutubeCom(Hoster):
 
     __description__ = """Youtube.com hoster plugin"""
     __license__     = "GPLv3"
-    __authors__     = [("spoob", "spoob@pyload.org"),
-                       ("zoidberg", "zoidberg@mujmail.cz")]
+    __authors__     = [("spoob",     "spoob@pyload.org"          ),
+                       ("zoidberg",  "zoidberg@mujmail.cz"       ),
+                       ("GammaC0de", "nitzo2001[AT]yahoo[DOT]com")]
 
 
     URL_REPLACEMENTS = [(r'youtu\.be/', 'youtube.com/watch?v=')]
@@ -59,20 +88,89 @@ class YoutubeCom(Hoster):
                101: (".webm", 640 , 360 , 4 , True ),
                102: (".webm", 1280, 720 , 8 , True )}
 
+    def _decrypt_signature(self, encrypted_sig):
+        """Turn the encrypted 's' field into a working signature using the decode function found in the page's JS player"""
+        try:
+            player_url = json.loads(re.search(r'"assets":.+?"js":\s*("[^"]+")', self.data).group(1))
+        except (AttributeError, IndexError):
+            self.fail(_("Player URL not found"))
+
+        if player_url.startswith("//"):
+            player_url = 'https:' + player_url
+
+        if not player_url.endswith(".js"):
+            self.fail(_("Unsupported player type %s") % player_url)
+        #: Decode maps are cached through self.db, the whole cache is reset when the plugin version changes
+        cache_info = self.db.retrieve("cache")
+        cache_dirty = False
+
+        if cache_info is None or 'version' not in cache_info or cache_info['version'] != self.__version__:
+            cache_info = {'version': self.__version__,
+                          'cache'  : {}}
+            cache_dirty = True
+        #: A cached map is only used during the 24 hours following its creation
+        if player_url in cache_info['cache'] and time.time() < cache_info['cache'][player_url]['time'] + 24 * 60 * 60:
+            self.log_debug("Using cached decode function to decrypt the URL")
+            decrypt_func = lambda s: ''.join(s[_i] for _i in cache_info['cache'][player_url]['decrypt_map'])
+            decrypted_sig = decrypt_func(encrypted_sig)
+
+        else:
+            player_data = self.load(player_url)
+            try:
+                function_name = re.search(r'\.sig\|\|([a-zA-Z0-9$]+)\(', player_data).group(1)
+
+            except (AttributeError, IndexError):
+                self.fail(_("Signature decode function name not found"))
+
+            try:
+                jsi = JSInterpreter(player_data)
+                decrypt_func = lambda s: jsi.extract_function(function_name)([s])
+                #: NOTE(review): the map below is built for this signature's length only - confirm lengths never differ per player
+                #: Since Youtube just scrambles the order of the characters in the signature
+                #: and does not change any byte value, we can store just a transformation map as a cached function
+                decrypt_map = [ord(c) for c in decrypt_func(''.join(map(unichr, xrange(len(encrypted_sig)))))]
+                cache_info['cache'][player_url] = {'decrypt_map': decrypt_map,
+                                                   'time'       : time.time()}
+                cache_dirty = True
+
+                decrypted_sig = decrypt_func(encrypted_sig)
+
+            except (JSInterpreterError, AssertionError), e:
+                self.log_error(_("Signature decode failed"), e)
+                self.fail(e.message)
+
+        #: Remove old records from cache (iterate over a copy of the keys since entries are popped inside the loop)
+        for _c in list(cache_info['cache']):
+            if time.time() >= cache_info['cache'][_c]['time'] + 24 * 60 * 60:
+                cache_info['cache'].pop(_c, None)
+                cache_dirty = True
+
+        if cache_dirty:
+            self.db.store("cache", cache_info)
+
+        return decrypted_sig
+
 
     def setup(self):
         self.resume_download = True
-        self.multiDL        = True
+        self.multiDL         = True
+        #: Best effort: close the current HTTP handle before it gets replaced below, errors are ignored on purpose
+        try:
+            self.req.http.close()
+        except Exception:
+            pass
+        #: Use a request object with a size limit of 2000000 (BIGHTTPRequest defaults to 1000000)
+        self.req.http = BIGHTTPRequest(cookies=CookieJar(None), options=self.pyload.requestFactory.getOptions(), limit=2000000)
 
 
     def process(self, pyfile):
         pyfile.url = replace_patterns(pyfile.url, self.URL_REPLACEMENTS)
-        html       = self.load(pyfile.url)
+        self.data  = self.load(pyfile.url)
 
-        if re.search(r'<div id="player-unavailable" class="\s*player-width player-height\s*">', html):
+        if re.search(r'<div id="player-unavailable" class="\s*player-width player-height\s*">', self.data):
             self.offline()
 
-        if "We have been receiving a large volume of requests from your network." in html:
+        if "We have been receiving a large volume of requests from your network." in self.data:
             self.temp_offline()
 
         #: Get config
@@ -95,10 +193,14 @@ class YoutubeCom(Hoster):
             desired_fmt = 0
 
         #: Parse available streams
-        streams = re.search(r'"url_encoded_fmt_stream_map":"(.+?)",', html).group(1)
+        streams = re.search(r'"url_encoded_fmt_stream_map":"(.+?)",', self.data).group(1)
         streams = [x.split('\u0026') for x in streams.split(',')]
         streams = [dict((y.split('=', 1)) for y in x) for x in streams]
-        streams = [(int(x['itag']), urllib.unquote(x['url'])) for x in streams]
+        streams = [(int(x['itag']),
+                    urllib.unquote(x['url']),
+                    x.get('s', x.get('sig', None)),
+                    True if 's' in x else False)
+                   for x in streams]
 
         # self.log_debug("Found links: %s" % streams)
 
@@ -111,7 +213,7 @@ class YoutubeCom(Hoster):
         if not streams:
             self.fail(_("No available stream meets your preferences"))
 
-        fmt_dict = dict([x for x in streams if self.formats[x[0]][4] == use3d] or streams)
+        fmt_dict = dict([(x[0], x[1:]) for x in streams if self.formats[x[0]][4] == use3d] or streams)
 
         self.log_debug("DESIRED STREAM: ITAG:%d (%s) %sfound, %sallowed" %
                       (desired_fmt, "%s %dx%d Q:%d 3D:%s" % self.formats[desired_fmt],
@@ -119,26 +221,38 @@ class YoutubeCom(Hoster):
 
         #: Return fmt nearest to quality index
         if desired_fmt in fmt_dict and allowed(desired_fmt):
-            fmt = desired_fmt
+            choosen_fmt = desired_fmt
         else:
             sel  = lambda x: self.formats[x][3]  #: Select quality index
             comp = lambda x, y: abs(sel(x) - sel(y))
 
             self.log_debug("Choosing nearest fmt: %s" % [(x, allowed(x), comp(x, desired_fmt)) for x in fmt_dict.keys()])
 
-            fmt = reduce(lambda x, y: x if comp(x, desired_fmt) <= comp(y, desired_fmt) and
-                         sel(x) > sel(y) else y, fmt_dict.keys())
+            choosen_fmt = reduce(lambda x, y: x if comp(x, desired_fmt) <= comp(y, desired_fmt) and
+                                                   sel(x) > sel(y) else y, fmt_dict.keys())
+
+        self.log_debug("Chosen fmt: %s" % choosen_fmt)
 
-        self.log_debug("Chosen fmt: %s" % fmt)
+        url = fmt_dict[choosen_fmt][0]
 
-        url = fmt_dict[fmt]
+        if fmt_dict[choosen_fmt][1]:
+            if fmt_dict[choosen_fmt][2]:
+                signature = self._decrypt_signature(fmt_dict[choosen_fmt][1])
+
+            else:
+                signature = fmt_dict[choosen_fmt][1]
+
+            url += "&signature=" + signature
+
+        if "&ratebypass=" not in url:
+            url += "&ratebypass=yes"
 
         self.log_debug("URL: %s" % url)
 
         #: Set file name
-        file_suffix = self.formats[fmt][0] if fmt in self.formats else ".flv"
+        file_suffix = self.formats[choosen_fmt][0] if choosen_fmt in self.formats else ".flv"
         file_name_pattern = '<meta name="title" content="(.+?)">'
-        name = re.search(file_name_pattern, html).group(1).replace("/", "")
+        name = re.search(file_name_pattern, self.data).group(1).replace("/", "")
 
         #: Cleaning invalid characters from the file name
         name = name.encode('ascii', 'replace')
@@ -172,3 +286,234 @@ class YoutubeCom(Hoster):
                 filename])
 
             self.remove(inputfile, trash=False)
+
+
+"""Credit for the awesome piece of code below goes to the 'youtube_dl' project, kudos!"""
+class JSInterpreterError(Exception):
+    """Error raised by JSInterpreter when JS code cannot be located or interpreted"""
+
+
+class JSInterpreter(object):
+    """Minimal interpreter for a small subset of JavaScript, working directly on the JS source text given as 'code'"""
+    def __init__(self, code, objects=None):
+        self._OPERATORS = [  #: Binary operators, tried in this order by interpret_expression()
+            ('|', operator.or_),
+            ('^', operator.xor),
+            ('&', operator.and_),
+            ('>>', operator.rshift),
+            ('<<', operator.lshift),
+            ('-', operator.sub),
+            ('+', operator.add),
+            ('%', operator.mod),
+            ('/', operator.truediv),
+            ('*', operator.mul),
+        ]
+        self._ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in self._OPERATORS]
+        self._ASSIGN_OPERATORS.append(('=', lambda cur, right: right))
+        self._VARNAME_PATTERN = r'[a-zA-Z_$][a-zA-Z_$0-9]*'
+        if objects is None:
+            objects = {}
+        self.code = code
+        self._functions = {}  #: Functions already extracted from the code, by name
+        self._objects = objects  #: Objects already extracted from the code, by name
+
+    def interpret_statement(self, stmt, local_vars, allow_recursion=100):
+        """Interpret a single statement, returns (value, should_abort) where should_abort is True for a 'return' statement"""
+        if allow_recursion < 0:
+            raise JSInterpreterError('Recursion limit reached')
+
+        should_abort = False
+        stmt = stmt.lstrip()
+        stmt_m = re.match(r'var\s', stmt)
+        if stmt_m:
+            expr = stmt[len(stmt_m.group(0)):]  #: Drop the 'var' keyword and interpret the rest as an expression
+        else:
+            return_m = re.match(r'return(?:\s+|$)', stmt)
+            if return_m:
+                expr = stmt[len(return_m.group(0)):]
+                should_abort = True
+            else:
+                # Try interpreting it as an expression
+                expr = stmt
+
+        v = self.interpret_expression(expr, local_vars, allow_recursion)
+        return v, should_abort
+
+    def interpret_expression(self, expr, local_vars, allow_recursion):
+        """Evaluate an expression against local_vars and return its value, raises JSInterpreterError on unsupported syntax"""
+        expr = expr.strip()
+        if expr == '':  # Empty expression
+            return None
+        #: Leading parenthesized sub-expression: evaluate it and splice its JSON-encoded value back into the expression
+        if expr.startswith('('):
+            parens_count = 0
+            for m in re.finditer(r'[()]', expr):
+                if m.group(0) == '(':
+                    parens_count += 1
+                else:
+                    parens_count -= 1
+                    if parens_count == 0:
+                        sub_expr = expr[1:m.start()]
+                        sub_result = self.interpret_expression(sub_expr, local_vars, allow_recursion)
+                        remaining_expr = expr[m.end():].strip()
+                        if not remaining_expr:
+                            return sub_result
+                        else:
+                            expr = json.dumps(sub_result) + remaining_expr
+                        break
+            else:
+                raise JSInterpreterError('Premature end of parens in %r' % expr)
+        #: Assignment, either to an indexed element or to a plain variable
+        for op, opfunc in self._ASSIGN_OPERATORS:
+            m = re.match(r'(?x)(?P<out>%s)(?:\[(?P<index>[^\]]+?)\])?\s*%s(?P<expr>.*)$' % (self._VARNAME_PATTERN, re.escape(op)), expr)
+            if not m:
+                continue
+            right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion - 1)
+
+            if m.groupdict().get('index'):
+                lvar = local_vars[m.group('out')]
+                idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion)
+                assert isinstance(idx, int)
+                cur = lvar[idx]
+                val = opfunc(cur, right_val)
+                lvar[idx] = val
+                return val
+            else:
+                cur = local_vars.get(m.group('out'))
+                val = opfunc(cur, right_val)
+                local_vars[m.group('out')] = val
+                return val
+
+        if expr.isdigit():
+            return int(expr)
+
+        var_m = re.match(r'(?!if|return|true|false)(?P<name>%s)$' % self._VARNAME_PATTERN, expr)
+        if var_m:
+            return local_vars[var_m.group('name')]
+
+        try:
+            return json.loads(expr)
+        except ValueError:
+            pass
+        #: Member access or method call, on a local variable or on an object defined in the JS code
+        m = re.match(r'(?P<var>%s)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$' % self._VARNAME_PATTERN, expr)
+        if m:
+            variable = m.group('var')
+            member = m.group('member')
+            arg_str = m.group('args')
+
+            if variable in local_vars:
+                obj = local_vars[variable]
+            else:
+                if variable not in self._objects:
+                    self._objects[variable] = self.extract_object(variable)
+                obj = self._objects[variable]
+
+            if arg_str is None:
+                # Member access
+                if member == 'length':
+                    return len(obj)
+                return obj[member]
+
+            assert expr.endswith(')')
+            # Function call
+            if arg_str == '':
+                argvals = tuple()
+            else:
+                argvals = tuple([self.interpret_expression(v, local_vars, allow_recursion) for v in arg_str.split(',')])
+
+            if member == 'split':
+                assert argvals == ('',)
+                return list(obj)
+
+            if member == 'join':
+                assert len(argvals) == 1
+                return argvals[0].join(obj)
+
+            if member == 'reverse':
+                assert len(argvals) == 0
+                obj.reverse()
+                return obj
+
+            if member == 'slice':
+                assert len(argvals) == 1
+                return obj[argvals[0]:]
+
+            if member == 'splice':
+                assert isinstance(obj, list)
+                index, howMany = argvals
+                res = []
+                for _i in range(index, min(index + howMany, len(obj))):
+                    res.append(obj.pop(index))  #: Always pop at 'index', the following elements shift down after each pop
+                return res
+
+            return obj[member](argvals)
+        #: Indexed read of a local variable
+        m = re.match(r'(?P<in>%s)\[(?P<idx>.+)\]$' % self._VARNAME_PATTERN, expr)
+        if m:
+            val = local_vars[m.group('in')]
+            idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion - 1)
+            return val[idx]
+        #: Binary operator, both sides are interpreted recursively
+        for op, opfunc in self._OPERATORS:
+            m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
+            if not m:
+                continue
+
+            x, abort = self.interpret_statement(m.group('x'), local_vars, allow_recursion - 1)
+            if abort:
+                raise JSInterpreterError('Premature left-side return of %s in %r' % (op, expr))
+
+            y, abort = self.interpret_statement(m.group('y'), local_vars, allow_recursion - 1)
+            if abort:
+                raise JSInterpreterError('Premature right-side return of %s in %r' % (op, expr))
+
+            return opfunc(x, y)
+        #: Call of a function defined in the JS code, extracted on first use and kept in self._functions
+        m = re.match(r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]+)\)$' % self._VARNAME_PATTERN, expr)
+        if m:
+            fname = m.group('func')
+            argvals = tuple([int(v) if v.isdigit() else local_vars[v] for v in m.group('args').split(',')])
+            if fname not in self._functions:
+                self._functions[fname] = self.extract_function(fname)
+            return self._functions[fname](argvals)
+
+        raise JSInterpreterError('Unsupported JS expression %r' % expr)
+
+    def extract_object(self, objname):
+        """Return a dict of the functions defined in the JS object literal 'objname' (only function fields are supported)"""
+        obj_m = re.search(r'(?:var\s+)?%s\s*=\s*\{\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}(?:,\s*)?)*)\}\s*;'
+                          % re.escape(objname), self.code)
+        if obj_m is None:  #: Raise the interpreter's own error instead of an AttributeError on obj_m.group()
+            raise JSInterpreterError('Could not find JS object %r' % objname)
+        obj = {}
+        fields_m = re.finditer(r'(?P<key>[a-zA-Z$0-9]+)\s*:\s*function\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', obj_m.group('fields'))
+        for f in fields_m:
+            argnames = f.group('args').split(',')
+            obj[f.group('key')] = self.build_function(argnames, f.group('code'))
+        return obj
+
+    def extract_function(self, function_name):
+        """Locate the definition of 'function_name' in the code and return it as a callable, raises JSInterpreterError if not found"""
+        func_m = re.search(r'(?x)(?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s*\((?P<args>[^)]*)\)\s*\{(?P<code>[^}]+)\}'
+                           % (re.escape(function_name), re.escape(function_name), re.escape(function_name)), self.code)
+        if func_m is None:
+            raise JSInterpreterError('Could not find JS function %r' % function_name)
+        argnames = func_m.group('args').split(',')
+
+        return self.build_function(argnames, func_m.group('code'))
+
+    def call_function(self, function_name, *args):
+        """Extract 'function_name' from the code and call it with the given positional arguments"""
+        return self.extract_function(function_name)(args)
+
+    def build_function(self, argnames, code):
+        """Return a callable taking a sequence of argument values, it runs 'code' statement by statement up to the first 'return'"""
+        def resf(argvals):
+            local_vars = dict(zip(argnames, argvals))
+            for stmt in code.split(';'):
+                res, abort = self.interpret_statement(stmt, local_vars)
+                if abort:
+                    break
+            return res  #: Value of the last interpreted statement
+        return resf
+\ No newline at end of file
author	Nitzo <nitzo2001@yahoo.com>	2016-04-17 22:01:38 +0200
committer	Nitzo <nitzo2001@yahoo.com>	2016-04-17 22:01:38 +0200
commit	1382946ed80105e8e6615142d73a35a9f7efc80c (patch)
tree	af59294a752f378a8660560e687811963ee052d6 /module
parent	[misc] Fix DB.store() bug that causes a dictionary to be converted to an array (diff)
download	pyload-1382946ed80105e8e6615142d73a35a9f7efc80c.tar.xz