diff options
author | Nitzo <nitzo2001@yahoo.com> | 2016-04-17 22:01:38 +0200 |
---|---|---|
committer | Nitzo <nitzo2001@yahoo.com> | 2016-04-17 22:01:38 +0200 |
commit | 1382946ed80105e8e6615142d73a35a9f7efc80c (patch) | |
tree | af59294a752f378a8660560e687811963ee052d6 | |
parent | [misc] Fix DB.store() bug that causes a dictionary to be converted to an array (diff) | |
download | pyload-1382946ed80105e8e6615142d73a35a9f7efc80c.tar.xz |
[YoutubeCom] fix #2390
-rw-r--r-- | module/plugins/hoster/YoutubeCom.py | 381 |
1 files changed, 363 insertions, 18 deletions
diff --git a/module/plugins/hoster/YoutubeCom.py b/module/plugins/hoster/YoutubeCom.py index 6c423b84a..972a86000 100644 --- a/module/plugins/hoster/YoutubeCom.py +++ b/module/plugins/hoster/YoutubeCom.py @@ -1,18 +1,46 @@ # -*- coding: utf-8 -*- +import operator import os import re import subprocess +import time import urllib +from module.plugins.Plugin import Abort +from module.network.HTTPRequest import HTTPRequest +from module.network.CookieJar import CookieJar from module.plugins.internal.Hoster import Hoster -from module.plugins.internal.misc import html_unescape, replace_patterns, which +from module.plugins.internal.misc import html_unescape, json, replace_patterns, which + + +class BIGHTTPRequest(HTTPRequest): + """ + Overcome HTTPRequest's load() size limit to allow + loading very big web pages by overrding HTTPRequest's write() function + """ + def __init__(self, cookies=None, options=None, limit=1000000): #@TODO: Add 'limit' parameter to HTTPRequest in v0.4.10 + self.limit = limit + HTTPRequest.__init__(self, cookies=cookies, options=options) + + def write(self, buf): + """ writes response """ + if self.limit and self.rep.tell() > self.limit or self.abort: + rep = self.getResponse() + if self.abort: raise Abort() + f = open("response.dump", "wb") + f.write(rep) + f.close() + raise Exception("Loaded Url exceeded limit") + + self.rep.write(buf) + class YoutubeCom(Hoster): __name__ = "YoutubeCom" __type__ = "hoster" - __version__ = "0.50" + __version__ = "0.51" __status__ = "testing" __pattern__ = r'https?://(?:[^/]*\.)?(youtu\.be/|youtube\.com/watch\?(?:.*&)?v=)\w+' @@ -27,8 +55,9 @@ class YoutubeCom(Hoster): __description__ = """Youtube.com hoster plugin""" __license__ = "GPLv3" - __authors__ = [("spoob", "spoob@pyload.org"), - ("zoidberg", "zoidberg@mujmail.cz")] + __authors__ = [("spoob", "spoob@pyload.org" ), + ("zoidberg", "zoidberg@mujmail.cz" ), + ("GammaC0de", "nitzo2001[AT]yahoo[DOT]com")] URL_REPLACEMENTS = [(r'youtu\.be/', 'youtube.com/watch?v=')] @@ -59,20 +88,89 @@ class YoutubeCom(Hoster): 101: (".webm", 640 , 360 , 4 , True ), 102: (".webm", 1280, 720 , 8 , True )} + def _decrypt_signature(self, encrypted_sig): + """Turn the encrypted 's' field into a working signature""" + try: + player_url = json.loads(re.search(r'"assets":.+?"js":\s*("[^"]+")', self.data).group(1)) + except (AttributeError, IndexError): + self.fail(_("Player URL not found")) + + if player_url.startswith("//"): + player_url = 'https:' + player_url + + if not player_url.endswith(".js"): + self.fail(_("Unsupported player type %s") % player_url) + + cache_info = self.db.retrieve("cache") + cache_dirty = False + + if cache_info is None or 'version' not in cache_info or cache_info['version'] != self.__version__: + cache_info = {'version': self.__version__, + 'cache' : {}} + cache_dirty = True + + if player_url in cache_info['cache'] and time.time() < cache_info['cache'][player_url]['time'] + 24 * 60 * 60: + self.log_debug("Using cached decode function to decrypt the URL") + decrypt_func = lambda s: ''.join(s[_i] for _i in cache_info['cache'][player_url]['decrypt_map']) + decrypted_sig = decrypt_func(encrypted_sig) + + else: + player_data = self.load(player_url) + try: + function_name = re.search(r'\.sig\|\|([a-zA-Z0-9$]+)\(', player_data).group(1) + + except (AttributeError, IndexError): + self.fail(_("Signature decode function name not found")) + + try: + jsi = JSInterpreter(player_data) + decrypt_func = lambda s: jsi.extract_function(function_name)([s]) + + #: Since Youtube just scrambles the order of the characters in the signature + #: and does not change any byte value, we can store just a transformation map as a cached function + decrypt_map = [ord(c) for c in decrypt_func(''.join(map(unichr, xrange(len(encrypted_sig)))))] + cache_info['cache'][player_url] = {'decrypt_map': decrypt_map, + 'time' : time.time()} + cache_dirty = True + + decrypted_sig = decrypt_func(encrypted_sig) + + except (JSInterpreterError, AssertionError), e: + self.log_error(_("Signature decode failed"), e) + self.fail(e.message) + + #: Remove old records from cache + for _c in cache_info['cache'].iterkeys(): + if time.time() >= cache_info['cache'][_c]['time'] + 24 * 60 * 60: + cache_info['cache'].pop(_c, None) + cache_dirty = True + + if cache_dirty: + self.db.store("cache", cache_info) + + return decrypted_sig + def setup(self): self.resume_download = True - self.multiDL = True + self.multiDL = True + + try: + self.req.http.close() + except Exception: + pass + + self.req.http = BIGHTTPRequest(cookies=CookieJar(None), options=self.pyload.requestFactory.getOptions(), limit=2000000) def process(self, pyfile): pyfile.url = replace_patterns(pyfile.url, self.URL_REPLACEMENTS) - html = self.load(pyfile.url) + self.data = self.load(pyfile.url) - if re.search(r'<div id="player-unavailable" class="\s*player-width player-height\s*">', html): + if re.search(r'<div id="player-unavailable" class="\s*player-width player-height\s*">', self.data): self.offline() - if "We have been receiving a large volume of requests from your network." in html: + if "We have been receiving a large volume of requests from your network." in self.data: self.temp_offline() #: Get config @@ -95,10 +193,14 @@ class YoutubeCom(Hoster): desired_fmt = 0 #: Parse available streams - streams = re.search(r'"url_encoded_fmt_stream_map":"(.+?)",', html).group(1) + streams = re.search(r'"url_encoded_fmt_stream_map":"(.+?)",', self.data).group(1) streams = [x.split('\u0026') for x in streams.split(',')] streams = [dict((y.split('=', 1)) for y in x) for x in streams] - streams = [(int(x['itag']), urllib.unquote(x['url'])) for x in streams] + streams = [(int(x['itag']), + urllib.unquote(x['url']), + x.get('s', x.get('sig', None)), + True if 's' in x else False) + for x in streams] # self.log_debug("Found links: %s" % streams) @@ -111,7 +213,7 @@ class YoutubeCom(Hoster): if not streams: self.fail(_("No available stream meets your preferences")) - fmt_dict = dict([x for x in streams if self.formats[x[0]][4] == use3d] or streams) + fmt_dict = dict([(x[0], x[1:]) for x in streams if self.formats[x[0]][4] == use3d] or streams) self.log_debug("DESIRED STREAM: ITAG:%d (%s) %sfound, %sallowed" % (desired_fmt, "%s %dx%d Q:%d 3D:%s" % self.formats[desired_fmt], @@ -119,26 +221,38 @@ class YoutubeCom(Hoster): #: Return fmt nearest to quality index if desired_fmt in fmt_dict and allowed(desired_fmt): - fmt = desired_fmt + choosen_fmt = desired_fmt else: sel = lambda x: self.formats[x][3] #: Select quality index comp = lambda x, y: abs(sel(x) - sel(y)) self.log_debug("Choosing nearest fmt: %s" % [(x, allowed(x), comp(x, desired_fmt)) for x in fmt_dict.keys()]) - fmt = reduce(lambda x, y: x if comp(x, desired_fmt) <= comp(y, desired_fmt) and - sel(x) > sel(y) else y, fmt_dict.keys()) + choosen_fmt = reduce(lambda x, y: x if comp(x, desired_fmt) <= comp(y, desired_fmt) and + sel(x) > sel(y) else y, fmt_dict.keys()) + + self.log_debug("Chosen fmt: %s" % choosen_fmt) - self.log_debug("Chosen fmt: %s" % fmt) + url = fmt_dict[choosen_fmt][0] - url = fmt_dict[fmt] + if fmt_dict[choosen_fmt][1]: + if fmt_dict[choosen_fmt][2]: + signature = self._decrypt_signature(fmt_dict[choosen_fmt][1]) + + else: + signature = fmt_dict[choosen_fmt][1] + + url += "&signature=" + signature + + if "&ratebypass=" not in url: + url += "&ratebypass=yes" self.log_debug("URL: %s" % url) #: Set file name - file_suffix = self.formats[fmt][0] if fmt in self.formats else ".flv" + file_suffix = self.formats[choosen_fmt][0] if choosen_fmt in self.formats else ".flv" file_name_pattern = '<meta name="title" content="(.+?)">' - name = re.search(file_name_pattern, html).group(1).replace("/", "") + name = re.search(file_name_pattern, self.data).group(1).replace("/", "") #: Cleaning invalid characters from the file name name = name.encode('ascii', 'replace') @@ -172,3 +286,234 @@ class YoutubeCom(Hoster): filename]) self.remove(inputfile, trash=False) + + +"""Credit to this awesome piece of code below goes to the 'youtube_dl' project, kudos!""" +class JSInterpreterError(Exception): + pass + + +class JSInterpreter(object): + def __init__(self, code, objects=None): + self._OPERATORS = [ + ('|', operator.or_), + ('^', operator.xor), + ('&', operator.and_), + ('>>', operator.rshift), + ('<<', operator.lshift), + ('-', operator.sub), + ('+', operator.add), + ('%', operator.mod), + ('/', operator.truediv), + ('*', operator.mul), + ] + self._ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in self._OPERATORS] + self._ASSIGN_OPERATORS.append(('=', lambda cur, right: right)) + self._VARNAME_PATTERN = r'[a-zA-Z_$][a-zA-Z_$0-9]*' + + if objects is None: + objects = {} + self.code = code + self._functions = {} + self._objects = objects + + def interpret_statement(self, stmt, local_vars, allow_recursion=100): + if allow_recursion < 0: + raise JSInterpreterError('Recursion limit reached') + + should_abort = False + stmt = stmt.lstrip() + stmt_m = re.match(r'var\s', stmt) + if stmt_m: + expr = stmt[len(stmt_m.group(0)):] + + else: + return_m = re.match(r'return(?:\s+|$)', stmt) + if return_m: + expr = stmt[len(return_m.group(0)):] + should_abort = True + else: + # Try interpreting it as an expression + expr = stmt + + v = self.interpret_expression(expr, local_vars, allow_recursion) + return v, should_abort + + def interpret_expression(self, expr, local_vars, allow_recursion): + expr = expr.strip() + + if expr == '': # Empty expression + return None + + if expr.startswith('('): + parens_count = 0 + for m in re.finditer(r'[()]', expr): + if m.group(0) == '(': + parens_count += 1 + else: + parens_count -= 1 + if parens_count == 0: + sub_expr = expr[1:m.start()] + sub_result = self.interpret_expression(sub_expr, local_vars, allow_recursion) + remaining_expr = expr[m.end():].strip() + if not remaining_expr: + return sub_result + else: + expr = json.dumps(sub_result) + remaining_expr + break + else: + raise JSInterpreterError('Premature end of parens in %r' % expr) + + for op, opfunc in self._ASSIGN_OPERATORS: + m = re.match(r'(?x)(?P<out>%s)(?:\[(?P<index>[^\]]+?)\])?\s*%s(?P<expr>.*)$' % (self._VARNAME_PATTERN, re.escape(op)), expr) + if not m: + continue + right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion - 1) + + if m.groupdict().get('index'): + lvar = local_vars[m.group('out')] + idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) + assert isinstance(idx, int) + cur = lvar[idx] + val = opfunc(cur, right_val) + lvar[idx] = val + return val + else: + cur = local_vars.get(m.group('out')) + val = opfunc(cur, right_val) + local_vars[m.group('out')] = val + return val + + if expr.isdigit(): + return int(expr) + + var_m = re.match(r'(?!if|return|true|false)(?P<name>%s)$' % self._VARNAME_PATTERN, expr) + if var_m: + return local_vars[var_m.group('name')] + + try: + return json.loads(expr) + except ValueError: + pass + + m = re.match(r'(?P<var>%s)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$' % self._VARNAME_PATTERN, expr) + if m: + variable = m.group('var') + member = m.group('member') + arg_str = m.group('args') + + if variable in local_vars: + obj = local_vars[variable] + else: + if variable not in self._objects: + self._objects[variable] = self.extract_object(variable) + obj = self._objects[variable] + + if arg_str is None: + # Member access + if member == 'length': + return len(obj) + return obj[member] + + assert expr.endswith(')') + # Function call + if arg_str == '': + argvals = tuple() + else: + argvals = tuple([self.interpret_expression(v, local_vars, allow_recursion) for v in arg_str.split(',')]) + + if member == 'split': + assert argvals == ('',) + return list(obj) + + if member == 'join': + assert len(argvals) == 1 + return argvals[0].join(obj) + + if member == 'reverse': + assert len(argvals) == 0 + obj.reverse() + return obj + + if member == 'slice': + assert len(argvals) == 1 + return obj[argvals[0]:] + + if member == 'splice': + assert isinstance(obj, list) + index, howMany = argvals + res = [] + for i in range(index, min(index + howMany, len(obj))): + res.append(obj.pop(index)) + return res + + return obj[member](argvals) + + m = re.match(r'(?P<in>%s)\[(?P<idx>.+)\]$' % self._VARNAME_PATTERN, expr) + if m: + val = local_vars[m.group('in')] + idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion - 1) + return val[idx] + + for op, opfunc in self._OPERATORS: + m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr) + if not m: + continue + + x, abort = self.interpret_statement(m.group('x'), local_vars, allow_recursion - 1) + if abort: + raise JSInterpreterError('Premature left-side return of %s in %r' % (op, expr)) + + y, abort = self.interpret_statement(m.group('y'), local_vars, allow_recursion - 1) + if abort: + raise JSInterpreterError('Premature right-side return of %s in %r' % (op, expr)) + + return opfunc(x, y) + + m = re.match(r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]+)\)$' % self._VARNAME_PATTERN, expr) + if m: + fname = m.group('func') + argvals = tuple([int(v) if v.isdigit() else local_vars[v] for v in m.group('args').split(',')]) + if fname not in self._functions: + self._functions[fname] = self.extract_function(fname) + return self._functions[fname](argvals) + + raise JSInterpreterError('Unsupported JS expression %r' % expr) + + def extract_object(self, objname): + obj = {} + obj_m = re.search(r'(?:var\s+)?%s\s*=\s*\{\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}(?:,\s*)?)*)\}\s*;' + % re.escape(objname), self.code) + fields = obj_m.group('fields') + # Currently, it only supports function definitions + fields_m = re.finditer(r'(?P<key>[a-zA-Z$0-9]+)\s*:\s*function\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', fields) + for f in fields_m: + argnames = f.group('args').split(',') + obj[f.group('key')] = self.build_function(argnames, f.group('code')) + + return obj + + def extract_function(self, function_name): + func_m = re.search(r'(?x)(?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s*\((?P<args>[^)]*)\)\s*\{(?P<code>[^}]+)\}' + % (re.escape(function_name), re.escape(function_name), re.escape(function_name)), self.code) + if func_m is None: + raise JSInterpreterError('Could not find JS function %r' % function_name) + + argnames = func_m.group('args').split(',') + + return self.build_function(argnames, func_m.group('code')) + + def call_function(self, function_name, *args): + f = self.extract_function(function_name) + return f(args) + + def build_function(self, argnames, code): + def resf(argvals): + local_vars = dict(zip(argnames, argvals)) + for stmt in code.split(';'): + res, abort = self.interpret_statement(stmt, local_vars) + if abort: + break + return res + + return resf
\ No newline at end of file |