summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Nitzo <nitzo2001@yahoo.com> 2016-04-17 22:01:38 +0200
committer Nitzo <nitzo2001@yahoo.com> 2016-04-17 22:01:38 +0200
commit 1382946ed80105e8e6615142d73a35a9f7efc80c (patch)
tree af59294a752f378a8660560e687811963ee052d6
parent [misc] Fix DB.store() bug that causes a dictionary to be converted to an array (diff)
download pyload-1382946ed80105e8e6615142d73a35a9f7efc80c.tar.xz
[YoutubeCom] fix #2390
-rw-r--r-- module/plugins/hoster/YoutubeCom.py 381
1 files changed, 363 insertions, 18 deletions
diff --git a/module/plugins/hoster/YoutubeCom.py b/module/plugins/hoster/YoutubeCom.py
index 6c423b84a..972a86000 100644
--- a/module/plugins/hoster/YoutubeCom.py
+++ b/module/plugins/hoster/YoutubeCom.py
@@ -1,18 +1,46 @@
# -*- coding: utf-8 -*-
+import operator
import os
import re
import subprocess
+import time
import urllib
+from module.plugins.Plugin import Abort
+from module.network.HTTPRequest import HTTPRequest
+from module.network.CookieJar import CookieJar
from module.plugins.internal.Hoster import Hoster
-from module.plugins.internal.misc import html_unescape, replace_patterns, which
+from module.plugins.internal.misc import html_unescape, json, replace_patterns, which
+
+
+class BIGHTTPRequest(HTTPRequest):
+ """
+ Overcome HTTPRequest's load() size limit to allow
+ loading very big web pages by overriding HTTPRequest's write() function
+ """
+ def __init__(self, cookies=None, options=None, limit=1000000): #@TODO: Add 'limit' parameter to HTTPRequest in v0.4.10
+ self.limit = limit
+ HTTPRequest.__init__(self, cookies=cookies, options=options)
+
+ def write(self, buf):
+ """ writes response """
+ if self.limit and self.rep.tell() > self.limit or self.abort:
+ rep = self.getResponse()
+ if self.abort: raise Abort()
+ f = open("response.dump", "wb")
+ f.write(rep)
+ f.close()
+ raise Exception("Loaded Url exceeded limit")
+
+ self.rep.write(buf)
+
class YoutubeCom(Hoster):
__name__ = "YoutubeCom"
__type__ = "hoster"
- __version__ = "0.50"
+ __version__ = "0.51"
__status__ = "testing"
__pattern__ = r'https?://(?:[^/]*\.)?(youtu\.be/|youtube\.com/watch\?(?:.*&)?v=)\w+'
@@ -27,8 +55,9 @@ class YoutubeCom(Hoster):
__description__ = """Youtube.com hoster plugin"""
__license__ = "GPLv3"
- __authors__ = [("spoob", "spoob@pyload.org"),
- ("zoidberg", "zoidberg@mujmail.cz")]
+ __authors__ = [("spoob", "spoob@pyload.org" ),
+ ("zoidberg", "zoidberg@mujmail.cz" ),
+ ("GammaC0de", "nitzo2001[AT]yahoo[DOT]com")]
URL_REPLACEMENTS = [(r'youtu\.be/', 'youtube.com/watch?v=')]
@@ -59,20 +88,89 @@ class YoutubeCom(Hoster):
101: (".webm", 640 , 360 , 4 , True ),
102: (".webm", 1280, 720 , 8 , True )}
+ def _decrypt_signature(self, encrypted_sig):
+ """Turn the encrypted 's' field into a working signature"""
+ try:
+ player_url = json.loads(re.search(r'"assets":.+?"js":\s*("[^"]+")', self.data).group(1))
+ except (AttributeError, IndexError):
+ self.fail(_("Player URL not found"))
+
+ if player_url.startswith("//"):
+ player_url = 'https:' + player_url
+
+ if not player_url.endswith(".js"):
+ self.fail(_("Unsupported player type %s") % player_url)
+
+ cache_info = self.db.retrieve("cache")
+ cache_dirty = False
+
+ if cache_info is None or 'version' not in cache_info or cache_info['version'] != self.__version__:
+ cache_info = {'version': self.__version__,
+ 'cache' : {}}
+ cache_dirty = True
+
+ if player_url in cache_info['cache'] and time.time() < cache_info['cache'][player_url]['time'] + 24 * 60 * 60:
+ self.log_debug("Using cached decode function to decrypt the URL")
+ decrypt_func = lambda s: ''.join(s[_i] for _i in cache_info['cache'][player_url]['decrypt_map'])
+ decrypted_sig = decrypt_func(encrypted_sig)
+
+ else:
+ player_data = self.load(player_url)
+ try:
+ function_name = re.search(r'\.sig\|\|([a-zA-Z0-9$]+)\(', player_data).group(1)
+
+ except (AttributeError, IndexError):
+ self.fail(_("Signature decode function name not found"))
+
+ try:
+ jsi = JSInterpreter(player_data)
+ decrypt_func = lambda s: jsi.extract_function(function_name)([s])
+
+ #: Since Youtube just scrambles the order of the characters in the signature
+ #: and does not change any byte value, we can store just a transformation map as a cached function
+ decrypt_map = [ord(c) for c in decrypt_func(''.join(map(unichr, xrange(len(encrypted_sig)))))]
+ cache_info['cache'][player_url] = {'decrypt_map': decrypt_map,
+ 'time' : time.time()}
+ cache_dirty = True
+
+ decrypted_sig = decrypt_func(encrypted_sig)
+
+ except (JSInterpreterError, AssertionError), e:
+ self.log_error(_("Signature decode failed"), e)
+ self.fail(e.message)
+
+ #: Remove old records from cache
+ for _c in cache_info['cache'].iterkeys():
+ if time.time() >= cache_info['cache'][_c]['time'] + 24 * 60 * 60:
+ cache_info['cache'].pop(_c, None)
+ cache_dirty = True
+
+ if cache_dirty:
+ self.db.store("cache", cache_info)
+
+ return decrypted_sig
+
def setup(self):
self.resume_download = True
- self.multiDL = True
+ self.multiDL = True
+
+ try:
+ self.req.http.close()
+ except Exception:
+ pass
+
+ self.req.http = BIGHTTPRequest(cookies=CookieJar(None), options=self.pyload.requestFactory.getOptions(), limit=2000000)
def process(self, pyfile):
pyfile.url = replace_patterns(pyfile.url, self.URL_REPLACEMENTS)
- html = self.load(pyfile.url)
+ self.data = self.load(pyfile.url)
- if re.search(r'<div id="player-unavailable" class="\s*player-width player-height\s*">', html):
+ if re.search(r'<div id="player-unavailable" class="\s*player-width player-height\s*">', self.data):
self.offline()
- if "We have been receiving a large volume of requests from your network." in html:
+ if "We have been receiving a large volume of requests from your network." in self.data:
self.temp_offline()
#: Get config
@@ -95,10 +193,14 @@ class YoutubeCom(Hoster):
desired_fmt = 0
#: Parse available streams
- streams = re.search(r'"url_encoded_fmt_stream_map":"(.+?)",', html).group(1)
+ streams = re.search(r'"url_encoded_fmt_stream_map":"(.+?)",', self.data).group(1)
streams = [x.split('\u0026') for x in streams.split(',')]
streams = [dict((y.split('=', 1)) for y in x) for x in streams]
- streams = [(int(x['itag']), urllib.unquote(x['url'])) for x in streams]
+ streams = [(int(x['itag']),
+ urllib.unquote(x['url']),
+ x.get('s', x.get('sig', None)),
+ True if 's' in x else False)
+ for x in streams]
# self.log_debug("Found links: %s" % streams)
@@ -111,7 +213,7 @@ class YoutubeCom(Hoster):
if not streams:
self.fail(_("No available stream meets your preferences"))
- fmt_dict = dict([x for x in streams if self.formats[x[0]][4] == use3d] or streams)
+ fmt_dict = dict([(x[0], x[1:]) for x in streams if self.formats[x[0]][4] == use3d] or streams)
self.log_debug("DESIRED STREAM: ITAG:%d (%s) %sfound, %sallowed" %
(desired_fmt, "%s %dx%d Q:%d 3D:%s" % self.formats[desired_fmt],
@@ -119,26 +221,38 @@ class YoutubeCom(Hoster):
#: Return fmt nearest to quality index
if desired_fmt in fmt_dict and allowed(desired_fmt):
- fmt = desired_fmt
+ choosen_fmt = desired_fmt
else:
sel = lambda x: self.formats[x][3] #: Select quality index
comp = lambda x, y: abs(sel(x) - sel(y))
self.log_debug("Choosing nearest fmt: %s" % [(x, allowed(x), comp(x, desired_fmt)) for x in fmt_dict.keys()])
- fmt = reduce(lambda x, y: x if comp(x, desired_fmt) <= comp(y, desired_fmt) and
- sel(x) > sel(y) else y, fmt_dict.keys())
+ choosen_fmt = reduce(lambda x, y: x if comp(x, desired_fmt) <= comp(y, desired_fmt) and
+ sel(x) > sel(y) else y, fmt_dict.keys())
+
+ self.log_debug("Chosen fmt: %s" % choosen_fmt)
- self.log_debug("Chosen fmt: %s" % fmt)
+ url = fmt_dict[choosen_fmt][0]
- url = fmt_dict[fmt]
+ if fmt_dict[choosen_fmt][1]:
+ if fmt_dict[choosen_fmt][2]:
+ signature = self._decrypt_signature(fmt_dict[choosen_fmt][1])
+
+ else:
+ signature = fmt_dict[choosen_fmt][1]
+
+ url += "&signature=" + signature
+
+ if "&ratebypass=" not in url:
+ url += "&ratebypass=yes"
self.log_debug("URL: %s" % url)
#: Set file name
- file_suffix = self.formats[fmt][0] if fmt in self.formats else ".flv"
+ file_suffix = self.formats[choosen_fmt][0] if choosen_fmt in self.formats else ".flv"
file_name_pattern = '<meta name="title" content="(.+?)">'
- name = re.search(file_name_pattern, html).group(1).replace("/", "")
+ name = re.search(file_name_pattern, self.data).group(1).replace("/", "")
#: Cleaning invalid characters from the file name
name = name.encode('ascii', 'replace')
@@ -172,3 +286,234 @@ class YoutubeCom(Hoster):
filename])
self.remove(inputfile, trash=False)
+
+
+"""Credit for this awesome piece of code below goes to the 'youtube_dl' project, kudos!"""
+class JSInterpreterError(Exception):
+ pass
+
+
+class JSInterpreter(object):
+ def __init__(self, code, objects=None):
+ self._OPERATORS = [
+ ('|', operator.or_),
+ ('^', operator.xor),
+ ('&', operator.and_),
+ ('>>', operator.rshift),
+ ('<<', operator.lshift),
+ ('-', operator.sub),
+ ('+', operator.add),
+ ('%', operator.mod),
+ ('/', operator.truediv),
+ ('*', operator.mul),
+ ]
+ self._ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in self._OPERATORS]
+ self._ASSIGN_OPERATORS.append(('=', lambda cur, right: right))
+ self._VARNAME_PATTERN = r'[a-zA-Z_$][a-zA-Z_$0-9]*'
+
+ if objects is None:
+ objects = {}
+ self.code = code
+ self._functions = {}
+ self._objects = objects
+
+ def interpret_statement(self, stmt, local_vars, allow_recursion=100):
+ if allow_recursion < 0:
+ raise JSInterpreterError('Recursion limit reached')
+
+ should_abort = False
+ stmt = stmt.lstrip()
+ stmt_m = re.match(r'var\s', stmt)
+ if stmt_m:
+ expr = stmt[len(stmt_m.group(0)):]
+
+ else:
+ return_m = re.match(r'return(?:\s+|$)', stmt)
+ if return_m:
+ expr = stmt[len(return_m.group(0)):]
+ should_abort = True
+ else:
+ # Try interpreting it as an expression
+ expr = stmt
+
+ v = self.interpret_expression(expr, local_vars, allow_recursion)
+ return v, should_abort
+
+ def interpret_expression(self, expr, local_vars, allow_recursion):
+ expr = expr.strip()
+
+ if expr == '': # Empty expression
+ return None
+
+ if expr.startswith('('):
+ parens_count = 0
+ for m in re.finditer(r'[()]', expr):
+ if m.group(0) == '(':
+ parens_count += 1
+ else:
+ parens_count -= 1
+ if parens_count == 0:
+ sub_expr = expr[1:m.start()]
+ sub_result = self.interpret_expression(sub_expr, local_vars, allow_recursion)
+ remaining_expr = expr[m.end():].strip()
+ if not remaining_expr:
+ return sub_result
+ else:
+ expr = json.dumps(sub_result) + remaining_expr
+ break
+ else:
+ raise JSInterpreterError('Premature end of parens in %r' % expr)
+
+ for op, opfunc in self._ASSIGN_OPERATORS:
+ m = re.match(r'(?x)(?P<out>%s)(?:\[(?P<index>[^\]]+?)\])?\s*%s(?P<expr>.*)$' % (self._VARNAME_PATTERN, re.escape(op)), expr)
+ if not m:
+ continue
+ right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion - 1)
+
+ if m.groupdict().get('index'):
+ lvar = local_vars[m.group('out')]
+ idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion)
+ assert isinstance(idx, int)
+ cur = lvar[idx]
+ val = opfunc(cur, right_val)
+ lvar[idx] = val
+ return val
+ else:
+ cur = local_vars.get(m.group('out'))
+ val = opfunc(cur, right_val)
+ local_vars[m.group('out')] = val
+ return val
+
+ if expr.isdigit():
+ return int(expr)
+
+ var_m = re.match(r'(?!if|return|true|false)(?P<name>%s)$' % self._VARNAME_PATTERN, expr)
+ if var_m:
+ return local_vars[var_m.group('name')]
+
+ try:
+ return json.loads(expr)
+ except ValueError:
+ pass
+
+ m = re.match(r'(?P<var>%s)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$' % self._VARNAME_PATTERN, expr)
+ if m:
+ variable = m.group('var')
+ member = m.group('member')
+ arg_str = m.group('args')
+
+ if variable in local_vars:
+ obj = local_vars[variable]
+ else:
+ if variable not in self._objects:
+ self._objects[variable] = self.extract_object(variable)
+ obj = self._objects[variable]
+
+ if arg_str is None:
+ # Member access
+ if member == 'length':
+ return len(obj)
+ return obj[member]
+
+ assert expr.endswith(')')
+ # Function call
+ if arg_str == '':
+ argvals = tuple()
+ else:
+ argvals = tuple([self.interpret_expression(v, local_vars, allow_recursion) for v in arg_str.split(',')])
+
+ if member == 'split':
+ assert argvals == ('',)
+ return list(obj)
+
+ if member == 'join':
+ assert len(argvals) == 1
+ return argvals[0].join(obj)
+
+ if member == 'reverse':
+ assert len(argvals) == 0
+ obj.reverse()
+ return obj
+
+ if member == 'slice':
+ assert len(argvals) == 1
+ return obj[argvals[0]:]
+
+ if member == 'splice':
+ assert isinstance(obj, list)
+ index, howMany = argvals
+ res = []
+ for i in range(index, min(index + howMany, len(obj))):
+ res.append(obj.pop(index))
+ return res
+
+ return obj[member](argvals)
+
+ m = re.match(r'(?P<in>%s)\[(?P<idx>.+)\]$' % self._VARNAME_PATTERN, expr)
+ if m:
+ val = local_vars[m.group('in')]
+ idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion - 1)
+ return val[idx]
+
+ for op, opfunc in self._OPERATORS:
+ m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
+ if not m:
+ continue
+
+ x, abort = self.interpret_statement(m.group('x'), local_vars, allow_recursion - 1)
+ if abort:
+ raise JSInterpreterError('Premature left-side return of %s in %r' % (op, expr))
+
+ y, abort = self.interpret_statement(m.group('y'), local_vars, allow_recursion - 1)
+ if abort:
+ raise JSInterpreterError('Premature right-side return of %s in %r' % (op, expr))
+
+ return opfunc(x, y)
+
+ m = re.match(r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]+)\)$' % self._VARNAME_PATTERN, expr)
+ if m:
+ fname = m.group('func')
+ argvals = tuple([int(v) if v.isdigit() else local_vars[v] for v in m.group('args').split(',')])
+ if fname not in self._functions:
+ self._functions[fname] = self.extract_function(fname)
+ return self._functions[fname](argvals)
+
+ raise JSInterpreterError('Unsupported JS expression %r' % expr)
+
+ def extract_object(self, objname):
+ obj = {}
+ obj_m = re.search(r'(?:var\s+)?%s\s*=\s*\{\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}(?:,\s*)?)*)\}\s*;'
+ % re.escape(objname), self.code)
+ fields = obj_m.group('fields')
+ # Currently, it only supports function definitions
+ fields_m = re.finditer(r'(?P<key>[a-zA-Z$0-9]+)\s*:\s*function\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', fields)
+ for f in fields_m:
+ argnames = f.group('args').split(',')
+ obj[f.group('key')] = self.build_function(argnames, f.group('code'))
+
+ return obj
+
+ def extract_function(self, function_name):
+ func_m = re.search(r'(?x)(?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s*\((?P<args>[^)]*)\)\s*\{(?P<code>[^}]+)\}'
+ % (re.escape(function_name), re.escape(function_name), re.escape(function_name)), self.code)
+ if func_m is None:
+ raise JSInterpreterError('Could not find JS function %r' % function_name)
+
+ argnames = func_m.group('args').split(',')
+
+ return self.build_function(argnames, func_m.group('code'))
+
+ def call_function(self, function_name, *args):
+ f = self.extract_function(function_name)
+ return f(args)
+
+ def build_function(self, argnames, code):
+ def resf(argvals):
+ local_vars = dict(zip(argnames, argvals))
+ for stmt in code.split(';'):
+ res, abort = self.interpret_statement(stmt, local_vars)
+ if abort:
+ break
+ return res
+
+ return resf
\ No newline at end of file