diff options
author | Walter Purcaro <vuolter@users.noreply.github.com> | 2015-07-23 23:44:45 +0200 |
---|---|---|
committer | Walter Purcaro <vuolter@users.noreply.github.com> | 2015-07-23 23:44:45 +0200 |
commit | 6af9b38a8d5d49355b85aef6ddd003605d6bba05 (patch) | |
tree | cbfb5b2212cab406ba75b3acd553879311e9153f /module/plugins/internal | |
parent | Code cosmetics (diff) | |
download | pyload-6af9b38a8d5d49355b85aef6ddd003605d6bba05.tar.xz |
Improve Captcha
Diffstat (limited to 'module/plugins/internal')
-rw-r--r-- | module/plugins/internal/AdYouLike.py | 92 | ||||
-rw-r--r-- | module/plugins/internal/AdsCaptcha.py | 64 | ||||
-rw-r--r-- | module/plugins/internal/Captcha.py | 89 | ||||
-rw-r--r-- | module/plugins/internal/OCR.py | 4 | ||||
-rw-r--r-- | module/plugins/internal/ReCaptcha.py | 197 | ||||
-rw-r--r-- | module/plugins/internal/SolveMedia.py | 105 | ||||
-rw-r--r-- | module/plugins/internal/XFSHoster.py | 6 |
7 files changed, 53 insertions, 504 deletions
diff --git a/module/plugins/internal/AdYouLike.py b/module/plugins/internal/AdYouLike.py deleted file mode 100644 index d14babb51..000000000 --- a/module/plugins/internal/AdYouLike.py +++ /dev/null @@ -1,92 +0,0 @@ -# -*- coding: utf-8 -*- - -import re - -from module.common.json_layer import json_loads -from module.plugins.internal.CaptchaService import CaptchaService - - -class AdYouLike(CaptchaService): - __name__ = "AdYouLike" - __type__ = "captcha" - __version__ = "0.07" - __status__ = "stable" - - __description__ = """AdYouLike captcha service plugin""" - __license__ = "GPLv3" - __authors__ = [("Walter Purcaro", "vuolter@gmail.com")] - - - AYL_PATTERN = r'Adyoulike\.create\s*\((.+?)\)' - CALLBACK_PATTERN = r'(Adyoulike\.g\._jsonp_\d+)' - - - def detect_key(self, data=None): - html = data or self.retrieve_data() - - m = re.search(self.AYL_PATTERN, html) - n = re.search(self.CALLBACK_PATTERN, html) - if m and n: - self.key = (m.group(1).strip(), n.group(1).strip()) - self.log_debug("Ayl: %s | Callback: %s" % self.key) - return self.key #: Key is the tuple(ayl, callback) - else: - self.log_warning(_("Ayl or callback pattern not found")) - return None - - - def challenge(self, key=None, data=None): - ayl, callback = key or self.retrieve_key(data) - - #: {'adyoulike':{'key':"P~zQ~O0zV0WTiAzC-iw0navWQpCLoYEP"}, - #: 'all':{'element_id':"ayl_private_cap_92300",'lang':"fr",'env':"prod"}} - ayl = json_loads(ayl) - - html = self.plugin.load("http://api-ayl.appspot.com/challenge", - get={'key' : ayl['adyoulike']['key'], - 'env' : ayl['all']['env'], - 'callback': callback}) - try: - challenge = json_loads(re.search(callback + r'\s*\((.+?)\)', html).group(1)) - - except AttributeError: - self.fail(_("AdYouLike challenge pattern not found")) - - self.log_debug("Challenge: %s" % challenge) - - return self.result(ayl, challenge), challenge - - - def result(self, server, challenge): - #: Adyoulike.g._jsonp_5579316662423138 - #: ({'translations':{'fr':{'instructions_visual':"Recopiez « Soonnight » ci-dessous :"}}, - #: 'site_under':true,'clickable':true,'pixels':{'VIDEO_050':[],'DISPLAY':[],'VIDEO_000':[],'VIDEO_100':[], - #: 'VIDEO_025':[],'VIDEO_075':[]},'medium_type':"image/adyoulike", - #: 'iframes':{'big':"<iframe src=\"http://www.soonnight.com/campagn.html\" scrolling=\"no\" - #: height=\"250\" width=\"300\" frameborder=\"0\"></iframe>"},'shares':{},'id':256, - #: 'token':"e6QuI4aRSnbIZJg02IsV6cp4JQ9~MjA1",'formats':{'small':{'y':300,'x':0,'w':300,'h':60}, - #: 'big':{'y':0,'x':0,'w':300,'h':250},'hover':{'y':440,'x':0,'w':300,'h':60}}, - #: 'tid':"SqwuAdxT1EZoi4B5q0T63LN2AkiCJBg5"}) - - if isinstance(server, basestring): - server = json_loads(server) - - if isinstance(challenge, basestring): - challenge = json_loads(challenge) - - try: - instructions_visual = challenge['translations'][server['all']['lang']]['instructions_visual'] - result = re.search(u'«(.+?)»', instructions_visual).group(1).strip() - - except AttributeError: - self.fail(_("AdYouLike result not found")) - - result = {'_ayl_captcha_engine' : "adyoulike", - '_ayl_env' : server['all']['env'], - '_ayl_tid' : challenge['tid'], - '_ayl_token_challenge': challenge['token'], - '_ayl_response' : response} - - self.log_debug("Result: %s" % result) - - return result diff --git a/module/plugins/internal/AdsCaptcha.py b/module/plugins/internal/AdsCaptcha.py deleted file mode 100644 index f487042e2..000000000 --- a/module/plugins/internal/AdsCaptcha.py +++ /dev/null @@ -1,64 +0,0 @@ -# -*- coding: utf-8 -*- - -import random -import re - -from module.plugins.internal.CaptchaService import CaptchaService - - -class AdsCaptcha(CaptchaService): - __name__ = "AdsCaptcha" - __type__ = "captcha" - __version__ = "0.10" - __status__ = "stable" - - __description__ = """AdsCaptcha captcha service plugin""" - __license__ = "GPLv3" - __authors__ = [("pyLoad Team", "admin@pyload.org")] - - - CAPTCHAID_PATTERN = r'api\.adscaptcha\.com/Get\.aspx\?.*?CaptchaId=(\d+)' - PUBLICKEY_PATTERN = r'api\.adscaptcha\.com/Get\.aspx\?.*?PublicKey=([\w-]+)' - - - def detect_key(self, data=None): - html = data or self.retrieve_data() - - m = re.search(self.PUBLICKEY_PATTERN, html) - n = re.search(self.CAPTCHAID_PATTERN, html) - if m and n: - self.key = (m.group(1).strip(), n.group(1).strip()) #: Key is the tuple(PublicKey, CaptchaId) - self.log_debug("Key: %s | ID: %s" % self.key) - return self.key - else: - self.log_warning(_("Key or id pattern not found")) - return None - - - def challenge(self, key=None, data=None): - PublicKey, CaptchaId = key or self.retrieve_key(data) - - html = self.plugin.load("http://api.adscaptcha.com/Get.aspx", - get={'CaptchaId': CaptchaId, - 'PublicKey': PublicKey}) - try: - challenge = re.search("challenge: '(.+?)',", html).group(1) - server = re.search("server: '(.+?)',", html).group(1) - - except AttributeError: - self.fail(_("AdsCaptcha challenge pattern not found")) - - self.log_debug("Challenge: %s" % challenge) - - return self.result(server, challenge), challenge - - - def result(self, server, challenge): - result = self.decrypt_image("%sChallenge.aspx" % server, - get={'cid': challenge, 'dummy': random.random()}, - cookies=True, - input_type="jpg") - - self.log_debug("Result: %s" % result) - - return result diff --git a/module/plugins/internal/Captcha.py b/module/plugins/internal/Captcha.py index af7f66ed5..942021f26 100644 --- a/module/plugins/internal/Captcha.py +++ b/module/plugins/internal/Captcha.py @@ -1,12 +1,14 @@ # -*- coding: utf-8 -*- +import time + from module.plugins.internal.Plugin import Plugin class Captcha(Plugin): __name__ = "Captcha" __type__ = "captcha" - __version__ = "0.01" + __version__ = "0.02" __status__ = "stable" __description__ = """Base anti-captcha plugin""" @@ -35,13 +37,21 @@ class Captcha(Plugin): pass - def decrypt_image(self, url, get={}, post={}, ref=False, cookies=False, decode=False, - input_type='png', output_type='textual', try_ocr=True): - image = self.load(url, get=get, post=post, ref=ref, cookies=cookies, decode=decode) - return self.decrypt(image, input_type, output_type, try_ocr) + def recognize(self, image): + """ + Extend to build your custom anti-captcha ocr + """ + pass + + + def decrypt(self, url, get={}, post={}, ref=False, cookies=False, decode=False, + input_type='png', output_type='textual', ocr=True): + img = self.load(url, get=get, post=post, ref=ref, cookies=cookies, decode=decode) + return self._decrypt(img, input_type, output_type, ocr) - def decrypt(self, data, input_type='png', output_type='textual', try_ocr=True): + #@TODO: Definitely dhoose a better name for this method! + def _decrypt(self, raw, input_type='png', output_type='textual', ocr=None): """ Loads a captcha and decrypts it with ocr, plugin, user input @@ -53,56 +63,53 @@ class Captcha(Plugin): :param output_type: 'textual' if text is written on the captcha\ or 'positional' for captcha where the user have to click\ on a specific region on the captcha - :param try_ocr: if True, ocr is not used + :param ocr: if True, ocr is not used :return: result of decrypting """ - id = ("%.2f" % time.time())[-6:].replace(".", "") + time_ref = ("%.2f" % time.time())[-6:].replace(".", "") + + with open(os.path.join("tmp", "captcha_image_%s_%s.%s" % (self.plugin.__name__, time_ref, input_type)), "wb") as tmp_img: + tmp_img.write(raw) + + if ocr is not False: + if isinstance(ocr, basestring): + OCR = self.pyload.pluginManager.loadClass("captcha", ocr) #: Rename `captcha` to `ocr` in 0.4.10 - with open(os.path.join("tmp", "tmpCaptcha_%s_%s.%s" % (self.plugin.__name__, id, input_type)), "wb") as tmpCaptcha: - tmpCaptcha.write(img) + if self.plugin.pyfile.abort: + self.abort() - has_plugin = self.plugin.__name__ in self.pyload.pluginManager.ocrPlugins + result = OCR(self.plugin.pyfile).recognize(tmp_img.name) - if self.pyload.captcha: - Ocr = self.pyload.pluginManager.loadClass("ocr", self.plugin.__name__) - else: - Ocr = None + else: + result = self.recognize(tmp_img.name) - if Ocr and try_ocr: - time.sleep(random.randint(3000, 5000) / 1000.0) - if self.pyfile.abort: - self.abort() + else: + captchaManager = self.pyload.captchaManager - ocr = Ocr(self.pyfile) - result = ocr.get_captcha(tmpCaptcha.name) - else: - captchaManager = self.pyload.captchaManager - task = captchaManager.newTask(img, input_type, tmpCaptcha.name, output_type) - self.task = task - captchaManager.handleCaptcha(task) + try: + self.task = captchaManager.newTask(img, input_type, tmp_img.name, output_type) + captchaManager.handleCaptcha(self.task) - while task.isWaiting(): - if self.pyfile.abort: - captchaManager.removeTask(task) - self.abort() - time.sleep(1) + while self.task.isWaiting(): + if self.plugin.pyfile.abort: + self.abort() + time.sleep(1) + finally: + captchaManager.removeTask(self.task) - captchaManager.removeTask(task) + if self.task.error: + self.fail(task.error) - if task.error and has_plugin: #: Ignore default error message since the user could use try_ocr - self.fail(_("Pil and tesseract not installed and no Client connected for captcha decrypting")) - elif task.error: - self.fail(task.error) - elif not task.result: - self.fail(_("No captcha result obtained in appropiate time by any of the plugins")) + elif not self.task.result: + self.fail(_("No captcha result obtained in appropiate time by any of the plugins")) - result = task.result - self.log_debug("Received captcha result: %s" % result) + result = task.result + self.log_debug("Received captcha result: %s" % result) #@TODO: Remove from here? if not self.pyload.debug: try: - os.remove(tmpCaptcha.name) + os.remove(tmp_img.name) except Exception: pass diff --git a/module/plugins/internal/OCR.py b/module/plugins/internal/OCR.py index ee5571f77..5c22866c8 100644 --- a/module/plugins/internal/OCR.py +++ b/module/plugins/internal/OCR.py @@ -20,7 +20,7 @@ from module.utils import save_join as fs_join class OCR(Plugin): __name__ = "OCR" __type__ = "ocr" - __version__ = "0.12" + __version__ = "0.13" __status__ = "stable" __description__ = """OCR base plugin""" @@ -133,7 +133,7 @@ class OCR(Plugin): pass - def get_captcha(self, name): + def recognize(self, name): raise NotImplementedError diff --git a/module/plugins/internal/ReCaptcha.py b/module/plugins/internal/ReCaptcha.py deleted file mode 100644 index b4f9ef1eb..000000000 --- a/module/plugins/internal/ReCaptcha.py +++ /dev/null @@ -1,197 +0,0 @@ -# -*- coding: utf-8 -*- - -import random -import re -import time -import urlparse - -from base64 import b64encode - -from module.plugins.internal.CaptchaService import CaptchaService - - -class ReCaptcha(CaptchaService): - __name__ = "ReCaptcha" - __type__ = "captcha" - __version__ = "0.18" - __status__ = "stable" - - __description__ = """ReCaptcha captcha service plugin""" - __license__ = "GPLv3" - __authors__ = [("pyLoad Team", "admin@pyload.org"), - ("Walter Purcaro", "vuolter@gmail.com"), - ("zapp-brannigan", "fuerst.reinje@web.de")] - - - KEY_V1_PATTERN = r'(?:recaptcha(?:/api|\.net)/(?:challenge|noscript)\?k=|Recaptcha\.create\s*\(\s*["\'])([\w-]+)' - KEY_V2_PATTERN = r'(?:data-sitekey=["\']|["\']sitekey["\']:\s*["\'])([\w-]+)' - - - def detect_key(self, data=None): - html = data or self.retrieve_data() - - m = re.search(self.KEY_V2_PATTERN, html) or re.search(self.KEY_V1_PATTERN, html) - if m: - self.key = m.group(1).strip() - self.log_debug("Key: %s" % self.key) - return self.key - else: - self.log_warning(_("Key pattern not found")) - return None - - - def challenge(self, key=None, data=None, version=None): - key = key or self.retrieve_key(data) - - if version in (1, 2): - return getattr(self, "_challenge_v%s" % version)(key) - - else: - return self.challenge(key, - version=2 if re.search(self.KEY_V2_PATTERN, html or self.retrieve_data()) else 1) - - - def _challenge_v1(self, key): - html = self.plugin.load("http://www.google.com/recaptcha/api/challenge", - get={'k': key}) - try: - challenge = re.search("challenge : '(.+?)',", html).group(1) - server = re.search("server : '(.+?)',", html).group(1) - - except AttributeError: - self.fail(_("ReCaptcha challenge pattern not found")) - - self.log_debug("Challenge: %s" % challenge) - - return self.result(server, challenge, key) - - - def result(self, server, challenge, key): - self.plugin.load("http://www.google.com/recaptcha/api/js/recaptcha.js") - html = self.plugin.load("http://www.google.com/recaptcha/api/reload", - get={'c' : challenge, - 'k' : key, - 'reason': "i", - 'type' : "image"}) - - try: - challenge = re.search('\(\'(.+?)\',',html).group(1) - - except AttributeError: - self.fail(_("ReCaptcha second challenge pattern not found")) - - self.log_debug("Second challenge: %s" % challenge) - result = self.decrypt("%simage" % server, - get={'c': challenge}, - cookies=True, - input_type="jpg", - try_ocr=False) - - self.log_debug("Result: %s" % result) - - return result, challenge - - - def _collect_api_info(self): - html = self.plugin.load("http://www.google.com/recaptcha/api.js") - a = re.search(r'po.src = \'(.*?)\';', html).group(1) - vers = a.split("/")[5] - - self.log_debug("API version: %s" % vers) - - language = a.split("__")[1].split(".")[0] - - self.log_debug("API language: %s" % language) - - html = self.plugin.load("https://apis.google.com/js/api.js") - b = re.search(r'"h":"(.*?)","', html).group(1) - jsh = b.decode('unicode-escape') - - self.log_debug("API jsh-string: %s" % jsh) - - return vers, language, jsh - - - def _prepare_time_and_rpc(self): - self.plugin.load("http://www.google.com/recaptcha/api2/demo") - - millis = int(round(time.time() * 1000)) - - self.log_debug("Time: %s" % millis) - - rand = random.randint(1, 99999999) - a = "0.%s" % str(rand * 2147483647) - rpc = int(100000000 * float(a)) - - self.log_debug("Rpc-token: %s" % rpc) - - return millis, rpc - - - def _challenge_v2(self, key, parent=None): - if parent is None: - try: - parent = urlparse.urljoin("http://", urlparse.urlparse(self.plugin.pyfile.url).netloc) - - except Exception: - parent = "" - - botguardstring = "!A" - vers, language, jsh = self._collect_api_info() - millis, rpc = self._prepare_time_and_rpc() - - html = self.plugin.load("https://www.google.com/recaptcha/api2/anchor", - get={'k' : key, - 'hl' : language, - 'v' : vers, - 'usegapi' : "1", - 'jsh' : "%s#id=IO_%s" % (jsh, millis), - 'parent' : parent, - 'pfname' : "", - 'rpctoken': rpc}) - - token1 = re.search(r'id="recaptcha-token" value="(.*?)">', html) - self.log_debug("Token #1: %s" % token1.group(1)) - - html = self.plugin.load("https://www.google.com/recaptcha/api2/frame", - get={'c' : token1.group(1), - 'hl' : language, - 'v' : vers, - 'bg' : botguardstring, - 'k' : key, - 'usegapi': "1", - 'jsh' : jsh}, - decode="unicode-escape") - - token2 = re.search(r'"finput","(.*?)",', html) - self.log_debug("Token #2: %s" % token2.group(1)) - - token3 = re.search(r'"rresp","(.*?)",', html) - self.log_debug("Token #3: %s" % token3.group(1)) - - millis_captcha_loading = int(round(time.time() * 1000)) - captcha_response = self.decrypt_image("https://www.google.com/recaptcha/api2/payload", - get={'c':token3.group(1), 'k':key}, - cookies=True, - try_ocr=False) - response = b64encode('{"response":"%s"}' % captcha_response) - - self.log_debug("Result: %s" % response) - - timeToSolve = int(round(time.time() * 1000)) - millis_captcha_loading - timeToSolveMore = timeToSolve + int(float("0." + str(random.randint(1, 99999999))) * 500) - - html = self.plugin.load("https://www.google.com/recaptcha/api2/userverify", - post={'k' : key, - 'c' : token3.group(1), - 'response': response, - 't' : timeToSolve, - 'ct' : timeToSolveMore, - 'bg' : botguardstring}) - - token4 = re.search(r'"uvresp","(.*?)",', html) - self.log_debug("Token #4: %s" % token4.group(1)) - - result = token4.group(1) - - return result, None diff --git a/module/plugins/internal/SolveMedia.py b/module/plugins/internal/SolveMedia.py deleted file mode 100644 index ce4ebb007..000000000 --- a/module/plugins/internal/SolveMedia.py +++ /dev/null @@ -1,105 +0,0 @@ -# -*- coding: utf-8 -*- - -import re - -from module.plugins.internal.Plugin import Fail -from module.plugins.internal.CaptchaService import CaptchaService - - -class SolveMedia(CaptchaService): - __name__ = "SolveMedia" - __type__ = "captcha" - __version__ = "0.15" - __status__ = "stable" - - __description__ = """SolveMedia captcha service plugin""" - __license__ = "GPLv3" - __authors__ = [("pyLoad Team", "admin@pyload.org")] - - - KEY_PATTERN = r'api\.solvemedia\.com/papi/challenge\.(?:no)?script\?k=(.+?)["\']' - - - def detect_key(self, data=None): - html = data or self.retrieve_data() - - m = re.search(self.KEY_PATTERN, html) - if m: - self.key = m.group(1).strip() - self.log_debug("Key: %s" % self.key) - return self.key - else: - self.log_warning(_("Key pattern not found") - return None - - - def challenge(self, key=None, data=None): - key = key or self.retrieve_key(data) - - html = self.plugin.load("http://api.solvemedia.com/papi/challenge.noscript", - get={'k': key}) - - for i in xrange(1, 11): - try: - magic = re.search(r'name="magic" value="(.+?)"', html).group(1) - - except AttributeError: - self.log_warning(_("Magic pattern not found") - magic = None - - try: - challenge = re.search(r'<input type=hidden name="adcopy_challenge" id="adcopy_challenge" value="(.+?)">', - html).group(1) - - except AttributeError: - self.fail(_("SolveMedia challenge pattern not found")) - - else: - self.log_debug("Challenge: %s" % challenge) - - try: - result = self.result("http://api.solvemedia.com/papi/media", challenge) - - except Fail, e: - self.log_warning(e) - self.plugin.invalidCaptcha() - result = None - - html = self.plugin.load("http://api.solvemedia.com/papi/verify.noscript", - post={'adcopy_response' : result, - 'k' : key, - 'l' : "en", - 't' : "img", - 's' : "standard", - 'magic' : magic, - 'adcopy_challenge': challenge, - 'ref' : self.plugin.pyfile.url}) - try: - redirect = re.search(r'URL=(.+?)">', html).group(1) - - except AttributeError: - self.fail(_("SolveMedia verify pattern not found")) - - else: - if "error" in html: - self.log_warning(_("Captcha code was invalid")) - self.log_debug("Retry #%d" % i) - html = self.plugin.load(redirect) - else: - break - - else: - self.fail(_("SolveMedia max retries exceeded")) - - return result, challenge - - - def result(self, server, challenge): - result = self.decrypt_image(server, - get={'c': challenge}, - cookies=True, - input_type="gif") - - self.log_debug("Result: %s" % result) - - return result diff --git a/module/plugins/internal/XFSHoster.py b/module/plugins/internal/XFSHoster.py index ec9a18a48..18a50a6b0 100644 --- a/module/plugins/internal/XFSHoster.py +++ b/module/plugins/internal/XFSHoster.py @@ -4,8 +4,8 @@ import pycurl import random import re -from module.plugins.internal.ReCaptcha import ReCaptcha -from module.plugins.internal.SolveMedia import SolveMedia +from module.plugins.captcha.ReCaptcha import ReCaptcha +from module.plugins.captcha.SolveMedia import SolveMedia from module.plugins.internal.SimpleHoster import SimpleHoster, create_getInfo, seconds_to_midnight from module.utils import html_unescape @@ -221,7 +221,7 @@ class XFSHoster(SimpleHoster): m = re.search(self.CAPTCHA_PATTERN, self.html) if m: captcha_url = m.group(1) - inputs['code'] = self.captcha.decrypt_image(captcha_url) + inputs['code'] = self.captcha.decrypt(captcha_url) return m = re.search(self.CAPTCHA_BLOCK_PATTERN, self.html, re.S) |