diff options
Diffstat (limited to 'module/plugins/captcha')
-rw-r--r-- | module/plugins/captcha/GigasizeCom.py | 20 | ||||
-rw-r--r-- | module/plugins/captcha/LinksaveIn.py | 39 | ||||
-rw-r--r-- | module/plugins/captcha/NetloadIn.py | 21 | ||||
-rw-r--r-- | module/plugins/captcha/ShareonlineBiz.py | 38 | ||||
-rw-r--r-- | module/plugins/captcha/captcha.py | 163 |
5 files changed, 144 insertions, 137 deletions
diff --git a/module/plugins/captcha/GigasizeCom.py b/module/plugins/captcha/GigasizeCom.py index 2d0837257..244cf6a2a 100644 --- a/module/plugins/captcha/GigasizeCom.py +++ b/module/plugins/captcha/GigasizeCom.py @@ -1,20 +1,24 @@ # -*- coding: utf-8 -*- -from captcha import OCR +from module.plugins.captcha.captcha import OCR + class GigasizeCom(OCR): + __name__ = "GigasizeCom" + __type__ = "ocr" + __version__ = "0.10" + + __description__ = """Gigasize.com ocr plugin""" + __license__ = "GPLv3" + __authors__ = [("pyLoad Team", "admin@pyload.org")] + + def __init__(self): OCR.__init__(self) + def get_captcha(self, image): self.load_image(image) self.threshold(2.8) self.run_tesser(True, False, False, True) return self.result_captcha - -if __name__ == '__main__': - ocr = GigasizeCom() - import urllib - urllib.urlretrieve('http://www.gigasize.com/randomImage.php', "gigasize_tmp.jpg") - - print ocr.get_captcha('gigasize_tmp.jpg') diff --git a/module/plugins/captcha/LinksaveIn.py b/module/plugins/captcha/LinksaveIn.py index 8ce26fbac..56cbd58a0 100644 --- a/module/plugins/captcha/LinksaveIn.py +++ b/module/plugins/captcha/LinksaveIn.py @@ -1,19 +1,32 @@ # -*- coding: utf-8 -*- -from captcha import OCR -import Image -from os import sep -from os.path import dirname -from os.path import abspath +try: + from PIL import Image +except ImportError: + import Image + from glob import glob +from os import sep +from os.path import abspath, dirname + +from module.plugins.captcha.captcha import OCR class LinksaveIn(OCR): - __name__ = "LinksaveIn" + __name__ = "LinksaveIn" + __type__ = "ocr" + __version__ = "0.10" + + __description__ = """Linksave.in ocr plugin""" + __license__ = "GPLv3" + __authors__ = [("pyLoad Team", "admin@pyload.org")] + + def __init__(self): OCR.__init__(self) self.data_dir = dirname(abspath(__file__)) + sep + "LinksaveIn" + sep + def load_image(self, image): im = Image.open(image) frame_nr = 0 @@ -41,6 +54,7 @@ class LinksaveIn(OCR): self.pixels = self.image.load() self.result_captcha = '' + def get_bg(self): stat = {} cstat = {} @@ -71,12 +85,13 @@ class LinksaveIn(OCR): stat[bgpath] += 1 max_p = 0 bg = "" - for bgpath, value in stat.items(): + for bgpath, value in stat.iteritems(): if max_p < value: bg = bgpath max_p = value return bg + def substract_bg(self, bgpath): bg = Image.open(bgpath) img = self.image.convert("P") @@ -99,6 +114,7 @@ class LinksaveIn(OCR): if rgb_c == rgb_bg: orgpix[x, y] = (255,255,255) + def eval_black_white(self): new = Image.new("RGB", (140, 75)) pix = new.load() @@ -120,6 +136,7 @@ class LinksaveIn(OCR): self.image = new self.pixels = self.image.load() + def get_captcha(self, image): self.load_image(image) bg = self.get_bg() @@ -139,11 +156,3 @@ class LinksaveIn(OCR): final += self.result_captcha return final - -if __name__ == '__main__': - import urllib - ocr = LinksaveIn() - testurl = "http://linksave.in/captcha/cap.php?hsh=2229185&code=ZzHdhl3UffV3lXTH5U4b7nShXj%2Bwma1vyoNBcbc6lcc%3D" - urllib.urlretrieve(testurl, ocr.data_dir+"captcha.gif") - - print ocr.get_captcha(ocr.data_dir+'captcha.gif') diff --git a/module/plugins/captcha/NetloadIn.py b/module/plugins/captcha/NetloadIn.py index 733fe99db..28eb18fb5 100644 --- a/module/plugins/captcha/NetloadIn.py +++ b/module/plugins/captcha/NetloadIn.py @@ -1,12 +1,22 @@ # -*- coding: utf-8 -*- -from captcha import OCR +from module.plugins.captcha.captcha import OCR + class NetloadIn(OCR): - __name__ = "NetloadIn" + __name__ = "NetloadIn" + __type__ = "ocr" + __version__ = "0.10" + + __description__ = """Netload.in ocr plugin""" + __license__ = "GPLv3" + __authors__ = [("pyLoad Team", "admin@pyload.org")] + + def __init__(self): OCR.__init__(self) + def get_captcha(self, image): self.load_image(image) self.to_greyscale() @@ -17,10 +27,3 @@ class NetloadIn(OCR): self.result_captcha = self.result_captcha.replace(" ", "")[:4] # cut to 4 numbers return self.result_captcha - -if __name__ == '__main__': - import urllib - ocr = NetloadIn() - urllib.urlretrieve("http://netload.in/share/includes/captcha.php", "captcha.png") - - print ocr.get_captcha('captcha.png') diff --git a/module/plugins/captcha/ShareonlineBiz.py b/module/plugins/captcha/ShareonlineBiz.py index 0c87b636d..8210e8859 100644 --- a/module/plugins/captcha/ShareonlineBiz.py +++ b/module/plugins/captcha/ShareonlineBiz.py @@ -1,31 +1,23 @@ # -*- coding: utf-8 -*- -# -#Copyright (C) 2009 kingzero, RaNaN -# -#This program is free software; you can redistribute it and/or modify -#it under the terms of the GNU General Public License as published by -#the Free Software Foundation; either version 3 of the License, -#or (at your option) any later version. -# -#This program is distributed in the hope that it will be useful, -#but WITHOUT ANY WARRANTY; without even the implied warranty of -#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -#See the GNU General Public License for more details. -# -#You should have received a copy of the GNU General Public License -# along with this program; if not, see <http://www.gnu.org/licenses/>. -# -### -from captcha import OCR +from module.plugins.captcha.captcha import OCR + class ShareonlineBiz(OCR): - __name__ = "ShareonlineBiz" + __name__ = "ShareonlineBiz" + __type__ = "ocr" + __version__ = "0.10" + + __description__ = """Shareonline.biz ocr plugin""" + __license__ = "GPLv3" + __authors__ = [("RaNaN", "RaNaN@pyload.org")] + def __init__(self): OCR.__init__(self) - def get_captcha(self, image): + + def get_captcha(self, image): self.load_image(image) self.to_greyscale() self.image = self.image.resize((160, 50)) @@ -45,9 +37,3 @@ class ShareonlineBiz(OCR): return final #tesseract at 60% - -if __name__ == '__main__': - import urllib - ocr = ShareonlineBiz() - urllib.urlretrieve("http://www.share-online.biz/captcha.php", "captcha.jpeg") - print ocr.get_captcha('captcha.jpeg') diff --git a/module/plugins/captcha/captcha.py b/module/plugins/captcha/captcha.py index 7e4dec697..0f233ec00 100644 --- a/module/plugins/captcha/captcha.py +++ b/module/plugins/captcha/captcha.py @@ -1,56 +1,49 @@ # -*- coding: utf-8 -*- -# -#Copyright (C) 2009 kingzero, RaNaN -# -#This program is free software; you can redistribute it and/or modify -#it under the terms of the GNU General Public License as published by -#the Free Software Foundation; either version 3 of the License, -#or (at your option) any later version. -# -#This program is distributed in the hope that it will be useful, -#but WITHOUT ANY WARRANTY; without even the implied warranty of -#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -#See the GNU General Public License for more details. -# -#You should have received a copy of the GNU General Public License -# along with this program; if not, see <http://www.gnu.org/licenses/>. -# -### from __future__ import with_statement -import os -from os.path import join -from os.path import abspath + +try: + from PIL import Image, GifImagePlugin, JpegImagePlugin, PngImagePlugin, TiffImagePlugin +except ImportError: + import Image, GifImagePlugin, JpegImagePlugin, PngImagePlugin, TiffImagePlugin + import logging +import os import subprocess #import tempfile -import Image -import TiffImagePlugin -import PngImagePlugin -import GifImagePlugin -import JpegImagePlugin +from os.path import abspath, join class OCR(object): + __name__ = "OCR" + __type__ = "ocr" + __version__ = "0.10" + + __description__ = """OCR base plugin""" + __license__ = "GPLv3" + __authors__ = [("pyLoad Team", "admin@pyload.org")] - __name__ = "OCR" def __init__(self): self.logger = logging.getLogger("log") + def load_image(self, image): self.image = Image.open(image) self.pixels = self.image.load() self.result_captcha = '' + def unload(self): """delete all tmp images""" pass + def threshold(self, value): self.image = self.image.point(lambda a: a * value + 10) + def run(self, command): """Run a command""" @@ -61,42 +54,46 @@ class OCR(object): popen.stderr.close() self.logger.debug("Tesseract ReturnCode %s Output: %s" % (popen.returncode, output)) + def run_tesser(self, subset=False, digits=True, lowercase=True, uppercase=True): - #self.logger.debug("create tmp tif") + #tmpTif = tempfile.NamedTemporaryFile(suffix=".tif") + try: + tmpTif = open(join("tmp", "tmpTif_%s.tif" % self.__name__), "wb") + tmpTif.close() + + #tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt") + tmpTxt = open(join("tmp", "tmpTxt_%s.txt" % self.__name__), "wb") + tmpTxt.close() - #tmp = tempfile.NamedTemporaryFile(suffix=".tif") - tmp = open(join("tmp", "tmpTif_%s.tif" % self.__name__), "wb") - tmp.close() - #self.logger.debug("create tmp txt") - #tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt") - tmpTxt = open(join("tmp", "tmpTxt_%s.txt" % self.__name__), "wb") - tmpTxt.close() + except IOError, e: + self.logError(e) + return self.logger.debug("save tiff") - self.image.save(tmp.name, 'TIFF') + self.image.save(tmpTif.name, 'TIFF') if os.name == "nt": - tessparams = [join(pypath,"tesseract","tesseract.exe")] + tessparams = [join(pypath, "tesseract", "tesseract.exe")] else: - tessparams = ['tesseract'] + tessparams = ["tesseract"] - tessparams.extend( [abspath(tmp.name), abspath(tmpTxt.name).replace(".txt", "")] ) + tessparams.extend( [abspath(tmpTif.name), abspath(tmpTxt.name).replace(".txt", "")] ) if subset and (digits or lowercase or uppercase): - #self.logger.debug("create temp subset config") #tmpSub = tempfile.NamedTemporaryFile(suffix=".subset") - tmpSub = open(join("tmp", "tmpSub_%s.subset" % self.__name__), "wb") - tmpSub.write("tessedit_char_whitelist ") - if digits: - tmpSub.write("0123456789") - if lowercase: - tmpSub.write("abcdefghijklmnopqrstuvwxyz") - if uppercase: - tmpSub.write("ABCDEFGHIJKLMNOPQRSTUVWXYZ") - tmpSub.write("\n") - tessparams.append("nobatch") - tessparams.append(abspath(tmpSub.name)) - tmpSub.close() + with open(join("tmp", "tmpSub_%s.subset" % self.__name__), "wb") as tmpSub: + tmpSub.write("tessedit_char_whitelist ") + + if digits: + tmpSub.write("0123456789") + if lowercase: + tmpSub.write("abcdefghijklmnopqrstuvwxyz") + if uppercase: + tmpSub.write("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + + tmpSub.write("\n") + tessparams.append("nobatch") + tessparams.append(abspath(tmpSub.name)) self.logger.debug("run tesseract") self.run(tessparams) @@ -110,22 +107,25 @@ class OCR(object): self.logger.debug(self.result_captcha) try: - os.remove(tmp.name) + os.remove(tmpTif.name) os.remove(tmpTxt.name) if subset and (digits or lowercase or uppercase): os.remove(tmpSub.name) except: pass + def get_captcha(self, name): raise NotImplementedError + def to_greyscale(self): if self.image.mode != 'L': self.image = self.image.convert('L') self.pixels = self.image.load() + def eval_black_white(self, limit): self.pixels = self.image.load() w, h = self.image.size @@ -136,6 +136,7 @@ class OCR(object): else: self.pixels[x, y] = 0 + def clean(self, allowed): pixels = self.pixels @@ -143,19 +144,28 @@ class OCR(object): for x in xrange(w): for y in xrange(h): - if pixels[x, y] == 255: continue - # no point in processing white pixels since we only want to remove black pixel + if pixels[x, y] == 255: + continue + # No point in processing white pixels since we only want to remove black pixel count = 0 try: - if pixels[x-1, y-1] != 255: count += 1 - if pixels[x-1, y] != 255: count += 1 - if pixels[x-1, y + 1] != 255: count += 1 - if pixels[x, y + 1] != 255: count += 1 - if pixels[x + 1, y + 1] != 255: count += 1 - if pixels[x + 1, y] != 255: count += 1 - if pixels[x + 1, y-1] != 255: count += 1 - if pixels[x, y-1] != 255: count += 1 + if pixels[x-1, y-1] != 255: + count += 1 + if pixels[x-1, y] != 255: + count += 1 + if pixels[x-1, y + 1] != 255: + count += 1 + if pixels[x, y + 1] != 255: + count += 1 + if pixels[x + 1, y + 1] != 255: + count += 1 + if pixels[x + 1, y] != 255: + count += 1 + if pixels[x + 1, y-1] != 255: + count += 1 + if pixels[x, y-1] != 255: + count += 1 except: pass @@ -167,10 +177,12 @@ class OCR(object): # second pass: this time set all 1's to 255 (white) for x in xrange(w): for y in xrange(h): - if pixels[x, y] == 1: pixels[x, y] = 255 + if pixels[x, y] == 1: + pixels[x, y] = 255 self.pixels = pixels + def derotate_by_average(self): """rotate by checking each angle and guess most suitable""" @@ -245,6 +257,7 @@ class OCR(object): self.pixels = pixels + def split_captcha_letters(self): captcha = self.image started = False @@ -262,13 +275,16 @@ class OCR(object): firstX = x lastX = x - if y > bottomY: bottomY = y - if y < topY: topY = y - if x > lastX: lastX = x + if y > bottomY: + bottomY = y + if y < topY: + topY = y + if x > lastX: + lastX = x black_pixel_in_col = True - if black_pixel_in_col == False and started == True: + if black_pixel_in_col is False and started is True: rect = (firstX, topY, lastX, bottomY) new_captcha = captcha.crop(rect) @@ -281,8 +297,8 @@ class OCR(object): return letters - def correct(self, values, var=None): + def correct(self, values, var=None): if var: result = var else: @@ -300,14 +316,3 @@ class OCR(object): return result else: self.result_captcha = result - - -if __name__ == '__main__': - ocr = OCR() - ocr.load_image("B.jpg") - ocr.to_greyscale() - ocr.eval_black_white(140) - ocr.derotate_by_average() - ocr.run_tesser() - print "Tesseract", ocr.result_captcha - ocr.image.save("derotated.jpg") |