diff options
author | Walter Purcaro <vuolter@gmail.com> | 2014-07-10 03:26:45 +0200 |
---|---|---|
committer | Walter Purcaro <vuolter@gmail.com> | 2014-07-10 03:26:45 +0200 |
commit | c1abc13d4dccb20f3845594c28952667573b7d0b (patch) | |
tree | da8a8678bd804bec77ef16e864bfe2bf2e561eaf /module/plugins/ocr | |
parent | Improved filename sanitation removing non-ascii chars. (diff) | |
download | pyload-c1abc13d4dccb20f3845594c28952667573b7d0b.tar.xz |
Move captcha to ocr
Diffstat (limited to 'module/plugins/ocr')
-rw-r--r-- | module/plugins/ocr/GigasizeCom.py | 20 | ||||
-rw-r--r-- | module/plugins/ocr/LinksaveIn.py | 149 | ||||
-rw-r--r-- | module/plugins/ocr/NetloadIn.py | 26 | ||||
-rw-r--r-- | module/plugins/ocr/ShareonlineBiz.py | 53 | ||||
-rw-r--r-- | module/plugins/ocr/__init__.py | 0 |
5 files changed, 248 insertions, 0 deletions
# ============================================================================
# Files added by this commit under module/plugins/ocr (all "new file mode
# 100644").  The code targets Python 2 (xrange, urllib.urlretrieve).
# ============================================================================

# ----------------------------------------------------------------------------
# diff --git a/module/plugins/ocr/GigasizeCom.py b/module/plugins/ocr/GigasizeCom.py
# new file mode 100644, index 000000000..8f9d78710
# ----------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from module.plugins.OCR import OCR


class GigasizeCom(OCR):
    """Captcha reader for gigasize.com built on the shared OCR base class."""

    def __init__(self):
        OCR.__init__(self)

    def get_captcha(self, image):
        """Return the text recognised in the captcha file *image*.

        The image is loaded, thresholded with a factor of 2.8 and handed to
        ``run_tesser``; the result is read back from ``self.result_captcha``.
        """
        self.load_image(image)
        self.threshold(2.8)
        # Flag meanings are defined by OCR.run_tesser (not visible here).
        self.run_tesser(True, False, False, True)
        return self.result_captcha


if __name__ == '__main__':
    # Manual smoke test: fetch a live captcha and print the recognised text.
    ocr = GigasizeCom()
    import urllib
    urllib.urlretrieve('http://www.gigasize.com/randomImage.php', "gigasize_tmp.jpg")

    # A parenthesised single argument prints exactly like the Python 2
    # print statement.
    print(ocr.get_captcha('gigasize_tmp.jpg'))


# ----------------------------------------------------------------------------
# diff --git a/module/plugins/ocr/LinksaveIn.py b/module/plugins/ocr/LinksaveIn.py
# new file mode 100644, index 000000000..0ddd50a50
# ----------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from module.plugins.OCR import OCR
from PIL import Image
from os import sep
from os.path import dirname
from os.path import abspath
from glob import glob


def _palette_lut(image):
    """Return a 256-entry list mapping a palette index to its RGB tuple.

    A 256x1 copy of *image* is filled with the values 0..255 and converted
    to RGB, so entry ``i`` of the result is the colour the image's palette
    assigns to index ``i``.  Assumes *image* is a palette-based image (the
    callers pass GIFs or images converted to mode "P").
    """
    strip = image.resize((256, 1))
    strip.putdata(range(256))
    return list(strip.convert("RGB").getdata())


class LinksaveIn(OCR):
    """Captcha reader for linksave.in (multi-frame GIF on a known background)."""

    __name__ = "LinksaveIn"

    def __init__(self):
        OCR.__init__(self)
        # Directory next to this module holding the background GIFs ("bg/")
        # and receiving the intermediate debug images written below.
        self.data_dir = dirname(abspath(__file__)) + sep + "LinksaveIn" + sep

    def load_image(self, image):
        """Merge all frames of the captcha file *image* into one RGB image.

        Every pixel that is not pure black in a frame is copied into the
        result, so later frames overwrite earlier ones.  The merged image is
        saved as ``unblacked.png`` in ``self.data_dir`` and stored in
        ``self.image`` / ``self.pixels``; ``self.result_captcha`` is reset.
        """
        im = Image.open(image)
        # Built once, before seeking, i.e. from the first frame's palette.
        lut = _palette_lut(im)

        new = Image.new("RGB", im.size)
        npix = new.load()
        frame_nr = 0
        while True:
            try:
                im.seek(frame_nr)
            except EOFError:
                # Seeking past the last frame ends the loop.
                break
            frame = im.copy()
            pix = frame.load()
            for x in xrange(frame.size[0]):
                for y in xrange(frame.size[1]):
                    colour = lut[pix[x, y]]
                    if colour != (0, 0, 0):
                        npix[x, y] = colour
            frame_nr += 1
        new.save(self.data_dir+"unblacked.png")
        self.image = new.copy()
        self.pixels = self.image.load()
        self.result_captcha = ''

    def get_bg(self):
        """Return the path of the background GIF that best matches the captcha.

        Each ``bg/*.gif`` under ``self.data_dir`` is scored by the number of
        pixels whose colour equals the captcha's colour at the same position.
        Returns ``""`` when there are no candidates or none scores above zero.
        """
        stat = {}
        img = self.image.convert("P")
        # The captcha's palette and pixel access do not depend on the
        # candidate background, so compute them once.
        lut = _palette_lut(img)
        pix = img.load()
        for bgpath in glob(self.data_dir+"bg/*.gif"):
            stat[bgpath] = 0
            bg = Image.open(bgpath)
            bglut = _palette_lut(bg)
            bgpix = bg.load()
            for x in xrange(bg.size[0]):
                for y in xrange(bg.size[1]):
                    if bglut[bgpix[x, y]] == lut[pix[x, y]]:
                        stat[bgpath] += 1
        max_p = 0
        bg = ""
        for bgpath, value in stat.items():
            # Strict comparison: a candidate must beat the current best.
            if max_p < value:
                bg = bgpath
                max_p = value
        return bg

    def substract_bg(self, bgpath):
        """Paint white every pixel of ``self.image`` that equals the background.

        *bgpath* is the path of the background image to compare against;
        ``self.image`` is modified in place.
        """
        bg = Image.open(bgpath)
        img = self.image.convert("P")

        bglut = _palette_lut(bg)
        lut = _palette_lut(img)

        bgpix = bg.load()
        pix = img.load()
        orgpix = self.image.load()
        for x in xrange(bg.size[0]):
            for y in xrange(bg.size[1]):
                if lut[pix[x, y]] == bglut[bgpix[x, y]]:
                    orgpix[x, y] = (255, 255, 255)

    def eval_black_white(self):
        """Replace ``self.image`` by a 140x75 black/white version of itself.

        A pixel becomes black when one channel dominates the other two by
        more than ``thresh``, or when green is the smallest channel;
        every other pixel becomes white.
        """
        # NOTE(review): 140x75 is presumably the fixed captcha size - confirm.
        new = Image.new("RGB", (140, 75))
        pix = new.load()
        orgpix = self.image.load()
        thresh = 4
        for x in xrange(new.size[0]):
            for y in xrange(new.size[1]):
                r, g, b = orgpix[x, y]
                coloured = (r > max(b, g)+thresh
                            or g < min(r, b)
                            or g > max(r, b)+thresh
                            or b > max(r, g)+thresh)
                pix[x, y] = (0, 0, 0) if coloured else (255, 255, 255)
        self.image = new
        self.pixels = self.image.load()

    def get_captcha(self, image):
        """Return the text recognised in the captcha file *image*.

        Pipeline: merge frames, remove the best-matching background, reduce
        to black/white, clean twice, split into letters and recognise each
        letter separately.  Intermediate images (``cleaned_pass1.png``,
        ``cleaned_pass2.png``, ``letterN.png``) are written to
        ``self.data_dir``.
        """
        self.load_image(image)
        # NOTE(review): get_bg() returns "" when nothing matches, in which
        # case Image.open("") inside substract_bg() will raise - confirm
        # whether that should be handled.
        bg = self.get_bg()
        self.substract_bg(bg)
        self.eval_black_white()
        self.to_greyscale()
        self.image.save(self.data_dir+"cleaned_pass1.png")
        self.clean(4)
        self.clean(4)
        self.image.save(self.data_dir+"cleaned_pass2.png")
        letters = self.split_captcha_letters()
        parts = []
        for n, letter in enumerate(letters):
            self.image = letter
            # Use the instance's own data_dir: a module-level "ocr" object
            # only exists when this file is run as a script.
            self.image.save(self.data_dir+"letter%d.png" % n)
            self.run_tesser(True, True, False, False)
            parts.append(self.result_captcha)

        return "".join(parts)


if __name__ == '__main__':
    # Manual smoke test against a fixed captcha URL.
    import urllib
    ocr = LinksaveIn()
    testurl = "http://linksave.in/captcha/cap.php?hsh=2229185&code=ZzHdhl3UffV3lXTH5U4b7nShXj%2Bwma1vyoNBcbc6lcc%3D"
    urllib.urlretrieve(testurl, ocr.data_dir+"captcha.gif")

    print(ocr.get_captcha(ocr.data_dir+'captcha.gif'))


# ----------------------------------------------------------------------------
# diff --git a/module/plugins/ocr/NetloadIn.py b/module/plugins/ocr/NetloadIn.py
# new file mode 100644, index 000000000..9fc2f0725
# ----------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from module.plugins.OCR import OCR


class NetloadIn(OCR):
    """Captcha reader for netload.in built on the shared OCR base class."""

    __name__ = "NetloadIn"

    def __init__(self):
        OCR.__init__(self)

    def get_captcha(self, image):
        """Return at most four recognised characters from the file *image*.

        The image is converted to greyscale and cleaned twice before
        recognition; spaces are stripped from the result before truncating.
        """
        self.load_image(image)
        self.to_greyscale()
        self.clean(3)
        self.clean(3)
        self.run_tesser(True, True, False, False)

        self.result_captcha = self.result_captcha.replace(" ", "")[:4]  # cut to 4 numbers

        return self.result_captcha


if __name__ == '__main__':
    # Manual smoke test: fetch a live captcha and print the recognised text.
    import urllib
    ocr = NetloadIn()
    urllib.urlretrieve("http://netload.in/share/includes/captcha.php", "captcha.png")

    print(ocr.get_captcha('captcha.png'))


# ----------------------------------------------------------------------------
# diff --git a/module/plugins/ocr/ShareonlineBiz.py b/module/plugins/ocr/ShareonlineBiz.py
# new file mode 100644, index 000000000..db72449d1
# ----------------------------------------------------------------------------
# -*- coding: utf-8 -*-

#
#Copyright (C) 2009 kingzero, RaNaN
#
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 3 of the License,
#or (at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#See the GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
#
###
from module.plugins.OCR import OCR


class ShareonlineBiz(OCR):
    """Captcha reader for share-online.biz built on the shared OCR base class."""

    __name__ = "ShareonlineBiz"

    def __init__(self):
        OCR.__init__(self)

    def get_captcha(self, image):
        """Return the text recognised in the captcha file *image*.

        The image is converted to greyscale, resized to 160x50, thresholded
        with a factor of 1.85 and split into letters, which are recognised
        one at a time and concatenated.
        """
        self.load_image(image)
        self.to_greyscale()
        self.image = self.image.resize((160, 50))
        # Refresh the pixel accessor so it refers to the resized image.
        self.pixels = self.image.load()
        self.threshold(1.85)
        #self.eval_black_white(240)
        #self.derotate_by_average()

        letters = self.split_captcha_letters()

        parts = []
        for letter in letters:
            self.image = letter
            self.run_tesser(True, True, False, False)
            parts.append(self.result_captcha)

        return "".join(parts)

        #tesseract at 60%


if __name__ == '__main__':
    # Manual smoke test: fetch a live captcha and print the recognised text.
    import urllib
    ocr = ShareonlineBiz()
    urllib.urlretrieve("http://www.share-online.biz/captcha.php", "captcha.jpeg")
    print(ocr.get_captcha('captcha.jpeg'))


# ----------------------------------------------------------------------------
# diff --git a/module/plugins/ocr/__init__.py b/module/plugins/ocr/__init__.py
# new file mode 100644, index 000000000..e69de29bb
# (empty file: 0 lines in the diffstat, no hunk)
# ----------------------------------------------------------------------------