diff options
author | Sleeper <devnull@localhost> | 2010-04-12 21:51:44 +0200 |
---|---|---|
committer | Sleeper <devnull@localhost> | 2010-04-12 21:51:44 +0200 |
commit | b849139d38d9cebd367879d9a3323dfde733e866 (patch) | |
tree | 5c3e609bb343e92a421cfc5725ca1df0f55401b3 /module/plugins/captcha | |
parent | set_conf fix (diff) | |
download | pyload-b849139d38d9cebd367879d9a3323dfde733e866.tar.xz |
fix netload and shareonline, captcha subsets for better recognition
Diffstat (limited to 'module/plugins/captcha')
-rw-r--r-- | module/plugins/captcha/LinksaveIn.py | 16 | ||||
-rw-r--r-- | module/plugins/captcha/MegauploadCom.py | 2 | ||||
-rw-r--r-- | module/plugins/captcha/NetloadIn.py | 6 | ||||
-rw-r--r-- | module/plugins/captcha/ShareonlineBiz.py | 12 | ||||
-rw-r--r-- | module/plugins/captcha/captcha.py | 20 |
5 files changed, 22 insertions, 34 deletions
diff --git a/module/plugins/captcha/LinksaveIn.py b/module/plugins/captcha/LinksaveIn.py index d6f61e362..22b801273 100644 --- a/module/plugins/captcha/LinksaveIn.py +++ b/module/plugins/captcha/LinksaveIn.py @@ -118,20 +118,6 @@ class LinksaveIn(OCR): self.image = new self.pixels = self.image.load() - def run_tesser(self): - self.logger.debug("create tmp tif") - tmp = tempfile.NamedTemporaryFile(suffix=".tif") - self.logger.debug("create tmp txt") - tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt") - self.logger.debug("save tiff") - self.image.save(tmp.name, 'TIFF') - self.logger.debug("run tesseract") - self.run(['tesseract', tmp.name, tmpTxt.name.replace(".txt", ""), "nobatch", self.data_dir+"tesser_conf"]) - self.logger.debug("read txt") - - with open(tmpTxt.name, 'r') as f: - self.result_captcha = f.read().replace("\n", "") - def get_captcha(self, image): self.load_image(image) bg = self.get_bg() @@ -147,7 +133,7 @@ class LinksaveIn(OCR): for n, letter in enumerate(letters): self.image = letter self.image.save(ocr.data_dir+"letter%d.png" % n) - self.run_tesser() + self.run_tesser(True, True, False, False) final += self.result_captcha return final diff --git a/module/plugins/captcha/MegauploadCom.py b/module/plugins/captcha/MegauploadCom.py index 374bcd678..da8ab2cb9 100644 --- a/module/plugins/captcha/MegauploadCom.py +++ b/module/plugins/captcha/MegauploadCom.py @@ -6,7 +6,7 @@ class MegauploadCom(OCR): def get_captcha(self, image): self.load_image(image) - self.run_tesser() + self.run_tesser(True, True, False, True) return self.result_captcha if __name__ == '__main__': diff --git a/module/plugins/captcha/NetloadIn.py b/module/plugins/captcha/NetloadIn.py index 9799a6a2b..c99a0744c 100644 --- a/module/plugins/captcha/NetloadIn.py +++ b/module/plugins/captcha/NetloadIn.py @@ -9,11 +9,7 @@ class NetloadIn(OCR): self.to_greyscale() self.clean(3) self.clean(3) - self.run_tesser() - - self.correct({ - ("$", "g"): "5", - }) + self.run_tesser(True, True, False, False) return self.result_captcha diff --git a/module/plugins/captcha/ShareonlineBiz.py b/module/plugins/captcha/ShareonlineBiz.py index 91124f181..7bd5d7960 100644 --- a/module/plugins/captcha/ShareonlineBiz.py +++ b/module/plugins/captcha/ShareonlineBiz.py @@ -37,19 +37,9 @@ class ShareonlineBiz(OCR): final = "" for letter in letters: self.image = letter - self.run_tesser() + self.run_tesser(True, True, False, False) final += self.result_captcha - #replace common errors - final = self.correct({ - "A": "4", - "‘5": "3", - ("‘1", "T"): "7", - ("‘L", "B", "'L"): "2", - "b": "6", - ("I", "X"): "1" - }, final) - return final #tesseract at 60% diff --git a/module/plugins/captcha/captcha.py b/module/plugins/captcha/captcha.py index 283b171e0..452952533 100644 --- a/module/plugins/captcha/captcha.py +++ b/module/plugins/captcha/captcha.py @@ -82,15 +82,31 @@ class OCR(object): self.image.save(tmp) self.result_captcha = self.run(['gocr', tmp.name]).replace("\n", "") - def run_tesser(self): + def run_tesser(self, subset=False, digits=True, lowercase=True, uppercase=True ): self.logger.debug("create tmp tif") tmp = tempfile.NamedTemporaryFile(suffix=".tif") self.logger.debug("create tmp txt") tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt") self.logger.debug("save tiff") self.image.save(tmp.name, 'TIFF') + + tessparams = ['tesseract', tmp.name, tmpTxt.name.replace(".txt", "") + + if subset and (digits or lowercase or uppercase): + self.logger.debug("create temp subset config") + tmpSub = tempfile.NamedTemporaryFile(suffix=".subset") + tmpSub.write("tessedit_char_whitelist ") + if digits: + tmpSub.write("0123456789") + if lowercase: + tmpSub.write("abcdefghijklmnopqrstuvwxyz") + if uppercase: + tmpSub.write("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + tessparams.append("nobatch") + tessparams.append(tmpSub.name) + self.logger.debug("run tesseract") - self.run(['tesseract', tmp.name, tmpTxt.name.replace(".txt", "")]) + self.run(tessparams) self.logger.debug("read txt") with open(tmpTxt.name, 'r') as f: |