summary | refs | log | tree | commit | diff | stats
path: root/module/plugins/captcha
diff options
context:
space:
mode:
author: Sleeper <devnull@localhost> 2010-04-12 21:51:44 +0200
committer: Sleeper <devnull@localhost> 2010-04-12 21:51:44 +0200
commit: b849139d38d9cebd367879d9a3323dfde733e866 (patch)
tree: 5c3e609bb343e92a421cfc5725ca1df0f55401b3 /module/plugins/captcha
parent: set_conf fix (diff)
download: pyload-b849139d38d9cebd367879d9a3323dfde733e866.tar.xz
fix netload and shareonline, captcha subsets for better recognition
Diffstat (limited to 'module/plugins/captcha')
-rw-r--r-- module/plugins/captcha/LinksaveIn.py | 16
-rw-r--r-- module/plugins/captcha/MegauploadCom.py | 2
-rw-r--r-- module/plugins/captcha/NetloadIn.py | 6
-rw-r--r-- module/plugins/captcha/ShareonlineBiz.py | 12
-rw-r--r-- module/plugins/captcha/captcha.py | 20
5 files changed, 22 insertions, 34 deletions
diff --git a/module/plugins/captcha/LinksaveIn.py b/module/plugins/captcha/LinksaveIn.py
index d6f61e362..22b801273 100644
--- a/module/plugins/captcha/LinksaveIn.py
+++ b/module/plugins/captcha/LinksaveIn.py
@@ -118,20 +118,6 @@ class LinksaveIn(OCR):
self.image = new
self.pixels = self.image.load()
- def run_tesser(self):
- self.logger.debug("create tmp tif")
- tmp = tempfile.NamedTemporaryFile(suffix=".tif")
- self.logger.debug("create tmp txt")
- tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt")
- self.logger.debug("save tiff")
- self.image.save(tmp.name, 'TIFF')
- self.logger.debug("run tesseract")
- self.run(['tesseract', tmp.name, tmpTxt.name.replace(".txt", ""), "nobatch", self.data_dir+"tesser_conf"])
- self.logger.debug("read txt")
-
- with open(tmpTxt.name, 'r') as f:
- self.result_captcha = f.read().replace("\n", "")
-
def get_captcha(self, image):
self.load_image(image)
bg = self.get_bg()
@@ -147,7 +133,7 @@ class LinksaveIn(OCR):
for n, letter in enumerate(letters):
self.image = letter
self.image.save(ocr.data_dir+"letter%d.png" % n)
- self.run_tesser()
+ self.run_tesser(True, True, False, False)
final += self.result_captcha
return final
diff --git a/module/plugins/captcha/MegauploadCom.py b/module/plugins/captcha/MegauploadCom.py
index 374bcd678..da8ab2cb9 100644
--- a/module/plugins/captcha/MegauploadCom.py
+++ b/module/plugins/captcha/MegauploadCom.py
@@ -6,7 +6,7 @@ class MegauploadCom(OCR):
def get_captcha(self, image):
self.load_image(image)
- self.run_tesser()
+ self.run_tesser(True, True, False, True)
return self.result_captcha
if __name__ == '__main__':
diff --git a/module/plugins/captcha/NetloadIn.py b/module/plugins/captcha/NetloadIn.py
index 9799a6a2b..c99a0744c 100644
--- a/module/plugins/captcha/NetloadIn.py
+++ b/module/plugins/captcha/NetloadIn.py
@@ -9,11 +9,7 @@ class NetloadIn(OCR):
self.to_greyscale()
self.clean(3)
self.clean(3)
- self.run_tesser()
-
- self.correct({
- ("$", "g"): "5",
- })
+ self.run_tesser(True, True, False, False)
return self.result_captcha
diff --git a/module/plugins/captcha/ShareonlineBiz.py b/module/plugins/captcha/ShareonlineBiz.py
index 91124f181..7bd5d7960 100644
--- a/module/plugins/captcha/ShareonlineBiz.py
+++ b/module/plugins/captcha/ShareonlineBiz.py
@@ -37,19 +37,9 @@ class ShareonlineBiz(OCR):
final = ""
for letter in letters:
self.image = letter
- self.run_tesser()
+ self.run_tesser(True, True, False, False)
final += self.result_captcha
- #replace common errors
- final = self.correct({
- "A": "4",
- "‘5": "3",
- ("‘1", "T"): "7",
- ("‘L", "B", "'L"): "2",
- "b": "6",
- ("I", "X"): "1"
- }, final)
-
return final
#tesseract at 60%
diff --git a/module/plugins/captcha/captcha.py b/module/plugins/captcha/captcha.py
index 283b171e0..452952533 100644
--- a/module/plugins/captcha/captcha.py
+++ b/module/plugins/captcha/captcha.py
@@ -82,15 +82,31 @@ class OCR(object):
self.image.save(tmp)
self.result_captcha = self.run(['gocr', tmp.name]).replace("\n", "")
- def run_tesser(self):
+ def run_tesser(self, subset=False, digits=True, lowercase=True, uppercase=True ):
self.logger.debug("create tmp tif")
tmp = tempfile.NamedTemporaryFile(suffix=".tif")
self.logger.debug("create tmp txt")
tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt")
self.logger.debug("save tiff")
self.image.save(tmp.name, 'TIFF')
+
+ tessparams = ['tesseract', tmp.name, tmpTxt.name.replace(".txt", "")
+
+ if subset and (digits or lowercase or uppercase):
+ self.logger.debug("create temp subset config")
+ tmpSub = tempfile.NamedTemporaryFile(suffix=".subset")
+ tmpSub.write("tessedit_char_whitelist ")
+ if digits:
+ tmpSub.write("0123456789")
+ if lowercase:
+ tmpSub.write("abcdefghijklmnopqrstuvwxyz")
+ if uppercase:
+ tmpSub.write("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
+ tessparams.append("nobatch")
+ tessparams.append(tmpSub.name)
+
self.logger.debug("run tesseract")
- self.run(['tesseract', tmp.name, tmpTxt.name.replace(".txt", "")])
+ self.run(tessparams)
self.logger.debug("read txt")
with open(tmpTxt.name, 'r') as f: