summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Sleeper <devnull@localhost> 2010-04-12 21:51:44 +0200
committer Sleeper <devnull@localhost> 2010-04-12 21:51:44 +0200
commit b849139d38d9cebd367879d9a3323dfde733e866 (patch)
tree 5c3e609bb343e92a421cfc5725ca1df0f55401b3
parent set_conf fix (diff)
download pyload-b849139d38d9cebd367879d9a3323dfde733e866.tar.xz
fix netload and shareonline, captcha subsets for better recognition
-rw-r--r-- module/plugins/captcha/LinksaveIn.py 16
-rw-r--r-- module/plugins/captcha/MegauploadCom.py 2
-rw-r--r-- module/plugins/captcha/NetloadIn.py 6
-rw-r--r-- module/plugins/captcha/ShareonlineBiz.py 12
-rw-r--r-- module/plugins/captcha/captcha.py 20
-rw-r--r-- module/plugins/hoster/NetloadIn.py 10
-rw-r--r-- module/plugins/hoster/ShareonlineBiz.py 1
7 files changed, 30 insertions, 37 deletions
diff --git a/module/plugins/captcha/LinksaveIn.py b/module/plugins/captcha/LinksaveIn.py
index d6f61e362..22b801273 100644
--- a/module/plugins/captcha/LinksaveIn.py
+++ b/module/plugins/captcha/LinksaveIn.py
@@ -118,20 +118,6 @@ class LinksaveIn(OCR):
self.image = new
self.pixels = self.image.load()
- def run_tesser(self):
- self.logger.debug("create tmp tif")
- tmp = tempfile.NamedTemporaryFile(suffix=".tif")
- self.logger.debug("create tmp txt")
- tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt")
- self.logger.debug("save tiff")
- self.image.save(tmp.name, 'TIFF')
- self.logger.debug("run tesseract")
- self.run(['tesseract', tmp.name, tmpTxt.name.replace(".txt", ""), "nobatch", self.data_dir+"tesser_conf"])
- self.logger.debug("read txt")
-
- with open(tmpTxt.name, 'r') as f:
- self.result_captcha = f.read().replace("\n", "")
-
def get_captcha(self, image):
self.load_image(image)
bg = self.get_bg()
@@ -147,7 +133,7 @@ class LinksaveIn(OCR):
for n, letter in enumerate(letters):
self.image = letter
self.image.save(ocr.data_dir+"letter%d.png" % n)
- self.run_tesser()
+ self.run_tesser(True, True, False, False)
final += self.result_captcha
return final
diff --git a/module/plugins/captcha/MegauploadCom.py b/module/plugins/captcha/MegauploadCom.py
index 374bcd678..da8ab2cb9 100644
--- a/module/plugins/captcha/MegauploadCom.py
+++ b/module/plugins/captcha/MegauploadCom.py
@@ -6,7 +6,7 @@ class MegauploadCom(OCR):
def get_captcha(self, image):
self.load_image(image)
- self.run_tesser()
+ self.run_tesser(True, True, False, True)
return self.result_captcha
if __name__ == '__main__':
diff --git a/module/plugins/captcha/NetloadIn.py b/module/plugins/captcha/NetloadIn.py
index 9799a6a2b..c99a0744c 100644
--- a/module/plugins/captcha/NetloadIn.py
+++ b/module/plugins/captcha/NetloadIn.py
@@ -9,11 +9,7 @@ class NetloadIn(OCR):
self.to_greyscale()
self.clean(3)
self.clean(3)
- self.run_tesser()
-
- self.correct({
- ("$", "g"): "5",
- })
+ self.run_tesser(True, True, False, False)
return self.result_captcha
diff --git a/module/plugins/captcha/ShareonlineBiz.py b/module/plugins/captcha/ShareonlineBiz.py
index 91124f181..7bd5d7960 100644
--- a/module/plugins/captcha/ShareonlineBiz.py
+++ b/module/plugins/captcha/ShareonlineBiz.py
@@ -37,19 +37,9 @@ class ShareonlineBiz(OCR):
final = ""
for letter in letters:
self.image = letter
- self.run_tesser()
+ self.run_tesser(True, True, False, False)
final += self.result_captcha
- #replace common errors
- final = self.correct({
- "A": "4",
- "‘5": "3",
- ("‘1", "T"): "7",
- ("‘L", "B", "'L"): "2",
- "b": "6",
- ("I", "X"): "1"
- }, final)
-
return final
#tesseract at 60%
diff --git a/module/plugins/captcha/captcha.py b/module/plugins/captcha/captcha.py
index 283b171e0..452952533 100644
--- a/module/plugins/captcha/captcha.py
+++ b/module/plugins/captcha/captcha.py
@@ -82,15 +82,31 @@ class OCR(object):
self.image.save(tmp)
self.result_captcha = self.run(['gocr', tmp.name]).replace("\n", "")
- def run_tesser(self):
+ def run_tesser(self, subset=False, digits=True, lowercase=True, uppercase=True ):
self.logger.debug("create tmp tif")
tmp = tempfile.NamedTemporaryFile(suffix=".tif")
self.logger.debug("create tmp txt")
tmpTxt = tempfile.NamedTemporaryFile(suffix=".txt")
self.logger.debug("save tiff")
self.image.save(tmp.name, 'TIFF')
+
+ tessparams = ['tesseract', tmp.name, tmpTxt.name.replace(".txt", "")
+
+ if subset and (digits or lowercase or uppercase):
+ self.logger.debug("create temp subset config")
+ tmpSub = tempfile.NamedTemporaryFile(suffix=".subset")
+ tmpSub.write("tessedit_char_whitelist ")
+ if digits:
+ tmpSub.write("0123456789")
+ if lowercase:
+ tmpSub.write("abcdefghijklmnopqrstuvwxyz")
+ if uppercase:
+ tmpSub.write("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
+ tessparams.append("nobatch")
+ tessparams.append(tmpSub.name)
+
self.logger.debug("run tesseract")
- self.run(['tesseract', tmp.name, tmpTxt.name.replace(".txt", "")])
+ self.run(tessparams)
self.logger.debug("read txt")
with open(tmpTxt.name, 'r') as f:
diff --git a/module/plugins/hoster/NetloadIn.py b/module/plugins/hoster/NetloadIn.py
index 0b7bcd27f..9891828a2 100644
--- a/module/plugins/hoster/NetloadIn.py
+++ b/module/plugins/hoster/NetloadIn.py
@@ -57,6 +57,7 @@ class NetloadIn(Plugin):
thread.wait(self.parent)
pyfile.status.url = self.get_file_url()
+ return True
else:
return False
@@ -69,7 +70,10 @@ class NetloadIn(Plugin):
apiurl = "http://netload.in/share/fileinfos2.php"
src = self.req.load(apiurl, cookies=False, get={"file_id": match.group(1)})
self.api_data = {}
- if not src == "unknown file_data":
+ if src == "unknown_server_data":
+ self.api_data = False
+ self.html[0] = self.req.load(self.parent.url, cookies=False)
+ elif not src == "unknown file_data":
lines = src.split(";")
self.api_data["exists"] = True
self.api_data["fileid"] = lines[0]
@@ -124,7 +128,7 @@ class NetloadIn(Plugin):
self.time_plus_wait = time() + wait_seconds
def get_file_name(self):
- if self.api_data["filename"]:
+ if self.api_data and self.api_data["filename"]:
return self.api_data["filename"]
elif self.html[0]:
file_name_pattern = '\t\t\t(.+)<span style="color: #8d8d8d;">'
@@ -134,7 +138,7 @@ class NetloadIn(Plugin):
return self.parent.url
def file_exists(self):
- if self.api_data["exists"]:
+ if self.api_data and self.api_data["exists"]:
return self.api_data["exists"]
elif self.html[0] and re.search(r"The file has been deleted", self.html[0]) == None:
return True
diff --git a/module/plugins/hoster/ShareonlineBiz.py b/module/plugins/hoster/ShareonlineBiz.py
index ede810bbd..b76e83568 100644
--- a/module/plugins/hoster/ShareonlineBiz.py
+++ b/module/plugins/hoster/ShareonlineBiz.py
@@ -46,6 +46,7 @@ class ShareonlineBiz(Plugin):
pyfile.status.waituntil = self.time_plus_wait
pyfile.status.url = self.get_file_url()
pyfile.status.want_reconnect = self.want_reconnect
+ return True
else:
return False