"""Captcha reader for linksave.in built on the project's ``OCR`` base class."""

from captcha import OCR

try:
    # Pillow only ships the module inside the ``PIL`` package.
    from PIL import Image
except ImportError:
    # Legacy PIL installs expose it as a top-level module.
    import Image

from os import sep
from os.path import dirname
from os.path import abspath
from glob import glob


def _palette_lut(img):
    """Return a 256-entry list mapping pixel value -> RGB tuple for *img*.

    A 256x1 copy of the image is filled with the values 0..255 and then
    converted to RGB, so entry ``i`` is the colour that pixel value ``i``
    converts to.  NOTE(review): assumes *img* is a palette ("P") image, as
    produced by ``convert("P")`` or by opening a GIF -- confirm for other
    inputs.
    """
    strip = img.resize((256, 1))
    strip.putdata(range(256))
    return list(strip.convert("RGB").getdata())


class LinksaveIn(OCR):
    """Clean up a linksave.in captcha image and read it letter by letter."""

    def __init__(self):
        OCR.__init__(self)
        # Directory next to this module; holds the "bg/*.gif" reference
        # backgrounds and receives the intermediate debug images.
        self.data_dir = dirname(abspath(__file__)) + sep + "LinksaveIn" + sep

    def load_image(self, image):
        """Merge all frames of *image* into one RGB image.

        Every frame is walked in order and each pixel whose colour is not
        pure black is copied onto a new RGB canvas, so later frames
        overwrite earlier ones.  The result is written to
        ``unblacked.png`` in ``data_dir`` and stored in ``self.image`` /
        ``self.pixels``; ``self.result_captcha`` is reset to ``''``.

        :param image: file name (or file object) accepted by ``Image.open``.
        """
        im = Image.open(image)
        # The lookup table is taken once, from the first frame, and reused
        # for every following frame.
        lut = _palette_lut(im)

        new = Image.new("RGB", im.size)
        npix = new.load()

        frame_nr = 0
        while True:
            try:
                im.seek(frame_nr)
            except EOFError:
                # seek() past the last frame signals the end of the image.
                break
            frame = im.copy()
            pix = frame.load()
            width, height = frame.size
            for x in range(width):
                for y in range(height):
                    colour = lut[pix[x, y]]
                    if colour != (0, 0, 0):
                        npix[x, y] = colour
            frame_nr += 1

        new.save(self.data_dir + "unblacked.png")
        self.image = new.copy()
        self.pixels = self.image.load()
        self.result_captcha = ''

    def get_bg(self):
        """Return the path of the background that best matches the captcha.

        Each ``bg/*.gif`` file in ``data_dir`` is compared pixel by pixel
        with the current image; the file with the most identically
        coloured pixels wins.

        :return: path of the best matching background, or ``""`` when no
            background file exists or none shares a single pixel.
        """
        img = self.image.convert("P")
        # The captcha side of the comparison does not depend on the
        # background file, so it is prepared once outside the loop.
        lut = _palette_lut(img)
        pix = img.load()

        best_path = ""
        best_matches = 0
        for bgpath in glob(self.data_dir + "bg/*.gif"):
            bg = Image.open(bgpath)
            bglut = _palette_lut(bg)
            bgpix = bg.load()

            matches = 0
            for x in range(bg.size[0]):
                for y in range(bg.size[1]):
                    if bglut[bgpix[x, y]] == lut[pix[x, y]]:
                        matches += 1

            # Strict comparison: a background with zero matches is never
            # selected, and on a tie the earlier file is kept.
            if matches > best_matches:
                best_path = bgpath
                best_matches = matches
        return best_path

    def substract_bg(self, bgpath):
        """Paint every pixel white that has the same colour as the background.

        ``self.image`` is modified in place.

        :param bgpath: path of the background image, e.g. from ``get_bg``.
        """
        bg = Image.open(bgpath)
        img = self.image.convert("P")

        bglut = _palette_lut(bg)
        lut = _palette_lut(img)

        bgpix = bg.load()
        pix = img.load()
        orgpix = self.image.load()
        for x in range(bg.size[0]):
            for y in range(bg.size[1]):
                if lut[pix[x, y]] == bglut[bgpix[x, y]]:
                    orgpix[x, y] = (255, 255, 255)

    def eval_black_white(self):
        """Replace ``self.image`` with a 140x75 black/white version.

        A pixel becomes black when one colour channel clearly dominates
        the other two (by more than ``thresh``) or when green is the
        weakest channel; every other pixel becomes white.
        """
        new = Image.new("RGB", (140, 75))
        pix = new.load()
        orgpix = self.image.load()
        thresh = 4
        for x in range(new.size[0]):
            for y in range(new.size[1]):
                r, g, b = orgpix[x, y]
                coloured = (
                    r > max(b, g) + thresh
                    or g < min(r, b)
                    or g > max(r, b) + thresh
                    or b > max(r, g) + thresh
                )
                pix[x, y] = (0, 0, 0) if coloured else (255, 255, 255)
        self.image = new
        self.pixels = self.image.load()

    def get_captcha(self, image):
        """Run the whole pipeline on *image* and return the recognised text.

        Intermediate images (``cleaned_pass1.png``, ``cleaned_pass2.png``
        and one ``letterN.png`` per split letter) are written to
        ``data_dir``.

        :param image: captcha file passed on to ``load_image``.
        :return: concatenation of ``self.result_captcha`` after each letter.
        """
        self.load_image(image)
        bg = self.get_bg()
        self.substract_bg(bg)
        self.eval_black_white()
        # to_greyscale / clean / split_captcha_letters / run_tesser are
        # inherited from OCR; see that class for their exact behaviour.
        self.to_greyscale()
        self.image.save(self.data_dir + "cleaned_pass1.png")
        self.clean(4)
        self.clean(4)
        self.image.save(self.data_dir + "cleaned_pass2.png")
        letters = self.split_captcha_letters()
        final = ""
        for n, letter in enumerate(letters):
            self.image = letter
            # Use this instance's directory; the module-level ``ocr`` name
            # only exists when the file is executed as a script.
            self.image.save(self.data_dir + ("letter%d.png" % n))
            self.run_tesser(True, True, False, False)
            final += self.result_captcha

        return final


if __name__ == '__main__':
    # Manual self-test: download one captcha and print the recognised text.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        # Python 2 keeps urlretrieve directly in the urllib module.
        from urllib import urlretrieve

    ocr = LinksaveIn()
    testurl = "http://linksave.in/captcha/cap.php?hsh=2229185&code=ZzHdhl3UffV3lXTH5U4b7nShXj%2Bwma1vyoNBcbc6lcc%3D"
    urlretrieve(testurl, ocr.data_dir + "captcha.gif")
    print(ocr.get_captcha(ocr.data_dir + 'captcha.gif'))