Skip to content

Instantly share code, notes, and snippets.

@ghandic
Created July 17, 2018 16:29
Show Gist options
  • Save ghandic/7f9771d5e03a738d8ddddcd9ec808b42 to your computer and use it in GitHub Desktop.
Save ghandic/7f9771d5e03a738d8ddddcd9ec808b42 to your computer and use it in GitHub Desktop.
Running Tesseract separately on lots of small images is much slower than running it once on a multipage TIFF, so combine the images into a single multipage TIFF first (multiprocessing is still to be implemented and should bring the major speed gains).
## Tesseract4
# 2 images into tiff: 843 msec 796
# 10 images into tiff: 1003 msec
# 53 images (same as page 1) into tiff: 1740 msec
## Tesseract3
# 2 images into tiff: 727 msec
# 10 images into tiff: 816 msec
# 53 images (same as page 1) into tiff: 1620 msec
import tempfile
from PIL import Image
import cv2
import subprocess
import pandas as pd
from io import StringIO, BytesIO
import os
import requests
from collections import namedtuple
# One page of the multipage TIFF: `image` is the picture to OCR and `field`
# is the key under which tess_from_array reports that picture's text.
TiffPage = namedtuple('TiffPage', 'field image')
# Name of the Tesseract executable that tess_from_array launches.
TESS_EXE='tesseract4'
def tess_from_array(arr):
    """OCR a batch of images with a single Tesseract invocation.

    All images are written as pages of one temporary multipage TIFF, which
    is fed to Tesseract on stdin; the TSV report read from stdout is then
    split back up per page.

    Args:
        arr: Sequence of items exposing ``.field`` (result key) and
            ``.image`` (object with a PIL-style ``save`` method), e.g.
            ``TiffPage`` instances. Page ``i + 1`` of the TIFF is ``arr[i]``.

    Returns:
        dict mapping every ``field`` to the space-joined text reported for
        its page, or ``''`` when nothing was reported. An empty ``arr``
        yields ``{}``.

    Raises:
        subprocess.CalledProcessError: Tesseract exited with a non-zero
            status.
    """
    # Nothing to OCR: avoid indexing into an empty image list below.
    if not arr:
        return {}

    tiff_res = {tp.field: '' for tp in arr}
    images = [tp.image for tp in arr]

    cmd = [
        TESS_EXE, '-', 'stdout',
        '--psm', '7',
        '-c', 'tessedit_unrej_any_wd=1',
        '-c', 'load_system_dawg=F',
        '-c', 'load_freq_dawg=F',
        '--tessdata-dir', '/usr/local/share/tessdata',
        'tsv', 'quiet',
    ]

    # The context manager guarantees the temporary file is closed (and thus
    # removed) even if saving the TIFF or running Tesseract fails.
    with tempfile.TemporaryFile() as tmp:
        images[0].save(tmp, 'TIFF', save_all=True,
                       append_images=images[1:], dpi=(300, 300))
        tmp.seek(0)  # rewind so Tesseract reads the TIFF from its start
        completed = subprocess.run(cmd, stdin=tmp, stdout=subprocess.PIPE,
                                   check=True)

    # Double quotes are dropped before parsing, presumably so read_csv's
    # quote handling cannot merge fields when the OCR text contains one.
    tsv = completed.stdout.decode('utf-8').replace('"', '')

    # Read `text` strictly as strings and disable default NA detection:
    # otherwise recognized words such as "NA" or "null" are discarded and an
    # all-numeric column is parsed as floats, which breaks the join below.
    df = pd.read_csv(StringIO(tsv), sep='\t', dtype={'text': str},
                     keep_default_na=False)

    for page_num, words in df.groupby('page_num')['text']:
        # Tesseract page numbers are 1-based; arr is 0-based.
        tiff_res[arr[int(page_num) - 1].field] = ' '.join(words).strip()
    return tiff_res
if __name__ == '__main__':
    # Demo: download two sample images, OCR them in one Tesseract run and
    # print the recognized text keyed by source URL.
    urls = ['http://www.edocfile.com/images/screenfortextsearch.jpg',
            'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTwxoaykiHAb0-LjKgJ15wRlhn5S24CKUcknQdQPWelyGdfsmHp']
    arr = []
    for url in urls:
        # A timeout keeps the demo from hanging on an unresponsive host.
        response = requests.get(url, timeout=30)
        # Fail loudly on HTTP errors instead of handing an error page to PIL.
        response.raise_for_status()
        arr.append(TiffPage(url, Image.open(BytesIO(response.content))))
    print(tess_from_array(arr))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment