Test easyocr

This commit is contained in:
Michael Soukup 2021-08-17 22:59:52 +02:00
commit 4db0055103
2 changed files with 84 additions and 0 deletions

60
bad_ocr.py Executable file
View File

@ -0,0 +1,60 @@
#!/usr/bin/env python3
"""Extract text from images."""
import os
import easyocr
import argparse
import tqdm
from toolz import compose, curry
from toolz.curried import map, filter
# Lower-case file extensions (leading dot included) that are treated as
# images when scanning the input directory.
IMG_EXT = (
    ".png",
    ".jpeg",
    ".jpg",
    ".apng",
    ".webp",
    ".avif",
)
def list_images(folder):
    """Return the paths of the image files directly inside ``folder``.

    Sub-directories are not descended into. An entry counts as an image
    when its extension, lower-cased, is one of ``IMG_EXT``.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A sorted list of ``os.path.join(folder, name)`` paths. A list
        (rather than a lazy iterator) lets callers such as progress bars
        know how many images there are.

    Raises:
        OSError: If ``folder`` cannot be listed (for example
            ``FileNotFoundError`` or ``NotADirectoryError``).
    """
    # os.scandir() raises a descriptive OSError for a bad path, whereas
    # next(os.walk(folder)) surfaced the same problem as a bare
    # StopIteration because os.walk() swallows listing errors.
    with os.scandir(folder) as entries:
        names = [entry.name for entry in entries if not entry.is_dir()]
    return sorted(
        os.path.join(folder, name)
        for name in names
        if os.path.splitext(name)[-1].lower() in IMG_EXT
    )
@curry
def ocr(reader, img_filename):
    """Run OCR on one image and save the recognised text beside it.

    The text fragments returned by the reader are joined with single
    spaces and written, followed by a newline, to a file that has the
    same path as the image but a ``.txt`` extension. An existing file at
    that path is overwritten.

    The function is curried, so ``ocr(reader)`` returns a one-argument
    callable that can be mapped over image paths.

    Args:
        reader: Object exposing ``readtext(path, detail=0)`` that returns
            an iterable of strings (an ``easyocr.Reader`` in this script).
        img_filename: Path of the image to process.

    Returns:
        None.
    """
    result = reader.readtext(img_filename, detail=0)
    basename, _ = os.path.splitext(img_filename)
    txt_filename = basename + '.txt'
    # Always write UTF-8: the platform default encoding may be unable to
    # represent the recognised characters and would raise on write.
    with open(txt_filename, 'w', encoding='utf-8') as f:
        f.write(' '.join(result) + '\n')
def main(argv=None):
    """Parse the command line and OCR every image in the given directory.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes
            argparse read the process command line.

    Exits through ``parser.error`` (status 2) when ``--lang`` contains no
    language code at all.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        'dir',
        help="input dir"
    )
    parser.add_argument(
        '-l', '--lang',
        default='no,en',
        help="comma-separated list of language codes"
    )
    args = parser.parse_args(argv)

    # Accept "no, en" and stray commas: strip blanks and drop empty items
    # so no malformed code is handed to the OCR reader.
    langs = [code.strip() for code in args.lang.split(',') if code.strip()]
    if not langs:
        parser.error("--lang must name at least one language code")

    # List the images before the slow model load so a bad directory is
    # reported immediately, and materialise them so the progress bar
    # knows the total.
    images = list(list_images(args.dir))

    print("Loading model ...")
    ocr_reader = easyocr.Reader(langs)

    print("Run")
    recognise = ocr(ocr_reader)
    # A plain loop: the calls are made for their side effect (writing the
    # .txt files), so there is no result worth collecting.
    for img_filename in tqdm.tqdm(images):
        recognise(img_filename)
    print("Done.")


if __name__ == '__main__':
    main()

24
requirements.pip Normal file
View File

@ -0,0 +1,24 @@
cycler==0.10.0
decorator==4.4.2
easyocr==1.3.2
imageio==2.9.0
kiwisolver==1.3.1
matplotlib==3.4.2
networkx==2.5.1
numpy==1.21.1
opencv-python==4.5.3.56
Pillow==8.3.1
pyparsing==2.4.7
python-bidi==0.4.2
python-dateutil==2.8.2
PyWavelets==1.1.1
PyYAML==5.4.1
scikit-image==0.18.2
scipy==1.7.0
six==1.16.0
tifffile==2021.7.2
toolz==0.11.1
torch==1.9.0
torchvision==0.10.0
tqdm==4.61.2
typing-extensions==3.10.0.0