From 4db0055103f5811a4f9b2d2558b56c0b78372f99 Mon Sep 17 00:00:00 2001
From: Michael Soukup
Date: Tue, 17 Aug 2021 22:59:52 +0200
Subject: [PATCH] Test easyocr

---
 bad_ocr.py       | 103 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.pip |  24 +++++++++++
 2 files changed, 127 insertions(+)
 create mode 100755 bad_ocr.py
 create mode 100644 requirements.pip

diff --git a/bad_ocr.py b/bad_ocr.py
new file mode 100755
index 0000000..6e69ad8
--- /dev/null
+++ b/bad_ocr.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""Extract text from images."""
+
+import argparse
+import os
+
+import easyocr
+import tqdm
+from toolz import curry
+
+
+# File extensions (lower-case, with leading dot) treated as images.
+IMG_EXT = (
+    '.png',
+    '.jpeg',
+    '.jpg',
+    '.apng',
+    '.webp',
+    '.avif',
+)
+
+
+def list_images(folder):
+    """Return the sorted paths of the image files directly inside *folder*.
+
+    Only the top level of *folder* is scanned; a file counts as an image
+    when its extension, compared case-insensitively, is in ``IMG_EXT``.
+
+    Raises:
+        NotADirectoryError: if *folder* is not an existing directory.
+    """
+    if not os.path.isdir(folder):
+        raise NotADirectoryError(folder)
+    # os.walk silently yields nothing when the directory cannot be listed
+    # (e.g. no read permission); fall back to an empty listing instead of
+    # letting next() leak StopIteration.
+    _, _, files = next(os.walk(folder), (folder, [], []))
+    return sorted(
+        os.path.join(folder, name)
+        for name in files
+        if os.path.splitext(name)[-1].lower() in IMG_EXT
+    )
+
+
+@curry
+def ocr(reader, img_filename):
+    """Run OCR on one image and save the text next to it.
+
+    The recognised fragments are joined with single spaces and written,
+    followed by a newline, to a file with the same path as the image but
+    a ``.txt`` extension.  Curried: ``ocr(reader)`` returns a function of
+    the image path.
+
+    Args:
+        reader: object whose ``readtext(path, detail=0)`` returns a list
+            of strings, such as ``easyocr.Reader``.
+        img_filename: path of the image to read.
+
+    Returns:
+        The path of the text file that was written.
+    """
+    result = reader.readtext(img_filename, detail=0)
+    basename, _ = os.path.splitext(img_filename)
+    txt_filename = basename + '.txt'
+    # Explicit UTF-8: recognised text is routinely non-ASCII and the
+    # platform default encoding may be unable to represent it.
+    with open(txt_filename, 'w', encoding='utf-8') as f:
+        f.write(' '.join(result) + '\n')
+    return txt_filename
+
+
+def main():
+    """Parse the command line and OCR every image in the input dir."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        'dir',
+        help="input dir"
+    )
+    parser.add_argument(
+        '-l', '--lang',
+        default='no,en',
+        help="comma-separated list of language codes"
+    )
+    args = parser.parse_args()
+
+    if not os.path.isdir(args.dir):
+        parser.error("not a directory: " + args.dir)
+    # Tolerate spaces and stray commas, e.g. "no, en".
+    langs = [code.strip() for code in args.lang.split(',') if code.strip()]
+    if not langs:
+        parser.error("no language codes given")
+
+    print("Loading model ...")
+    ocr_reader = easyocr.Reader(langs)
+    print("Run")
+    # A concrete list gives tqdm a total, so the bar shows real progress.
+    for img_filename in tqdm.tqdm(list_images(args.dir)):
+        ocr(ocr_reader, img_filename)
+    print("Done.")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.pip b/requirements.pip
new file mode 100644
index 0000000..c71394e
--- /dev/null
+++ b/requirements.pip
@@ -0,0 +1,24 @@
+cycler==0.10.0
+decorator==4.4.2
+easyocr==1.3.2
+imageio==2.9.0
+kiwisolver==1.3.1
+matplotlib==3.4.2
+networkx==2.5.1
+numpy==1.21.1
+opencv-python==4.5.3.56
+Pillow==8.3.1
+pyparsing==2.4.7
+python-bidi==0.4.2
+python-dateutil==2.8.2
+PyWavelets==1.1.1
+PyYAML==5.4.1
+scikit-image==0.18.2
+scipy==1.7.0
+six==1.16.0
+tifffile==2021.7.2
+toolz==0.11.1
+torch==1.9.0
+torchvision==0.10.0
+tqdm==4.61.2
+typing-extensions==3.10.0.0