bad-ocr/easyocr/bad_ocr.py

61 lines
1.3 KiB
Python
Executable File

#!/usr/bin/env python3
"""Extract text from images."""
import os
import easyocr
import argparse
import tqdm
from toolz import compose, curry
from toolz.curried import map, filter
# File-name extensions (lower-case, leading dot included) recognised as images.
IMG_EXT = ('.png', '.jpeg', '.jpg', '.apng', '.webp', '.avif')
def list_images(folder, *, extensions=None):
    """Return the image files located directly inside *folder*.

    Only the top level of *folder* is inspected; sub-directories are not
    descended into and directory entries are never returned. An entry counts
    as an image when its extension, compared case-insensitively, is one of
    *extensions*.

    Args:
        folder: Path of the directory to scan.
        extensions: Iterable of lower-case extensions including the leading
            dot (e.g. ``'.png'``). Defaults to the module-level ``IMG_EXT``.

    Returns:
        A list of paths (*folder* joined with each matching file name),
        sorted by file name so that repeated runs process files in the same
        order.

    Raises:
        OSError: If *folder* does not exist or is not a directory
            (``FileNotFoundError`` / ``NotADirectoryError`` from
            ``os.scandir``).
    """
    if extensions is None:
        extensions = IMG_EXT
    wanted = tuple(extensions)

    # os.scandir reports a bad folder with a descriptive OSError, whereas
    # next(os.walk(folder)) would leak a bare StopIteration because os.walk
    # silently yields nothing for an unreadable or missing directory.
    with os.scandir(folder) as entries:
        names = sorted(entry.name for entry in entries if not entry.is_dir())

    return [
        os.path.join(folder, name)
        for name in names
        if os.path.splitext(name)[-1].lower() in wanted
    ]
@curry
def ocr(reader, img_filename):
    """Run OCR on one image and save the recognised text next to it.

    The fragments returned by ``reader.readtext(img_filename, detail=0)`` are
    joined with single spaces and written, followed by a newline, to a file
    whose name is the image path with its extension replaced by ``.txt``.
    An existing file of that name is overwritten.

    The function is curried: ``ocr(reader)`` yields a one-argument callable
    that still expects the image path.

    Args:
        reader: Object exposing ``readtext(path, detail=0)`` that returns an
            iterable of strings (the script passes an ``easyocr.Reader``).
        img_filename: Path of the image to process.
    """
    fragments = reader.readtext(img_filename, detail=0)
    stem, _ = os.path.splitext(img_filename)
    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent recognised non-ASCII characters and would make the output
    # differ between machines.
    with open(stem + '.txt', 'w', encoding='utf-8') as out:
        out.write(' '.join(fragments) + '\n')
def main(argv=None):
    """Command-line entry point: OCR every image in a directory.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Exits through ``parser.error`` (status 2) when the input path is not a
    directory or when no language code remains after parsing ``--lang``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        'dir',
        help="input dir"
    )
    parser.add_argument(
        '-l', '--lang',
        default='no,en',
        help="comma-separated list of language codes"
    )
    args = parser.parse_args(argv)

    # Validate cheap inputs first so a typo does not cost a full model load.
    if not os.path.isdir(args.dir):
        parser.error("not a directory: {}".format(args.dir))
    # Tolerate "no, en" and trailing commas.
    languages = [code.strip() for code in args.lang.split(',') if code.strip()]
    if not languages:
        parser.error("no language codes given")

    print("Loading model ...")
    ocr_reader = easyocr.Reader(languages)
    print("Run")
    # Materialise the file list so tqdm knows the total and can show progress.
    images = list(list_images(args.dir))
    for img_filename in tqdm.tqdm(images):
        ocr(ocr_reader, img_filename)
    print("Done.")


if __name__ == '__main__':
    main()