#!/usr/bin/env python3
"""Extract text from images."""

import argparse
import os

import easyocr
import tqdm
from toolz import compose, curry
from toolz.curried import map, filter

# File extensions (lower-case, with leading dot) that are treated as images.
IMG_EXT = (
    '.png',
    '.jpeg',
    '.jpg',
    '.apng',
    '.webp',
    '.avif',
)


def list_images(folder):
    """Return an iterator over the image paths directly inside *folder*.

    Only the top level of *folder* is inspected; sub-directories are not
    descended into.  An entry counts as an image when its extension,
    compared case-insensitively, is one of ``IMG_EXT``.  Paths are yielded
    in sorted name order so repeated runs process files in the same order.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A lazy iterator of ``os.path.join(folder, name)`` strings.

    Raises:
        OSError: If *folder* cannot be scanned (missing, not a directory,
            permission denied).  Raised when the function is called, not
            when the result is iterated.
    """
    # os.scandir reports the real error for a bad folder; the previous
    # next(os.walk(...)) swallowed it and surfaced a bare StopIteration.
    with os.scandir(folder) as entries:
        names = sorted(
            entry.name for entry in entries if not entry.is_dir()
        )
    return compose(
        map(lambda name: os.path.join(folder, name)),
        filter(lambda name: os.path.splitext(name)[-1].lower() in IMG_EXT),
    )(names)


@curry
def ocr(reader, img_filename):
    """Run OCR on one image and write the text next to it as ``.txt``.

    The function is curried, so ``ocr(reader)`` yields a one-argument
    callable that can be applied to each image path.

    Args:
        reader: Object with a ``readtext(path, detail=0)`` method returning
            a sequence of strings (an ``easyocr.Reader`` in this script).
        img_filename: Path of the image to read.

    Returns:
        None.  The recognised fragments are joined with single spaces and
        written, followed by a newline, to the image path with its
        extension replaced by ``.txt``.

    Note:
        Two images that differ only in extension (``a.png`` / ``a.jpg``)
        map to the same ``a.txt``; the later one overwrites the earlier.
    """
    result = reader.readtext(img_filename, detail=0)
    basename, _ = os.path.splitext(img_filename)
    txt_filename = basename + '.txt'
    # Explicit UTF-8: recognised text routinely contains non-ASCII
    # characters, and the locale default encoding may not represent them.
    with open(txt_filename, 'w', encoding='utf-8') as f:
        f.write(' '.join(result) + '\n')


def main():
    """Parse the command line and OCR every image in the given directory."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        'dir',
        help="input dir"
    )
    parser.add_argument(
        '-l', '--lang',
        default='no,en',
        help="comma-separated list of language codes"
    )
    args = parser.parse_args()

    # Fail with a usage error before the (slow) model load if the input
    # directory is unusable.
    if not os.path.isdir(args.dir):
        parser.error("not a directory: {}".format(args.dir))

    # Tolerate whitespace and stray commas, e.g. "no, en" or "no,en,".
    languages = [
        code.strip() for code in args.lang.split(',') if code.strip()
    ]
    if not languages:
        parser.error("no language codes given")

    print("Loading model ...")
    ocr_reader = easyocr.Reader(languages)

    print("Run")
    # Materialise the listing so tqdm knows the total and can show progress.
    images = list(list_images(args.dir))
    recognise = ocr(ocr_reader)
    for img_filename in tqdm.tqdm(images):
        recognise(img_filename)

    print("Done.")


if __name__ == '__main__':
    main()