61 lines
1.3 KiB
Python
Executable File
61 lines
1.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Extract text from images."""
|
|
|
|
import os
|
|
import easyocr
|
|
import argparse
|
|
import tqdm
|
|
from toolz import compose, curry
|
|
from toolz.curried import map, filter
|
|
|
|
|
|
# File suffixes (lower-case, leading dot included) treated as images.
# list_images() lower-cases a file's suffix before looking it up here.
IMG_EXT = ('.png', '.jpeg', '.jpg', '.apng', '.webp', '.avif')
|
|
|
|
def list_images(folder, extensions=None):
    """Return the paths of the image files located directly in *folder*.

    Only the top level of *folder* is inspected; sub-directories are not
    searched. A file counts as an image when its suffix, compared
    case-insensitively, is one of *extensions*.

    Args:
        folder: Directory to scan.
        extensions: Iterable of suffixes including the leading dot
            (e.g. ``('.png', '.jpg')``). Defaults to ``IMG_EXT``.

    Returns:
        A list of ``os.path.join(folder, name)`` paths, sorted by file name
        so that repeated runs process the files in the same order.

    Raises:
        OSError: If *folder* cannot be scanned, e.g. ``FileNotFoundError``
            when it does not exist or ``NotADirectoryError`` when it is a
            regular file.
    """
    if extensions is None:
        extensions = IMG_EXT
    wanted = frozenset(ext.lower() for ext in extensions)

    # os.scandir reports a missing/unreadable folder as a real OSError,
    # whereas next(os.walk(folder)) would surface a bare StopIteration.
    with os.scandir(folder) as entries:
        names = sorted(
            entry.name
            for entry in entries
            # is_file() also skips broken symlinks, which could not be
            # opened as images anyway.
            if entry.is_file()
            and os.path.splitext(entry.name)[-1].lower() in wanted
        )
    return [os.path.join(folder, name) for name in names]
|
|
|
|
@curry
def ocr(reader, img_filename):
    """Run OCR on one image and save the recognised text beside it.

    The function is curried: ``ocr(reader)`` returns a callable that only
    needs the image path, which makes it convenient to map over many files.

    Args:
        reader: Object providing ``readtext(path, detail=0)`` and returning
            an iterable of text fragments (an ``easyocr.Reader`` in this
            script).
        img_filename: Path of the image to read.

    Returns:
        The path of the text file that was written: *img_filename* with its
        extension replaced by ``.txt``.
    """
    fragments = reader.readtext(img_filename, detail=0)
    stem, _ = os.path.splitext(img_filename)
    txt_filename = stem + '.txt'
    # NOTE: images sharing a stem (a.png / a.jpg) map to the same a.txt,
    # so the later one overwrites the earlier one.
    # Write UTF-8 explicitly: recognised text may contain characters the
    # platform's default encoding cannot represent.
    with open(txt_filename, 'w', encoding='utf-8') as f:
        f.write(' '.join(fragments) + '\n')
    return txt_filename
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: OCR every image in the given directory and
    # write one .txt file per image next to it.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        'dir',
        help="input dir"
    )
    parser.add_argument(
        '-l', '--lang',
        default='no,en',
        help="comma-separated list of language codes"
    )
    args = parser.parse_args()

    # Validate the cheap inputs before the slow model load so that a typo
    # in the arguments fails immediately with a usage message.
    if not os.path.isdir(args.dir):
        parser.error("not a directory: %r" % args.dir)
    # Tolerate spaces and stray commas, e.g. "no, en" or "no,en,".
    languages = [code.strip() for code in args.lang.split(',') if code.strip()]
    if not languages:
        parser.error("no language codes given")

    # Materialise the file list so tqdm knows the total and can show a
    # real progress bar with an ETA.
    images = list(list_images(args.dir))

    print("Loading model ...")
    ocr_reader = easyocr.Reader(languages)
    print("Run")
    for image_path in tqdm.tqdm(images):
        ocr(ocr_reader, image_path)
    print("Done.")
|
|
|