Shortcuts

Note

You are reading the documentation for MMOCR 0.x, which will soon be deprecated by the end of 2022. We recommend you upgrade to MMOCR 1.0 to enjoy fruitful new features and better performance brought by OpenMMLab 2.0. Check out the maintenance plan, changelog, code and documentation of MMOCR 1.0 for more details.

Source code for mmocr.utils.lmdb_util

# Copyright (c) OpenMMLab. All rights reserved.
import json
import os
import os.path as osp

import cv2
import lmdb
import numpy as np

from mmocr.utils import list_from_file


def check_image_is_valid(imageBin):
    if imageBin is None:
        return False
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    imgH, imgW = img.shape[0], img.shape[1]
    if imgH * imgW == 0:
        return False
    return True


def parse_line(line, format):
    if format == 'txt':
        img_name, text = line.split(' ')
    else:
        line = json.loads(line)
        img_name = line['filename']
        text = line['text']
    return img_name, text


def write_cache(env, cache):
    with env.begin(write=True) as txn:
        cursor = txn.cursor()
        cursor.putmulti(cache, dupdata=False, overwrite=True)


[docs]def recog2lmdb(img_root, label_path, output, label_format='txt', label_only=False, batch_size=1000, encoding='utf-8', lmdb_map_size=1099511627776, verify=True): """Create text recognition dataset to LMDB format. Args: img_root (str): Path to images. label_path (str): Path to label file. output (str): LMDB output path. label_format (str): Format of the label file, either txt or jsonl. label_only (bool): Only convert label to lmdb format. batch_size (int): Number of files written to the cache each time. encoding (str): Label encoding method. lmdb_map_size (int): Maximum size database may grow to. verify (bool): If true, check the validity of every image.Defaults to True. E.g. This function supports MMOCR's recognition data format and the label file can be txt or jsonl, as follows: ├──img_root | |—— img1.jpg | |—— img2.jpg | |—— ... |——label.txt (or label.jsonl) label.txt: img1.jpg HELLO img2.jpg WORLD ... label.jsonl: {'filename':'img1.jpg', 'text':'HELLO'} {'filename':'img2.jpg', 'text':'WORLD'} ... """ # check label format assert osp.basename(label_path).split('.')[-1] == label_format # create lmdb env os.makedirs(output, exist_ok=True) env = lmdb.open(output, map_size=lmdb_map_size) # load label file anno_list = list_from_file(label_path, encoding=encoding) cache = [] # index start from 1 cnt = 1 n_samples = len(anno_list) for anno in anno_list: label_key = 'label-%09d'.encode(encoding) % cnt img_name, text = parse_line(anno, label_format) if label_only: # convert only labels to lmdb line = json.dumps( dict(filename=img_name, text=text), ensure_ascii=False) cache.append((label_key, line.encode(encoding))) else: # convert both images and labels to lmdb img_path = osp.join(img_root, img_name) if not osp.exists(img_path): print('%s does not exist' % img_path) continue with open(img_path, 'rb') as f: image_bin = f.read() if verify: try: if not check_image_is_valid(image_bin): print('%s is not a valid image' % img_path) continue except Exception: print('error occurred at ', img_name) image_key = 'image-%09d'.encode(encoding) % cnt cache.append((image_key, image_bin)) cache.append((label_key, text.encode(encoding))) if cnt % batch_size == 0: write_cache(env, cache) cache = [] print('Written %d / %d' % (cnt, n_samples)) cnt += 1 n_samples = cnt - 1 cache.append( ('num-samples'.encode(encoding), str(n_samples).encode(encoding))) write_cache(env, cache) print('Created lmdb dataset with %d samples' % n_samples)
Read the Docs v: v0.6.3
Versions
latest
stable
v0.6.3
v0.6.2
v0.6.1
v0.6.0
v0.5.0
v0.4.1
v0.4.0
v0.3.0
v0.2.1
v0.2.0
v0.1.0
dev-1.x
Downloads
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.