Note
You are reading the documentation for MMOCR 0.x, which will be deprecated by the end of 2022. We recommend upgrading to MMOCR 1.0 to benefit from the new features and better performance brought by OpenMMLab 2.0. Check out the maintenance plan, changelog, code and documentation of MMOCR 1.0 for more details.
Source code for mmocr.datasets.utils.parser
# Copyright (c) OpenMMLab. All rights reserved.
import json
import warnings
from mmocr.datasets.builder import PARSERS
from mmocr.utils import StringStrip
@PARSERS.register_module()
class LineStrParser:
    """Parse string of one line in annotation file to dict format.

    Args:
        keys (list[str]): Keys in result dict.
        keys_idx (list[int]): Value index in sub-string list
            for each key above.
        separator (str): Separator to separate string to list of sub-string.
        **kwargs: Forwarded unchanged to ``StringStrip``; the resulting
            callable is applied to every line before it is split.
    """

    def __init__(self,
                 keys=['filename', 'text'],
                 keys_idx=[0, 1],
                 separator=' ',
                 **kwargs):
        # Validation is kept as assertions so that invalid arguments keep
        # raising ``AssertionError`` exactly as before.
        assert isinstance(keys, list)
        assert isinstance(keys_idx, list)
        assert isinstance(separator, str)
        assert len(keys) > 0
        assert len(keys) == len(keys_idx)
        # Store copies: the default lists are shared between all calls, and
        # a caller may mutate the list it passed in after construction.
        self.keys = list(keys)
        self.keys_idx = list(keys_idx)
        self.separator = separator
        self.strip_cls = StringStrip(**kwargs)

    def get_item(self, data_ret, index):
        """Parse one annotation line into a dict.

        Args:
            data_ret: Sequence of annotation lines supporting ``len()`` and
                integer indexing.
            index (int): Line index; it is wrapped modulo ``len(data_ret)``.

        Returns:
            dict: Maps each entry of ``self.keys`` to the sub-string found
            at the matching position of ``self.keys_idx``.

        Raises:
            IndexError: If the line has too few sub-strings for the largest
                index in ``self.keys_idx``.
            ZeroDivisionError: If ``data_ret`` is empty.
        """
        # Wrap around so that an index past the end maps to an existing line.
        line_str = self.strip_cls(data_ret[index % len(data_ret)])
        fields = line_str.split(self.separator)

        # Blanks can only spill into extra fields when the separator itself
        # is a single blank; with any other separator they stay inside their
        # field, so warning there would be a false alarm.
        if self.separator == ' ' and len(fields) > 2:
            warnings.warn(
                'More than two blank spaces were detected. '
                'Please use LineJsonParser to handle '
                'annotations with blanks. '
                'Check Doc '
                'https://mmocr.readthedocs.io/en/latest/'
                'tutorials/blank_recog.html '
                'for details.')

        max_idx = max(self.keys_idx)
        if len(fields) <= max_idx:
            # IndexError is a subclass of Exception, so callers catching the
            # previous generic Exception keep working.
            raise IndexError(
                f'key index: {max_idx} out of range: {fields}')

        return {
            key: fields[idx]
            for key, idx in zip(self.keys, self.keys_idx)
        }
@PARSERS.register_module()
class LineJsonParser:
    """Parse json-string of one line in annotation file to dict format.

    Args:
        keys (list[str]): Keys in both json-string and result dict. Must be
            a non-empty list; the empty default fails the check below, so
            a value effectively has to be supplied.
    """

    def __init__(self, keys=[]):
        # Validation is kept as assertions so that invalid arguments keep
        # raising ``AssertionError`` exactly as before.
        assert isinstance(keys, list)
        assert len(keys) > 0
        # Store a copy so later mutation of the caller's list cannot change
        # which keys this parser extracts.
        self.keys = list(keys)

    def get_item(self, data_ret, index):
        """Parse one JSON annotation line into a dict.

        Args:
            data_ret: Sequence of JSON strings supporting ``len()`` and
                integer indexing.
            index (int): Line index; it is wrapped modulo ``len(data_ret)``.

        Returns:
            dict: The entries of the decoded JSON object restricted to
            ``self.keys``.

        Raises:
            Exception: If one of ``self.keys`` is missing from the decoded
                line.
            ZeroDivisionError: If ``data_ret`` is empty.
        """
        # Wrap around so that an index past the end maps to an existing line.
        line_json_obj = json.loads(data_ret[index % len(data_ret)])

        line_info = {}
        for key in self.keys:
            if key not in line_json_obj:
                raise Exception(f'key {key} not in line json {line_json_obj}')
            line_info[key] = line_json_obj[key]
        return line_info