Shortcuts

Note

You are reading the documentation for MMOCR 0.x, which will be deprecated by the end of 2022. We recommend that you upgrade to MMOCR 1.0 to enjoy the rich new features and better performance brought by OpenMMLab 2.0. Check out the maintenance plan, changelog, code and documentation of MMOCR 1.0 for more details.

Source code for mmocr.datasets.pipelines.ocr_transforms

# Copyright (c) OpenMMLab. All rights reserved.
import math

import mmcv
import numpy as np
import torch
import torchvision.transforms.functional as TF
from mmcv.runner.dist_utils import get_dist_info
from mmdet.datasets.builder import PIPELINES
from PIL import Image
from shapely.geometry import Polygon
from shapely.geometry import box as shapely_box

import mmocr.utils as utils
from mmocr.datasets.pipelines.crop import warp_img


@PIPELINES.register_module()
class ResizeOCR:
    """Image resizing and padding for OCR.

    Args:
        height (int | tuple(int)): Image height after resizing.
        min_width (none | int | tuple(int)): Image minimum width
            after resizing.
        max_width (none | int | tuple(int)): Image maximum width
            after resizing.
        keep_aspect_ratio (bool): Keep image aspect ratio if True
            during resizing, Otherwise resize to the size
            height * max_width.
        img_pad_value (Number | Sequence[Number]): Values to be filled
            in padding areas when padding_mode is 'constant'. Default: 0.
        width_downsample_ratio (float): Downsample ratio in horizontal
            direction from input image to output feature.
        backend (str | None): The image resize backend type. Options are
            `cv2`, `pillow`, `None`. If backend is None, the global
            imread_backend specified by ``mmcv.use_backend()`` will be
            used. Default: None.
        padding_mode (str): Type of padding. Should be: constant, edge,
            reflect or symmetric. Default: constant.

            - constant: pads with a constant value, this value is
              specified with img_pad_value.
            - edge: pads with the last value at the edge of the image.
            - reflect: pads with reflection of image without repeating
              the last value on the edge. For example, padding
              [1, 2, 3, 4] with 2 elements on both sides in reflect mode
              will result in [3, 2, 1, 2, 3, 4, 3, 2].
            - symmetric: pads with reflection of image repeating the last
              value on the edge. For example, padding [1, 2, 3, 4] with
              2 elements on both sides in symmetric mode will result in
              [2, 1, 1, 2, 3, 4, 4, 3]

    Note:
        When ``keep_aspect_ratio`` is True the target width is snapped to
        a multiple of ``int(1 / width_downsample_ratio)``. The snapped
        width is never allowed to drop below one such multiple, so very
        narrow inputs are not resized to a zero-width image.
    """

    def __init__(self,
                 height,
                 min_width=None,
                 max_width=None,
                 keep_aspect_ratio=True,
                 img_pad_value=0,
                 width_downsample_ratio=1.0 / 16,
                 backend=None,
                 padding_mode='constant'):
        assert isinstance(height, (int, tuple))
        assert utils.is_none_or_type(min_width, (int, tuple))
        assert utils.is_none_or_type(max_width, (int, tuple))
        if not keep_aspect_ratio:
            assert max_width is not None, ('"max_width" must assigned '
                                           'if "keep_aspect_ratio" is False')
        if isinstance(height, tuple):
            # Multi-scale mode: one (height, min_width, max_width) triple
            # per entry, so all three must be equally long tuples.
            assert isinstance(min_width, tuple)
            assert isinstance(max_width, tuple)
            assert len(height) == len(min_width) == len(max_width)

        self.height = height
        self.min_width = min_width
        self.max_width = max_width
        self.keep_aspect_ratio = keep_aspect_ratio
        self.img_pad_value = img_pad_value
        self.width_downsample_ratio = width_downsample_ratio
        self.backend = backend
        self.padding_mode = padding_mode

    def __call__(self, results):
        """Resize (and optionally pad) ``results['img']``.

        Args:
            results (dict): Pipeline dict; reads ``img`` and ``img_shape``.

        Returns:
            dict: The same dict with ``img``, ``img_shape``,
            ``resize_shape``, ``pad_shape`` and ``valid_ratio`` updated.
        """
        rank, _ = get_dist_info()
        if isinstance(self.height, int):
            dst_height = self.height
            dst_min_width = self.min_width
            dst_max_width = self.max_width
        else:
            # Multi-scale resize used in distributed training.
            # Choose one (height, width) pair for one rank id.
            idx = rank % len(self.height)
            dst_height = self.height[idx]
            dst_min_width = self.min_width[idx]
            dst_max_width = self.max_width[idx]

        ori_height, ori_width = results['img_shape'][:2]
        valid_ratio = 1.0

        if self.keep_aspect_ratio:
            new_width = math.ceil(float(dst_height) / ori_height * ori_width)
            width_divisor = int(1 / self.width_downsample_ratio)
            # make sure new_width is an integral multiple of width_divisor.
            if new_width % width_divisor != 0:
                # Rounding may yield 0 for very narrow inputs, which is
                # not a valid resize target; keep at least one multiple.
                new_width = max(
                    width_divisor,
                    round(new_width / width_divisor) * width_divisor)
            if dst_min_width is not None:
                new_width = max(dst_min_width, new_width)

            if dst_max_width is not None:
                # Fraction of the (possibly padded) width holding real
                # image content.
                valid_ratio = min(1.0, 1.0 * new_width / dst_max_width)
                resize_width = min(dst_max_width, new_width)
                img_resize = mmcv.imresize(
                    results['img'], (resize_width, dst_height),
                    backend=self.backend)
                resize_shape = img_resize.shape
                pad_shape = img_resize.shape
                if new_width < dst_max_width:
                    img_resize = mmcv.impad(
                        img_resize,
                        shape=(dst_height, dst_max_width),
                        pad_val=self.img_pad_value,
                        padding_mode=self.padding_mode)
                    pad_shape = img_resize.shape
            else:
                img_resize = mmcv.imresize(
                    results['img'], (new_width, dst_height),
                    backend=self.backend)
                resize_shape = img_resize.shape
                pad_shape = img_resize.shape
        else:
            # Aspect ratio ignored: stretch to exactly max_width x height.
            img_resize = mmcv.imresize(
                results['img'], (dst_max_width, dst_height),
                backend=self.backend)
            resize_shape = img_resize.shape
            pad_shape = img_resize.shape

        results['img'] = img_resize
        results['img_shape'] = resize_shape
        results['resize_shape'] = resize_shape
        results['pad_shape'] = pad_shape
        results['valid_ratio'] = valid_ratio

        return results
@PIPELINES.register_module()
class ToTensorOCR:
    """Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor.

    The image stored under ``results['img']`` is copied before the
    conversion and the resulting tensor is written back to the same key.
    """

    def __init__(self):
        pass

    def __call__(self, results):
        """Replace ``results['img']`` with its tensor form."""
        image_copy = results['img'].copy()
        results['img'] = TF.to_tensor(image_copy)
        return results
@PIPELINES.register_module()
class NormalizeOCR:
    """Normalize a tensor image with mean and standard deviation.

    Args:
        mean: Mean value(s) forwarded to ``TF.normalize``.
        std: Standard deviation value(s) forwarded to ``TF.normalize``.
    """

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, results):
        """Normalize ``results['img']`` and record the parameters used
        under ``results['img_norm_cfg']``."""
        normalized = TF.normalize(results['img'], self.mean, self.std)
        results['img'] = normalized
        results['img_norm_cfg'] = {'mean': self.mean, 'std': self.std}
        return results
@PIPELINES.register_module()
class OnlineCropOCR:
    """Crop text areas from whole image with bounding box jitter. If no bbox
    is given, return directly.

    Args:
        box_keys (list[str]): Keys in results which correspond to RoI bbox.
        jitter_prob (float): The probability of box jitter.
        max_jitter_ratio_x (float): Maximum horizontal jitter ratio
            relative to height.
        max_jitter_ratio_y (float): Maximum vertical jitter ratio
            relative to height.
    """

    def __init__(self,
                 box_keys=['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4'],
                 jitter_prob=0.5,
                 max_jitter_ratio_x=0.05,
                 max_jitter_ratio_y=0.02):
        assert utils.is_type_list(box_keys, str)
        assert 0 <= jitter_prob <= 1
        assert 0 <= max_jitter_ratio_x <= 1
        assert 0 <= max_jitter_ratio_y <= 1

        # Copy so that instances never share (and cannot mutate) the
        # default list object declared in the signature.
        self.box_keys = list(box_keys)
        self.jitter_prob = jitter_prob
        self.max_jitter_ratio_x = max_jitter_ratio_x
        self.max_jitter_ratio_y = max_jitter_ratio_y

    def __call__(self, results):
        """Crop ``results['img']`` to the box stored in ``img_info``.

        Args:
            results (dict): Pipeline dict. Returned untouched when it has
                no ``img_info`` or when any of ``box_keys`` is missing
                from ``img_info``.

        Returns:
            dict: The same dict with ``img`` and ``img_shape`` replaced by
            the cropped image and its shape.
        """
        if 'img_info' not in results:
            return results

        img_info = results['img_info']
        # Cropping needs every box coordinate; skip if any is absent.
        if any(key not in img_info for key in self.box_keys):
            return results
        box = [float(img_info[key]) for key in self.box_keys]

        # np.random.random() is uniform on [0, 1), so this is True with
        # probability ``jitter_prob`` (never for 0, always for 1).
        jitter_flag = np.random.random() < self.jitter_prob

        kwargs = dict(
            jitter_flag=jitter_flag,
            jitter_ratio_x=self.max_jitter_ratio_x,
            jitter_ratio_y=self.max_jitter_ratio_y)
        crop_img = warp_img(results['img'], box, **kwargs)

        results['img'] = crop_img
        results['img_shape'] = crop_img.shape

        return results
@PIPELINES.register_module()
class FancyPCA:
    """Implementation of PCA based image augmentation, proposed in the paper
    ``Imagenet Classification With Deep Convolutional Neural Networks``.

    It alters the intensities of RGB values along the principal components of
    ImageNet dataset.

    Args:
        eig_vec (Tensor | None): 3x3 eigenvector matrix. A built-in
            default is used when None.
        eig_val (Tensor | None): 1x3 eigenvalue row. A built-in default is
            used when None.
    """

    def __init__(self, eig_vec=None, eig_val=None):
        if eig_vec is None:
            eig_vec = torch.Tensor([
                [-0.5675, +0.7192, +0.4009],
                [-0.5808, -0.0045, -0.8140],
                [-0.5836, -0.6948, +0.4203],
            ]).t()
        if eig_val is None:
            eig_val = torch.Tensor([[0.2175, 0.0188, 0.0045]])

        self.eig_vec = eig_vec  # 3*3
        self.eig_val = eig_val  # 1*3

    def pca(self, tensor):
        """Add a random per-channel offset to a 3-channel tensor."""
        assert tensor.size(0) == 3
        # One normally distributed coefficient per eigenvalue, scaled by 0.1.
        coeffs = torch.normal(mean=torch.zeros_like(self.eig_val)) * 0.1
        channel_shift = torch.mm(self.eig_val * coeffs, self.eig_vec)
        # Reshape to (3, 1, 1) so the shift broadcasts over each channel.
        return tensor + channel_shift.view(3, 1, 1)

    def __call__(self, results):
        """Apply :meth:`pca` to ``results['img']`` and store the result."""
        results['img'] = self.pca(results['img'])
        return results

    def __repr__(self):
        return self.__class__.__name__
@PIPELINES.register_module()
class RandomPaddingOCR:
    """Pad the given image on all sides, as well as modify the coordinates of
    character bounding box in image.

    Args:
        max_ratio (list[float]): [left, top, right, bottom]. Upper bound of
            the random padding on each side, as a ratio of the image
            width (left/right) or height (top/bottom). Defaults to
            [0.1, 0.2, 0.1, 0.2] when None.
        box_type (None|str): Character box type. If not none,
            should be either 'char_rects' or 'char_quads', with
            'char_rects' for rectangle with ``xyxy`` style and
            'char_quads' for quadrangle with ``x1y1x2y2x3y3x4y4`` style.
    """

    def __init__(self, max_ratio=None, box_type=None):
        if max_ratio is None:
            max_ratio = [0.1, 0.2, 0.1, 0.2]
        else:
            assert utils.is_type_list(max_ratio, float)
            assert len(max_ratio) == 4
        assert box_type is None or box_type in ('char_rects', 'char_quads')

        self.max_ratio = max_ratio
        self.box_type = box_type

    def __call__(self, results):
        """Pad ``results['img']`` with edge replication and shift the
        character boxes by the left/top padding."""
        ori_height, ori_width = results['img_shape'][:2]

        # Sampled in left, top, right, bottom order; horizontal sides
        # scale with the width, vertical sides with the height.
        side_extents = (ori_width, ori_height, ori_width, ori_height)
        pad_left, pad_top, pad_right, pad_bottom = [
            round(np.random.uniform(0, ratio) * extent)
            for ratio, extent in zip(self.max_ratio, side_extents)
        ]

        padded = mmcv.impad(
            results['img'],
            padding=(pad_left, pad_top, pad_right, pad_bottom),
            padding_mode='edge')

        results['img'] = padded
        results['img_shape'] = padded.shape

        if self.box_type is not None:
            # 'char_rects' boxes hold 2 (x, y) points, 'char_quads' hold 4.
            num_points = 2 if self.box_type == 'char_rects' else 4
            for char_box in results['ann_info'][self.box_type]:
                for point_idx in range(num_points):
                    char_box[point_idx * 2] += pad_left
                    char_box[point_idx * 2 + 1] += pad_top

        return results

    def __repr__(self):
        return self.__class__.__name__
@PIPELINES.register_module()
class RandomRotateImageBox:
    """Rotate augmentation for segmentation based text recognition.

    Args:
        min_angle (int): Minimum rotation angle for image and box.
        max_angle (int): Maximum rotation angle for image and box.
        box_type (str): Character box type, should be either
            'char_rects' or 'char_quads', with 'char_rects'
            for rectangle with ``xyxy`` style and 'char_quads'
            for quadrangle with ``x1y1x2y2x3y3x4y4`` style.
    """

    def __init__(self, min_angle=-10, max_angle=10, box_type='char_quads'):
        assert box_type in ('char_rects', 'char_quads')

        self.min_angle = min_angle
        self.max_angle = max_angle
        self.box_type = box_type

    def __call__(self, results):
        """Rotate the image by a random angle and update the character
        boxes and labels accordingly.

        Reads ``results['img']`` (an object exposing ``.size`` as
        ``(width, height)``) and ``results['ann_info']``; writes the
        rotated image and the surviving chars/boxes back.
        """
        ann_info = results['ann_info']
        src_img = results['img']
        src_chars = ann_info['chars']
        src_boxes = ann_info[self.box_type]

        img_width, img_height = src_img.size
        rotate_center = [img_width / 2., img_height / 2.]

        # Cap the angle range by the angle of the image diagonal
        # (arctan(height / width), in degrees).
        diag_tan = rotate_center[1] / rotate_center[0]
        diag_angle = np.arctan(diag_tan) * 180. / np.pi
        lower = max(self.min_angle, -diag_angle)
        upper = min(self.max_angle, diag_angle)

        random_angle = np.random.uniform(lower, upper)
        random_angle_radian = random_angle * np.pi / 180.

        img_box = shapely_box(0, 0, img_width, img_height)

        # NOTE(review): ``resample`` looks like the legacy torchvision
        # keyword for this call - confirm against the pinned version.
        rotated_img = TF.rotate(
            src_img,
            random_angle,
            resample=False,
            expand=False,
            center=rotate_center)

        kept_boxes, kept_chars = self.rotate_bbox(src_boxes, src_chars,
                                                  random_angle_radian,
                                                  rotate_center, img_box)

        results['img'] = rotated_img
        ann_info['chars'] = kept_chars
        ann_info[self.box_type] = kept_boxes

        return results

    @staticmethod
    def rotate_bbox(boxes, chars, angle, center, img_box):
        """Rotate every box around ``center`` and keep those that remain
        mostly inside the image.

        Args:
            boxes (list): Flat ``[x1, y1, x2, y2, ...]`` coordinate lists.
            chars (list): One label per box, indexed like ``boxes``.
            angle (float): Rotation angle in radians.
            center (list[float]): ``[x, y]`` rotation center.
            img_box: Shapely geometry describing the image area.

        Returns:
            tuple(list, list): Rotated boxes (flat coordinates) and their
            labels, for boxes with at least 70% of their area in the image.
        """
        out_boxes = []
        out_chars = []
        for idx, bbox in enumerate(boxes):
            rotated_pts = [
                RandomRotateImageBox.rotate_point(
                    [bbox[2 * i], bbox[2 * i + 1]], angle, center)
                for i in range(len(bbox) // 2)
            ]
            polygon = Polygon(rotated_pts).buffer(0)
            if not polygon.is_valid:
                continue
            # Skip boxes that miss the image or only touch its border.
            if not img_box.intersects(polygon) or img_box.touches(polygon):
                continue

            polygon_area = polygon.area
            inside_area = img_box.intersection(polygon).area
            inside_ratio = inside_area / polygon_area
            if inside_ratio >= 0.7:
                out_boxes.append(
                    [coord for pt in rotated_pts for coord in pt])
                out_chars.append(chars[idx])

        return out_boxes, out_chars

    @staticmethod
    def rotate_point(point, angle, center):
        """Rotate ``point`` by ``-angle`` radians around ``center``.

        Returns:
            list[float]: The rotated ``[x, y]`` coordinates.
        """
        cos_theta = math.cos(-angle)
        sin_theta = math.sin(-angle)
        c_x, c_y = center[0], center[1]
        dx = point[0] - c_x
        dy = point[1] - c_y
        new_x = dx * cos_theta - dy * sin_theta + c_x
        new_y = dx * sin_theta + dy * cos_theta + c_y

        return [new_x, new_y]
@PIPELINES.register_module()
class OpencvToPil:
    """Convert ``numpy.ndarray`` (bgr) to ``PIL Image`` (rgb)."""

    def __init__(self, **kwargs):
        pass

    def __call__(self, results):
        """Reverse the last axis of ``results['img']`` and wrap the result
        in a PIL image."""
        channels_reversed = results['img'][..., ::-1]
        results['img'] = Image.fromarray(channels_reversed)
        return results

    def __repr__(self):
        return self.__class__.__name__
@PIPELINES.register_module()
class PilToOpencv:
    """Convert ``PIL Image`` (rgb) to ``numpy.ndarray`` (bgr)."""

    def __init__(self, **kwargs):
        pass

    def __call__(self, results):
        """Turn ``results['img']`` into an array with its last axis
        reversed."""
        as_array = np.asarray(results['img'])
        results['img'] = as_array[..., ::-1]
        return results

    def __repr__(self):
        return self.__class__.__name__
Read the Docs v: v0.6.3
Versions
latest
stable
v0.6.3
v0.6.2
v0.6.1
v0.6.0
v0.5.0
v0.4.1
v0.4.0
v0.3.0
v0.2.1
v0.2.0
v0.1.0
dev-1.x
Downloads
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.