
Source code for mmocr.datasets.pipelines.ocr_transforms

# Copyright (c) OpenMMLab. All rights reserved.
import math

import mmcv
import numpy as np
import torch
import torchvision.transforms.functional as TF
from mmcv.runner.dist_utils import get_dist_info
from mmdet.datasets.builder import PIPELINES
from PIL import Image
from shapely.geometry import Polygon
from shapely.geometry import box as shapely_box

import mmocr.utils as utils
from mmocr.datasets.pipelines.crop import warp_img


@PIPELINES.register_module()
class ResizeOCR:
    """Image resizing and padding for OCR.

    Args:
        height (int | tuple(int)): Image height after resizing.
        min_width (None | int | tuple(int)): Image minimum width after
            resizing.
        max_width (None | int | tuple(int)): Image maximum width after
            resizing.
        keep_aspect_ratio (bool): Keep image aspect ratio if True during
            resizing; otherwise resize to the size height * max_width.
        img_pad_value (int): Scalar to fill padding area.
        width_downsample_ratio (float): Downsample ratio in horizontal
            direction from input image to output feature.
        backend (str | None): The image resize backend type. Options are
            `cv2`, `pillow`, `None`. If backend is None, the global
            imread_backend specified by ``mmcv.use_backend()`` will be used.
            Default: None.
    """

    def __init__(self,
                 height,
                 min_width=None,
                 max_width=None,
                 keep_aspect_ratio=True,
                 img_pad_value=0,
                 width_downsample_ratio=1.0 / 16,
                 backend=None):
        assert isinstance(height, (int, tuple))
        assert utils.is_none_or_type(min_width, (int, tuple))
        assert utils.is_none_or_type(max_width, (int, tuple))
        if not keep_aspect_ratio:
            assert max_width is not None, ('"max_width" must be assigned '
                                           'if "keep_aspect_ratio" is False')
        assert isinstance(img_pad_value, int)
        if isinstance(height, tuple):
            assert isinstance(min_width, tuple)
            assert isinstance(max_width, tuple)
            assert len(height) == len(min_width) == len(max_width)

        self.height = height
        self.min_width = min_width
        self.max_width = max_width
        self.keep_aspect_ratio = keep_aspect_ratio
        self.img_pad_value = img_pad_value
        self.width_downsample_ratio = width_downsample_ratio
        self.backend = backend

    def __call__(self, results):
        rank, _ = get_dist_info()
        if isinstance(self.height, int):
            dst_height = self.height
            dst_min_width = self.min_width
            dst_max_width = self.max_width
        else:
            # Multi-scale resize used in distributed training.
            # Choose one (height, width) pair for one rank id.
            idx = rank % len(self.height)
            dst_height = self.height[idx]
            dst_min_width = self.min_width[idx]
            dst_max_width = self.max_width[idx]

        img_shape = results['img_shape']
        ori_height, ori_width = img_shape[:2]
        valid_ratio = 1.0
        resize_shape = list(img_shape)
        pad_shape = list(img_shape)

        if self.keep_aspect_ratio:
            new_width = math.ceil(float(dst_height) / ori_height * ori_width)
            width_divisor = int(1 / self.width_downsample_ratio)
            # Make sure new_width is an integral multiple of width_divisor.
            if new_width % width_divisor != 0:
                new_width = round(new_width / width_divisor) * width_divisor
            if dst_min_width is not None:
                new_width = max(dst_min_width, new_width)
            if dst_max_width is not None:
                valid_ratio = min(1.0, 1.0 * new_width / dst_max_width)
                resize_width = min(dst_max_width, new_width)
                img_resize = mmcv.imresize(
                    results['img'], (resize_width, dst_height),
                    backend=self.backend)
                resize_shape = img_resize.shape
                pad_shape = img_resize.shape
                if new_width < dst_max_width:
                    img_resize = mmcv.impad(
                        img_resize,
                        shape=(dst_height, dst_max_width),
                        pad_val=self.img_pad_value)
                    pad_shape = img_resize.shape
            else:
                img_resize = mmcv.imresize(
                    results['img'], (new_width, dst_height),
                    backend=self.backend)
                resize_shape = img_resize.shape
                pad_shape = img_resize.shape
        else:
            img_resize = mmcv.imresize(
                results['img'], (dst_max_width, dst_height),
                backend=self.backend)
            resize_shape = img_resize.shape
            pad_shape = img_resize.shape

        results['img'] = img_resize
        results['resize_shape'] = resize_shape
        results['pad_shape'] = pad_shape
        results['valid_ratio'] = valid_ratio

        return results

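# Usage sketch (illustrative only, not part of the upstream module): a
# minimal standalone call of ResizeOCR on a dummy array. The
# `_demo_resize_ocr` helper and its sample shapes are assumptions for
# illustration, not mmocr fixtures.
def _demo_resize_ocr():
    # A 32x100 dummy image; 'img_shape' mirrors mmocr's convention of
    # carrying the shape alongside the image in the results dict.
    img = np.zeros((32, 100, 3), dtype=np.uint8)
    results = dict(img=img, img_shape=img.shape)
    resize = ResizeOCR(height=32, min_width=32, max_width=160)
    out = resize(results)
    # Height 32 keeps width 100, snapped to the nearest multiple of 16 -> 96,
    # then right-padded to max_width=160, so valid_ratio = 96 / 160 = 0.6.
    print(out['img'].shape, out['valid_ratio'])  # (32, 160, 3) 0.6
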
@PIPELINES.register_module()
class ToTensorOCR:
    """Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor."""

    def __init__(self):
        pass

    def __call__(self, results):
        results['img'] = TF.to_tensor(results['img'].copy())

        return results

@PIPELINES.register_module()
class NormalizeOCR:
    """Normalize a tensor image with mean and standard deviation."""

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, results):
        results['img'] = TF.normalize(results['img'], self.mean, self.std)
        results['img_norm_cfg'] = dict(mean=self.mean, std=self.std)

        return results

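# Usage sketch (illustrative only, not part of the upstream module):
# ToTensorOCR and NormalizeOCR are meant to run back to back, converting an
# HWC uint8 array into a normalized CHW float tensor. The mean/std values
# below are placeholders, not recommended defaults.
def _demo_to_tensor_normalize():
    img = np.random.randint(0, 256, (32, 100, 3), dtype=np.uint8)
    results = dict(img=img)
    results = ToTensorOCR()(results)  # HWC uint8 [0, 255] -> CHW float [0, 1]
    results = NormalizeOCR(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])(results)
    print(results['img'].shape, results['img_norm_cfg'])
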
@PIPELINES.register_module()
class OnlineCropOCR:
    """Crop text areas from whole image with bounding box jitter. If no bbox
    is given, return directly.

    Args:
        box_keys (list[str]): Keys in results which correspond to RoI bbox.
        jitter_prob (float): The probability of box jitter.
        max_jitter_ratio_x (float): Maximum horizontal jitter ratio
            relative to height.
        max_jitter_ratio_y (float): Maximum vertical jitter ratio
            relative to height.
    """

    def __init__(self,
                 box_keys=['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4'],
                 jitter_prob=0.5,
                 max_jitter_ratio_x=0.05,
                 max_jitter_ratio_y=0.02):
        assert utils.is_type_list(box_keys, str)
        assert 0 <= jitter_prob <= 1
        assert 0 <= max_jitter_ratio_x <= 1
        assert 0 <= max_jitter_ratio_y <= 1

        self.box_keys = box_keys
        self.jitter_prob = jitter_prob
        self.max_jitter_ratio_x = max_jitter_ratio_x
        self.max_jitter_ratio_y = max_jitter_ratio_y

    def __call__(self, results):

        if 'img_info' not in results:
            return results

        crop_flag = True
        box = []
        for key in self.box_keys:
            if key not in results['img_info']:
                crop_flag = False
                break
            box.append(float(results['img_info'][key]))

        if not crop_flag:
            return results

        # Jitter the box with probability ``jitter_prob``, as documented.
        jitter_flag = np.random.random() < self.jitter_prob

        kwargs = dict(
            jitter_flag=jitter_flag,
            jitter_ratio_x=self.max_jitter_ratio_x,
            jitter_ratio_y=self.max_jitter_ratio_y)
        crop_img = warp_img(results['img'], box, **kwargs)

        results['img'] = crop_img
        results['img_shape'] = crop_img.shape

        return results

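# Usage sketch (illustrative only): cropping with the default corner keys.
# The box coordinates below are arbitrary; with jitter_prob=0.0 the crop is
# deterministic.
def _demo_online_crop():
    img = np.zeros((100, 200, 3), dtype=np.uint8)
    # The four corners are stored flat in 'img_info', clockwise from top-left.
    img_info = dict(x1=20, y1=30, x2=80, y2=30, x3=80, y3=50, x4=20, y4=50)
    out = OnlineCropOCR(jitter_prob=0.0)(dict(img=img, img_info=img_info))
    print(out['img_shape'])  # roughly (20, 60, 3), up to warp rounding
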
@PIPELINES.register_module()
class FancyPCA:
    """Implementation of PCA based image augmentation, proposed in the paper
    ``ImageNet Classification with Deep Convolutional Neural Networks``.

    It alters the intensities of RGB values along the principal components of
    the ImageNet dataset.
    """

    def __init__(self, eig_vec=None, eig_val=None):
        if eig_vec is None:
            eig_vec = torch.Tensor([
                [-0.5675, +0.7192, +0.4009],
                [-0.5808, -0.0045, -0.8140],
                [-0.5836, -0.6948, +0.4203],
            ]).t()
        if eig_val is None:
            eig_val = torch.Tensor([[0.2175, 0.0188, 0.0045]])
        self.eig_val = eig_val  # 1*3
        self.eig_vec = eig_vec  # 3*3

    def pca(self, tensor):
        assert tensor.size(0) == 3
        alpha = torch.normal(mean=torch.zeros_like(self.eig_val)) * 0.1
        reconst = torch.mm(self.eig_val * alpha, self.eig_vec)
        tensor = tensor + reconst.view(3, 1, 1)

        return tensor

    def __call__(self, results):
        img = results['img']
        tensor = self.pca(img)
        results['img'] = tensor

        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        return repr_str

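# Usage sketch (illustrative only): FancyPCA operates on a CHW tensor, e.g.
# the output of ToTensorOCR, and adds a small per-channel shift drawn from
# the stored eigenvalues scaled by N(0, 0.1) noise.
def _demo_fancy_pca():
    tensor = torch.rand(3, 32, 100)
    out = FancyPCA()(dict(img=tensor))['img']
    print((out - tensor).abs().max())  # small, typically on the order of 1e-2
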
@PIPELINES.register_module()
class RandomPaddingOCR:
    """Pad the given image on all sides, as well as modify the coordinates of
    the character bounding boxes in the image.

    Args:
        max_ratio (list[float]): [left, top, right, bottom].
        box_type (None|str): Character box type. If not None, should be either
            'char_rects' or 'char_quads', with 'char_rects' for rectangle with
            ``xyxy`` style and 'char_quads' for quadrangle with
            ``x1y1x2y2x3y3x4y4`` style.
    """

    def __init__(self, max_ratio=None, box_type=None):
        if max_ratio is None:
            max_ratio = [0.1, 0.2, 0.1, 0.2]
        else:
            assert utils.is_type_list(max_ratio, float)
            assert len(max_ratio) == 4
        assert box_type is None or box_type in ('char_rects', 'char_quads')

        self.max_ratio = max_ratio
        self.box_type = box_type

    def __call__(self, results):
        img_shape = results['img_shape']
        ori_height, ori_width = img_shape[:2]

        random_padding_left = round(
            np.random.uniform(0, self.max_ratio[0]) * ori_width)
        random_padding_top = round(
            np.random.uniform(0, self.max_ratio[1]) * ori_height)
        random_padding_right = round(
            np.random.uniform(0, self.max_ratio[2]) * ori_width)
        random_padding_bottom = round(
            np.random.uniform(0, self.max_ratio[3]) * ori_height)

        padding = (random_padding_left, random_padding_top,
                   random_padding_right, random_padding_bottom)
        img = mmcv.impad(results['img'], padding=padding, padding_mode='edge')

        results['img'] = img
        results['img_shape'] = img.shape

        if self.box_type is not None:
            num_points = 2 if self.box_type == 'char_rects' else 4
            char_num = len(results['ann_info'][self.box_type])
            for i in range(char_num):
                for j in range(num_points):
                    results['ann_info'][self.box_type][i][
                        j * 2] += random_padding_left
                    results['ann_info'][self.box_type][i][
                        j * 2 + 1] += random_padding_top

        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        return repr_str

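# Usage sketch (illustrative only): padding with box updates. The single
# 'char_rects' box below is arbitrary; both of its points are shifted by the
# sampled left/top padding so they keep pointing at the same pixels.
def _demo_random_padding():
    img = np.zeros((32, 100, 3), dtype=np.uint8)
    results = dict(
        img=img,
        img_shape=img.shape,
        ann_info=dict(char_rects=[[10.0, 5.0, 20.0, 25.0]]))
    out = RandomPaddingOCR(box_type='char_rects')(results)
    print(out['img_shape'], out['ann_info']['char_rects'][0])
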
@PIPELINES.register_module()
class RandomRotateImageBox:
    """Rotate augmentation for segmentation based text recognition.

    Args:
        min_angle (int): Minimum rotation angle for image and box.
        max_angle (int): Maximum rotation angle for image and box.
        box_type (str): Character box type, should be either
            'char_rects' or 'char_quads', with 'char_rects' for rectangle with
            ``xyxy`` style and 'char_quads' for quadrangle with
            ``x1y1x2y2x3y3x4y4`` style.
    """

    def __init__(self, min_angle=-10, max_angle=10, box_type='char_quads'):
        assert box_type in ('char_rects', 'char_quads')

        self.min_angle = min_angle
        self.max_angle = max_angle
        self.box_type = box_type

    def __call__(self, results):
        in_img = results['img']
        in_chars = results['ann_info']['chars']
        in_boxes = results['ann_info'][self.box_type]

        img_width, img_height = in_img.size
        rotate_center = [img_width / 2., img_height / 2.]

        tan_temp_max_angle = rotate_center[1] / rotate_center[0]
        temp_max_angle = np.arctan(tan_temp_max_angle) * 180. / np.pi

        random_angle = np.random.uniform(
            max(self.min_angle, -temp_max_angle),
            min(self.max_angle, temp_max_angle))
        random_angle_radian = random_angle * np.pi / 180.

        img_box = shapely_box(0, 0, img_width, img_height)

        out_img = TF.rotate(
            in_img,
            random_angle,
            resample=False,
            expand=False,
            center=rotate_center)

        out_boxes, out_chars = self.rotate_bbox(in_boxes, in_chars,
                                                random_angle_radian,
                                                rotate_center, img_box)

        results['img'] = out_img
        results['ann_info']['chars'] = out_chars
        results['ann_info'][self.box_type] = out_boxes

        return results

    @staticmethod
    def rotate_bbox(boxes, chars, angle, center, img_box):
        out_boxes = []
        out_chars = []
        for idx, bbox in enumerate(boxes):
            temp_bbox = []
            for i in range(len(bbox) // 2):
                point = [bbox[2 * i], bbox[2 * i + 1]]
                temp_bbox.append(
                    RandomRotateImageBox.rotate_point(point, angle, center))
            poly_temp_bbox = Polygon(temp_bbox).buffer(0)
            if poly_temp_bbox.is_valid:
                if img_box.intersects(poly_temp_bbox) and (
                        not img_box.touches(poly_temp_bbox)):
                    temp_bbox_area = poly_temp_bbox.area

                    intersect_area = img_box.intersection(poly_temp_bbox).area
                    intersect_ratio = intersect_area / temp_bbox_area

                    if intersect_ratio >= 0.7:
                        out_box = []
                        for p in temp_bbox:
                            out_box.extend(p)
                        out_boxes.append(out_box)
                        out_chars.append(chars[idx])

        return out_boxes, out_chars

    @staticmethod
    def rotate_point(point, angle, center):
        cos_theta = math.cos(-angle)
        sin_theta = math.sin(-angle)
        c_x = center[0]
        c_y = center[1]
        new_x = (point[0] - c_x) * cos_theta - (point[1] -
                                                c_y) * sin_theta + c_x
        new_y = (point[0] - c_x) * sin_theta + (point[1] -
                                                c_y) * cos_theta + c_y

        return [new_x, new_y]

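# Usage sketch (illustrative only): RandomRotateImageBox expects a PIL image,
# e.g. after OpencvToPil, plus quadrangle character boxes. This assumes a
# torchvision version that still accepts TF.rotate's ``resample`` argument.
def _demo_random_rotate():
    img = Image.new('RGB', (100, 32))
    ann_info = dict(chars=['a'], char_quads=[[10, 5, 30, 5, 30, 25, 10, 25]])
    out = RandomRotateImageBox(min_angle=-5, max_angle=5)(
        dict(img=img, ann_info=ann_info))
    # For such small angles the quad stays inside the image, so it is kept.
    print(len(out['ann_info']['chars']), out['ann_info']['char_quads'])
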
@PIPELINES.register_module()
class OpencvToPil:
    """Convert ``numpy.ndarray`` (bgr) to ``PIL Image`` (rgb)."""

    def __init__(self, **kwargs):
        pass

    def __call__(self, results):
        img = results['img'][..., ::-1]
        img = Image.fromarray(img)
        results['img'] = img

        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        return repr_str

@PIPELINES.register_module()
class PilToOpencv:
    """Convert ``PIL Image`` (rgb) to ``numpy.ndarray`` (bgr)."""

    def __init__(self, **kwargs):
        pass

    def __call__(self, results):
        img = np.asarray(results['img'])
        img = img[..., ::-1]
        results['img'] = img

        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        return repr_str

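# Usage sketch (illustrative only): the two converters are inverses, so a BGR
# array survives an OpencvToPil -> PilToOpencv round trip unchanged.
def _demo_channel_order_round_trip():
    bgr = np.random.randint(0, 256, (4, 4, 3), dtype=np.uint8)
    rgb_pil = OpencvToPil()(dict(img=bgr))['img']
    back = PilToOpencv()(dict(img=rgb_pil))['img']
    print(np.array_equal(back, bgr))  # True: the channel flip is lossless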