Shortcuts

mmocr.models.textdet.module_losses.drrg_module_loss 源代码

# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Sequence, Tuple

import cv2
import numpy as np
import torch

try:
    from lanms import merge_quadrangle_n9 as la_nms
except ImportError:
    la_nms = None
from mmcv.image import imrescale
from mmdet.models.utils import multi_apply
from numpy import ndarray
from numpy.linalg import norm
from torch import Tensor

from mmocr.registry import MODELS
from mmocr.structures import TextDetDataSample
from mmocr.utils import check_argument
from .textsnake_module_loss import TextSnakeModuleLoss


[文档]@MODELS.register_module() class DRRGModuleLoss(TextSnakeModuleLoss): """The class for implementing DRRG loss. This is partially adapted from https://github.com/GXYM/DRRG licensed under the MIT license. DRRG: `Deep Relational Reasoning Graph Network for Arbitrary Shape Text Detection <https://arxiv.org/abs/1908.05900>`_. Args: ohem_ratio (float): The negative/positive ratio in ohem. Defaults to 3.0. downsample_ratio (float): Downsample ratio. Defaults to 1.0. TODO: remove it. orientation_thr (float): The threshold for distinguishing between head edge and tail edge among the horizontal and vertical edges of a quadrangle. Defaults to 2.0. resample_step (float): The step size for resampling the text center line. Defaults to 8.0. num_min_comps (int): The minimum number of text components, which should be larger than k_hop1 mentioned in paper. Defaults to 9. num_max_comps (int): The maximum number of text components. Defaults to 600. min_width (float): The minimum width of text components. Defaults to 8.0. max_width (float): The maximum width of text components. Defaults to 24.0. center_region_shrink_ratio (float): The shrink ratio of text center regions. Defaults to 0.3. comp_shrink_ratio (float): The shrink ratio of text components. Defaults to 1.0. comp_w_h_ratio (float): The width to height ratio of text components. Defaults to 0.3. min_rand_half_height(float): The minimum half-height of random text components. Defaults to 8.0. max_rand_half_height (float): The maximum half-height of random text components. Defaults to 24.0. jitter_level (float): The jitter level of text component geometric features. Defaults to 0.2. loss_text (dict): The loss config used to calculate the text loss. Defaults to ``dict(type='MaskedBalancedBCEWithLogitsLoss', fallback_negative_num=100, eps=1e-5)``. loss_center (dict): The loss config used to calculate the center loss. Defaults to ``dict(type='MaskedBCEWithLogitsLoss')``. loss_top (dict): The loss config used to calculate the top loss, which is a part of the height loss. Defaults to ``dict(type='SmoothL1Loss', reduction='none')``. loss_btm (dict): The loss config used to calculate the bottom loss, which is a part of the height loss. Defaults to ``dict(type='SmoothL1Loss', reduction='none')``. loss_sin (dict): The loss config used to calculate the sin loss. Defaults to ``dict(type='MaskedSmoothL1Loss')``. loss_cos (dict): The loss config used to calculate the cos loss. Defaults to ``dict(type='MaskedSmoothL1Loss')``. loss_gcn (dict): The loss config used to calculate the GCN loss. Defaults to ``dict(type='CrossEntropyLoss')``. """ def __init__( self, ohem_ratio: float = 3.0, downsample_ratio: float = 1.0, orientation_thr: float = 2.0, resample_step: float = 8.0, num_min_comps: int = 9, num_max_comps: int = 600, min_width: float = 8.0, max_width: float = 24.0, center_region_shrink_ratio: float = 0.3, comp_shrink_ratio: float = 1.0, comp_w_h_ratio: float = 0.3, text_comp_nms_thr: float = 0.25, min_rand_half_height: float = 8.0, max_rand_half_height: float = 24.0, jitter_level: float = 0.2, loss_text: Dict = dict( type='MaskedBalancedBCEWithLogitsLoss', fallback_negative_num=100, eps=1e-5), loss_center: Dict = dict(type='MaskedBCEWithLogitsLoss'), loss_top: Dict = dict(type='SmoothL1Loss', reduction='none'), loss_btm: Dict = dict(type='SmoothL1Loss', reduction='none'), loss_sin: Dict = dict(type='MaskedSmoothL1Loss'), loss_cos: Dict = dict(type='MaskedSmoothL1Loss'), loss_gcn: Dict = dict(type='CrossEntropyLoss') ) -> None: super().__init__() self.ohem_ratio = ohem_ratio self.downsample_ratio = downsample_ratio self.orientation_thr = orientation_thr self.resample_step = resample_step self.num_max_comps = num_max_comps self.num_min_comps = num_min_comps self.min_width = min_width self.max_width = max_width self.center_region_shrink_ratio = center_region_shrink_ratio self.comp_shrink_ratio = comp_shrink_ratio self.comp_w_h_ratio = comp_w_h_ratio self.text_comp_nms_thr = text_comp_nms_thr self.min_rand_half_height = min_rand_half_height self.max_rand_half_height = max_rand_half_height self.jitter_level = jitter_level self.loss_text = MODELS.build(loss_text) self.loss_center = MODELS.build(loss_center) self.loss_top = MODELS.build(loss_top) self.loss_btm = MODELS.build(loss_btm) self.loss_sin = MODELS.build(loss_sin) self.loss_cos = MODELS.build(loss_cos) self.loss_gcn = MODELS.build(loss_gcn)
[文档] def forward(self, preds: Tuple[Tensor, Tensor, Tensor], data_samples: Sequence[TextDetDataSample]) -> Dict: """Compute Drrg loss. Args: preds (tuple): The prediction tuple(pred_maps, gcn_pred, gt_labels), each of shape :math:`(N, 6, H, W)`, :math:`(N, 2)` and :math:`(m ,n)`, where :math:`m * n = N`. data_samples (list[TextDetDataSample]): The data samples. Returns: dict: A loss dict with ``loss_text``, ``loss_center``, ``loss_height``, ``loss_sin``, ``loss_cos``, and ``loss_gcn``. """ assert isinstance(preds, tuple) (gt_text_masks, gt_center_region_masks, gt_masks, gt_top_height_maps, gt_bot_height_maps, gt_sin_maps, gt_cos_maps, _) = self.get_targets(data_samples) pred_maps, gcn_pred, gt_labels = preds pred_text_region = pred_maps[:, 0, :, :] pred_center_region = pred_maps[:, 1, :, :] pred_sin_map = pred_maps[:, 2, :, :] pred_cos_map = pred_maps[:, 3, :, :] pred_top_height_map = pred_maps[:, 4, :, :] pred_bot_height_map = pred_maps[:, 5, :, :] feature_sz = pred_maps.size() device = pred_maps.device # bitmask 2 tensor mapping = { 'gt_text_masks': gt_text_masks, 'gt_center_region_masks': gt_center_region_masks, 'gt_masks': gt_masks, 'gt_top_height_maps': gt_top_height_maps, 'gt_bot_height_maps': gt_bot_height_maps, 'gt_sin_maps': gt_sin_maps, 'gt_cos_maps': gt_cos_maps } gt = {} for key, value in mapping.items(): gt[key] = value if abs(self.downsample_ratio - 1.0) < 1e-2: gt[key] = self._batch_pad(gt[key], feature_sz[2:]) else: gt[key] = [ imrescale( mask, scale=self.downsample_ratio, interpolation='nearest') for mask in gt[key] ] gt[key] = self._batch_pad(gt[key], feature_sz[2:]) if key in ['gt_top_height_maps', 'gt_bot_height_maps']: gt[key] *= self.downsample_ratio gt[key] = torch.from_numpy(gt[key]).float().to(device) scale = torch.sqrt(1.0 / (pred_sin_map**2 + pred_cos_map**2 + 1e-8)) pred_sin_map = pred_sin_map * scale pred_cos_map = pred_cos_map * scale loss_text = self.loss_text(pred_text_region, gt['gt_text_masks'], gt['gt_masks']) text_mask = (gt['gt_text_masks'] * gt['gt_masks']).float() negative_text_mask = ((1 - gt['gt_text_masks']) * gt['gt_masks']).float() loss_center_positive = self.loss_center(pred_center_region, gt['gt_center_region_masks'], text_mask) loss_center_negative = self.loss_center(pred_center_region, gt['gt_center_region_masks'], negative_text_mask) loss_center = loss_center_positive + 0.5 * loss_center_negative center_mask = (gt['gt_center_region_masks'] * gt['gt_masks']).float() map_sz = pred_top_height_map.size() ones = torch.ones(map_sz, dtype=torch.float, device=device) loss_top = self.loss_top( pred_top_height_map / (gt['gt_top_height_maps'] + 1e-2), ones) loss_btm = self.loss_btm( pred_bot_height_map / (gt['gt_bot_height_maps'] + 1e-2), ones) gt_height = gt['gt_top_height_maps'] + gt['gt_bot_height_maps'] loss_height = torch.sum((torch.log(gt_height + 1) * (loss_top + loss_btm)) * center_mask) / ( torch.sum(center_mask) + 1e-6) loss_sin = self.loss_sin(pred_sin_map, gt['gt_sin_maps'], center_mask) loss_cos = self.loss_cos(pred_cos_map, gt['gt_cos_maps'], center_mask) loss_gcn = self.loss_gcn(gcn_pred, gt_labels.view(-1).to(gcn_pred.device)) results = dict( loss_text=loss_text, loss_center=loss_center, loss_height=loss_height, loss_sin=loss_sin, loss_cos=loss_cos, loss_gcn=loss_gcn) return results
[文档] def get_targets(self, data_samples: List[TextDetDataSample]) -> Tuple: """Generate loss targets from data samples. Args: data_samples (list(TextDetDataSample)): Ground truth data samples. Returns: tuple: A tuple of 8 lists of tensors as DRRG targets. Read docstring of ``_get_target_single`` for more details. """ # If data_samples points to same object as self.cached_data_samples, it # means that get_targets is called more than once in the same train # iteration, and pre-computed targets can be reused. if hasattr(self, 'targets') and \ self.cache_data_samples is data_samples: return self.targets self.cache_data_samples = data_samples self.targets = multi_apply(self._get_target_single, data_samples) return self.targets
def _get_target_single(self, data_sample: TextDetDataSample) -> Tuple: """Generate loss target from a data sample. Args: data_sample (TextDetDataSample): The data sample. Returns: tuple: A tuple of 8 tensors as DRRG targets. - gt_text_mask (ndarray): The text region mask. - gt_center_region_mask (ndarray): The text center region mask. - gt_mask (ndarray): The effective mask. - gt_top_height_map (ndarray): The map on which the distance from points to top side lines will be drawn for each pixel in text center regions. - gt_bot_height_map (ndarray): The map on which the distance from points to bottom side lines will be drawn for each pixel in text center regions. - gt_sin_map (ndarray): The sin(theta) map where theta is the angle between vector (top point - bottom point) and vector (1, 0). - gt_cos_map (ndarray): The cos(theta) map where theta is the angle between vector (top point - bottom point) and vector (1, 0). - gt_comp_attribs (ndarray): The padded text component attributes of a fixed size. Shape: (num_component, 8). """ gt_instances = data_sample.gt_instances ignore_flags = gt_instances.ignored polygons = gt_instances[~ignore_flags].polygons ignored_polygons = gt_instances[ignore_flags].polygons h, w = data_sample.img_shape gt_text_mask = self._generate_text_region_mask((h, w), polygons) gt_mask = self._generate_effective_mask((h, w), ignored_polygons) (center_lines, gt_center_region_mask, gt_top_height_map, gt_bot_height_map, gt_sin_map, gt_cos_map) = self._generate_center_mask_attrib_maps((h, w), polygons) gt_comp_attribs = self._generate_comp_attribs(center_lines, gt_text_mask, gt_center_region_mask, gt_top_height_map, gt_bot_height_map, gt_sin_map, gt_cos_map) return (gt_text_mask, gt_center_region_mask, gt_mask, gt_top_height_map, gt_bot_height_map, gt_sin_map, gt_cos_map, gt_comp_attribs) def _generate_center_mask_attrib_maps(self, img_size: Tuple[int, int], text_polys: List[ndarray]) -> Tuple: """Generate text center region masks and geometric attribute maps. Args: img_size (tuple(int, int)): The image size (height, width). text_polys (list[ndarray]): The list of text polygons. Returns: tuple(center_lines, center_region_mask, top_height_map, bot_height_map,sin_map, cos_map): center_lines (list[ndarray]): The list of text center lines. center_region_mask (ndarray): The text center region mask. top_height_map (ndarray): The map on which the distance from points to top side lines will be drawn for each pixel in text center regions. bot_height_map (ndarray): The map on which the distance from points to bottom side lines will be drawn for each pixel in text center regions. sin_map (ndarray): The sin(theta) map where theta is the angle between vector (top point - bottom point) and vector (1, 0). cos_map (ndarray): The cos(theta) map where theta is the angle between vector (top point - bottom point) and vector (1, 0). """ assert isinstance(img_size, tuple) assert check_argument.is_type_list(text_polys, ndarray) h, w = img_size center_lines = [] center_region_mask = np.zeros((h, w), np.uint8) top_height_map = np.zeros((h, w), dtype=np.float32) bot_height_map = np.zeros((h, w), dtype=np.float32) sin_map = np.zeros((h, w), dtype=np.float32) cos_map = np.zeros((h, w), dtype=np.float32) for poly in text_polys: polygon_points = poly.reshape(-1, 2) _, _, top_line, bot_line = self._reorder_poly_edge(polygon_points) resampled_top_line, resampled_bot_line = self._resample_sidelines( top_line, bot_line, self.resample_step) resampled_bot_line = resampled_bot_line[::-1] center_line = (resampled_top_line + resampled_bot_line) / 2 if self.vector_slope(center_line[-1] - center_line[0]) > 2: if (center_line[-1] - center_line[0])[1] < 0: center_line = center_line[::-1] resampled_top_line = resampled_top_line[::-1] resampled_bot_line = resampled_bot_line[::-1] else: if (center_line[-1] - center_line[0])[0] < 0: center_line = center_line[::-1] resampled_top_line = resampled_top_line[::-1] resampled_bot_line = resampled_bot_line[::-1] line_head_shrink_len = np.clip( (norm(top_line[0] - bot_line[0]) * self.comp_w_h_ratio), self.min_width, self.max_width) / 2 line_tail_shrink_len = np.clip( (norm(top_line[-1] - bot_line[-1]) * self.comp_w_h_ratio), self.min_width, self.max_width) / 2 num_head_shrink = int(line_head_shrink_len // self.resample_step) num_tail_shrink = int(line_tail_shrink_len // self.resample_step) if len(center_line) > num_head_shrink + num_tail_shrink + 2: center_line = center_line[num_head_shrink:len(center_line) - num_tail_shrink] resampled_top_line = resampled_top_line[ num_head_shrink:len(resampled_top_line) - num_tail_shrink] resampled_bot_line = resampled_bot_line[ num_head_shrink:len(resampled_bot_line) - num_tail_shrink] center_lines.append(center_line.astype(np.int32)) self._draw_center_region_maps(resampled_top_line, resampled_bot_line, center_line, center_region_mask, top_height_map, bot_height_map, sin_map, cos_map, self.center_region_shrink_ratio) return (center_lines, center_region_mask, top_height_map, bot_height_map, sin_map, cos_map) def _generate_comp_attribs(self, center_lines: List[ndarray], text_mask: ndarray, center_region_mask: ndarray, top_height_map: ndarray, bot_height_map: ndarray, sin_map: ndarray, cos_map: ndarray) -> ndarray: """Generate text component attributes. Args: center_lines (list[ndarray]): The list of text center lines . text_mask (ndarray): The text region mask. center_region_mask (ndarray): The text center region mask. top_height_map (ndarray): The map on which the distance from points to top side lines will be drawn for each pixel in text center regions. bot_height_map (ndarray): The map on which the distance from points to bottom side lines will be drawn for each pixel in text center regions. sin_map (ndarray): The sin(theta) map where theta is the angle between vector (top point - bottom point) and vector (1, 0). cos_map (ndarray): The cos(theta) map where theta is the angle between vector (top point - bottom point) and vector (1, 0). Returns: ndarray: The padded text component attributes of a fixed size. """ assert isinstance(center_lines, list) assert (text_mask.shape == center_region_mask.shape == top_height_map.shape == bot_height_map.shape == sin_map.shape == cos_map.shape) center_lines_mask = np.zeros_like(center_region_mask) cv2.polylines(center_lines_mask, center_lines, 0, 1, 1) center_lines_mask = center_lines_mask * center_region_mask comp_centers = np.argwhere(center_lines_mask > 0) y = comp_centers[:, 0] x = comp_centers[:, 1] top_height = top_height_map[y, x].reshape( (-1, 1)) * self.comp_shrink_ratio bot_height = bot_height_map[y, x].reshape( (-1, 1)) * self.comp_shrink_ratio sin = sin_map[y, x].reshape((-1, 1)) cos = cos_map[y, x].reshape((-1, 1)) top_mid_points = comp_centers + np.hstack( [top_height * sin, top_height * cos]) bot_mid_points = comp_centers - np.hstack( [bot_height * sin, bot_height * cos]) width = (top_height + bot_height) * self.comp_w_h_ratio width = np.clip(width, self.min_width, self.max_width) r = width / 2 tl = top_mid_points[:, ::-1] - np.hstack([-r * sin, r * cos]) tr = top_mid_points[:, ::-1] + np.hstack([-r * sin, r * cos]) br = bot_mid_points[:, ::-1] + np.hstack([-r * sin, r * cos]) bl = bot_mid_points[:, ::-1] - np.hstack([-r * sin, r * cos]) text_comps = np.hstack([tl, tr, br, bl]).astype(np.float32) score = np.ones((text_comps.shape[0], 1), dtype=np.float32) text_comps = np.hstack([text_comps, score]) if la_nms is None: raise ImportError('lanms-neo is not installed, ' 'please run "pip install lanms-neo==1.0.2".') text_comps = la_nms(text_comps, self.text_comp_nms_thr) if text_comps.shape[0] >= 1: img_h, img_w = center_region_mask.shape text_comps[:, 0:8:2] = np.clip(text_comps[:, 0:8:2], 0, img_w - 1) text_comps[:, 1:8:2] = np.clip(text_comps[:, 1:8:2], 0, img_h - 1) comp_centers = np.mean( text_comps[:, 0:8].reshape((-1, 4, 2)), axis=1).astype(np.int32) x = comp_centers[:, 0] y = comp_centers[:, 1] height = (top_height_map[y, x] + bot_height_map[y, x]).reshape( (-1, 1)) width = np.clip(height * self.comp_w_h_ratio, self.min_width, self.max_width) cos = cos_map[y, x].reshape((-1, 1)) sin = sin_map[y, x].reshape((-1, 1)) _, comp_label_mask = cv2.connectedComponents( center_region_mask, connectivity=8) comp_labels = comp_label_mask[y, x].reshape( (-1, 1)).astype(np.float32) x = x.reshape((-1, 1)).astype(np.float32) y = y.reshape((-1, 1)).astype(np.float32) comp_attribs = np.hstack( [x, y, height, width, cos, sin, comp_labels]) comp_attribs = self._jitter_comp_attribs(comp_attribs, self.jitter_level) if comp_attribs.shape[0] < self.num_min_comps: num_rand_comps = self.num_min_comps - comp_attribs.shape[0] rand_comp_attribs = self._generate_rand_comp_attribs( num_rand_comps, 1 - text_mask) comp_attribs = np.vstack([comp_attribs, rand_comp_attribs]) else: comp_attribs = self._generate_rand_comp_attribs( self.num_min_comps, 1 - text_mask) num_comps = ( np.ones((comp_attribs.shape[0], 1), dtype=np.float32) * comp_attribs.shape[0]) comp_attribs = np.hstack([num_comps, comp_attribs]) if comp_attribs.shape[0] > self.num_max_comps: comp_attribs = comp_attribs[:self.num_max_comps, :] comp_attribs[:, 0] = self.num_max_comps pad_comp_attribs = np.zeros( (self.num_max_comps, comp_attribs.shape[1]), dtype=np.float32) pad_comp_attribs[:comp_attribs.shape[0], :] = comp_attribs return pad_comp_attribs def _generate_rand_comp_attribs(self, num_rand_comps: int, center_sample_mask: ndarray) -> ndarray: """Generate random text components and their attributes to ensure the the number of text components in an image is larger than k_hop1, which is the number of one hop neighbors in KNN graph. Args: num_rand_comps (int): The number of random text components. center_sample_mask (ndarray): The region mask for sampling text component centers . Returns: ndarray: The random text component attributes (x, y, h, w, cos, sin, comp_label=0). """ assert isinstance(num_rand_comps, int) assert num_rand_comps > 0 assert center_sample_mask.ndim == 2 h, w = center_sample_mask.shape max_rand_half_height = self.max_rand_half_height min_rand_half_height = self.min_rand_half_height max_rand_height = max_rand_half_height * 2 max_rand_width = np.clip(max_rand_height * self.comp_w_h_ratio, self.min_width, self.max_width) margin = int( np.sqrt((max_rand_height / 2)**2 + (max_rand_width / 2)**2)) + 1 if 2 * margin + 1 > min(h, w): assert min(h, w) > (np.sqrt(2) * (self.min_width + 1)) max_rand_half_height = max(min(h, w) / 4, self.min_width / 2 + 1) min_rand_half_height = max(max_rand_half_height / 4, self.min_width / 2) max_rand_height = max_rand_half_height * 2 max_rand_width = np.clip(max_rand_height * self.comp_w_h_ratio, self.min_width, self.max_width) margin = int( np.sqrt((max_rand_height / 2)**2 + (max_rand_width / 2)**2)) + 1 inner_center_sample_mask = np.zeros_like(center_sample_mask) inner_center_sample_mask[margin:h - margin, margin:w - margin] = \ center_sample_mask[margin:h - margin, margin:w - margin] kernel_size = int(np.clip(max_rand_half_height, 7, 21)) inner_center_sample_mask = cv2.erode( inner_center_sample_mask, np.ones((kernel_size, kernel_size), np.uint8)) center_candidates = np.argwhere(inner_center_sample_mask > 0) num_center_candidates = len(center_candidates) sample_inds = np.random.choice(num_center_candidates, num_rand_comps) rand_centers = center_candidates[sample_inds] rand_top_height = np.random.randint( min_rand_half_height, max_rand_half_height, size=(len(rand_centers), 1)) rand_bot_height = np.random.randint( min_rand_half_height, max_rand_half_height, size=(len(rand_centers), 1)) rand_cos = 2 * np.random.random(size=(len(rand_centers), 1)) - 1 rand_sin = 2 * np.random.random(size=(len(rand_centers), 1)) - 1 scale = np.sqrt(1.0 / (rand_cos**2 + rand_sin**2 + 1e-8)) rand_cos = rand_cos * scale rand_sin = rand_sin * scale height = (rand_top_height + rand_bot_height) width = np.clip(height * self.comp_w_h_ratio, self.min_width, self.max_width) rand_comp_attribs = np.hstack([ rand_centers[:, ::-1], height, width, rand_cos, rand_sin, np.zeros_like(rand_sin) ]).astype(np.float32) return rand_comp_attribs def _jitter_comp_attribs(self, comp_attribs: ndarray, jitter_level: float) -> ndarray: """Jitter text components attributes. Args: comp_attribs (ndarray): The text component attributes. jitter_level (float): The jitter level of text components attributes. Returns: ndarray: The jittered text component attributes (x, y, h, w, cos, sin, comp_label). """ assert comp_attribs.shape[1] == 7 assert comp_attribs.shape[0] > 0 assert isinstance(jitter_level, float) x = comp_attribs[:, 0].reshape((-1, 1)) y = comp_attribs[:, 1].reshape((-1, 1)) h = comp_attribs[:, 2].reshape((-1, 1)) w = comp_attribs[:, 3].reshape((-1, 1)) cos = comp_attribs[:, 4].reshape((-1, 1)) sin = comp_attribs[:, 5].reshape((-1, 1)) comp_labels = comp_attribs[:, 6].reshape((-1, 1)) x += (np.random.random(size=(len(comp_attribs), 1)) - 0.5) * (h * np.abs(cos) + w * np.abs(sin)) * jitter_level y += (np.random.random(size=(len(comp_attribs), 1)) - 0.5) * (h * np.abs(sin) + w * np.abs(cos)) * jitter_level h += (np.random.random(size=(len(comp_attribs), 1)) - 0.5) * h * jitter_level w += (np.random.random(size=(len(comp_attribs), 1)) - 0.5) * w * jitter_level cos += (np.random.random(size=(len(comp_attribs), 1)) - 0.5) * 2 * jitter_level sin += (np.random.random(size=(len(comp_attribs), 1)) - 0.5) * 2 * jitter_level scale = np.sqrt(1.0 / (cos**2 + sin**2 + 1e-8)) cos = cos * scale sin = sin * scale jittered_comp_attribs = np.hstack([x, y, h, w, cos, sin, comp_labels]) return jittered_comp_attribs def _draw_center_region_maps(self, top_line: ndarray, bot_line: ndarray, center_line: ndarray, center_region_mask: ndarray, top_height_map: ndarray, bot_height_map: ndarray, sin_map: ndarray, cos_map: ndarray, region_shrink_ratio: float) -> None: """Draw attributes of text components on text center regions. Args: top_line (ndarray): The points composing the top side lines of text polygons. bot_line (ndarray): The points composing bottom side lines of text polygons. center_line (ndarray): The points composing the center lines of text instances. center_region_mask (ndarray): The text center region mask. top_height_map (ndarray): The map on which the distance from points to top side lines will be drawn for each pixel in text center regions. bot_height_map (ndarray): The map on which the distance from points to bottom side lines will be drawn for each pixel in text center regions. sin_map (ndarray): The map of vector_sin(top_point - bot_point) that will be drawn on text center regions. cos_map (ndarray): The map of vector_cos(top_point - bot_point) will be drawn on text center regions. region_shrink_ratio (float): The shrink ratio of text center regions. """ assert top_line.shape == bot_line.shape == center_line.shape assert (center_region_mask.shape == top_height_map.shape == bot_height_map.shape == sin_map.shape == cos_map.shape) assert isinstance(region_shrink_ratio, float) h, w = center_region_mask.shape for i in range(0, len(center_line) - 1): top_mid_point = (top_line[i] + top_line[i + 1]) / 2 bot_mid_point = (bot_line[i] + bot_line[i + 1]) / 2 sin_theta = self.vector_sin(top_mid_point - bot_mid_point) cos_theta = self.vector_cos(top_mid_point - bot_mid_point) tl = center_line[i] + (top_line[i] - center_line[i]) * region_shrink_ratio tr = center_line[i + 1] + ( top_line[i + 1] - center_line[i + 1]) * region_shrink_ratio br = center_line[i + 1] + ( bot_line[i + 1] - center_line[i + 1]) * region_shrink_ratio bl = center_line[i] + (bot_line[i] - center_line[i]) * region_shrink_ratio current_center_box = np.vstack([tl, tr, br, bl]).astype(np.int32) cv2.fillPoly(center_region_mask, [current_center_box], color=1) cv2.fillPoly(sin_map, [current_center_box], color=sin_theta) cv2.fillPoly(cos_map, [current_center_box], color=cos_theta) current_center_box[:, 0] = np.clip(current_center_box[:, 0], 0, w - 1) current_center_box[:, 1] = np.clip(current_center_box[:, 1], 0, h - 1) min_coord = np.min(current_center_box, axis=0).astype(np.int32) max_coord = np.max(current_center_box, axis=0).astype(np.int32) current_center_box = current_center_box - min_coord box_sz = (max_coord - min_coord + 1) center_box_mask = np.zeros((box_sz[1], box_sz[0]), dtype=np.uint8) cv2.fillPoly(center_box_mask, [current_center_box], color=1) inds = np.argwhere(center_box_mask > 0) inds = inds + (min_coord[1], min_coord[0]) inds_xy = np.fliplr(inds) top_height_map[(inds[:, 0], inds[:, 1])] = self._dist_point2line( inds_xy, (top_line[i], top_line[i + 1])) bot_height_map[(inds[:, 0], inds[:, 1])] = self._dist_point2line( inds_xy, (bot_line[i], bot_line[i + 1])) def _dist_point2line(self, point: ndarray, line: Tuple[ndarray, ndarray]) -> ndarray: """Calculate the distance from points to a line. TODO: Check its mergibility with the one in mmocr.utils.point_utils. """ assert isinstance(line, tuple) point1, point2 = line d = abs(np.cross(point2 - point1, point - point1)) / ( norm(point2 - point1) + 1e-8) return d
Read the Docs v: latest
Versions
latest
stable
0.x
dev-1.x
Downloads
pdf
html
epub
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.