Shortcuts

Source code for mmocr.models.textrecog.encoders.nrtr_encoder

# Copyright (c) OpenMMLab. All rights reserved.
import math
from typing import Dict, Optional, Sequence, Union

import torch
import torch.nn as nn
from mmengine.model import ModuleList

from mmocr.models.common import TFEncoderLayer
from mmocr.registry import MODELS
from mmocr.structures import TextRecogDataSample
from .base import BaseEncoder


@MODELS.register_module()
class NRTREncoder(BaseEncoder):
    """Transformer Encoder block with self attention mechanism.

    The backbone feature map is flattened into a sequence, passed through a
    stack of ``TFEncoderLayer`` modules and finally layer-normalized.

    Args:
        n_layers (int): The number of sub-encoder-layers in the encoder.
            Defaults to 6.
        n_head (int): The number of heads in the multiheadattention models.
            Defaults to 8.
        d_k (int): Total number of features in key. Defaults to 64.
        d_v (int): Total number of features in value. Defaults to 64.
        d_model (int): The number of expected features in the encoder
            inputs. Defaults to 512.
        d_inner (int): The dimension of the feedforward network model.
            Defaults to 256.
        dropout (float): Dropout rate for MHSA and FFN. Defaults to 0.1.
        init_cfg (dict or list[dict], optional): Initialization configs.
    """

    def __init__(self,
                 n_layers: int = 6,
                 n_head: int = 8,
                 d_k: int = 64,
                 d_v: int = 64,
                 d_model: int = 512,
                 d_inner: int = 256,
                 dropout: float = 0.1,
                 init_cfg: Optional[Union[Dict, Sequence[Dict]]] = None
                 ) -> None:
        super().__init__(init_cfg=init_cfg)
        self.d_model = d_model
        self.layer_stack = ModuleList([
            TFEncoderLayer(
                d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)
        ])
        self.layer_norm = nn.LayerNorm(d_model)

    def _get_source_mask(
            self, src_seq: torch.Tensor,
            valid_ratios: Sequence[float]) -> Optional[torch.Tensor]:
        """Generate mask for source sequence.

        Args:
            src_seq (torch.Tensor): Image sequence. Shape :math:`(N, T, C)`.
            valid_ratios (list[float]): The valid ratio of input image. For
                example, if the width of the original image is w1 and the
                width after pad is w2, then valid_ratio = w1/w2. Source mask
                is used to cover the area of the pad region.

        Returns:
            Tensor or None: Source mask of shape :math:`(N, T)`, or ``None``
            when ``valid_ratios`` is empty. Positions in the pad region are
            0 and the rest are 1; the mask has the same dtype and device as
            ``src_seq``.
        """
        N, T, _ = src_seq.size()
        if len(valid_ratios) == 0:
            return None

        # ``new_zeros`` already inherits dtype and device from ``src_seq``.
        mask = src_seq.new_zeros((N, T))
        for i, valid_ratio in enumerate(valid_ratios):
            # Round up so a partially valid position is kept, but never
            # exceed the sequence length.
            valid_width = min(T, math.ceil(T * valid_ratio))
            mask[i, :valid_width] = 1
        return mask

    def forward(self,
                feat: torch.Tensor,
                data_samples: Optional[Sequence[TextRecogDataSample]] = None
                ) -> torch.Tensor:
        """
        Args:
            feat (Tensor): Backbone output of shape :math:`(N, C, H, W)`.
            data_samples (list[TextRecogDataSample], optional): Batch of
                TextRecogDataSample, containing valid_ratio information.
                When ``None`` (or empty), no source mask is built and
                ``None`` is passed to the encoder layers as the mask.
                Defaults to None.

        Returns:
            Tensor: The encoder output tensor. Shape :math:`(N, T, C)`.
        """
        n, c, h, w = feat.size()

        # Flatten the spatial grid into a sequence: (N, C, H, W) -> (N, T, C)
        # with T = H * W.
        feat = feat.view(n, c, h * w).permute(0, 2, 1).contiguous()

        # ``data_samples`` defaults to None; iterating over None would raise
        # a TypeError, so treat it like an empty batch of samples.
        if data_samples is None:
            valid_ratios = []
        else:
            valid_ratios = [
                data_sample.get('valid_ratio') for data_sample in data_samples
            ]
        mask = self._get_source_mask(feat, valid_ratios)

        output = feat
        for enc_layer in self.layer_stack:
            output = enc_layer(output, mask)
        output = self.layer_norm(output)

        return output
Read the Docs v: latest
Versions
latest
stable
v1.0.1
v1.0.0
0.x
v0.6.3
v0.6.2
v0.6.1
v0.6.0
v0.5.0
v0.4.1
v0.4.0
v0.3.0
v0.2.1
v0.2.0
v0.1.0
dev-1.x
Downloads
pdf
html
epub
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.