Shortcuts

Source code for mmocr.models.textrecog.encoders.abinet_vision_model

# Copyright (c) OpenMMLab. All rights reserved.
from mmocr.models.builder import ENCODERS, build_decoder, build_encoder
from .base_encoder import BaseEncoder


@ENCODERS.register_module()
class ABIVisionModel(BaseEncoder):
    """A wrapper of visual feature encoder and language token decoder that
    converts visual features into text tokens.

    Implementation of VisionEncoder in
    `ABINet <https://arxiv.org/abs/2103.06495>`_.

    Args:
        encoder (dict): Config for image feature encoder.
        decoder (dict): Config for language token decoder.
        init_cfg (dict): Specifies the initialization method for model layers.
        **kwargs: Extra keyword arguments are accepted but not used by this
            class.
    """

    def __init__(self,
                 encoder=dict(type='TransformerEncoder'),
                 decoder=dict(type='ABIVisionDecoder'),
                 init_cfg=dict(type='Xavier', layer='Conv2d'),
                 **kwargs):
        super().__init__(init_cfg=init_cfg)
        # Both sub-modules are instantiated from their configs through the
        # project builders imported at the top of the file.
        self.encoder = build_encoder(encoder)
        self.decoder = build_decoder(decoder)

    def forward(self, feat, img_metas=None):
        """Encode the visual features and decode them into text tokens.

        Args:
            feat (Tensor): Visual features of shape (N, E, H, W).
            img_metas (optional): Not used by this method. Defaults to None.

        Returns:
            dict: A dict with keys ``feature``, ``logits`` and ``attn_scores``.

            - | feature (Tensor): Shape (N, T, E). Raw visual features for
                language decoder.
            - | logits (Tensor): Shape (N, T, C). The raw logits for
                characters. C is the number of characters.
            - | attn_scores (Tensor): Shape (N, T, H, W). Intermediate result
                for vision-language aligner.
        """
        feat = self.encoder(feat)
        # The decoder is always called with ``out_enc=None``; only the encoded
        # features are passed on.
        return self.decoder(feat=feat, out_enc=None)
Read the Docs v: v0.4.0
Versions
latest
stable
v0.4.0
v0.3.0
v0.2.1
v0.2.0
v0.1.0
Downloads
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.