@inproceedings{miyawaki-etal-2022-scene,
    title = "Scene-Text Aware Image and Text Retrieval with Dual-Encoder",
    author = "Miyawaki, Shumpei and
      Hasegawa, Taku and
      Nishida, Kyosuke and
      Kato, Takuma and
      Suzuki, Jun",
    editor = "Louvan, Samuel and
      Madotto, Andrea and
      Madureira, Brielen",
    booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop",
    month = may,
    year = "2022",
    address = "Dublin, Ireland",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.acl-srw.34",
    doi = "10.18653/v1/2022.acl-srw.34",
    pages = "422--433",
    abstract = "We tackle the tasks of image and text retrieval using a dual-encoder model in which images and text are encoded independently. This model has attracted attention as an approach that enables efficient offline inferences by connecting both vision and language in the same semantic space; however, whether an image encoder as part of a dual-encoder model can interpret scene-text (i.e., the textual information in images) is unclear. We propose pre-training methods that encourage a joint understanding of the scene-text and surrounding visual information. The experimental results demonstrate that our methods improve the retrieval performances of the dual-encoder models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="miyawaki-etal-2022-scene">
    <titleInfo>
      <title>Scene-Text Aware Image and Text Retrieval with Dual-Encoder</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Shumpei</namePart>
      <namePart type="family">Miyawaki</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Taku</namePart>
      <namePart type="family">Hasegawa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kyosuke</namePart>
      <namePart type="family">Nishida</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Takuma</namePart>
      <namePart type="family">Kato</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jun</namePart>
      <namePart type="family">Suzuki</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Samuel</namePart>
        <namePart type="family">Louvan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andrea</namePart>
        <namePart type="family">Madotto</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Brielen</namePart>
        <namePart type="family">Madureira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dublin, Ireland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We tackle the tasks of image and text retrieval using a dual-encoder model in which images and text are encoded independently. This model has attracted attention as an approach that enables efficient offline inferences by connecting both vision and language in the same semantic space; however, whether an image encoder as part of a dual-encoder model can interpret scene-text (i.e., the textual information in images) is unclear. We propose pre-training methods that encourage a joint understanding of the scene-text and surrounding visual information. The experimental results demonstrate that our methods improve the retrieval performances of the dual-encoder models.</abstract>
    <identifier type="citekey">miyawaki-etal-2022-scene</identifier>
    <identifier type="doi">10.18653/v1/2022.acl-srw.34</identifier>
    <location>
      <url>https://aclanthology.org/2022.acl-srw.34</url>
    </location>
    <part>
      <date>2022-05</date>
      <extent unit="page">
        <start>422</start>
        <end>433</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Scene-Text Aware Image and Text Retrieval with Dual-Encoder
%A Miyawaki, Shumpei
%A Hasegawa, Taku
%A Nishida, Kyosuke
%A Kato, Takuma
%A Suzuki, Jun
%Y Louvan, Samuel
%Y Madotto, Andrea
%Y Madureira, Brielen
%S Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F miyawaki-etal-2022-scene
%X We tackle the tasks of image and text retrieval using a dual-encoder model in which images and text are encoded independently. This model has attracted attention as an approach that enables efficient offline inferences by connecting both vision and language in the same semantic space; however, whether an image encoder as part of a dual-encoder model can interpret scene-text (i.e., the textual information in images) is unclear. We propose pre-training methods that encourage a joint understanding of the scene-text and surrounding visual information. The experimental results demonstrate that our methods improve the retrieval performances of the dual-encoder models.
%R 10.18653/v1/2022.acl-srw.34
%U https://aclanthology.org/2022.acl-srw.34
%U https://doi.org/10.18653/v1/2022.acl-srw.34
%P 422-433
Markdown (Informal)
[Scene-Text Aware Image and Text Retrieval with Dual-Encoder](https://aclanthology.org/2022.acl-srw.34) (Miyawaki et al., ACL 2022)
ACL
Shumpei Miyawaki, Taku Hasegawa, Kyosuke Nishida, Takuma Kato, and Jun Suzuki. 2022. Scene-Text Aware Image and Text Retrieval with Dual-Encoder. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop, pages 422–433, Dublin, Ireland. Association for Computational Linguistics.
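
The abstract above describes a dual-encoder setup in which images and text are encoded independently and compared in a shared semantic space, which is what allows image embeddings to be indexed offline. As a rough illustration only, and not the scene-text-aware pre-training proposed in the paper, the sketch below scores text queries against images with a generic off-the-shelf CLIP dual encoder from Hugging Face transformers; the checkpoint name, image paths, and queries are placeholders.

```python
# Minimal sketch of dual-encoder image--text retrieval scoring.
# NOTE: uses a stock CLIP checkpoint as a generic stand-in; this is
# NOT the authors' model or their scene-text pre-training method.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model_name = "openai/clip-vit-base-patch32"  # placeholder checkpoint
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

# Because the two encoders are independent, images can be embedded once, offline.
images = [Image.open(p) for p in ["img_0.jpg", "img_1.jpg"]]  # placeholder paths
image_inputs = processor(images=images, return_tensors="pt")
with torch.no_grad():
    image_emb = model.get_image_features(**image_inputs)
image_emb = image_emb / image_emb.norm(dim=-1, keepdim=True)

# At query time, only the text encoder runs.
queries = ["a shop sign that says OPEN", "a bus showing a route number"]
text_inputs = processor(text=queries, return_tensors="pt", padding=True)
with torch.no_grad():
    text_emb = model.get_text_features(**text_inputs)
text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)

# Cosine similarity in the shared space ranks the indexed images per query.
similarity = text_emb @ image_emb.T          # shape: (num_queries, num_images)
ranking = similarity.argsort(dim=-1, descending=True)
print(ranking)
```

The paper's question is whether the image-side encoder in such a setup actually picks up scene-text like the sign in the first query; the proposed pre-training is meant to encourage that, and this sketch only shows the retrieval scoring pattern the abstract refers to.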