Chinese Journal of Information Fusion
ISSN: 2998-3371 (Online) | ISSN: 2998-3363 (Print)
Email: [email protected]

Submit Manuscript
Edit a Special Issue

TY  - JOUR
AU  - Li, Tieying
AU  - Kong, Lingdu
AU  - Yang, Xiaochun
AU  - Wang, Bin
AU  - Xu, Jiaxing
PY  - 2024
DA  - 2024/06/12
TI  - Bridging Modalities: A Survey of Cross-Modal Image-Text Retrieval
JO  - Chinese Journal of Information Fusion
T2  - Chinese Journal of Information Fusion
JF  - Chinese Journal of Information Fusion
VL  - 1
IS  - 1
SP  - 79
EP  - 92
DO  - 10.62762/CJIF.2024.361895
UR  - https://www.icck.org/article/abs/CJIF.2024.361895
KW  - multi-modal data
KW  - cross-modal retrieval
KW  - cross-modal alignment
KW  - cross-modal fusion
KW  - large language models
AB  - The rapid advancement of Internet technology, driven by social media and e-commerce platforms, has facilitated the generation and sharing of multimodal data, leading to increased interest in efficient cross-modal retrieval systems. Cross-modal image-text retrieval, encompassing tasks such as image query text (IqT) retrieval and text query image (TqI) retrieval, plays a crucial role in semantic searches across modalities. This paper presents a comprehensive survey of cross-modal image-text retrieval, addressing the limitations of previous studies that focused on single perspectives such as subspace learning or deep learning models. We categorize existing models into single-tower, dual-tower, real-value representation, and binary representation models based on their structure and feature representation. A key focus is placed on the fusion of modalities to enhance retrieval performance across diverse data types. Additionally, we explore the impact of multimodal Large Language Models (MLLMs) on cross-modal fusion and retrieval. Our study also provides a detailed overview of common datasets, evaluation metrics, and performance comparisons of representative methods. Finally, we identify current challenges and propose future research directions to advance the field of cross-modal image-text retrieval.
SN  - 2998-3371
PB  - Institute of Central Computation and Knowledge
LA  - English
ER  - 
% Survey article in the Chinese Journal of Information Fusion, vol. 1, no. 1.
% Authors are stored in "Last, First" form so the surname split is unambiguous.
% The page range uses "--" (en dash); a single hyphen is a typographical error.
% The month comes from the publication date 2024/06/12 in the RIS record.
@article{Li2024Bridging,
  author    = {Li, Tieying and Kong, Lingdu and Yang, Xiaochun and Wang, Bin and Xu, Jiaxing},
  title     = {Bridging Modalities: A Survey of Cross-Modal Image-Text Retrieval},
  journal   = {Chinese Journal of Information Fusion},
  year      = {2024},
  month     = jun,
  volume    = {1},
  number    = {1},
  pages     = {79--92},
  doi       = {10.62762/CJIF.2024.361895},
  url       = {https://www.icck.org/article/abs/CJIF.2024.361895},
  abstract  = {The rapid advancement of Internet technology, driven by social media and e-commerce platforms, has facilitated the generation and sharing of multimodal data, leading to increased interest in efficient cross-modal retrieval systems. Cross-modal image-text retrieval, encompassing tasks such as image query text (IqT) retrieval and text query image (TqI) retrieval, plays a crucial role in semantic searches across modalities. This paper presents a comprehensive survey of cross-modal image-text retrieval, addressing the limitations of previous studies that focused on single perspectives such as subspace learning or deep learning models. We categorize existing models into single-tower, dual-tower, real-value representation, and binary representation models based on their structure and feature representation. A key focus is placed on the fusion of modalities to enhance retrieval performance across diverse data types. Additionally, we explore the impact of multimodal Large Language Models (MLLMs) on cross-modal fusion and retrieval. Our study also provides a detailed overview of common datasets, evaluation metrics, and performance comparisons of representative methods. Finally, we identify current challenges and propose future research directions to advance the field of cross-modal image-text retrieval.},
  keywords  = {multi-modal data, cross-modal retrieval, cross-modal alignment, cross-modal fusion, large language models},
  issn      = {2998-3371},
  publisher = {Institute of Central Computation and Knowledge},
}
Copyright © 2024 by the Author(s). Published by Institute of Central Computation and Knowledge. This article is an open access article distributed under the terms and conditions of the Creative Commons Attribution (CC BY) license (https://creativecommons.org/licenses/by/4.0/), which permits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons licence, and indicate if changes were made.
Chinese Journal of Information Fusion
ISSN: 2998-3371 (Online) | ISSN: 2998-3363 (Print)
Email: [email protected]
Portico
All published articles are preserved here permanently:
https://www.portico.org/publishers/icck/