Chinese Journal of Information Fusion
ISSN: 2998-3371 (Online) | ISSN: 2998-3363 (Print)
Email: [email protected]


TY - JOUR
AU - Liu, Yupu
AU - Kang, Xin
AU - Matsumoto, Kazuyuki
AU - Zhou, Jiazheng
PY - 2025
DA - 2025/11/13
TI - VBCSNet: A Hybrid Attention-Based Multimodal Framework with Structured Self-Attention for Sentiment Classification
JO - Chinese Journal of Information Fusion
T2 - Chinese Journal of Information Fusion
JF - Chinese Journal of Information Fusion
VL - 2
IS - 4
SP - 356
EP - 369
DO - 10.62762/CJIF.2025.537775
UR - https://www.icck.org/article/abs/CJIF.2025.537775
KW - multimodal sentiment analysis
KW - vision-language models
KW - structured self-attention
KW - cross-attention
KW - contrastive learning
KW - interpretability
KW - cross-lingual evaluation
AB - Multimodal Sentiment Analysis (MSA), a pivotal task in affective computing, aims to enhance sentiment understanding by integrating heterogeneous data from modalities such as text, images, and audio. However, existing methods continue to face challenges in semantic alignment, modality fusion, and interpretability. To address these limitations, we propose VBCSNet, a hybrid attention-based multimodal framework that leverages the complementary strengths of Vision Transformer (ViT), BERT, and CLIP architectures. VBCSNet employs a Structured Self-Attention (SSA) mechanism to optimize intra-modal feature representation and a Cross-Attention module to achieve fine-grained semantic alignment across modalities. Furthermore, we introduce a multi-objective optimization strategy that jointly minimizes classification loss, modality alignment loss, and contrastive loss, thereby enhancing semantic consistency and feature discriminability. We evaluated VBCSNet on three multilingual multimodal sentiment datasets, including MVSA, IJCAI2019, and a self-constructed Japanese Twitter corpus (JP-Buzz). Experimental results demonstrated that VBCSNet significantly outperformed state-of-the-art baselines in terms of Accuracy, Macro-F1, and cross-lingual generalization. Per-class performance analysis further highlighted the model’s interpretability and robustness. Overall, VBCSNet advances sentiment classification across languages and domains while offering a transparent reasoning mechanism suitable for real-world applications in affective computing, human-computer interaction, and socially aware AI systems.
SN - 2998-3371
PB - Institute of Central Computation and Knowledge
LA - English
ER -
@article{Liu2025VBCSNet,
author = {Yupu Liu and Xin Kang and Kazuyuki Matsumoto and Jiazheng Zhou},
title = {VBCSNet: A Hybrid Attention-Based Multimodal Framework with Structured Self-Attention for Sentiment Classification},
journal = {Chinese Journal of Information Fusion},
year = {2025},
volume = {2},
number = {4},
pages = {356--369},
doi = {10.62762/CJIF.2025.537775},
url = {https://www.icck.org/article/abs/CJIF.2025.537775},
abstract = {Multimodal Sentiment Analysis (MSA), a pivotal task in affective computing, aims to enhance sentiment understanding by integrating heterogeneous data from modalities such as text, images, and audio. However, existing methods continue to face challenges in semantic alignment, modality fusion, and interpretability. To address these limitations, we propose VBCSNet, a hybrid attention-based multimodal framework that leverages the complementary strengths of Vision Transformer (ViT), BERT, and CLIP architectures. VBCSNet employs a Structured Self-Attention (SSA) mechanism to optimize intra-modal feature representation and a Cross-Attention module to achieve fine-grained semantic alignment across modalities. Furthermore, we introduce a multi-objective optimization strategy that jointly minimizes classification loss, modality alignment loss, and contrastive loss, thereby enhancing semantic consistency and feature discriminability. We evaluated VBCSNet on three multilingual multimodal sentiment datasets, including MVSA, IJCAI2019, and a self-constructed Japanese Twitter corpus (JP-Buzz). Experimental results demonstrated that VBCSNet significantly outperformed state-of-the-art baselines in terms of Accuracy, Macro-F1, and cross-lingual generalization. Per-class performance analysis further highlighted the model’s interpretability and robustness. Overall, VBCSNet advances sentiment classification across languages and domains while offering a transparent reasoning mechanism suitable for real-world applications in affective computing, human-computer interaction, and socially aware AI systems.},
keywords = {multimodal sentiment analysis, vision-language models, structured self-attention, cross-attention, contrastive learning, interpretability, cross-lingual evaluation},
issn = {2998-3371},
publisher = {Institute of Central Computation and Knowledge}
}
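
For readers who want a concrete picture of the approach summarized in the abstract, the following is a minimal PyTorch-style sketch, not the authors' released implementation. It assumes pre-extracted BERT text token features and ViT/CLIP image patch features; the names HybridFusionHead and joint_loss, the embedding dimension, the mean-pooling step, the InfoNCE-style contrastive formulation, and the loss weights w_align and w_con are illustrative assumptions rather than details taken from the paper.

# Hypothetical sketch of cross-attention fusion plus the joint
# classification / alignment / contrastive objective described in the abstract.
import torch
import torch.nn as nn
import torch.nn.functional as F

class HybridFusionHead(nn.Module):
    """Text tokens attend to image patches, then the fused sequence is classified."""
    def __init__(self, dim=768, num_heads=8, num_classes=3):
        super().__init__()
        self.cross_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.classifier = nn.Linear(dim, num_classes)

    def forward(self, text_feats, image_feats):
        # Cross-attention: queries from text, keys/values from the image modality.
        fused, _ = self.cross_attn(query=text_feats, key=image_feats, value=image_feats)
        pooled = fused.mean(dim=1)  # simple mean pooling over tokens (assumption)
        return self.classifier(pooled), pooled

def joint_loss(logits, labels, text_emb, image_emb,
               temperature=0.07, w_align=0.5, w_con=0.5):
    """Classification + modality-alignment + contrastive terms (weights assumed)."""
    ce = F.cross_entropy(logits, labels)
    # Alignment: pull paired text/image embeddings together via cosine distance.
    align = 1.0 - F.cosine_similarity(text_emb, image_emb).mean()
    # Contrastive (InfoNCE-style): paired samples are positives within the batch.
    t = F.normalize(text_emb, dim=-1)
    v = F.normalize(image_emb, dim=-1)
    sims = t @ v.t() / temperature
    targets = torch.arange(sims.size(0), device=sims.device)
    con = (F.cross_entropy(sims, targets) + F.cross_entropy(sims.t(), targets)) / 2
    return ce + w_align * align + w_con * con

The symmetric text-to-image and image-to-text contrastive term mirrors CLIP's image-text objective, which is one plausible reading of the contrastive loss mentioned in the abstract; the actual formulation and weighting used by VBCSNet are specified in the paper itself.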
Copyright © 2025 by the Author(s). Published by Institute of Central Computation and Knowledge. This article is an open access article distributed under the terms and conditions of the Creative Commons Attribution (CC BY) license (https://creativecommons.org/licenses/by/4.0/), which permits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons licence, and indicate if changes were made.
Portico
All published articles are preserved permanently at: https://www.portico.org/publishers/icck/