Abstract
Post-hoc explanation methods for black-box models often struggle with
faithfulness and human interpretability due to the lack of explainability in
current neural models. Meanwhile, B-cos networks have been introduced to
improve model explainability through architectural and computational
adaptations, but their application has so far been limited to computer vision
models and their associated training pipelines. In this work, we introduce
B-cos LMs, i.e., B-cos networks empowered for NLP tasks. Our approach directly
transforms pre-trained language models into B-cos LMs by combining B-cos
conversion and task fine-tuning, improving efficiency compared to previous
B-cos methods. Our automatic and human evaluation results demonstrate that
B-cos LMs produce more faithful and human interpretable explanations than post
hoc methods, while maintaining task performance comparable to conventional
fine-tuning. Our in-depth analysis explores how B-cos LMs differ from
conventionally fine-tuned models in their learning processes and explanation
patterns. Finally, we provide practical guidelines for effectively building
B-cos LMs based on our findings. Our code is available at
https://anonymous.4open.science/r/bcos_lm.
BibTeX
@online{Wang2502.12992,
  title       = {{B-cos} {LM}: Efficiently Transforming Pre-trained Language Models for Improved Explainability},
  author      = {Wang, Yifan and Rao, Sukrut and Lee, Ji-Ung and Jobanputra, Mayank and Demberg, Vera},
  langid      = {english},
  url         = {https://arxiv.org/abs/2502.12992},
  eprint      = {2502.12992},
  eprinttype  = {arXiv},
  eprintclass = {cs.CL},
  year        = {2025},
  abstract    = {Post-hoc explanation methods for black-box models often struggle with
                 faithfulness and human interpretability due to the lack of explainability in
                 current neural models. Meanwhile, B-cos networks have been introduced to
                 improve model explainability through architectural and computational
                 adaptations, but their application has so far been limited to computer vision
                 models and their associated training pipelines. In this work, we introduce
                 B-cos LMs, i.e., B-cos networks empowered for NLP tasks. Our approach directly
                 transforms pre-trained language models into B-cos LMs by combining B-cos
                 conversion and task fine-tuning, improving efficiency compared to previous
                 B-cos methods. Our automatic and human evaluation results demonstrate that
                 B-cos LMs produce more faithful and human interpretable explanations than post
                 hoc methods, while maintaining task performance comparable to conventional
                 fine-tuning. Our in-depth analysis explores how B-cos LMs differ from
                 conventionally fine-tuned models in their learning processes and explanation
                 patterns. Finally, we provide practical guidelines for effectively building
                 B-cos LMs based on our findings. Our code is available at
                 https://anonymous.4open.science/r/bcos_lm.},
}
Endnote
%0 Report %A Wang, Yifan %A Rao, Sukrut %A Lee, Ji-Ung %A Jobanputra, Mayank %A Demberg, Vera %+ External Organizations Computer Vision and Machine Learning, MPI for Informatics, Max Planck Society External Organizations External Organizations Multimodal Language Processing, MPI for Informatics, Max Planck Society %T B-cos LM: Efficiently Transforming Pre-trained Language Models for Improved Explainability %G eng %U http://hdl.handle.net/21.11116/0000-0010-C156-3 %U https://arxiv.org/abs/2502.12992 %D 2025 %X Post-hoc explanation methods for black-box models often struggle with faithfulness and human interpretability due to the lack of explainability in current neural models. Meanwhile, B-cos networks have been introduced to improve model explainability through architectural and computational adaptations, but their application has so far been limited to computer vision models and their associated training pipelines. In this work, we introduce B-cos LMs, i.e., B-cos networks empowered for NLP tasks. Our approach directly transforms pre-trained language models into B-cos LMs by combining B-cos conversion and task fine-tuning, improving efficiency compared to previous B-cos methods. Our automatic and human evaluation results demonstrate that B-cos LMs produce more faithful and human interpretable explanations than post hoc methods, while maintaining task performance comparable to conventional fine-tuning. Our in-depth analysis explores how B-cos LMs differ from conventionally fine-tuned models in their learning processes and explanation patterns. Finally, we provide practical guidelines for effectively building B-cos LMs based on our findings. Our code is available at https://anonymous.4open.science/r/bcos_lm. %K Computer Science, Computation and Language, cs.CL,Computer Science, Artificial Intelligence, cs.AI