Abstract
Large Language Models (LLMs) can generate text by transferring style
attributes like formality resulting in formal or informal text. However,
instructing LLMs to generate text that, when spoken, is more intelligible in an
acoustically difficult environment, is an under-explored topic. We conduct the
first study to evaluate LLMs on a novel task of generating acoustically
intelligible paraphrases for better human speech perception in noise. Our
experiments in English demonstrated that with standard prompting, LLMs struggle
to control the non-textual attribute, i.e., acoustic intelligibility, while
efficiently capturing the desired textual attributes like semantic equivalence.
To remedy this issue, we propose a simple prompting approach,
prompt-and-select, which generates paraphrases by decoupling the desired
textual and non-textual attributes in the text generation pipeline. Our
approach resulted in a 40% relative improvement in human speech perception, by
paraphrasing utterances that are highly distorted in a listening condition with
babble noise at a signal-to-noise ratio (SNR) -5 dB. This study reveals the
limitation of LLMs in capturing non-textual attributes, and our proposed method
showcases the potential of using LLMs for better human speech perception in
noise.
BibTeX
@inproceedings{Chingacham_2408.04029,
  author    = {Chingacham, Anupama and Zhang, Miaoran and Demberg, Vera and Klakow, Dietrich},
  title     = {Human Speech Perception in Noise: Can Large Language Models Paraphrase to Improve It?},
  booktitle = {Proceedings of the 1st Human-Centered Large Language Modeling Workshop ({HuCLLM} 2024)},
  editor    = {Soni, Nikita and Flek, Lucie and Sharma, Ashish and Yang, Diyi and Hooker, Sara and Schwartz, H. Andrew},
  pages     = {1--15},
  publisher = {ACL},
  address   = {Bangkok, Thailand},
  year      = {2024},
  isbn      = {979-8-89176-152-0},
  doi       = {10.18653/v1/2024.hucllm-1.1},
  language  = {eng},
  abstract  = {Large Language Models (LLMs) can generate text by transferring style attributes like formality resulting in formal or informal text. However, instructing LLMs to generate text that when spoken, is more intelligible in an acoustically difficult environment, is an under-explored topic. We conduct the first study to evaluate LLMs on a novel task of generating acoustically intelligible paraphrases for better human speech perception in noise. Our experiments in English demonstrated that with standard prompting, LLMs struggle to control the non-textual attribute, i.e., acoustic intelligibility, while efficiently capturing the desired textual attributes like semantic equivalence. To remedy this issue, we propose a simple prompting approach, prompt-and-select, which generates paraphrases by decoupling the desired textual and non-textual attributes in the text generation pipeline. Our approach resulted in a 40\% relative improvement in human speech perception, by paraphrasing utterances that are highly distorted in a listening condition with babble noise at a signal-to-noise ratio (SNR) -5 dB. This study reveals the limitation of LLMs in capturing non-textual attributes, and our proposed method showcases the potential of using LLMs for better human speech perception in noise.},
}
Endnote
%0 Conference Proceedings %A Chingacham, Anupama %A Zhang, Miaoran %A Demberg, Vera %A Klakow, Dietrich %+ External Organizations External Organizations Multimodal Language Processing, MPI for Informatics, Max Planck Society External Organizations %T Human Speech Perception in Noise: Can Large Language Models Paraphrase to Improve It? %G eng %U http://hdl.handle.net/21.11116/0000-0010-43EE-7 %R 10.18653/v1/2024.hucllm-1.1 %D 2024 %B 1st Human-Centered Large Language Modeling Workshop %Z date of event: 2024-08-15 - 2024-08-15 %C Bangkok, Thailand %X Large Language Models (LLMs) can generate text by transferring style attributes like formality resulting in formal or informal text. However, instructing LLMs to generate text that when spoken, is more intelligible in an acoustically difficult environment, is an under-explored topic. We conduct the first study to evaluate LLMs on a novel task of generating acoustically intelligible paraphrases for better human speech perception in noise. Our experiments in English demonstrated that with standard prompting, LLMs struggle to control the non-textual attribute, i.e., acoustic intelligibility, while efficiently capturing the desired textual attributes like semantic equivalence. To remedy this issue, we propose a simple prompting approach, prompt-and-select, which generates paraphrases by decoupling the desired textual and non-textual attributes in the text generation pipeline. Our approach resulted in a 40% relative improvement in human speech perception, by paraphrasing utterances that are highly distorted in a listening condition with babble noise at a signal-to-noise ratio (SNR) -5 dB. This study reveals the limitation of LLMs in capturing non-textual attributes, and our proposed method showcases the potential of using LLMs for better human speech perception in noise. %K Computer Science, Computation and Language, cs.CL %B Proceedings of the 1st Human-Centered Large Language Modeling Workshop %E Soni, Nikita; Flek, Lucie; Sharma, Ashish; Yang, Diyi; Hooker, Sara; Schwartz, H. Andrew %P 1 - 15 %I ACL %@ 979-8-89176-152-0