@article{goldstein2025unified, author = {Ariel Goldstein and Haocheng Wang and Leonard Niekerken and Mariano Schain and Zaid Zada and Bobbi Aubrey and Tom Sheffer and Samuel Nastase and Harshvardhan Gazula and Aditi Singh and Aditi Rao and Gina Choe and Catherine Kim and Werner Doyle and Daniel Friedman and Sasha Devore and Patricia Dugan and Avinatan Hassidim and Michael Brenner and Yossi Matias and Orrin Devinsky and Adeen Flinker and Uri Hasson}, title = {A unified acoustic-to-speech-to-language embedding space captures the neural basis of natural language processing in everyday conversations}, abstract = {
This study introduces a unified computational framework connecting acoustic, speech and word-level linguistic structures to study the neural basis of everyday conversations in the human brain. We used electrocorticography to record neural signals across 100 h of speech production and comprehension as participants engaged in open-ended real-life conversations. We extracted low-level acoustic, mid-level speech and contextual word embeddings from a multimodal speech-to-text model (Whisper). We developed encoding models that linearly map these embeddings onto brain activity during speech production and comprehension. Remarkably, this model accurately predicts neural activity at each level of the language processing hierarchy across hours of new conversations not used in training the model. The internal processing hierarchy in the model is aligned with the cortical hierarchy for speech and language processing, where sensory and motor regions better align with the model's speech embeddings, and higher-level language areas better align with the model's language embeddings. The Whisper model captures the temporal sequence of language-to-speech encoding before word articulation (speech production) and speech-to-language encoding post articulation (speech comprehension). The embeddings learned by this model outperform symbolic models in capturing neural activity supporting natural speech and language. These findings support a paradigm shift towards unified computational models that capture the entire processing hierarchy for speech comprehension and production in real-world conversations.
}, year = {2025}, journal = {Nature Human Behaviour}, month = mar, issn = {2397-3374}, doi = {10.1038/s41562-025-02105-9}, language = {eng}, }
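
The encoding-model idea summarized in the abstract (a linear map from Whisper embeddings onto electrode activity, evaluated on conversations held out from training) can be illustrated with a minimal sketch. Everything below is an assumption for illustration only: synthetic arrays stand in for the embeddings and the neural recordings, and scikit-learn's Ridge regression stands in for whatever regularized linear estimator the authors actually used.

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr

rng = np.random.default_rng(0)

# Toy stand-ins: per-word embeddings (n_words x dim) and the corresponding
# response of a single electrode (n_words,). Dimensions are arbitrary here.
n_words, dim = 2000, 384
X = rng.standard_normal((n_words, dim))
true_w = rng.standard_normal(dim)
y = X.dot(true_w) + 5.0 * rng.standard_normal(n_words)  # noisy linear signal

# Hold out a test split to mimic evaluating on conversations unseen in training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Linear encoding model: regularized least squares from embedding to activity.
model = Ridge(alpha=10.0)
model.fit(X_train, y_train)

# Encoding performance is commonly reported as the correlation between
# predicted and observed held-out activity.
r, _ = pearsonr(model.predict(X_test), y_test)
print("held-out correlation r = %.3f" % r)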