@inproceedings{thorbecke-etal-2024-fast,
title = "Fast Streaming Transducer {ASR} Prototyping via Knowledge Distillation with Whisper",
author = "Thorbecke, Iuliia and
Zuluaga Gomez, Juan Pablo and
Villatoro-Tello, Esa{\'u} and
Kumar, Shashi and
Rangappa, Pradeep and
Burdisso, Sergio and
Motlicek, Petr and
S, Karthik Pandia D and
Ganapathiraju, Aravind",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.976",
doi = "10.18653/v1/2024.findings-emnlp.976",
pages = "16747--16762",
abstract = "The training of automatic speech recognition (ASR) with little to no supervised data remains an open question. In this work, we demonstrate that streaming Transformer-Transducer (TT) models can be trained from scratch in consumer and accessible GPUs in their entirety with pseudo-labeled (PL) speech from foundational speech models (FSM). This allows training a robust ASR model just in one stage and does not require large data and computational budget compared to the two-step scenario with pre-training and fine-tuning. We perform a comprehensive ablation on different aspects of PL-based streaming TT models such as the impact of (1) shallow fusion of n-gram LMs, (2) contextual biasing with named entities, (3) chunk-wise decoding for low-latency streaming applications, and (4) TT overall performance as the function of the FSM size. Our results demonstrate that TT can be trained from scratch without supervised data, even with very noisy PLs. We validate the proposed framework on 6 languages from CommonVoice and propose multiple heuristics to filter out hallucinated PLs.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="thorbecke-etal-2024-fast">
<titleInfo>
<title>Fast Streaming Transducer ASR Prototyping via Knowledge Distillation with Whisper</title>
</titleInfo>
<name type="personal">
<namePart type="given">Iuliia</namePart>
<namePart type="family">Thorbecke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">Pablo</namePart>
<namePart type="family">Zuluaga Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Esaú</namePart>
<namePart type="family">Villatoro-Tello</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shashi</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pradeep</namePart>
<namePart type="family">Rangappa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergio</namePart>
<namePart type="family">Burdisso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Petr</namePart>
<namePart type="family">Motlicek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karthik</namePart>
<namePart type="given">Pandia</namePart>
<namePart type="given">D</namePart>
<namePart type="family">S</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aravind</namePart>
<namePart type="family">Ganapathiraju</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The training of automatic speech recognition (ASR) with little to no supervised data remains an open question. In this work, we demonstrate that streaming Transformer-Transducer (TT) models can be trained from scratch in consumer and accessible GPUs in their entirety with pseudo-labeled (PL) speech from foundational speech models (FSM). This allows training a robust ASR model just in one stage and does not require large data and computational budget compared to the two-step scenario with pre-training and fine-tuning. We perform a comprehensive ablation on different aspects of PL-based streaming TT models such as the impact of (1) shallow fusion of n-gram LMs, (2) contextual biasing with named entities, (3) chunk-wise decoding for low-latency streaming applications, and (4) TT overall performance as the function of the FSM size. Our results demonstrate that TT can be trained from scratch without supervised data, even with very noisy PLs. We validate the proposed framework on 6 languages from CommonVoice and propose multiple heuristics to filter out hallucinated PLs.</abstract>
<identifier type="citekey">thorbecke-etal-2024-fast</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.976</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.976</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>16747</start>
<end>16762</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Fast Streaming Transducer ASR Prototyping via Knowledge Distillation with Whisper
%A Thorbecke, Iuliia
%A Zuluaga Gomez, Juan Pablo
%A Villatoro-Tello, Esaú
%A Kumar, Shashi
%A Rangappa, Pradeep
%A Burdisso, Sergio
%A Motlicek, Petr
%A S, Karthik Pandia D.
%A Ganapathiraju, Aravind
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F thorbecke-etal-2024-fast
%X The training of automatic speech recognition (ASR) with little to no supervised data remains an open question. In this work, we demonstrate that streaming Transformer-Transducer (TT) models can be trained from scratch in consumer and accessible GPUs in their entirety with pseudo-labeled (PL) speech from foundational speech models (FSM). This allows training a robust ASR model just in one stage and does not require large data and computational budget compared to the two-step scenario with pre-training and fine-tuning. We perform a comprehensive ablation on different aspects of PL-based streaming TT models such as the impact of (1) shallow fusion of n-gram LMs, (2) contextual biasing with named entities, (3) chunk-wise decoding for low-latency streaming applications, and (4) TT overall performance as the function of the FSM size. Our results demonstrate that TT can be trained from scratch without supervised data, even with very noisy PLs. We validate the proposed framework on 6 languages from CommonVoice and propose multiple heuristics to filter out hallucinated PLs.
%R 10.18653/v1/2024.findings-emnlp.976
%U https://aclanthology.org/2024.findings-emnlp.976
%U https://doi.org/10.18653/v1/2024.findings-emnlp.976
%P 16747-16762
Markdown (Informal)
[Fast Streaming Transducer ASR Prototyping via Knowledge Distillation with Whisper](https://aclanthology.org/2024.findings-emnlp.976) (Thorbecke et al., Findings 2024)
ACL
- Iuliia Thorbecke, Juan Pablo Zuluaga Gomez, Esaú Villatoro-Tello, Shashi Kumar, Pradeep Rangappa, Sergio Burdisso, Petr Motlicek, Karthik Pandia D S, and Aravind Ganapathiraju. 2024. Fast Streaming Transducer ASR Prototyping via Knowledge Distillation with Whisper. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 16747–16762, Miami, Florida, USA. Association for Computational Linguistics.