@@ -51,7 +51,9 @@ typedef struct CandidateTranscript {
5151 * contributed to the creation of this transcript.
5252 */
5353 const double confidence ;
54+
5455} CandidateTranscript ;
56+
5557/**
5658 * @brief A structure to contain emissions (the softmax output of individual
5759 * timesteps) from the acoustic model.
@@ -316,23 +318,33 @@ Metadata* STT_SpeechToTextWithMetadata(ModelState* aCtx,
316318 unsigned int aNumResults );
317319
318320/**
319- * @brief Create a new streaming inference state. The streaming state returned
320- * by this function can then be passed to {@link STT_FeedAudioContent()}
321+ * @brief Use the Coqui STT model to generate emissions (the softmax output of individual
322+ * timesteps).
323+ * Unlike {@link STT_CreateStream()}, this call takes an input buffer directly
321324 * (aBuffer, aBufferSize) and returns a Metadata pointer rather than a streaming state.
322325 *
323326 * @param aCtx The ModelState pointer for the model to use.
324327 * @param aBuffer Read-only buffer of short samples (presumably audio; TODO confirm format).
325328 * @param aBufferSize Size of aBuffer (presumably in samples, not bytes; TODO confirm).
326329 * @param aNumResults Presumably the maximum number of results to return; TODO confirm.
327- * @return Zero for success, non-zero on failure.
330+ * @return A Metadata pointer. NOTE(review): the original text says the probability of
331+ * symbol j at timestep i is emissions[i * num_symbols + j]; confirm where emissions lives.
328332 */
329-
330333STT_EXPORT
331334Metadata * STT_SpeechToTextWithEmissions (ModelState * aCtx ,
332335 const short * aBuffer ,
333336 unsigned int aBufferSize ,
334- unsigned int aNumResults );
335-
337+ unsigned int aNumResults );
338+
339+ /**
340+ * @brief Create a new streaming inference state. The streaming state returned
341+ * by this function can then be passed to {@link STT_FeedAudioContent()}
342+ * and {@link STT_FinishStream()}.
343+ *
344+ * @param aCtx The ModelState pointer for the model to use.
345+ * @param[out] retval An opaque pointer that represents the streaming state. Can be NULL if an error occurs.
346+ * @return Zero for success, non-zero on failure.
347+ */
336348STT_EXPORT
337349int STT_CreateStream (ModelState * aCtx ,
338350 StreamingState * * retval );
0 commit comments