Commit

feat: ios long form file transcription (#62)
* feat: ios long form file transcription

* add changeset

* remove debug prints

* update example to use on-device recognition on iOS

* fix(example): recording expo-av

* feat: use input sample rate, add debug playback function

* remove debug logging
jamsch authored Nov 25, 2024
1 parent eef1cd0 commit 7fa9bc6
Showing 9 changed files with 777 additions and 707 deletions.
5 changes: 5 additions & 0 deletions .changeset/brown-flowers-build.md
@@ -0,0 +1,5 @@
+---
+"expo-speech-recognition": minor
+---
+
+Implemented long form file-based transcriptions for iOS
10 changes: 5 additions & 5 deletions README.md
@@ -485,7 +485,7 @@ function AudioPlayer(props: { source: string }) {
 > [!IMPORTANT]
 > This feature is available on Android 13+ and iOS. If the device does not support the feature, you'll receive an `error` event with the code `audio-capture`.
-Instead of using the microphone, you can configure the `audioSource.uri` option to transcribe audio files.
+Instead of using the microphone, you can configure the `audioSource.uri` option to transcribe audio files. For long-form audio files, you will likely want to use on-device recognition instead of network-based recognition, which you can opt into via `requiresOnDeviceRecognition`. On Android, you should first check that the user has the speech model installed with `getSupportedLocales()`.
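The Android pre-check mentioned in the added paragraph can be sketched as follows. The `locales`/`installedLocales` result shape and the commented-out module calls are assumptions inferred from this README, not verified API; check them against the library's typings:

```typescript
// Hypothetical result shape of `getSupportedLocales()`; verify against
// the library's typings before relying on it.
interface SupportedLocales {
  locales: string[]; // locales the recognition service supports
  installedLocales: string[]; // locales with an on-device model installed
}

// Pure helper: is an on-device speech model installed for the requested
// language? Comparison is case-insensitive and tolerant of "-" vs "_".
function hasInstalledModel(result: SupportedLocales, lang: string): boolean {
  const normalize = (tag: string) => tag.toLowerCase().replace(/_/g, "-");
  return result.installedLocales.some(
    (tag) => normalize(tag) === normalize(lang),
  );
}

// Illustrative usage (assumed call shape, Android):
// const result = await ExpoSpeechRecognitionModule.getSupportedLocales({});
// const onDevice = Platform.OS === "ios" || hasInstalledModel(result, "en-US");
// ExpoSpeechRecognitionModule.start({
//   lang: "en-US",
//   requiresOnDeviceRecognition: onDevice,
//   audioSource: { uri: "file:///path/to/audio.wav" },
// });
```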

### Supported input audio formats

@@ -500,8 +500,6 @@ The following audio formats have been verified on a Samsung Galaxy S23 Ultra on

 #### iOS

-> Due to a limitation in the underlying `SFSpeechURLRecognitionRequest` API, file-based transcription will only transcribe the **first 1 minute of the audio file**.
-
 The following audio formats have been verified on an iPhone 15 Pro Max on iOS 17.5:

- 16000hz 16-bit 1-channel PCM WAV ([example file](https://github.com/jamsch/expo-speech-recognition/blob/main/example/assets/audio-remote/remote-en-us-sentence-16000hz-pcm_s16le.wav))
@@ -524,6 +522,8 @@ function TranscribeAudioFile() {
     ExpoSpeechRecognitionModule.start({
       lang: "en-US",
       interimResults: true,
+      // Recommended: true on iOS, false on Android, unless the speech model is installed, which you can check with `getSupportedLocales()`
+      requiresOnDeviceRecognition: Platform.OS === "ios",
       audioSource: {
         /** Local file URI */
         uri: "file:///path/to/audio.wav",
@@ -534,7 +534,7 @@ function TranscribeAudioFile() {
         /** [Android only] Audio sampling rate in Hz. */
         sampleRate: 16000,
         /**
-         * [Android only] The delay between chunks of audio to stream to the speech recognition service.
+         * The delay between chunks of audio to stream to the speech recognition service.
          * Use this setting to avoid being rate-limited when using network-based recognition.
          * If you're using on-device recognition, you may want to increase this value to avoid unprocessed audio chunks.
          * Default: 50ms for network-based recognition, 15ms for on-device recognition
@@ -545,7 +545,7 @@ function TranscribeAudioFile() {
   };

   useSpeechRecognitionEvent("result", (ev) => {
-    // Note: multiple final results will likely be returned on Android
+    // Note: multiple final results will likely be returned
     // so you'll need to concatenate previous final results
     setTranscription(ev.results[0]?.transcript || "");
   });
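As the comment in this hunk notes, multiple final results may arrive and need to be concatenated. A minimal, framework-free sketch of that accumulation; the event shape here is simplified and assumed, so adapt it to the library's actual result event type:

```typescript
// Simplified/assumed shape of a "result" event.
interface ResultEvent {
  isFinal: boolean;
  results: { transcript: string }[];
}

// Returns a handler that keeps all finalized text and appends the
// current interim hypothesis, yielding the full display string.
function makeTranscriptAccumulator() {
  let finalized = ""; // concatenation of all final transcripts so far

  return function onResult(ev: ResultEvent): string {
    const transcript = ev.results[0]?.transcript ?? "";
    if (ev.isFinal) {
      // Fold final results into the persistent transcript.
      finalized = (finalized + " " + transcript).trim();
      return finalized;
    }
    // Interim results are shown but not persisted.
    return (finalized + " " + transcript).trim();
  };
}
```

Inside the component, this could back the handler above, e.g. `useSpeechRecognitionEvent("result", (ev) => setTranscription(onResult(ev)))`, instead of overwriting state with only the latest result.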
56 changes: 33 additions & 23 deletions example/App.tsx
@@ -998,12 +998,13 @@ function TranscribeLocalAudioFile() {
     ExpoSpeechRecognitionModule.start({
       lang: "en-US",
       interimResults: true,
-      requiresOnDeviceRecognition: true,
+      requiresOnDeviceRecognition: Platform.OS === "ios",
       audioSource: {
         uri: localUri,
         audioChannels: 1,
         audioEncoding: AudioEncodingAndroid.ENCODING_PCM_16BIT,
         sampleRate: 16000,
+        // chunkDelayMillis: 50,
       },
     });
   };
@@ -1188,29 +1189,38 @@ function RecordUsingExpoAvDemo() {

   const handleStart = async () => {
     setIsRecording(true);
+    try {
+      await Audio.setAudioModeAsync({
+        allowsRecordingIOS: true,
+        playsInSilentModeIOS: true,
+      });
+      const { recording } = await Audio.Recording.createAsync({
+        isMeteringEnabled: true,
+        android: {
+          bitRate: 32000,
+          extension: ".m4a",
+          outputFormat: AndroidOutputFormat.MPEG_4,
+          audioEncoder: AndroidAudioEncoder.AAC,
+          numberOfChannels: 1,
+          sampleRate: 16000,
+        },
+        ios: {
+          ...Audio.RecordingOptionsPresets.HIGH_QUALITY.ios,
+          numberOfChannels: 1,
+          bitRate: 16000,
+          extension: ".wav",
+          outputFormat: IOSOutputFormat.LINEARPCM,
+        },
+        web: {
+          mimeType: "audio/wav",
+          bitsPerSecond: 128000,
+        },
+      });
-    const { recording } = await Audio.Recording.createAsync({
-      isMeteringEnabled: true,
-      android: {
-        bitRate: 32000,
-        extension: ".m4a",
-        outputFormat: AndroidOutputFormat.MPEG_4,
-        audioEncoder: AndroidAudioEncoder.AAC,
-        numberOfChannels: 1,
-        sampleRate: 16000,
-      },
-      ios: {
-        ...Audio.RecordingOptionsPresets.HIGH_QUALITY.ios,
-        extension: ".wav",
-        outputFormat: IOSOutputFormat.LINEARPCM,
-      },
-      web: {
-        mimeType: "audio/wav",
-        bitsPerSecond: 128000,
-      },
-    });
-
-    recordingRef.current = recording;
+      recordingRef.current = recording;
+    } catch (e) {
+      console.log("Error starting recording", e);
+    }
   };
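After recording, the resulting file URI would typically be handed back to the recognizer with options mirroring the `TranscribeLocalAudioFile` example above. A small illustrative helper for building those options; the option names echo this diff, but treat the defaults and shape as assumptions rather than the library's exact API:

```typescript
// Assumed shape of the file-based recognition options used in this diff.
interface FileRecognitionOptions {
  lang: string;
  interimResults: boolean;
  requiresOnDeviceRecognition: boolean;
  audioSource: { uri: string; audioChannels: number; sampleRate: number };
}

// Build start() options for a just-recorded file. Channel count and
// sample rate match the 1-channel / 16000 Hz recording presets above.
function optionsForRecording(
  uri: string,
  platform: "ios" | "android",
): FileRecognitionOptions {
  return {
    lang: "en-US",
    interimResults: true,
    // On iOS, on-device recognition sidesteps server-side limits on
    // long files; on Android, prefer network recognition unless an
    // on-device model is installed.
    requiresOnDeviceRecognition: platform === "ios",
    audioSource: {
      uri,
      audioChannels: 1,
      sampleRate: 16000,
    },
  };
}

// Illustrative usage in a stop handler (assumed expo-av calls):
// await recordingRef.current?.stopAndUnloadAsync();
// const uri = recordingRef.current?.getURI();
// if (uri) ExpoSpeechRecognitionModule.start(optionsForRecording(uri, Platform.OS));
```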

const handleStop = async () => {