Commit 647c2a2

Merge 05e1939 into 0e81807 (2 parents: 0e81807 + 05e1939)

File tree: 8 files changed (+151, -21 lines)

firebase-ai/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
   # Unreleased
 
+  - [feature] Added support for sending realtime audio and video in a `LiveSession`.
   - [changed] Removed redundant internal exception types. (#7475)
 
   # 17.4.0
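
For context, a minimal Kotlin sketch of what the new feature enables, assuming an already-connected `LiveSession` (obtaining the session and the placeholder byte arrays `pcmChunk` and `jpegFrame` are outside this commit's scope):

import com.google.firebase.ai.type.InlineData
import com.google.firebase.ai.type.LiveSession

suspend fun sendRealtimeInput(session: LiveSession, pcmChunk: ByteArray, jpegFrame: ByteArray) {
  // New in this release: push each modality individually instead of batching
  // everything through the deprecated sendMediaStream.
  session.sendAudioRealtime(InlineData(pcmChunk, "audio/pcm"))
  session.sendVideoRealtime(InlineData(jpegFrame, "image/jpeg"))
  session.sendTextRealtime("Describe what you can hear and see.")
}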

firebase-ai/api.txt

Lines changed: 22 additions & 8 deletions
@@ -145,8 +145,11 @@ package com.google.firebase.ai.java {
      method public abstract org.reactivestreams.Publisher<com.google.firebase.ai.type.LiveServerMessage> receive();
      method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> send(com.google.firebase.ai.type.Content content);
      method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> send(String text);
+     method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> sendAudioRealtime(com.google.firebase.ai.type.InlineData audio);
      method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> sendFunctionResponse(java.util.List<com.google.firebase.ai.type.FunctionResponsePart> functionList);
-     method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> sendMediaStream(java.util.List<com.google.firebase.ai.type.MediaData> mediaChunks);
+     method @Deprecated public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> sendMediaStream(java.util.List<com.google.firebase.ai.type.MediaData> mediaChunks);
+     method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> sendTextRealtime(String text);
+     method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> sendVideoRealtime(com.google.firebase.ai.type.InlineData video);
      method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation();
      method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(boolean enableInterruptions);
      method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler);

@@ -801,6 +804,14 @@ package com.google.firebase.ai.type {
    public static final class ImagenSubjectReferenceType.Companion {
    }

+   public final class InlineData {
+     ctor public InlineData(byte[] data, String mimeType);
+     method public byte[] getData();
+     method public String getMimeType();
+     property public final byte[] data;
+     property public final String mimeType;
+   }
+
    public final class InlineDataPart implements com.google.firebase.ai.type.Part {
      ctor public InlineDataPart(byte[] inlineData, String mimeType);
      method public byte[] getInlineData();

@@ -891,20 +902,23 @@ package com.google.firebase.ai.type {
      method public kotlinx.coroutines.flow.Flow<com.google.firebase.ai.type.LiveServerMessage> receive();
      method public suspend Object? send(com.google.firebase.ai.type.Content content, kotlin.coroutines.Continuation<? super kotlin.Unit>);
      method public suspend Object? send(String text, kotlin.coroutines.Continuation<? super kotlin.Unit>);
+     method public suspend Object? sendAudioRealtime(com.google.firebase.ai.type.InlineData audio, kotlin.coroutines.Continuation<? super kotlin.Unit>);
      method public suspend Object? sendFunctionResponse(java.util.List<com.google.firebase.ai.type.FunctionResponsePart> functionList, kotlin.coroutines.Continuation<? super kotlin.Unit>);
-     method public suspend Object? sendMediaStream(java.util.List<com.google.firebase.ai.type.MediaData> mediaChunks, kotlin.coroutines.Continuation<? super kotlin.Unit>);
+     method @Deprecated public suspend Object? sendMediaStream(java.util.List<com.google.firebase.ai.type.MediaData> mediaChunks, kotlin.coroutines.Continuation<? super kotlin.Unit>);
+     method public suspend Object? sendTextRealtime(String text, kotlin.coroutines.Continuation<? super kotlin.Unit>);
+     method public suspend Object? sendVideoRealtime(com.google.firebase.ai.type.InlineData video, kotlin.coroutines.Continuation<? super kotlin.Unit>);
      method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation<? super kotlin.Unit>);
      method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, kotlin.coroutines.Continuation<? super kotlin.Unit>);
      method public void stopAudioConversation();
      method public void stopReceiving();
    }

-   @com.google.firebase.ai.type.PublicPreviewAPI public final class MediaData {
-     ctor public MediaData(byte[] data, String mimeType);
-     method public byte[] getData();
-     method public String getMimeType();
-     property public final byte[] data;
-     property public final String mimeType;
+   @Deprecated @com.google.firebase.ai.type.PublicPreviewAPI public final class MediaData {
+     ctor @Deprecated public MediaData(byte[] data, String mimeType);
+     method @Deprecated public byte[] getData();
+     method @Deprecated public String getMimeType();
+     property @Deprecated public final byte[] data;
+     property @Deprecated public final String mimeType;
    }

    public final class ModalityTokenCount {
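
Taken together, the API change reads as a migration from the batched `sendMediaStream` to per-modality realtime calls. A hedged before/after sketch (byte arrays are illustrative; preview-API opt-in annotations are omitted):

import com.google.firebase.ai.type.InlineData
import com.google.firebase.ai.type.LiveSession
import com.google.firebase.ai.type.MediaData

suspend fun migrate(session: LiveSession, audioBytes: ByteArray, frameBytes: ByteArray) {
  // Before (deprecated in this commit): one entry point for all media chunks.
  @Suppress("DEPRECATION")
  session.sendMediaStream(listOf(MediaData(audioBytes, "audio/pcm")))

  // After: a dedicated method per modality, each taking the new InlineData type.
  session.sendAudioRealtime(InlineData(audioBytes, "audio/pcm"))
  session.sendVideoRealtime(InlineData(frameBytes, "image/jpeg"))
}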

firebase-ai/gradle.properties

Lines changed: 1 addition & 1 deletion
@@ -12,5 +12,5 @@
   # See the License for the specific language governing permissions and
   # limitations under the License.
 
-  version=17.4.1
+  version=17.5.0
   latestReleasedVersion=17.4.0

firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt

Lines changed: 35 additions & 0 deletions
@@ -23,6 +23,7 @@ import com.google.common.util.concurrent.ListenableFuture
  import com.google.firebase.ai.type.Content
  import com.google.firebase.ai.type.FunctionCallPart
  import com.google.firebase.ai.type.FunctionResponsePart
+ import com.google.firebase.ai.type.InlineData
  import com.google.firebase.ai.type.LiveServerMessage
  import com.google.firebase.ai.type.LiveSession
  import com.google.firebase.ai.type.MediaData

@@ -126,13 +127,38 @@ public abstract class LiveSessionFutures internal constructor() {
      functionList: List<FunctionResponsePart>
    ): ListenableFuture<Unit>

+   /**
+    * Sends audio data to the server in realtime. Check
+    * https://ai.google.dev/api/live#bidigeneratecontentrealtimeinput for details about the realtime
+    * input usage.
+    * @param audio The audio data to send.
+    */
+   public abstract fun sendAudioRealtime(audio: InlineData): ListenableFuture<Unit>
+
+   /**
+    * Sends video data to the server in realtime. Check
+    * https://ai.google.dev/api/live#bidigeneratecontentrealtimeinput for details about the realtime
+    * input usage.
+    * @param video The video data to send. Video MIME type could be either video or image.
+    */
+   public abstract fun sendVideoRealtime(video: InlineData): ListenableFuture<Unit>
+
+   /**
+    * Sends text data to the server in realtime. Check
+    * https://ai.google.dev/api/live#bidigeneratecontentrealtimeinput for details about the realtime
+    * input usage.
+    * @param text The text data to send.
+    */
+   public abstract fun sendTextRealtime(text: String): ListenableFuture<Unit>

    /**
     * Streams client data to the model.
     *
     * Calling this after [startAudioConversation] will play the response audio immediately.
     *
     * @param mediaChunks The list of [MediaData] instances representing the media data to be sent.
     */
+   @Deprecated("Use sendAudioRealtime, sendVideoRealtime, or sendTextRealtime instead")
    public abstract fun sendMediaStream(mediaChunks: List<MediaData>): ListenableFuture<Unit>

    /**

@@ -190,6 +216,15 @@ public abstract class LiveSessionFutures internal constructor() {
    override fun sendFunctionResponse(functionList: List<FunctionResponsePart>) =
      SuspendToFutureAdapter.launchFuture { session.sendFunctionResponse(functionList) }

+   override fun sendAudioRealtime(audio: InlineData): ListenableFuture<Unit> =
+     SuspendToFutureAdapter.launchFuture { session.sendAudioRealtime(audio) }
+
+   override fun sendVideoRealtime(video: InlineData): ListenableFuture<Unit> =
+     SuspendToFutureAdapter.launchFuture { session.sendVideoRealtime(video) }
+
+   override fun sendTextRealtime(text: String): ListenableFuture<Unit> =
+     SuspendToFutureAdapter.launchFuture { session.sendTextRealtime(text) }
+
    override fun sendMediaStream(mediaChunks: List<MediaData>) =
      SuspendToFutureAdapter.launchFuture { session.sendMediaStream(mediaChunks) }
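
For Java-oriented callers, each new method returns a `ListenableFuture<Unit>`. A hedged Kotlin sketch of attaching a completion callback with Guava's `Futures`; the `LiveSessionFutures.from(session)` factory is assumed from the SDK's usual futures pattern and is not shown in this diff:

import com.google.common.util.concurrent.FutureCallback
import com.google.common.util.concurrent.Futures
import com.google.common.util.concurrent.ListenableFuture
import com.google.firebase.ai.java.LiveSessionFutures
import com.google.firebase.ai.type.InlineData
import com.google.firebase.ai.type.LiveSession
import java.util.concurrent.Executor

fun sendChunk(session: LiveSession, chunk: ByteArray, executor: Executor) {
  val futures = LiveSessionFutures.from(session) // assumed factory method
  val result: ListenableFuture<Unit> =
    futures.sendAudioRealtime(InlineData(chunk, "audio/pcm"))
  Futures.addCallback(
    result,
    object : FutureCallback<Unit> {
      override fun onSuccess(value: Unit?) { /* chunk accepted */ }
      override fun onFailure(t: Throwable) { /* surface the send error */ }
    },
    executor // caller-supplied Executor for the callback
  )
}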

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt

Lines changed: 71 additions & 5 deletions
@@ -28,6 +28,7 @@ import com.google.firebase.ai.common.JSON
  import com.google.firebase.ai.common.util.CancelledCoroutineScope
  import com.google.firebase.ai.common.util.accumulateUntil
  import com.google.firebase.ai.common.util.childJob
+ import com.google.firebase.ai.type.MediaData.Internal
  import com.google.firebase.annotations.concurrent.Blocking
  import io.ktor.client.plugins.websocket.DefaultClientWebSocketSession
  import io.ktor.websocket.Frame

@@ -255,20 +256,69 @@ internal constructor(
      }
    }

+   /**
+    * Sends an audio input stream to the model, using the realtime API.
+    *
+    * To learn more about audio formats, and the required state they should be provided in, see the
+    * docs on
+    * [Supported audio formats](https://cloud.google.com/vertex-ai/generative-ai/docs/live-api#supported-audio-formats)
+    *
+    * @param audio Raw audio data used to update the model on the client's conversation. For best
+    * results, send 16-bit PCM audio at 24kHz.
+    */
+   public suspend fun sendAudioRealtime(audio: InlineData) {
+     FirebaseAIException.catchAsync {
+       val jsonString =
+         Json.encodeToString(BidiGenerateContentRealtimeInputSetup(audio = audio).toInternal())
+       session.send(Frame.Text(jsonString))
+     }
+   }
+
+   /**
+    * Sends a video input stream to the model, using the realtime API.
+    *
+    * @param video Encoded video data, used to update the model on the client's conversation. The
+    * MIME type can be a video format (e.g., `video/webm`) or an image format (e.g., `image/jpeg`).
+    */
+   public suspend fun sendVideoRealtime(video: InlineData) {
+     FirebaseAIException.catchAsync {
+       val jsonString =
+         Json.encodeToString(BidiGenerateContentRealtimeInputSetup(video = video).toInternal())
+       session.send(Frame.Text(jsonString))
+     }
+   }
+
+   /**
+    * Sends a text input stream to the model, using the realtime API.
+    *
+    * @param text Text content to append to the current client's conversation.
+    */
+   public suspend fun sendTextRealtime(text: String) {
+     FirebaseAIException.catchAsync {
+       val jsonString =
+         Json.encodeToString(BidiGenerateContentRealtimeInputSetup(text = text).toInternal())
+       session.send(Frame.Text(jsonString))
+     }
+   }

    /**
     * Streams client data to the model.
     *
     * Calling this after [startAudioConversation] will play the response audio immediately.
     *
     * @param mediaChunks The list of [MediaData] instances representing the media data to be sent.
     */
+   @Deprecated("Use sendAudioRealtime, sendVideoRealtime, or sendTextRealtime instead")
    public suspend fun sendMediaStream(
      mediaChunks: List<MediaData>,
    ) {
      FirebaseAIException.catchAsync {
        val jsonString =
          Json.encodeToString(
-           BidiGenerateContentRealtimeInputSetup(mediaChunks.map { (it.toInternal()) }).toInternal()
+           BidiGenerateContentRealtimeInputSetup(
+             mediaChunks.map { InlineData(it.data, it.mimeType) }
+           )
+             .toInternal()
          )
        session.send(Frame.Text(jsonString))
      }

@@ -324,7 +374,7 @@ internal constructor(
        ?.listenToRecording()
        ?.buffer(UNLIMITED)
        ?.accumulateUntil(MIN_BUFFER_SIZE)
-       ?.onEach { sendMediaStream(listOf(MediaData(it, "audio/pcm"))) }
+       ?.onEach { sendAudioRealtime(InlineData(it, "audio/pcm")) }
        ?.catch { throw FirebaseAIException.from(it) }
        ?.launchIn(scope)
    }

@@ -464,15 +514,31 @@ internal constructor(
   *
   * End of turn is derived from user activity (eg; end of speech).
   */
- internal class BidiGenerateContentRealtimeInputSetup(val mediaChunks: List<MediaData.Internal>) {
+ internal class BidiGenerateContentRealtimeInputSetup(
+   val mediaChunks: List<InlineData>? = null,
+   val audio: InlineData? = null,
+   val video: InlineData? = null,
+   val text: String? = null
+ ) {
    @Serializable
    internal class Internal(val realtimeInput: BidiGenerateContentRealtimeInput) {
      @Serializable
      internal data class BidiGenerateContentRealtimeInput(
-       val mediaChunks: List<MediaData.Internal>
+       val mediaChunks: List<InlineData.Internal>?,
+       val audio: InlineData.Internal?,
+       val video: InlineData.Internal?,
+       val text: String?
      )
    }
-   fun toInternal() = Internal(Internal.BidiGenerateContentRealtimeInput(mediaChunks))
+   fun toInternal() =
+     Internal(
+       Internal.BidiGenerateContentRealtimeInput(
+         mediaChunks?.map { it.toInternal() },
+         audio?.toInternal(),
+         video?.toInternal(),
+         text
+       )
+     )
  }

  private companion object {
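
The three new methods all reuse the existing realtime-input envelope, populating exactly one of its optional fields per call. As a rough illustration (exact null handling depends on the SDK's `JSON` configuration, which this diff does not show), a `sendAudioRealtime` call produces a websocket text frame shaped like `{"realtimeInput": {"audio": {"mimeType": "audio/pcm", "data": "<base64 bytes>"}}}`. And a hedged caller-side loop, mirroring what `startAudioConversation` now does internally with recorded buffers:

import com.google.firebase.ai.type.InlineData
import com.google.firebase.ai.type.LiveSession
import kotlinx.coroutines.flow.Flow

// `audioChunks` is a hypothetical Flow of 16-bit PCM buffers at 24kHz,
// e.g. fed from AudioRecord; each buffer goes out as one realtime frame.
suspend fun streamAudio(session: LiveSession, audioChunks: Flow<ByteArray>) {
  audioChunks.collect { chunk ->
    session.sendAudioRealtime(InlineData(chunk, "audio/pcm"))
  }
}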

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/MediaData.kt

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ import kotlinx.serialization.Serializable
   * [Firebase documentation](https://firebase.google.com/docs/vertex-ai/input-file-requirements).
   */
  @PublicPreviewAPI
+ @Deprecated("Use InlineData instead", ReplaceWith("InlineData"))
  public class MediaData(public val data: ByteArray, public val mimeType: String) {
    @Serializable
    internal class Internal(
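
Since `MediaData` and `InlineData` share the same shape (a `ByteArray` plus a MIME type), migrating a call site is mechanical, and the `ReplaceWith("InlineData")` hint lets the IDE offer it as a quick-fix. A minimal sketch:

import com.google.firebase.ai.type.InlineData

// Before: MediaData(bytes, "audio/pcm")  -- now deprecated
// After:
val chunk = InlineData(byteArrayOf(0x01, 0x02), "audio/pcm")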

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/Part.kt

Lines changed: 16 additions & 7 deletions
@@ -19,6 +19,7 @@ package com.google.firebase.ai.type
  import android.graphics.Bitmap
  import android.graphics.BitmapFactory
  import android.util.Log
+ import com.google.firebase.ai.type.ImagenImageFormat.Internal
  import java.io.ByteArrayOutputStream
  import kotlinx.serialization.DeserializationStrategy
  import kotlinx.serialization.SerialName

@@ -161,14 +162,22 @@ internal constructor(

    @Serializable
    internal data class Internal(
-     @SerialName("inlineData") val inlineData: InlineData,
+     @SerialName("inlineData") val inlineData: InlineData.Internal,
      val thought: Boolean? = null,
      val thoughtSignature: String? = null
-   ) : InternalPart {
+   ) : InternalPart
+ }

-   @Serializable
-   internal data class InlineData(@SerialName("mimeType") val mimeType: String, val data: Base64)
- }
+ /**
+  * Represents binary data with an associated MIME type.
+  * @property data the binary data as a [ByteArray]
+  * @property mimeType an IANA standard MIME type.
+  */
+ public class InlineData(public val data: ByteArray, public val mimeType: String) {
+   @Serializable internal data class Internal(val mimeType: String, val data: Base64)
+
+   internal fun toInternal() =
+     Internal(mimeType, android.util.Base64.encodeToString(data, BASE_64_FLAGS))
  }

  /** Represents function call name and params received from requests. */

@@ -334,13 +343,13 @@ internal fun Part.toInternal(): InternalPart {
    is TextPart -> TextPart.Internal(text, isThought, thoughtSignature)
    is ImagePart ->
      InlineDataPart.Internal(
-       InlineDataPart.Internal.InlineData("image/jpeg", encodeBitmapToBase64Jpeg(image)),
+       InlineData.Internal("image/jpeg", encodeBitmapToBase64Jpeg(image)),
        isThought,
        thoughtSignature
      )
    is InlineDataPart ->
      InlineDataPart.Internal(
-       InlineDataPart.Internal.InlineData(
+       InlineData.Internal(
          mimeType,
          android.util.Base64.encodeToString(inlineData, BASE_64_FLAGS)
        ),
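
This change promotes `InlineData` from a private serialization detail of `InlineDataPart` to the single public carrier for raw bytes plus MIME type; its internal form Base64-encodes the bytes for the wire. A small illustration of that encoding, using `android.util.Base64` directly since `toInternal()` and `BASE_64_FLAGS` are internal to the SDK (the `NO_WRAP` flag here is an assumption):

import android.util.Base64
import com.google.firebase.ai.type.InlineData

val data = InlineData(byteArrayOf(0x00, 0x01), "application/octet-stream")
// The internal representation carries the same MIME type plus the Base64
// form of the bytes; [0x00, 0x01] encodes to "AAE=".
val encoded = Base64.encodeToString(data.data, Base64.NO_WRAP)
check(encoded == "AAE=")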

firebase-ai/src/testUtil/java/com/google/firebase/ai/JavaCompileTests.java

Lines changed: 4 additions & 0 deletions
@@ -51,6 +51,7 @@
  import com.google.firebase.ai.type.ImagenEditingConfig;
  import com.google.firebase.ai.type.ImagenInlineImage;
  import com.google.firebase.ai.type.ImagenMaskReference;
+ import com.google.firebase.ai.type.InlineData;
  import com.google.firebase.ai.type.InlineDataPart;
  import com.google.firebase.ai.type.LiveGenerationConfig;
  import com.google.firebase.ai.type.LiveServerContent;

@@ -365,6 +366,9 @@ public void onComplete() {

    byte[] bytes = new byte[] {(byte) 0xCA, (byte) 0xFE, (byte) 0xBA, (byte) 0xBE};
    session.sendMediaStream(List.of(new MediaData(bytes, "image/jxl")));
+   session.sendAudioRealtime(new InlineData(bytes, "audio/jxl"));
+   session.sendVideoRealtime(new InlineData(bytes, "image/jxl"));
+   session.sendTextRealtime("text");

    FunctionResponsePart functionResponse =
        new FunctionResponsePart("myFunction", new JsonObject(Map.of()));
