Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 64 additions & 20 deletions firebaseai/src/LiveGenerationConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,63 @@

namespace Firebase.AI {

/// <summary>
/// A struct used to configure speech generation settings.
/// </summary>
public readonly struct SpeechConfig {
  // Name of the prebuilt voice to request; null/empty/whitespace means "use the model default".
  internal readonly string voice;

  private SpeechConfig(string voice) {
    this.voice = voice;
  }

  /// <summary>
  /// Creates a configuration that requests a specific prebuilt voice.
  /// See https://cloud.google.com/text-to-speech/docs/chirp3-hd for the list of available voices.
  /// </summary>
  /// <param name="voice">The name of the prebuilt voice to use.</param>
  /// <returns>A `SpeechConfig` that requests the given voice.</returns>
  public static SpeechConfig UsePrebuiltVoice(string voice) {
    return new SpeechConfig(voice);
  }

  /// <summary>
  /// Intended for internal use only.
  /// This method is used for serializing the object to JSON for the API request.
  /// </summary>
  internal Dictionary<string, object> ToJson() {
    var jsonDict = new Dictionary<string, object>();

    // An unset voice serializes to an empty config so the backend applies its default.
    if (string.IsNullOrWhiteSpace(voice)) {
      return jsonDict;
    }

    var prebuiltVoiceConfig = new Dictionary<string, object>() {
      { "voiceName", voice }
    };
    jsonDict["voiceConfig"] = new Dictionary<string, object>() {
      { "prebuiltVoiceConfig", prebuiltVoiceConfig }
    };
    return jsonDict;
  }
}

/// <summary>
/// A struct used to configure speech transcription settings.
/// </summary>
public readonly struct AudioTranscriptionConfig {

  /// <summary>
  /// Creates a new transcription configuration.
  /// </summary>
  /// <returns>A new transcription configuration.</returns>
  public static AudioTranscriptionConfig GetInstance() {
    // NOTE: an explicit parameterless struct constructor must be public in C# (CS8938),
    // and `default(AudioTranscriptionConfig)` is always constructible anyway, so no
    // private constructor is declared; this factory exists for API symmetry.
    return new AudioTranscriptionConfig();
  }

  /// <summary>
  /// Intended for internal use only.
  /// This method is used for serializing the object to JSON for the API request.
  /// </summary>
  internal Dictionary<string, object> ToJson() {
    // The configuration currently carries no options; it serializes to an empty object,
    // whose presence in the request enables transcription.
    return new Dictionary<string, object>();
  }
}
Expand All @@ -62,6 +93,8 @@ internal Dictionary<string, object> ToJson() {
/// A struct defining model parameters to be used when generating live session content.
/// </summary>
public readonly struct LiveGenerationConfig {
private readonly AudioTranscriptionConfig? _inputAudioTranscription;
private readonly AudioTranscriptionConfig? _outputAudioTranscription;
private readonly SpeechConfig? _speechConfig;
private readonly List<ResponseModality> _responseModalities;
private readonly float? _temperature;
Expand All @@ -81,6 +114,10 @@ public readonly struct LiveGenerationConfig {
/// for more details.
/// </summary>
///
/// <param name="inputAudioTranscription">The transcription configuration to use if transcribing audio input.</param>
///
/// <param name="outputAudioTranscription">The transcription configuration to use if transcribing audio output.</param>
///
/// <param name="speechConfig">The speech configuration to use if generating audio output.</param>
///
/// <param name="responseModalities">A list of response types to receive from the model.
Expand Down Expand Up @@ -155,6 +192,8 @@ public readonly struct LiveGenerationConfig {
/// [Cloud documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#generationconfig)
/// for more details.</param>
public LiveGenerationConfig(
AudioTranscriptionConfig? inputAudioTranscription = null,
AudioTranscriptionConfig? outputAudioTranscription = null,
SpeechConfig? speechConfig = null,
IEnumerable<ResponseModality> responseModalities = null,
float? temperature = null,
Expand All @@ -163,6 +202,8 @@ public LiveGenerationConfig(
int? maxOutputTokens = null,
float? presencePenalty = null,
float? frequencyPenalty = null) {
_inputAudioTranscription = inputAudioTranscription;
_outputAudioTranscription = outputAudioTranscription;
_speechConfig = speechConfig;
_responseModalities = responseModalities != null ?
new List<ResponseModality>(responseModalities) : new List<ResponseModality>();
Expand All @@ -178,8 +219,11 @@ public LiveGenerationConfig(
/// Intended for internal use only.
/// This method is used for serializing the object to JSON for the API request.
/// </summary>
internal Dictionary<string, object> ToJson() {
internal Dictionary<string, object> ToJson()
{
Dictionary<string, object> jsonDict = new();
if (_inputAudioTranscription.HasValue) jsonDict["inputAudioTranscription"] = _inputAudioTranscription?.ToJson();
if (_outputAudioTranscription.HasValue) jsonDict["outputAudioTranscription"] = _outputAudioTranscription?.ToJson();
if (_speechConfig.HasValue) jsonDict["speechConfig"] = _speechConfig?.ToJson();
if (_responseModalities != null && _responseModalities.Any()) {
jsonDict["responseModalities"] =
Expand Down
59 changes: 57 additions & 2 deletions firebaseai/src/LiveSessionResponse.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,18 @@ public string Text {
}
}

/// <summary>
/// The response's content that was a transcription, if it exists.
/// Returns null when the underlying message is not a transcription.
/// </summary>
// NOTE(review): this property shadows the `Transcription` type name within the class, and
// the `Transcription` struct does not visibly implement ILiveSessionMessage in this file —
// confirm the pattern below resolves to the type and can actually match at runtime.
public string Transcription {
  get {
    if (Message is Transcription transcription) {
      return transcription.Text;
    }
    // Not a transcription message.
    return null;
  }
}

/// <summary>
/// The response's content that was audio, if it exists.
/// </summary>
Expand Down Expand Up @@ -132,6 +144,40 @@ private LiveSessionResponse(ILiveSessionMessage liveSessionMessage) {
/// </summary>
public interface ILiveSessionMessage { }

/// <summary>
/// A transcription of the audio sent or received in a live session.
/// </summary>
// NOTE(review): LiveSessionResponse.Transcription pattern-matches `Message is Transcription`;
// that only works if this struct implements ILiveSessionMessage — confirm and add the
// interface if that is the intent.
public readonly struct Transcription {
  /// <summary>
  /// The transcribed text.
  /// </summary>
  public string Text { get; }

  /// <summary>
  /// Whether this is the end of the transcription.
  /// </summary>
  public bool Finished { get; }

  private Transcription(string text, bool finished) {
    Text = text;
    Finished = finished;
  }

  /// <summary>
  /// Intended for internal use only.
  /// This method is used for deserializing JSON responses and should not be called directly.
  /// </summary>
  internal static Transcription FromJson(Dictionary<string, object> jsonDict) {
    return new Transcription(
      jsonDict.ParseValue<string>("text"), jsonDict.ParseValue<bool>("finished")
    );
  }
}

/// <summary>
/// Content generated by the model in a live session.
/// </summary>
Expand All @@ -153,10 +199,17 @@ public interface ILiveSessionMessage { }
/// </summary>
public readonly bool Interrupted { get; }

/// <summary>
/// The transcription of the model's audio input, if transcription was requested.
/// </summary>
// Nullable to mirror Content: absent unless inputAudioTranscription was configured
// (presumably — confirm against LiveGenerationConfig usage).
public Transcription? InputTranscription { get; }

/// <summary>
/// The transcription of the model's audio output, if transcription was requested.
/// </summary>
public Transcription? OutputTranscription { get; }

private LiveSessionContent(ModelContent? content, bool turnComplete, bool interrupted,
    Transcription? inputTranscription, Transcription? outputTranscription) {
  Content = content;
  TurnComplete = turnComplete;
  Interrupted = interrupted;
  InputTranscription = inputTranscription;
  OutputTranscription = outputTranscription;
}

/// <summary>
/// Intended for internal use only.
/// This method is used for deserializing JSON responses and should not be called directly.
/// </summary>
internal static LiveSessionContent FromJson(Dictionary<string, object> jsonDict) {
  return new LiveSessionContent(
    jsonDict.ParseNullableObject("modelTurn", ModelContent.FromJson),
    jsonDict.ParseValue<bool>("turnComplete"),
    jsonDict.ParseValue<bool>("interrupted"),
    // NOTE(review): parsed as nullable to match the constructor's `Transcription?`
    // parameters and the optional "modelTurn" field above — confirm ParseNullableObject
    // is the intended helper for these optional fields.
    jsonDict.ParseNullableObject("inputTranscription", Transcription.FromJson),
    jsonDict.ParseNullableObject("outputTranscription", Transcription.FromJson)
  );
}
}
Expand Down