HTTP replication (#537)

psarna · web-flow · commit f50803f71f81 · 2023-07-24T11:47:14.000Z
* replication: add HTTP endpoint skeleton

* replication: add HTTP implementation for /frames endpoint

It still does not perform any kind of handshake; it just asks
for frames starting at offset N.

Tested with:
 $ cargo run -- --http-replication-listen-addr 127.0.0.1:8081
 $ curl -d '{"next_offset": 0}' -v localhost:8081/frames

* replication: add /hello endpoint

Serves the same purpose as gRPC hello and provides:
 - generation id
 - generation start index
 - database id

* replication: limit HTTP responses to 256 frames

An arbitrary limit to make sure we do not overload memory.

* replication/primary: unconditionally update max frame number

With gRPC replication, it was reasonable to assume that there
are listeners to the max frame number notifier, but with HTTP
it's not necessarily the case. Since watch::send() fails if there are
no receivers, we hereby switch to send_replace(), which successfully
updates the value even if there are no active receivers.

* replication: add crude snapshot support for HTTP

The HTTP replication will now react to SnapshotRequired error
by just sending all frames from a snapshot to the user.
That's prone to overcommitting memory, but better than giving up.
This change should be followed up by streaming the snapshot frames
in multiple smaller bits.

* replication: migrate HTTP to Axum

Following the example in admin_api.rs
diff --git a/sqld/src/lib.rs b/sqld/src/lib.rs
@@ -104,6 +104,7 @@ pub struct Config {
     pub allow_replica_overwrite: bool,
     pub max_response_size: u64,
     pub snapshot_exec: Option<String>,
+    pub http_replication_addr: Option<SocketAddr>,
 }
 
 impl Default for Config {
@@ -143,6 +144,7 @@ impl Default for Config {
             allow_replica_overwrite: false,
             max_response_size: 10 * 1024 * 1024, // 10MiB
             snapshot_exec: None,
+            http_replication_addr: None,
         }
     }
 }
@@ -495,11 +497,17 @@ async fn start_primary(
             config.rpc_server_key.clone(),
             config.rpc_server_ca_cert.clone(),
             db_factory.clone(),
-            logger,
+            logger.clone(),
             idle_shutdown_layer.clone(),
         ));
     }
 
+    if let Some(ref addr) = config.http_replication_addr {
+        // FIXME: let's bring it back once I figure out how Axum works
+        // let auth = get_auth(config)?;
+        join_set.spawn(replication::http::run(*addr, logger));
+    }
+
     run_service(
         db_factory,
         config,
diff --git a/sqld/src/main.rs b/sqld/src/main.rs
@@ -183,6 +183,10 @@ struct Cli {
     /// Set a command to execute when a snapshot file is generated.
     #[clap(long, env = "SQLD_SNAPSHOT_EXEC")]
     snapshot_exec: Option<String>,
+
+    /// The address and port for the replication HTTP API.
+    #[clap(long, env = "SQLD_HTTP_REPLICATION_LISTEN_ADDR")]
+    http_replication_listen_addr: Option<SocketAddr>,
 }
 
 #[derive(clap::Subcommand, Debug)]
@@ -292,6 +296,7 @@ fn config_from_args(args: Cli) -> Result<Config> {
         allow_replica_overwrite: args.allow_replica_overwrite,
         max_response_size: args.max_response_size.0,
         snapshot_exec: args.snapshot_exec,
+        http_replication_addr: args.http_replication_listen_addr,
     })
 }
 
diff --git a/sqld/src/replication/frame.rs b/sqld/src/replication/frame.rs
@@ -27,7 +27,7 @@ pub struct FrameHeader {
     pub size_after: u32,
 }
 
-#[derive(Clone)]
+#[derive(Clone, serde::Serialize, serde::Deserialize)]
 /// The owned version of a replication frame.
 /// Cloning this is cheap.
 pub struct Frame {
diff --git a/sqld/src/replication/http.rs b/sqld/src/replication/http.rs
@@ -0,0 +1,195 @@
+use crate::replication::LogReadError;
+use crate::replication::{frame::Frame, primary::frame_stream::FrameStream, ReplicationLogger};
+use anyhow::{Context, Result};
+use axum::extract::State;
+use hyper::{Body, Response};
+use std::net::SocketAddr;
+use std::sync::Arc;
+
+/// JSON body expected by the `/frames` endpoint.
+#[derive(Debug, serde::Serialize, serde::Deserialize)]
+pub struct FramesRequest {
+    /// Offset of the first frame the caller wants to receive. The handler
+    /// treats frames as 1-based, so a value of 0 is served as if it were 1.
+    pub next_offset: u64,
+}
+
+/// A batch of replication frames; serialized as the JSON body of a
+/// successful `/frames` response.
+#[derive(Debug, serde::Serialize, serde::Deserialize)]
+pub struct Frames {
+    /// The frames carried by this batch, in the order they were pushed.
+    pub frames: Vec<Frame>,
+}
+
+/// JSON body returned by the `/hello` endpoint.
+#[derive(Debug, serde::Serialize, serde::Deserialize)]
+pub struct Hello {
+    /// Taken from `logger.generation.id`.
+    pub generation_id: uuid::Uuid,
+    /// Taken from `logger.generation.start_index`.
+    pub generation_start_index: u64,
+    /// Value returned by `logger.database_id()`.
+    pub database_id: uuid::Uuid,
+}
+
+// Thin wrapper to allow returning anyhow errors from axum
+struct AppError(anyhow::Error);
+
+impl axum::response::IntoResponse for AppError {
+    fn into_response(self) -> axum::response::Response {
+        (
+            hyper::StatusCode::INTERNAL_SERVER_ERROR,
+            format!("Replication failed: {}", self.0),
+        )
+            .into_response()
+    }
+}
+
+impl<E: Into<anyhow::Error>> From<E> for AppError {
+    fn from(err: E) -> Self {
+        Self(err.into())
+    }
+}
+
+/// Serves the replication HTTP API on `addr` until the server terminates.
+///
+/// Routes:
+///  - `GET /hello`   -> `handle_hello`
+///  - `POST /frames` -> `handle_frames`
+///
+/// # Errors
+/// Fails if `addr` cannot be bound or if the server exits with an error.
+pub async fn run(addr: SocketAddr, logger: Arc<ReplicationLogger>) -> Result<()> {
+    use axum::routing::{get, post};
+    let router = axum::Router::new()
+        .route("/hello", get(handle_hello))
+        .route("/frames", post(handle_frames))
+        .with_state(logger);
+
+    // The context names this server specifically, so a bind failure here is
+    // not mistaken for one coming from the admin API.
+    let server = hyper::Server::try_bind(&addr)
+        .context("Could not bind replication HTTP API server")?
+        .serve(router.into_make_service());
+
+    tracing::info!(
+        "Listening for replication HTTP API requests on {}",
+        server.local_addr()
+    );
+    server.await?;
+    Ok(())
+}
+
+impl Frames {
+    /// Creates an empty batch.
+    pub fn new() -> Self {
+        Self { frames: Vec::new() }
+    }
+
+    /// Appends `frame` to the end of the batch.
+    pub fn push(&mut self, frame: Frame) {
+        self.frames.push(frame);
+    }
+
+    /// Returns `true` when the batch holds no frames.
+    pub fn is_empty(&self) -> bool {
+        self.frames.is_empty()
+    }
+}
+
+// A public argument-less `new()` should come with a matching `Default`
+// (clippy: `new_without_default`).
+impl Default for Frames {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Handles `GET /hello`: reports the logger's generation id, generation start
+/// index and database id as a JSON-encoded [`Hello`].
+///
+/// Any failure (database id lookup, serialization, response construction) is
+/// converted into [`AppError`].
+async fn handle_hello(
+    State(logger): State<Arc<ReplicationLogger>>,
+) -> std::result::Result<Response<Body>, AppError> {
+    let hello = Hello {
+        generation_id: logger.generation.id,
+        generation_start_index: logger.generation.start_index,
+        database_id: logger.database_id()?,
+    };
+
+    // The body is JSON, so label it as such. Builder errors are propagated
+    // with `?` rather than panicking inside the handler.
+    let resp = Response::builder()
+        .status(hyper::StatusCode::OK)
+        .header(hyper::header::CONTENT_TYPE, "application/json")
+        .body(Body::from(serde_json::to_vec(&hello)?))?;
+    Ok(resp)
+}
+
+/// Builds a response with status `code` whose body is the JSON object
+/// `{"error": msg}`.
+fn error(msg: &str, code: hyper::StatusCode) -> Response<Body> {
+    let err = serde_json::json!({ "error": msg });
+    let body = serde_json::to_vec(&err).expect("a JSON object with a string value serializes");
+    Response::builder()
+        .status(code)
+        // The body is JSON, so label it as such.
+        .header(hyper::header::CONTENT_TYPE, "application/json")
+        .body(Body::from(body))
+        .expect("a status code and a static header always form a valid response")
+}
+
+/// Handles `POST /frames`: returns up to `MAX_FRAMES_IN_SINGLE_RESPONSE`
+/// frames starting at the requested offset.
+///
+/// Responses:
+///  - `200 OK` with a JSON-encoded [`Frames`] body when frames were read,
+///  - `204 No Content` when nothing is available from the requested offset,
+///  - `400 Bad Request` with a JSON error when the body is not a valid
+///    [`FramesRequest`],
+///  - `500 Internal Server Error` when reading the log fails.
+///
+/// When the log reports `SnapshotRequired` before any frame was read, the
+/// frames are loaded from a snapshot instead; in that case the per-response
+/// frame limit is not applied.
+async fn handle_frames(
+    State(logger): State<Arc<ReplicationLogger>>,
+    req: String, // it's a JSON, but Axum errors-out if Content-Type isn't set to json, which is too strict
+) -> std::result::Result<Response<Body>, AppError> {
+    use futures::StreamExt;
+
+    const MAX_FRAMES_IN_SINGLE_RESPONSE: usize = 256;
+
+    let FramesRequest { next_offset } = match serde_json::from_str(&req) {
+        Ok(req) => req,
+        Err(resp) => return Ok(error(&resp.to_string(), hyper::StatusCode::BAD_REQUEST)),
+    };
+    tracing::trace!("Requested next offset: {next_offset}");
+
+    let next_offset = next_offset.max(1); // Frames start from 1
+    let current_frameno = next_offset - 1;
+    let mut frame_stream = FrameStream::new(logger.clone(), current_frameno);
+    tracing::trace!(
+        "Max available frame_no: {}",
+        frame_stream.max_available_frame_no
+    );
+    if frame_stream.max_available_frame_no < next_offset {
+        tracing::trace!("No frames available starting {next_offset}, returning 204 No Content");
+        return Ok(Response::builder()
+            .status(hyper::StatusCode::NO_CONTENT)
+            .body(Body::empty())?);
+    }
+
+    let mut frames = Frames::new();
+    for _ in 0..MAX_FRAMES_IN_SINGLE_RESPONSE {
+        match frame_stream.next().await {
+            Some(Ok(frame)) => {
+                tracing::trace!("Read frame {}", frame_stream.current_frame_no);
+                frames.push(frame);
+            }
+            Some(Err(LogReadError::SnapshotRequired)) => {
+                drop(frame_stream);
+                if frames.is_empty() {
+                    tracing::debug!("Snapshot required, switching to snapshot mode");
+                    frames = load_snapshot(logger, next_offset)?;
+                } else {
+                    tracing::debug!("Snapshot required, but some frames were read - returning.");
+                }
+                break;
+            }
+            Some(Err(e)) => {
+                tracing::error!("Error reading frame: {}", e);
+                return Ok(Response::builder()
+                    .status(hyper::StatusCode::INTERNAL_SERVER_ERROR)
+                    .body(Body::empty())?);
+            }
+            None => break,
+        }
+
+        // Stop once the stream has caught up with what was available when it was
+        // created, instead of polling it again for further frames.
+        if frame_stream.max_available_frame_no <= frame_stream.current_frame_no {
+            break;
+        }
+    }
+
+    if frames.is_empty() {
+        return Ok(Response::builder()
+            .status(hyper::StatusCode::NO_CONTENT)
+            .body(Body::empty())?);
+    }
+
+    // The body is JSON, so label it as such.
+    Ok(Response::builder()
+        .status(hyper::StatusCode::OK)
+        .header(hyper::header::CONTENT_TYPE, "application/json")
+        .body(Body::from(serde_json::to_string(&frames)?))?)
+}
+
+// FIXME: In the HTTP stateless spirit, we just unconditionally send the whole snapshot
+// here, which is an obvious overcommit. We should instead stream in smaller parts
+// if the snapshot is large.
+/// Reads the frames of the snapshot that `logger` returns for `from`.
+///
+/// Returns an empty [`Frames`] when the logger has no snapshot to offer.
+///
+/// # Errors
+/// Fails when the snapshot lookup fails, or when a frame cannot be read or decoded.
+fn load_snapshot(logger: Arc<ReplicationLogger>, from: u64) -> Result<Frames> {
+    // A failed lookup is an error, not "no snapshot": swallowing it would make the
+    // caller answer as if there were simply nothing to send.
+    let snapshot = match logger.get_snapshot_file(from)? {
+        Some(snapshot) => snapshot,
+        None => {
+            tracing::trace!("No snapshot available, returning no frames");
+            return Ok(Frames::new());
+        }
+    };
+    let mut frames = Frames::new();
+    for bytes in snapshot.frames_iter_from(from) {
+        frames.push(Frame::try_from_bytes(bytes?)?);
+    }
+    tracing::trace!(
+        "Loaded {} frames from the snapshot file",
+        frames.frames.len()
+    );
+    Ok(frames)
+}
diff --git a/sqld/src/replication/mod.rs b/sqld/src/replication/mod.rs
@@ -1,4 +1,5 @@
 pub mod frame;
+pub mod http;
 pub mod primary;
 pub mod replica;
 mod snapshot;
diff --git a/sqld/src/replication/primary/frame_stream.rs b/sqld/src/replication/primary/frame_stream.rs
@@ -11,8 +11,8 @@ use crate::replication::{FrameNo, LogReadError, ReplicationLogger};
 /// Streams frames from the replication log starting at `current_frame_no`.
 /// Only stops if the current frame is not in the log anymore.
 pub struct FrameStream {
-    current_frame_no: FrameNo,
-    max_available_frame_no: FrameNo,
+    pub(crate) current_frame_no: FrameNo,
+    pub(crate) max_available_frame_no: FrameNo,
     logger: Arc<ReplicationLogger>,
     state: FrameStreamState,
 }
diff --git a/sqld/src/replication/primary/logger.rs b/sqld/src/replication/primary/logger.rs
@@ -295,7 +295,7 @@ impl ReplicationLoggerHookCtx {
 
     fn commit(&self) -> anyhow::Result<()> {
         let new_frame_no = self.logger.commit()?;
-        let _ = self.logger.new_frame_notifier.send(new_frame_no);
+        self.logger.new_frame_notifier.send_replace(new_frame_no);
         Ok(())
     }
 

Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,7 @@ pub struct FrameHeader {`
`27`	`27`	`pub size_after: u32,`
`28`	`28`	`}`
`29`	`29`
`30`		`-#[derive(Clone)]`
	`30`	`+#[derive(Clone, serde::Serialize, serde::Deserialize)]`
`31`	`31`	`/// The owned version of a replication frame.`
`32`	`32`	`/// Cloning this is cheap.`
`33`	`33`	`pub struct Frame {`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`pub mod frame;`
	`2`	`+pub mod http;`
`2`	`3`	`pub mod primary;`
`3`	`4`	`pub mod replica;`
`4`	`5`	`mod snapshot;`
Original file line number	Diff line number	Diff line change
`@@ -295,7 +295,7 @@ impl ReplicationLoggerHookCtx {`
`295`	`295`
`296`	`296`	`fn commit(&self) -> anyhow::Result<()> {`
`297`	`297`	`let new_frame_no = self.logger.commit()?;`
`298`		`- let _ = self.logger.new_frame_notifier.send(new_frame_no);`
	`298`	`+ self.logger.new_frame_notifier.send_replace(new_frame_no);`
`299`	`299`	`Ok(())`
`300`	`300`	`}`
`301`	`301`