From 134e7ef97a290bcae3822936f06400d86c50a065 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Tue, 13 May 2025 22:28:46 +0200 Subject: [PATCH] Better S3 authentication errors and run formatter --- extension/httpfs/create_secret_functions.cpp | 21 +- extension/httpfs/crypto.cpp | 189 ++++++------ extension/httpfs/hffs.cpp | 36 +-- extension/httpfs/httpfs.cpp | 274 +++++++++-------- extension/httpfs/httpfs_client.cpp | 302 +++++++++++++------ extension/httpfs/httpfs_extension.cpp | 4 +- extension/httpfs/include/crypto.hpp | 4 +- extension/httpfs/include/hffs.hpp | 4 +- extension/httpfs/include/httpfs.hpp | 16 +- extension/httpfs/include/httpfs_client.hpp | 3 +- extension/httpfs/include/s3fs.hpp | 16 +- extension/httpfs/s3fs.cpp | 65 ++-- 12 files changed, 556 insertions(+), 378 deletions(-) diff --git a/extension/httpfs/create_secret_functions.cpp b/extension/httpfs/create_secret_functions.cpp index c93e8baa..d59c8af1 100644 --- a/extension/httpfs/create_secret_functions.cpp +++ b/extension/httpfs/create_secret_functions.cpp @@ -12,7 +12,7 @@ void CreateS3SecretFunctions::Register(DatabaseInstance &instance) { RegisterCreateSecretFunction(instance, "gcs"); } -static Value MapToStruct(const Value &map){ +static Value MapToStruct(const Value &map) { auto children = MapValue::GetChildren(map); child_list_t struct_fields; @@ -109,15 +109,17 @@ unique_ptr CreateS3SecretFunctions::CreateSecretFunctionInternal(Cli refresh = true; secret->secret_map["refresh_info"] = MapToStruct(named_param.second); } else { - throw InvalidInputException("Unknown named parameter passed to CreateSecretFunctionInternal: " + lower_name); + throw InvalidInputException("Unknown named parameter passed to CreateSecretFunctionInternal: " + + lower_name); } } return std::move(secret); } -CreateSecretInput CreateS3SecretFunctions::GenerateRefreshSecretInfo(const SecretEntry &secret_entry, Value &refresh_info) { - const auto &kv_secret = dynamic_cast(*secret_entry.secret); +CreateSecretInput CreateS3SecretFunctions::GenerateRefreshSecretInfo(const SecretEntry &secret_entry, + Value &refresh_info) { + const auto &kv_secret = dynamic_cast(*secret_entry.secret); CreateSecretInput result; result.on_conflict = OnCreateConflict::REPLACE_ON_CONFLICT; @@ -143,7 +145,7 @@ CreateSecretInput CreateS3SecretFunctions::GenerateRefreshSecretInfo(const Secre //! Function that will automatically try to refresh a secret bool CreateS3SecretFunctions::TryRefreshS3Secret(ClientContext &context, const SecretEntry &secret_to_refresh) { - const auto &kv_secret = dynamic_cast(*secret_to_refresh.secret); + const auto &kv_secret = dynamic_cast(*secret_to_refresh.secret); Value refresh_info; if (!kv_secret.TryGetValue("refresh_info", refresh_info)) { @@ -155,12 +157,15 @@ bool CreateS3SecretFunctions::TryRefreshS3Secret(ClientContext &context, const S // TODO: change SecretManager API to avoid requiring catching this exception try { auto res = secret_manager.CreateSecret(context, refresh_input); - auto &new_secret = dynamic_cast(*res->secret); - DUCKDB_LOG_INFO(context, "httpfs.SecretRefresh", "Successfully refreshed secret: %s, new key_id: %s", secret_to_refresh.secret->GetName(), new_secret.TryGetValue("key_id").ToString()); + auto &new_secret = dynamic_cast(*res->secret); + DUCKDB_LOG_INFO(context, "httpfs.SecretRefresh", "Successfully refreshed secret: %s, new key_id: %s", + secret_to_refresh.secret->GetName(), new_secret.TryGetValue("key_id").ToString()); return true; } catch (std::exception &ex) { ErrorData error(ex); - string new_message = StringUtil::Format("Exception thrown while trying to refresh secret %s. To fix this, please recreate or remove the secret and try again. Error: '%s'", secret_to_refresh.secret->GetName(), error.Message()); + string new_message = StringUtil::Format("Exception thrown while trying to refresh secret %s. To fix this, " + "please recreate or remove the secret and try again. Error: '%s'", + secret_to_refresh.secret->GetName(), error.Message()); throw Exception(error.Type(), new_message); } } diff --git a/extension/httpfs/crypto.cpp b/extension/httpfs/crypto.cpp index ac748c66..04bd795e 100644 --- a/extension/httpfs/crypto.cpp +++ b/extension/httpfs/crypto.cpp @@ -44,34 +44,33 @@ AESStateSSL::~AESStateSSL() { const EVP_CIPHER *AESStateSSL::GetCipher(const string &key) { - switch (cipher) { - case GCM: - switch (key.size()) { - case 16: - return EVP_aes_128_gcm(); - case 24: - return EVP_aes_192_gcm(); - case 32: - return EVP_aes_256_gcm(); - default: - throw InternalException("Invalid AES key length"); - } - case CTR: - switch (key.size()) { - case 16: - return EVP_aes_128_ctr(); - case 24: - return EVP_aes_192_ctr(); - case 32: - return EVP_aes_256_ctr(); - default: - throw InternalException("Invalid AES key length"); - } - - default: - throw duckdb::InternalException("Invalid Encryption/Decryption Cipher: %d", - static_cast(cipher)); - } + switch (cipher) { + case GCM: + switch (key.size()) { + case 16: + return EVP_aes_128_gcm(); + case 24: + return EVP_aes_192_gcm(); + case 32: + return EVP_aes_256_gcm(); + default: + throw InternalException("Invalid AES key length"); + } + case CTR: + switch (key.size()) { + case 16: + return EVP_aes_128_ctr(); + case 24: + return EVP_aes_192_ctr(); + case 32: + return EVP_aes_256_ctr(); + default: + throw InternalException("Invalid AES key length"); + } + + default: + throw duckdb::InternalException("Invalid Encryption/Decryption Cipher: %d", static_cast(cipher)); + } } void AESStateSSL::GenerateRandomData(data_ptr_t data, idx_t len) { @@ -121,79 +120,75 @@ size_t AESStateSSL::Process(const_data_ptr_t in, idx_t in_len, data_ptr_t out, i return out_len; } -size_t AESStateSSL::FinalizeGCM(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_t tag_len){ - auto text_len = out_len; - - switch (mode) { - case ENCRYPT: - { - if (1 != EVP_EncryptFinal_ex(context, data_ptr_cast(out) + out_len, reinterpret_cast(&out_len))) { - throw InternalException("EncryptFinal failed"); - } - text_len += out_len; - - // The computed tag is written at the end of a chunk - if (1 != EVP_CIPHER_CTX_ctrl(context, EVP_CTRL_GCM_GET_TAG, tag_len, tag)) { - throw InternalException("Calculating the tag failed"); - } - return text_len; - } - case DECRYPT: - { - // Set expected tag value - if (!EVP_CIPHER_CTX_ctrl(context, EVP_CTRL_GCM_SET_TAG, tag_len, tag)) { - throw InternalException("Finalizing tag failed"); - } - - // EVP_DecryptFinal() will return an error code if final block is not correctly formatted. - int ret = EVP_DecryptFinal_ex(context, data_ptr_cast(out) + out_len, reinterpret_cast(&out_len)); - text_len += out_len; - - if (ret > 0) { - // success - return text_len; - } - throw InvalidInputException("Computed AES tag differs from read AES tag, are you using the right key?"); - } - default: - throw InternalException("Unhandled encryption mode %d", static_cast(mode)); - } +size_t AESStateSSL::FinalizeGCM(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_t tag_len) { + auto text_len = out_len; + + switch (mode) { + case ENCRYPT: { + if (1 != EVP_EncryptFinal_ex(context, data_ptr_cast(out) + out_len, reinterpret_cast(&out_len))) { + throw InternalException("EncryptFinal failed"); + } + text_len += out_len; + + // The computed tag is written at the end of a chunk + if (1 != EVP_CIPHER_CTX_ctrl(context, EVP_CTRL_GCM_GET_TAG, tag_len, tag)) { + throw InternalException("Calculating the tag failed"); + } + return text_len; + } + case DECRYPT: { + // Set expected tag value + if (!EVP_CIPHER_CTX_ctrl(context, EVP_CTRL_GCM_SET_TAG, tag_len, tag)) { + throw InternalException("Finalizing tag failed"); + } + + // EVP_DecryptFinal() will return an error code if final block is not correctly formatted. + int ret = EVP_DecryptFinal_ex(context, data_ptr_cast(out) + out_len, reinterpret_cast(&out_len)); + text_len += out_len; + + if (ret > 0) { + // success + return text_len; + } + throw InvalidInputException("Computed AES tag differs from read AES tag, are you using the right key?"); + } + default: + throw InternalException("Unhandled encryption mode %d", static_cast(mode)); + } } size_t AESStateSSL::Finalize(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_t tag_len) { - if (cipher == GCM){ - return FinalizeGCM(out, out_len, tag, tag_len); - } - - auto text_len = out_len; - switch (mode) { - - case ENCRYPT: - { - if (1 != EVP_EncryptFinal_ex(context, data_ptr_cast(out) + out_len, reinterpret_cast(&out_len))) { - throw InternalException("EncryptFinal failed"); - } - - return text_len += out_len; - } - - case DECRYPT: - { - // EVP_DecryptFinal() will return an error code if final block is not correctly formatted. - int ret = EVP_DecryptFinal_ex(context, data_ptr_cast(out) + out_len, reinterpret_cast(&out_len)); - text_len += out_len; - - if (ret > 0) { - // success - return text_len; - } - - throw InvalidInputException("Computed AES tag differs from read AES tag, are you using the right key?"); - } - default: - throw InternalException("Unhandled encryption mode %d", static_cast(mode)); - } + if (cipher == GCM) { + return FinalizeGCM(out, out_len, tag, tag_len); + } + + auto text_len = out_len; + switch (mode) { + + case ENCRYPT: { + if (1 != EVP_EncryptFinal_ex(context, data_ptr_cast(out) + out_len, reinterpret_cast(&out_len))) { + throw InternalException("EncryptFinal failed"); + } + + return text_len += out_len; + } + + case DECRYPT: { + // EVP_DecryptFinal() will return an error code if final block is not correctly formatted. + int ret = EVP_DecryptFinal_ex(context, data_ptr_cast(out) + out_len, reinterpret_cast(&out_len)); + text_len += out_len; + + if (ret > 0) { + // success + return text_len; + } + + throw InvalidInputException("Computed AES tag differs from read AES tag, are you using the right key?"); + } + default: + throw InternalException("Unhandled encryption mode %d", static_cast(mode)); + } } } // namespace duckdb diff --git a/extension/httpfs/hffs.cpp b/extension/httpfs/hffs.cpp index 45666173..a7c70d87 100644 --- a/extension/httpfs/hffs.cpp +++ b/extension/httpfs/hffs.cpp @@ -54,20 +54,21 @@ string HuggingFaceFileSystem::ListHFRequest(ParsedHFUrl &url, HTTPFSParams &http string link_header_result; std::stringstream response; - GetRequestInfo get_request(url.endpoint, next_page_url, header_map, http_params, - [&](const HTTPResponse &response) { - if (static_cast(response.status) >= 400) { - throw HTTPException(response, "HTTP GET error on '%s' (HTTP %d)", next_page_url, response.status); - } - if (response.HasHeader("Link")) { - link_header_result = response.GetHeaderValue("Link"); - } - return true; - }, - [&](const_data_ptr_t data, idx_t data_length) { - response << string(const_char_ptr_cast(data), data_length); - return true; - }); + GetRequestInfo get_request( + url.endpoint, next_page_url, header_map, http_params, + [&](const HTTPResponse &response) { + if (static_cast(response.status) >= 400) { + throw HTTPException(response, "HTTP GET error on '%s' (HTTP %d)", next_page_url, response.status); + } + if (response.HasHeader("Link")) { + link_header_result = response.GetHeaderValue("Link"); + } + return true; + }, + [&](const_data_ptr_t data, idx_t data_length) { + response << string(const_char_ptr_cast(data), data_length); + return true; + }); auto res = http_params.http_util->Request(get_request); if (res->status != HTTPStatusCode::OK_200) { throw IOException(res->GetError() + " error for HTTP GET to '" + next_page_url + "'"); @@ -248,8 +249,7 @@ vector HuggingFaceFileSystem::Glob(const string &path, FileOpener return result; } -unique_ptr HuggingFaceFileSystem::HeadRequest(FileHandle &handle, string hf_url, - HTTPHeaders header_map) { +unique_ptr HuggingFaceFileSystem::HeadRequest(FileHandle &handle, string hf_url, HTTPHeaders header_map) { auto &hf_handle = handle.Cast(); auto http_url = HuggingFaceFileSystem::GetFileUrl(hf_handle.parsed_url); return HTTPFileSystem::HeadRequest(handle, http_url, header_map); @@ -262,8 +262,8 @@ unique_ptr HuggingFaceFileSystem::GetRequest(FileHandle &handle, s } unique_ptr HuggingFaceFileSystem::GetRangeRequest(FileHandle &handle, string s3_url, - HTTPHeaders header_map, idx_t file_offset, - char *buffer_out, idx_t buffer_out_len) { + HTTPHeaders header_map, idx_t file_offset, + char *buffer_out, idx_t buffer_out_len) { auto &hf_handle = handle.Cast(); auto http_url = HuggingFaceFileSystem::GetFileUrl(hf_handle.parsed_url); return HTTPFileSystem::GetRangeRequest(handle, http_url, header_map, file_offset, buffer_out, buffer_out_len); diff --git a/extension/httpfs/httpfs.cpp b/extension/httpfs/httpfs.cpp index 9c6763e8..24123b26 100644 --- a/extension/httpfs/httpfs.cpp +++ b/extension/httpfs/httpfs.cpp @@ -68,20 +68,16 @@ HTTPFSParams HTTPFSParams::ReadFrom(optional_ptr opener, optional_pt } } - string proxy_setting; - if (settings_reader.TryGetSecretKey("http_proxy", proxy_setting) && - !proxy_setting.empty()) { + if (settings_reader.TryGetSecretKey("http_proxy", proxy_setting) && !proxy_setting.empty()) { idx_t port; string host; HTTPUtil::ParseHTTPProxyHost(proxy_setting, host, port); result.http_proxy = host; result.http_proxy_port = port; } - settings_reader.TryGetSecretKey("http_proxy_username", - result.http_proxy_username); - settings_reader.TryGetSecretKey("http_proxy_password", - result.http_proxy_password); + settings_reader.TryGetSecretKey("http_proxy_username", result.http_proxy_username); + settings_reader.TryGetSecretKey("http_proxy_password", result.http_proxy_password); settings_reader.TryGetSecretKey("bearer_token", result.bearer_token); Value extra_headers; @@ -114,8 +110,8 @@ void HTTPClientCache::StoreClient(unique_ptr client) { } unique_ptr HTTPFileSystem::PostRequest(FileHandle &handle, string url, HTTPHeaders header_map, - string &buffer_out, - char *buffer_in, idx_t buffer_in_len, string params) { + string &buffer_out, char *buffer_in, idx_t buffer_in_len, + string params) { auto &hfh = handle.Cast(); auto &http_util = *hfh.http_params.http_util; PostRequestInfo post_request(url, header_map, hfh.http_params, const_data_ptr_cast(buffer_in), buffer_in_len); @@ -125,11 +121,12 @@ unique_ptr HTTPFileSystem::PostRequest(FileHandle &handle, string } unique_ptr HTTPFileSystem::PutRequest(FileHandle &handle, string url, HTTPHeaders header_map, - char *buffer_in, idx_t buffer_in_len, string params) { + char *buffer_in, idx_t buffer_in_len, string params) { auto &hfh = handle.Cast(); auto &http_util = *hfh.http_params.http_util; string content_type = "application/octet-stream"; - PutRequestInfo put_request(url, header_map, hfh.http_params, (const_data_ptr_t) buffer_in, buffer_in_len, content_type); + PutRequestInfo put_request(url, header_map, hfh.http_params, (const_data_ptr_t)buffer_in, buffer_in_len, + content_type); return http_util.Request(put_request); } @@ -156,6 +153,17 @@ unique_ptr HTTPFileSystem::DeleteRequest(FileHandle &handle, strin return response; } +HTTPException HTTPFileSystem::GetHTTPError(FileHandle &, const HTTPResponse &response, const string &url) { + auto status_message = HTTPFSUtil::GetStatusMessage(response.status); + string error = "HTTP GET error on '" + url + "' (HTTP " + to_string(static_cast(response.status)) + " " + + status_message + ")"; + if (response.status == HTTPStatusCode::RangeNotSatisfiable_416) { + error += " This could mean the file was changed. Try disabling the duckdb http metadata cache " + "if enabled, and confirm the server supports range requests."; + } + return HTTPException(response, error); +} + unique_ptr HTTPFileSystem::GetRequest(FileHandle &handle, string url, HTTPHeaders header_map) { auto &hfh = handle.Cast(); auto &http_util = *hfh.http_params.http_util; @@ -163,38 +171,40 @@ unique_ptr HTTPFileSystem::GetRequest(FileHandle &handle, string u D_ASSERT(hfh.cached_file_handle); auto http_client = hfh.GetClient(); - GetRequestInfo get_request(url, header_map, hfh.http_params, - [&](const HTTPResponse &response) { - if (static_cast(response.status) >= 400) { - string error = "HTTP GET error on '" + url + "' (HTTP " + to_string(static_cast(response.status)) + ")"; - if (response.status == HTTPStatusCode::RangeNotSatisfiable_416) { - error += " This could mean the file was changed. Try disabling the duckdb http metadata cache " - "if enabled, and confirm the server supports range requests."; - } - throw HTTPException(error); - } - return true; - }, - [&](const_data_ptr_t data, idx_t data_length) { - if (!hfh.cached_file_handle->GetCapacity()) { - hfh.cached_file_handle->AllocateBuffer(data_length); - hfh.length = data_length; - hfh.cached_file_handle->Write(const_char_ptr_cast(data), data_length); - } else { - auto new_capacity = hfh.cached_file_handle->GetCapacity(); - while (new_capacity < hfh.length + data_length) { - new_capacity *= 2; - } - // Grow buffer when running out of space - if (new_capacity != hfh.cached_file_handle->GetCapacity()) { - hfh.cached_file_handle->GrowBuffer(new_capacity, hfh.length); - } - // We can just copy stuff - hfh.cached_file_handle->Write(const_char_ptr_cast(data), data_length, hfh.length); - hfh.length += data_length; - } - return true; - }); + GetRequestInfo get_request( + url, header_map, hfh.http_params, + [&](const HTTPResponse &response) { + if (static_cast(response.status) >= 400) { + string error = + "HTTP GET error on '" + url + "' (HTTP " + to_string(static_cast(response.status)) + ")"; + if (response.status == HTTPStatusCode::RangeNotSatisfiable_416) { + error += " This could mean the file was changed. Try disabling the duckdb http metadata cache " + "if enabled, and confirm the server supports range requests."; + } + throw HTTPException(error); + } + return true; + }, + [&](const_data_ptr_t data, idx_t data_length) { + if (!hfh.cached_file_handle->GetCapacity()) { + hfh.cached_file_handle->AllocateBuffer(data_length); + hfh.length = data_length; + hfh.cached_file_handle->Write(const_char_ptr_cast(data), data_length); + } else { + auto new_capacity = hfh.cached_file_handle->GetCapacity(); + while (new_capacity < hfh.length + data_length) { + new_capacity *= 2; + } + // Grow buffer when running out of space + if (new_capacity != hfh.cached_file_handle->GetCapacity()) { + hfh.cached_file_handle->GrowBuffer(new_capacity, hfh.length); + } + // We can just copy stuff + hfh.cached_file_handle->Write(const_char_ptr_cast(data), data_length, hfh.length); + hfh.length += data_length; + } + return true; + }); auto response = http_util.Request(get_request, http_client); @@ -203,7 +213,7 @@ unique_ptr HTTPFileSystem::GetRequest(FileHandle &handle, string u } unique_ptr HTTPFileSystem::GetRangeRequest(FileHandle &handle, string url, HTTPHeaders header_map, - idx_t file_offset, char *buffer_out, idx_t buffer_out_len) { + idx_t file_offset, char *buffer_out, idx_t buffer_out_len) { auto &hfh = handle.Cast(); auto &http_util = *hfh.http_params.http_util; @@ -215,44 +225,46 @@ unique_ptr HTTPFileSystem::GetRangeRequest(FileHandle &handle, str idx_t out_offset = 0; - GetRequestInfo get_request(url, header_map, hfh.http_params, - [&](const HTTPResponse &response) { - if (static_cast(response.status) >= 400) { - string error = "HTTP GET error on '" + url + "' (HTTP " + to_string(static_cast(response.status)) + ")"; - if (response.status == HTTPStatusCode::RangeNotSatisfiable_416) { - error += " This could mean the file was changed. Try disabling the duckdb http metadata cache " - "if enabled, and confirm the server supports range requests."; - } - throw HTTPException(response, error); - } - if (static_cast(response.status) < 300) { // done redirecting - out_offset = 0; - if (response.HasHeader("Content-Length")) { - auto content_length = stoll(response.GetHeaderValue("Content-Length")); - if ((idx_t)content_length != buffer_out_len) { - throw HTTPException("HTTP GET error: Content-Length from server mismatches requested " - "range, server may not support range requests."); - } - } - } - return true; - }, - [&](const_data_ptr_t data, idx_t data_length) { - if (buffer_out != nullptr) { - if (data_length + out_offset > buffer_out_len) { - // As of v0.8.2-dev4424 we might end up here when very big files are served from servers - // that returns more data than requested via range header. This is an uncommon but legal - // behaviour, so we have to improve logic elsewhere to properly handle this case. - - // To avoid corruption of memory, we bail out. - throw HTTPException("Server sent back more data than expected, `SET force_download=true` might " - "help in this case"); - } - memcpy(buffer_out + out_offset, data, data_length); - out_offset += data_length; - } - return true; - }); + GetRequestInfo get_request( + url, header_map, hfh.http_params, + [&](const HTTPResponse &response) { + if (static_cast(response.status) >= 400) { + string error = + "HTTP GET error on '" + url + "' (HTTP " + to_string(static_cast(response.status)) + ")"; + if (response.status == HTTPStatusCode::RangeNotSatisfiable_416) { + error += " This could mean the file was changed. Try disabling the duckdb http metadata cache " + "if enabled, and confirm the server supports range requests."; + } + throw HTTPException(response, error); + } + if (static_cast(response.status) < 300) { // done redirecting + out_offset = 0; + if (response.HasHeader("Content-Length")) { + auto content_length = stoll(response.GetHeaderValue("Content-Length")); + if ((idx_t)content_length != buffer_out_len) { + throw HTTPException("HTTP GET error: Content-Length from server mismatches requested " + "range, server may not support range requests."); + } + } + } + return true; + }, + [&](const_data_ptr_t data, idx_t data_length) { + if (buffer_out != nullptr) { + if (data_length + out_offset > buffer_out_len) { + // As of v0.8.2-dev4424 we might end up here when very big files are served from servers + // that returns more data than requested via range header. This is an uncommon but legal + // behaviour, so we have to improve logic elsewhere to properly handle this case. + + // To avoid corruption of memory, we bail out. + throw HTTPException("Server sent back more data than expected, `SET force_download=true` might " + "help in this case"); + } + memcpy(buffer_out + out_offset, data, data_length); + out_offset += data_length; + } + return true; + }); auto response = http_util.Request(get_request, http_client); @@ -273,9 +285,10 @@ void TimestampToTimeT(timestamp_t timestamp, time_t &result) { result = mktime(&tm); } -HTTPFileHandle::HTTPFileHandle(FileSystem &fs, const OpenFileInfo &file, FileOpenFlags flags, HTTPFSParams http_params_p) - : FileHandle(fs, file.path, flags), http_params(std::move(http_params_p)), flags(flags), length(0), buffer_available(0), - buffer_idx(0), file_offset(0), buffer_start(0), buffer_end(0) { +HTTPFileHandle::HTTPFileHandle(FileSystem &fs, const OpenFileInfo &file, FileOpenFlags flags, + HTTPFSParams http_params_p) + : FileHandle(fs, file.path, flags), http_params(std::move(http_params_p)), flags(flags), length(0), + buffer_available(0), buffer_idx(0), file_offset(0), buffer_start(0), buffer_end(0) { // check if the handle has extended properties that can be set directly in the handle // if we have these properties we don't need to do a head request to obtain them later if (file.extended_info) { @@ -321,7 +334,7 @@ unique_ptr HTTPFileSystem::CreateHandle(const OpenFileInfo &file } unique_ptr HTTPFileSystem::OpenFileExtended(const OpenFileInfo &file, FileOpenFlags flags, - optional_ptr opener) { + optional_ptr opener) { D_ASSERT(flags.Compression() == FileCompressionType::UNCOMPRESSED); if (flags.ReturnNullIfNotExists()) { @@ -383,7 +396,7 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id hfh.buffer_idx = 0; } - idx_t start_offset = location; // Start file offset to read from. + idx_t start_offset = location; // Start file offset to read from. while (to_read > 0) { auto buffer_read_len = MinValue(hfh.buffer_available, to_read); if (buffer_read_len > 0) { @@ -409,8 +422,7 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id start_offset += to_read; break; } else { - GetRangeRequest(hfh, hfh.path, {}, start_offset, (char *)hfh.read_buffer.get(), - new_buffer_available); + GetRangeRequest(hfh, hfh.path, {}, start_offset, (char *)hfh.read_buffer.get(), new_buffer_available); hfh.buffer_available = new_buffer_available; hfh.buffer_idx = 0; hfh.buffer_start = start_offset; @@ -563,7 +575,7 @@ optional_idx TryParseContentRange(const HTTPHeaders &headers) { } try { return std::stoull(range_length); - } catch(...) { + } catch (...) { return optional_idx(); } } @@ -575,7 +587,7 @@ optional_idx TryParseContentLength(const HTTPHeaders &headers) { string content_length = headers.GetHeaderValue("Content-Length"); try { return std::stoull(content_length); - } catch(...) { + } catch (...) { return optional_idx(); } } @@ -599,9 +611,11 @@ void HTTPFileHandle::LoadFileInfo() { // HEAD request fail, use Range request for another try (read only one byte) if (flags.OpenForReading() && res->status != HTTPStatusCode::NotFound_404) { auto range_res = hfs.GetRangeRequest(*this, path, {}, 0, nullptr, 2); - if (range_res->status != HTTPStatusCode::PartialContent_206 && range_res->status != HTTPStatusCode::Accepted_202 && range_res->status != HTTPStatusCode::OK_200) { + if (range_res->status != HTTPStatusCode::PartialContent_206 && + range_res->status != HTTPStatusCode::Accepted_202 && range_res->status != HTTPStatusCode::OK_200) { // It failed again - throw HTTPException(*range_res, "Unable to connect to URL \"%s\": %d (%s).", path, static_cast(res->status), res->GetError()); + throw HTTPException(*range_res, "Unable to connect to URL \"%s\": %d (%s).", path, + static_cast(res->status), res->GetError()); } res = std::move(range_res); } else { @@ -637,49 +651,49 @@ void HTTPFileHandle::Initialize(optional_ptr opener) { auto current_cache = TryGetMetadataCache(opener, hfs); - bool should_write_cache = false; + bool should_write_cache = false; if (flags.OpenForReading()) { - if (http_params.force_download) { - FullDownload(hfs, should_write_cache); - return; - } - - if (current_cache) { - HTTPMetadataCacheEntry value; - bool found = current_cache->Find(path, value); - - if (found) { - last_modified = value.last_modified; - length = value.length; - etag = value.etag; - - if (flags.OpenForReading()) { - read_buffer = duckdb::unique_ptr(new data_t[READ_BUFFER_LEN]); - } - return; - } - - should_write_cache = true; - } - } - LoadFileInfo(); + if (http_params.force_download) { + FullDownload(hfs, should_write_cache); + return; + } + + if (current_cache) { + HTTPMetadataCacheEntry value; + bool found = current_cache->Find(path, value); + + if (found) { + last_modified = value.last_modified; + length = value.length; + etag = value.etag; + + if (flags.OpenForReading()) { + read_buffer = duckdb::unique_ptr(new data_t[READ_BUFFER_LEN]); + } + return; + } + + should_write_cache = true; + } + } + LoadFileInfo(); if (flags.OpenForReading()) { - if (http_params.state && length == 0) { - FullDownload(hfs, should_write_cache); - } - if (should_write_cache) { - current_cache->Insert(path, {length, last_modified, etag}); - } + if (http_params.state && length == 0) { + FullDownload(hfs, should_write_cache); + } + if (should_write_cache) { + current_cache->Insert(path, {length, last_modified, etag}); + } - // Initialize the read buffer now that we know the file exists - read_buffer = duckdb::unique_ptr(new data_t[READ_BUFFER_LEN]); + // Initialize the read buffer now that we know the file exists + read_buffer = duckdb::unique_ptr(new data_t[READ_BUFFER_LEN]); } - // If we're writing to a file, we might as well remove it from the cache - if (current_cache && flags.OpenForWriting()) { - current_cache->Erase(path); - } + // If we're writing to a file, we might as well remove it from the cache + if (current_cache && flags.OpenForWriting()) { + current_cache->Erase(path); + } } unique_ptr HTTPFileHandle::GetClient() { diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 26e485be..6fb33d92 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -39,8 +39,7 @@ class HTTPFSClient : public HTTPClient { } void SetLogger(HTTPLogger &logger) { - client->set_logger( - logger.GetLogger()); + client->set_logger(logger.GetLogger()); } unique_ptr Get(GetRequestInfo &info) override { if (state) { @@ -50,109 +49,109 @@ class HTTPFSClient : public HTTPClient { if (!info.response_handler && !info.content_handler) { return TransformResult(client->Get(info.path, headers)); } else { - return TransformResult(client->Get(info.path.c_str(), headers, - [&](const duckdb_httplib_openssl::Response &response) { - auto http_response = TransformResponse(response); - return info.response_handler(*http_response); - }, - [&](const char *data, size_t data_length) { - if (state) { - state->total_bytes_received += data_length; - } - return info.content_handler(const_data_ptr_cast(data), data_length); - })); + return TransformResult(client->Get( + info.path.c_str(), headers, + [&](const duckdb_httplib_openssl::Response &response) { + auto http_response = TransformResponse(response); + return info.response_handler(*http_response); + }, + [&](const char *data, size_t data_length) { + if (state) { + state->total_bytes_received += data_length; + } + return info.content_handler(const_data_ptr_cast(data), data_length); + })); } } unique_ptr Put(PutRequestInfo &info) override { - if (state) { - state->put_count++; - state->total_bytes_sent += info.buffer_in_len; - } - auto headers = TransformHeaders(info.headers, info.params); - return TransformResult(client->Put(info.path, headers, const_char_ptr_cast(info.buffer_in), info.buffer_in_len, info.content_type)); + if (state) { + state->put_count++; + state->total_bytes_sent += info.buffer_in_len; + } + auto headers = TransformHeaders(info.headers, info.params); + return TransformResult(client->Put(info.path, headers, const_char_ptr_cast(info.buffer_in), info.buffer_in_len, + info.content_type)); } unique_ptr Head(HeadRequestInfo &info) override { - if (state) { - state->head_count++; - } - auto headers = TransformHeaders(info.headers, info.params); - return TransformResult(client->Head(info.path, headers)); + if (state) { + state->head_count++; + } + auto headers = TransformHeaders(info.headers, info.params); + return TransformResult(client->Head(info.path, headers)); } unique_ptr Delete(DeleteRequestInfo &info) override { - if (state) { - state->delete_count++; - } - auto headers = TransformHeaders(info.headers, info.params); - return TransformResult(client->Delete(info.path, headers)); + if (state) { + state->delete_count++; + } + auto headers = TransformHeaders(info.headers, info.params); + return TransformResult(client->Delete(info.path, headers)); } unique_ptr Post(PostRequestInfo &info) override { - if (state) { - state->post_count++; - state->total_bytes_sent += info.buffer_in_len; - } - // We use a custom Request method here, because there is no Post call with a contentreceiver in httplib - duckdb_httplib_openssl::Request req; - req.method = "POST"; - req.path = info.path; - req.headers = TransformHeaders(info.headers, info.params); - req.headers.emplace("Content-Type", "application/octet-stream"); - req.content_receiver = [&](const char *data, size_t data_length, uint64_t /*offset*/, - uint64_t /*total_length*/) { - if (state) { - state->total_bytes_received += data_length; - } - info.buffer_out += string(data, data_length); - return true; - }; - req.body.assign(const_char_ptr_cast(info.buffer_in), info.buffer_in_len); - return TransformResult(client->send(req)); + if (state) { + state->post_count++; + state->total_bytes_sent += info.buffer_in_len; + } + // We use a custom Request method here, because there is no Post call with a contentreceiver in httplib + duckdb_httplib_openssl::Request req; + req.method = "POST"; + req.path = info.path; + req.headers = TransformHeaders(info.headers, info.params); + req.headers.emplace("Content-Type", "application/octet-stream"); + req.content_receiver = [&](const char *data, size_t data_length, uint64_t /*offset*/, + uint64_t /*total_length*/) { + if (state) { + state->total_bytes_received += data_length; + } + info.buffer_out += string(data, data_length); + return true; + }; + req.body.assign(const_char_ptr_cast(info.buffer_in), info.buffer_in_len); + return TransformResult(client->send(req)); } private: - duckdb_httplib_openssl::Headers TransformHeaders(const HTTPHeaders &header_map, const HTTPParams ¶ms) { - duckdb_httplib_openssl::Headers headers; - for(auto &entry : header_map) { - headers.insert(entry); - } - for (auto &entry : params.extra_headers) { - headers.insert(entry); - } - return headers; - } - - unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { - auto status_code = HTTPUtil::ToStatusCode(response.status); - auto result = make_uniq(status_code); - result->body = response.body; - result->reason = response.reason; - for (auto &entry : response.headers) { - result->headers.Insert(entry.first, entry.second); - } - return result; - } - - unique_ptr TransformResult(duckdb_httplib_openssl::Result &&res) { - if (res.error() == duckdb_httplib_openssl::Error::Success) { - auto &response = res.value(); - return TransformResponse(response); - } else { - auto result = make_uniq(HTTPStatusCode::INVALID); - result->request_error = to_string(res.error()); - return result; - } - } + duckdb_httplib_openssl::Headers TransformHeaders(const HTTPHeaders &header_map, const HTTPParams ¶ms) { + duckdb_httplib_openssl::Headers headers; + for (auto &entry : header_map) { + headers.insert(entry); + } + for (auto &entry : params.extra_headers) { + headers.insert(entry); + } + return headers; + } + + unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { + auto status_code = HTTPUtil::ToStatusCode(response.status); + auto result = make_uniq(status_code); + result->body = response.body; + result->reason = response.reason; + for (auto &entry : response.headers) { + result->headers.Insert(entry.first, entry.second); + } + return result; + } + + unique_ptr TransformResult(duckdb_httplib_openssl::Result &&res) { + if (res.error() == duckdb_httplib_openssl::Error::Success) { + auto &response = res.value(); + return TransformResponse(response); + } else { + auto result = make_uniq(HTTPStatusCode::INVALID); + result->request_error = to_string(res.error()); + return result; + } + } private: unique_ptr client; optional_ptr state; }; - -unique_ptr HTTPFSUtil::InitializeClient(HTTPParams &http_params, - const string &proto_host_port) { +unique_ptr HTTPFSUtil::InitializeClient(HTTPParams &http_params, const string &proto_host_port) { auto client = make_uniq(http_params.Cast(), proto_host_port); return std::move(client); } @@ -162,10 +161,143 @@ unordered_map HTTPFSUtil::ParseGetParameters(const string &text) duckdb_httplib_openssl::detail::parse_query_text(text, query_params); unordered_map result; - for(auto &entry : query_params) { + for (auto &entry : query_params) { result.emplace(std::move(entry.first), std::move(entry.second)); } return result; } +string HTTPFSUtil::GetStatusMessage(HTTPStatusCode status) { + switch (status) { + case HTTPStatusCode::Continue_100: + return "Continue"; + case HTTPStatusCode::SwitchingProtocol_101: + return "Switching Protocol"; + case HTTPStatusCode::Processing_102: + return "Processing"; + case HTTPStatusCode::EarlyHints_103: + return "Early Hints"; + case HTTPStatusCode::OK_200: + return "OK"; + case HTTPStatusCode::Created_201: + return "Created"; + case HTTPStatusCode::Accepted_202: + return "Accepted"; + case HTTPStatusCode::NonAuthoritativeInformation_203: + return "Non-Authoritative Information"; + case HTTPStatusCode::NoContent_204: + return "No Content"; + case HTTPStatusCode::ResetContent_205: + return "Reset Content"; + case HTTPStatusCode::PartialContent_206: + return "Partial Content"; + case HTTPStatusCode::MultiStatus_207: + return "Multi-Status"; + case HTTPStatusCode::AlreadyReported_208: + return "Already Reported"; + case HTTPStatusCode::IMUsed_226: + return "IM Used"; + case HTTPStatusCode::MultipleChoices_300: + return "Multiple Choices"; + case HTTPStatusCode::MovedPermanently_301: + return "Moved Permanently"; + case HTTPStatusCode::Found_302: + return "Found"; + case HTTPStatusCode::SeeOther_303: + return "See Other"; + case HTTPStatusCode::NotModified_304: + return "Not Modified"; + case HTTPStatusCode::UseProxy_305: + return "Use Proxy"; + case HTTPStatusCode::unused_306: + return "unused"; + case HTTPStatusCode::TemporaryRedirect_307: + return "Temporary Redirect"; + case HTTPStatusCode::PermanentRedirect_308: + return "Permanent Redirect"; + case HTTPStatusCode::BadRequest_400: + return "Bad Request"; + case HTTPStatusCode::Unauthorized_401: + return "Unauthorized"; + case HTTPStatusCode::PaymentRequired_402: + return "Payment Required"; + case HTTPStatusCode::Forbidden_403: + return "Forbidden"; + case HTTPStatusCode::NotFound_404: + return "Not Found"; + case HTTPStatusCode::MethodNotAllowed_405: + return "Method Not Allowed"; + case HTTPStatusCode::NotAcceptable_406: + return "Not Acceptable"; + case HTTPStatusCode::ProxyAuthenticationRequired_407: + return "Proxy Authentication Required"; + case HTTPStatusCode::RequestTimeout_408: + return "Request Timeout"; + case HTTPStatusCode::Conflict_409: + return "Conflict"; + case HTTPStatusCode::Gone_410: + return "Gone"; + case HTTPStatusCode::LengthRequired_411: + return "Length Required"; + case HTTPStatusCode::PreconditionFailed_412: + return "Precondition Failed"; + case HTTPStatusCode::PayloadTooLarge_413: + return "Payload Too Large"; + case HTTPStatusCode::UriTooLong_414: + return "URI Too Long"; + case HTTPStatusCode::UnsupportedMediaType_415: + return "Unsupported Media Type"; + case HTTPStatusCode::RangeNotSatisfiable_416: + return "Range Not Satisfiable"; + case HTTPStatusCode::ExpectationFailed_417: + return "Expectation Failed"; + case HTTPStatusCode::ImATeapot_418: + return "I'm a teapot"; + case HTTPStatusCode::MisdirectedRequest_421: + return "Misdirected Request"; + case HTTPStatusCode::UnprocessableContent_422: + return "Unprocessable Content"; + case HTTPStatusCode::Locked_423: + return "Locked"; + case HTTPStatusCode::FailedDependency_424: + return "Failed Dependency"; + case HTTPStatusCode::TooEarly_425: + return "Too Early"; + case HTTPStatusCode::UpgradeRequired_426: + return "Upgrade Required"; + case HTTPStatusCode::PreconditionRequired_428: + return "Precondition Required"; + case HTTPStatusCode::TooManyRequests_429: + return "Too Many Requests"; + case HTTPStatusCode::RequestHeaderFieldsTooLarge_431: + return "Request Header Fields Too Large"; + case HTTPStatusCode::UnavailableForLegalReasons_451: + return "Unavailable For Legal Reasons"; + case HTTPStatusCode::NotImplemented_501: + return "Not Implemented"; + case HTTPStatusCode::BadGateway_502: + return "Bad Gateway"; + case HTTPStatusCode::ServiceUnavailable_503: + return "Service Unavailable"; + case HTTPStatusCode::GatewayTimeout_504: + return "Gateway Timeout"; + case HTTPStatusCode::HttpVersionNotSupported_505: + return "HTTP Version Not Supported"; + case HTTPStatusCode::VariantAlsoNegotiates_506: + return "Variant Also Negotiates"; + case HTTPStatusCode::InsufficientStorage_507: + return "Insufficient Storage"; + case HTTPStatusCode::LoopDetected_508: + return "Loop Detected"; + case HTTPStatusCode::NotExtended_510: + return "Not Extended"; + case HTTPStatusCode::NetworkAuthenticationRequired_511: + return "Network Authentication Required"; + + default: + case HTTPStatusCode::InternalServerError_500: + return "Internal Server Error"; + } } + +} // namespace duckdb diff --git a/extension/httpfs/httpfs_extension.cpp b/extension/httpfs/httpfs_extension.cpp index a3097689..c9bc9853 100644 --- a/extension/httpfs/httpfs_extension.cpp +++ b/extension/httpfs/httpfs_extension.cpp @@ -21,8 +21,8 @@ static void LoadInternal(DatabaseInstance &instance) { // Global HTTP config // Single timeout value is used for all 4 types of timeouts, we could split it into 4 if users need that - config.AddExtensionOption("http_timeout", "HTTP timeout read/write/connection/retry (in seconds)", LogicalType::UBIGINT, - Value::UBIGINT(HTTPParams::DEFAULT_TIMEOUT_SECONDS)); + config.AddExtensionOption("http_timeout", "HTTP timeout read/write/connection/retry (in seconds)", + LogicalType::UBIGINT, Value::UBIGINT(HTTPParams::DEFAULT_TIMEOUT_SECONDS)); config.AddExtensionOption("http_retries", "HTTP retries on I/O error", LogicalType::UBIGINT, Value(3)); config.AddExtensionOption("http_retry_wait_ms", "Time between retries", LogicalType::UBIGINT, Value(100)); config.AddExtensionOption("force_download", "Forces upfront download of file", LogicalType::BOOLEAN, Value(false)); diff --git a/extension/httpfs/include/crypto.hpp b/extension/httpfs/include/crypto.hpp index d81dd2d4..f819356f 100644 --- a/extension/httpfs/include/crypto.hpp +++ b/extension/httpfs/include/crypto.hpp @@ -35,8 +35,8 @@ class DUCKDB_EXTENSION_API AESStateSSL : public duckdb::EncryptionState { size_t Finalize(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_t tag_len) override; void GenerateRandomData(data_ptr_t data, idx_t len) override; - const EVP_CIPHER *GetCipher(const string &key); - size_t FinalizeGCM(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_t tag_len); + const EVP_CIPHER *GetCipher(const string &key); + size_t FinalizeGCM(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_t tag_len); private: EVP_CIPHER_CTX *context; diff --git a/extension/httpfs/include/hffs.hpp b/extension/httpfs/include/hffs.hpp index 5baa6c5f..a5901e52 100644 --- a/extension/httpfs/include/hffs.hpp +++ b/extension/httpfs/include/hffs.hpp @@ -27,8 +27,8 @@ class HuggingFaceFileSystem : public HTTPFileSystem { duckdb::unique_ptr HeadRequest(FileHandle &handle, string hf_url, HTTPHeaders header_map) override; duckdb::unique_ptr GetRequest(FileHandle &handle, string hf_url, HTTPHeaders header_map) override; duckdb::unique_ptr GetRangeRequest(FileHandle &handle, string hf_url, HTTPHeaders header_map, - idx_t file_offset, char *buffer_out, - idx_t buffer_out_len) override; + idx_t file_offset, char *buffer_out, + idx_t buffer_out_len) override; bool CanHandleFile(const string &fpath) override { return fpath.rfind("hf://", 0) == 0; diff --git a/extension/httpfs/include/httpfs.hpp b/extension/httpfs/include/httpfs.hpp index ca1508e2..37773fe0 100644 --- a/extension/httpfs/include/httpfs.hpp +++ b/extension/httpfs/include/httpfs.hpp @@ -5,6 +5,7 @@ #include "http_state.hpp" #include "duckdb/common/pair.hpp" #include "duckdb/common/unordered_map.hpp" +#include "duckdb/common/exception/http_exception.hpp" #include "duckdb/main/client_data.hpp" #include "http_metadata_cache.hpp" #include "httpfs_client.hpp" @@ -99,16 +100,15 @@ class HTTPFileSystem : public FileSystem { virtual duckdb::unique_ptr HeadRequest(FileHandle &handle, string url, HTTPHeaders header_map); // Get Request with range parameter that GETs exactly buffer_out_len bytes from the url virtual duckdb::unique_ptr GetRangeRequest(FileHandle &handle, string url, HTTPHeaders header_map, - idx_t file_offset, char *buffer_out, - idx_t buffer_out_len); + idx_t file_offset, char *buffer_out, idx_t buffer_out_len); // Get Request without a range (i.e., downloads full file) virtual duckdb::unique_ptr GetRequest(FileHandle &handle, string url, HTTPHeaders header_map); // Post Request that can handle variable sized responses without a content-length header (needed for s3 multipart) virtual duckdb::unique_ptr PostRequest(FileHandle &handle, string url, HTTPHeaders header_map, - string &result, char *buffer_in, idx_t buffer_in_len, - string params = ""); + string &result, char *buffer_in, idx_t buffer_in_len, + string params = ""); virtual duckdb::unique_ptr PutRequest(FileHandle &handle, string url, HTTPHeaders header_map, - char *buffer_in, idx_t buffer_in_len, string params = ""); + char *buffer_in, idx_t buffer_in_len, string params = ""); virtual duckdb::unique_ptr DeleteRequest(FileHandle &handle, string url, HTTPHeaders header_map); @@ -146,13 +146,17 @@ class HTTPFileSystem : public FileSystem { protected: unique_ptr OpenFileExtended(const OpenFileInfo &file, FileOpenFlags flags, - optional_ptr opener) override; + optional_ptr opener) override; bool SupportsOpenFileExtended() const override { return true; } + + virtual HTTPException GetHTTPError(FileHandle &, const HTTPResponse &response, const string &url); + protected: virtual duckdb::unique_ptr CreateHandle(const OpenFileInfo &file, FileOpenFlags flags, optional_ptr opener); + private: // Global cache mutex global_cache_lock; diff --git a/extension/httpfs/include/httpfs_client.hpp b/extension/httpfs/include/httpfs_client.hpp index 4b0c33e7..dff9d185 100644 --- a/extension/httpfs/include/httpfs_client.hpp +++ b/extension/httpfs/include/httpfs_client.hpp @@ -27,6 +27,7 @@ class HTTPFSUtil : public HTTPUtil { unique_ptr InitializeClient(HTTPParams &http_params, const string &proto_host_port) override; static unordered_map ParseGetParameters(const string &text); + static string GetStatusMessage(HTTPStatusCode status); }; -} +} // namespace duckdb diff --git a/extension/httpfs/include/s3fs.hpp b/extension/httpfs/include/s3fs.hpp index e389e56c..b15e0d23 100644 --- a/extension/httpfs/include/s3fs.hpp +++ b/extension/httpfs/include/s3fs.hpp @@ -178,15 +178,13 @@ class S3FileSystem : public HTTPFileSystem { duckdb::unique_ptr HeadRequest(FileHandle &handle, string s3_url, HTTPHeaders header_map) override; duckdb::unique_ptr GetRequest(FileHandle &handle, string url, HTTPHeaders header_map) override; duckdb::unique_ptr GetRangeRequest(FileHandle &handle, string s3_url, HTTPHeaders header_map, - idx_t file_offset, char *buffer_out, - idx_t buffer_out_len) override; + idx_t file_offset, char *buffer_out, + idx_t buffer_out_len) override; duckdb::unique_ptr PostRequest(FileHandle &handle, string s3_url, HTTPHeaders header_map, - string &buffer_out, - char *buffer_in, idx_t buffer_in_len, - string http_params = "") override; + string &buffer_out, char *buffer_in, idx_t buffer_in_len, + string http_params = "") override; duckdb::unique_ptr PutRequest(FileHandle &handle, string s3_url, HTTPHeaders header_map, - char *buffer_in, idx_t buffer_in_len, - string http_params = "") override; + char *buffer_in, idx_t buffer_in_len, string http_params = "") override; duckdb::unique_ptr DeleteRequest(FileHandle &handle, string s3_url, HTTPHeaders header_map) override; bool CanHandleFile(const string &fpath) override; @@ -225,6 +223,8 @@ class S3FileSystem : public HTTPFileSystem { return true; } + static HTTPException GetS3Error(S3AuthParams &s3_auth_params, const HTTPResponse &response, const string &url); + protected: static void NotifyUploadsInProgress(S3FileHandle &file_handle); duckdb::unique_ptr CreateHandle(const OpenFileInfo &file, FileOpenFlags flags, @@ -232,6 +232,8 @@ class S3FileSystem : public HTTPFileSystem { void FlushBuffer(S3FileHandle &handle, shared_ptr write_buffer); string GetPayloadHash(char *buffer, idx_t buffer_len); + + HTTPException GetHTTPError(FileHandle &, const HTTPResponse &response, const string &url) override; }; // Helper class to do s3 ListObjectV2 api call https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html diff --git a/extension/httpfs/s3fs.cpp b/extension/httpfs/s3fs.cpp index 8245318e..700193d7 100644 --- a/extension/httpfs/s3fs.cpp +++ b/extension/httpfs/s3fs.cpp @@ -24,8 +24,8 @@ namespace duckdb { static HTTPHeaders create_s3_header(string url, string query, string host, string service, string method, - const S3AuthParams &auth_params, string date_now = "", string datetime_now = "", - string payload_hash = "", string content_type = "") { + const S3AuthParams &auth_params, string date_now = "", string datetime_now = "", + string payload_hash = "", string content_type = "") { HTTPHeaders res; res["Host"] = host; @@ -285,12 +285,11 @@ string S3FileSystem::InitializeMultipartUpload(S3FileHandle &file_handle) { // AWS response is around 300~ chars in docs so this should be enough to not need a resize string result; string query_param = "uploads="; - auto res = s3fs.PostRequest(file_handle, file_handle.path, {}, result, nullptr, 0, - query_param); + auto res = s3fs.PostRequest(file_handle, file_handle.path, {}, result, nullptr, 0, query_param); if (res->status != HTTPStatusCode::OK_200) { throw HTTPException(*res, "Unable to connect to URL %s: %s (HTTP code %d)", res->url, res->GetError(), - static_cast(res->status)); + static_cast(res->status)); } auto open_tag_pos = result.find("", 0); @@ -450,12 +449,12 @@ void S3FileSystem::FinalizeMultipartUpload(S3FileHandle &file_handle) { string result; string query_param = "uploadId=" + S3FileSystem::UrlEncode(file_handle.multipart_upload_id, true); - auto res = s3fs.PostRequest(file_handle, file_handle.path, {}, result, - (char *)body.c_str(), body.length(), query_param); + auto res = + s3fs.PostRequest(file_handle, file_handle.path, {}, result, (char *)body.c_str(), body.length(), query_param); auto open_tag_pos = result.find("(res->status), - result); + throw HTTPException(*res, "Unexpected response during S3 multipart upload finalization: %d\n\n%s", + static_cast(res->status), result); } } @@ -636,8 +635,8 @@ string ParsedS3Url::GetHTTPUrl(S3AuthParams &auth_params, const string &http_que } unique_ptr S3FileSystem::PostRequest(FileHandle &handle, string url, HTTPHeaders header_map, - string &result, - char *buffer_in, idx_t buffer_in_len, string http_params) { + string &result, char *buffer_in, idx_t buffer_in_len, + string http_params) { auto auth_params = handle.Cast().auth_params; auto parsed_s3_url = S3UrlParse(url, auth_params); string http_url = parsed_s3_url.GetHTTPUrl(auth_params, http_params); @@ -649,7 +648,7 @@ unique_ptr S3FileSystem::PostRequest(FileHandle &handle, string ur } unique_ptr S3FileSystem::PutRequest(FileHandle &handle, string url, HTTPHeaders header_map, - char *buffer_in, idx_t buffer_in_len, string http_params) { + char *buffer_in, idx_t buffer_in_len, string http_params) { auto auth_params = handle.Cast().auth_params; auto parsed_s3_url = S3UrlParse(url, auth_params); string http_url = parsed_s3_url.GetHTTPUrl(auth_params, http_params); @@ -680,7 +679,7 @@ unique_ptr S3FileSystem::GetRequest(FileHandle &handle, string s3_ } unique_ptr S3FileSystem::GetRangeRequest(FileHandle &handle, string s3_url, HTTPHeaders header_map, - idx_t file_offset, char *buffer_out, idx_t buffer_out_len) { + idx_t file_offset, char *buffer_out, idx_t buffer_out_len) { auto auth_params = handle.Cast().auth_params; auto parsed_s3_url = S3UrlParse(s3_url, auth_params); string http_url = parsed_s3_url.GetHTTPUrl(auth_params); @@ -773,7 +772,8 @@ void S3FileSystem::RemoveFile(const string &path, optional_ptr opene auto &s3fh = handle->Cast(); auto res = DeleteRequest(*handle, s3fh.path, {}); if (res->status != HTTPStatusCode::OK_200 && res->status != HTTPStatusCode::NoContent_204) { - throw IOException("Could not remove file \"%s\": %s", {{"errno", to_string(static_cast(res->status))}}, path, res->GetError()); + throw IOException("Could not remove file \"%s\": %s", {{"errno", to_string(static_cast(res->status))}}, + path, res->GetError()); } } @@ -784,7 +784,7 @@ void S3FileSystem::RemoveDirectory(const string &path, optional_ptr try { this->RemoveFile(file, opener); } catch (IOException &e) { - string errmsg(e.what()); + string errmsg(e.what()); if (errmsg.find("No such file or directory") != std::string::npos) { return; } @@ -965,6 +965,27 @@ bool S3FileSystem::ListFiles(const string &directory, const std::function(); + return GetS3Error(s3_handle.auth_params, response, url); + } + return HTTPFileSystem::GetHTTPError(handle, response, url); +} string AWSListObjectV2::Request(string &path, HTTPFSParams &http_params, S3AuthParams &s3_auth_params, string &continuation_token, optional_ptr state, bool use_delimiter) { auto parsed_url = S3FileSystem::S3UrlParse(path, s3_auth_params); @@ -992,10 +1013,14 @@ string AWSListObjectV2::Request(string &path, HTTPFSParams &http_params, S3AuthP // Get requests use fresh connection auto client = http_params.http_util->InitializeClient(http_params, parsed_url.http_proto + parsed_url.host); std::stringstream response; - GetRequestInfo get_request(parsed_url.host, listobjectv2_url, header_map, http_params, + GetRequestInfo get_request( + parsed_url.host, listobjectv2_url, header_map, http_params, [&](const HTTPResponse &response) { if (static_cast(response.status) >= 400) { - throw HTTPException(response, "HTTP GET error on '%s' (HTTP %d)", listobjectv2_url, response.status); + string trimmed_path = path; + StringUtil::RTrim(trimmed_path, "/"); + trimmed_path += listobjectv2_url; + throw S3FileSystem::GetS3Error(s3_auth_params, response, trimmed_path); } return true; }, @@ -1021,13 +1046,13 @@ optional_idx FindTagContents(const string &response, const string &tag, idx_t cu } auto close_tag_pos = response.find(close_tag, open_tag_pos + open_tag.size()); if (close_tag_pos == string::npos) { - throw InternalException("Failed to parse S3 result: found open tag for %s but did not find matching close tag", tag); + throw InternalException("Failed to parse S3 result: found open tag for %s but did not find matching close tag", + tag); } result = response.substr(open_tag_pos + open_tag.size(), close_tag_pos - open_tag_pos - open_tag.size()); return close_tag_pos + close_tag.size(); } - void AWSListObjectV2::ParseFileList(string &aws_response, vector &result) { // Example S3 response: // @@ -1038,7 +1063,7 @@ void AWSListObjectV2::ParseFileList(string &aws_response, vector & // STANDARD // idx_t cur_pos = 0; - while(true) { + while (true) { string contents; auto next_pos = FindTagContents(aws_response, "Contents", cur_pos, contents); if (!next_pos.IsValid()) {