From 0783060b91e23940bb70ef1602ef6b775737909c Mon Sep 17 00:00:00 2001 From: sabine <6594573+sabine@users.noreply.github.com> Date: Thu, 6 Nov 2025 16:44:31 +0100 Subject: [PATCH 1/2] increase yaml library buffer to fix video scraper --- tool/ood-gen/lib/youtube.ml | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/tool/ood-gen/lib/youtube.ml b/tool/ood-gen/lib/youtube.ml index 19bf3a5899..48664a88df 100644 --- a/tool/ood-gen/lib/youtube.ml +++ b/tool/ood-gen/lib/youtube.ml @@ -187,17 +187,28 @@ let scrape yaml_file = in match fetched with | Ok fetched -> - let yaml = + let all_videos = VideoSet.union fetched scraped |> VideoSet.to_seq |> List.of_seq |> List.sort (fun a b -> compare b.Vid.published a.Vid.published) - |> Vid.video_list_to_yaml in - let output = - Yaml.pp Format.str_formatter yaml; - Format.flush_str_formatter () + let yaml = Vid.video_list_to_yaml all_videos in + (* The yaml library uses a fixed-size output buffer. The default is 262140 bytes, + which was exceeded when we had 203 videos (~262KB output). This caused the + document_end operation to fail with "doc_end failed" error. + + Current stats: 203 videos ≈ 260KB, average ~1.3KB per video. + We use a 2MB buffer to accommodate growth to ~1500 videos before hitting limits. + If the list grows beyond that, this will fail with a clear error message. *) + let buffer_size = 2 * 1024 * 1024 in (* 2MB *) + let output = match Yaml.to_string ~len:buffer_size yaml with + | Ok s -> s + | Error (`Msg err) -> + failwith (Printf.sprintf + "YAML serialization failed (tried %d videos, buffer size %d bytes): %s" + (List.length all_videos) buffer_size err) in let oc = open_out yaml_file in - Printf.fprintf oc "%s" output; + output_string oc output; close_out oc | Error (`Msg msg) -> failwith msg From 662a13aa53fc519327da56a205d1a474c6c3cb38 Mon Sep 17 00:00:00 2001 From: sabine <6594573+sabine@users.noreply.github.com> Date: Thu, 6 Nov 2025 18:07:38 +0100 Subject: [PATCH 2/2] fmt --- tool/ood-gen/lib/youtube.ml | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/tool/ood-gen/lib/youtube.ml b/tool/ood-gen/lib/youtube.ml index 48664a88df..0506e509b0 100644 --- a/tool/ood-gen/lib/youtube.ml +++ b/tool/ood-gen/lib/youtube.ml @@ -193,20 +193,25 @@ let scrape yaml_file = |> List.sort (fun a b -> compare b.Vid.published a.Vid.published) in let yaml = Vid.video_list_to_yaml all_videos in - (* The yaml library uses a fixed-size output buffer. The default is 262140 bytes, - which was exceeded when we had 203 videos (~262KB output). This caused the - document_end operation to fail with "doc_end failed" error. - - Current stats: 203 videos ≈ 260KB, average ~1.3KB per video. - We use a 2MB buffer to accommodate growth to ~1500 videos before hitting limits. - If the list grows beyond that, this will fail with a clear error message. *) - let buffer_size = 2 * 1024 * 1024 in (* 2MB *) - let output = match Yaml.to_string ~len:buffer_size yaml with + (* The yaml library uses a fixed-size output buffer. The default is 262140 + bytes, which was exceeded when we had 203 videos (~262KB output). This + caused the document_end operation to fail with "doc_end failed" error. + + Current stats: 203 videos ≈ 260KB, average ~1.3KB per video. We use a + 2MB buffer to accommodate growth to ~1500 videos before hitting limits. + If the list grows beyond that, this will fail with a clear error + message. *) + let buffer_size = 2 * 1024 * 1024 in + (* 2MB *) + let output = + match Yaml.to_string ~len:buffer_size yaml with | Ok s -> s - | Error (`Msg err) -> - failwith (Printf.sprintf - "YAML serialization failed (tried %d videos, buffer size %d bytes): %s" - (List.length all_videos) buffer_size err) + | Error (`Msg err) -> + failwith + (Printf.sprintf + "YAML serialization failed (tried %d videos, buffer size %d \ + bytes): %s" + (List.length all_videos) buffer_size err) in let oc = open_out yaml_file in output_string oc output;