From 7a1b0b717658aff574720b47055e9a34f7f23847 Mon Sep 17 00:00:00 2001
From: Shaltiel Shmidman <shaltieltzion@gmail.com>
Date: Sun, 11 Feb 2024 05:12:26 +0200
Subject: [PATCH 1/5] Added code for using stream reading & writing for bytes,
 and using copy_

---
 src/TorchSharp/NN/Module.cs                   |   2 +-
 .../Tensor/Factories/Tensor.Factories.cs      |   7 +-
 src/TorchSharp/Tensor/Tensor.cs               |  71 ++++++++++++
 .../Tensor/TensorExtensionMethods.cs          |  67 +++++------
 src/TorchVision/File.cs                       |   4 +-
 test/TorchSharpTest/TestTorchTensor.cs        | 106 ++++++++++++++++++
 6 files changed, 208 insertions(+), 49 deletions(-)
diff --git a/src/TorchSharp/NN/Module.cs b/src/TorchSharp/NN/Module.cs
index 4ca8a3258..69faf6108 100644
--- a/src/TorchSharp/NN/Module.cs
+++ b/src/TorchSharp/NN/Module.cs
@@ -509,7 +509,7 @@ public virtual (IList<string> missing_keys, IList<string> unexpected_keyes) load
                     foreach (var key in source.Keys) {
                         if (skip.Contains(key)) continue;
                         if (destination.ContainsKey(key)) {
-                            destination[key].bytes = source[key].bytes;
+                            destination[key].copy_(source[key]);
                         }
                     }
 
diff --git a/src/TorchSharp/Tensor/Factories/Tensor.Factories.cs b/src/TorchSharp/Tensor/Factories/Tensor.Factories.cs
index 9bc1c562f..87e26b58c 100644
--- a/src/TorchSharp/Tensor/Factories/Tensor.Factories.cs
+++ b/src/TorchSharp/Tensor/Factories/Tensor.Factories.cs
@@ -521,11 +521,8 @@ public static Tensor Load(System.IO.BinaryReader reader)
                     throw new NotImplementedException("Loading tensors larger than 2GB");
 
                 var tensor = torch.empty(loadedShape, dtype: type);
-
-                var bytes = reader.ReadBytes((int)(totalSize * type.ElementSize()));
-
-                tensor.bytes = bytes;
-
+                tensor.ReadBytesFromStream(reader.BaseStream);
+                
                 return tensor;
             }
 
diff --git a/src/TorchSharp/Tensor/Tensor.cs b/src/TorchSharp/Tensor/Tensor.cs
index b8b457063..e1a930e22 100644
--- a/src/TorchSharp/Tensor/Tensor.cs
+++ b/src/TorchSharp/Tensor/Tensor.cs
@@ -4,6 +4,7 @@
 using System.ComponentModel;
 using System.Diagnostics.Contracts;
 using System.Globalization;
+using System.IO;
 using System.Linq;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
@@ -404,6 +405,76 @@ internal void ValidateType(Type dotnetType)
                 }
             }
 
+
+            /// <summary>
+            /// Writes the bytes of the tensor to a stream. Useful for when tensors are >2GB.
+            /// </summary>
+            /// <param name="stream">Stream to write the bytes to</param>
+            /// <param name="bufferSize">The buffer size to use when writing to the stream</param>
+            public void WriteBytesToStream(Stream stream, int bufferSize = 65_536)
+            {
+                // Validate, but passing 0 as the total size, since we don't need to validate the size
+                _validate(0);
+
+                long totalSize = NumberOfElements * ElementSize;
+                
+                unsafe {
+                    var ptr = NativeMethods.THSTensor_data(handle);
+                    if (ptr == IntPtr.Zero) { CheckForErrors(); }
+
+                    // NOTE: there is no safety here in this loop.
+                    // Read in the buffer N bytes at a time, and write them out
+                    byte[] buffer = new byte[bufferSize];
+                    while (totalSize > 0) {
+                    // Read in the current buffer size
+                    int curBufferSize = (int)Math.Min(totalSize, bufferSize);
+                        var span = new Span<byte>((void*)ptr, curBufferSize);
+                        span.CopyTo(buffer);
+
+                        // Write it out
+                        stream.Write(buffer, 0, curBufferSize);
+
+                        // Increment our pointer and decrease the total size of elements we have to write
+                        ptr += curBufferSize;
+                        totalSize -= curBufferSize;
+                    }
+                }
+            }
+
+            /// <summary>
+            /// Reads the bytes of the tensor from a stream.
+            /// </summary>
+            /// <param name="stream">Stream to read the bytes from</param>
+            /// <param name="bufferSize">The buffer size to use when reading from the stream</param>
+            public void ReadBytesFromStream(Stream stream, int bufferSize = 65_536)
+            {
+                long totalSize = NumberOfElements * ElementSize;
+
+                if (!is_contiguous()) throw new InvalidOperationException("SetBytes() called on non-contiguous tensor.");
+
+                unsafe {
+                    var ptr = NativeMethods.THSTensor_data(handle);
+                    if (ptr == IntPtr.Zero) { CheckForErrors(); }
+
+                    // NOTE: there is no safety here in this loop.
+                    // Read in the buffer N bytes at a time, and write them out
+                    byte[] buffer = new byte[bufferSize];
+                    while (totalSize > 0) {
+                        // Read in the current buffer size
+                        int curBufferSize = (int)Math.Min(totalSize, bufferSize);
+                        stream.Read(buffer, 0, curBufferSize);
+
+                        // Copy the contents over to the span
+                        var span = new Span<byte>((void*)ptr, curBufferSize);
+                        buffer.AsSpan(0, curBufferSize).CopyTo(span);
+                        
+                        // Increment our pointer and decrease the total size of elements we have to write
+                        ptr += curBufferSize;
+                        totalSize -= curBufferSize;
+                    }
+                }
+            }
+
             /// <summary>
             /// Get or set the contents of a tensor as raw bytes.
             /// </summary>
diff --git a/src/TorchSharp/Tensor/TensorExtensionMethods.cs b/src/TorchSharp/Tensor/TensorExtensionMethods.cs
index 09d79028f..846be615b 100644
--- a/src/TorchSharp/Tensor/TensorExtensionMethods.cs
+++ b/src/TorchSharp/Tensor/TensorExtensionMethods.cs
@@ -387,13 +387,9 @@ public static void Save(this Tensor tensor, System.IO.BinaryWriter writer)
                                               // Then, the shape.
             writer.Encode(tensor.shape.Length); // 4 bytes
             foreach (var s in tensor.shape) writer.Encode(s); // n * 8 bytes
-                                                              // Then, the data
-#if NETSTANDARD2_0_OR_GREATER
-            // TODO: NETSTANDARD2_0_OR_GREATER Try to optimize to avoid the allocation
-            writer.Write(tensor.bytes.ToArray()); // ElementSize * NumberOfElements
-#else
-            writer.Write(tensor.bytes); // ElementSize * NumberOfElements
-#endif // NETSTANDARD2_0_OR_GREATER
+
+            // Then, the data
+            tensor.WriteBytesToStream(writer.BaseStream);
 
             if (copied) tensor.Dispose();
         }
@@ -441,37 +437,28 @@ public static void Load(this Tensor tensor, System.IO.BinaryReader reader, bool
             // Then, the shape
             var shLen = reader.Decode();
             long[] loadedShape = new long[shLen];
-
-            long totalSize = 1;
             for (int i = 0; i < shLen; ++i) {
                 loadedShape[i] = reader.Decode();
-                totalSize *= loadedShape[i];
             }
 
             if (!skip && !loadedShape.SequenceEqual(tensor.shape))
                 // We only care about this if the bytes will be written to the tensor.
                 throw new ArgumentException("Mismatched tensor shape while loading. Make sure that the model you are loading into is exactly the same as the origin.");
 
-            //
-            // TODO: Fix this so that you can read large tensors. Right now, they are limited to 2GB
-            //
-            if (totalSize > int.MaxValue)
-                throw new NotImplementedException("Loading tensors larger than 2GB");
-
-            // This needs to be done even if the tensor is skipped, since we have to advance the input stream.
-            var bytes = reader.ReadBytes((int)(totalSize * tensor.ElementSize));
-
             if (!skip) {
+                using var d = torch.no_grad(); // so that we can perform operations on leaf tensors
+
                 var device = tensor.device;
                 if (device.type != DeviceType.CPU) {
-                    using var copy = tensor.to(CPU);
-                    copy.bytes = bytes;
-                    using var moved = copy.to(device);
-                    tensor.set_(moved);
-                }
-                else {
-                    tensor.bytes = bytes;
-                }
+                    using var temp = torch.zeros_like(tensor, device: torch.CPU);
+                    temp.ReadBytesFromStream(reader.BaseStream);
+
+                    tensor.copy_(temp);
+                } else {
+                    tensor.ReadBytesFromStream(reader.BaseStream);
+                }                
+            } else {
+                reader.BaseStream.Seek(tensor.NumberOfElements * tensor.ElementSize, System.IO.SeekOrigin.Current);
             }
         }
 
@@ -510,18 +497,10 @@ public static void Load(ref Tensor? tensor, System.IO.BinaryReader reader, bool
             var shLen = reader.Decode();
             long[] loadedShape = new long[shLen];
 
-            long totalSize = 1;
             for (int i = 0; i < shLen; ++i) {
                 loadedShape[i] = reader.Decode();
-                totalSize *= loadedShape[i];
             }
 
-            //
-            // TODO: Fix this so that you can read large tensors. Right now, they are limited to 2GB
-            //
-            if (totalSize > int.MaxValue)
-                throw new NotImplementedException("Loading tensors larger than 2GB");
-
             if (tensor is null) {
                 // If the tensor doesn't exist, initialize by zeros unless
                 // it's going to be loaded from the stream.
@@ -533,15 +512,19 @@ public static void Load(ref Tensor? tensor, System.IO.BinaryReader reader, bool
                 throw new ArgumentException("Mismatched tensor shape while loading. Make sure that the model you are loading into is exactly the same as the origin.");
             }
 
-            // This needs to be done even if the tensor is skipped, since we have to advance the input stream.
-            var bytes = reader.ReadBytes((int)(totalSize * tensor.ElementSize));
-
             if (!skip) {
+                using var d = torch.no_grad(); // so that we can perform operations on leaf tensors
+
                 var device = tensor.device;
-                if (device.type != DeviceType.CPU)
-                    tensor = tensor.to(CPU, disposeAfter: true); 
-                tensor.bytes = bytes;
-                tensor = tensor.to(device, disposeAfter: true);
+                if (device.type != DeviceType.CPU) {
+                    using var temp = torch.zeros_like(tensor, device: torch.CPU);
+                    temp.ReadBytesFromStream(reader.BaseStream);
+                    tensor.copy_(temp);
+                } else {
+                    tensor.ReadBytesFromStream(reader.BaseStream);
+                }
+            } else {
+                reader.BaseStream.Seek(tensor.NumberOfElements * tensor.ElementSize, System.IO.SeekOrigin.Current);
             }
         }
 
diff --git a/src/TorchVision/File.cs b/src/TorchVision/File.cs
index 73b6ab1d7..c8ad7b73b 100644
--- a/src/TorchVision/File.cs
+++ b/src/TorchVision/File.cs
@@ -48,7 +48,8 @@ public static async Task<Tensor> read_file_async(string filename)
             /// <param name="data">One dimensional <c>uint8</c> <cref>Tensor</cref>.</param>
             public static void write_file(string filename, Tensor data)
             {
-                File.WriteAllBytes(filename, data.bytes.ToArray());
+                using var stream = File.OpenWrite(filename);
+                data.WriteBytesToStream(stream);
             }
 
             /// <summary>
@@ -58,6 +59,7 @@ public static void write_file(string filename, Tensor data)
             /// <param name="data">One dimensional <c>uint8</c> <cref>Tensor</cref>.</param>
             public static async void write_file_async(string filename, Tensor data)
             {
+                // Currently limited to 2GB - should we duplicate the code for WriteBytesToStream using async ops?
                 using (FileStream stream = File.Open(filename, FileMode.OpenOrCreate)) {
                     await stream.WriteAsync(data.bytes.ToArray(), 0, (int)data.shape[0]);
                 }
diff --git a/test/TorchSharpTest/TestTorchTensor.cs b/test/TorchSharpTest/TestTorchTensor.cs
index 34ede2b58..92d3009e4 100644
--- a/test/TorchSharpTest/TestTorchTensor.cs
+++ b/test/TorchSharpTest/TestTorchTensor.cs
@@ -8269,5 +8269,111 @@ Tensor nbins_ratio(int seed, int size)
                 }
             }
         }
+
+
+        [Fact(Skip = "Very heavy on the compute")]
+        public void TestSaveAndLoadLarger2GBTensor()
+        {
+            var tensor = torch.rand((long)int.MaxValue + 128, device: torch.CPU);
+
+            var tempFile = Path.GetTempFileName();
+            try {
+                // Save to memory
+                using (var fs = File.OpenWrite(tempFile))
+                    tensor.Save(fs);
+
+                // Create a new copy of zeros
+                var copyTensor = torch.zeros_like(tensor, device: torch.CPU);
+
+                // Read it in
+                using (var fs = File.OpenRead(tempFile))
+                    copyTensor.Load(new BinaryReader(fs));
+
+                Assert.Equal(tensor.npstr(), copyTensor.npstr());
+            } finally {
+                File.Delete(tempFile);
+            }
+        }
+
+        [Fact(Skip = "Very heavy on the compute")]
+        public void TestSaveAndLoadLarger2GBTensorCUDA()
+        {
+            if (torch.cuda.is_available()) {
+                var tensor = torch.rand((long)int.MaxValue + 128, device: torch.CUDA);
+
+                var tempFile = Path.GetTempFileName();
+                try {
+                    // Save to memory
+                    using (var fs = File.OpenWrite(tempFile))
+                        tensor.Save(fs);
+
+                    // Create a new copy of zeros
+                    var copyTensor = torch.zeros_like(tensor, device: torch.CUDA);
+
+                    // Read it in
+                    using (var fs = File.OpenRead(tempFile))
+                        copyTensor.Load(new BinaryReader(fs));
+
+                    Assert.Equal(tensor.npstr(), copyTensor.npstr());
+                } finally {
+                    File.Delete(tempFile);
+                }
+            }
+        }
+
+
+        [Fact(Skip = "Very heavy on the compute")]
+        public void TestSaveAndLoadModuleWithLarger2GBTensor()
+        {
+            // Create a sequential with a parameter slightly larger than 2GB
+            var seq = nn.Sequential(("lin1", torch.nn.Linear(int.MaxValue / 2048, 2049, false)));
+
+            var tempFile = Path.GetTempFileName();
+            try {
+                // Save to memory
+                using (var fs = File.OpenWrite(tempFile))
+                    seq.save(fs);
+
+                // Create a new sequence, and make sure it is equal
+                var copySeq = nn.Sequential(("lin1", torch.nn.Linear(int.MaxValue / 2048, 2049, false)));
+
+                // Read it in
+                using (var fs = File.OpenRead(tempFile))
+                    copySeq.load(fs);
+
+                // Compare results
+                Assert.Equal(copySeq.parameters().First().npstr(), seq.parameters().First().npstr());
+            } finally {
+                File.Delete(tempFile);
+            }
+        }
+
+        [Fact(Skip = "Very heavy on the compute")]
+        public void TestSaveAndLoadModuleWithLarger2GBTensorCUDA()
+        {
+            if (torch.cuda.is_available()) {
+                // Create a sequential with a parameter slightly larger than 2GB
+                var seq = nn.Sequential(("lin1", torch.nn.Linear(int.MaxValue / 2048, 2049, false))).cuda();
+
+                var tempFile = Path.GetTempFileName();
+                try {
+                    // Save to memory
+                    using (var fs = File.OpenWrite(tempFile))
+                        seq.save(fs);
+
+                    // Create a new sequence, and make sure it is equal
+                    var copySeq = nn.Sequential(("lin1", torch.nn.Linear(int.MaxValue / 2048, 2049, false))).cuda();
+
+                    // Read it in
+                    using (var fs = File.OpenRead(tempFile))
+                        copySeq.load(fs);
+
+                    // Compare results
+                    Assert.Equal(copySeq.parameters().First().npstr(), seq.parameters().First().npstr());
+                } finally {
+                    File.Delete(tempFile);
+                }
+            }
+        }
     }
 }
\ No newline at end of file

From 045fe5f059c8ed1ac71aa026ae27bf7d94b8f620 Mon Sep 17 00:00:00 2001
From: Shaltiel Shmidman <shaltieltzion@gmail.com>
Date: Sun, 11 Feb 2024 05:32:15 +0200
Subject: [PATCH 2/5] Updated release notes

---
 RELEASENOTES.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/RELEASENOTES.md b/RELEASENOTES.md
index 75fb28142..5b3cdccd0 100644
--- a/RELEASENOTES.md
+++ b/RELEASENOTES.md
@@ -2,6 +2,12 @@
 
 Releases, starting with 9/2/2021, are listed with the most recent release at the top.
 
+## NuGet Version 0.101.7
+
+__API Changes__:
+
+#1219: Added support for loading and saving tensors that are >2GB. 
+
 ## NuGet Version 0.101.6
 
 __API Changes__:

From 191fbcbe6e35f97b67488b71706f55894867dd9c Mon Sep 17 00:00:00 2001
From: Shaltiel Shmidman <shaltieltzion@gmail.com>
Date: Mon, 12 Feb 2024 14:45:04 +0200
Subject: [PATCH 3/5] Updated buffer size

---
 src/TorchSharp/Tensor/Tensor.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TorchSharp/Tensor/Tensor.cs b/src/TorchSharp/Tensor/Tensor.cs
index e1a930e22..99ead4914 100644
--- a/src/TorchSharp/Tensor/Tensor.cs
+++ b/src/TorchSharp/Tensor/Tensor.cs
@@ -411,7 +411,7 @@ internal void ValidateType(Type dotnetType)
             /// </summary>
             /// <param name="stream">Stream to write the bytes to</param>
             /// <param name="bufferSize">The buffer size to use when writing to the stream</param>
-            public void WriteBytesToStream(Stream stream, int bufferSize = 65_536)
+            public void WriteBytesToStream(Stream stream, int bufferSize = 1024)
             {
                 // Validate, but passing 0 as the total size, since we don't need to validate the size
                 _validate(0);
@@ -446,7 +446,7 @@ public void WriteBytesToStream(Stream stream, int bufferSize = 65_536)
             /// </summary>
             /// <param name="stream">Stream to read the bytes from</param>
             /// <param name="bufferSize">The buffer size to use when reading from the stream</param>
-            public void ReadBytesFromStream(Stream stream, int bufferSize = 65_536)
+            public void ReadBytesFromStream(Stream stream, int bufferSize = 1024)
             {
                 long totalSize = NumberOfElements * ElementSize;
 

From 2bf3015eded987dc06fa335beaeaac25e1615f94 Mon Sep 17 00:00:00 2001
From: Shaltiel Shmidman <shaltieltzion@gmail.com>
Date: Mon, 12 Feb 2024 20:56:06 +0200
Subject: [PATCH 4/5] Added validation in read

---
 src/TorchSharp/Tensor/Tensor.cs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/TorchSharp/Tensor/Tensor.cs b/src/TorchSharp/Tensor/Tensor.cs
index 99ead4914..a2c4f7684 100644
--- a/src/TorchSharp/Tensor/Tensor.cs
+++ b/src/TorchSharp/Tensor/Tensor.cs
@@ -450,7 +450,9 @@ public void ReadBytesFromStream(Stream stream, int bufferSize = 1024)
             {
                 long totalSize = NumberOfElements * ElementSize;
 
-                if (!is_contiguous()) throw new InvalidOperationException("SetBytes() called on non-contiguous tensor.");
+                // Validate that this tensor matches the conditions for reading the bytes - pass 0 as total size
+                // since we don't need to check that condition. 
+                _validate(0);
 
                 unsafe {
                     var ptr = NativeMethods.THSTensor_data(handle);

From e88ecfb7c61cb57637033194f668d61251ec2800 Mon Sep 17 00:00:00 2001
From: Shaltiel Shmidman <shaltieltzion@gmail.com>
Date: Wed, 14 Feb 2024 20:33:48 +0200
Subject: [PATCH 5/5] Removed comment

---
 src/TorchVision/File.cs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/TorchVision/File.cs b/src/TorchVision/File.cs
index c8ad7b73b..ea0c48cff 100644
--- a/src/TorchVision/File.cs
+++ b/src/TorchVision/File.cs
@@ -59,7 +59,6 @@ public static void write_file(string filename, Tensor data)
             /// <param name="data">One dimensional <c>uint8</c> <cref>Tensor</cref>.</param>
             public static async void write_file_async(string filename, Tensor data)
             {
-                // Currently limited to 2GB - should we duplicate the code for WriteBytesToStream using async ops?
                 using (FileStream stream = File.Open(filename, FileMode.OpenOrCreate)) {
                     await stream.WriteAsync(data.bytes.ToArray(), 0, (int)data.shape[0]);
                 }