Skip to content

Commit 5c66ab7

Browse files
committed
Add YUV to RGB conversion in GPU Video decoder
This is still WIP
1 parent 64d21d1 commit 5c66ab7

File tree

5 files changed

+120
-84
lines changed

5 files changed

+120
-84
lines changed

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,7 @@ def get_extensions():
469469
"z",
470470
"pthread",
471471
"dl",
472+
"nppicc",
472473
],
473474
extra_compile_args=extra_compile_args,
474475
)

test/test_video_gpu_decoder.py

Lines changed: 104 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,120 @@
2222
]
2323

2424

25+
def _yuv420_to_444(mat):
26+
# logic taken from
27+
# https://en.wikipedia.org/wiki/YUV#Y%E2%80%B2UV420p_(and_Y%E2%80%B2V12_or_YV12)_to_RGB888_conversion
28+
width = mat.shape[-1]
29+
height = mat.shape[0] * 2 // 3
30+
luma = mat[:height]
31+
uv = mat[height:].reshape(2, height // 2, width // 2)
32+
uv2 = torch.nn.functional.interpolate(uv[None], scale_factor=2, mode='nearest')[0]
33+
yuv2 = torch.cat([luma[None], uv2]).permute(1, 2, 0)
34+
return yuv2
35+
36+
37+
def _yuv420_to_rgb(mat, limited_color_range=True, standard='bt709'):
    """Convert a planar YUV 4:2:0 frame to RGB with a reference matrix transform.

    The coefficients are taken from https://en.wikipedia.org/wiki/YCbCr.

    Args:
        mat: 2D tensor with planar YUV 4:2:0 data, in the layout expected by
            ``_yuv420_to_444``.
        limited_color_range: if True, treat the input as limited ("studio")
            range: luma is scaled by 255 / 219 and chroma by 255 / 224, and
            the luma footroom of 16 is removed. If False, the input is
            treated as full range and only the chroma mid-point of 128 is
            subtracted.
        standard: which conversion matrix to use, ``'bt601'`` or ``'bt709'``.

    Returns:
        Floating point tensor of shape (height, width, 3) holding R, G, B.
        The values are neither rounded nor clamped to [0, 255].

    Raises:
        ValueError: if ``standard`` is not one of the supported names.
    """
    # Each row yields one output channel (R, G, B); the columns weight Y, Cb, Cr.
    coefficients = {
        # ITU-R BT.601, as used by decord
        # https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
        'bt601': [[1.0000, 0.0000, 1.402],
                  [1.0000, -(1.772 * 0.114 / 0.587), -(1.402 * 0.299 / 0.587)],
                  [1.0000, 1.772, 0.0000]],
        # ITU-R BT.709
        # https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
        'bt709': [[1.0000, 0.0000, 1.5748],
                  [1.0000, -0.1873, -0.4681],
                  [1.0000, 1.8556, 0.0000]],
    }
    if standard not in coefficients:
        raise ValueError(f"{standard} not supported")
    m = torch.tensor(coefficients[standard], device=mat.device)

    if limited_color_range:
        # Compensation for the footroom and headroom, see
        # https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
        m = m * torch.tensor([255 / 219, 255 / 224, 255 / 224], device=mat.device)
        luma_offset = 16.
    else:
        # Full-range luma starts at 0, so there is no footroom to remove.
        luma_offset = 0.

    # Chroma is always centred on 128, regardless of the range.
    offset = torch.tensor([luma_offset, 128., 128.], device=mat.device)

    yuv = _yuv420_to_444(mat)

    # Transpose so that the (..., 3) YUV vectors can be right-multiplied.
    return (yuv - offset) @ m.T
68+
69+
2570
@pytest.mark.skipif(_HAS_VIDEO_DECODER is False, reason="Didn't compile with support for gpu decoder")
class TestVideoGPUDecoder:
    """Compare frames from the GPU video decoder against PyAV."""

    @pytest.mark.skipif(av is None, reason="PyAV unavailable")
    def test_frame_reading(self):
        """Check that GPU-decoded RGB frames stay close to a PyAV reference.

        Every frame decoded by PyAV is converted from YUV to RGB with
        ``_yuv420_to_rgb`` and compared with the matching frame returned by
        ``VideoReader`` on ``cuda:0``. The two conversions are not expected
        to match bit for bit, so the check is that the 70th percentile of the
        per-element absolute difference stays below 16.
        """
        for test_video in test_videos:
            full_path = os.path.join(VIDEO_DIR, test_video)
            decoder = VideoReader(full_path, device="cuda:0")
            with av.open(full_path) as container:
                for av_frame in container.decode(container.streams.video[0]):
                    # NOTE(review): assumes the test videos decode to planar
                    # YUV 4:2:0 frames, as _yuv420_to_rgb requires - confirm.
                    av_frames_yuv = torch.tensor(av_frame.to_ndarray())
                    vision_frames = next(decoder)["data"]

                    av_frames_rgb = _yuv420_to_rgb(av_frames_yuv)
                    diff = torch.abs(av_frames_rgb.float() - vision_frames.cpu().float())
                    delta_p70 = torch.kthvalue(diff.flatten(), int(diff.numel() * 0.7)).values
                    # The message is only built when the assertion fails.
                    assert delta_p70 < 16, (
                        f"{test_video}: 70th percentile of abs diff is {delta_p70.item():.2f} "
                        f"(max {diff.max().item():.2f}, median {diff.median().item():.2f})"
                    )
38139

39140

40141
if __name__ == "__main__":

torchvision/csrc/io/decoder/gpu/decoder.cpp

Lines changed: 14 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#include <cmath>
44
#include <cstring>
55
#include <unordered_map>
6+
#include <nppi_color_conversion.h>
7+
68

79
static float chroma_height_factor(cudaVideoSurfaceFormat surface_format) {
810
return (surface_format == cudaVideoSurfaceFormat_YUV444 ||
@@ -138,38 +140,20 @@ int Decoder::handle_picture_display(CUVIDPARSERDISPINFO* disp_info) {
138140
}
139141

140142
auto options = torch::TensorOptions().dtype(torch::kU8).device(torch::kCUDA);
141-
torch::Tensor decoded_frame = torch::empty({get_frame_size()}, options);
143+
torch::Tensor decoded_frame = torch::empty({get_height(), width, 3}, options);
142144
uint8_t* frame_ptr = decoded_frame.data_ptr<uint8_t>();
143145

144-
// Copy luma plane
145-
CUDA_MEMCPY2D m = {0};
146-
m.srcMemoryType = CU_MEMORYTYPE_DEVICE;
147-
m.srcDevice = source_frame;
148-
m.srcPitch = source_pitch;
149-
m.dstMemoryType = CU_MEMORYTYPE_DEVICE;
150-
m.dstDevice = (CUdeviceptr)(m.dstHost = frame_ptr);
151-
m.dstPitch = get_width() * bytes_per_pixel;
152-
m.WidthInBytes = get_width() * bytes_per_pixel;
153-
m.Height = luma_height;
154-
check_for_cuda_errors(cuMemcpy2DAsync(&m, cuvidStream), __LINE__, __FILE__);
155-
156-
// Copy chroma plane
157-
// NVDEC output has luma height aligned by 2. Adjust chroma offset by aligning
158-
// height
159-
m.srcDevice =
160-
(CUdeviceptr)((uint8_t*)source_frame + m.srcPitch * ((surface_height + 1) & ~1));
161-
m.dstDevice = (CUdeviceptr)(m.dstHost = frame_ptr + m.dstPitch * luma_height);
162-
m.Height = chroma_height;
163-
check_for_cuda_errors(cuMemcpy2DAsync(&m, cuvidStream), __LINE__, __FILE__);
164-
165-
if (num_chroma_planes == 2) {
166-
m.srcDevice =
167-
(CUdeviceptr)((uint8_t*)source_frame + m.srcPitch * ((surface_height + 1) & ~1) * 2);
168-
m.dstDevice =
169-
(CUdeviceptr)(m.dstHost = frame_ptr + m.dstPitch * luma_height * 2);
170-
m.Height = chroma_height;
171-
check_for_cuda_errors(cuMemcpy2DAsync(&m, cuvidStream), __LINE__, __FILE__);
172-
}
146+
// TODO: check the surface_height condition in here
147+
const uint8_t *const pSrc[] = {(const uint8_t *const)source_frame,
148+
(const uint8_t *const)(source_frame + source_pitch * ((surface_height + 1) & ~1))};
149+
150+
151+
// TODO: create and reuse an NppStreamContext; this requires switching to nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx
152+
auto err = nppiNV12ToRGB_709CSC_8u_P2C3R(pSrc, source_pitch, frame_ptr,
153+
width * 3, {(int)decoded_frame.size(1), (int)decoded_frame.size(0)});
154+
155+
TORCH_CHECK(err == NPP_NO_ERROR, "Failed to convert from NV12 to RGB. Error code:", err);
156+
173157
check_for_cuda_errors(cuStreamSynchronize(cuvidStream), __LINE__, __FILE__);
174158
decoded_frames.push(decoded_frame);
175159
check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__);

torchvision/csrc/io/decoder/gpu/gpu_decoder.cpp

Lines changed: 1 addition & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -38,48 +38,8 @@ torch::Tensor GPUDecoder::decode() {
3838
return frame;
3939
}
4040

41-
/* Convert a tensor with data in NV12 format to a tensor with data in YUV420
42-
* format in-place.
43-
*/
44-
torch::Tensor GPUDecoder::nv12_to_yuv420(torch::Tensor frameTensor) {
45-
int width = decoder.get_width(), height = decoder.get_height();
46-
int pitch = width;
47-
uint8_t* frame = frameTensor.data_ptr<uint8_t>();
48-
uint8_t* ptr = new uint8_t[((width + 1) / 2) * ((height + 1) / 2)];
49-
50-
// sizes of source surface plane
51-
int sizePlaneY = pitch * height;
52-
int sizePlaneU = ((pitch + 1) / 2) * ((height + 1) / 2);
53-
int sizePlaneV = sizePlaneU;
54-
55-
uint8_t* uv = frame + sizePlaneY;
56-
uint8_t* u = uv;
57-
uint8_t* v = uv + sizePlaneU;
58-
59-
// split chroma from interleave to planar
60-
for (int y = 0; y < (height + 1) / 2; y++) {
61-
for (int x = 0; x < (width + 1) / 2; x++) {
62-
u[y * ((pitch + 1) / 2) + x] = uv[y * pitch + x * 2];
63-
ptr[y * ((width + 1) / 2) + x] = uv[y * pitch + x * 2 + 1];
64-
}
65-
}
66-
if (pitch == width) {
67-
memcpy(v, ptr, sizePlaneV * sizeof(uint8_t));
68-
} else {
69-
for (int i = 0; i < (height + 1) / 2; i++) {
70-
memcpy(
71-
v + ((pitch + 1) / 2) * i,
72-
ptr + ((width + 1) / 2) * i,
73-
((width + 1) / 2) * sizeof(uint8_t));
74-
}
75-
}
76-
delete[] ptr;
77-
return frameTensor;
78-
}
79-
8041
TORCH_LIBRARY(torchvision, m) {
8142
m.class_<GPUDecoder>("GPUDecoder")
8243
.def(torch::init<std::string, int64_t>())
83-
.def("next", &GPUDecoder::decode)
84-
.def("reformat", &GPUDecoder::nv12_to_yuv420);
44+
.def("next", &GPUDecoder::decode);
8545
}

torchvision/io/__init__.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -206,16 +206,6 @@ def set_current_stream(self, stream: str) -> bool:
206206
print("GPU decoding only works with video stream.")
207207
return self._c.set_current_stream(stream)
208208

209-
def _reformat(self, tensor, output_format: str = "yuv420"):
210-
supported_formats = [
211-
"yuv420",
212-
]
213-
if output_format not in supported_formats:
214-
raise RuntimeError(f"{output_format} not supported, please use one of {', '.join(supported_formats)}")
215-
if not isinstance(tensor, torch.Tensor):
216-
raise RuntimeError("Expected tensor as input parameter!")
217-
return self._c.reformat(tensor.cpu())
218-
219209

220210
__all__ = [
221211
"write_video",

0 commit comments

Comments
 (0)