|
1 |
| ---- third_party/fft1d/device/fft1d.cl 2017-05-09 22:47:43.000000000 +0000 |
2 |
| -+++ sdaccel/fft1d/device/fft1d.cl 2017-09-12 19:21:02.120000000 +0000 |
3 |
| -@@ -49 +49 @@ |
| 1 | +--- device/fft1d.cl 2018-02-12 17:54:56.000000000 +0000 |
| 2 | ++++ device/fft1d.cl 2018-07-23 20:57:12.414000000 +0000 |
| 3 | +@@ -46,7 +46,7 @@ |
| 4 | + // Include source code for an engine that produces 8 points each step |
| 5 | + #include "fft_8.cl" |
| 6 | + |
4 | 7 | -#pragma OPENCL EXTENSION cl_intel_channels : enable
|
5 | 8 | +//#pragma OPENCL EXTENSION cl_intel_channels : enable
|
6 |
| -@@ -64 +64,9 @@ |
| 9 | + |
| 10 | + #include "../host/inc/fft_config.h" |
| 11 | + |
| 12 | +@@ -66,11 +66,19 @@ |
| 13 | + #define CONT_FACTOR (1 << LOG_CONT_FACTOR) |
| 14 | + |
| 15 | + // Need some depth to our channels to accomodate their bursty filling. |
7 | 16 | -channel float2 chanin[8] __attribute__((depth(CONT_FACTOR*8)));
|
8 | 17 | +
|
9 | 18 | +pipe float2 chanin0 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
|
|
14 | 23 | +pipe float2 chanin5 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
|
15 | 24 | +pipe float2 chanin6 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
|
16 | 25 | +pipe float2 chanin7 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
|
17 |
| -@@ -68 +76 @@ |
18 |
| -- #pragma unroll |
19 |
| -+ __attribute__((opencl_unroll_hint())) |
20 |
| -@@ -137,2 +145,2 @@ |
| 26 | + |
| 27 | + uint bit_reversed(uint x, uint bits) { |
| 28 | + uint y = 0; |
| 29 | +- #pragma unroll |
| 30 | ++ __attribute__((opencl_unroll_hint())) |
| 31 | + for (uint i = 0; i < bits; i++) { |
| 32 | + y <<= 1; |
| 33 | + y |= x & 1; |
| 34 | +@@ -139,15 +147,15 @@ |
| 35 | + } |
| 36 | + |
| 37 | + // group dimension (N/(8*CONT_FACTOR), num_iterations) |
21 | 38 | -__attribute__((reqd_work_group_size(CONT_FACTOR * POINTS, 1, 1)))
|
22 | 39 | -kernel void fetch (global float2 * restrict src) {
|
23 | 40 | +kernel __attribute__((reqd_work_group_size(CONT_FACTOR * POINTS, 1, 1)))
|
24 | 41 | +void fetch (global float2 * restrict src) {
|
25 |
| -@@ -145 +153 @@ |
| 42 | + |
| 43 | + const int N = (1 << LOGN); |
| 44 | + // Each thread will fetch POINTS points. Need POINTS times to pass to FFT. |
| 45 | + const int BUF_SIZE = 1 << (LOG_CONT_FACTOR + LOGPOINTS + LOGPOINTS); |
| 46 | + |
| 47 | + // Local memory for CONT_FACTOR * POINTS points |
26 | 48 | - local float2 buf[BUF_SIZE];
|
27 |
| -+ local float2 buf[BUF_SIZE] __attribute__((xcl_array_partition(block,8,1))); |
28 |
| -@@ -156,2 +164,2 @@ |
| 49 | ++ __local float2 buf[BUF_SIZE] __attribute__((xcl_array_partition(block,8,1))); |
| 50 | + |
| 51 | + uint iteration = get_global_id(1); |
| 52 | + uint group_per_iter = get_global_id(0); |
| 53 | +@@ -158,17 +166,17 @@ |
| 54 | + uint lid = get_local_id(0); |
| 55 | + uint local_addr = lid << LOGPOINTS; |
| 56 | + |
29 | 57 | - #pragma unroll
|
30 | 58 | - for (uint k = 0; k < POINTS; k++) {
|
31 | 59 | + __attribute__((opencl_unroll_hint()))
|
32 | 60 | + for (uint k = 0; k < POINTS; k+=2) {
|
33 |
| -@@ -160 +167,0 @@ |
34 |
| -- |
35 |
| -@@ -163,4 +170,4 @@ |
| 61 | + buf[local_addr + k] = src[global_addr + k]; |
| 62 | + } |
| 63 | + |
| 64 | + barrier (CLK_LOCAL_MEM_FENCE); |
| 65 | + |
36 | 66 | - #pragma unroll
|
37 | 67 | - for (uint k = 0; k < POINTS; k++) {
|
38 | 68 | - uint buf_addr = bit_reversed(k,3) * CONT_FACTOR * POINTS + lid;
|
39 | 69 | - write_channel_intel (chanin[k], buf[buf_addr]);
|
40 | 70 | + uint buf_addr[8];
|
41 | 71 | + __attribute__((opencl_unroll_hint()))
|
42 |
| -+ for(uint k=0;k<8;k++) { |
| 72 | ++ for (uint k = 0; k < 8; k++) { |
43 | 73 | + buf_addr[k] = bit_reversed(k,3) * CONT_FACTOR * POINTS + lid;
|
44 |
| -@@ -167,0 +175,12 @@ |
| 74 | + } |
| 75 | + } |
| 76 | + |
| 77 | +@@ -181,9 +189,24 @@ |
| 78 | + * 'count' represents the number of 4k sets to process |
| 79 | + * 'inverse' toggles between the direct and the inverse transform |
| 80 | + */ |
45 | 81 | + // bit_reversed reverses the bit locations of the value given.
|
46 | 82 | + // The second parameter is the width of the number (in bits) to reverse.
|
47 | 83 | + // Only the non-symmetric numbers are changed. E.g. 001,011,100,110 -> 100,110,100,110
|
48 |
| -+ write_pipe (chanin0, &buf[buf_addr[0]]); |
49 |
| -+ write_pipe (chanin1, &buf[buf_addr[1]]); |
50 |
| -+ write_pipe (chanin2, &buf[buf_addr[2]]); |
51 |
| -+ write_pipe (chanin3, &buf[buf_addr[3]]); |
52 |
| -+ write_pipe (chanin4, &buf[buf_addr[4]]); |
53 |
| -+ write_pipe (chanin5, &buf[buf_addr[5]]); |
54 |
| -+ write_pipe (chanin6, &buf[buf_addr[6]]); |
55 |
| -+ write_pipe (chanin7, &buf[buf_addr[7]]); |
56 | 84 | +
|
57 |
| -@@ -180,2 +199,2 @@ |
| 85 | ++ |
| 86 | ++ |
| 87 | ++ write_pipe(chanin0, &buf[buf_addr[0]]); |
| 88 | ++ write_pipe(chanin1, &buf[buf_addr[1]]); |
| 89 | ++ write_pipe(chanin2, &buf[buf_addr[2]]); |
| 90 | ++ write_pipe(chanin3, &buf[buf_addr[3]]); |
| 91 | ++ write_pipe(chanin4, &buf[buf_addr[4]]); |
| 92 | ++ write_pipe(chanin5, &buf[buf_addr[5]]); |
| 93 | ++ write_pipe(chanin6, &buf[buf_addr[6]]); |
| 94 | ++ write_pipe(chanin7, &buf[buf_addr[7]]); |
| 95 | ++ |
| 96 | + |
58 | 97 | -__attribute((task))
|
59 | 98 | -kernel void fft1d(global float2 * restrict dest,
|
60 | 99 | +kernel __attribute((reqd_work_group_size(1, 1, 1))) //task))
|
61 | 100 | +void fft1d(global float2 * restrict dest,
|
62 |
| -@@ -218,8 +237,9 @@ |
| 101 | + int count, int inverse) { |
| 102 | + |
| 103 | + const int N = (1 << LOGN); |
| 104 | +@@ -220,14 +243,14 @@ |
| 105 | + float2x8 data; |
| 106 | + // Perform memory transfers only when reading data in range |
| 107 | + if (i < count * (N / 8)) { |
63 | 108 | - data.i0 = read_channel_intel(chanin[0]);
|
64 | 109 | - data.i1 = read_channel_intel(chanin[1]);
|
65 | 110 | - data.i2 = read_channel_intel(chanin[2]);
|
|
68 | 113 | - data.i5 = read_channel_intel(chanin[5]);
|
69 | 114 | - data.i6 = read_channel_intel(chanin[6]);
|
70 | 115 | - data.i7 = read_channel_intel(chanin[7]);
|
71 |
| -+ |
72 | 116 | + read_pipe(chanin0,&data.i0);
|
73 | 117 | + read_pipe(chanin1,&data.i1);
|
74 | 118 | + read_pipe(chanin2,&data.i2);
|
|
77 | 121 | + read_pipe(chanin5,&data.i5);
|
78 | 122 | + read_pipe(chanin6,&data.i6);
|
79 | 123 | + read_pipe(chanin7,&data.i7);
|
| 124 | + } else { |
| 125 | + data.i0 = data.i1 = data.i2 = data.i3 = |
| 126 | + data.i4 = data.i5 = data.i6 = data.i7 = 0; |
0 commit comments