Random123
threefry.h
Go to the documentation of this file.
1/*
2Copyright 2010-2011, D. E. Shaw Research.
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are
7met:
8
9* Redistributions of source code must retain the above copyright
10 notice, this list of conditions, and the following disclaimer.
11
12* Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions, and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15
16* Neither the name of D. E. Shaw Research nor the names of its
17 contributors may be used to endorse or promote products derived from
18 this software without specific prior written permission.
19
20THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31*/
32#ifndef _threefry_dot_h_
33#define _threefry_dot_h_
35#include "array.h"
36
38/* Significant parts of this file were copied
39 from:
40 Skein_FinalRnd/ReferenceImplementation/skein.h
41 Skein_FinalRnd/ReferenceImplementation/skein_block.c
42
43 in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
44
45 This file has been modified so that it may no longer perform its originally
46 intended function. If you're looking for a Skein or Threefish source code,
47 please consult the original file.
48
49 The original file had the following header:
50**************************************************************************
51**
52** Interface declarations and internal definitions for Skein hashing.
53**
54** Source code author: Doug Whiting, 2008.
55**
56** This algorithm and source code is released to the public domain.
57**
58***************************************************************************
59
60*/
61
62/* See comment at the top of philox.h for the macro pre-process
63 strategy. */
64
65/* Rotation constants: */
/* Rotation constants for the 4x64 variant.  R_64x4_<d>_<p> is the left-rotate
   amount for rotation-table row d (0..7) and word pair p (0 or 1).
   These are the R_256 constants from the Threefish reference sources
   with names changed to R_64x4... */
enum r123_enum_threefry64x4 {
    R_64x4_0_0=14, R_64x4_0_1=16,
    R_64x4_1_0=52, R_64x4_1_1=57,
    R_64x4_2_0=23, R_64x4_2_1=40,
    R_64x4_3_0= 5, R_64x4_3_1=37,
    R_64x4_4_0=25, R_64x4_4_1=33,
    R_64x4_5_0=46, R_64x4_5_1=12,
    R_64x4_6_0=58, R_64x4_6_1=22,
    R_64x4_7_0=32, R_64x4_7_1=32
};
78
/* Rotation constants for the 2x64 variant.  R_64x2_<d>_0 is the left-rotate
   amount for rotation-table row d (0..7); there is a single word pair. */
enum r123_enum_threefry64x2 {
    /*
    // Output from skein_rot_search: (srs64_B64-X1000)
    // Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
    // Start: Tue Mar 1 10:07:48 2011
    // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
    */
    R_64x2_0_0=16,
    R_64x2_1_0=42,
    R_64x2_2_0=12,
    R_64x2_3_0=31,
    R_64x2_4_0=16,
    R_64x2_5_0=32,
    R_64x2_6_0=24,
    R_64x2_7_0=21
    /* 4 rounds: minHW = 4 [ 4 4 4 4 ]
    // 5 rounds: minHW = 8 [ 8 8 8 8 ]
    // 6 rounds: minHW = 16 [ 16 16 16 16 ]
    // 7 rounds: minHW = 32 [ 32 32 32 32 ]
    // 8 rounds: minHW = 64 [ 64 64 64 64 ]
    // 9 rounds: minHW = 64 [ 64 64 64 64 ]
    //10 rounds: minHW = 64 [ 64 64 64 64 ]
    //11 rounds: minHW = 64 [ 64 64 64 64 ] */
};
103
/* Rotation constants for the 4x32 variant.  R_32x4_<d>_<p> is the left-rotate
   amount for rotation-table row d (0..7) and word pair p (0 or 1). */
enum r123_enum_threefry32x4 {
    /* Output from skein_rot_search: (srs-B128-X5000.out)
    // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
    // Start: Mon Aug 24 22:41:36 2009
    // ...
    // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */
    R_32x4_0_0=10, R_32x4_0_1=26,
    R_32x4_1_0=11, R_32x4_1_1=21,
    R_32x4_2_0=13, R_32x4_2_1=27,
    R_32x4_3_0=23, R_32x4_3_1= 5,
    R_32x4_4_0= 6, R_32x4_4_1=20,
    R_32x4_5_0=17, R_32x4_5_1=11,
    R_32x4_6_0=25, R_32x4_6_1=10,
    R_32x4_7_0=18, R_32x4_7_1=20

    /* 4 rounds: minHW = 3 [ 3 3 3 3 ]
    // 5 rounds: minHW = 7 [ 7 7 7 7 ]
    // 6 rounds: minHW = 12 [ 13 12 13 12 ]
    // 7 rounds: minHW = 22 [ 22 23 22 23 ]
    // 8 rounds: minHW = 31 [ 31 31 31 31 ]
    // 9 rounds: minHW = 32 [ 32 32 32 32 ]
    //10 rounds: minHW = 32 [ 32 32 32 32 ]
    //11 rounds: minHW = 32 [ 32 32 32 32 ] */

};
129
/* Rotation constants for the 2x32 variant.  R_32x2_<d>_0 is the left-rotate
   amount for rotation-table row d (0..7); there is a single word pair. */
enum r123_enum_threefry32x2 {
    /* Output from skein_rot_search (srs32x2-X5000.out)
    // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
    // Start: Tue Jul 12 11:11:33 2011
    // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format */
    R_32x2_0_0=13,
    R_32x2_1_0=15,
    R_32x2_2_0=26,
    R_32x2_3_0= 6,
    R_32x2_4_0=17,
    R_32x2_5_0=29,
    R_32x2_6_0=16,
    R_32x2_7_0=24

    /* 4 rounds: minHW = 4 [ 4 4 4 4 ]
    // 5 rounds: minHW = 6 [ 6 8 6 8 ]
    // 6 rounds: minHW = 9 [ 9 12 9 12 ]
    // 7 rounds: minHW = 16 [ 16 24 16 24 ]
    // 8 rounds: minHW = 32 [ 32 32 32 32 ]
    // 9 rounds: minHW = 32 [ 32 32 32 32 ]
    //10 rounds: minHW = 32 [ 32 32 32 32 ]
    //11 rounds: minHW = 32 [ 32 32 32 32 ] */
};
153
/* Named word counts.  Presumably the number of words in a 2-word and a 4-word
   block respectively; nothing in the visible part of this file uses them. */
enum r123_enum_threefry_wcnt {
    WCNT2=2,
    WCNT4=4
};
158
#if R123_USE_64BIT
/* Rotate the 64-bit word x left by N bit positions.
   Both shift counts are masked to the range 0..63, so neither shift can reach
   the operand width; N equal to 0 (or any multiple of 64) returns x unchanged. */
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
{
    return (x << (N & 63)) | (x >> ((64-N) & 63));
}
#endif
166
167R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
168R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
169{
170 return (x << (N & 31)) | (x >> ((32-N) & 31));
171}
172
/* Assemble a 64-bit constant from its high and low 32-bit halves. */
#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
/* Key-schedule parity constants: the starting value of the extra key-schedule
   word, into which every user key word is then XORed. */
#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
#define SKEIN_KS_PARITY32 0x1BD11BDA
176
/* Default round counts used by the fixed-round entry points.  Each one can be
   overridden by defining the macro before this header is included. */
#ifndef THREEFRY2x32_DEFAULT_ROUNDS
#define THREEFRY2x32_DEFAULT_ROUNDS 20
#endif

#ifndef THREEFRY2x64_DEFAULT_ROUNDS
#define THREEFRY2x64_DEFAULT_ROUNDS 20
#endif

#ifndef THREEFRY4x32_DEFAULT_ROUNDS
#define THREEFRY4x32_DEFAULT_ROUNDS 20
#endif

#ifndef THREEFRY4x64_DEFAULT_ROUNDS
#define THREEFRY4x64_DEFAULT_ROUNDS 20
#endif
194
/*
 * Building blocks for _threefry2x_tpl.  They are implementation details, not
 * API: they expand inside the body of threefry2x<W>_R and refer to its
 * locals (Nrounds, X0, X1) by name.
 */

/* Zero-based round n, executed only when Nrounds > n: mix X0 and X1 using
   row d (0..7) of the R_<W>x2 rotation table. */
#define R123_TF2X_MIX(W, n, d) \
    if(Nrounds>(n)){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_##d##_0); X1 ^= X0; }

/* InjectKey(r), executed only when Nrounds > n: add key-schedule words ka, kb
   to X0, X1 and add the injection number r to the last word (X1). */
#define R123_TF2X_INJECT(n, r, ka, kb) \
    if(Nrounds>(n)){ X0 += ka; X1 += kb; X1 += r; }

/* Rounds base..base+3 with rotation rows 0..3, followed by InjectKey(r). */
#define R123_TF2X_ROUNDS_0123(W, base, r, ka, kb) \
    R123_TF2X_MIX(W, (base), 0) \
    R123_TF2X_MIX(W, (base)+1, 1) \
    R123_TF2X_MIX(W, (base)+2, 2) \
    R123_TF2X_MIX(W, (base)+3, 3) \
    R123_TF2X_INJECT((base)+3, r, ka, kb)

/* Rounds base..base+3 with rotation rows 4..7, followed by InjectKey(r). */
#define R123_TF2X_ROUNDS_4567(W, base, r, ka, kb) \
    R123_TF2X_MIX(W, (base), 4) \
    R123_TF2X_MIX(W, (base)+1, 5) \
    R123_TF2X_MIX(W, (base)+2, 6) \
    R123_TF2X_MIX(W, (base)+3, 7) \
    R123_TF2X_INJECT((base)+3, r, ka, kb)

/*
 * _threefry2x_tpl(W) generates the two-word API for word width W:
 *   - threefry2x<W>_ctr_t, _key_t, _ukey_t: typedefs of struct r123array2x<W>.
 *   - threefry2x<W>keyinit(uk): returns the user key unchanged.
 *   - threefry2x<W>_R(Nrounds, in, k): adds the key to the counter, then runs
 *     the first Nrounds mixing rounds (Nrounds <= 32 is asserted).  After every
 *     fourth round that is actually executed, a rotated selection of the
 *     key-schedule words ks0..ks2 plus the injection number is added.
 *   - threefry2x<W>_rounds: enum constant equal to THREEFRY2x<W>_DEFAULT_ROUNDS.
 *   - threefry2x<W>(in, k): threefry2x<W>_R with the default round count.
 */
#define _threefry2x_tpl(W) \
typedef struct r123array2x##W threefry2x##W##_ctr_t; \
typedef struct r123array2x##W threefry2x##W##_key_t; \
typedef struct r123array2x##W threefry2x##W##_ukey_t; \
R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
    uint##W##_t X0,X1; \
    uint##W##_t ks0, ks1, ks2; \
    R123_ASSERT(Nrounds<=32); \
    /* ks2 = parity constant XOR all key words; X = counter + key */ \
    ks2 = SKEIN_KS_PARITY##W; \
    ks0 = k.v[0]; \
    X0 = in.v[0] + ks0; \
    ks2 ^= ks0; \
    \
    ks1 = k.v[1]; \
    X1 = in.v[1] + ks1; \
    ks2 ^= ks1; \
    \
    /*                  (W, first round, r, subkey words) */ \
    R123_TF2X_ROUNDS_0123(W,  0, 1, ks1, ks2) \
    R123_TF2X_ROUNDS_4567(W,  4, 2, ks2, ks0) \
    R123_TF2X_ROUNDS_0123(W,  8, 3, ks0, ks1) \
    R123_TF2X_ROUNDS_4567(W, 12, 4, ks1, ks2) \
    R123_TF2X_ROUNDS_0123(W, 16, 5, ks2, ks0) \
    R123_TF2X_ROUNDS_4567(W, 20, 6, ks0, ks1) \
    R123_TF2X_ROUNDS_0123(W, 24, 7, ks1, ks2) \
    R123_TF2X_ROUNDS_4567(W, 28, 8, ks2, ks0) \
    threefry2x##W##_ctr_t ret={{X0, X1}}; \
    return ret; \
} \
 \
enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
    return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \
}
297
298
/*
 * Building blocks for _threefry4x_tpl.  They are implementation details, not
 * API: they expand inside the body of threefry4x<W>_R and refer to its
 * locals (Nrounds, X0..X3) by name.
 */

/* Zero-based round n, executed only when Nrounds > n: mix the pairs (X0,X1)
   and (X2,X3) using row d (0..7) of the R_<W>x4 rotation table. */
#define R123_TF4X_MIX_EVEN(W, n, d) \
    if(Nrounds>(n)){ \
        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_##d##_0); X1 ^= X0; \
        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_##d##_1); X3 ^= X2; \
    }

/* Zero-based round n, executed only when Nrounds > n: mix the pairs (X0,X3)
   and (X2,X1) using row d (0..7) of the R_<W>x4 rotation table. */
#define R123_TF4X_MIX_ODD(W, n, d) \
    if(Nrounds>(n)){ \
        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_##d##_0); X3 ^= X0; \
        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_##d##_1); X1 ^= X2; \
    }

/* InjectKey(r), executed only when Nrounds > n: add key-schedule words
   ka..kd to X0..X3 and add the injection number r to the last word (X3). */
#define R123_TF4X_INJECT(n, r, ka, kb, kc, kd) \
    if(Nrounds>(n)){ \
        X0 += ka; X1 += kb; X2 += kc; X3 += kd; \
        X3 += r; \
    }

/* Rounds base..base+3 with rotation rows 0..3, followed by InjectKey(r). */
#define R123_TF4X_ROUNDS_0123(W, base, r, ka, kb, kc, kd) \
    R123_TF4X_MIX_EVEN(W, (base), 0) \
    R123_TF4X_MIX_ODD(W, (base)+1, 1) \
    R123_TF4X_MIX_EVEN(W, (base)+2, 2) \
    R123_TF4X_MIX_ODD(W, (base)+3, 3) \
    R123_TF4X_INJECT((base)+3, r, ka, kb, kc, kd)

/* Rounds base..base+3 with rotation rows 4..7, followed by InjectKey(r). */
#define R123_TF4X_ROUNDS_4567(W, base, r, ka, kb, kc, kd) \
    R123_TF4X_MIX_EVEN(W, (base), 4) \
    R123_TF4X_MIX_ODD(W, (base)+1, 5) \
    R123_TF4X_MIX_EVEN(W, (base)+2, 6) \
    R123_TF4X_MIX_ODD(W, (base)+3, 7) \
    R123_TF4X_INJECT((base)+3, r, ka, kb, kc, kd)

/*
 * _threefry4x_tpl(W) generates the four-word API for word width W:
 *   - threefry4x<W>_ctr_t, _key_t, _ukey_t: typedefs of struct r123array4x<W>.
 *   - threefry4x<W>keyinit(uk): returns the user key unchanged.
 *   - threefry4x<W>_R(Nrounds, in, k): adds the key to the counter, then runs
 *     the first Nrounds mixing rounds (Nrounds <= 72 is asserted).  After every
 *     fourth round that is actually executed, InjectKey(r) adds key-schedule
 *     words ks[(r+i) mod 5] to X<i> and adds r to X3, with r = 1, 2, ..., 18.
 *   - threefry4x<W>_rounds: enum constant equal to THREEFRY4x<W>_DEFAULT_ROUNDS.
 *   - threefry4x<W>(in, k): threefry4x<W>_R with the default round count.
 */
#define _threefry4x_tpl(W) \
typedef struct r123array4x##W threefry4x##W##_ctr_t; \
typedef struct r123array4x##W threefry4x##W##_key_t; \
typedef struct r123array4x##W threefry4x##W##_ukey_t; \
R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
    uint##W##_t X0, X1, X2, X3; \
    uint##W##_t ks0, ks1, ks2, ks3, ks4; \
    R123_ASSERT(Nrounds<=72); \
    /* ks4 = parity constant XOR all key words; X = counter + key */ \
    ks4 = SKEIN_KS_PARITY##W; \
    ks0 = k.v[0]; \
    X0 = in.v[0] + ks0; \
    ks4 ^= ks0; \
    \
    ks1 = k.v[1]; \
    X1 = in.v[1] + ks1; \
    ks4 ^= ks1; \
    \
    ks2 = k.v[2]; \
    X2 = in.v[2] + ks2; \
    ks4 ^= ks2; \
    \
    ks3 = k.v[3]; \
    X3 = in.v[3] + ks3; \
    ks4 ^= ks3; \
    \
    /*                  (W, first round,  r, subkey words) */ \
    R123_TF4X_ROUNDS_0123(W,  0,  1, ks1, ks2, ks3, ks4) \
    R123_TF4X_ROUNDS_4567(W,  4,  2, ks2, ks3, ks4, ks0) \
    R123_TF4X_ROUNDS_0123(W,  8,  3, ks3, ks4, ks0, ks1) \
    R123_TF4X_ROUNDS_4567(W, 12,  4, ks4, ks0, ks1, ks2) \
    R123_TF4X_ROUNDS_0123(W, 16,  5, ks0, ks1, ks2, ks3) \
    R123_TF4X_ROUNDS_4567(W, 20,  6, ks1, ks2, ks3, ks4) \
    R123_TF4X_ROUNDS_0123(W, 24,  7, ks2, ks3, ks4, ks0) \
    R123_TF4X_ROUNDS_4567(W, 28,  8, ks3, ks4, ks0, ks1) \
    R123_TF4X_ROUNDS_0123(W, 32,  9, ks4, ks0, ks1, ks2) \
    R123_TF4X_ROUNDS_4567(W, 36, 10, ks0, ks1, ks2, ks3) \
    R123_TF4X_ROUNDS_0123(W, 40, 11, ks1, ks2, ks3, ks4) \
    R123_TF4X_ROUNDS_4567(W, 44, 12, ks2, ks3, ks4, ks0) \
    R123_TF4X_ROUNDS_0123(W, 48, 13, ks3, ks4, ks0, ks1) \
    R123_TF4X_ROUNDS_4567(W, 52, 14, ks4, ks0, ks1, ks2) \
    R123_TF4X_ROUNDS_0123(W, 56, 15, ks0, ks1, ks2, ks3) \
    R123_TF4X_ROUNDS_4567(W, 60, 16, ks1, ks2, ks3, ks4) \
    R123_TF4X_ROUNDS_0123(W, 64, 17, ks2, ks3, ks4, ks0) \
    R123_TF4X_ROUNDS_4567(W, 68, 18, ks3, ks4, ks0, ks1) \
    \
    threefry4x##W##_ctr_t ret = {{X0, X1, X2, X3}}; \
    return ret; \
} \
 \
 \
enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
    return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \
}
734
735#if R123_USE_64BIT
738#endif
741
742/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
743 than a static inline function. Why? */
/* Function-like macros that take over the plain threefryNxW(c,k) spelling and
   forward to the matching _R function with that variant's default round count. */
#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
748
749#if defined(__cplusplus)
/*
 * _threefryNxWclass_tpl(NxW) generates, inside namespace r123:
 *   - template<unsigned int ROUNDS> struct Threefry<NxW>_R, a functor whose
 *     operator()(ctr, key) returns threefry<NxW>_R(ROUNDS, ctr, key) and which
 *     exposes ctr_type, key_type, ukey_type and the constant `rounds`;
 *   - typedef Threefry<NxW>, the functor instantiated with threefry<NxW>_rounds.
 *
 * NOTE(review): the static assert accepts ROUNDS up to 72 for every NxW, but
 * threefry2x<W>_R asserts Nrounds<=32.  Confirm whether the 2x variants
 * should be limited to 32 here as well.
 */
#define _threefryNxWclass_tpl(NxW) \
namespace r123{ \
template<unsigned int ROUNDS> \
 struct Threefry##NxW##_R{ \
    typedef threefry##NxW##_ctr_t ctr_type; \
    typedef threefry##NxW##_key_t key_type; \
    typedef threefry##NxW##_key_t ukey_type; \
    static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \
    inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
        R123_STATIC_ASSERT(ROUNDS<=72, "threefry is only unrolled up to 72 rounds\n"); \
        return threefry##NxW##_R(ROUNDS, ctr, key); \
    } \
}; \
 typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \
} // namespace r123
765
768#if R123_USE_64BIT
771#endif
772
773/* The _tpl macros don't quite work to do string-pasting inside comments,
774 so we just write out the boilerplate documentation four times... */
775
872#endif
873
874#endif
#define _threefry4x_tpl(W)
Definition: threefry.h:299
#define _threefry2x_tpl(W)
Definition: threefry.h:195
#define _threefryNxWclass_tpl(NxW)
Definition: threefry.h:750