Random123
philox.h
Go to the documentation of this file.
1/*
2Copyright 2010-2011, D. E. Shaw Research.
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are
7met:
8
9* Redistributions of source code must retain the above copyright
10 notice, this list of conditions, and the following disclaimer.
11
12* Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions, and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15
16* Neither the name of D. E. Shaw Research nor the names of its
17 contributors may be used to endorse or promote products derived from
18 this software without specific prior written permission.
19
20THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31*/
32#ifndef _philox_dot_h_
33#define _philox_dot_h_
34
38#include "array.h"
39
40
41/*
42// Macros _Foo_tpl are code generation 'templates' They define
43// inline functions with names obtained by mangling Foo and the
44// macro arguments. E.g.,
45// _mulhilo_tpl(32, uint32_t, uint64_t)
46// expands to a definition of:
47// mulhilo32(uint32_t, uint32_t, uint32_t *, uint32_t *)
48// We then 'instantiate the template' to define
49// several different functions, e.g.,
50// mulhilo32
51// mulhilo64
52// These functions will be visible to user code, and may
53// also be used later in subsequent templates and definitions.
54
55// A template for mulhilo using a temporary of twice the word-width.
56// Gcc figures out that this can be reduced to a single 'mul' instruction,
57// despite the apparent use of double-wide variables, shifts, etc. It's
58// obviously not guaranteed that all compilers will be that smart, so
59// other implementations might be preferable, e.g., using an intrinsic
60// or an asm block. On the other hand, for 32-bit multiplies,
61// this *is* perfectly standard C99 - any C99 compiler should
62// understand it and produce correct code. For 64-bit multiplies,
63// it's only usable if the compiler recognizes that it can do
64// arithmetic on a 128-bit type. That happens to be true for gcc on
65// x86-64, and powerpc64 but not much else.
66*/
67#define _mulhilo_dword_tpl(W, Word, Dword) \
68R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
69 Dword product = ((Dword)a)*((Dword)b); \
70 *hip = product>>W; \
71 return (Word)product; \
72}
73
74/*
75// A template for mulhilo using gnu-style asm syntax.
76// INSN can be "mulw", "mull" or "mulq".
77// FIXME - porting to other architectures, we'll need still-more conditional
78// branching here. Note that intrinsics are usually preferable.
79*/
80#ifdef __powerpc__
81#define _mulhilo_asm_tpl(W, Word, INSN) \
82R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
83 Word dx = 0; \
84 __asm__("\n\t" \
85 INSN " %0,%1,%2\n\t" \
86 : "=r"(dx) \
87 : "r"(b), "r"(ax) \
88 ); \
89 *hip = dx; \
90 return ax*b; \
91}
92#else
93#define _mulhilo_asm_tpl(W, Word, INSN) \
94R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
95 Word dx; \
96 __asm__("\n\t" \
97 INSN " %2\n\t" \
98 : "=a"(ax), "=d"(dx) \
99 : "r"(b), "0"(ax) \
100 ); \
101 *hip = dx; \
102 return ax; \
103}
104#endif /* __powerpc__ */
105
106/*
107// A template for mulhilo using MSVC-style intrinsics
108// For example,_umul128 is an msvc intrinsic, c.f.
109// http://msdn.microsoft.com/en-us/library/3dayytw9.aspx
110*/
111#define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \
112R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
113 return INTRIN(a, b, hip); \
114}
115
116/* N.B. This really should be called _mulhilo_mulhi_intrin. It just
117 happens that CUDA was the first time we used the idiom. */
118#define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \
119R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, R123_METAL_THREAD_ADDRESS_SPACE Word* hip){ \
120 *hip = INTRIN(a, b); \
121 return a*b; \
122}
123
124/*
125// A template for mulhilo using only word-size operations and
126// C99 operators (no adc, no mulhi). It
127// requires four multiplies and a dozen or so shifts, adds
128// and tests. It's *SLOW*. It can be used to
129// implement philoxNx32 on platforms that completely lack
130// 64-bit types, e.g., Metal.
131// On 32-bit platforms, it could be used to
132// implement philoxNx64, but on such platforms both the philoxNx32
133// and the threefryNx64 cbrngs are going to have much better
134// performance. It is enabled below by R123_USE_MULHILO64_C99,
135// but that is currently (Feb 2019) only set by
136// features/metalfeatures.h headers. It can, of course, be
137// set with a compile-time -D option.
138*/
139#define _mulhilo_c99_tpl(W, Word) \
140R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, R123_METAL_THREAD_ADDRESS_SPACE Word *hip){ \
141 const unsigned WHALF = W/2; \
142 const Word LOMASK = ((((Word)1)<<WHALF)-1); \
143 Word lo = a*b; /* full low multiply */ \
144 Word ahi = a>>WHALF; \
145 Word alo = a& LOMASK; \
146 Word bhi = b>>WHALF; \
147 Word blo = b& LOMASK; \
148 \
149 Word ahbl = ahi*blo; \
150 Word albh = alo*bhi; \
151 \
152 Word ahbl_albh = ((ahbl&LOMASK) + (albh&LOMASK)); \
153 Word hi = ahi*bhi + (ahbl>>WHALF) + (albh>>WHALF); \
154 hi += ahbl_albh >> WHALF; /* carry from the sum of lo(ahbl) + lo(albh) ) */ \
155 /* carry from the sum with alo*blo */ \
156 hi += ((lo >> WHALF) < (ahbl_albh&LOMASK)); \
157 *hip = hi; \
158 return lo; \
159}
160
161/*
162// A template for mulhilo on a platform that can't do it
163// We could put a C version here, but is it better to run *VERY*
164// slowly or to just stop and force the user to find another CBRNG?
165*/
166#define _mulhilo_fail_tpl(W, Word) \
167R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \
168 R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \
169}
170
171/*
172// N.B. There's an MSVC intrinsic called _emul,
173// which *might* compile into better code than
174// _mulhilo_dword_tpl
175*/
176#if R123_USE_MULHILO32_ASM
177#ifdef __powerpc__
178_mulhilo_asm_tpl(32, uint32_t, "mulhwu")
179#else
180_mulhilo_asm_tpl(32, uint32_t, "mull")
181#endif /* __powerpc__ */
182#else
183#if R123_USE_64BIT
184_mulhilo_dword_tpl(32, uint32_t, uint64_t)
185#elif R123_USE_MULHILO32_MULHI_INTRIN
186_mulhilo_cuda_intrin_tpl(32, uint32_t, R123_MULHILO32_MULHI_INTRIN)
187#else
188_mulhilo_c99_tpl(32, uint32_t)
189#endif
190#endif
191
192#if R123_USE_PHILOX_64BIT
193#if R123_USE_MULHILO64_ASM
194#ifdef __powerpc64__
195_mulhilo_asm_tpl(64, uint64_t, "mulhdu")
196#else
197_mulhilo_asm_tpl(64, uint64_t, "mulq")
198#endif /* __powerpc64__ */
199#elif R123_USE_MULHILO64_MSVC_INTRIN
200_mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128)
201#elif R123_USE_MULHILO64_CUDA_INTRIN
202_mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi)
203#elif R123_USE_MULHILO64_OPENCL_INTRIN
204_mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi)
205#elif R123_USE_MULHILO64_MULHI_INTRIN
206_mulhilo_cuda_intrin_tpl(64, uint64_t, R123_MULHILO64_MULHI_INTRIN)
207#elif R123_USE_GNU_UINT128
208_mulhilo_dword_tpl(64, uint64_t, __uint128_t)
209#elif R123_USE_MULHILO64_C99
210_mulhilo_c99_tpl(64, uint64_t)
211#else
212_mulhilo_fail_tpl(64, uint64_t)
213#endif
214#endif
215
216/*
217// The multipliers and Weyl constants are "hard coded".
218// To change them, you can #define them with different
219// values before #include-ing this file.
220// This isn't terribly elegant, but it works for C as
221// well as C++. A nice C++-only solution would be to
222// use template parameters in the style of <random>
223*/
224#ifndef PHILOX_M2x64_0
225#define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93)
226#endif
227
228#ifndef PHILOX_M4x64_0
229#define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93)
230#endif
231
232#ifndef PHILOX_M4x64_1
233#define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157)
234#endif
235
236#ifndef PHILOX_M2x32_0
237#define PHILOX_M2x32_0 ((uint32_t)0xd256d193)
238#endif
239
240#ifndef PHILOX_M4x32_0
241#define PHILOX_M4x32_0 ((uint32_t)0xD2511F53)
242#endif
243#ifndef PHILOX_M4x32_1
244#define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57)
245#endif
246
247#ifndef PHILOX_W64_0
248#define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15) /* golden ratio */
249#endif
250#ifndef PHILOX_W64_1
251#define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B) /* sqrt(3)-1 */
252#endif
253
254#ifndef PHILOX_W32_0
255#define PHILOX_W32_0 ((uint32_t)0x9E3779B9)
256#endif
257#ifndef PHILOX_W32_1
258#define PHILOX_W32_1 ((uint32_t)0xBB67AE85)
259#endif
260
262#ifndef PHILOX2x32_DEFAULT_ROUNDS
263#define PHILOX2x32_DEFAULT_ROUNDS 10
264#endif
265
266#ifndef PHILOX2x64_DEFAULT_ROUNDS
267#define PHILOX2x64_DEFAULT_ROUNDS 10
268#endif
269
270#ifndef PHILOX4x32_DEFAULT_ROUNDS
271#define PHILOX4x32_DEFAULT_ROUNDS 10
272#endif
273
274#ifndef PHILOX4x64_DEFAULT_ROUNDS
275#define PHILOX4x64_DEFAULT_ROUNDS 10
276#endif
279/* The ignored fourth argument allows us to instantiate the
280 same macro regardless of N. */
281#define _philox2xWround_tpl(W, T) \
282R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \
283R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \
284 T hi; \
285 T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi); \
286 struct r123array2x##W out = {{hi^key.v[0]^ctr.v[1], lo}}; \
287 return out; \
288}
289#define _philox2xWbumpkey_tpl(W) \
290R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \
291 key.v[0] += PHILOX_W##W##_0; \
292 return key; \
293}
294
295#define _philox4xWround_tpl(W, T) \
296R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \
297R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \
298 T hi0; \
299 T hi1; \
300 T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0); \
301 T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1); \
302 struct r123array4x##W out = {{hi1^ctr.v[1]^key.v[0], lo1, \
303 hi0^ctr.v[3]^key.v[1], lo0}}; \
304 return out; \
305}
306
307#define _philox4xWbumpkey_tpl(W) \
308R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \
309 key.v[0] += PHILOX_W##W##_0; \
310 key.v[1] += PHILOX_W##W##_1; \
311 return key; \
312}
313
315#define _philoxNxW_tpl(N, Nhalf, W, T) \
316 \
317enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \
318typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \
319typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \
320typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \
321R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \
322R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \
323R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \
324 R123_ASSERT(R<=16); \
325 if(R>0){ ctr = _philox##N##x##W##round(ctr, key); } \
326 if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
327 if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
328 if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
329 if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
330 if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
331 if(R>6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
332 if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
333 if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
334 if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
335 if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
336 if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
337 if(R>12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
338 if(R>13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
339 if(R>14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
340 if(R>15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
341 return ctr; \
342}
343
345_philox4xWbumpkey_tpl(32)
346_philox2xWround_tpl(32, uint32_t) /* philox2x32round */
347_philox4xWround_tpl(32, uint32_t) /* philo4x32round */
348
349_philoxNxW_tpl(2, 1, 32, uint32_t) /* philox2x32bijection */
350_philoxNxW_tpl(4, 2, 32, uint32_t) /* philox4x32bijection */
351#if R123_USE_PHILOX_64BIT
354_philox4xWbumpkey_tpl(64)
355_philox2xWround_tpl(64, uint64_t) /* philo2x64round */
356_philox4xWround_tpl(64, uint64_t) /* philo4x64round */
358_philoxNxW_tpl(2, 1, 64, uint64_t) /* philox2x64bijection */
359_philoxNxW_tpl(4, 2, 64, uint64_t) /* philox4x64bijection */
360#endif /* R123_USE_PHILOX_64BIT */
361
362#define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k)
363#define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k)
364#if R123_USE_PHILOX_64BIT
365#define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k)
366#define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k)
367#endif /* R123_USE_PHILOX_64BIT */
368
369#if defined(__cplusplus)
370
371#define _PhiloxNxW_base_tpl(CType, KType, N, W) \
372namespace r123{ \
373template<unsigned int ROUNDS> \
374struct Philox##N##x##W##_R{ \
375 typedef CType ctr_type; \
376 typedef KType key_type; \
377 typedef KType ukey_type; \
378 static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \
379 inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \
380 R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \
381 return philox##N##x##W##_R(ROUNDS, ctr, key); \
382 } \
383}; \
384typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; \
385 } // namespace r123
386
389#if R123_USE_PHILOX_64BIT
392#endif
393
394/* The _tpl macros don't quite work to do string-pasting inside comments.
395 so we just write out the boilerplate documentation four times... */
396
491#endif /* __cplusplus */
492
493#endif /* _philox_dot_h_ */
uint32_t _philox4xWround_tpl(32, uint32_t) enum r123_enum_philox2x32
Definition: philox.h:347
#define _PhiloxNxW_base_tpl(CType, KType, N, W)
Definition: philox.h:371
_philox2xWbumpkey_tpl(32) _philox4xWbumpkey_tpl(32) _philox2xWround_tpl(32
#define _philoxNxW_tpl(N, Nhalf, W, T)
Definition: philox.h:315
Definition: array.h:314
Definition: array.h:320
Definition: array.h:315
Definition: array.h:321
Definition: array.h:316
Definition: array.h:322