Crypto++ 8.2
Free C++ class library of cryptographic schemes
rdrand.s
1;; rdrand.asm - written and placed in public domain by Jeffrey Walton and Uri Blumenthal.
2;; Copyright assigned to the Crypto++ project.
3
4;; This ASM file provides RDRAND and RDSEED to downlevel Unix and Linux tool
5;; chains. You will need a modern Nasm, however. You can also use it in place
6;; of intrinsics. The routines below run a little faster than the intrinsic
7;; based routines.
8
9;; nasm -f elf32 rdrand.s -DX86 -g -o rdrand-x86.o
10;; nasm -f elfx32 rdrand.s -DX32 -g -o rdrand-x32.o
11;; nasm -f elf64 rdrand.s -DX64 -g -o rdrand-x64.o
12
13;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
14;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
15
16;; C/C++ Function prototypes
17;; X86, X32 and X64:
18;; extern "C" void NASM_RDRAND_GenerateBlock(byte* ptr, size_t size);
19;; extern "C" void NASM_RDSEED_GenerateBlock(byte* ptr, size_t size);
20
21;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
22;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
23
;; Architecture selection. Exactly one of X86, X32 or X64 is expected on the
;; NASM command line (-DX86 / -DX32 / -DX64). Each branch provides the aliases
;; used by the routines below:
;;   buffer - register holding the output pointer
;;   bsize  - register holding the number of bytes still to be written
;;   lsize  - narrow view of bsize, used once only a small tail remains
;;   MWSIZE - machine word size in bytes

%ifdef X86                              ;; Set via the command line
  %define arg1      [esp+04h]           ;; first argument, read from the stack
  %define arg2      [esp+08h]           ;; second argument, read from the stack
  %define buffer    ecx
  %define bsize     edx
  %define lsize     dl                  ;; Used for tail bytes, 1-byte constants
  %define MWSIZE    04h                 ;; machine word size

%elifdef X32                            ;; Set via the command line
  %define buffer    edi                 ;; Linux ABI
  %define bsize     esi                 ;; Linux ABI
  %define lsize     si                  ;; Used for tail bytes, 2-byte constants
  %define MWSIZE    04h                 ;; machine word size

%elifdef X64                            ;; Set via the command line
  %ifdef CYGWIN                         ;; Cygwin follows Windows ABI here, not Linux ABI
    %define buffer  rcx                 ;; Windows ABI
    %define bsize   rdx                 ;; Windows ABI
    %define lsize   dx                  ;; Used for tail bytes, 2-byte constants
  %else
    %define buffer  rdi                 ;; Linux ABI
    %define bsize   rsi                 ;; Linux ABI
    %define lsize   si                  ;; Used for tail bytes, 2-byte constants
  %endif
  %define MWSIZE    08h                 ;; machine word size

%else
  %error Missing or unknown architecture
%endif
53
54;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
55;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
56
57;; Fixups
58
;; Some targets export the two entry points with a leading underscore.
;; Redirect the names used by the `global` and label lines accordingly.

%ifdef DARWIN
  %define NASM_RDRAND_GenerateBlock _NASM_RDRAND_GenerateBlock
  %define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock
%endif

;; Under Cygwin the underscore is applied for the X86 build only.
%ifdef CYGWIN
  %ifdef X86
    %define NASM_RDRAND_GenerateBlock _NASM_RDRAND_GenerateBlock
    %define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock
  %endif
%endif
70
71;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
72;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
73
%ifdef X86    ;; Set via the command line

;; -------------------------------------------------------------------
;; extern "C" void NASM_RDRAND_GenerateBlock(byte* ptr, size_t size);
;;
;; Fills `size` bytes starting at `ptr` with RDRAND output.
;;   In:       arg1 = ptr, arg2 = size (both read from the stack)
;;   Out:      eax = 0
;;   Modifies: eax, buffer, bsize, flags
;; Every RDRAND is retried until the instruction reports success (CF=1).
;; -------------------------------------------------------------------

;; RDRAND_STORE_EAX offset
;; Spin on RDRAND until it succeeds, then store EAX at [buffer+offset].
%macro RDRAND_STORE_EAX 1
%%retry:
        rdrand  eax
        jnc     %%retry
        mov     [buffer+%1], eax
%endmacro

global NASM_RDRAND_GenerateBlock
section .text
align 8

NASM_RDRAND_GenerateBlock:

        ;; Pull both arguments off the stack into their working registers
        mov     buffer, arg1
        mov     bsize, arg2

        ;; A block of 16-bytes appears to be optimal. Adding
        ;; more rdrand calls degrades performance.
        cmp     bsize, 16
        jb      .tail_words

.block_16:
        ;; Four 4-byte draws per pass
        RDRAND_STORE_EAX 0
        RDRAND_STORE_EAX 4
        RDRAND_STORE_EAX 8
        RDRAND_STORE_EAX 12

        sub     bsize, 16
        add     buffer, 16

        cmp     bsize, 16
        jae     .block_16

        ;; bsize < 16 from here on, so its low byte (lsize) holds the count
.tail_words:
        test    lsize, lsize
        jz      .finished

.tail_retry:
        rdrand  eax
        jnc     .tail_retry

        cmp     lsize, MWSIZE
        jb      .tail_bytes

        ;; A whole machine word still fits
        mov     [buffer], eax
        add     buffer, MWSIZE
        sub     lsize, MWSIZE
        jmp     .tail_words

        ;; 1, 2 or 3 bytes remain; EAX already holds fresh random bits
.tail_bytes:
        ;; Bit 1 of the count set: emit two bytes
        test    lsize, 2
        jz      .skip_two

        mov     [buffer], ax
        shr     eax, 16                 ;; expose two unused random bytes
        add     buffer, 2

.skip_two:
        ;; Bit 0 of the count set: emit the final byte
        test    lsize, 1
        jz      .finished

        mov     [buffer], al

.finished:
        xor     eax, eax
        ret

%endif    ;; X86
172
173;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
174;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
175
%ifdef X64 or X32    ;; Set via the command line
;; NOTE(review): this guard relies on %ifdef accepting several identifiers and
;; being true when any one of them is defined - confirm with the NASM in use.

;; -------------------------------------------------------------------
;; extern "C" void NASM_RDRAND_GenerateBlock(byte* ptr, size_t size);
;;
;; Fills `size` bytes starting at `ptr` with RDRAND output.
;;   In:       buffer = ptr, bsize = size. Both are register aliases chosen
;;             by the architecture %defines; the arguments arrive in
;;             registers, so nothing is loaded from the stack.
;;   Out:      rax = 0
;;   Modifies: rax, buffer, bsize, flags
;; Every RDRAND is retried until the instruction reports success (CF=1).
;; -------------------------------------------------------------------

;; Number of bytes produced by one `rdrand rax`. The tail loop must compare
;; and step with this value rather than MWSIZE: MWSIZE is 04h for X32, and
;; storing all 8 bytes of RAX whenever 4 or more bytes remain writes past the
;; end of the caller's buffer when 4..7 bytes are left.
%define QWSIZE 08h

global NASM_RDRAND_GenerateBlock
section .text
align 16

NASM_RDRAND_GenerateBlock:

        ;; A block of 32-bytes appears to be optimal. Adding
        ;; more rdrand calls degrades performance.
        cmp     bsize, 32
        jb      .GenerateBlock_8

.GenerateBlock_32:

.Call_RDRAND_RAX_4:
        rdrand  rax
        jnc     .Call_RDRAND_RAX_4
        mov     [buffer+0], rax

.Call_RDRAND_RAX_3:
        rdrand  rax
        jnc     .Call_RDRAND_RAX_3
        mov     [buffer+8], rax

.Call_RDRAND_RAX_2:
        rdrand  rax
        jnc     .Call_RDRAND_RAX_2
        mov     [buffer+16], rax

.Call_RDRAND_RAX_1:
        rdrand  rax
        jnc     .Call_RDRAND_RAX_1
        mov     [buffer+24], rax

        sub     bsize, 32
        add     buffer, 32

        cmp     bsize, 32
        jae     .GenerateBlock_32

        ;; Fewer than 32 bytes remain, so the low 16 bits of bsize (lsize)
        ;; hold the complete count from here on.
.GenerateBlock_8:

        test    lsize, lsize
        jz      .GenerateBlock_Return

.Call_RDRAND_RAX_0:
        rdrand  rax
        jnc     .Call_RDRAND_RAX_0

        ;; Only store the full register when all 8 bytes fit
        cmp     lsize, QWSIZE
        jb      .Partial_Machine_Word

.Full_Machine_Word:

        mov     [buffer], rax
        add     buffer, QWSIZE
        sub     lsize, QWSIZE

        ;; Continue
        jmp     .GenerateBlock_8

        ;; 1,2,3,4,5,6,7 bytes remain; RAX already holds fresh random bits
.Partial_Machine_Word:

        ;; Test bit 2 to see if size is at least 4
        test    lsize, 4
        jz      .Bit_2_Not_Set

        mov     [buffer], eax
        shr     rax, 32                 ;; expose four unused random bytes
        add     buffer, 4

.Bit_2_Not_Set:

        ;; Test bit 1 to see if size is at least 2
        test    lsize, 2
        jz      .Bit_1_Not_Set

        mov     [buffer], ax
        shr     eax, 16                 ;; expose two unused random bytes
        add     buffer, 2

.Bit_1_Not_Set:

        ;; Test bit 0 to see if size is at least 1
        test    lsize, 1
        jz      .Bit_0_Not_Set

        mov     [buffer], al

.Bit_0_Not_Set:

        ;; We've hit all the bits

.GenerateBlock_Return:

        xor     eax, eax                ;; writing EAX also clears the upper half of RAX
        ret

%endif    ;; X64 or X32
280
281;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
282;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
283
%ifdef X86    ;; Set via the command line

;; -------------------------------------------------------------------
;; extern "C" void NASM_RDSEED_GenerateBlock(byte* ptr, size_t size);
;;
;; Fills `size` bytes starting at `ptr` with RDSEED output.
;;   In:       arg1 = ptr, arg2 = size (both read from the stack)
;;   Out:      eax = 0
;;   Modifies: eax, buffer, bsize, flags
;; Every RDSEED is retried until the instruction reports success (CF=1).
;; -------------------------------------------------------------------

;; RDSEED_STORE_EAX offset
;; Spin on RDSEED until it succeeds, then store EAX at [buffer+offset].
%macro RDSEED_STORE_EAX 1
%%retry:
        rdseed  eax
        jnc     %%retry
        mov     [buffer+%1], eax
%endmacro

global NASM_RDSEED_GenerateBlock
section .text
align 8

NASM_RDSEED_GenerateBlock:

        ;; Pull both arguments off the stack into their working registers
        mov     buffer, arg1
        mov     bsize, arg2

        ;; A block of 16-bytes appears to be optimal. Adding
        ;; more rdseed calls degrades performance.
        cmp     bsize, 16
        jb      .tail_words

.block_16:
        ;; Four 4-byte draws per pass
        RDSEED_STORE_EAX 0
        RDSEED_STORE_EAX 4
        RDSEED_STORE_EAX 8
        RDSEED_STORE_EAX 12

        sub     bsize, 16
        add     buffer, 16

        cmp     bsize, 16
        jae     .block_16

        ;; bsize < 16 from here on, so its low byte (lsize) holds the count
.tail_words:
        test    lsize, lsize
        jz      .finished

.tail_retry:
        rdseed  eax
        jnc     .tail_retry

        cmp     lsize, MWSIZE
        jb      .tail_bytes

        ;; A whole machine word still fits
        mov     [buffer], eax
        add     buffer, MWSIZE
        sub     lsize, MWSIZE
        jmp     .tail_words

        ;; 1, 2 or 3 bytes remain; EAX already holds fresh random bits
.tail_bytes:
        ;; Bit 1 of the count set: emit two bytes
        test    lsize, 2
        jz      .skip_two

        mov     [buffer], ax
        shr     eax, 16                 ;; expose two unused random bytes
        add     buffer, 2

.skip_two:
        ;; Bit 0 of the count set: emit the final byte
        test    lsize, 1
        jz      .finished

        mov     [buffer], al

.finished:
        xor     eax, eax
        ret

%endif    ;; X86
382
383;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
384;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
385
%ifdef X64 or X32    ;; Set via the command line
;; NOTE(review): this guard relies on %ifdef accepting several identifiers and
;; being true when any one of them is defined - confirm with the NASM in use.

;; -------------------------------------------------------------------
;; extern "C" void NASM_RDSEED_GenerateBlock(byte* ptr, size_t size);
;;
;; Fills `size` bytes starting at `ptr` with RDSEED output.
;;   In:       buffer = ptr, bsize = size. Both are register aliases chosen
;;             by the architecture %defines; the arguments arrive in
;;             registers, so nothing is loaded from the stack.
;;   Out:      rax = 0
;;   Modifies: rax, buffer, bsize, flags
;; Every RDSEED is retried until the instruction reports success (CF=1).
;; -------------------------------------------------------------------

;; Number of bytes produced by one `rdseed rax`. The tail loop must compare
;; and step with this value rather than MWSIZE: MWSIZE is 04h for X32, and
;; storing all 8 bytes of RAX whenever 4 or more bytes remain writes past the
;; end of the caller's buffer when 4..7 bytes are left.
%define QWSIZE 08h

global NASM_RDSEED_GenerateBlock
section .text
align 16

NASM_RDSEED_GenerateBlock:

        ;; A block of 32-bytes appears to be optimal. Adding
        ;; more rdseed calls degrades performance.
        cmp     bsize, 32
        jb      .GenerateBlock_8

.GenerateBlock_32:

.Call_RDSEED_RAX_4:
        rdseed  rax
        jnc     .Call_RDSEED_RAX_4
        mov     [buffer+0], rax

.Call_RDSEED_RAX_3:
        rdseed  rax
        jnc     .Call_RDSEED_RAX_3
        mov     [buffer+8], rax

.Call_RDSEED_RAX_2:
        rdseed  rax
        jnc     .Call_RDSEED_RAX_2
        mov     [buffer+16], rax

.Call_RDSEED_RAX_1:
        rdseed  rax
        jnc     .Call_RDSEED_RAX_1
        mov     [buffer+24], rax

        sub     bsize, 32
        add     buffer, 32

        cmp     bsize, 32
        jae     .GenerateBlock_32

        ;; Fewer than 32 bytes remain, so the low 16 bits of bsize (lsize)
        ;; hold the complete count from here on.
.GenerateBlock_8:

        test    lsize, lsize
        jz      .GenerateBlock_Return

.Call_RDSEED_RAX_0:
        rdseed  rax
        jnc     .Call_RDSEED_RAX_0

        ;; Only store the full register when all 8 bytes fit
        cmp     lsize, QWSIZE
        jb      .Partial_Machine_Word

.Full_Machine_Word:

        mov     [buffer], rax
        add     buffer, QWSIZE
        sub     lsize, QWSIZE

        ;; Continue
        jmp     .GenerateBlock_8

        ;; 1,2,3,4,5,6,7 bytes remain; RAX already holds fresh random bits
.Partial_Machine_Word:

        ;; Test bit 2 to see if size is at least 4
        test    lsize, 4
        jz      .Bit_2_Not_Set

        mov     [buffer], eax
        shr     rax, 32                 ;; expose four unused random bytes
        add     buffer, 4

.Bit_2_Not_Set:

        ;; Test bit 1 to see if size is at least 2
        test    lsize, 2
        jz      .Bit_1_Not_Set

        mov     [buffer], ax
        shr     eax, 16                 ;; expose two unused random bytes
        add     buffer, 2

.Bit_1_Not_Set:

        ;; Test bit 0 to see if size is at least 1
        test    lsize, 1
        jz      .Bit_0_Not_Set

        mov     [buffer], al

.Bit_0_Not_Set:

        ;; We've hit all the bits

.GenerateBlock_Return:

        xor     eax, eax                ;; writing EAX also clears the upper half of RAX
        ret

%endif    ;; X64 or X32
490
491;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
492;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;