1
# See http://www.ens-lyon.fr/LIP/Pub/Rapports/PhD/PhD2006/PhD2006-02.pdf
2
# for implementation details of all except division which is detailed below
7
nan: .long 0x7FFFFFFF # also abs mask (every bit except bit 31)
9
sign_mask: .long 0x80000000 # bit 31 only; used for the sign in the divide code
10
m_mask: .long 0x007FFFFF # low 23 bits (the 23-bit fraction field)
12
edge_case: .long 0x00FFFFFF # border value named in the multiply underflow comment
13
smallest_norm: .long 0x00800000 # implicit bit
14
high_FF: .long 0xFF000000 # top 8 bits set
15
high_uint: .long 0xFFFFFFFF # all ones
17
# Supply a few 'missing' instructions
30
# set $cc from the result of "ashl reg,dist"
32
.long 0x5de04008 | (\reg << 15) | (\dist << 4)
36
# converts an unsigned number x to a signed rep based on the bits in sign
37
# sign should be 0x00000000 or 0xffffffff.
38
.macro to_signed x, sign
39
add \x,\x,\sign # conditionally decrement x
40
xor \x,\x,\sign # conditionally complement x
49
# calculate trailing zero count in x, also uses scr.
50
# Using Seal's algorithm
68
.byte 32,0,1,12,2,6,0,13,3,0,7,0,0,0,0,14
69
.byte 10,4,0,0,8,0,0,25,0,0,0,0,0,21,27,15
70
.byte 31,11,5,0,0,0,0,0,9,0,0,24,0,0,20,26
71
.byte 30,0,0,0,0,23,0,19,29,0,22,18,28,17,16,0
73
# calculate leading zero count
80
# Round 26 bit mantissa to nearest
81
# | 23 bits frac | G | R | S |
91
# If NZ, set the LSB of reg
94
or \reg,\reg,1 # set the sticky bit to 1
98
##########################################################################
99
##########################################################################
100
## addition & subtraction
102
#if defined(L_subsf3) || defined(L_addsub_sf)
105
# this is subtraction, so we just change the sign of r1
111
#if defined(L_addsf3) || defined(L_addsub_sf)
114
# x in $r0, y in $r1, result z in $r0 --||| 100 instructions +/- |||--
116
bextu $r2,$r0,(8<<5)|23 # ex in r2
117
bextu $r3,$r1,(8<<5)|23 # ey in r3
118
sub $r5,$r2,$r3 # d = ex - ey
120
# Special values are 0x00 and 0xff in ex and ey.
121
# If (ex&ey) != 0 or (ex|ey)=255 then there may be
128
jmpc nz,no_special_vals
130
# Check for early exit
132
jmpc z,test_if_not_255
134
jmpc nz,no_early_exit
155
# setup to test for special values
160
# test for special values
162
jmpc gte,ex_spec_is_gte
166
jmpc nz,no_special_vals
185
jmpc nz,no_special_vals
186
ashl $r6,$r0,9 # clear all except x frac
187
ashl $r7,$r1,9 # clear all except y frac
191
lshr $r4,$r0,31 # sx in r4
192
lshr $r5,$r1,31 # sy in r5
200
ldk $r8,(1<<10)|(9<<5)|26 # setup implicit bit and mask for e
201
#----------------------
202
ashr $r4,$r0,31 # sx in r4
203
ashl $r0,$r0,3 # shift mx 3 for GRS bits
204
bins $r0,$r0,$r8 # clear sx, ex and add implicit bit mx
205
# change mx to signed mantissa
207
#----------------------
208
ashr $r4,$r1,31 # sy in r4
209
ashl $r1,$r1,3 # shift my 3 for GRS bits
210
bins $r1,$r1,$r8 # clear sy, ey and add implicit bit my
211
# change my to signed mantissa
213
#----------------------
214
# test if we swap ms based on d sign
221
# d positive means that ex>=ey, so ez = ex
222
# d negative means that ey>ex, so ez = ey
227
# now $r2 = ez = max(ex,ey)
228
cmp $r5,26 # max necessary alignment shift is 26
233
ashl $r7,$r7,$r5 # create inverse of mask for test of S bit value in discarded my
235
tst $r1,$r7 # determine value of sticky bit
243
# $r4 = sign(mx), mx = |mx|
248
# realign mantissa using leading zero count
252
btst $r0,(6<<5)|0 # test low bits for sticky again
268
# mz == 0? if so, we just bail with a +0
270
jmpc nz,msum_not_zero
274
# Combined check that (1 <= ez <= 254)
277
jmpc b,no_special_ret
286
jmpc lt,no_special_ret
292
ldl $r2,$r2,(8<<5)|23
293
bins $r0,$r0,$r2 # width = 8, pos = 23 pack ez
296
ldl $r4,$r4,(1<<5)|31
297
bins $r0,$r0,$r4 # width = 1, pos = 31 set sz to sy
301
##########################################################################
302
##########################################################################
308
# x in $r0, y in $r1, result z in $r0 --||| 61 instructions +/- |||--
311
bextu $r2,$r0,(8<<5)|23 # ex in r2
312
bextu $r3,$r1,(8<<5)|23 # ey in r3
316
and $r4,$r4,$r5 # sz in r4
318
# unpack m add implicit bit
319
ldk $r5,(1<<10)|(9<<5)|23 # setup implicit bit and mask for e
320
#----------------------
321
bins $r0,$r0,$r5 # clear sx, ex and add implicit bit mx
329
jmpc b,no_special_vals_mul
332
# Check for early exit
336
jmpc nz,no_early_exit_mul
339
jmpc z,no_early_exit_mul
341
jmpc z,no_early_exit_mul
345
# setup to test for special values
350
# test for special values
352
jmpc gte,ex_spec_is_gte_ey_mul
354
ex_spec_is_gte_ey_mul:
356
jmpc nz,no_special_vals_mul
358
jmpc nz,ex_not_FF_mul
387
bins $r1,$r1,$r5 # clear sy, ey and add implicit bit my
390
sub $r3,$r3,127 # ez in r3
396
btst $r1,(1<<5)|15 # XXX use jmpx
400
# 48-bit product is in (r1,r2). The low 22 bits of r2
404
or $r0,$r0,$r1 # r0 = (r1,r2) >> 22
407
add $r3,$r3,1 # bump exponent
416
jmpc b,no_special_ret_mul
419
# When the final exponent <= 0, result is flushed to 0 except
420
# for the border case 0x00FFFFFF which is promoted to next higher
421
# FP no., that is, the smallest "normalized" number.
425
ldl $r3,$r3,(8<<5)|23
426
bins $r0,$r0,$r3 # width = 8, pos = 23 pack ez
430
lpm $r0,smallest_norm
438
jmpc lt,no_special_ret_mul
444
ldl $r3,$r3,(8<<5)|23
445
bins $r0,$r0,$r3 # width = 8, pos = 23 pack ez
453
# 48-bit product is in (r1,r2). The low 21 bits of r2
457
or $r0,$r0,$r1 # r0 = (r1,r2) >> 21 -- NOTE(review): this comment used to say 22, but the note above says 21 bits; confirm against the shift counts
467
jmpc b,no_special_ret_mul
471
##########################################################################
472
##########################################################################
475
## See http://perso.ens-lyon.fr/gilles.villard/BIBLIOGRAPHIE/PDF/arith19.pdf
476
## for implementation details
479
dc_1: .long 0xffffe7d7
480
dc_2: .long 0xffffffe8
481
dc_3: .long 0xffbad86f
482
dc_4: .long 0xfffbece7
483
dc_5: .long 0xf3672b51
484
dc_6: .long 0xfd9d3a3e
485
dc_7: .long 0x9a3c4390
486
dc_8: .long 0xd4d2ce9b
487
dc_9: .long 0x1bba92b3
488
dc_10: .long 0x525a1a8b
489
dc_11: .long 0x0452b1bf
490
dc_12: .long 0xFFFFFFC0
491
spec_val_test: .long 0x7F7FFFFF
499
# x in $r0, y in $r1, result z in $r0 --||| 73 instructions +/- |||--
500
bextu $r10,$r0,(8<<5)|23 # ex in r10
501
bextu $r11,$r1,(8<<5)|23 # ey in r11
503
and $r2, $r0, $r6 # mx
504
and $r3, $r1, $r6 # my
506
bextu $r2,$r30,(1<<5)|4 # c = Tx >= T;
507
ashl $r3,$r3,9 # T = Y << 9; (r3 holds my from the and above)
509
ashl $r4,$r0,8 # X8 = X << 8;
510
or $r4,$r4,$r13 # Mx = X8 | 0x80000000;
511
lshr $r5,$r4,$r2 # S = Mx >> c;
515
sub $r2, $r12, $r2 # int D = (Ex + 125) - (Ey - c);
518
and $r12,$r12,$r13 # Sr = ( X ^ Y ) & 0x80000000;
521
jmpc nz, no_early_ret_dev
523
jmpc z, no_early_ret_dev
525
jmpc z, no_early_ret_dev
530
# setup to test for special values
535
# test for special values
537
jmpc gte, absXm1_gte_absYm1
541
jmpc nz, no_spec_ret_div
543
jmpc nz, ex_not_FF_div
545
and $r2, $r0, $r6 # mx
553
jmpc nz, ey_not_FF_div
579
jmpc lt, no_overflow_div
587
jmpc ns, no_underflow_div
588
xnor $r6, $r6, $r6 # -1
592
xor $r6, $r6, $r7 # 0xFF ^ -1 = 0xFFFFFF00
608
# Compute V from T ($r3) and S ($r5) as a chain of muluh and add/sub steps.
# The pseudo-code on each line names the intermediate value iN it produces.
# NOTE(review): $r6 stands for a different constant at each use (the values
# in the comments match dc_1 .. dc_11), so it has to be reloaded between
# uses; those loads are not on the lines shown here.
muluh $r7, $r3, $r6 # i0 = mul( T , 0xffffe7d7 ); (dc_1)
610
sub $r7, $r6, $r7 # i1 = 0xffffffe8 - i0; (dc_2)
611
muluh $r7, $r5, $r7 # i2 = mul( S , i1 );
612
add $r7, $r7, 0x20 # i3 = 0x00000020 + i2;
613
muluh $r8, $r3, $r3 # i4 = mul( T , T );
614
muluh $r9, $r5, $r8 # i5 = mul( S , i4 );
616
muluh $r10, $r3, $r6 # i6 = mul( T , 0xffbad86f ); (dc_3)
618
sub $r10, $r6, $r10 # i7 = 0xfffbece7 - i6; (dc_4)
619
muluh $r10, $r9, $r10 # i8 = mul( i5 , i7 );
620
add $r7, $r7, $r10 # i9 = i3 + i8;
621
muluh $r9, $r8, $r9 # i10 = mul( i4 , i5 );
623
muluh $r10, $r3, $r6 # i11 = mul( T , 0xf3672b51 ); (dc_5)
625
sub $r10, $r6, $r10 # i12 = 0xfd9d3a3e - i11; (dc_6)
627
muluh $r11, $r3, $r6 # i13 = mul( T , 0x9a3c4390 ); (dc_7)
629
sub $r11, $r6, $r11 # i14 = 0xd4d2ce9b - i13; (dc_8)
630
muluh $r11, $r8, $r11 # i15 = mul( i4 , i14 );
631
add $r10, $r10, $r11 # i16 = i12 + i15;
632
muluh $r10, $r9, $r10 # i17 = mul( i10 , i16 );
633
add $r7, $r7, $r10 # i18 = i9 + i17;
634
muluh $r10, $r8, $r8 # i19 = mul( i4 , i4 );
636
muluh $r11, $r3, $r6 # i20 = mul( T , 0x1bba92b3 ); (dc_9)
638
sub $r11, $r6, $r11 # i21 = 0x525a1a8b - i20; (dc_10)
640
muluh $r8, $r8, $r6 # i22 = mul( i4 , 0x0452b1bf ); (dc_11)
641
add $r8, $r11, $r8 # i23 = i21 + i22;
642
muluh $r8, $r10, $r8 # i24 = mul( i19 , i23 );
643
muluh $r8, $r9, $r8 # i25 = mul( i10 , i24 );
644
add $r3, $r7, $r8 # V = i18 + i25;
647
and $r3, $r3, $r6 # W
648
# round and pack final values
649
ashl $r0, $r2, 23 # pack D
650
or $r0, $r0, $r12 # pack Sr
652
or $r12, $r12, $r13 # My
653
muluh $r10, $r3, $r12
665
##########################################################################
666
##########################################################################
669
##########################################################################
670
## int & unsigned int to float
672
# i2f: conversion body expanded by __floatsisf and __floatunsisf.
#   x       register holding the value being converted; modified in place
#   s1      per the comments below this tracks the leading zero count that
#           the exponent is derived from (TODO confirm against the full macro)
#   s2      shift count used to build the sticky mask, then reused as scratch
#   s3      scratch register for the sticky-bit mask
#   lbl     suffix appended to every label in the body so that each
#           expansion gets its own label names
.macro i2f x, s1, s2, s3, lbl
676
jmpc s, float_round\lbl
679
jmp float_no_round\lbl
682
jmpc s, float_shift_right\lbl
685
jmp float_round_and_pack\lbl
686
float_shift_right\lbl:
689
xnor \s3, \s3 ,\s3 # 0xFFFFFFFF
690
ashl \s3, \s3 ,\s2 # create inverse of mask for test of S bit value in the discarded low bits of x
691
xnor \s3, \s3 ,0 # NOT
692
tst \x, \s3 # determine value of sticky bit
694
jmpc z,float_round_and_pack\lbl
695
or \x, \x, 1 # set the sticky bit to 1
696
float_round_and_pack\lbl:
697
bextu \s2, \x, (1<<5)|2 # extract low bit of m
698
or \x, \x, \s2 # or p into r
701
btst \x, (1<<5)|24 # test for carry from round
702
jmpc z, float_no_round\lbl
703
sub \s1, \s1, 1 # inc e for carry (actually dec nlz)
709
ldl \s1, \s1, (8<<5)|23
716
__floatsisf: # 32 instructions
718
jmpc nz, float_not_zero
721
ashr $r1, $r0, 31 # s in r1
722
xor $r0, $r0, $r1 # cond neg
724
i2f $r0, $r2, $r3, $r4, 1
725
ldl $r1, $r1, (1<<5)|31
732
__floatunsisf: # 26 instructions
734
jmpc nz, float_not_zero2
737
i2f $r0, $r1, $r2, $r3, 2
741
##########################################################################
742
##########################################################################
748
lpm $r3, nan # also abs mask
751
# test if either abs is nan
764
# -- if either is pos