| 1 | mdef(0)_out(4)_in(6) ; macro 0: outN = in0.<component N> * in1 + in(N+2), for N = 0..3 |
|---|
| 2 | mad out0, in0.x, in1, in2 ; out0 = in0.x * in1 + in2 |
|---|
| 3 | mad out1, in0.y, in1, in3 ; out1 = in0.y * in1 + in3 |
|---|
| 4 | mad out2, in0.z, in1, in4 ; out2 = in0.z * in1 + in4 |
|---|
| 5 | mad out3, in0.w, in1, in5 ; out3 = in0.w * in1 + in5 |
|---|
| 6 | mend |
|---|
| 7 | il_ps_2_0 |
|---|
| 8 | ; Accumulates outer products of float4 texels sampled from A (resource 0) and B (resource 1) and stores an 8x8 result tile to g[] |
|---|
| 9 | ; 8x8 outer product version |
|---|
| 10 | ; 20100707 N.Nakasato (University of Aizu, Japan) |
|---|
| 11 | ; Provided "as is" |
|---|
| 12 | ; Registers: r100.xy = tile coords, r100.z = k; r30/r31 = A texels, r32/r33 = B texels; r50-r57 and r60-r67 = accumulators |
|---|
| 13 | dcl_input_interp(linear) v0.xy ; interpolated input, floored below to get this thread's 2-D index (x, y) |
|---|
| 14 | dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float) ;A |
|---|
| 15 | dcl_resource_id(1)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float) ;B |
|---|
| 16 | dcl_literal l0, 0.0, 1.0, 2.0, 3.0 ; used: l0.x = 0 (clear), l0.y = 1, l0.z = 2 (scale and k step) |
|---|
| 17 | dcl_literal l1, 4.0, 1.0, 2.0, 3.0 ; used: l1.x = 4 (row scale) |
|---|
| 18 | dcl_literal l2, -1.0, 1.0, -2.0, 3.0 ; used: l2.z = -2 (initial k) |
|---|
| 19 | dcl_literal l4, 1, 1, 1, 2 ; integer constants, used: l4.x = 1, l4.w = 2 |
|---|
| 20 | dcl_cb cb0[1] ; {(float)n/4, (float)n, 0.0, 0.0} where n is matrix size |
|---|
| 21 | ; r100.xy = 2 * floor(v0.xy) = (2x, 2y); r100.z = loop index k |
|---|
| 22 | flr r100.xy, v0.xy |
|---|
| 23 | mul r100.x, r100.x, l0.z ; x2 |
|---|
| 24 | mul r100.y, r100.y, l0.z ; x2 |
|---|
| 25 | mov r100.z, l2.z ; -2, the loop adds 2 before the first use so k starts at 0 |
|---|
| 26 | ; clear the accumulators that are paired with B texel r32 (left float4 of each tile row) |
|---|
| 27 | mov r50, l0.xxxx |
|---|
| 28 | mov r51, l0.xxxx |
|---|
| 29 | mov r52, l0.xxxx |
|---|
| 30 | mov r53, l0.xxxx |
|---|
| 31 | mov r54, l0.xxxx |
|---|
| 32 | mov r55, l0.xxxx |
|---|
| 33 | mov r56, l0.xxxx |
|---|
| 34 | mov r57, l0.xxxx |
|---|
| 35 | ; clear the accumulators that are paired with B texel r33 (right float4 of each tile row) |
|---|
| 36 | mov r60, l0.xxxx |
|---|
| 37 | mov r61, l0.xxxx |
|---|
| 38 | mov r62, l0.xxxx |
|---|
| 39 | mov r63, l0.xxxx |
|---|
| 40 | mov r64, l0.xxxx |
|---|
| 41 | mov r65, l0.xxxx |
|---|
| 42 | mov r66, l0.xxxx |
|---|
| 43 | mov r67, l0.xxxx |
|---|
| 44 | |
|---|
| 45 | whileloop |
|---|
| 46 | add r100.z, r100.z, l0.z ; +2 |
|---|
| 47 | ge r999, r100.z, cb0[0].y ; done once k >= n; ge rather than eq so a k that steps past n cannot loop forever |
|---|
| 48 | break_logicalnz r999.x |
|---|
| 49 | |
|---|
| 50 | ; read A (transposed): texels (2y, k) and (2y+1, k) |
|---|
| 51 | sample_resource(0)_sampler(0)_aoffimmi(0.0,0.0,0.0) r30, r100.yz |
|---|
| 52 | sample_resource(0)_sampler(1)_aoffimmi(1.0,0.0,0.0) r31, r100.yz |
|---|
| 53 | |
|---|
| 54 | ; read B: texels (2x, k) and (2x+1, k) |
|---|
| 55 | sample_resource(1)_sampler(2)_aoffimmi(0.0,0.0,0.0) r32, r100.xz |
|---|
| 56 | sample_resource(1)_sampler(3)_aoffimmi(1.0,0.0,0.0) r33, r100.xz |
|---|
| 57 | |
|---|
| 58 | mcall(0)(r50,r51,r52,r53),(r30,r32,r50,r51,r52,r53) ; tile rows 0-3, left float4 |
|---|
| 59 | mcall(0)(r60,r61,r62,r63),(r30,r33,r60,r61,r62,r63) ; tile rows 0-3, right float4 |
|---|
| 60 | |
|---|
| 61 | mcall(0)(r54,r55,r56,r57),(r31,r32,r54,r55,r56,r57) ; tile rows 4-7, left float4 |
|---|
| 62 | mcall(0)(r64,r65,r66,r67),(r31,r33,r64,r65,r66,r67) ; tile rows 4-7, right float4 |
|---|
| 63 | |
|---|
| 64 | ; read A (transposed): next k slice, texels (2y, k+1) and (2y+1, k+1) |
|---|
| 65 | sample_resource(0)_sampler(4)_aoffimmi(0.0,1.0,0.0) r30, r100.yz |
|---|
| 66 | sample_resource(0)_sampler(5)_aoffimmi(1.0,1.0,0.0) r31, r100.yz |
|---|
| 67 | |
|---|
| 68 | ; read B: next k slice, texels (2x, k+1) and (2x+1, k+1) |
|---|
| 69 | sample_resource(1)_sampler(6)_aoffimmi(0.0,1.0,0.0) r32, r100.xz |
|---|
| 70 | sample_resource(1)_sampler(7)_aoffimmi(1.0,1.0,0.0) r33, r100.xz |
|---|
| 71 | ; The guard below is never taken while cb0[0].z is 0.0 as documented at dcl_cb. NOTE(review): presumably kept to separate the two half-steps for the compiler - confirm |
|---|
| 72 | breakc_relop(eq) cb0[0].z, l0.y ; must name the declared constant buffer cb0 |
|---|
| 73 | mcall(0)(r50,r51,r52,r53),(r30,r32,r50,r51,r52,r53) ; tile rows 0-3, left float4 |
|---|
| 74 | mcall(0)(r60,r61,r62,r63),(r30,r33,r60,r61,r62,r63) ; tile rows 0-3, right float4 |
|---|
| 75 | |
|---|
| 76 | mcall(0)(r54,r55,r56,r57),(r31,r32,r54,r55,r56,r57) ; tile rows 4-7, left float4 |
|---|
| 77 | mcall(0)(r64,r65,r66,r67),(r31,r33,r64,r65,r66,r67) ; tile rows 4-7, right float4 |
|---|
| 78 | endloop |
|---|
| 79 | ; output addressing: g[] index = (8y) * (n/4) + 2x, counted in float4 elements |
|---|
| 80 | mul r100.y, r100.y, l1.x ; x2 x4 = x8, first row of this thread's 8-row tile |
|---|
| 81 | mov r1, r100.xy |
|---|
| 82 | mul r2.x, r1.y, cb0[0].x ; row * n/4 (float4 elements per row) |
|---|
| 83 | add r2.x, r2.x, r1.x ; + 2x, first float4 column of the tile |
|---|
| 84 | ftoi r3.x, r2.x ; r3.x = index of tile row 0, left float4 |
|---|
| 85 | ftoi r4.x, cb0[0].x ; r4.x = n/4 as an integer |
|---|
| 86 | |
|---|
| 87 | iadd r3.y, r3.x, l4.x ; r3.y = row 0, right float4 (index + 1) |
|---|
| 88 | iadd r3.__zw, r3.xyxy, r4.xxxx ; r3.zw = row 1, left and right (index + n/4) |
|---|
| 89 | ; tile rows 0 and 1 |
|---|
| 90 | mov g[r3.x], r50 |
|---|
| 91 | mov g[r3.y], r60 |
|---|
| 92 | mov g[r3.z], r51 |
|---|
| 93 | mov g[r3.w], r61 |
|---|
| 94 | |
|---|
| 95 | imul r4.x, r4.x, l4.w ; r4.x = 2 * n/4, so each iadd below advances two rows |
|---|
| 96 | iadd r3.xyzw, r3.xyzw, r4.xxxx ; tile rows 2 and 3 |
|---|
| 97 | mov g[r3.x], r52 |
|---|
| 98 | mov g[r3.y], r62 |
|---|
| 99 | mov g[r3.z], r53 |
|---|
| 100 | mov g[r3.w], r63 |
|---|
| 101 | |
|---|
| 102 | iadd r3.xyzw, r3.xyzw, r4.xxxx ; tile rows 4 and 5 |
|---|
| 103 | mov g[r3.x], r54 |
|---|
| 104 | mov g[r3.y], r64 |
|---|
| 105 | mov g[r3.z], r55 |
|---|
| 106 | mov g[r3.w], r65 |
|---|
| 107 | |
|---|
| 108 | iadd r3.xyzw, r3.xyzw, r4.xxxx ; tile rows 6 and 7 |
|---|
| 109 | mov g[r3.x], r56 |
|---|
| 110 | mov g[r3.y], r66 |
|---|
| 111 | mov g[r3.z], r57 |
|---|
| 112 | mov g[r3.w], r67 |
|---|
| 113 | |
|---|
| 114 | ret_dyn |
|---|
| 115 | endmain |
|---|
| 116 | |
|---|
| 117 | end |
|---|