MatrixMultiply: kernel_single.il

File kernel_single.il, 2.8 KB (added by nakasato, 14 years ago)
Line 
; Macro 0: four multiply-accumulates sharing one multiplicand.
;   in0      : vector whose x/y/z/w components each scale in1
;   in1      : vector multiplied by every component of in0
;   in2..in5 : values added to the four products
;   out0..3  : in0.{x,y,z,w} * in1 + in2..in5
mdef(0)_out(4)_in(6)
mad out0, in0.x, in1, in2
mad out1, in0.y, in1, in3
mad out2, in0.z, in1, in4
mad out3, in0.w, in1, in5
mend
il_ps_2_0
;
; 8x8 outer product version
;   20100707 N.Nakasato (University of Aizu, Japan)
;   Provided "as is"
;
; Each invocation accumulates sixteen float4 partial sums:
;   r50-r57  products of the A texels (r30, r31) with B texel r32
;   r60-r67  products of the A texels (r30, r31) with B texel r33
; and then writes them to the global buffer g[] as 8 rows of 2 float4.
;
; Register roles:
;   r100.x    2 * floor(v0.x); first sample coordinate for B, output column
;   r100.y    2 * floor(v0.y); first sample coordinate for A (x4 more after
;             the loop, giving the output row)
;   r100.z    loop counter; second sample coordinate for both A and B
;   r999      loop-exit predicate
;   r3        g[] indices of the four stores in the current row pair
;   r4.x      integer row stride in float4 units (doubled before row 2)
;
; NOTE: the loop exit is an exact equality test after stepping r100.z by 2
; from 0, so the loop only terminates when cb0[0].y is an even value that
; the counter actually reaches.
;
dcl_input_interp(linear) v0.xy
dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float) ;A
dcl_resource_id(1)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float) ;B
dcl_literal l0, 0.0, 1.0, 2.0, 3.0    ; .x = 0.0, .y = 1.0, .z = 2.0 are used below
dcl_literal l1, 4.0, 1.0, 2.0, 3.0    ; only .x (4.0) is used below
dcl_literal l2, -1.0, 1.0, -2.0, 3.0  ; only .z (-2.0) is used below
dcl_literal l4, 1, 1, 1, 2            ; integer constants: .x = 1, .w = 2
dcl_cb cb0[1]            ; {(float)n/4, (float)n, 0.0, 0.0} where n is matrix size

flr r100.xy, v0.xy
mul r100.x, r100.x, l0.z ; x2
mul r100.y, r100.y, l0.z ; x2
mov r100.z, l2.z         ; -2, so the first "+2" in the loop yields 0

; clear the accumulators paired with B texel r32
mov r50, l0.xxxx
mov r51, l0.xxxx
mov r52, l0.xxxx
mov r53, l0.xxxx
mov r54, l0.xxxx
mov r55, l0.xxxx
mov r56, l0.xxxx
mov r57, l0.xxxx

; clear the accumulators paired with B texel r33
mov r60, l0.xxxx
mov r61, l0.xxxx
mov r62, l0.xxxx
mov r63, l0.xxxx
mov r64, l0.xxxx
mov r65, l0.xxxx
mov r66, l0.xxxx
mov r67, l0.xxxx

whileloop
  add r100.z, r100.z, l0.z  ; +2
  eq r999, r100.z, cb0[0].y ; counter == n ?
  break_logicalnz r999.x

  ; read A (transposed): texel offsets (0,0) and (1,0)
  sample_resource(0)_sampler(0)_aoffimmi(0.0,0.0,0.0) r30, r100.yz
  sample_resource(0)_sampler(1)_aoffimmi(1.0,0.0,0.0) r31, r100.yz

  ; read B: texel offsets (0,0) and (1,0)
  sample_resource(1)_sampler(2)_aoffimmi(0.0,0.0,0.0) r32, r100.xz
  sample_resource(1)_sampler(3)_aoffimmi(1.0,0.0,0.0) r33, r100.xz

  ; accumulate r30.{x,y,z,w} * {r32, r33}
  mcall(0)(r50,r51,r52,r53),(r30,r32,r50,r51,r52,r53)
  mcall(0)(r60,r61,r62,r63),(r30,r33,r60,r61,r62,r63)

  ; accumulate r31.{x,y,z,w} * {r32, r33}
  mcall(0)(r54,r55,r56,r57),(r31,r32,r54,r55,r56,r57)
  mcall(0)(r64,r65,r66,r67),(r31,r33,r64,r65,r66,r67)

  ; read A (transposed): texel offsets (0,1) and (1,1)
  sample_resource(0)_sampler(4)_aoffimmi(0.0,1.0,0.0) r30, r100.yz
  sample_resource(0)_sampler(5)_aoffimmi(1.0,1.0,0.0) r31, r100.yz

  ; read B: texel offsets (0,1) and (1,1)
  sample_resource(1)_sampler(6)_aoffimmi(0.0,1.0,0.0) r32, r100.xz
  sample_resource(1)_sampler(7)_aoffimmi(1.0,1.0,0.0) r33, r100.xz

  ; cb0[0].z is documented above as 0.0, so this comparison against 1.0 is
  ; not expected to break out of the loop.
  ; NOTE(review): presumably kept to separate the two fetch/accumulate
  ; groups for the compiler - confirm before removing.
  breakc_relop(eq) cb0[0].z, l0.y
  mcall(0)(r50,r51,r52,r53),(r30,r32,r50,r51,r52,r53)
  mcall(0)(r60,r61,r62,r63),(r30,r33,r60,r61,r62,r63)

  mcall(0)(r54,r55,r56,r57),(r31,r32,r54,r55,r56,r57)
  mcall(0)(r64,r65,r66,r67),(r31,r33,r64,r65,r66,r67)
endloop

; g[] index of the first output vector: r100.y * cb0[0].x + r100.x
mul r100.y, r100.y, l1.x ; x2 x4 = x8
mov r1, r100.xy
mul r2.x, r1.y, cb0[0].x
add r2.x, r2.x, r1.x
ftoi r3.x, r2.x
ftoi r4.x, cb0[0].x      ; row stride as an integer

; r3 = {base, base+1, base+stride, base+1+stride}
iadd r3.y, r3.x, l4.x
iadd r3.__zw, r3.xyxy, r4.xxxx

; rows 0 and 1
mov g[r3.x], r50
mov g[r3.y], r60
mov g[r3.z], r51
mov g[r3.w], r61

; from here on r4.x is twice the row stride; each iadd steps down two rows
imul r4.x, r4.x, l4.w
iadd r3.xyzw, r3.xyzw, r4.xxxx
; rows 2 and 3
mov g[r3.x], r52
mov g[r3.y], r62
mov g[r3.z], r53
mov g[r3.w], r63

iadd r3.xyzw, r3.xyzw, r4.xxxx
; rows 4 and 5
mov g[r3.x], r54
mov g[r3.y], r64
mov g[r3.z], r55
mov g[r3.w], r65

iadd r3.xyzw, r3.xyzw, r4.xxxx
; rows 6 and 7
mov g[r3.x], r56
mov g[r3.y], r66
mov g[r3.z], r57
mov g[r3.w], r67

ret_dyn
endmain

end