summaryrefslogtreecommitdiffstats
path: root/mpg123_artsplugin/mpg123/decode_3dnow.s
blob: fd39429a0161256eb344fbf15ba3ecb264fe9249 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
#
# decode_3dnow.s - 3DNow! optimized synth_1to1()
#
# This code is based on 'decode_3dnow.s' by Syuuhei Kashiyama
# <[email protected]>; only three kinds of changes have been made:
#
# - remove PREFETCH instruction for speedup
# - change function name for support 3DNow! automatic detect
# - femms moved to before 'call dct64_3dnow'
#
# You can find Kashiyama's original 3dnow! support patch
# (for mpg123-0.59o) at
# http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese).
#
# by KIMURA Takuhiro <[email protected]> - until 31.Mar.1999
#                    <[email protected]>               - after  1.Apr.1999
#

##/
##/ Replacement of synth_1to1() with AMD's 3DNow! SIMD operations support
##/ 
##/ Syuuhei Kashiyama <[email protected]>
##/ 
##/ The author of this program disclaim whole expressed or implied
##/ warranties with regard to this program, and in no event shall the
##/ author of this program liable to whatever resulted from the use of
##/ this program. Use it at your own risk.
##/ 

	# Static storage private to this file.
	#
	# buffs.40: file-local common block of 4352 bytes, aligned to 32 bytes.
	# synth_1to1_3dnow uses its first 2176 bytes when its second argument
	# is zero and the 2176 bytes starting at buffs.40+2176 otherwise; inside
	# each half a second sub-buffer starts 1088 bytes in.
	.local	buffs.40
	.comm	buffs.40,4352,32
.data
	.align 4
	# bo.42: 32-bit position counter, initial value 1.  synth_1to1_3dnow
	# decrements it and masks it with 15 on every call whose second
	# argument is zero, and uses it to pick the dct64_3dnow destinations
	# and the window offset.
	.type	 bo.42,@object
	.size	 bo.42,4
bo.42:
	.long 1
.text
#
# synth_1to1_3dnow - 3DNow! replacement for synth_1to1()
#
# Calling convention: 32-bit x86, all four arguments on the stack, result
# in %eax.  %ebx, %esi, %edi and %ebp are saved and restored here.
#
# Stack layout once the prologue has run (offsets from %esp):
#   at  0..12   saved %ebx, %esi, %edi, %ebp
#   at 16       local: address of the next 16-bit output sample
#   at 20       local: byte offset subtracted from decwin+64 (4 * bo1)
#   at 36       local: bo1 (bo.42 when it is odd, bo.42+1 when it is even)
#   at 40       return address
#   at 44       arg1: passed on to do_equalizer_3dnow and dct64_3dnow
#   at 48       arg2: zero / non-zero selector (buffer half, output slot)
#   at 52       arg3: base address of the output buffer
#   at 56       arg4: pointer to a 32-bit byte offset into arg3
#
# NOTE(review): the arguments presumably correspond to (bandPtr, channel,
# out, pnt) of the C synth_1to1() - confirm against the C prototype.
#
# Effects: writes 32 signed 16-bit samples at a 4-byte stride starting at
# arg3 + *arg4 (plus 2 when arg2 is non-zero), then adds 128 to *arg4.
# The MMX/3DNow! state is cleared with femms on entry and before return.
#
# Returns: the counter kept in %ebp.  It is reset to 15 before the second
# loop and incremented once per iteration there, so as written the result
# is 30 on every call.
# NOTE(review): presumably intended as a clip count - confirm that no
# caller relies on the value.
#
.globl synth_1to1_3dnow
	.type	 synth_1to1_3dnow,@function
synth_1to1_3dnow:
	subl $24,%esp			# reserve the 24 bytes of locals
	pushl %ebp
	pushl %edi
	xorl %ebp,%ebp			# %ebp = 0 (counter returned in %eax)
	pushl %esi
	pushl %ebx
	movl 56(%esp),%esi		# %esi = arg4
	movl 52(%esp),%edi		# %edi = arg3 (output base)
	movl 0(%esi),%esi		# %esi = *arg4 (byte offset)
	movl 48(%esp),%ebx		# %ebx = arg2 (selector)
	addl %edi,%esi
	movl %esi,16(%esp)		# output pointer = arg3 + *arg4

	femms				# clear MMX state before any call

	# fixed by Takuhiro
	# Optional equalizer pass: only when the word at param+348 is non-zero.
	# NOTE(review): 348 is a hard-coded offset into the external 'param'
	# object; it must match that object's layout - confirm.
	cmpl $0,param+348
	je .L25
	pushl %ebx			# 2nd argument: arg2
	pushl 48(%esp)			# 1st argument: arg1 (44 + 4 after the push)
	call do_equalizer_3dnow
	addl $8,%esp
.L25:
	testl %ebx,%ebx
	jne .L26
	# arg2 == 0: step bo.42 back by one modulo 16, use the first half.
	decl bo.42
	movl $buffs.40,%ecx
	andl $15,bo.42
	jmp .L27
.L26:
	# arg2 != 0: write to the second 16-bit slot, use the second half.
	addl $2,16(%esp)
	movl $buffs.40+2176,%ecx
.L27:
	movl bo.42,%edx			# %edx = bo
	testb $1,%dl
	je .L28
	# bo odd: bo1 = bo, %ebx = first sub-buffer, and the call is
	#   dct64_3dnow(%ecx+1088 + 4*((bo+1)&15), %ecx + 4*bo, arg1)
	movl %edx,36(%esp)		# bo1 = bo
	movl %ecx,%ebx
	movl 44(%esp),%esi		# %esi = arg1
	movl %edx,%edi
	pushl %esi
	sall $2,%edi			# %edi = 4*bo
	movl %ebx,%eax
	movl %edi,24(%esp)		# local at 20 (one push deep) = 4*bo1
	addl %edi,%eax
	pushl %eax
	movl %edx,%eax
	incl %eax
	andl $15,%eax
	leal 1088(,%eax,4),%eax
	addl %ebx,%eax
	pushl %eax
	call dct64_3dnow
	addl $12,%esp
	jmp .L29
.L28:
	# bo even: bo1 = bo+1, %ebx = second sub-buffer, and the call is
	#   dct64_3dnow(%ecx + 4*bo, %ecx+1088 + 4*(bo+1), arg1)
	# %ebx and %esi are read after the call, which relies on dct64_3dnow
	# preserving them (usual callee-saved convention; not visible here).
	leal 1(%edx),%esi
	movl 44(%esp),%edi		# %edi = arg1
	movl %esi,36(%esp)		# bo1 = bo+1
	leal 1092(%ecx,%edx,4),%eax
	pushl %edi
	leal 1088(%ecx),%ebx
	pushl %eax
	sall $2,%esi			# %esi = 4*bo1
	leal (%ecx,%edx,4),%eax
	pushl %eax
	call dct64_3dnow
	addl $12,%esp
	movl %esi,20(%esp)		# local at 20 = 4*bo1
.L29:
	# First loop, 16 outputs.  With b = %ebx and w = %edx as float
	# pointers, w starting at decwin+64 - 4*bo1:
	#   sum = w[0]*b[0] - w[1]*b[1] + w[2]*b[2] - ... - w[15]*b[15]
	# Each movq holds an (even, odd) pair; pairs are accumulated and the
	# odd lane is subtracted from the even lane at the end.  The first
	# pair of the next iteration is preloaded inside the loop body.
	movl $decwin+64,%edx
	movl $16,%ecx			# iteration count
	subl 20(%esp),%edx
	movl 16(%esp),%edi		# %edi = output pointer

	movq (%edx),%mm0
	movq (%ebx),%mm1
	.align 32
.L33:
	movq 8(%edx),%mm3
	pfmul %mm1,%mm0
	movq 8(%ebx),%mm4
	movq 16(%edx),%mm5
	pfmul %mm4,%mm3
	movq 16(%ebx),%mm6
	pfadd %mm3,%mm0
	movq 24(%edx),%mm1
	pfmul %mm6,%mm5
	movq 24(%ebx),%mm2
	pfadd %mm5,%mm0
	movq 32(%edx),%mm3
	pfmul %mm2,%mm1
	movq 32(%ebx),%mm4
	pfadd %mm1,%mm0
	movq 40(%edx),%mm5
	pfmul %mm4,%mm3
	movq 40(%ebx),%mm6
	pfadd %mm3,%mm0
	movq 48(%edx),%mm1
	pfmul %mm6,%mm5
	movq 48(%ebx),%mm2
	pfadd %mm0,%mm5
	movq 56(%edx),%mm3
	pfmul %mm1,%mm2
	movq 56(%ebx),%mm4
	pfadd %mm5,%mm2
	addl $64,%ebx			# b += 16 floats
	subl $-128,%edx			# w += 32 floats
	movq (%edx),%mm0		# preload first pair of next iteration
	pfmul %mm4,%mm3
	movq (%ebx),%mm1
	pfadd %mm3,%mm2
	movq %mm2,%mm3
	psrlq $32,%mm3			# bring the odd-lane sum down
	pfsub %mm3,%mm2			# low lane = even sum - odd sum
	incl %ebp
	pf2id %mm2,%mm2			# float -> int32, truncating
	packssdw %mm2,%mm2		# saturate to signed 16 bits
	movd %mm2,%eax
	movw %ax,0(%edi)
	addl $4,%edi			# next output slot (4-byte stride)
	decl %ecx
	jnz .L33

	# Middle output: even-indexed taps only,
	#   sum = b[0]*w[0] + b[2]*w[2] + ... + b[14]*w[14]
	movd (%ebx),%mm0
	movd (%edx),%mm1
	punpckldq 8(%ebx),%mm0
	punpckldq 8(%edx),%mm1
	movd 16(%ebx),%mm3
	movd 16(%edx),%mm4
	pfmul %mm1,%mm0
	punpckldq 24(%ebx),%mm3
	punpckldq 24(%edx),%mm4
	movd 32(%ebx),%mm5
	movd 32(%edx),%mm6
	pfmul %mm4,%mm3
	punpckldq 40(%ebx),%mm5
	punpckldq 40(%edx),%mm6
	pfadd %mm3,%mm0
	movd 48(%ebx),%mm1
	movd 48(%edx),%mm2
	pfmul %mm6,%mm5
	punpckldq 56(%ebx),%mm1
	punpckldq 56(%edx),%mm2
	pfadd %mm5,%mm0
	pfmul %mm2,%mm1
	pfadd %mm1,%mm0
	pfacc %mm1,%mm0			# low lane = sum of both lanes of %mm0
	pf2id %mm0,%mm0
	packssdw %mm0,%mm0
	movd %mm0,%eax
	movw %ax,0(%edi)
	incl %ebp			# overwritten by the movl $15 below
	movl 36(%esp),%esi		# %esi = bo1
	addl $-64,%ebx			# b -= 16 floats
	movl $15,%ebp			# counter restarts at 15
	addl $4,%edi
	leal -128(%edx,%esi,8),%edx	# w = w - 32 floats + 2*bo1 floats

	# Second loop, 15 outputs, walking b and w backwards:
	#   sum = b[0]*w[-1] + b[1]*w[-2] + ... + b[14]*w[-15] + b[15]*w[0]
	#   output = saturate16(-trunc(sum))
	# The first pair of each iteration is preloaded, as in the first loop.
	movl $15,%ecx			# iteration count
	movd (%ebx),%mm0
	movd -4(%edx),%mm1
	punpckldq 4(%ebx),%mm0
	punpckldq -8(%edx),%mm1
	.align 32
.L46:
	movd 8(%ebx),%mm3
	movd -12(%edx),%mm4
	pfmul %mm1,%mm0
	punpckldq 12(%ebx),%mm3
	punpckldq -16(%edx),%mm4
	movd 16(%ebx),%mm5
	movd -20(%edx),%mm6
	pfmul %mm4,%mm3
	punpckldq 20(%ebx),%mm5
	punpckldq -24(%edx),%mm6
	pfadd %mm3,%mm0
	movd 24(%ebx),%mm1
	movd -28(%edx),%mm2
	pfmul %mm6,%mm5
	punpckldq 28(%ebx),%mm1
	punpckldq -32(%edx),%mm2
	pfadd %mm5,%mm0
	movd 32(%ebx),%mm3
	movd -36(%edx),%mm4
	pfmul %mm2,%mm1
	punpckldq 36(%ebx),%mm3
	punpckldq -40(%edx),%mm4
	pfadd %mm1,%mm0
	movd 40(%ebx),%mm5
	movd -44(%edx),%mm6
	pfmul %mm4,%mm3
	punpckldq 44(%ebx),%mm5
	punpckldq -48(%edx),%mm6
	pfadd %mm3,%mm0
	movd 48(%ebx),%mm1
	movd -52(%edx),%mm2
	pfmul %mm6,%mm5
	punpckldq 52(%ebx),%mm1
	punpckldq -56(%edx),%mm2
	pfadd %mm0,%mm5
	movd 56(%ebx),%mm3
	movd -60(%edx),%mm4
	pfmul %mm2,%mm1
	punpckldq 60(%ebx),%mm3
	punpckldq (%edx),%mm4		# last tap uses w[0], not w[-16]
	pfadd %mm1,%mm5
	addl $-128,%edx			# w -= 32 floats
	addl $-64,%ebx			# b -= 16 floats
	movd (%ebx),%mm0		# preload first pair of next iteration
	movd -4(%edx),%mm1
	pfmul %mm4,%mm3
	punpckldq 4(%ebx),%mm0
	punpckldq -8(%edx),%mm1
	pfadd %mm5,%mm3
	pfacc %mm3,%mm3			# low lane = sum of both lanes
	incl %ebp
	pf2id %mm3,%mm3
	# The sign is flipped on the integer, before the 16-bit saturation.
	# NOTE(review): if pf2id saturated to 0x80000000, negl leaves it
	# unchanged and the sample packs to -32768 instead of +32767; this
	# needs |sum| >= 2^31, far outside the 16-bit sample range.
	movd %mm3,%eax
	negl %eax
	movd %eax,%mm3
	packssdw %mm3,%mm3
	movd %mm3,%eax
	movw %ax,(%edi)
	addl $4,%edi
	decl %ecx
	jnz .L46

	femms				# leave MMX state clean for the caller
	movl 56(%esp),%esi		# %esi = arg4
	movl %ebp,%eax			# return value
	subl $-128,0(%esi)		# *arg4 += 128 (32 slots of 4 bytes)
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	addl $24,%esp
	ret
	# Record the function size in the ELF symbol table.
	.size	 synth_1to1_3dnow,.-synth_1to1_3dnow

	# Declare that this object does not need an executable stack; without
	# this note GNU ld marks the stack executable for the whole binary.
	.section .note.GNU-stack,"",@progbits