efl

Форк
0
/
draw_main_sse2.c 
335 строк · 8.8 Кб
1
#ifdef HAVE_CONFIG_H
2
#include "config.h"
3
#endif
4

5
#include "draw_private.h"
6

7
#ifdef BUILD_SSE3
8
#include <immintrin.h>
9

10
/**
 * Scale every 8-bit channel of four packed 32-bit pixels by an 8-bit factor.
 *
 * Per channel the result is (channel * factor) >> 8 (truncating).
 *
 * @param c Four 32-bit pixels, four 8-bit channels each.
 * @param a Multiplier. Each 32-bit element must have the form 0x00AA00AA,
 *          i.e. the 8-bit factor replicated in both 16-bit lanes.
 * @return The four scaled pixels.
 */
static inline __m128i
v4_byte_mul_sse2(__m128i c, __m128i a)
{
   /* 0xFF00FF00 does not fit in int, so the conversion is spelled out. */
   const __m128i hi_bytes = _mm_set1_epi32((int)0xFF00FF00u);
   const __m128i lo_bytes = _mm_set1_epi32(0x00FF00FF);

   /* A and G (bytes 3 and 1): move them down into the low byte of each
    * 16-bit lane so the 16-bit product cannot overflow; the high byte of
    * the product is then already back at the channel's original position. */
   __m128i ag = _mm_srli_epi32(_mm_and_si128(c, hi_bytes), 8);
   ag = _mm_and_si128(_mm_mullo_epi16(ag, a), hi_bytes);

   /* R and B (bytes 2 and 0): multiply in place, then bring the high byte
    * of each product down. */
   __m128i rb = _mm_mullo_epi16(_mm_and_si128(c, lo_bytes), a);
   rb = _mm_and_si128(_mm_srli_epi32(rb, 8), lo_bytes);

   /* The two halves occupy disjoint bytes, so the add cannot carry. */
   return _mm_add_epi32(ag, rb);
}
32

33
/**
 * Blend two groups of four pixels with a per-pixel weight.
 *
 * For every 8-bit channel the result is
 *    (c0 * w + c1 * (256 - w)) >> 8
 * computed in 16-bit lanes as ((c0 - c1) * w + (c1 << 8)) >> 8.
 *
 * @param a  Weights: each 32-bit element carries the weight of the matching
 *           pixel in its low 16 bits. The high 16 bits are expected to be
 *           zero (the callers in this file pass _mm_set1_epi32(const_alpha)).
 * @param c0 Four pixels weighted by w.
 * @param c1 Four pixels weighted by 256 - w.
 * @return The four blended pixels.
 */
static inline __m128i
v4_interpolate_color_sse2(__m128i a, __m128i c0, __m128i c1)
{
   const __m128i zero = _mm_setzero_si128();

   /* Replicate each pixel's weight into the four 16-bit lanes that will
    * hold that pixel's channels: lo = pixels 0 and 1, hi = pixels 2 and 3. */
   __m128i w_lo = _mm_unpacklo_epi16(a, a);
   __m128i w_hi = _mm_unpackhi_epi16(a, a);
   w_lo = _mm_add_epi32(w_lo, _mm_slli_epi64(w_lo, 32));
   w_hi = _mm_add_epi32(w_hi, _mm_slli_epi64(w_hi, 32));

   /* Widen the 8-bit channels to 16 bits. */
   const __m128i c0_lo = _mm_unpacklo_epi8(c0, zero);
   const __m128i c0_hi = _mm_unpackhi_epi8(c0, zero);
   const __m128i c1_lo = _mm_unpacklo_epi8(c1, zero);
   const __m128i c1_hi = _mm_unpackhi_epi8(c1, zero);

   /* (c0 - c1) * w + c1 * 256; the wrap-around of the 16-bit arithmetic
    * cancels out, leaving c0 * w + c1 * (256 - w). */
   __m128i mix_lo = _mm_mullo_epi16(_mm_sub_epi16(c0_lo, c1_lo), w_lo);
   __m128i mix_hi = _mm_mullo_epi16(_mm_sub_epi16(c0_hi, c1_hi), w_hi);
   mix_lo = _mm_add_epi16(mix_lo, _mm_slli_epi16(c1_lo, 8));
   mix_hi = _mm_add_epi16(mix_hi, _mm_slli_epi16(c1_hi, 8));

   /* Keep only the high byte of every 16-bit lane (divide by 256). */
   mix_lo = _mm_srli_epi16(mix_lo, 8);
   mix_hi = _mm_srli_epi16(mix_hi, 8);

   /* Narrow back to bytes: pixels 0-1 land in the low half, 2-3 in the high. */
   return _mm_packus_epi16(mix_lo, mix_hi);
}
85

86
/**
 * Multiply four pixels by four pixels, channel by channel.
 *
 * Every 8-bit channel is computed as (x * y + 255) >> 8, so multiplying by
 * 0xFF leaves a channel unchanged.
 *
 * @param x Four 32-bit pixels.
 * @param y Four 32-bit pixels (typically a broadcast colour multiplier).
 * @return The four channel-wise products.
 */
static inline __m128i
v4_mul_color_sse2(__m128i x, __m128i y)
{
   const __m128i zero = _mm_setzero_si128();
   /* The +255 bias has to be present in every 16-bit lane; leaving lanes
    * out makes identical pixels multiply differently depending on their
    * position in the vector. 255 * 255 + 255 still fits in 16 bits. */
   const __m128i bias = _mm_set1_epi16(0x00FF);

   /* Widen channels to 16 bits: lo = pixels 0 and 1, hi = pixels 2 and 3. */
   const __m128i x_lo = _mm_unpacklo_epi8(x, zero);
   const __m128i x_hi = _mm_unpackhi_epi8(x, zero);
   const __m128i y_lo = _mm_unpacklo_epi8(y, zero);
   const __m128i y_hi = _mm_unpackhi_epi8(y, zero);

   __m128i prod_lo = _mm_add_epi16(_mm_mullo_epi16(x_lo, y_lo), bias);
   __m128i prod_hi = _mm_add_epi16(_mm_mullo_epi16(x_hi, y_hi), bias);

   prod_lo = _mm_srli_epi16(prod_lo, 8);
   prod_hi = _mm_srli_epi16(prod_hi, 8);

   return _mm_packus_epi16(prod_lo, prod_hi);
}
109

110
/**
 * Compute the inverse alpha (255 - alpha) of four pixels.
 *
 * Alpha is taken from the top byte of each 32-bit pixel; each 32-bit element
 * of the result holds 255 - alpha in its low byte.
 */
static inline __m128i
v4_ialpha_sse2(__m128i c)
{
   const __m128i full = _mm_set1_epi32(0xff);
   const __m128i alpha = _mm_srli_epi32(c, 24);

   return _mm_sub_epi32(full, alpha);
}
117

118
// dest = color + (dest * alpha)
119
inline static void
120
comp_func_helper_sse2(uint32_t *dest, int length, uint32_t color, uint32_t alpha)
121
{
122
   const __m128i v_color = _mm_set1_epi32(color);
123
   const __m128i v_a = _mm_set1_epi16(alpha);
124

125
   LOOP_ALIGNED_U1_A4(dest, length,
126
      { /* UOP */
127
         *dest = color + DRAW_BYTE_MUL(*dest, alpha);
128
         dest++; length--;
129
      },
130
      { /* A4OP */
131
         __m128i v_dest = _mm_load_si128((__m128i *)dest);
132

133
         v_dest = v4_byte_mul_sse2(v_dest, v_a);
134
         v_dest = _mm_add_epi32(v_dest, v_color);
135

136
         _mm_store_si128((__m128i *)dest, v_dest);
137

138
         dest += 4; length -= 4;
139
      })
140
}
141

142
/**
 * Solid-colour COPY compositor.
 *
 * With const_alpha == 255 the span is simply filled with @p color.
 * Otherwise the colour is scaled by const_alpha and added to the destination
 * scaled by (255 - const_alpha).
 */
void
comp_func_solid_source_sse2(uint32_t *dest, int length, uint32_t color, uint32_t const_alpha)
{
   if (const_alpha != 255)
     {
        const int inv_alpha = 255 - const_alpha;
        const uint32_t scaled = DRAW_BYTE_MUL(color, const_alpha);

        comp_func_helper_sse2(dest, length, scaled, inv_alpha);
        return;
     }

   draw_memset32(dest, color, length);
}
158

159
/**
 * Solid-colour BLEND (source-over) compositor.
 *
 * The colour is first scaled by const_alpha (skipped when it is 255), then
 * the destination is scaled by the inverse alpha of that colour and the
 * colour is added on top.
 */
void
comp_func_solid_source_over_sse2(uint32_t *dest, int length, uint32_t color, uint32_t const_alpha)
{
   uint32_t scaled = color;

   if (const_alpha != 255)
     scaled = DRAW_BYTE_MUL(color, const_alpha);

   const int inv_alpha = alpha_inverse(scaled);

   comp_func_helper_sse2(dest, length, scaled, inv_alpha);
}
169

170
// Building blocks for the A4OP (four pixels at a time) loop bodies below.
// They expand to plain statements and share the local names src, dest,
// length, v_src, v_dest, v_color and v_alpha with the function using them,
// so they cannot be wrapped in do { } while (0).

// Load four src pixels (unaligned load) and four dest pixels (aligned load)
// into the new locals v_src and v_dest
#define V4_FETCH_SRC_DEST \
  __m128i v_src = _mm_loadu_si128((const __m128i *)src); \
  __m128i v_dest = _mm_load_si128((const __m128i *)dest);

// Load four src pixels (unaligned load) into the new local v_src
#define V4_FETCH_SRC \
  __m128i v_src = _mm_loadu_si128((const __m128i *)src);

// Write v_src, which holds the final result, to dest (aligned store)
#define V4_STORE_DEST \
  _mm_store_si128((__m128i *)dest, v_src);

// Advance to the next group of four pixels
#define V4_SRC_DEST_LEN_INC \
  dest += 4; src +=4; length -= 4;

// Multiply src color with color multiplier
#define V4_COLOR_MULTIPLY \
  v_src = v4_mul_color_sse2(v_src, v_color);

// Multiply src color with const_alpha
#define V4_ALPHA_MULTIPLY \
  v_src = v4_byte_mul_sse2(v_src, v_alpha);

// dest = src + dest * sia
// (sia is replicated into both 16-bit lanes, the 0x00AA00AA layout that
// v4_byte_mul_sse2 expects)
#define V4_COMP_OP_SRC_OVER \
  __m128i v_sia = v4_ialpha_sse2(v_src); \
  v_sia = _mm_add_epi32(v_sia, _mm_slli_epi32(v_sia, 16)); \
  v_dest = v4_byte_mul_sse2(v_dest, v_sia); \
  v_src = _mm_add_epi32(v_src, v_dest);

// dest = src * alpha + dest * (1 - alpha), with v_alpha as the weight
// (see v4_interpolate_color_sse2)
#define V4_COMP_OP_SRC \
  v_src = v4_interpolate_color_sse2(v_alpha, v_src, v_dest);
202

203
static void
204
comp_func_source_sse2(uint32_t *dest, const uint32_t *src, int length, uint32_t color, uint32_t const_alpha)
205
{
206
   int ialpha;
207
   uint32_t src_color;
208

209
   if (color == 0xffffffff) // No color multiplier
210
     {
211
        if (const_alpha == 255)
212
          {
213
             memcpy(dest, src, length * sizeof(uint32_t));
214
          }
215
        else
216
          {
217
             ialpha = 255 - const_alpha;
218
             __m128i v_alpha = _mm_set1_epi32(const_alpha);
219

220
             LOOP_ALIGNED_U1_A4(dest, length,
221
               { /* UOP */
222
                  *dest = draw_interpolate_256(*src, const_alpha, *dest, ialpha);
223
                  dest++; src++; length--;
224
               },
225
               { /* A4OP */
226
                  V4_FETCH_SRC_DEST
227
                  V4_COMP_OP_SRC
228
                  V4_STORE_DEST
229
                  V4_SRC_DEST_LEN_INC
230
               })
231
          }
232
     }
233
   else
234
     {
235
        __m128i v_color = _mm_set1_epi32(color);
236

237
        if (const_alpha == 255)
238
          {
239
             LOOP_ALIGNED_U1_A4(dest, length,
240
               { /* UOP */
241
                  *dest = DRAW_MUL4_SYM(*src, color);
242
                  dest++; src++; length--;
243
               },
244
               { /* A4OP */
245
                  V4_FETCH_SRC
246
                  V4_COLOR_MULTIPLY
247
                  V4_STORE_DEST
248
                  V4_SRC_DEST_LEN_INC
249
               })
250
          }
251
        else
252
          {
253
             ialpha = 255 - const_alpha;
254
             __m128i v_alpha = _mm_set1_epi32(const_alpha);
255

256
             LOOP_ALIGNED_U1_A4(dest, length,
257
               { /* UOP */
258
                  src_color = DRAW_MUL4_SYM(*src, color);
259
                  *dest = draw_interpolate_256(src_color, const_alpha, *dest, ialpha);
260
                  dest++; src++; length--;
261
               },
262
               { /* A4OP */
263
                  V4_FETCH_SRC_DEST
264
                  V4_COLOR_MULTIPLY
265
                  V4_COMP_OP_SRC
266
                  V4_STORE_DEST
267
                  V4_SRC_DEST_LEN_INC
268
               })
269
          }
270
     }
271
}
272

273
static void
274
comp_func_source_over_sse2(uint32_t *dest, const uint32_t *src, int length, uint32_t color, uint32_t const_alpha)
275
{
276
   uint32_t s, sia;
277

278
   if (const_alpha != 255)
279
     color = DRAW_BYTE_MUL(color, const_alpha);
280

281
   if (color == 0xffffffff) // No color multiplier
282
     {
283
        LOOP_ALIGNED_U1_A4(dest, length,
284
         { /* UOP */
285
            s = *src;
286
            sia = alpha_inverse(s);
287
            *dest = s + DRAW_BYTE_MUL(*dest, sia);
288
            dest++; src++; length--;
289
         },
290
         { /* A4OP */
291
            V4_FETCH_SRC_DEST
292
            V4_COMP_OP_SRC_OVER
293
            V4_STORE_DEST
294
            V4_SRC_DEST_LEN_INC
295
         })
296
     }
297
   else
298
     {
299
        __m128i v_color = _mm_set1_epi32(color);
300

301
        LOOP_ALIGNED_U1_A4(dest, length,
302
         { /* UOP */
303
            s = DRAW_MUL4_SYM(*src, color);
304
            sia = alpha_inverse(s);
305
            *dest = s + DRAW_BYTE_MUL(*dest, sia);
306
            dest++; src++; length--;
307
         },
308
         { /* A4OP */
309
            V4_FETCH_SRC_DEST
310
            V4_COLOR_MULTIPLY
311
            V4_COMP_OP_SRC_OVER
312
            V4_STORE_DEST
313
            V4_SRC_DEST_LEN_INC
314
         })
315
     }
316
}
317

318
#endif
319

320
/**
 * Install the SSE2 compositors into the dispatch tables.
 *
 * When the file is built with BUILD_SSE3 and the CPU reports SSE2 support,
 * the COPY and BLEND entries of func_for_mode_solid and func_for_mode are
 * replaced by the SSE2 implementations from this file. Otherwise the tables
 * are left untouched.
 */
void
efl_draw_sse2_init(void)
{
#ifdef BUILD_SSE3
   if (!(eina_cpu_features_get() & EINA_CPU_SSE2))
     return;

   // update the comp_function table for solid color
   func_for_mode_solid[EFL_GFX_RENDER_OP_COPY] = comp_func_solid_source_sse2;
   func_for_mode_solid[EFL_GFX_RENDER_OP_BLEND] = comp_func_solid_source_over_sse2;

   // update the comp_function table for source data
   func_for_mode[EFL_GFX_RENDER_OP_COPY] = comp_func_source_sse2;
   func_for_mode[EFL_GFX_RENDER_OP_BLEND] = comp_func_source_over_sse2;
#endif
}
336

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.