efl
335 строк · 8.8 Кб
1#ifdef HAVE_CONFIG_H2#include "config.h"3#endif4
5#include "draw_private.h"6
7#ifdef BUILD_SSE38#include <immintrin.h>9
/*
 * Scale every 8-bit channel of the four packed 32-bit pixels in `c` by the
 * matching 16-bit factor in `a`, keeping the high byte of each 16-bit
 * product (channel * factor / 256, truncated).
 *
 * Each 32-bit component of `a` must be in the form 0x00AA00AA.
 */
static inline __m128i
v4_byte_mul_sse2(__m128i c, __m128i a)
{
   const __m128i low_bytes = _mm_set1_epi32(0x00FF00FF);
   const __m128i high_bytes = _mm_set1_epi32(0xFF00FF00);

   /* AG: bring the odd bytes down into their own 16-bit lanes, multiply,
    * and keep the high byte of each product where it already sits. */
   const __m128i ag = _mm_srli_epi16(c, 8);
   const __m128i ag_scaled = _mm_and_si128(_mm_mullo_epi16(a, ag), high_bytes);

   /* RB: isolate the even bytes, multiply, and move the high byte of each
    * product down into the low byte of its lane. */
   const __m128i rb = _mm_and_si128(c, low_bytes);
   const __m128i rb_scaled = _mm_srli_epi16(_mm_mullo_epi16(a, rb), 8);

   /* The two halves occupy disjoint bytes, so OR merges them. */
   return _mm_or_si128(ag_scaled, rb_scaled);
}
32
/*
 * Blend four packed 32-bit pixels of c0 over c1, channel by channel:
 *
 *    result = ((c0 - c1) * a + (c1 << 8)) >> 8
 *
 * evaluated in 16-bit lanes, which for weights up to 256 equals
 * (c0 * a + c1 * (256 - a)) / 256, truncated.
 *
 * a      - per-pixel weight. Each 32-bit element is expected to hold the
 *          weight in its low 16 bits with the upper 16 bits zero (the
 *          callers in this file pass _mm_set1_epi32(const_alpha)).
 * c0, c1 - four packed 8-bit-per-channel pixels each.
 *
 * Returns the four blended pixels in the same lane order as the inputs.
 */
static inline __m128i
v4_interpolate_color_sse2(__m128i a, __m128i c0, __m128i c1)
{
   const __m128i zero = _mm_setzero_si128();

   /* Replicate each pixel's weight into the four 16-bit lanes that will
    * hold that pixel's channels: low half covers pixels 0-1, high half
    * covers pixels 2-3. */
   __m128i a_l = _mm_unpacklo_epi16(a, a);
   __m128i a_h = _mm_unpackhi_epi16(a, a);

   a_l = _mm_add_epi32(a_l, _mm_slli_epi64(a_l, 32));
   a_h = _mm_add_epi32(a_h, _mm_slli_epi64(a_h, 32));

   /* Widen every 8-bit channel to 16 bits. */
   const __m128i c0_l = _mm_unpacklo_epi8(c0, zero);
   const __m128i c0_h = _mm_unpackhi_epi8(c0, zero);
   const __m128i c1_l = _mm_unpacklo_epi8(c1, zero);
   const __m128i c1_h = _mm_unpackhi_epi8(c1, zero);

   /* (c0 - c1) * a + (c1 << 8); intermediate wrap-around is harmless
    * because the arithmetic is modulo 2^16 throughout. */
   __m128i r_l = _mm_mullo_epi16(_mm_sub_epi16(c0_l, c1_l), a_l);
   __m128i r_h = _mm_mullo_epi16(_mm_sub_epi16(c0_h, c1_h), a_h);

   r_l = _mm_add_epi16(r_l, _mm_slli_epi16(c1_l, 8));
   r_h = _mm_add_epi16(r_h, _mm_slli_epi16(c1_h, 8));

   /* The blended channel is the high byte of each 16-bit lane. */
   r_l = _mm_srli_epi16(r_l, 8);
   r_h = _mm_srli_epi16(r_h, 8);

   /* Narrow back to bytes: pixels 0-1 from r_l, pixels 2-3 from r_h.
    * Packing both halves in one step needs no cast between integer and
    * float vector types, which is a compiler extension. */
   return _mm_packus_epi16(r_l, r_h);
}
85
/*
 * Multiply four packed 32-bit pixels channel by channel:
 *
 *    result = (x * y + 0xFF) >> 8      for each 8-bit channel
 *
 * The 0xFF bias makes the product symmetric at the top of the range:
 * multiplying a channel by 0xFF returns the channel unchanged, and
 * 0xFF * 0xFF yields 0xFF.
 *
 * x, y - four packed 8-bit-per-channel pixels each.
 *
 * Returns the four multiplied pixels in the same lane order as the inputs.
 */
static inline __m128i
v4_mul_color_sse2(__m128i x, __m128i y)
{
   const __m128i zero = _mm_setzero_si128();
   /* The bias must be present in every 16-bit lane. Leaving one channel
    * of each pixel unbiased would make that channel come out one lower
    * than the others whenever it is multiplied by 0xFF. */
   const __m128i round_bias = _mm_set1_epi16(0x00FF);

   /* Widen every 8-bit channel to 16 bits: low half = pixels 0-1,
    * high half = pixels 2-3. */
   const __m128i x_l = _mm_unpacklo_epi8(x, zero);
   const __m128i x_h = _mm_unpackhi_epi8(x, zero);
   const __m128i y_l = _mm_unpacklo_epi8(y, zero);
   const __m128i y_h = _mm_unpackhi_epi8(y, zero);

   /* 255 * 255 + 255 = 65280 still fits in an unsigned 16-bit lane. */
   __m128i r_l = _mm_add_epi16(_mm_mullo_epi16(x_l, y_l), round_bias);
   __m128i r_h = _mm_add_epi16(_mm_mullo_epi16(x_h, y_h), round_bias);

   r_l = _mm_srli_epi16(r_l, 8);
   r_h = _mm_srli_epi16(r_h, 8);

   return _mm_packus_epi16(r_l, r_h);
}
109
/*
 * For each of the four packed 32-bit pixels in `c`, return 255 minus the
 * top byte of the pixel as a 32-bit value (0..255).
 */
static inline __m128i
v4_ialpha_sse2(__m128i c)
{
   const __m128i byte_max = _mm_set1_epi32(0xff);
   const __m128i top_byte = _mm_srli_epi32(c, 24);

   /* top_byte is within 0..255, so flipping its low eight bits is the
    * same as subtracting it from 255. */
   return _mm_xor_si128(top_byte, byte_max);
}
117
118// dest = color + (dest * alpha)
119inline static void120comp_func_helper_sse2(uint32_t *dest, int length, uint32_t color, uint32_t alpha)121{
122const __m128i v_color = _mm_set1_epi32(color);123const __m128i v_a = _mm_set1_epi16(alpha);124
125LOOP_ALIGNED_U1_A4(dest, length,126{ /* UOP */127*dest = color + DRAW_BYTE_MUL(*dest, alpha);128dest++; length--;129},130{ /* A4OP */131__m128i v_dest = _mm_load_si128((__m128i *)dest);132
133v_dest = v4_byte_mul_sse2(v_dest, v_a);134v_dest = _mm_add_epi32(v_dest, v_color);135
136_mm_store_si128((__m128i *)dest, v_dest);137
138dest += 4; length -= 4;139})140}
141
/*
 * Solid-colour "copy" span: writes `color` into `length` pixels at `dest`
 * with coverage `const_alpha`.
 *
 * With full coverage (255) the span is filled with `color` directly.
 * Otherwise `color` is scaled by const_alpha via DRAW_BYTE_MUL and added
 * to the destination scaled by (255 - const_alpha).
 */
void
comp_func_solid_source_sse2(uint32_t *dest, int length, uint32_t color, uint32_t const_alpha)
{
   if (const_alpha == 255)
     {
        draw_memset32(dest, color, length);
        return;
     }

   const int inv_alpha = 255 - const_alpha;
   const uint32_t scaled = DRAW_BYTE_MUL(color, const_alpha);

   comp_func_helper_sse2(dest, length, scaled, inv_alpha);
}
158
/*
 * Solid-colour "blend" span: composites `color` over `length` pixels at
 * `dest` with coverage `const_alpha`.
 *
 * The colour is first scaled by const_alpha (skipped at 255); the
 * destination is then scaled by alpha_inverse() of that colour and the
 * colour is added on top.
 */
void
comp_func_solid_source_over_sse2(uint32_t *dest, int length, uint32_t color, uint32_t const_alpha)
{
   const uint32_t scaled = (const_alpha == 255) ? color
                                                : DRAW_BYTE_MUL(color, const_alpha);
   const int inv_alpha = alpha_inverse(scaled);

   comp_func_helper_sse2(dest, length, scaled, inv_alpha);
}
169
/*
 * Building blocks for the four-pixel (A4OP) bodies below. They are plain
 * statement sequences, not expressions: they read and write the locals
 * `src`, `dest`, `length`, and declare or use `v_src`, `v_dest`, `v_sia`,
 * `v_color` and `v_alpha` in the scope where they are expanded.
 */

// Load src (unaligned load) and dest (aligned load) vectors
#define V4_FETCH_SRC_DEST \
   __m128i v_src = _mm_loadu_si128((__m128i *)src); \
   __m128i v_dest = _mm_load_si128((__m128i *)dest);

// Load src vector only (unaligned load)
#define V4_FETCH_SRC \
   __m128i v_src = _mm_loadu_si128((__m128i *)src);

// Write the result, held in v_src, back to dest (aligned store)
#define V4_STORE_DEST \
   _mm_store_si128((__m128i *)dest, v_src);

// Advance both pointers by four pixels and shrink the remaining length
#define V4_SRC_DEST_LEN_INC \
   dest += 4; src +=4; length -= 4;

// Multiply src color with color multiplier
#define V4_COLOR_MULTIPLY \
   v_src = v4_mul_color_sse2(v_src, v_color);

// Multiply src color with const_alpha
// (v_alpha must be in the 0x00AA00AA form v4_byte_mul_sse2 requires)
#define V4_ALPHA_MULTIPLY \
   v_src = v4_byte_mul_sse2(v_src, v_alpha);

// dest = src + dest * sia
// (sia is duplicated into both 16-bit halves to get the 0x00AA00AA form)
#define V4_COMP_OP_SRC_OVER \
   __m128i v_sia = v4_ialpha_sse2(v_src); \
   v_sia = _mm_add_epi32(v_sia, _mm_slli_epi32(v_sia, 16)); \
   v_dest = v4_byte_mul_sse2(v_dest, v_sia); \
   v_src = _mm_add_epi32(v_src, v_dest);

// dest = dest + (src - dest) * alpha / 256, per channel
#define V4_COMP_OP_SRC \
   v_src = v4_interpolate_color_sse2(v_alpha, v_src, v_dest);
203static void204comp_func_source_sse2(uint32_t *dest, const uint32_t *src, int length, uint32_t color, uint32_t const_alpha)205{
206int ialpha;207uint32_t src_color;208
209if (color == 0xffffffff) // No color multiplier210{211if (const_alpha == 255)212{213memcpy(dest, src, length * sizeof(uint32_t));214}215else216{217ialpha = 255 - const_alpha;218__m128i v_alpha = _mm_set1_epi32(const_alpha);219
220LOOP_ALIGNED_U1_A4(dest, length,221{ /* UOP */222*dest = draw_interpolate_256(*src, const_alpha, *dest, ialpha);223dest++; src++; length--;224},225{ /* A4OP */226V4_FETCH_SRC_DEST
227V4_COMP_OP_SRC
228V4_STORE_DEST
229V4_SRC_DEST_LEN_INC
230})231}232}233else234{235__m128i v_color = _mm_set1_epi32(color);236
237if (const_alpha == 255)238{239LOOP_ALIGNED_U1_A4(dest, length,240{ /* UOP */241*dest = DRAW_MUL4_SYM(*src, color);242dest++; src++; length--;243},244{ /* A4OP */245V4_FETCH_SRC
246V4_COLOR_MULTIPLY
247V4_STORE_DEST
248V4_SRC_DEST_LEN_INC
249})250}251else252{253ialpha = 255 - const_alpha;254__m128i v_alpha = _mm_set1_epi32(const_alpha);255
256LOOP_ALIGNED_U1_A4(dest, length,257{ /* UOP */258src_color = DRAW_MUL4_SYM(*src, color);259*dest = draw_interpolate_256(src_color, const_alpha, *dest, ialpha);260dest++; src++; length--;261},262{ /* A4OP */263V4_FETCH_SRC_DEST
264V4_COLOR_MULTIPLY
265V4_COMP_OP_SRC
266V4_STORE_DEST
267V4_SRC_DEST_LEN_INC
268})269}270}271}
272
273static void274comp_func_source_over_sse2(uint32_t *dest, const uint32_t *src, int length, uint32_t color, uint32_t const_alpha)275{
276uint32_t s, sia;277
278if (const_alpha != 255)279color = DRAW_BYTE_MUL(color, const_alpha);280
281if (color == 0xffffffff) // No color multiplier282{283LOOP_ALIGNED_U1_A4(dest, length,284{ /* UOP */285s = *src;286sia = alpha_inverse(s);287*dest = s + DRAW_BYTE_MUL(*dest, sia);288dest++; src++; length--;289},290{ /* A4OP */291V4_FETCH_SRC_DEST
292V4_COMP_OP_SRC_OVER
293V4_STORE_DEST
294V4_SRC_DEST_LEN_INC
295})296}297else298{299__m128i v_color = _mm_set1_epi32(color);300
301LOOP_ALIGNED_U1_A4(dest, length,302{ /* UOP */303s = DRAW_MUL4_SYM(*src, color);304sia = alpha_inverse(s);305*dest = s + DRAW_BYTE_MUL(*dest, sia);306dest++; src++; length--;307},308{ /* A4OP */309V4_FETCH_SRC_DEST
310V4_COLOR_MULTIPLY
311V4_COMP_OP_SRC_OVER
312V4_STORE_DEST
313V4_SRC_DEST_LEN_INC
314})315}316}
317
318#endif319
/*
 * Install the SSE2 span compositors into the dispatch tables.
 *
 * When the running CPU reports EINA_CPU_SSE2, the COPY and BLEND entries
 * of func_for_mode_solid and func_for_mode are replaced with the SSE2
 * routines from this file. Otherwise, or when the SSE code is compiled
 * out, the tables are left untouched.
 *
 * NOTE(review): the compile-time guard is BUILD_SSE3 although everything
 * selected here is SSE2 -- confirm this is the intended build switch.
 */
void
efl_draw_sse2_init(void)
{
#ifdef BUILD_SSE3
   if (eina_cpu_features_get() & EINA_CPU_SSE2)
     {
        // update the comp_function table for solid color
        func_for_mode_solid[EFL_GFX_RENDER_OP_COPY] = comp_func_solid_source_sse2;
        func_for_mode_solid[EFL_GFX_RENDER_OP_BLEND] = comp_func_solid_source_over_sse2;

        // update the comp_function table for source data
        func_for_mode[EFL_GFX_RENDER_OP_COPY] = comp_func_source_sse2;
        func_for_mode[EFL_GFX_RENDER_OP_BLEND] = comp_func_source_over_sse2;
     }
#endif
}
336