#ifndef _SSE_H #define _SSE_H typedef REAL vReal __attribute__((mode(V4SF),aligned(16))); typedef struct { REAL re, im; } complex; typedef struct { vReal re, im; } vcomplex; typedef struct SU3 { complex v[3][3]; } SU3; typedef struct { vcomplex v[3][3]; } vSU3; typedef struct { vcomplex f[4][3]; } vFermion; typedef struct { vcomplex f[2][3]; } vHalfFermion; typedef struct { vFermion f; } vEvenFermion; typedef struct { vFermion f; } vOddFermion; static inline vReal vmk1(REAL a) { vReal v = __builtin_ia32_loadss((float *)&a); asm("shufps\t$0,%0,%0" : "+x" (v)); return v; } static inline vReal vmk4(REAL a0, REAL a1, REAL a2, REAL a3) { vReal v; REAL *r = (REAL *)&v; r[0] = a0; r[1] = a1; r[2] = a2; r[3] = a3; return v; } static inline REAL vsum(vReal v) { REAL *vv = (REAL *)&v; return vv[0] + vv[1] + vv[2] + vv[3]; } static inline void vput_3(vReal *v, REAL a3) { ((REAL *)v)[3] = a3; } static inline void vput_0(vReal *v, REAL a0) { ((REAL *)v)[0] = a0; } static inline vReal shift_up1(vReal a, vReal b) { vReal x = a; vReal y = b; asm("shufps\t$0x30,%0,%1\n\t" "shufps\t$0x29,%1,%0" : "+x" (x), "+x" (y)); return x; } static inline vReal shift_up2(vReal a, vReal b) { vReal x = a; asm("shufps\t$0x4e,%1,%0" : "+x" (x): "x" (b)); return x; } static inline vReal shift_up3(vReal a, vReal b) { vReal x = a; asm("shufps\t$0x03,%1,%0\n\t" "shufps\t$0x9c,%1,%0" : "+x" (x): "x" (b)); return x; } static inline vReal shift_down1(vReal a, vReal b) { return shift_up3(a, b); } static inline vReal shift_down2(vReal a, vReal b) { return shift_up2(a, b); } static inline vReal shift_down3(vReal a, vReal b) { return shift_up1(a, b); } static inline void vhfzero(vHalfFermion *v) { vReal z = vmk1(0.0); v->f[0][0].re = v->f[0][0].im = v->f[0][1].re = v->f[0][1].im = v->f[0][2].re = v->f[0][2].im = v->f[1][0].re = v->f[1][0].im = v->f[1][1].re = v->f[1][1].im = v->f[1][2].re = v->f[1][2].im = z; } #endif