#include #include "sse-dwf-cg.h" #include #include #define Vs 4 /* Length of SSE vector */ #define REAL float /* floating point type compatible with vReal */ #include #define PAD16(size) (15+(size)) #define ALIGN16(addr) ((void *)(~15 & (15 + (size_t)(addr)))) #define Nc 3 /* Number of colors */ #define DIM 4 /* number of dimensions */ #define Fd 4 /* Fermion representation dimension */ struct SSE_DWF_Fermion { vEvenFermion *even; vOddFermion *odd; }; struct SSE_DWF_Gauge { complex v[Nc][Nc]; }; struct memblock { struct memblock *next; struct memblock *prev; void *data; size_t size; }; struct bounds { int lo[DIM]; int hi[DIM]; }; struct neighbor { int size; /* size of site table */ int inside_size; /* number of inside sites */ int boundary_size; /* number of boundary sites */ int snd_size[2*DIM]; /* size of send buffers in 8 dirs */ int rcv_size[2*DIM]; /* size of receive buffers */ int *snd[2*DIM]; /* i->x translation for send buffers */ int *inside; /* i->x translation for inside sites */ struct boundary *boundary; /* i->x,mask translation for boundary */ struct site *site; /* x->site translation for sites */ vHalfFermion *snd_buf[2*DIM]; /* Send buffers */ vHalfFermion *rcv_buf[2*DIM]; /* Receive buffers */ int qmp_size[4*DIM]; /* sizes of QMP buffers */ void *qmp_xbuf[4*DIM]; /* QMP snd/rcv buffer addresses */ vHalfFermion *qmp_buf[4*DIM]; /* send and receive buffers for QMP */ QMP_msgmem_t qmp_mm[4*DIM]; /* msgmem's for send and receive */ int Nx; /* number of msegs */ QMP_msghandle_t qmp_sh[2*DIM]; /* handles for sends */ QMP_msghandle_t qmp_sv[2*DIM]; /* copies of handles for finilization */ int qmp_smask; /* send flags for qmp_sh[] */ int Ns; /* number of send handles */ QMP_msghandle_t qmp_rh[2*DIM]; /* handles for receives */ int Nr; /* number of receive handles */ QMP_msghandle_t qmp_cr; /* common receive handle */ }; struct boundary { int index; /* x-index of this boundary site */ int mask; /* bitmask of the borders */ }; struct site { int Uup; /* up-links are Uup, Uup+1, Uup+2, Uup+3 */ int Udown[DIM]; /* four down-links */ int F[2*DIM]; /* eight neighboring fermions on the other sublattice */ }; static int inited_p = 0; static void *(*tmalloc)(size_t size); static void (*tfree)(void *ptr); static int tlattice[DIM+1]; static SU3 *U; static struct memblock memblock = { &memblock, &memblock, NULL, 0 }; static int network[DIM]; static int coord[DIM]; static struct bounds bounds; static int gauge_XYZT; static int S_4, S_4_1; static struct neighbor neighbor; static struct neighbor odd_even; static struct neighbor even_odd; static vOddFermion *auxA_o, *auxB_o, *Phi_o; static vEvenFermion *auxA_e; static vOddFermion *r_o, *p_o, *q_o; static vEvenFermion *auxB_e; static REAL inv_a; static REAL b_over_a; static vReal vfx_A; static vReal vfx_B; static vReal vab; static REAL c0; static struct neighbor *sending = 0; static vEvenFermion *allocate_even_fermion(void); static vOddFermion *allocate_odd_fermion(void); static SSE_DWF_Gauge *allocate_gauge_field(void); /* vFermion *allocate_subfermion(int size); */ static inline vReal import_vector(const void *z, void *env, SSE_DWF_fermion_reader reader, int x[DIM+1], int c, int d, int re_im) { vReal f; REAL *v = (REAL *)&f; int i, xs; for (xs = x[DIM], i = 0; i < Vs; i++, x[DIM]++) { *v++ = reader(z, env, x, c, d, re_im); } x[DIM] = xs; return f; } static inline void save_vector(void *z, void *env, SSE_DWF_fermion_writer writer, int x[DIM+1], int c, int d, int re_im, vReal *f) { REAL *v = (REAL *)f; int i, xs; for (xs = x[DIM], i = 0; i < Vs; i++, x[DIM]++) { writer(z, env, x, c, d, re_im, *v++); } x[DIM] = xs; } static inline int lattice_start(int lat, int net, int coord) { int q = lat / net; int r = lat % net; return coord * q + ((coord < r)? coord: r); } static inline void mk_sublattice(struct bounds *bounds, int coord[]) { int i; for (i = 0; i < DIM; i++) { bounds->lo[i] = lattice_start(tlattice[i], network[i], coord[i]); bounds->hi[i] = lattice_start(tlattice[i], network[i], coord[i] + 1); } } static void init_neighbor(struct bounds *bounds, struct neighbor *neighbor); static void build_neighbor(struct neighbor *out, struct bounds *bounds, int parity, struct neighbor *in); static void construct_rec(struct neighbor *out, int par, struct bounds *bounds, int dir, int step); static int to_HFlinear(int p[], struct bounds *b, int q, int z) { int x, d; for (x = 0, d = 4; d--;) { int v = p[d] + ((d == q)?z:0); int s = b->hi[d] - b->lo[d]; if (v < 0) v += tlattice[d]; if (v >= tlattice[d]) v -= tlattice[d]; x = x * s + v - b->lo[d]; } return x / 2; } static int to_Ulinear(int p[], struct bounds *b, int q) { int x, d; if ((q < 0) || (p[q] > b->lo[q]) || (network[q] < 2)) { for (x = 0, d = 4; d--;) { int s = b->hi[d] - b->lo[d]; int v = p[d] - ((q == d)?1:0); if (v < 0) v += tlattice[d]; x = x * s + v - b->lo[d]; } return 4 * x + ((q < 0)?0:q); } else { int s0, v0; for (d = 0, v0 = 1; d < 4; d++) v0 *= b->hi[d] - b->lo[d]; for (d = 0, s0 = 4 * v0; d < q; d++) s0 += v0 / (b->hi[d] - b->lo[d]); for (d = 4, x = 0; d--;) { int s = b->hi[d] - b->lo[d]; int v = p[d]; if (d == q) continue; x = x * s + v - b->lo[d]; } return s0 + x; } } static int build_buffers(struct neighbor *nb); static int make_buffer(struct neighbor *nb, int size); static void make_send(struct neighbor *nb, int k, int i, int d); static int make_receive(struct neighbor *nb, int k, int i, int d, QMP_msghandle_t Rh[2*DIM], int Nr); static void sse_aligned_buffer(struct neighbor *nb, int k, int size); static void free_buffers(struct neighbor *nb); static int cg(vOddFermion *psi, const vOddFermion *b, const vOddFermion *x0, double epsilon, int max_iter, double *out_eps, int *out_iter); static void copy_o(vOddFermion *dst, const vOddFermion *src); static void compute_sum2_o(vOddFermion *dst, double alpha, const vOddFermion *src); static void compute_sum2x_o(vOddFermion *dst, const vOddFermion *src, double alpha); static void compute_sum_e(vEvenFermion *d, const vEvenFermion *x, double alpha, const vEvenFermion *y); static void compute_sum_o(vOddFermion *d, const vOddFermion *x, double alpha, const vOddFermion *y); static void compute_sum_oN(vOddFermion *d, double *norm, const vOddFermion *x, double alpha, const vOddFermion *y); static void compute_sum2_oN(vOddFermion *d, double *norm, double alpha, const vOddFermion *y); static void compute_MxM(vOddFermion *eta, double *norm, const vOddFermion *psi); static void compute_M(vOddFermion *eta, double *norm, const vOddFermion *psi); static void compute_Mx(vOddFermion *eta, const vOddFermion *psi); static void compute_Qxx1(vFermion *eta, const vFermion *psi, int xyzt); static void inline compute_Qee1(vEvenFermion *eta, const vEvenFermion *psi) { compute_Qxx1(&eta->f, &psi->f, even_odd.size); } static void inline compute_Qoo1(vOddFermion *eta, const vOddFermion *psi) { compute_Qxx1(&eta->f, &psi->f, odd_even.size); } static void compute_Soo1(vOddFermion *eta, const vOddFermion *psi); static void compute_Qxy(vFermion *d, const vFermion *s, struct neighbor *nb); static void inline compute_Qoe(vOddFermion *d, const vEvenFermion *s) { compute_Qxy(&d->f, &s->f, &odd_even); } static void inline compute_Qeo(vEvenFermion *d, const vOddFermion *s) { compute_Qxy(&d->f, &s->f, &even_odd); } static void compute_1Sxy(vFermion *d, const vFermion *q, const vFermion *s, struct neighbor *nb); static void inline compute_1Soe(vOddFermion *d, const vOddFermion *q, const vEvenFermion *s) { compute_1Sxy(&d->f, &q->f, &s->f, &odd_even); } static void compute_Qxx1Qxy(vFermion *d, const vFermion *s, struct neighbor *nb); static void inline compute_Qee1Qeo(vEvenFermion *d, const vOddFermion *s) { compute_Qxx1Qxy(&d->f, &s->f, &even_odd); } static void compute_Sxx1Sxy(vFermion *d, const vFermion *s, struct neighbor *nb); static void inline compute_See1Seo(vEvenFermion *d, const vOddFermion *s) { compute_Sxx1Sxy(&d->f, &s->f, &even_odd); } static void compute_1Qxx1Qxy(vFermion *d, double *norm, const vFermion *q, const vFermion *s, struct neighbor *nb); static void inline compute_1Qoo1Qoe(vOddFermion *d, double *norm, const vOddFermion *q, const vEvenFermion *s) { compute_1Qxx1Qxy(&d->f, norm, &q->f, &s->f, &odd_even); } static inline int parity(const int x[DIM]) { int i, v; for (i = v = 0; i < DIM; i++) v += x[i]; return v & 1; } static void * alloc16(int size) { int xsize = PAD16(size + sizeof (struct memblock)); struct memblock *p = tmalloc(xsize); if (p == 0) return p; p->data = ALIGN16(&p[1]); p->size = size; p->next = memblock.next; p->prev = &memblock; p->next->prev = p; p->prev->next = p; return p->data; } static void free16(void *ptr) { struct memblock *p; if (ptr == 0) return; for (p = memblock.next; p != &memblock; p = p->next) { if (p->data != ptr) continue; p->next->prev = p->prev; p->prev->next = p->next; tfree(p); return; } /* this is BAD: control should reach here! */ } vEvenFermion * allocate_even_fermion(void) { return alloc16(even_odd.size * S_4 * sizeof (vFermion)); } vOddFermion * allocate_odd_fermion(void) { return alloc16(odd_even.size * S_4 * sizeof (vFermion)); } SSE_DWF_Gauge * allocate_gauge_field(void) { return alloc16(gauge_XYZT * sizeof (SSE_DWF_Gauge)); } static int init_tables(void) { struct neighbor tmp; int i, v; init_neighbor(&bounds, &neighbor); S_4 = tlattice[DIM] / 4; S_4_1 = S_4 - 1; for (v = 1, i = 0; i < DIM; i++) { v *= bounds.hi[i] - bounds.lo[i]; } gauge_XYZT = DIM * v; for (i = 0; i < DIM; i++) { if (network[i] < 2) continue; gauge_XYZT += v / (bounds.hi[i] - bounds.lo[i]); } tmp = neighbor; build_neighbor(&even_odd, &bounds, 0, &tmp); build_neighbor(&odd_even, &bounds, 1, &tmp); return 0; } static void init_neighbor(struct bounds *bounds, struct neighbor *neighbor) { int i; mk_sublattice(bounds, coord); neighbor->qmp_smask = 0; for (neighbor->size = 1, neighbor->inside_size = 1, i = 0; i < DIM; i++) { int ext = bounds->hi[i] - bounds->lo[i]; neighbor->size *= ext; if (network[i] > 1) neighbor->inside_size *= ext - 2; else neighbor->inside_size *= ext; } neighbor->boundary_size = neighbor->size - neighbor->inside_size; neighbor->site = tmalloc(neighbor->size * sizeof (struct site)); if (neighbor->inside_size) neighbor->inside = tmalloc(neighbor->inside_size * sizeof (int)); else neighbor->inside = 0; if (neighbor->boundary_size) neighbor->boundary = tmalloc(neighbor->boundary_size * sizeof (struct boundary)); else neighbor->boundary = 0; for (i = 0; i < 2 * DIM; i++) { int d = i / 2; if (network[d] > 1) { neighbor->snd_size[i] = neighbor->size / (bounds->hi[d] - bounds->lo[d]); neighbor->snd[i] = tmalloc(neighbor->snd_size[i] * sizeof (int)); } else { neighbor->snd_size[i] = 0; neighbor->snd[i] = 0; } } } static void build_neighbor(struct neighbor *out, struct bounds *bounds, int par, struct neighbor *in) { int i,d, s, p, m; int x[DIM]; *out = *in; out->size = 0; out->inside_size = 0; out->boundary_size = 0; for (d = 0; d < DIM; d++) { out->rcv_size[2*d] = out->snd_size[2*d] = 0; out->rcv_size[2*d+1] = out->snd_size[2*d+1] = 0; } for (i = 0; i < DIM; i++) x[i] = bounds->lo[i]; for (i = 0; i < DIM;) { s = parity(x); if (s != par) goto next; p = to_HFlinear(x, bounds, -1, 0); for (m = 0, d = 0; d < DIM; d++) { if (network[d] > 1) { if (x[d] == bounds->lo[d]) m |= 1 << (2 * d); if (x[d] + 1 == bounds->hi[d]) m |= 1 << (2 * d + 1); } } if (m) { in->boundary->index = p; in->boundary->mask = m; in->boundary++; out->boundary_size++; for (d = 0; d < 2*DIM; d++) { if ((m & (1 << d)) == 0) continue; *in->snd[d]++ = p; out->snd_size[d]++; } } else { *in->inside++ = p; out->inside_size++; } in->site->Uup = to_Ulinear(x, bounds, -1); for (d = 0; d < DIM; d++) { in->site->Udown[d] = to_Ulinear(x, bounds, d); if ((m & (1 << (2 * d))) == 0) in->site->F[2*d] = S_4 * to_HFlinear(x, bounds, d, -1); if ((m & (1 << (2 * d + 1))) == 0) in->site->F[2*d + 1] = S_4 * to_HFlinear(x, bounds, d, +1); } out->size++; in->site++; next: for (i = 0; i < DIM; i++) { if (++x[i] == bounds->hi[i]) x[i] = bounds->lo[i]; else break; } } for (d = 0; d < DIM; d++) { if (network[d] < 2) continue; construct_rec(out, par, bounds, d, +1); construct_rec(out, par, bounds, d, -1); } } static void construct_rec(struct neighbor *out, int par, struct bounds *bounds, int dir, int step) { struct bounds xb; int xc[DIM], x[DIM]; int s, d, p, k; int dz = dir * 2 + ((step>0)?1:0); for (d = 0; d < DIM; d++) { int v = coord[d] + ((d==dir)?step:0); if (v < 0) v += network[d]; if (v >= network[d]) v -= network[d]; xc[d] = v; } mk_sublattice(&xb, xc); for (d = 0; d < DIM; d++) x[d] = ((d == dir) && (step < 0))? (xb.hi[d] - 1): xb.lo[d]; /* ZZZ: This needs some cleaning */ k = 0; do { for (d = 0, s = par; d < DIM; d++) s += x[d]; if (!(s & 1)) goto next; p = to_HFlinear(x, bounds, dir, -step); out->site[p].F[dz] = S_4 * k++; next: for (d = 0; d < DIM; d++) { if (d == dir) continue; if (++x[d] == xb.hi[d]) x[d] = xb.lo[d]; else break; } } while (d != DIM); out->rcv_size[dz^1] = k; /* XXX is it true? */ } static int build_buffers(struct neighbor *nb) { int i, k, Nr; QMP_msghandle_t Rh[2*DIM]; Nr = nb->Ns = nb->Nx = 0; for (i = 0; i < DIM; i++) { switch (network[i]) { case 1: break; case 2: k = make_buffer(nb, nb->snd_size[2*i] + nb->snd_size[2*i+1]); nb->snd_buf[2*i] = nb->qmp_buf[k]; nb->snd_buf[2*i+1] = nb->snd_buf[2*i] + S_4 * nb->snd_size[2*i]; make_send(nb, k, i, +1); k = make_buffer(nb, nb->rcv_size[2*i] + nb->rcv_size[2*i+1]); nb->rcv_buf[2*i] = nb->qmp_buf[k]; nb->rcv_buf[2*i+1] = nb->snd_buf[2*i] + S_4 * nb->snd_size[2*i]; Nr = make_receive(nb, k, i, -1, Rh, Nr); /* -1 here helps with a bug in GigE QMP */ break; default: /* Order here is important */ k = make_buffer(nb, nb->snd_size[2*i]); nb->snd_buf[2*i] = nb->qmp_buf[k]; make_send(nb, k, i, -1); k = make_buffer(nb, nb->rcv_size[2*i]); nb->rcv_buf[2*i] = nb->qmp_buf[k]; Nr = make_receive(nb, k, i, -1, Rh, Nr); k = make_buffer(nb, nb->snd_size[2*i+1]); nb->snd_buf[2*i+1] = nb->qmp_buf[k]; make_send(nb, k, i, +1); k = make_buffer(nb, nb->rcv_size[2*i+1]); nb->rcv_buf[2*i+1] = nb->qmp_buf[k]; Nr = make_receive(nb, k, i, +1, Rh, Nr); break; } } nb->qmp_cr = QMP_declare_multiple(Rh, Nr); return 0; } static int make_buffer(struct neighbor *nb, int size) { int bcount = size * S_4 * sizeof (vHalfFermion); int N = nb->Nx; nb->qmp_size[N] = size; sse_aligned_buffer(nb, N, bcount); nb->qmp_mm[N] = QMP_declare_msgmem(nb->qmp_buf[N], bcount); nb->Nx = N + 1; return N; } static void make_send(struct neighbor *nb, int k, int i, int d) { QMP_msghandle_t h = QMP_declare_send_relative(nb->qmp_mm[k], i, d, 1); int j = 2 * i + ((d < 0)? 0: 1); nb->qmp_sh[j] = h; nb->qmp_sv[nb->Ns++] = h; nb->qmp_smask |= (1 << j); } static int make_receive(struct neighbor *nb, int k, int i, int d, QMP_msghandle_t Rh[2*DIM], int Nr) { Rh[Nr] = QMP_declare_receive_relative(nb->qmp_mm[k], i, d, 1); return Nr+1; } static void sse_aligned_buffer(struct neighbor *nb, int k, int size) { int xcount = size + 15; char *ptr = QMP_allocate_aligned_memory(xcount); nb->qmp_buf[k] = (void *)(~15 & (15 + (unsigned long)(ptr))); nb->qmp_xbuf[k] = ptr; } static void free_buffers(struct neighbor *nb) { int i; QMP_free_msghandle(nb->qmp_cr); for (i = nb->Ns; i--;) QMP_free_msghandle(nb->qmp_sv[i]); for (i = nb->Nx; i--;) { QMP_free_msgmem(nb->qmp_mm[i]); QMP_free_aligned_memory(nb->qmp_xbuf[i]); } } static int cg(vOddFermion *x_o, const vOddFermion *b, const vOddFermion *x0, double epsilon, int N, double *out_eps, int *out_N) { double rho, alpha, beta, gamma, norm_z; int status = 1; int k; copy_o(x_o, x0); compute_MxM(p_o, &norm_z, x_o); compute_sum_oN(r_o, &rho, b, -1, p_o); copy_o(p_o, r_o); /* relax, QMP does not support split reductions yet. */ for (k = 0; (rho > epsilon) && (k < N); k++) { compute_MxM(q_o, &norm_z, p_o); /* relax, QMP does not support split reductions yet. */ alpha = rho / norm_z; compute_sum2_oN(r_o, &gamma, -alpha, q_o); compute_sum2_o(x_o, alpha, p_o); /* relax, QMP does not support split reductions yet. */ if (gamma <= epsilon) { rho = gamma; status = 0; break; } beta = gamma / rho; rho = gamma; compute_sum2x_o(p_o, r_o, beta); } if (sending) { int i; /* This is QMP_wait_vector(nb->qmp_sv, nb->Ns); */ for (i = sending->Ns; i--;) QMP_wait(sending->qmp_sv[i]); sending = 0; } *out_N = k; *out_eps = rho; return status; } static void copy_o(vOddFermion *dst, const vOddFermion *src) { int i = odd_even.size * S_4 * sizeof (vOddFermion) / sizeof (vReal); vReal *d = (vReal *)dst; const vReal *s = (const vReal *)src; for ( ;i--;) *d++ = *s++; } static void compute_sum2_o(vOddFermion *dst, double alpha, const vOddFermion *src) { int i = odd_even.size * S_4 * sizeof (vOddFermion) / sizeof (vReal); vReal a = vmk1(alpha); vReal *d = (vReal *)dst; const vReal *s = (const vReal *)src; for ( ;i--;) *d++ += a * *s++; } static void compute_sum2x_o(vOddFermion *dst, const vOddFermion *src, double alpha) { int i = odd_even.size * S_4 * sizeof (vOddFermion) / sizeof (vReal); vReal a = vmk1(alpha); vReal *d = (vReal *)dst; const vReal *s = (const vReal *)src; for ( ;i--; d++) *d = a * *d + *s++; } static void compute_sum_e(vEvenFermion *d, const vEvenFermion *x, double alpha, const vEvenFermion *y) { const vReal *X = (const vReal *)x; const vReal *Y = (const vReal *)y; vReal *D = (vReal *)d; vReal a = vmk1(alpha); int i = even_odd.size * S_4 * sizeof (vEvenFermion) / sizeof (vReal); for (;i--;) *D++ = *X++ + a * *Y++; } static void compute_sum_o(vOddFermion *d, const vOddFermion *x, double alpha, const vOddFermion *y) { const vReal *X = (const vReal *)x; const vReal *Y = (const vReal *)y; vReal *D = (vReal *)d; vReal a = vmk1(alpha); int i = odd_even.size * S_4 * sizeof (vOddFermion) / sizeof (vReal); for (;i--;) *D++ = *X++ + a * *Y++; } static void compute_sum_oN(vOddFermion *d, double *norm, const vOddFermion *x, double alpha, const vOddFermion *y) { const vReal *X = (const vReal *)x; const vReal *Y = (const vReal *)y; vReal *D = (vReal *)d; vReal a = vmk1(alpha); vReal s = vmk1(0.0); vReal v; int i = odd_even.size * S_4 * sizeof (vOddFermion) / sizeof (vReal); for (;i--;) { v = *X++ + a * *Y++; s += v * v; *D++ = v; } *norm = vsum(s); QMP_sum_double(norm); } static void compute_sum2_oN(vOddFermion *d, double *norm, double alpha, const vOddFermion *y) { const vReal *Y = (const vReal *)y; vReal *D = (vReal *)d; vReal a = vmk1(alpha); vReal s = vmk1(0.0); vReal v; int i = odd_even.size * S_4 * sizeof (vOddFermion) / sizeof (vReal); for (;i--;) { v = *D + a * *Y++; s += v * v; *D++ = v; } *norm = vsum(s); QMP_sum_double(norm); } static void compute_MxM(vOddFermion *eta, double *norm, const vOddFermion *psi) { compute_M(auxB_o, norm, psi); compute_Mx(eta, auxB_o); } static void compute_M(vOddFermion *eta, double *norm, const vOddFermion *psi) { compute_Qee1Qeo(auxA_e, psi); compute_1Qoo1Qoe(eta, norm, psi, auxA_e); } static void compute_Mx(vOddFermion *eta, const vOddFermion *psi) { compute_Soo1(auxA_o, psi); compute_See1Seo(auxA_e, auxA_o); compute_1Soe(eta, psi, auxA_e); } static void compute_Qxx1(vFermion *chi, const vFermion *psi, int size) { const vFermion *qs, *qx5; int i, xyzt5, s, c; vFermion * __restrict__ rx5, * __restrict__ rs; vReal fx; vHalfFermion zV; vcomplex zn; complex zX[2][3]; complex yOut[2][3]; for (i = 0; i < size; i++) { xyzt5 = i * S_4; rx5 = &chi[xyzt5]; qx5 = &psi[xyzt5]; vhfzero(&zV); fx = vfx_A; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = 0; s < S_4_1; s++, fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } } rs = &rx5[S_4_1]; QSETUP(S_4_1) vput_3(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[0][c].re; zn.im = qs->f[0][c].im; vput_3(&zn.re, zX[0][c].re); vput_3(&zn.im, zX[0][c].im); rs->f[0][c].re = zn.re; rs->f[0][c].im = zn.im; zn.re = qs->f[1][c].re; zn.im = qs->f[1][c].im; vput_3(&zn.re, zX[1][c].re); vput_3(&zn.im, zX[1][c].im); rs->f[1][c].re = zn.re; rs->f[1][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = S_4; s--;) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs0re = (REAL *)&rs->f[0][c].re; REAL *rs0im = (REAL *)&rs->f[0][c].im; rs0re[3] = inv_a * rs0re[3] + b_over_a * yOut[0][c].re; rs0re[2] = inv_a * rs0re[2] + b_over_a * rs0re[3]; rs0re[1] = inv_a * rs0re[1] + b_over_a * rs0re[2]; yOut[0][c].re = rs0re[0] = inv_a * rs0re[0] + b_over_a * rs0re[1]; rs0im[3] = inv_a * rs0im[3] + b_over_a * yOut[0][c].im; rs0im[2] = inv_a * rs0im[2] + b_over_a * rs0im[3]; rs0im[1] = inv_a * rs0im[1] + b_over_a * rs0im[2]; yOut[0][c].im = rs0im[0] = inv_a * rs0im[0] + b_over_a * rs0im[1]; } { REAL *rs1re = (REAL *)&rs->f[1][c].re; REAL *rs1im = (REAL *)&rs->f[1][c].im; rs1re[3] = inv_a * rs1re[3] + b_over_a * yOut[1][c].re; rs1re[2] = inv_a * rs1re[2] + b_over_a * rs1re[3]; rs1re[1] = inv_a * rs1re[1] + b_over_a * rs1re[2]; yOut[1][c].re = rs1re[0] = inv_a * rs1re[0] + b_over_a * rs1re[1]; rs1im[3] = inv_a * rs1im[3] + b_over_a * yOut[1][c].im; rs1im[2] = inv_a * rs1im[2] + b_over_a * rs1im[3]; rs1im[1] = inv_a * rs1im[1] + b_over_a * rs1im[2]; yOut[1][c].im = rs1im[0] = inv_a * rs1im[0] + b_over_a * rs1im[1]; } } } vhfzero(&zV); fx = vfx_B; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = S_4; --s; fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } } rs = &rx5[0]; QSETUP(0) vput_0(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[2][c].re; zn.im = qs->f[2][c].im; vput_0(&zn.re, zX[0][c].re); vput_0(&zn.im, zX[0][c].im); rs->f[2][c].re = zn.re; rs->f[2][c].im = zn.im; zn.re = qs->f[3][c].re; zn.im = qs->f[3][c].im; vput_0(&zn.re, zX[1][c].re); vput_0(&zn.im, zX[1][c].im); rs->f[3][c].re = zn.re; rs->f[3][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = 0; s < S_4; s++) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs2re = (REAL *)&rs->f[2][c].re; REAL *rs2im = (REAL *)&rs->f[2][c].im; rs2re[0] = inv_a * rs2re[0] + b_over_a * yOut[0][c].re; rs2re[1] = inv_a * rs2re[1] + b_over_a * rs2re[0]; rs2re[2] = inv_a * rs2re[2] + b_over_a * rs2re[1]; yOut[0][c].re = rs2re[3] = inv_a * rs2re[3] + b_over_a * rs2re[2]; rs2im[0] = inv_a * rs2im[0] + b_over_a * yOut[0][c].im; rs2im[1] = inv_a * rs2im[1] + b_over_a * rs2im[0]; rs2im[2] = inv_a * rs2im[2] + b_over_a * rs2im[1]; yOut[0][c].im = rs2im[3] = inv_a * rs2im[3] + b_over_a * rs2im[2]; } { REAL *rs3re = (REAL *)&rs->f[3][c].re; REAL *rs3im = (REAL *)&rs->f[3][c].im; rs3re[0] = inv_a * rs3re[0] + b_over_a * yOut[1][c].re; rs3re[1] = inv_a * rs3re[1] + b_over_a * rs3re[0]; rs3re[2] = inv_a * rs3re[2] + b_over_a * rs3re[1]; yOut[1][c].re = rs3re[3] = inv_a * rs3re[3] + b_over_a * rs3re[2]; rs3im[0] = inv_a * rs3im[0] + b_over_a * yOut[1][c].im; rs3im[1] = inv_a * rs3im[1] + b_over_a * rs3im[0]; rs3im[2] = inv_a * rs3im[2] + b_over_a * rs3im[1]; yOut[1][c].im = rs3im[3] = inv_a * rs3im[3] + b_over_a * rs3im[2]; } } } } } static void compute_Soo1(vOddFermion *Chi, const vOddFermion *Psi) { vFermion *chi = &Chi->f; const vFermion *psi = &Psi->f; int size = odd_even.size; const vFermion *qs, *qx5; int i, xyzt5, s, c; vFermion * __restrict__ rx5, * __restrict__ rs; vReal fx; vHalfFermion zV; vcomplex zn; complex zX[2][3]; complex yOut[2][3]; for (i = 0; i < size; i++) { xyzt5 = i * S_4; rx5 = &chi[xyzt5]; qx5 = &psi[xyzt5]; vhfzero(&zV); fx = vfx_A; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = 0; s < S_4_1; s++, fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } } rs = &rx5[S_4_1]; QSETUP(S_4_1) vput_3(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[2][c].re; zn.im = qs->f[2][c].im; vput_3(&zn.re, zX[0][c].re); vput_3(&zn.im, zX[0][c].im); rs->f[2][c].re = zn.re; rs->f[2][c].im = zn.im; zn.re = qs->f[3][c].re; zn.im = qs->f[3][c].im; vput_3(&zn.re, zX[1][c].re); vput_3(&zn.im, zX[1][c].im); rs->f[3][c].re = zn.re; rs->f[3][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = S_4; s--;) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs2re = (REAL *)&rs->f[2][c].re; REAL *rs2im = (REAL *)&rs->f[2][c].im; rs2re[3] = inv_a * rs2re[3] + b_over_a * yOut[0][c].re; rs2re[2] = inv_a * rs2re[2] + b_over_a * rs2re[3]; rs2re[1] = inv_a * rs2re[1] + b_over_a * rs2re[2]; yOut[0][c].re = rs2re[0] = inv_a * rs2re[0] + b_over_a * rs2re[1]; rs2im[3] = inv_a * rs2im[3] + b_over_a * yOut[0][c].im; rs2im[2] = inv_a * rs2im[2] + b_over_a * rs2im[3]; rs2im[1] = inv_a * rs2im[1] + b_over_a * rs2im[2]; yOut[0][c].im = rs2im[0] = inv_a * rs2im[0] + b_over_a * rs2im[1]; } { REAL *rs3re = (REAL *)&rs->f[3][c].re; REAL *rs3im = (REAL *)&rs->f[3][c].im; rs3re[3] = inv_a * rs3re[3] + b_over_a * yOut[1][c].re; rs3re[2] = inv_a * rs3re[2] + b_over_a * rs3re[3]; rs3re[1] = inv_a * rs3re[1] + b_over_a * rs3re[2]; yOut[1][c].re = rs3re[0] = inv_a * rs3re[0] + b_over_a * rs3re[1]; rs3im[3] = inv_a * rs3im[3] + b_over_a * yOut[1][c].im; rs3im[2] = inv_a * rs3im[2] + b_over_a * rs3im[3]; rs3im[1] = inv_a * rs3im[1] + b_over_a * rs3im[2]; yOut[1][c].im = rs3im[0] = inv_a * rs3im[0] + b_over_a * rs3im[1]; } } } vhfzero(&zV); fx = vfx_B; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = S_4; --s; fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } } rs = &rx5[0]; QSETUP(0) vput_0(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[0][c].re; zn.im = qs->f[0][c].im; vput_0(&zn.re, zX[0][c].re); vput_0(&zn.im, zX[0][c].im); rs->f[0][c].re = zn.re; rs->f[0][c].im = zn.im; zn.re = qs->f[1][c].re; zn.im = qs->f[1][c].im; vput_0(&zn.re, zX[1][c].re); vput_0(&zn.im, zX[1][c].im); rs->f[1][c].re = zn.re; rs->f[1][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = 0; s < S_4; s++) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs0re = (REAL *)&rs->f[0][c].re; REAL *rs0im = (REAL *)&rs->f[0][c].im; rs0re[0] = inv_a * rs0re[0] + b_over_a * yOut[0][c].re; rs0re[1] = inv_a * rs0re[1] + b_over_a * rs0re[0]; rs0re[2] = inv_a * rs0re[2] + b_over_a * rs0re[1]; yOut[0][c].re = rs0re[3] = inv_a * rs0re[3] + b_over_a * rs0re[2]; rs0im[0] = inv_a * rs0im[0] + b_over_a * yOut[0][c].im; rs0im[1] = inv_a * rs0im[1] + b_over_a * rs0im[0]; rs0im[2] = inv_a * rs0im[2] + b_over_a * rs0im[1]; yOut[0][c].im = rs0im[3] = inv_a * rs0im[3] + b_over_a * rs0im[2]; } { REAL *rs1re = (REAL *)&rs->f[1][c].re; REAL *rs1im = (REAL *)&rs->f[1][c].im; rs1re[0] = inv_a * rs1re[0] + b_over_a * yOut[1][c].re; rs1re[1] = inv_a * rs1re[1] + b_over_a * rs1re[0]; rs1re[2] = inv_a * rs1re[2] + b_over_a * rs1re[1]; yOut[1][c].re = rs1re[3] = inv_a * rs1re[3] + b_over_a * rs1re[2]; rs1im[0] = inv_a * rs1im[0] + b_over_a * yOut[1][c].im; rs1im[1] = inv_a * rs1im[1] + b_over_a * rs1im[0]; rs1im[2] = inv_a * rs1im[2] + b_over_a * rs1im[1]; yOut[1][c].im = rs1im[3] = inv_a * rs1im[3] + b_over_a * rs1im[2]; } } } } } static void compute_Qxy(vFermion *chi, const vFermion *psi, struct neighbor *nb) { int i, xyzt5, s, c; vFermion * __restrict__ rx5, * __restrict__ rs; int xyzt, k, d; const vFermion *f; vHalfFermion *g; vHalfFermion gg[8], hh[8]; vSU3 V[8]; int ps[8], p5[8]; const SU3 *Uup, *Udown; int c1, c2; #define qx5 rx5 #define qs rs QMP_start(nb->qmp_cr); if (sending) { int i; /* This is QMP_wait_vector(nb->qmp_sv, nb->Ns); */ for (i = sending->Ns; i--;) QMP_wait(sending->qmp_sv[i]); sending = 0; } { int k, i, s, c, *src; const vFermion *f; vHalfFermion *g; k = 0; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 1; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 2; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 3; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 4; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 5; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 6; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 7; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } } for (i = 0; i < nb->inside_size; i++) { xyzt = nb->inside[i]; xyzt5 = xyzt * S_4; for (d = 0; d < 8; d++) p5[d] = nb->site[xyzt].F[d]; rx5 = &chi[xyzt5]; Uup = &U[nb->site[xyzt].Uup]; for (d = 0; d < 4; d++, Uup++) { Udown = &U[nb->site[xyzt].Udown[d]]; for (c1 = 0; c1 < 3; c1++) { for (c2 = 0; c2 < 3; c2++) { V[d*2+0].v[c1][c2].re = vmk1(Uup->v[c1][c2].re); V[d*2+0].v[c1][c2].im = vmk1(Uup->v[c1][c2].im); /* conjugate down-link */ V[d*2+1].v[c1][c2].re = vmk1( Udown->v[c2][c1].re); V[d*2+1].v[c1][c2].im = vmk1(-Udown->v[c2][c1].im); } } } for (s = 0; s < S_4; s++) { for (d = 0; d < 8; d++) { ps[d] = p5[d] + s; } for (c = 0; c < 3; c++) { k=0; f=&psi[ps[0]]; g=&gg[0]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; k=1; f=&psi[ps[1]]; g=&gg[1]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; k=2; f=&psi[ps[2]]; g=&gg[2]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; k=3; f=&psi[ps[3]]; g=&gg[3]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; k=4; f=&psi[ps[4]]; g=&gg[4]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; k=5; f=&psi[ps[5]]; g=&gg[5]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; k=6; f=&psi[ps[6]]; g=&gg[6]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; k=7; f=&psi[ps[7]]; g=&gg[7]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; } for (d = 0; d < 8; d++) { vHalfFermion * __restrict__ h = &hh[d]; vSU3 *u = &V[d]; g = &gg[d]; for (c = 0; c < 3; c++) { h->f[0][c].re=u->v[c][0].re*g->f[0][0].re-u->v[c][0].im*g->f[0][0].im +u->v[c][1].re*g->f[0][1].re-u->v[c][1].im*g->f[0][1].im +u->v[c][2].re*g->f[0][2].re-u->v[c][2].im*g->f[0][2].im; h->f[0][c].im=u->v[c][0].im*g->f[0][0].re+u->v[c][0].re*g->f[0][0].im +u->v[c][1].im*g->f[0][1].re+u->v[c][1].re*g->f[0][1].im +u->v[c][2].im*g->f[0][2].re+u->v[c][2].re*g->f[0][2].im; h->f[1][c].re=u->v[c][0].re*g->f[1][0].re-u->v[c][0].im*g->f[1][0].im +u->v[c][1].re*g->f[1][1].re-u->v[c][1].im*g->f[1][1].im +u->v[c][2].re*g->f[1][2].re-u->v[c][2].im*g->f[1][2].im; h->f[1][c].im=u->v[c][0].im*g->f[1][0].re+u->v[c][0].re*g->f[1][0].im +u->v[c][1].im*g->f[1][1].re+u->v[c][1].re*g->f[1][1].im +u->v[c][2].im*g->f[1][2].re+u->v[c][2].re*g->f[1][2].im; } } rs = &rx5[s]; for (c = 0; c < 3; c++) { k = 6; qs->f[0][c].re = hh[k].f[0][c].re; qs->f[2][c].re = hh[k].f[0][c].re; qs->f[0][c].im = hh[k].f[0][c].im; qs->f[2][c].im = hh[k].f[0][c].im; qs->f[1][c].re = hh[k].f[1][c].re; qs->f[3][c].re = hh[k].f[1][c].re; qs->f[1][c].im = hh[k].f[1][c].im; qs->f[3][c].im = hh[k].f[1][c].im; k = 7; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].im -= hh[k].f[1][c].im; k = 2; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im += hh[k].f[1][c].im; k = 3; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im -= hh[k].f[1][c].im; k = 0; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re += hh[k].f[1][c].im; k = 1; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re -= hh[k].f[1][c].im; k = 4; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re -= hh[k].f[1][c].im; k = 5; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re += hh[k].f[1][c].im; } } } QMP_wait(nb->qmp_cr); for (i = 0; i < nb->boundary_size; i++) { int m = nb->boundary[i].mask; xyzt = nb->boundary[i].index; xyzt5 = xyzt * S_4; for (d = 0; d < 8; d++) p5[d] = nb->site[xyzt].F[d]; rx5 = &chi[xyzt5]; Uup = &U[nb->site[xyzt].Uup]; for (d = 0; d < 4; d++, Uup++) { Udown = &U[nb->site[xyzt].Udown[d]]; for (c1 = 0; c1 < 3; c1++) { for (c2 = 0; c2 < 3; c2++) { V[d*2+0].v[c1][c2].re = vmk1(Uup->v[c1][c2].re); V[d*2+0].v[c1][c2].im = vmk1(Uup->v[c1][c2].im); /* conjugate down-link */ V[d*2+1].v[c1][c2].re = vmk1( Udown->v[c2][c1].re); V[d*2+1].v[c1][c2].im = vmk1(-Udown->v[c2][c1].im); } } } for (s = 0; s < S_4; s++) { for (d = 0; d < 8; d++) { ps[d] = p5[d] + s; } for (c = 0; c < 3; c++) { if ((m & 0x01) == 0) { k=0; f=&psi[ps[0]]; g=&gg[0]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; } if ((m & 0x02) == 0) { k=1; f=&psi[ps[1]]; g=&gg[1]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; } if ((m & 0x04) == 0) { k=2; f=&psi[ps[2]]; g=&gg[2]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; } if ((m & 0x08) == 0) { k=3; f=&psi[ps[3]]; g=&gg[3]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; } if ((m & 0x10) == 0) { k=4; f=&psi[ps[4]]; g=&gg[4]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; } if ((m & 0x20) == 0) { k=5; f=&psi[ps[5]]; g=&gg[5]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; } if ((m & 0x40) == 0) { k=6; f=&psi[ps[6]]; g=&gg[6]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; } if ((m & 0x80) == 0) { k=7; f=&psi[ps[7]]; g=&gg[7]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; } } for (d = 0; d < 8; d++) { vHalfFermion * __restrict__ h = &hh[d]; vSU3 *u = &V[d]; g = (m & (1 << d))? &nb->rcv_buf[d][ps[d]]: &gg[d]; for (c = 0; c < 3; c++) { h->f[0][c].re=u->v[c][0].re*g->f[0][0].re-u->v[c][0].im*g->f[0][0].im +u->v[c][1].re*g->f[0][1].re-u->v[c][1].im*g->f[0][1].im +u->v[c][2].re*g->f[0][2].re-u->v[c][2].im*g->f[0][2].im; h->f[0][c].im=u->v[c][0].im*g->f[0][0].re+u->v[c][0].re*g->f[0][0].im +u->v[c][1].im*g->f[0][1].re+u->v[c][1].re*g->f[0][1].im +u->v[c][2].im*g->f[0][2].re+u->v[c][2].re*g->f[0][2].im; h->f[1][c].re=u->v[c][0].re*g->f[1][0].re-u->v[c][0].im*g->f[1][0].im +u->v[c][1].re*g->f[1][1].re-u->v[c][1].im*g->f[1][1].im +u->v[c][2].re*g->f[1][2].re-u->v[c][2].im*g->f[1][2].im; h->f[1][c].im=u->v[c][0].im*g->f[1][0].re+u->v[c][0].re*g->f[1][0].im +u->v[c][1].im*g->f[1][1].re+u->v[c][1].re*g->f[1][1].im +u->v[c][2].im*g->f[1][2].re+u->v[c][2].re*g->f[1][2].im; } } rs = &rx5[s]; for (c = 0; c < 3; c++) { k = 6; qs->f[0][c].re = hh[k].f[0][c].re; qs->f[2][c].re = hh[k].f[0][c].re; qs->f[0][c].im = hh[k].f[0][c].im; qs->f[2][c].im = hh[k].f[0][c].im; qs->f[1][c].re = hh[k].f[1][c].re; qs->f[3][c].re = hh[k].f[1][c].re; qs->f[1][c].im = hh[k].f[1][c].im; qs->f[3][c].im = hh[k].f[1][c].im; k = 7; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].im -= hh[k].f[1][c].im; k = 2; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im += hh[k].f[1][c].im; k = 3; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im -= hh[k].f[1][c].im; k = 0; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re += hh[k].f[1][c].im; k = 1; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re -= hh[k].f[1][c].im; k = 4; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re -= hh[k].f[1][c].im; k = 5; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re += hh[k].f[1][c].im; } } } #undef qs #undef qx5 } static void compute_1Sxy(vFermion *chi, const vFermion *eta, const vFermion *psi, struct neighbor *nb) { int i, xyzt5, s, c; vFermion * __restrict__ rx5, * __restrict__ rs; int xyzt, k, d; const vFermion *f; vHalfFermion *g; vHalfFermion gg[8], hh[8]; vSU3 V[8]; int ps[8], p5[8]; const SU3 *Uup, *Udown; int c1, c2; #define qx5 rx5 #define qs rs QMP_start(nb->qmp_cr); if (sending) { int i; /* This is QMP_wait_vector(nb->qmp_sv, nb->Ns); */ for (i = sending->Ns; i--;) QMP_wait(sending->qmp_sv[i]); sending = 0; } { int k, i, s, c, *src; const vFermion *f; vHalfFermion *g; k = 0; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 1; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 2; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 3; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 4; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 5; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 6; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 7; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } } for (i = 0; i < nb->inside_size; i++) { const vFermion *ex5, *es; xyzt = nb->inside[i]; xyzt5 = xyzt * S_4; for (d = 0; d < 8; d++) p5[d] = nb->site[xyzt].F[d]; rx5 = &chi[xyzt5]; ex5 = &eta[xyzt5]; Uup = &U[nb->site[xyzt].Uup]; for (d = 0; d < 4; d++, Uup++) { Udown = &U[nb->site[xyzt].Udown[d]]; for (c1 = 0; c1 < 3; c1++) { for (c2 = 0; c2 < 3; c2++) { V[d*2+0].v[c1][c2].re = vmk1(Uup->v[c1][c2].re); V[d*2+0].v[c1][c2].im = vmk1(Uup->v[c1][c2].im); /* conjugate down-link */ V[d*2+1].v[c1][c2].re = vmk1( Udown->v[c2][c1].re); V[d*2+1].v[c1][c2].im = vmk1(-Udown->v[c2][c1].im); } } } for (s = 0; s < S_4; s++) { for (d = 0; d < 8; d++) { ps[d] = p5[d] + s; } for (c = 0; c < 3; c++) { k=0; f=&psi[ps[0]]; g=&gg[0]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; k=1; f=&psi[ps[1]]; g=&gg[1]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; k=2; f=&psi[ps[2]]; g=&gg[2]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; k=3; f=&psi[ps[3]]; g=&gg[3]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; k=4; f=&psi[ps[4]]; g=&gg[4]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; k=5; f=&psi[ps[5]]; g=&gg[5]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; k=6; f=&psi[ps[6]]; g=&gg[6]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; k=7; f=&psi[ps[7]]; g=&gg[7]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; } for (d = 0; d < 8; d++) { vHalfFermion * __restrict__ h = &hh[d]; vSU3 *u = &V[d]; g = &gg[d]; for (c = 0; c < 3; c++) { h->f[0][c].re=u->v[c][0].re*g->f[0][0].re-u->v[c][0].im*g->f[0][0].im +u->v[c][1].re*g->f[0][1].re-u->v[c][1].im*g->f[0][1].im +u->v[c][2].re*g->f[0][2].re-u->v[c][2].im*g->f[0][2].im; h->f[0][c].im=u->v[c][0].im*g->f[0][0].re+u->v[c][0].re*g->f[0][0].im +u->v[c][1].im*g->f[0][1].re+u->v[c][1].re*g->f[0][1].im +u->v[c][2].im*g->f[0][2].re+u->v[c][2].re*g->f[0][2].im; h->f[1][c].re=u->v[c][0].re*g->f[1][0].re-u->v[c][0].im*g->f[1][0].im +u->v[c][1].re*g->f[1][1].re-u->v[c][1].im*g->f[1][1].im +u->v[c][2].re*g->f[1][2].re-u->v[c][2].im*g->f[1][2].im; h->f[1][c].im=u->v[c][0].im*g->f[1][0].re+u->v[c][0].re*g->f[1][0].im +u->v[c][1].im*g->f[1][1].re+u->v[c][1].re*g->f[1][1].im +u->v[c][2].im*g->f[1][2].re+u->v[c][2].re*g->f[1][2].im; } } rs = &rx5[s]; es = &ex5[s]; for (c = 0; c < 3; c++) { k = 7; qs->f[0][c].re = hh[k].f[0][c].re; qs->f[2][c].re = hh[k].f[0][c].re; qs->f[0][c].im = hh[k].f[0][c].im; qs->f[2][c].im = hh[k].f[0][c].im; qs->f[1][c].re = hh[k].f[1][c].re; qs->f[3][c].re = hh[k].f[1][c].re; qs->f[1][c].im = hh[k].f[1][c].im; qs->f[3][c].im = hh[k].f[1][c].im; k = 6; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].im -= hh[k].f[1][c].im; k = 3; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im += hh[k].f[1][c].im; k = 2; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im -= hh[k].f[1][c].im; k = 0; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re -= hh[k].f[1][c].im; k = 1; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re += hh[k].f[1][c].im; k = 4; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re += hh[k].f[1][c].im; k = 5; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re -= hh[k].f[1][c].im; rs->f[0][c].re = es->f[0][c].re - rs->f[0][c].re; rs->f[0][c].im = es->f[0][c].im - rs->f[0][c].im; rs->f[1][c].re = es->f[1][c].re - rs->f[1][c].re; rs->f[1][c].im = es->f[1][c].im - rs->f[1][c].im; rs->f[2][c].re = es->f[2][c].re - rs->f[2][c].re; rs->f[2][c].im = es->f[2][c].im - rs->f[2][c].im; rs->f[3][c].re = es->f[3][c].re - rs->f[3][c].re; rs->f[3][c].im = es->f[3][c].im - rs->f[3][c].im; } } } QMP_wait(nb->qmp_cr); for (i = 0; i < nb->boundary_size; i++) { const vFermion *ex5, *es; int m = nb->boundary[i].mask; xyzt = nb->boundary[i].index; xyzt5 = xyzt * S_4; for (d = 0; d < 8; d++) p5[d] = nb->site[xyzt].F[d]; rx5 = &chi[xyzt5]; ex5 = &eta[xyzt5]; Uup = &U[nb->site[xyzt].Uup]; for (d = 0; d < 4; d++, Uup++) { Udown = &U[nb->site[xyzt].Udown[d]]; for (c1 = 0; c1 < 3; c1++) { for (c2 = 0; c2 < 3; c2++) { V[d*2+0].v[c1][c2].re = vmk1(Uup->v[c1][c2].re); V[d*2+0].v[c1][c2].im = vmk1(Uup->v[c1][c2].im); /* conjugate down-link */ V[d*2+1].v[c1][c2].re = vmk1( Udown->v[c2][c1].re); V[d*2+1].v[c1][c2].im = vmk1(-Udown->v[c2][c1].im); } } } for (s = 0; s < S_4; s++) { for (d = 0; d < 8; d++) { ps[d] = p5[d] + s; } for (c = 0; c < 3; c++) { if ((m & 0x01) == 0) { k=0; f=&psi[ps[0]]; g=&gg[0]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; } if ((m & 0x02) == 0) { k=1; f=&psi[ps[1]]; g=&gg[1]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; } if ((m & 0x04) == 0) { k=2; f=&psi[ps[2]]; g=&gg[2]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; } if ((m & 0x08) == 0) { k=3; f=&psi[ps[3]]; g=&gg[3]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; } if ((m & 0x10) == 0) { k=4; f=&psi[ps[4]]; g=&gg[4]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; } if ((m & 0x20) == 0) { k=5; f=&psi[ps[5]]; g=&gg[5]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; } if ((m & 0x40) == 0) { k=6; f=&psi[ps[6]]; g=&gg[6]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; } if ((m & 0x80) == 0) { k=7; f=&psi[ps[7]]; g=&gg[7]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; } } for (d = 0; d < 8; d++) { vHalfFermion * __restrict__ h = &hh[d]; vSU3 *u = &V[d]; g = (m & (1 << d))? &nb->rcv_buf[d][ps[d]]: &gg[d]; for (c = 0; c < 3; c++) { h->f[0][c].re=u->v[c][0].re*g->f[0][0].re-u->v[c][0].im*g->f[0][0].im +u->v[c][1].re*g->f[0][1].re-u->v[c][1].im*g->f[0][1].im +u->v[c][2].re*g->f[0][2].re-u->v[c][2].im*g->f[0][2].im; h->f[0][c].im=u->v[c][0].im*g->f[0][0].re+u->v[c][0].re*g->f[0][0].im +u->v[c][1].im*g->f[0][1].re+u->v[c][1].re*g->f[0][1].im +u->v[c][2].im*g->f[0][2].re+u->v[c][2].re*g->f[0][2].im; h->f[1][c].re=u->v[c][0].re*g->f[1][0].re-u->v[c][0].im*g->f[1][0].im +u->v[c][1].re*g->f[1][1].re-u->v[c][1].im*g->f[1][1].im +u->v[c][2].re*g->f[1][2].re-u->v[c][2].im*g->f[1][2].im; h->f[1][c].im=u->v[c][0].im*g->f[1][0].re+u->v[c][0].re*g->f[1][0].im +u->v[c][1].im*g->f[1][1].re+u->v[c][1].re*g->f[1][1].im +u->v[c][2].im*g->f[1][2].re+u->v[c][2].re*g->f[1][2].im; } } rs = &rx5[s]; es = &ex5[s]; for (c = 0; c < 3; c++) { k = 7; qs->f[0][c].re = hh[k].f[0][c].re; qs->f[2][c].re = hh[k].f[0][c].re; qs->f[0][c].im = hh[k].f[0][c].im; qs->f[2][c].im = hh[k].f[0][c].im; qs->f[1][c].re = hh[k].f[1][c].re; qs->f[3][c].re = hh[k].f[1][c].re; qs->f[1][c].im = hh[k].f[1][c].im; qs->f[3][c].im = hh[k].f[1][c].im; k = 6; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].im -= hh[k].f[1][c].im; k = 3; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im += hh[k].f[1][c].im; k = 2; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im -= hh[k].f[1][c].im; k = 0; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re -= hh[k].f[1][c].im; k = 1; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re += hh[k].f[1][c].im; k = 4; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re += hh[k].f[1][c].im; k = 5; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re -= hh[k].f[1][c].im; rs->f[0][c].re = es->f[0][c].re - rs->f[0][c].re; rs->f[0][c].im = es->f[0][c].im - rs->f[0][c].im; rs->f[1][c].re = es->f[1][c].re - rs->f[1][c].re; rs->f[1][c].im = es->f[1][c].im - rs->f[1][c].im; rs->f[2][c].re = es->f[2][c].re - rs->f[2][c].re; rs->f[2][c].im = es->f[2][c].im - rs->f[2][c].im; rs->f[3][c].re = es->f[3][c].re - rs->f[3][c].re; rs->f[3][c].im = es->f[3][c].im - rs->f[3][c].im; } } } #undef qs #undef qx5 } static void compute_Qxx1Qxy(vFermion *chi, const vFermion *psi, struct neighbor *nb) { int i, xyzt5, s, c; vFermion * __restrict__ rx5, * __restrict__ rs; int xyzt, k, d; const vFermion *f; vHalfFermion *g; vHalfFermion gg[8], hh[8]; vSU3 V[8]; int ps[8], p5[8]; const SU3 *Uup, *Udown; int c1, c2; vReal fx; vHalfFermion zV; vcomplex zn; complex zX[2][3]; complex yOut[2][3]; #define qx5 rx5 #define qs rs QMP_start(nb->qmp_cr); if (sending) { int i; /* This is QMP_wait_vector(nb->qmp_sv, nb->Ns); */ for (i = sending->Ns; i--;) QMP_wait(sending->qmp_sv[i]); sending = 0; } { int k, i, s, c, *src; const vFermion *f; vHalfFermion *g; k = 0; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 1; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 2; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 3; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 4; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 5; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 6; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 7; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } } for (i = 0; i < nb->inside_size; i++) { xyzt = nb->inside[i]; xyzt5 = xyzt * S_4; for (d = 0; d < 8; d++) p5[d] = nb->site[xyzt].F[d]; rx5 = &chi[xyzt5]; Uup = &U[nb->site[xyzt].Uup]; for (d = 0; d < 4; d++, Uup++) { Udown = &U[nb->site[xyzt].Udown[d]]; for (c1 = 0; c1 < 3; c1++) { for (c2 = 0; c2 < 3; c2++) { V[d*2+0].v[c1][c2].re = vmk1(Uup->v[c1][c2].re); V[d*2+0].v[c1][c2].im = vmk1(Uup->v[c1][c2].im); /* conjugate down-link */ V[d*2+1].v[c1][c2].re = vmk1( Udown->v[c2][c1].re); V[d*2+1].v[c1][c2].im = vmk1(-Udown->v[c2][c1].im); } } } for (s = 0; s < S_4; s++) { for (d = 0; d < 8; d++) { ps[d] = p5[d] + s; } for (c = 0; c < 3; c++) { k=0; f=&psi[ps[0]]; g=&gg[0]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; k=1; f=&psi[ps[1]]; g=&gg[1]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; k=2; f=&psi[ps[2]]; g=&gg[2]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; k=3; f=&psi[ps[3]]; g=&gg[3]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; k=4; f=&psi[ps[4]]; g=&gg[4]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; k=5; f=&psi[ps[5]]; g=&gg[5]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; k=6; f=&psi[ps[6]]; g=&gg[6]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; k=7; f=&psi[ps[7]]; g=&gg[7]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; } for (d = 0; d < 8; d++) { vHalfFermion * __restrict__ h = &hh[d]; vSU3 *u = &V[d]; g = &gg[d]; for (c = 0; c < 3; c++) { h->f[0][c].re=u->v[c][0].re*g->f[0][0].re-u->v[c][0].im*g->f[0][0].im +u->v[c][1].re*g->f[0][1].re-u->v[c][1].im*g->f[0][1].im +u->v[c][2].re*g->f[0][2].re-u->v[c][2].im*g->f[0][2].im; h->f[0][c].im=u->v[c][0].im*g->f[0][0].re+u->v[c][0].re*g->f[0][0].im +u->v[c][1].im*g->f[0][1].re+u->v[c][1].re*g->f[0][1].im +u->v[c][2].im*g->f[0][2].re+u->v[c][2].re*g->f[0][2].im; h->f[1][c].re=u->v[c][0].re*g->f[1][0].re-u->v[c][0].im*g->f[1][0].im +u->v[c][1].re*g->f[1][1].re-u->v[c][1].im*g->f[1][1].im +u->v[c][2].re*g->f[1][2].re-u->v[c][2].im*g->f[1][2].im; h->f[1][c].im=u->v[c][0].im*g->f[1][0].re+u->v[c][0].re*g->f[1][0].im +u->v[c][1].im*g->f[1][1].re+u->v[c][1].re*g->f[1][1].im +u->v[c][2].im*g->f[1][2].re+u->v[c][2].re*g->f[1][2].im; } } rs = &rx5[s]; for (c = 0; c < 3; c++) { k = 6; qs->f[0][c].re = hh[k].f[0][c].re; qs->f[2][c].re = hh[k].f[0][c].re; qs->f[0][c].im = hh[k].f[0][c].im; qs->f[2][c].im = hh[k].f[0][c].im; qs->f[1][c].re = hh[k].f[1][c].re; qs->f[3][c].re = hh[k].f[1][c].re; qs->f[1][c].im = hh[k].f[1][c].im; qs->f[3][c].im = hh[k].f[1][c].im; k = 7; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].im -= hh[k].f[1][c].im; k = 2; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im += hh[k].f[1][c].im; k = 3; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im -= hh[k].f[1][c].im; k = 0; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re += hh[k].f[1][c].im; k = 1; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re -= hh[k].f[1][c].im; k = 4; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re -= hh[k].f[1][c].im; k = 5; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re += hh[k].f[1][c].im; } } vhfzero(&zV); fx = vfx_A; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = 0; s < S_4_1; s++, fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } } rs = &rx5[S_4_1]; QSETUP(S_4_1) vput_3(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[0][c].re; zn.im = qs->f[0][c].im; vput_3(&zn.re, zX[0][c].re); vput_3(&zn.im, zX[0][c].im); rs->f[0][c].re = zn.re; rs->f[0][c].im = zn.im; zn.re = qs->f[1][c].re; zn.im = qs->f[1][c].im; vput_3(&zn.re, zX[1][c].re); vput_3(&zn.im, zX[1][c].im); rs->f[1][c].re = zn.re; rs->f[1][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = S_4; s--;) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs0re = (REAL *)&rs->f[0][c].re; REAL *rs0im = (REAL *)&rs->f[0][c].im; rs0re[3] = inv_a * rs0re[3] + b_over_a * yOut[0][c].re; rs0re[2] = inv_a * rs0re[2] + b_over_a * rs0re[3]; rs0re[1] = inv_a * rs0re[1] + b_over_a * rs0re[2]; yOut[0][c].re = rs0re[0] = inv_a * rs0re[0] + b_over_a * rs0re[1]; rs0im[3] = inv_a * rs0im[3] + b_over_a * yOut[0][c].im; rs0im[2] = inv_a * rs0im[2] + b_over_a * rs0im[3]; rs0im[1] = inv_a * rs0im[1] + b_over_a * rs0im[2]; yOut[0][c].im = rs0im[0] = inv_a * rs0im[0] + b_over_a * rs0im[1]; } { REAL *rs1re = (REAL *)&rs->f[1][c].re; REAL *rs1im = (REAL *)&rs->f[1][c].im; rs1re[3] = inv_a * rs1re[3] + b_over_a * yOut[1][c].re; rs1re[2] = inv_a * rs1re[2] + b_over_a * rs1re[3]; rs1re[1] = inv_a * rs1re[1] + b_over_a * rs1re[2]; yOut[1][c].re = rs1re[0] = inv_a * rs1re[0] + b_over_a * rs1re[1]; rs1im[3] = inv_a * rs1im[3] + b_over_a * yOut[1][c].im; rs1im[2] = inv_a * rs1im[2] + b_over_a * rs1im[3]; rs1im[1] = inv_a * rs1im[1] + b_over_a * rs1im[2]; yOut[1][c].im = rs1im[0] = inv_a * rs1im[0] + b_over_a * rs1im[1]; } } } vhfzero(&zV); fx = vfx_B; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = S_4; --s; fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } } rs = &rx5[0]; QSETUP(0) vput_0(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[2][c].re; zn.im = qs->f[2][c].im; vput_0(&zn.re, zX[0][c].re); vput_0(&zn.im, zX[0][c].im); rs->f[2][c].re = zn.re; rs->f[2][c].im = zn.im; zn.re = qs->f[3][c].re; zn.im = qs->f[3][c].im; vput_0(&zn.re, zX[1][c].re); vput_0(&zn.im, zX[1][c].im); rs->f[3][c].re = zn.re; rs->f[3][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = 0; s < S_4; s++) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs2re = (REAL *)&rs->f[2][c].re; REAL *rs2im = (REAL *)&rs->f[2][c].im; rs2re[0] = inv_a * rs2re[0] + b_over_a * yOut[0][c].re; rs2re[1] = inv_a * rs2re[1] + b_over_a * rs2re[0]; rs2re[2] = inv_a * rs2re[2] + b_over_a * rs2re[1]; yOut[0][c].re = rs2re[3] = inv_a * rs2re[3] + b_over_a * rs2re[2]; rs2im[0] = inv_a * rs2im[0] + b_over_a * yOut[0][c].im; rs2im[1] = inv_a * rs2im[1] + b_over_a * rs2im[0]; rs2im[2] = inv_a * rs2im[2] + b_over_a * rs2im[1]; yOut[0][c].im = rs2im[3] = inv_a * rs2im[3] + b_over_a * rs2im[2]; } { REAL *rs3re = (REAL *)&rs->f[3][c].re; REAL *rs3im = (REAL *)&rs->f[3][c].im; rs3re[0] = inv_a * rs3re[0] + b_over_a * yOut[1][c].re; rs3re[1] = inv_a * rs3re[1] + b_over_a * rs3re[0]; rs3re[2] = inv_a * rs3re[2] + b_over_a * rs3re[1]; yOut[1][c].re = rs3re[3] = inv_a * rs3re[3] + b_over_a * rs3re[2]; rs3im[0] = inv_a * rs3im[0] + b_over_a * yOut[1][c].im; rs3im[1] = inv_a * rs3im[1] + b_over_a * rs3im[0]; rs3im[2] = inv_a * rs3im[2] + b_over_a * rs3im[1]; yOut[1][c].im = rs3im[3] = inv_a * rs3im[3] + b_over_a * rs3im[2]; } } } } QMP_wait(nb->qmp_cr); for (i = 0; i < nb->boundary_size; i++) { int m = nb->boundary[i].mask; xyzt = nb->boundary[i].index; xyzt5 = xyzt * S_4; for (d = 0; d < 8; d++) p5[d] = nb->site[xyzt].F[d]; rx5 = &chi[xyzt5]; Uup = &U[nb->site[xyzt].Uup]; for (d = 0; d < 4; d++, Uup++) { Udown = &U[nb->site[xyzt].Udown[d]]; for (c1 = 0; c1 < 3; c1++) { for (c2 = 0; c2 < 3; c2++) { V[d*2+0].v[c1][c2].re = vmk1(Uup->v[c1][c2].re); V[d*2+0].v[c1][c2].im = vmk1(Uup->v[c1][c2].im); /* conjugate down-link */ V[d*2+1].v[c1][c2].re = vmk1( Udown->v[c2][c1].re); V[d*2+1].v[c1][c2].im = vmk1(-Udown->v[c2][c1].im); } } } for (s = 0; s < S_4; s++) { for (d = 0; d < 8; d++) { ps[d] = p5[d] + s; } for (c = 0; c < 3; c++) { if ((m & 0x01) == 0) { k=0; f=&psi[ps[0]]; g=&gg[0]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; } if ((m & 0x02) == 0) { k=1; f=&psi[ps[1]]; g=&gg[1]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; } if ((m & 0x04) == 0) { k=2; f=&psi[ps[2]]; g=&gg[2]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; } if ((m & 0x08) == 0) { k=3; f=&psi[ps[3]]; g=&gg[3]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; } if ((m & 0x10) == 0) { k=4; f=&psi[ps[4]]; g=&gg[4]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; } if ((m & 0x20) == 0) { k=5; f=&psi[ps[5]]; g=&gg[5]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; } if ((m & 0x40) == 0) { k=6; f=&psi[ps[6]]; g=&gg[6]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; } if ((m & 0x80) == 0) { k=7; f=&psi[ps[7]]; g=&gg[7]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; } } for (d = 0; d < 8; d++) { vHalfFermion * __restrict__ h = &hh[d]; vSU3 *u = &V[d]; g = (m & (1 << d))? &nb->rcv_buf[d][ps[d]]: &gg[d]; for (c = 0; c < 3; c++) { h->f[0][c].re=u->v[c][0].re*g->f[0][0].re-u->v[c][0].im*g->f[0][0].im +u->v[c][1].re*g->f[0][1].re-u->v[c][1].im*g->f[0][1].im +u->v[c][2].re*g->f[0][2].re-u->v[c][2].im*g->f[0][2].im; h->f[0][c].im=u->v[c][0].im*g->f[0][0].re+u->v[c][0].re*g->f[0][0].im +u->v[c][1].im*g->f[0][1].re+u->v[c][1].re*g->f[0][1].im +u->v[c][2].im*g->f[0][2].re+u->v[c][2].re*g->f[0][2].im; h->f[1][c].re=u->v[c][0].re*g->f[1][0].re-u->v[c][0].im*g->f[1][0].im +u->v[c][1].re*g->f[1][1].re-u->v[c][1].im*g->f[1][1].im +u->v[c][2].re*g->f[1][2].re-u->v[c][2].im*g->f[1][2].im; h->f[1][c].im=u->v[c][0].im*g->f[1][0].re+u->v[c][0].re*g->f[1][0].im +u->v[c][1].im*g->f[1][1].re+u->v[c][1].re*g->f[1][1].im +u->v[c][2].im*g->f[1][2].re+u->v[c][2].re*g->f[1][2].im; } } rs = &rx5[s]; for (c = 0; c < 3; c++) { k = 6; qs->f[0][c].re = hh[k].f[0][c].re; qs->f[2][c].re = hh[k].f[0][c].re; qs->f[0][c].im = hh[k].f[0][c].im; qs->f[2][c].im = hh[k].f[0][c].im; qs->f[1][c].re = hh[k].f[1][c].re; qs->f[3][c].re = hh[k].f[1][c].re; qs->f[1][c].im = hh[k].f[1][c].im; qs->f[3][c].im = hh[k].f[1][c].im; k = 7; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].im -= hh[k].f[1][c].im; k = 2; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im += hh[k].f[1][c].im; k = 3; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im -= hh[k].f[1][c].im; k = 0; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re += hh[k].f[1][c].im; k = 1; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re -= hh[k].f[1][c].im; k = 4; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re -= hh[k].f[1][c].im; k = 5; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re += hh[k].f[1][c].im; } } vhfzero(&zV); fx = vfx_A; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = 0; s < S_4_1; s++, fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } } rs = &rx5[S_4_1]; QSETUP(S_4_1) vput_3(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[0][c].re; zn.im = qs->f[0][c].im; vput_3(&zn.re, zX[0][c].re); vput_3(&zn.im, zX[0][c].im); rs->f[0][c].re = zn.re; rs->f[0][c].im = zn.im; zn.re = qs->f[1][c].re; zn.im = qs->f[1][c].im; vput_3(&zn.re, zX[1][c].re); vput_3(&zn.im, zX[1][c].im); rs->f[1][c].re = zn.re; rs->f[1][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = S_4; s--;) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs0re = (REAL *)&rs->f[0][c].re; REAL *rs0im = (REAL *)&rs->f[0][c].im; rs0re[3] = inv_a * rs0re[3] + b_over_a * yOut[0][c].re; rs0re[2] = inv_a * rs0re[2] + b_over_a * rs0re[3]; rs0re[1] = inv_a * rs0re[1] + b_over_a * rs0re[2]; yOut[0][c].re = rs0re[0] = inv_a * rs0re[0] + b_over_a * rs0re[1]; rs0im[3] = inv_a * rs0im[3] + b_over_a * yOut[0][c].im; rs0im[2] = inv_a * rs0im[2] + b_over_a * rs0im[3]; rs0im[1] = inv_a * rs0im[1] + b_over_a * rs0im[2]; yOut[0][c].im = rs0im[0] = inv_a * rs0im[0] + b_over_a * rs0im[1]; } { REAL *rs1re = (REAL *)&rs->f[1][c].re; REAL *rs1im = (REAL *)&rs->f[1][c].im; rs1re[3] = inv_a * rs1re[3] + b_over_a * yOut[1][c].re; rs1re[2] = inv_a * rs1re[2] + b_over_a * rs1re[3]; rs1re[1] = inv_a * rs1re[1] + b_over_a * rs1re[2]; yOut[1][c].re = rs1re[0] = inv_a * rs1re[0] + b_over_a * rs1re[1]; rs1im[3] = inv_a * rs1im[3] + b_over_a * yOut[1][c].im; rs1im[2] = inv_a * rs1im[2] + b_over_a * rs1im[3]; rs1im[1] = inv_a * rs1im[1] + b_over_a * rs1im[2]; yOut[1][c].im = rs1im[0] = inv_a * rs1im[0] + b_over_a * rs1im[1]; } } } vhfzero(&zV); fx = vfx_B; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = S_4; --s; fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } } rs = &rx5[0]; QSETUP(0) vput_0(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[2][c].re; zn.im = qs->f[2][c].im; vput_0(&zn.re, zX[0][c].re); vput_0(&zn.im, zX[0][c].im); rs->f[2][c].re = zn.re; rs->f[2][c].im = zn.im; zn.re = qs->f[3][c].re; zn.im = qs->f[3][c].im; vput_0(&zn.re, zX[1][c].re); vput_0(&zn.im, zX[1][c].im); rs->f[3][c].re = zn.re; rs->f[3][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = 0; s < S_4; s++) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs2re = (REAL *)&rs->f[2][c].re; REAL *rs2im = (REAL *)&rs->f[2][c].im; rs2re[0] = inv_a * rs2re[0] + b_over_a * yOut[0][c].re; rs2re[1] = inv_a * rs2re[1] + b_over_a * rs2re[0]; rs2re[2] = inv_a * rs2re[2] + b_over_a * rs2re[1]; yOut[0][c].re = rs2re[3] = inv_a * rs2re[3] + b_over_a * rs2re[2]; rs2im[0] = inv_a * rs2im[0] + b_over_a * yOut[0][c].im; rs2im[1] = inv_a * rs2im[1] + b_over_a * rs2im[0]; rs2im[2] = inv_a * rs2im[2] + b_over_a * rs2im[1]; yOut[0][c].im = rs2im[3] = inv_a * rs2im[3] + b_over_a * rs2im[2]; } { REAL *rs3re = (REAL *)&rs->f[3][c].re; REAL *rs3im = (REAL *)&rs->f[3][c].im; rs3re[0] = inv_a * rs3re[0] + b_over_a * yOut[1][c].re; rs3re[1] = inv_a * rs3re[1] + b_over_a * rs3re[0]; rs3re[2] = inv_a * rs3re[2] + b_over_a * rs3re[1]; yOut[1][c].re = rs3re[3] = inv_a * rs3re[3] + b_over_a * rs3re[2]; rs3im[0] = inv_a * rs3im[0] + b_over_a * yOut[1][c].im; rs3im[1] = inv_a * rs3im[1] + b_over_a * rs3im[0]; rs3im[2] = inv_a * rs3im[2] + b_over_a * rs3im[1]; yOut[1][c].im = rs3im[3] = inv_a * rs3im[3] + b_over_a * rs3im[2]; } } } } #undef qs #undef qx5 } static void compute_Sxx1Sxy(vFermion *chi, const vFermion *psi, struct neighbor *nb) { int i, xyzt5, s, c; vFermion * __restrict__ rx5, * __restrict__ rs; int xyzt, k, d; const vFermion *f; vHalfFermion *g; vHalfFermion gg[8], hh[8]; vSU3 V[8]; int ps[8], p5[8]; const SU3 *Uup, *Udown; int c1, c2; vReal fx; vHalfFermion zV; vcomplex zn; complex zX[2][3]; complex yOut[2][3]; #define qx5 rx5 #define qs rs QMP_start(nb->qmp_cr); if (sending) { int i; /* This is QMP_wait_vector(nb->qmp_sv, nb->Ns); */ for (i = sending->Ns; i--;) QMP_wait(sending->qmp_sv[i]); sending = 0; } { int k, i, s, c, *src; const vFermion *f; vHalfFermion *g; k = 0; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 1; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 2; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 3; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 4; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 5; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 6; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 7; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } } for (i = 0; i < nb->inside_size; i++) { xyzt = nb->inside[i]; xyzt5 = xyzt * S_4; for (d = 0; d < 8; d++) p5[d] = nb->site[xyzt].F[d]; rx5 = &chi[xyzt5]; Uup = &U[nb->site[xyzt].Uup]; for (d = 0; d < 4; d++, Uup++) { Udown = &U[nb->site[xyzt].Udown[d]]; for (c1 = 0; c1 < 3; c1++) { for (c2 = 0; c2 < 3; c2++) { V[d*2+0].v[c1][c2].re = vmk1(Uup->v[c1][c2].re); V[d*2+0].v[c1][c2].im = vmk1(Uup->v[c1][c2].im); /* conjugate down-link */ V[d*2+1].v[c1][c2].re = vmk1( Udown->v[c2][c1].re); V[d*2+1].v[c1][c2].im = vmk1(-Udown->v[c2][c1].im); } } } for (s = 0; s < S_4; s++) { for (d = 0; d < 8; d++) { ps[d] = p5[d] + s; } for (c = 0; c < 3; c++) { k=0; f=&psi[ps[0]]; g=&gg[0]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; k=1; f=&psi[ps[1]]; g=&gg[1]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; k=2; f=&psi[ps[2]]; g=&gg[2]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; k=3; f=&psi[ps[3]]; g=&gg[3]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; k=4; f=&psi[ps[4]]; g=&gg[4]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; k=5; f=&psi[ps[5]]; g=&gg[5]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; k=6; f=&psi[ps[6]]; g=&gg[6]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; k=7; f=&psi[ps[7]]; g=&gg[7]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; } for (d = 0; d < 8; d++) { vHalfFermion * __restrict__ h = &hh[d]; vSU3 *u = &V[d]; g = &gg[d]; for (c = 0; c < 3; c++) { h->f[0][c].re=u->v[c][0].re*g->f[0][0].re-u->v[c][0].im*g->f[0][0].im +u->v[c][1].re*g->f[0][1].re-u->v[c][1].im*g->f[0][1].im +u->v[c][2].re*g->f[0][2].re-u->v[c][2].im*g->f[0][2].im; h->f[0][c].im=u->v[c][0].im*g->f[0][0].re+u->v[c][0].re*g->f[0][0].im +u->v[c][1].im*g->f[0][1].re+u->v[c][1].re*g->f[0][1].im +u->v[c][2].im*g->f[0][2].re+u->v[c][2].re*g->f[0][2].im; h->f[1][c].re=u->v[c][0].re*g->f[1][0].re-u->v[c][0].im*g->f[1][0].im +u->v[c][1].re*g->f[1][1].re-u->v[c][1].im*g->f[1][1].im +u->v[c][2].re*g->f[1][2].re-u->v[c][2].im*g->f[1][2].im; h->f[1][c].im=u->v[c][0].im*g->f[1][0].re+u->v[c][0].re*g->f[1][0].im +u->v[c][1].im*g->f[1][1].re+u->v[c][1].re*g->f[1][1].im +u->v[c][2].im*g->f[1][2].re+u->v[c][2].re*g->f[1][2].im; } } rs = &rx5[s]; for (c = 0; c < 3; c++) { k = 7; qs->f[0][c].re = hh[k].f[0][c].re; qs->f[2][c].re = hh[k].f[0][c].re; qs->f[0][c].im = hh[k].f[0][c].im; qs->f[2][c].im = hh[k].f[0][c].im; qs->f[1][c].re = hh[k].f[1][c].re; qs->f[3][c].re = hh[k].f[1][c].re; qs->f[1][c].im = hh[k].f[1][c].im; qs->f[3][c].im = hh[k].f[1][c].im; k = 6; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].im -= hh[k].f[1][c].im; k = 3; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im += hh[k].f[1][c].im; k = 2; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im -= hh[k].f[1][c].im; k = 0; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re -= hh[k].f[1][c].im; k = 1; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re += hh[k].f[1][c].im; k = 4; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re += hh[k].f[1][c].im; k = 5; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re -= hh[k].f[1][c].im; } } vhfzero(&zV); fx = vfx_A; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = 0; s < S_4_1; s++, fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } } rs = &rx5[S_4_1]; QSETUP(S_4_1) vput_3(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[2][c].re; zn.im = qs->f[2][c].im; vput_3(&zn.re, zX[0][c].re); vput_3(&zn.im, zX[0][c].im); rs->f[2][c].re = zn.re; rs->f[2][c].im = zn.im; zn.re = qs->f[3][c].re; zn.im = qs->f[3][c].im; vput_3(&zn.re, zX[1][c].re); vput_3(&zn.im, zX[1][c].im); rs->f[3][c].re = zn.re; rs->f[3][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = S_4; s--;) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs2re = (REAL *)&rs->f[2][c].re; REAL *rs2im = (REAL *)&rs->f[2][c].im; rs2re[3] = inv_a * rs2re[3] + b_over_a * yOut[0][c].re; rs2re[2] = inv_a * rs2re[2] + b_over_a * rs2re[3]; rs2re[1] = inv_a * rs2re[1] + b_over_a * rs2re[2]; yOut[0][c].re = rs2re[0] = inv_a * rs2re[0] + b_over_a * rs2re[1]; rs2im[3] = inv_a * rs2im[3] + b_over_a * yOut[0][c].im; rs2im[2] = inv_a * rs2im[2] + b_over_a * rs2im[3]; rs2im[1] = inv_a * rs2im[1] + b_over_a * rs2im[2]; yOut[0][c].im = rs2im[0] = inv_a * rs2im[0] + b_over_a * rs2im[1]; } { REAL *rs3re = (REAL *)&rs->f[3][c].re; REAL *rs3im = (REAL *)&rs->f[3][c].im; rs3re[3] = inv_a * rs3re[3] + b_over_a * yOut[1][c].re; rs3re[2] = inv_a * rs3re[2] + b_over_a * rs3re[3]; rs3re[1] = inv_a * rs3re[1] + b_over_a * rs3re[2]; yOut[1][c].re = rs3re[0] = inv_a * rs3re[0] + b_over_a * rs3re[1]; rs3im[3] = inv_a * rs3im[3] + b_over_a * yOut[1][c].im; rs3im[2] = inv_a * rs3im[2] + b_over_a * rs3im[3]; rs3im[1] = inv_a * rs3im[1] + b_over_a * rs3im[2]; yOut[1][c].im = rs3im[0] = inv_a * rs3im[0] + b_over_a * rs3im[1]; } } } vhfzero(&zV); fx = vfx_B; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = S_4; --s; fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } } rs = &rx5[0]; QSETUP(0) vput_0(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[0][c].re; zn.im = qs->f[0][c].im; vput_0(&zn.re, zX[0][c].re); vput_0(&zn.im, zX[0][c].im); rs->f[0][c].re = zn.re; rs->f[0][c].im = zn.im; zn.re = qs->f[1][c].re; zn.im = qs->f[1][c].im; vput_0(&zn.re, zX[1][c].re); vput_0(&zn.im, zX[1][c].im); rs->f[1][c].re = zn.re; rs->f[1][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = 0; s < S_4; s++) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs0re = (REAL *)&rs->f[0][c].re; REAL *rs0im = (REAL *)&rs->f[0][c].im; rs0re[0] = inv_a * rs0re[0] + b_over_a * yOut[0][c].re; rs0re[1] = inv_a * rs0re[1] + b_over_a * rs0re[0]; rs0re[2] = inv_a * rs0re[2] + b_over_a * rs0re[1]; yOut[0][c].re = rs0re[3] = inv_a * rs0re[3] + b_over_a * rs0re[2]; rs0im[0] = inv_a * rs0im[0] + b_over_a * yOut[0][c].im; rs0im[1] = inv_a * rs0im[1] + b_over_a * rs0im[0]; rs0im[2] = inv_a * rs0im[2] + b_over_a * rs0im[1]; yOut[0][c].im = rs0im[3] = inv_a * rs0im[3] + b_over_a * rs0im[2]; } { REAL *rs1re = (REAL *)&rs->f[1][c].re; REAL *rs1im = (REAL *)&rs->f[1][c].im; rs1re[0] = inv_a * rs1re[0] + b_over_a * yOut[1][c].re; rs1re[1] = inv_a * rs1re[1] + b_over_a * rs1re[0]; rs1re[2] = inv_a * rs1re[2] + b_over_a * rs1re[1]; yOut[1][c].re = rs1re[3] = inv_a * rs1re[3] + b_over_a * rs1re[2]; rs1im[0] = inv_a * rs1im[0] + b_over_a * yOut[1][c].im; rs1im[1] = inv_a * rs1im[1] + b_over_a * rs1im[0]; rs1im[2] = inv_a * rs1im[2] + b_over_a * rs1im[1]; yOut[1][c].im = rs1im[3] = inv_a * rs1im[3] + b_over_a * rs1im[2]; } } } } QMP_wait(nb->qmp_cr); for (i = 0; i < nb->boundary_size; i++) { int m = nb->boundary[i].mask; xyzt = nb->boundary[i].index; xyzt5 = xyzt * S_4; for (d = 0; d < 8; d++) p5[d] = nb->site[xyzt].F[d]; rx5 = &chi[xyzt5]; Uup = &U[nb->site[xyzt].Uup]; for (d = 0; d < 4; d++, Uup++) { Udown = &U[nb->site[xyzt].Udown[d]]; for (c1 = 0; c1 < 3; c1++) { for (c2 = 0; c2 < 3; c2++) { V[d*2+0].v[c1][c2].re = vmk1(Uup->v[c1][c2].re); V[d*2+0].v[c1][c2].im = vmk1(Uup->v[c1][c2].im); /* conjugate down-link */ V[d*2+1].v[c1][c2].re = vmk1( Udown->v[c2][c1].re); V[d*2+1].v[c1][c2].im = vmk1(-Udown->v[c2][c1].im); } } } for (s = 0; s < S_4; s++) { for (d = 0; d < 8; d++) { ps[d] = p5[d] + s; } for (c = 0; c < 3; c++) { if ((m & 0x01) == 0) { k=0; f=&psi[ps[0]]; g=&gg[0]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; } if ((m & 0x02) == 0) { k=1; f=&psi[ps[1]]; g=&gg[1]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; } if ((m & 0x04) == 0) { k=2; f=&psi[ps[2]]; g=&gg[2]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; } if ((m & 0x08) == 0) { k=3; f=&psi[ps[3]]; g=&gg[3]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; } if ((m & 0x10) == 0) { k=4; f=&psi[ps[4]]; g=&gg[4]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; } if ((m & 0x20) == 0) { k=5; f=&psi[ps[5]]; g=&gg[5]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; } if ((m & 0x40) == 0) { k=6; f=&psi[ps[6]]; g=&gg[6]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; } if ((m & 0x80) == 0) { k=7; f=&psi[ps[7]]; g=&gg[7]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; } } for (d = 0; d < 8; d++) { vHalfFermion * __restrict__ h = &hh[d]; vSU3 *u = &V[d]; g = (m & (1 << d))? &nb->rcv_buf[d][ps[d]]: &gg[d]; for (c = 0; c < 3; c++) { h->f[0][c].re=u->v[c][0].re*g->f[0][0].re-u->v[c][0].im*g->f[0][0].im +u->v[c][1].re*g->f[0][1].re-u->v[c][1].im*g->f[0][1].im +u->v[c][2].re*g->f[0][2].re-u->v[c][2].im*g->f[0][2].im; h->f[0][c].im=u->v[c][0].im*g->f[0][0].re+u->v[c][0].re*g->f[0][0].im +u->v[c][1].im*g->f[0][1].re+u->v[c][1].re*g->f[0][1].im +u->v[c][2].im*g->f[0][2].re+u->v[c][2].re*g->f[0][2].im; h->f[1][c].re=u->v[c][0].re*g->f[1][0].re-u->v[c][0].im*g->f[1][0].im +u->v[c][1].re*g->f[1][1].re-u->v[c][1].im*g->f[1][1].im +u->v[c][2].re*g->f[1][2].re-u->v[c][2].im*g->f[1][2].im; h->f[1][c].im=u->v[c][0].im*g->f[1][0].re+u->v[c][0].re*g->f[1][0].im +u->v[c][1].im*g->f[1][1].re+u->v[c][1].re*g->f[1][1].im +u->v[c][2].im*g->f[1][2].re+u->v[c][2].re*g->f[1][2].im; } } rs = &rx5[s]; for (c = 0; c < 3; c++) { k = 7; qs->f[0][c].re = hh[k].f[0][c].re; qs->f[2][c].re = hh[k].f[0][c].re; qs->f[0][c].im = hh[k].f[0][c].im; qs->f[2][c].im = hh[k].f[0][c].im; qs->f[1][c].re = hh[k].f[1][c].re; qs->f[3][c].re = hh[k].f[1][c].re; qs->f[1][c].im = hh[k].f[1][c].im; qs->f[3][c].im = hh[k].f[1][c].im; k = 6; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].im -= hh[k].f[1][c].im; k = 3; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im += hh[k].f[1][c].im; k = 2; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im -= hh[k].f[1][c].im; k = 0; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re -= hh[k].f[1][c].im; k = 1; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re += hh[k].f[1][c].im; k = 4; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re += hh[k].f[1][c].im; k = 5; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re -= hh[k].f[1][c].im; } } vhfzero(&zV); fx = vfx_A; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = 0; s < S_4_1; s++, fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } } rs = &rx5[S_4_1]; QSETUP(S_4_1) vput_3(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[2][c].re; zn.im = qs->f[2][c].im; vput_3(&zn.re, zX[0][c].re); vput_3(&zn.im, zX[0][c].im); rs->f[2][c].re = zn.re; rs->f[2][c].im = zn.im; zn.re = qs->f[3][c].re; zn.im = qs->f[3][c].im; vput_3(&zn.re, zX[1][c].re); vput_3(&zn.im, zX[1][c].im); rs->f[3][c].re = zn.re; rs->f[3][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = S_4; s--;) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs2re = (REAL *)&rs->f[2][c].re; REAL *rs2im = (REAL *)&rs->f[2][c].im; rs2re[3] = inv_a * rs2re[3] + b_over_a * yOut[0][c].re; rs2re[2] = inv_a * rs2re[2] + b_over_a * rs2re[3]; rs2re[1] = inv_a * rs2re[1] + b_over_a * rs2re[2]; yOut[0][c].re = rs2re[0] = inv_a * rs2re[0] + b_over_a * rs2re[1]; rs2im[3] = inv_a * rs2im[3] + b_over_a * yOut[0][c].im; rs2im[2] = inv_a * rs2im[2] + b_over_a * rs2im[3]; rs2im[1] = inv_a * rs2im[1] + b_over_a * rs2im[2]; yOut[0][c].im = rs2im[0] = inv_a * rs2im[0] + b_over_a * rs2im[1]; } { REAL *rs3re = (REAL *)&rs->f[3][c].re; REAL *rs3im = (REAL *)&rs->f[3][c].im; rs3re[3] = inv_a * rs3re[3] + b_over_a * yOut[1][c].re; rs3re[2] = inv_a * rs3re[2] + b_over_a * rs3re[3]; rs3re[1] = inv_a * rs3re[1] + b_over_a * rs3re[2]; yOut[1][c].re = rs3re[0] = inv_a * rs3re[0] + b_over_a * rs3re[1]; rs3im[3] = inv_a * rs3im[3] + b_over_a * yOut[1][c].im; rs3im[2] = inv_a * rs3im[2] + b_over_a * rs3im[3]; rs3im[1] = inv_a * rs3im[1] + b_over_a * rs3im[2]; yOut[1][c].im = rs3im[0] = inv_a * rs3im[0] + b_over_a * rs3im[1]; } } } vhfzero(&zV); fx = vfx_B; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = S_4; --s; fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } } rs = &rx5[0]; QSETUP(0) vput_0(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[0][c].re; zn.im = qs->f[0][c].im; vput_0(&zn.re, zX[0][c].re); vput_0(&zn.im, zX[0][c].im); rs->f[0][c].re = zn.re; rs->f[0][c].im = zn.im; zn.re = qs->f[1][c].re; zn.im = qs->f[1][c].im; vput_0(&zn.re, zX[1][c].re); vput_0(&zn.im, zX[1][c].im); rs->f[1][c].re = zn.re; rs->f[1][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = 0; s < S_4; s++) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs0re = (REAL *)&rs->f[0][c].re; REAL *rs0im = (REAL *)&rs->f[0][c].im; rs0re[0] = inv_a * rs0re[0] + b_over_a * yOut[0][c].re; rs0re[1] = inv_a * rs0re[1] + b_over_a * rs0re[0]; rs0re[2] = inv_a * rs0re[2] + b_over_a * rs0re[1]; yOut[0][c].re = rs0re[3] = inv_a * rs0re[3] + b_over_a * rs0re[2]; rs0im[0] = inv_a * rs0im[0] + b_over_a * yOut[0][c].im; rs0im[1] = inv_a * rs0im[1] + b_over_a * rs0im[0]; rs0im[2] = inv_a * rs0im[2] + b_over_a * rs0im[1]; yOut[0][c].im = rs0im[3] = inv_a * rs0im[3] + b_over_a * rs0im[2]; } { REAL *rs1re = (REAL *)&rs->f[1][c].re; REAL *rs1im = (REAL *)&rs->f[1][c].im; rs1re[0] = inv_a * rs1re[0] + b_over_a * yOut[1][c].re; rs1re[1] = inv_a * rs1re[1] + b_over_a * rs1re[0]; rs1re[2] = inv_a * rs1re[2] + b_over_a * rs1re[1]; yOut[1][c].re = rs1re[3] = inv_a * rs1re[3] + b_over_a * rs1re[2]; rs1im[0] = inv_a * rs1im[0] + b_over_a * yOut[1][c].im; rs1im[1] = inv_a * rs1im[1] + b_over_a * rs1im[0]; rs1im[2] = inv_a * rs1im[2] + b_over_a * rs1im[1]; yOut[1][c].im = rs1im[3] = inv_a * rs1im[3] + b_over_a * rs1im[2]; } } } } #undef qs #undef qx5 } static void compute_1Qxx1Qxy(vFermion *chi, double *norm, const vFermion *eta, const vFermion *psi, struct neighbor *nb) { int i, xyzt5, s, c; vFermion * __restrict__ rx5, * __restrict__ rs; int xyzt, k, d; const vFermion *f; vHalfFermion *g; vHalfFermion gg[8], hh[8]; vSU3 V[8]; int ps[8], p5[8]; const SU3 *Uup, *Udown; int c1, c2; vReal fx; vHalfFermion zV; vcomplex zn; complex zX[2][3]; complex yOut[2][3]; vReal vv; vReal nv; *norm = 0; #define qx5 rx5 #define qs rs QMP_start(nb->qmp_cr); if (sending) { int i; /* This is QMP_wait_vector(nb->qmp_sv, nb->Ns); */ for (i = sending->Ns; i--;) QMP_wait(sending->qmp_sv[i]); sending = 0; } { int k, i, s, c, *src; const vFermion *f; vHalfFermion *g; k = 0; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 1; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 2; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 3; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 4; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 5; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 6; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } k = 7; for (i = nb->snd_size[k], g = nb->snd_buf[k], src = nb->snd[k]; i--; src++) { for (s = S_4, f = &psi[*src]; s--; g++, f++) { for (c = 0; c < 3; c++) { g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; } } } if (nb->qmp_smask & (1 << k)) { QMP_start(nb->qmp_sh[k]); sending = nb; } } for (i = 0; i < nb->inside_size; i++) { const vFermion *ex5, *es; xyzt = nb->inside[i]; xyzt5 = xyzt * S_4; for (d = 0; d < 8; d++) p5[d] = nb->site[xyzt].F[d]; rx5 = &chi[xyzt5]; ex5 = &eta[xyzt5]; Uup = &U[nb->site[xyzt].Uup]; for (d = 0; d < 4; d++, Uup++) { Udown = &U[nb->site[xyzt].Udown[d]]; for (c1 = 0; c1 < 3; c1++) { for (c2 = 0; c2 < 3; c2++) { V[d*2+0].v[c1][c2].re = vmk1(Uup->v[c1][c2].re); V[d*2+0].v[c1][c2].im = vmk1(Uup->v[c1][c2].im); /* conjugate down-link */ V[d*2+1].v[c1][c2].re = vmk1( Udown->v[c2][c1].re); V[d*2+1].v[c1][c2].im = vmk1(-Udown->v[c2][c1].im); } } } for (s = 0; s < S_4; s++) { for (d = 0; d < 8; d++) { ps[d] = p5[d] + s; } for (c = 0; c < 3; c++) { k=0; f=&psi[ps[0]]; g=&gg[0]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; k=1; f=&psi[ps[1]]; g=&gg[1]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; k=2; f=&psi[ps[2]]; g=&gg[2]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; k=3; f=&psi[ps[3]]; g=&gg[3]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; k=4; f=&psi[ps[4]]; g=&gg[4]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; k=5; f=&psi[ps[5]]; g=&gg[5]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; k=6; f=&psi[ps[6]]; g=&gg[6]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; k=7; f=&psi[ps[7]]; g=&gg[7]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; } for (d = 0; d < 8; d++) { vHalfFermion * __restrict__ h = &hh[d]; vSU3 *u = &V[d]; g = &gg[d]; for (c = 0; c < 3; c++) { h->f[0][c].re=u->v[c][0].re*g->f[0][0].re-u->v[c][0].im*g->f[0][0].im +u->v[c][1].re*g->f[0][1].re-u->v[c][1].im*g->f[0][1].im +u->v[c][2].re*g->f[0][2].re-u->v[c][2].im*g->f[0][2].im; h->f[0][c].im=u->v[c][0].im*g->f[0][0].re+u->v[c][0].re*g->f[0][0].im +u->v[c][1].im*g->f[0][1].re+u->v[c][1].re*g->f[0][1].im +u->v[c][2].im*g->f[0][2].re+u->v[c][2].re*g->f[0][2].im; h->f[1][c].re=u->v[c][0].re*g->f[1][0].re-u->v[c][0].im*g->f[1][0].im +u->v[c][1].re*g->f[1][1].re-u->v[c][1].im*g->f[1][1].im +u->v[c][2].re*g->f[1][2].re-u->v[c][2].im*g->f[1][2].im; h->f[1][c].im=u->v[c][0].im*g->f[1][0].re+u->v[c][0].re*g->f[1][0].im +u->v[c][1].im*g->f[1][1].re+u->v[c][1].re*g->f[1][1].im +u->v[c][2].im*g->f[1][2].re+u->v[c][2].re*g->f[1][2].im; } } rs = &rx5[s]; for (c = 0; c < 3; c++) { k = 6; qs->f[0][c].re = hh[k].f[0][c].re; qs->f[2][c].re = hh[k].f[0][c].re; qs->f[0][c].im = hh[k].f[0][c].im; qs->f[2][c].im = hh[k].f[0][c].im; qs->f[1][c].re = hh[k].f[1][c].re; qs->f[3][c].re = hh[k].f[1][c].re; qs->f[1][c].im = hh[k].f[1][c].im; qs->f[3][c].im = hh[k].f[1][c].im; k = 7; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].im -= hh[k].f[1][c].im; k = 2; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im += hh[k].f[1][c].im; k = 3; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im -= hh[k].f[1][c].im; k = 0; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re += hh[k].f[1][c].im; k = 1; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re -= hh[k].f[1][c].im; k = 4; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re -= hh[k].f[1][c].im; k = 5; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re += hh[k].f[1][c].im; } } vhfzero(&zV); fx = vfx_A; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = 0; s < S_4_1; s++, fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } } rs = &rx5[S_4_1]; QSETUP(S_4_1) vput_3(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[0][c].re; zn.im = qs->f[0][c].im; vput_3(&zn.re, zX[0][c].re); vput_3(&zn.im, zX[0][c].im); rs->f[0][c].re = zn.re; rs->f[0][c].im = zn.im; zn.re = qs->f[1][c].re; zn.im = qs->f[1][c].im; vput_3(&zn.re, zX[1][c].re); vput_3(&zn.im, zX[1][c].im); rs->f[1][c].re = zn.re; rs->f[1][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = S_4; s--;) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs0re = (REAL *)&rs->f[0][c].re; REAL *rs0im = (REAL *)&rs->f[0][c].im; rs0re[3] = inv_a * rs0re[3] + b_over_a * yOut[0][c].re; rs0re[2] = inv_a * rs0re[2] + b_over_a * rs0re[3]; rs0re[1] = inv_a * rs0re[1] + b_over_a * rs0re[2]; yOut[0][c].re = rs0re[0] = inv_a * rs0re[0] + b_over_a * rs0re[1]; rs0im[3] = inv_a * rs0im[3] + b_over_a * yOut[0][c].im; rs0im[2] = inv_a * rs0im[2] + b_over_a * rs0im[3]; rs0im[1] = inv_a * rs0im[1] + b_over_a * rs0im[2]; yOut[0][c].im = rs0im[0] = inv_a * rs0im[0] + b_over_a * rs0im[1]; } { REAL *rs1re = (REAL *)&rs->f[1][c].re; REAL *rs1im = (REAL *)&rs->f[1][c].im; rs1re[3] = inv_a * rs1re[3] + b_over_a * yOut[1][c].re; rs1re[2] = inv_a * rs1re[2] + b_over_a * rs1re[3]; rs1re[1] = inv_a * rs1re[1] + b_over_a * rs1re[2]; yOut[1][c].re = rs1re[0] = inv_a * rs1re[0] + b_over_a * rs1re[1]; rs1im[3] = inv_a * rs1im[3] + b_over_a * yOut[1][c].im; rs1im[2] = inv_a * rs1im[2] + b_over_a * rs1im[3]; rs1im[1] = inv_a * rs1im[1] + b_over_a * rs1im[2]; yOut[1][c].im = rs1im[0] = inv_a * rs1im[0] + b_over_a * rs1im[1]; } } } vhfzero(&zV); fx = vfx_B; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = S_4; --s; fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } } rs = &rx5[0]; QSETUP(0) vput_0(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[2][c].re; zn.im = qs->f[2][c].im; vput_0(&zn.re, zX[0][c].re); vput_0(&zn.im, zX[0][c].im); rs->f[2][c].re = zn.re; rs->f[2][c].im = zn.im; zn.re = qs->f[3][c].re; zn.im = qs->f[3][c].im; vput_0(&zn.re, zX[1][c].re); vput_0(&zn.im, zX[1][c].im); rs->f[3][c].re = zn.re; rs->f[3][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = 0; s < S_4; s++) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs2re = (REAL *)&rs->f[2][c].re; REAL *rs2im = (REAL *)&rs->f[2][c].im; rs2re[0] = inv_a * rs2re[0] + b_over_a * yOut[0][c].re; rs2re[1] = inv_a * rs2re[1] + b_over_a * rs2re[0]; rs2re[2] = inv_a * rs2re[2] + b_over_a * rs2re[1]; yOut[0][c].re = rs2re[3] = inv_a * rs2re[3] + b_over_a * rs2re[2]; rs2im[0] = inv_a * rs2im[0] + b_over_a * yOut[0][c].im; rs2im[1] = inv_a * rs2im[1] + b_over_a * rs2im[0]; rs2im[2] = inv_a * rs2im[2] + b_over_a * rs2im[1]; yOut[0][c].im = rs2im[3] = inv_a * rs2im[3] + b_over_a * rs2im[2]; } { REAL *rs3re = (REAL *)&rs->f[3][c].re; REAL *rs3im = (REAL *)&rs->f[3][c].im; rs3re[0] = inv_a * rs3re[0] + b_over_a * yOut[1][c].re; rs3re[1] = inv_a * rs3re[1] + b_over_a * rs3re[0]; rs3re[2] = inv_a * rs3re[2] + b_over_a * rs3re[1]; yOut[1][c].re = rs3re[3] = inv_a * rs3re[3] + b_over_a * rs3re[2]; rs3im[0] = inv_a * rs3im[0] + b_over_a * yOut[1][c].im; rs3im[1] = inv_a * rs3im[1] + b_over_a * rs3im[0]; rs3im[2] = inv_a * rs3im[2] + b_over_a * rs3im[1]; yOut[1][c].im = rs3im[3] = inv_a * rs3im[3] + b_over_a * rs3im[2]; } } } for (s = 0; s < S_4; s++) { rs = &rx5[s]; es = &ex5[s]; nv = vmk1(0.0); for (c = 0; c < 3; c++) { vv = es->f[0][c].re - rs->f[0][c].re; rs->f[0][c].re = vv; nv += vv * vv; vv = es->f[0][c].im - rs->f[0][c].im; rs->f[0][c].im = vv; nv += vv * vv; vv = es->f[1][c].re - rs->f[1][c].re; rs->f[1][c].re = vv; nv += vv * vv; vv = es->f[1][c].im - rs->f[1][c].im; rs->f[1][c].im = vv; nv += vv * vv; vv = es->f[2][c].re - rs->f[2][c].re; rs->f[2][c].re = vv; nv += vv * vv; vv = es->f[2][c].im - rs->f[2][c].im; rs->f[2][c].im = vv; nv += vv * vv; vv = es->f[3][c].re - rs->f[3][c].re; rs->f[3][c].re = vv; nv += vv * vv; vv = es->f[3][c].im - rs->f[3][c].im; rs->f[3][c].im = vv; nv += vv * vv; } *norm += vsum(nv); } } QMP_wait(nb->qmp_cr); for (i = 0; i < nb->boundary_size; i++) { const vFermion *ex5, *es; int m = nb->boundary[i].mask; xyzt = nb->boundary[i].index; xyzt5 = xyzt * S_4; for (d = 0; d < 8; d++) p5[d] = nb->site[xyzt].F[d]; rx5 = &chi[xyzt5]; ex5 = &eta[xyzt5]; Uup = &U[nb->site[xyzt].Uup]; for (d = 0; d < 4; d++, Uup++) { Udown = &U[nb->site[xyzt].Udown[d]]; for (c1 = 0; c1 < 3; c1++) { for (c2 = 0; c2 < 3; c2++) { V[d*2+0].v[c1][c2].re = vmk1(Uup->v[c1][c2].re); V[d*2+0].v[c1][c2].im = vmk1(Uup->v[c1][c2].im); /* conjugate down-link */ V[d*2+1].v[c1][c2].re = vmk1( Udown->v[c2][c1].re); V[d*2+1].v[c1][c2].im = vmk1(-Udown->v[c2][c1].im); } } } for (s = 0; s < S_4; s++) { for (d = 0; d < 8; d++) { ps[d] = p5[d] + s; } for (c = 0; c < 3; c++) { if ((m & 0x01) == 0) { k=0; f=&psi[ps[0]]; g=&gg[0]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].im; g->f[0][c].im = f->f[0][c].im + f->f[3][c].re; g->f[1][c].re = f->f[1][c].re - f->f[2][c].im; g->f[1][c].im = f->f[1][c].im + f->f[2][c].re; } if ((m & 0x02) == 0) { k=1; f=&psi[ps[1]]; g=&gg[1]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].im; g->f[0][c].im = f->f[0][c].im - f->f[3][c].re; g->f[1][c].re = f->f[1][c].re + f->f[2][c].im; g->f[1][c].im = f->f[1][c].im - f->f[2][c].re; } if ((m & 0x04) == 0) { k=2; f=&psi[ps[2]]; g=&gg[2]; g->f[0][c].re = f->f[0][c].re - f->f[3][c].re; g->f[0][c].im = f->f[0][c].im - f->f[3][c].im; g->f[1][c].re = f->f[1][c].re + f->f[2][c].re; g->f[1][c].im = f->f[1][c].im + f->f[2][c].im; } if ((m & 0x08) == 0) { k=3; f=&psi[ps[3]]; g=&gg[3]; g->f[0][c].re = f->f[0][c].re + f->f[3][c].re; g->f[0][c].im = f->f[0][c].im + f->f[3][c].im; g->f[1][c].re = f->f[1][c].re - f->f[2][c].re; g->f[1][c].im = f->f[1][c].im - f->f[2][c].im; } if ((m & 0x10) == 0) { k=4; f=&psi[ps[4]]; g=&gg[4]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].im; g->f[0][c].im = f->f[0][c].im + f->f[2][c].re; g->f[1][c].re = f->f[1][c].re + f->f[3][c].im; g->f[1][c].im = f->f[1][c].im - f->f[3][c].re; } if ((m & 0x20) == 0) { k=5; f=&psi[ps[5]]; g=&gg[5]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].im; g->f[0][c].im = f->f[0][c].im - f->f[2][c].re; g->f[1][c].re = f->f[1][c].re - f->f[3][c].im; g->f[1][c].im = f->f[1][c].im + f->f[3][c].re; } if ((m & 0x40) == 0) { k=6; f=&psi[ps[6]]; g=&gg[6]; g->f[0][c].re = f->f[0][c].re + f->f[2][c].re; g->f[0][c].im = f->f[0][c].im + f->f[2][c].im; g->f[1][c].re = f->f[1][c].re + f->f[3][c].re; g->f[1][c].im = f->f[1][c].im + f->f[3][c].im; } if ((m & 0x80) == 0) { k=7; f=&psi[ps[7]]; g=&gg[7]; g->f[0][c].re = f->f[0][c].re - f->f[2][c].re; g->f[0][c].im = f->f[0][c].im - f->f[2][c].im; g->f[1][c].re = f->f[1][c].re - f->f[3][c].re; g->f[1][c].im = f->f[1][c].im - f->f[3][c].im; } } for (d = 0; d < 8; d++) { vHalfFermion * __restrict__ h = &hh[d]; vSU3 *u = &V[d]; g = (m & (1 << d))? &nb->rcv_buf[d][ps[d]]: &gg[d]; for (c = 0; c < 3; c++) { h->f[0][c].re=u->v[c][0].re*g->f[0][0].re-u->v[c][0].im*g->f[0][0].im +u->v[c][1].re*g->f[0][1].re-u->v[c][1].im*g->f[0][1].im +u->v[c][2].re*g->f[0][2].re-u->v[c][2].im*g->f[0][2].im; h->f[0][c].im=u->v[c][0].im*g->f[0][0].re+u->v[c][0].re*g->f[0][0].im +u->v[c][1].im*g->f[0][1].re+u->v[c][1].re*g->f[0][1].im +u->v[c][2].im*g->f[0][2].re+u->v[c][2].re*g->f[0][2].im; h->f[1][c].re=u->v[c][0].re*g->f[1][0].re-u->v[c][0].im*g->f[1][0].im +u->v[c][1].re*g->f[1][1].re-u->v[c][1].im*g->f[1][1].im +u->v[c][2].re*g->f[1][2].re-u->v[c][2].im*g->f[1][2].im; h->f[1][c].im=u->v[c][0].im*g->f[1][0].re+u->v[c][0].re*g->f[1][0].im +u->v[c][1].im*g->f[1][1].re+u->v[c][1].re*g->f[1][1].im +u->v[c][2].im*g->f[1][2].re+u->v[c][2].re*g->f[1][2].im; } } rs = &rx5[s]; for (c = 0; c < 3; c++) { k = 6; qs->f[0][c].re = hh[k].f[0][c].re; qs->f[2][c].re = hh[k].f[0][c].re; qs->f[0][c].im = hh[k].f[0][c].im; qs->f[2][c].im = hh[k].f[0][c].im; qs->f[1][c].re = hh[k].f[1][c].re; qs->f[3][c].re = hh[k].f[1][c].re; qs->f[1][c].im = hh[k].f[1][c].im; qs->f[3][c].im = hh[k].f[1][c].im; k = 7; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].im -= hh[k].f[1][c].im; k = 2; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im += hh[k].f[1][c].im; k = 3; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].re += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].im += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].re -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].im -= hh[k].f[1][c].im; k = 0; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re += hh[k].f[1][c].im; k = 1; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[3][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[3][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[2][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[2][c].re -= hh[k].f[1][c].im; k = 4; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im -= hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re += hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im += hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re -= hh[k].f[1][c].im; k = 5; qs->f[0][c].re += hh[k].f[0][c].re; qs->f[2][c].im += hh[k].f[0][c].re; qs->f[0][c].im += hh[k].f[0][c].im; qs->f[2][c].re -= hh[k].f[0][c].im; qs->f[1][c].re += hh[k].f[1][c].re; qs->f[3][c].im -= hh[k].f[1][c].re; qs->f[1][c].im += hh[k].f[1][c].im; qs->f[3][c].re += hh[k].f[1][c].im; } } vhfzero(&zV); fx = vfx_A; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = 0; s < S_4_1; s++, fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } } rs = &rx5[S_4_1]; QSETUP(S_4_1) vput_3(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[0][c].re; Q2R(0,re) zV.f[0][c].im += fx * qs->f[0][c].im; Q2R(0,im) zV.f[1][c].re += fx * qs->f[1][c].re; Q2R(1,re) zV.f[1][c].im += fx * qs->f[1][c].im; Q2R(1,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[0][c].re; zn.im = qs->f[0][c].im; vput_3(&zn.re, zX[0][c].re); vput_3(&zn.im, zX[0][c].im); rs->f[0][c].re = zn.re; rs->f[0][c].im = zn.im; zn.re = qs->f[1][c].re; zn.im = qs->f[1][c].im; vput_3(&zn.re, zX[1][c].re); vput_3(&zn.im, zX[1][c].im); rs->f[1][c].re = zn.re; rs->f[1][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = S_4; s--;) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs0re = (REAL *)&rs->f[0][c].re; REAL *rs0im = (REAL *)&rs->f[0][c].im; rs0re[3] = inv_a * rs0re[3] + b_over_a * yOut[0][c].re; rs0re[2] = inv_a * rs0re[2] + b_over_a * rs0re[3]; rs0re[1] = inv_a * rs0re[1] + b_over_a * rs0re[2]; yOut[0][c].re = rs0re[0] = inv_a * rs0re[0] + b_over_a * rs0re[1]; rs0im[3] = inv_a * rs0im[3] + b_over_a * yOut[0][c].im; rs0im[2] = inv_a * rs0im[2] + b_over_a * rs0im[3]; rs0im[1] = inv_a * rs0im[1] + b_over_a * rs0im[2]; yOut[0][c].im = rs0im[0] = inv_a * rs0im[0] + b_over_a * rs0im[1]; } { REAL *rs1re = (REAL *)&rs->f[1][c].re; REAL *rs1im = (REAL *)&rs->f[1][c].im; rs1re[3] = inv_a * rs1re[3] + b_over_a * yOut[1][c].re; rs1re[2] = inv_a * rs1re[2] + b_over_a * rs1re[3]; rs1re[1] = inv_a * rs1re[1] + b_over_a * rs1re[2]; yOut[1][c].re = rs1re[0] = inv_a * rs1re[0] + b_over_a * rs1re[1]; rs1im[3] = inv_a * rs1im[3] + b_over_a * yOut[1][c].im; rs1im[2] = inv_a * rs1im[2] + b_over_a * rs1im[3]; rs1im[1] = inv_a * rs1im[1] + b_over_a * rs1im[2]; yOut[1][c].im = rs1im[0] = inv_a * rs1im[0] + b_over_a * rs1im[1]; } } } vhfzero(&zV); fx = vfx_B; #if defined(qs) #define QSETUP(s) #define Q2R(d,pt) #else #define QSETUP(s) qs = &qx5[s]; #define Q2R(d,pt) rs->f[d][c].pt = qs->f[d][c].pt; #endif for (s = S_4; --s; fx = fx * vab) { rs = &rx5[s]; QSETUP(s) for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } } rs = &rx5[0]; QSETUP(0) vput_0(&fx, c0); for (c = 0; c < 3; c++) { zV.f[0][c].re += fx * qs->f[2][c].re; Q2R(2,re) zV.f[0][c].im += fx * qs->f[2][c].im; Q2R(2,im) zV.f[1][c].re += fx * qs->f[3][c].re; Q2R(3,re) zV.f[1][c].im += fx * qs->f[3][c].im; Q2R(3,im) } for (c = 0; c < 3; c++) { zX[0][c].re = vsum(zV.f[0][c].re); zX[0][c].im = vsum(zV.f[0][c].im); zX[1][c].re = vsum(zV.f[1][c].re); zX[1][c].im = vsum(zV.f[1][c].im); zn.re = qs->f[2][c].re; zn.im = qs->f[2][c].im; vput_0(&zn.re, zX[0][c].re); vput_0(&zn.im, zX[0][c].im); rs->f[2][c].re = zn.re; rs->f[2][c].im = zn.im; zn.re = qs->f[3][c].re; zn.im = qs->f[3][c].im; vput_0(&zn.re, zX[1][c].re); vput_0(&zn.im, zX[1][c].im); rs->f[3][c].re = zn.re; rs->f[3][c].im = zn.im; } #undef QSETUP #undef Q2R yOut[0][0].re = yOut[0][0].im = 0; yOut[0][1].re = yOut[0][1].im = 0; yOut[0][2].re = yOut[0][2].im = 0; yOut[1][0].re = yOut[1][0].im = 0; yOut[1][1].re = yOut[1][1].im = 0; yOut[1][2].re = yOut[1][2].im = 0; for (s = 0; s < S_4; s++) { rs = &rx5[s]; for (c = 0; c < 3; c++) { { REAL *rs2re = (REAL *)&rs->f[2][c].re; REAL *rs2im = (REAL *)&rs->f[2][c].im; rs2re[0] = inv_a * rs2re[0] + b_over_a * yOut[0][c].re; rs2re[1] = inv_a * rs2re[1] + b_over_a * rs2re[0]; rs2re[2] = inv_a * rs2re[2] + b_over_a * rs2re[1]; yOut[0][c].re = rs2re[3] = inv_a * rs2re[3] + b_over_a * rs2re[2]; rs2im[0] = inv_a * rs2im[0] + b_over_a * yOut[0][c].im; rs2im[1] = inv_a * rs2im[1] + b_over_a * rs2im[0]; rs2im[2] = inv_a * rs2im[2] + b_over_a * rs2im[1]; yOut[0][c].im = rs2im[3] = inv_a * rs2im[3] + b_over_a * rs2im[2]; } { REAL *rs3re = (REAL *)&rs->f[3][c].re; REAL *rs3im = (REAL *)&rs->f[3][c].im; rs3re[0] = inv_a * rs3re[0] + b_over_a * yOut[1][c].re; rs3re[1] = inv_a * rs3re[1] + b_over_a * rs3re[0]; rs3re[2] = inv_a * rs3re[2] + b_over_a * rs3re[1]; yOut[1][c].re = rs3re[3] = inv_a * rs3re[3] + b_over_a * rs3re[2]; rs3im[0] = inv_a * rs3im[0] + b_over_a * yOut[1][c].im; rs3im[1] = inv_a * rs3im[1] + b_over_a * rs3im[0]; rs3im[2] = inv_a * rs3im[2] + b_over_a * rs3im[1]; yOut[1][c].im = rs3im[3] = inv_a * rs3im[3] + b_over_a * rs3im[2]; } } } for (s = 0; s < S_4; s++) { rs = &rx5[s]; es = &ex5[s]; nv = vmk1(0.0); for (c = 0; c < 3; c++) { vv = es->f[0][c].re - rs->f[0][c].re; rs->f[0][c].re = vv; nv += vv * vv; vv = es->f[0][c].im - rs->f[0][c].im; rs->f[0][c].im = vv; nv += vv * vv; vv = es->f[1][c].re - rs->f[1][c].re; rs->f[1][c].re = vv; nv += vv * vv; vv = es->f[1][c].im - rs->f[1][c].im; rs->f[1][c].im = vv; nv += vv * vv; vv = es->f[2][c].re - rs->f[2][c].re; rs->f[2][c].re = vv; nv += vv * vv; vv = es->f[2][c].im - rs->f[2][c].im; rs->f[2][c].im = vv; nv += vv * vv; vv = es->f[3][c].re - rs->f[3][c].re; rs->f[3][c].re = vv; nv += vv * vv; vv = es->f[3][c].im - rs->f[3][c].im; rs->f[3][c].im = vv; nv += vv * vv; } *norm += vsum(nv); } } QMP_sum_double(norm); #undef qs #undef qx5 } int SSE_DWF_init(const int lattice[DIM+1], SSE_DWF_FP_SIZE fp_size, void *(*allocator)(size_t size), void (*deallocator)(void *)) { if (inited_p) return 1; /* error: second init */ if (fp_size != SSE_DWF_FLOAT) goto error; if (lattice[DIM] % Vs) goto error; tlattice[DIM] = lattice[DIM]; { int i; for (i = 0; i < DIM; i++) { if (lattice[i] & 1) goto error; tlattice[i] = lattice[i]; } } { int i, dn; const QMP_u32_t *xn, *xc; if (!QMP_logical_topology_is_declared()) /* The user must have declared logical topology before */ goto error; dn = QMP_get_logical_number_of_dimensions(); if (dn > DIM) /* Too high dimension of the logical network */ goto error; xn = QMP_get_logical_dimensions(); xc = QMP_get_logical_coordinates(); for (i = 0; i < dn; i++) { network[i] = xn[i]; coord[i] = xc[i]; } for (; i < dn; i++) { network[i] = 1; coord[i] = 0; } } if (allocator) tmalloc = allocator; else tmalloc = malloc; if (deallocator) tfree = deallocator; else tfree = free; if (init_tables()) { /* Something went wrong in the table construction */ goto error; } Phi_o = allocate_odd_fermion(); if (Phi_o == 0) goto error; auxA_o = allocate_odd_fermion(); if (auxA_o == 0) goto error; auxB_o = allocate_odd_fermion(); if (auxB_o == 0) goto error; auxA_e = allocate_even_fermion(); if (auxA_e == 0) goto error; r_o = allocate_odd_fermion(); if (r_o == 0) goto error; p_o = allocate_odd_fermion(); if (p_o == 0) goto error; q_o = allocate_odd_fermion(); if (q_o == 0) goto error; auxB_e = allocate_even_fermion(); if (auxB_e == 0) goto error; if (build_buffers(&even_odd)) goto error; if (build_buffers(&odd_even)) goto error; inited_p = 1; return 0; error: SSE_DWF_fini(); return 1; } void SSE_DWF_fini(void) { free_buffers(&even_odd); free_buffers(&odd_even); if (auxA_e) free16(auxA_e); auxA_e = 0; if (auxB_o) free16(auxB_o); auxB_o = 0; if (auxA_o) free16(auxA_o); auxA_o = 0; if (Phi_o) free16(Phi_o); Phi_o = 0; if (r_o) free16(r_o); r_o = 0; if (p_o) free16(p_o); p_o = 0; if (q_o) free16(q_o); q_o = 0; if (auxB_e) free16(auxB_e); auxB_e = 0; { int i; if (neighbor.site) { tfree(neighbor.site); neighbor.site = 0; } if (neighbor.inside) { tfree(neighbor.inside); neighbor.inside = 0; } if (neighbor.boundary) { tfree(neighbor.boundary); neighbor.boundary = 0; } for (i = 2 * DIM; i--;) { if (neighbor.snd[i] == 0) continue; tfree(neighbor.snd[i]); neighbor.snd[i] = 0; } } inited_p = 0; } SSE_DWF_Fermion * SSE_DWF_allocate_fermion(void) { SSE_DWF_Fermion *ptr; if (!inited_p) return 0; ptr = tmalloc(sizeof (*ptr)); if (ptr == 0) return 0; ptr->even = allocate_even_fermion(); if (ptr->even == 0) goto error1; ptr->odd = allocate_odd_fermion(); if (ptr->odd == 0) goto error2; return ptr; error2: free16(ptr->even); error1: tfree(ptr); return 0; } SSE_DWF_Fermion * SSE_DWF_load_fermion(const void *OuterFermion, void *env, SSE_DWF_fermion_reader reader) { SSE_DWF_Fermion *ptr = SSE_DWF_allocate_fermion(); /* Handle both lack of memory and missing initialization */ if (ptr == 0) return 0; { int x[DIM+1], i; for (i = 0; i < DIM; i++) x[i] = bounds.lo[i]; for (i = 0; i < DIM;) { { int p = parity(x); int p1 = S_4 * to_HFlinear(x, &bounds, -1, 0); /* p is taken care of! */ vFermion *f = p? &ptr->odd[p1].f: &ptr->even[p1].f; for (x[DIM] = 0; x[DIM] < tlattice[DIM]; x[DIM] += Vs, f++) { int d; for (d = 0; d < Fd; d++) { int c; for (c = 0; c < Nc; c++) { f->f[d][c].re = import_vector(OuterFermion, env, reader, x, c, d, 0); f->f[d][c].im = import_vector(OuterFermion, env, reader, x, c, d, 1); } } } } for (i = 0; i < DIM; i++) { if (++x[i] == bounds.hi[i]) x[i] = bounds.lo[i]; else break; } } } return ptr; } void SSE_DWF_save_fermion(void *OuterFermion, void *env, SSE_DWF_fermion_writer writer, SSE_DWF_Fermion *CGfermion) { if (!inited_p) return; { int x[DIM+1], i; for (i = 0; i < DIM; i++) x[i] = bounds.lo[i]; for (i = 0; i < DIM;) { { int p = parity(x); int p1 = S_4 * to_HFlinear(x, &bounds, -1, 0); /* p is taken care of! */ vFermion *f = p? &CGfermion->odd[p1].f: &CGfermion->even[p1].f; for (x[DIM] = 0; x[DIM] < tlattice[DIM]; x[DIM] += Vs, f++) { int d; for (d = 0; d < Fd; d++) { int c; for (c = 0; c < Nc; c++) { save_vector(OuterFermion, env, writer, x, c, d, 0, &f->f[d][c].re); save_vector(OuterFermion, env, writer, x, c, d, 1, &f->f[d][c].im); } } } } for (i = 0; i < DIM; i++) { if (++x[i] == bounds.hi[i]) x[i] = bounds.lo[i]; else break; } } } } void SSE_DWF_delete_fermion(SSE_DWF_Fermion *ptr) { if (!inited_p) return; free16(ptr->even); free16(ptr->odd); tfree(ptr); } SSE_DWF_Gauge * SSE_DWF_load_gauge(const void *OuterGauge_U, const void *OuterGauge_V, void *env, SSE_DWF_gauge_reader reader) { SSE_DWF_Gauge *g; if (!inited_p) return 0; g = allocate_gauge_field(); if (g == 0) return 0; { int x[DIM], i, d, a, b, p1; for (i = 0; i < DIM; i++) x[i] = bounds.lo[i]; for (i = 0; i < DIM;) { p1 = to_Ulinear(x, &bounds, -1); for (d = 0; d < DIM; d++) { for (a = 0; a < Nc; a++) { for (b = 0; b < Nc; b++) { g[p1 + d].v[a][b].re = reader(OuterGauge_U, env, x, d, a, b, 0); g[p1 + d].v[a][b].im = reader(OuterGauge_U, env, x, d, a, b, 1); } } } for (i = 0; i < DIM; i++) { if (++x[i] == bounds.hi[i]) x[i] = bounds.lo[i]; else break; } } for (d = 0; d < DIM; d++) { if (network[d] == 1) continue; for (i = 0; i < DIM; i++) x[i] = bounds.lo[i]; for (i = 0; i < DIM;) { x[d] = bounds.lo[d] - 1; p1 = to_Ulinear(x, &bounds, d); x[d] = bounds.lo[d]; for (a = 0; a < Nc; a++) { for (b = 0; b < Nc; b++) { g[p1].v[a][b].re = reader(OuterGauge_V, env, x, d, a, b, 0); g[p1].v[a][b].im = reader(OuterGauge_V, env, x, d, a, b, 1); } } for (i = 0; i < DIM; i++) { if (i == d) continue; if (++x[i] == bounds.hi[i]) x[i] = bounds.lo[i]; else break; } } } } return g; } void SSE_DWF_delete_gauge(SSE_DWF_Gauge *ptr) { if (!inited_p) return; free16(ptr); } int SSE_DWF_cg_solver(SSE_DWF_Fermion *psi, /* result */ double *out_eps, int *out_iter, const SSE_DWF_Gauge *gauge, double M, double m0, const SSE_DWF_Fermion *x0, /* guess */ const SSE_DWF_Fermion *eta, /* rhs */ double eps, int max_iter) { int status; if (!inited_p) return 1; U = (SU3 *)gauge; { double a = M; double b = 2.; double c = -2*m0; inv_a = 1.0 / a; b_over_a = -b * inv_a; c0 = 1./(1.-c/b*pow(b/a, S_4*4.)); vab = vmk1(pow(b/a, 4.)); vfx_A = vmk4(-c0*c/a, c0*c*b/(a*a), -c0*c*b*b/(a*a*a), c0*c*b*b*b/(a*a*a*a)); vfx_B = vmk4(c0*c*b*b*b/(a*a*a*a), -c0*c*b*b/(a*a*a), c0*c*b/(a*a), -c0*c/a); } compute_Qee1(auxA_e, eta->even); compute_Qoe(auxB_o, auxA_e); compute_sum_o(auxA_o, eta->odd, -1, auxB_o); compute_Qoo1(auxB_o, auxA_o); compute_Mx(Phi_o, auxB_o); status = cg(psi->odd, Phi_o, x0->odd, eps, max_iter, out_eps, out_iter); compute_Qeo(auxA_e, psi->odd); compute_sum_e(auxB_e, eta->even, -1, auxA_e); compute_Qee1(psi->even, auxB_e); return status; }