// Assembly optimization of the following C code for the PPlain or PMMX processors using double floating-point values

// fftw_real *halfcomplex_mul(fftw_real *frblk, fftw_real *input, fftw_real *result, unsigned n) {
//   unsigned i;
//   const unsigned halfn = n / 2;
//
//   result[0] = input[0] * frblk[0]; /* first point is real */
//   for (i = 1; i < halfn; i++) { /* middle points have real and complex parts */
//     result[i] = input[i] * frblk[i] - input[n - i] * frblk[n - i];     /*re*/
//     result[n - i] = input[i] * frblk[n - i] + frblk[i] * input[n - i]; /*im*/
//   }
//   if (n % 2 == 0) /* if even number of points, last is real */
//     result[n / 2] = input[n / 2] * frblk[n / 2];
//
//   return result;
// }

// EAX = n / 2 and count down, ECX = -n / 2 and count up
// EBX = middle of frblk, ESI = middle of input, EDI = middle of result

#ifndef ASM_HALFCMPMUL
#define ASM_HALFCMPMUL

#include <rfftw.h>
#define DSIZE 8								// data size

fftw_real *halfcomplex_mul(fftw_real *frblk, fftw_real *input, fftw_real *result, unsigned n) {
	__asm {

    		; Setup
        MOV     EAX, n						                	; number of elements (u)
	    	MOV	  	EBX, frblk				          	    	; pointer to frblk (v)
    		SAR	  	EAX, 1							                ; EAX = n / 2 (u)
        MOV     ESI, input					               	; pointer to input (v)
        MOV     EDI, result					               	; pointer to result (u)
        XOR     ECX, ECX						                ; ECX = 0 (v)

		    LEA	  	EBX, [EBX+DSIZE*EAX]	          		; EBX = point to middle of frblk (u)
        LEA     ESI, [ESI+DSIZE*EAX]	           		; ESI = point to middle of input (v)
        SUB     ECX, EAX					                 	; ECX = -n / 2 (u)
        LEA     EDI, [EDI+DSIZE*EAX]	           		; EDI = point to middle of result (v)

    		; Calculate the first value (real)
    		FLD     QWORD PTR [ESI+DSIZE*ECX]				  	; => [ input[0] ] (1)
        FMUL    QWORD PTR [EBX+DSIZE*ECX]				  	; ** [ frblk[0] * input[0] ] (3)

		    INC     ECX								                	; i = 1 (1)
        JZ      MIDDONE					      	          	; test for N = 0 (1)

    		; Calculate the middle values (complex)
MIDLOOP:
		    DEC		  EAX							                  	; i - n (1)

		    FLD	  	QWORD PTR [ESI+DSIZE*ECX]		  			; => [ input[i] | result[i - 1] ] (1)
    		FMUL	  QWORD PTR [EBX+DSIZE*EAX]			  		; ** [ frblk[n - i] * input[i] | result[i - 1] ] (3)

        FXCH							                      		; get old result [ result[i - 1] | frblk[n - i] * input[i] ] (1)
		    FSTP  	QWORD PTR [EDI+DSIZE*ECX-DSIZE]			; x> result[i - 1] = result[i - 1] [ frblk[n - i] * input[i] ] (2)

		    FLD	  	QWORD PTR [ESI+DSIZE*EAX]				   	; => [ input[n - i] | frblk[n - i] * input[i] ] (1)
		    FMUL	  QWORD PTR [EBX+DSIZE*ECX]					  ; ** [ frblk[i] * input[n - i] | frblk[n - i] * input[i] ] (3)

		    FLD     QWORD PTR [ESI+DSIZE*ECX]	  				; => [ input[i] | frblk[i] * input[n - i] | frblk[n - i] * input[i] ] (1)
        FMUL    QWORD PTR [EBX+DSIZE*ECX]		  			; ** [ frblk[i] * input[i] | frblk[i] * input[n - i] | frblk[n - i] * input[i] ] (3)

		    FXCH	  ST(2)						        	          ; get prev term [ frblk[n - i] * input[i] | frblk[i] * input[n - i] | frblk[i] * input[i] ] (1)
    		FADDP 	ST(1), ST(0)					              ; x> [ frblk[n - i] * input[i] + frblk[i] * input[n - i] | frblk[i] * input[i] ] (3)

		    FLD		  QWORD PTR [ESI+DSIZE*EAX]		   			; => [ input[n - i] | frblk[n - i] * input[i] + frblk[i] * input[n - i] | frblk[i] * input[i] ] (1)
    		FMUL	  QWORD PTR [EBX+DSIZE*EAX]		  			; ** [ frblk[n - i] * input[n - i] | frblk[n - i] * input[i] + frblk[i] * input[n - i] | frblk[i] * input[i] ] (3)

		    FXCH	  ST(1)						        	          ; get prev term [ frblk[n - i] * input[i] + frblk[i] * input[n - i] | frblk[n - i] * input[n - i] | frblk[i] * input[i] ] (1)
		    FSTP  	QWORD PTR [EDI+DSIZE*EAX]	   				; x> result[n - i] = sum... [ frblk[n - i] * input[n - i] | frblk[i] * input[i] ] (2)

		    FSUBP	  ST(1), ST(0)		    			          ; x> [ frblk[i] * input[i] - frblk[n - i] * input[n - i] ] (3)

        INC     ECX							                   	; increment index (1)
        JNZ     MIDLOOP					      	          	; loop (2)

MIDDONE:
    		; Calculate last value if real
		    MOV		  EAX, n					      	          	; Retrieve value of n (1)
    		FSTP  	QWORD PTR [EDI+DSIZE*ECX-DSIZE]			; Store last value (2)

		    AND	  	EAX, 1				      		          	; Check if odd (1)
    		JNZ	  	DONE					        	          	; (1)

		    FLD     QWORD PTR [ESI]					         		; => [ input[n/2] ] (1) (1)
        FMUL    QWORD PTR [EBX]				        			; ** [ frblk[n/2] * input[n/2] ] (3) (3)

    		FSTP  	QWORD PTR [EDI]					        		; result[n/2] = frblk[n/2] * input[n/2] (2)

DONE:
		    MOV		  EAX, result
	}

	/* Return result in EAX */
}

#endif