
; Very fast, Real-Valued Input FFT Routine for the DSP5600x Family
:-----Copyright Motorola 1995-------
;
; This program originally available on the Motorola DSP bulletin board.
; It is provided under a DISCLAMER OF WARRANTY available from
; Motorola DSP Operation, 6501 Wm. Cannon Drive W., Austin, Tx., 78735.
;
;---------------------------------------------------------------------------------------; 
;  512-Point Real Input FFT Test Program  [12428(CFFT256)+3124(SPLIT56)=15552 cycles]   | 
;  @40Mhz, clock cycle time=25ns, 15540*25=0.3888(ms), Sampling rate ~=130Khz           |
;                                                                                       |
;---------------------------------------------------------------------------------------; 
; Routine called |Pmemory|   X memory   |    Y memory   |             Comments          |
;---------------------------------------------------------------------------------------; 
; 1. gen56r*     |   0   |256(real data)|256(imag data) |Create input for test          | 
;---------------------------------------------------------------------------------------; 
; 2. sincosr*    |   0   |   256 (Wr)   |  256 (Wi)     |64 for CFFT256, 128 for SPLIT56|
;---------------------------------------------------------------------------------------; 
; 3. cfft256     |  225  |256 for ODATA |256 for ODATA  |256-point Complex FFT          |
;---------------------------------------------------------------------------------------; 
; 4. split56     |   39  |       0      |       0       |Create 512/2 points real FFT   |
;---------------------------------------------------------------------------------------; 
; Total Words    |  264  |  512(256)    |   512(256)    | 512 external and 256 internal |
;---------------------------------------------------------------------------------------; 
; Note: 1).Only first 256 complex points are calculated (DC to Nyquist) for real FFT.
;       2).Store EVEN index input data to X memory and ODD index input data to Y.
;       3).Assume scaling down at input, i.e. all input data are divided by 256 before FFT.
;          The outputs of this real input FFT are twice larger than true values. 
;          If the original FFT values are desired, scaling up factor should be 128. 
;       4).IDATA points to input data, after running the program it contains correct output.
;          i.e. the output of the FFT replace the inputs started at IDATA.
;          ODATA is only temp external location pointer for normal order output.
;       5).* indicates the generation and reordering of twiddle factors can be done off-line. 
;       6).X:IDATA contains DC*2 and Y:IDATA contains Niquist*2.
;       7).Wr and Wi are real and imaginary twiddle factors. SPLIT56 requires a twice higher 
;          resolution twiddle factors than CFFT256,i.e. N=128 for SPLIT56, N=64 for CFFT256. 
; 
;
RFFT56T ident   1,0
	page    132,60
	opt     nomd,nomex,loc,nocex,mu

;
; Sine-Cosine Table Generator for rfft56.asm 
; 
; Last Update 11/11/92
;
sincosr  macro   points,coef
sincosr  ident   1,2
;
;       sincosr  -      macro to generate sine and cosine coefficient
;                       lookup tables for Decimation in Time real FFT
;                       twiddle factors. Only points/4 coefficients
;                       are generated. For real FFT another points/4
;                       coefficients with higher freq. are created.
;
;       points  -       number of points (2 - 32768, power of 2)
;       coef    -       base address of sine/cosine table
;                       positive cosine value in X memory
;                       positive sine value in Y memory
;
;    8/12/92
;    3/8/94 cfft256 and split56 share 128 complex +cos and +sin
;           also see sincosr.bak for orginal code

pi      equ     3.141592654
freq    equ     2.0*pi/@cvf(points*2)

	org     x:coef
count   set     0
	dup     points
	dc      @cos(@cvf(count)*freq)
count   set     count+1
	endm

	org     y:coef
count   set     0
	dup     points
	dc      @sin(@cvf(count)*freq)
count   set     count+1
	endm

	endm    ;end of sincosr  macro


;
; Input signal for FFT rfft56.asm 
; 
; Last Update 1/18/94
;
gen56r    macro   POINTS,IDATA
;
;       gen56r    -     macro to generate input test signal for real input FFT,
;                       2000 Hz sinewave with scaling factor POINTS, EVEN index in X 
;                       ODD index in Y memory
;
;       POINTS  -       number of points of real FFT (2 - 32768, power of 2)
;       IDATA    -      base address of signal
;
srate   set     44100   ;Hz
ffreq   set     2000    ;Hz
ppi     equ     3.141592654
freq2   equ     (2.0*ppi*ffreq/@cvf(srate))

	org     x:0
;	org     x:IDATA
count   set     0                       ; Even index in X memory
	dup     POINTS/2
	dc      @sin(@cvf(count)*freq2)/POINTS

count   set     count+2
	endm

	org     y:0
;	org     y:IDATA
count   set     1                       ; Odd index in Y memory
	dup     POINTS/2
	dc      @sin(@cvf(count)*freq2)/POINTS

count   set     count+2
	endm

	endm    ;end of gen56r  macro


;----------------------------------------------------------------------------;
;                           256-Point Complex FFT                            ;
; 12418 clock cycles (does not include bit reversal) In-Place FFT.           ; 
;----------------------------------------------------------------------------;
;   Memory Requirement |     Program     |  Twiddle Factor |      Data       | 
;----------------------------------------------------------------------------;
;        P             |     215         |                 |                 ;
;----------------------------------------------------------------------------;
;        X             |                 |      64         |       256       |
;----------------------------------------------------------------------------;
;        Y             |                 |      64         |       256       |
;----------------------------------------------------------------------------;
;
; See cfft256t.asm how to use this program for complex FFT and rfft56t.asm 
;  for real input FFT
;
;----------------------------------------------------------------------------;
; 
; Sept. 11 92   Version 1.0
; Updated 1/21/94 for more comments
; Updated 3/8/94  1) for using linear twiddle factor table instead of bit-reversed
;                 as cfft256a.asm. By this way, a 256-point, one quadure complex 
;                 on-chip twiddle factor table can be used for upto 1024-point
;                 complex FFT.
;                 2) Using +sine for Wi instead of -sine in cfft256a.asm
; NOTE: This program assume there is a 256-point complex sin/cos table available
;       on chip (56007). If you provide an external sin/cos table, then generate
;       sin/cos table in following rule: The size of the table should be POINTS/4,
;       the resolution of sin/cos is POINTS, see SINCOSR.ASM.
CFFT256  macro   IDATA,COEF,POINTS,ODATA
CFFT256  ident   1,0
;
; 256 Point Complex Fast Fourier Transform Routine
; using the Radix 2, Decimation in Time, Cooley-Tukey FFT algorithm.
;
; This routine performs a 256 point complex FFT by taking advantages of 
; 1). internal memory access by starting first half data at location 0,
;     avoid cycle stretching;
; 2). using N/4 complex twiddle factors based on the fact that two consective
;     twiddle factors in DIT FFT has a difference -j;
; 3). trivial twiddle factors (1,0) and (0,-1) are utilized.
; 
;
;    Complex input and output data
;        Real data in X memory
;        Imaginary data in Y memory
;    Normally ordered input data
;    Bit reversed output data, (good for real input FFT)
;       Coefficient lookup table
;        +Cosine values in X memory
;        +Sine values in Y memory
;
;
; Address pointers are organized as follows:
;
;  r0 = Ar,Ai input pointer     n0 = group offset       m0 = modulo (points)
;  r1 = Br,Bi input pointer     n1 = group offset       m1 = modulo (points)
;  r2 = # of groups in a pass   n2 = bflies per group   m2 = always linear 
;  r3 = ODATA, external temp    n3 = groups in a pass   m3 = number of passes
;  r4 = Ar',Ai' output pointer  n4 = group offset       m4 = modulo (points)
;  r5 = Br',Bi' output pointer  n5 = group offset       m5 = modulo (points)
;  r6 = Wr,Wi input pointer     n6 = offset of TF table m6 = linear 
;  r7 = not used (*)            n7 = not used (*)       m7 = not used (*)
;
;       * - r7, n7 and m7 are typically reserved for a user stack pointer.
;
; Alters Data ALU Registers
;       x1      x0      y1      y0
;       a2      a1      a0      a
;       b2      b1      b0      b
;
; Alters Address Registers
;       r0      n0      m0
;       r1      n1      m1
;       r2      n2      m2
;       r3      n3      m3
;       r4      n4      m4
;       r5      n5      m5
;       r6      n6      m6
;
; Alters Program Control Registers
;       pc      sr
;
; Uses 8 levels on System Stack                                           
;
;----------------------------------------------------------------------------;
; For 256-point complex input FFT, there are N=log2(256)=8 passes (index 0 to;
; 7). Each pass has (2^i) group(s), where i=0,1,..,7. Group 1 in even index  ;
; pass and group 1 and 2 in odd index pass contain trival twiddle factors and;
; can be combined together as radix 4 butterflies. Group 2 in even index pass;
; also has trival twiddle factors, but can only be calculated as radix 2     ;
; butterflies.  _END_TRIVAL loop and _EXTRA loop in following calculate all  ;
; trival butterflies in 256-point FFT except the last pass, pass 7, which is ;
; processed in _LAST loop. 
; 
; Initialize pointers to r0->Ar,r1->Cr,r4->Bi,r5->Di, and r3->temp location  ;
; r0,r1,r4, and r5 are modular addressing with modulo N/2                    ;
;----------------------------------------------------------------------------;
OFFSET  equ      64             ;offset for 256 complex on-chip ROM table
	move    y:IDATA,r0      ;r0 -> Ar
	move    r0,n3
	move    #-1,m2          ;m2 always linear mode
	move    y:ODATA,r3      ;r3 always has ODATA
	move    #OFFSET,n6      ;n6 offset for addressing on-chip sin/cos table
	move    #0,m6           ;bit reversed address on r6
	move    #POINTS/4,n0    ;offset and butterflies per group
	move    #POINTS/2-1,m0  ;modulo addressing

	do      #3,_end_trivial ;do three Radix 4 passes
	move    n0,n1           ;pointer offset
	move    n0,n4           ;pointer offset
	move    n0,n5           ;
	lea     (r0)+n0,r4      ;r4 -> Bi
	move    m0,m5
	lea     (r4)+n4,r1      ;r1 -> Cr
	move    m0,m1           ;
	move    m0,m4
	lea     (r1)+n1,r5      ;r5 -> Di
;----------------------------------------------------------------------------------------;
;            First two passes are combined into a R4 pass without multiplication         ;
;             because Wr=1,Wi=0 in first R2 pass and Wr=0, Wi=-1 in 2nd R2 pass          ;
;                                                                                        ;
; Ar'=Ar+Cr+(Br+Dr)     Br'=Ar+Cr-(Br+Dr)     Cr'=(Ar-Cr)+(Bi-Di)    Dr'=(Ar-Cr)-(Bi-Di) ;     
; Ai'=Ai+Ci+(Bi+Di)     Bi'=Ai+Ci-(Bi+Di)     Ci'=(Ai-Ci)+(Dr-Br)    Di'=(Ai-Ci)-(Dr-Br) ;     
;                                                                                        ;
; For 256 or less point complex FFT, 17 Icyc are required for R4bfly, 4.25 Icyc/R2bfly.  ;
;                                                                                        ;
;----------------------------------------------------------------------------------------;
	move          x:(r0)+n0,a                ; a= Ar r0 -> Br
	move          x:(r1)+n1,b                ; b= Cr,r1 -> Dr

	do      n0,_twopass
	add     a,b   x:(r0)+n0,x1 y:(r5)+n5,y1  ;b=Ar+Cr,x1=Br,y1=Di,r0->Ar,r5->Ci
	subl    b,a   b,x:(r0)     y:(r4),b      ;a=Ar-Cr,save Ar+Cr temp in Ar,b=Bi
	add     y1,b  a,x0         y:(r4)+n4,a   ;b=Bi+Di,x0=Ar-Cr,a=Bi,save Ar-Cr in Dr,r4->Ai
	sub     y1,a  b,x:(r3)     x0,b          ;a=Bi-Di,store Bi+Di temp in x:ODATA,b=Ar-Cr
	sub     a,b   x:(r1),x0                  ;b=Ar-Cr-(Bi-Di)=Dr',x0=Dr,r0 -> Ar
	addl    b,a   b,x:(r1)+n1  x0,b          ;a=Ar-Cr+(Bi-Di)=Cr',save Dr',b=Dr,r1->Cr
	sub     x1,b  a,x:(r1)+    x0,a          ;b=Dr-Br,            save Cr',a=Dr,r1->nCr
	add     x1,a  x:(r0)+n0,b  b,y1          ;a=Dr+Br,b=Ar+Cr, y1=Dr-Br,r0->Br
	sub     a,b                y:(r5),y0     ;a=Ar+Cr-(Dr+Br)=Br',y0=Ci
	addl    b,a   b,x:(r0)+n0  y:(r4),b      ;a=Ar+Cr+(Dr+Br)=Ar',save Br',r0->Ar,b=Ai
	sub     y0,b  a,x:(r0)+    y:(r4),a      ;b=Ai-Ci,a=Ai again, save Ar',r0->nAr
	add     y0,a  x:(r3),b     b,y0          ;a=Ai+Ci,y0=Ai-Ci,b=Bi+Di, r5->Ci
	add     a,b                              ;b=Ai+Ci+(Bi+Di)=Ai'
	subl    b,a   y0,b         b,y:(r4)+n4   ;a=Ai+Ci-(Bi+Di)=Bi',b=Ai-Ci,save Ai'
	add     y1,b  y0,a         a,y:(r4)+     ;b=Ai-Ci+(Dr-Br)=Ci',a=Ai-Ci,save Bi',r4->nBi
	sub     y1,a  x:(r1)+n1,b  b,y:(r5)+n5   ;a=Ai-Ci-(Dr-Br)=Di',b=nCr,  save Ci',
	move          x:(r0)+n0,a  a,y:(r5)+     ;                    a=nAr,  save Di',r5->nDi
_twopass
;----------------------------------------------------------------------------------------;
; Do trivial butterflies in even index passes by 5 Icyc Radix 2 butterfly
;----------------------------------------------------------------------------------------;
	move    n5,a                             ;n5 contains ptr to Ar already
	asr     a      n5,r1                     ;r1->Ar
	move    a,n1                             ;get offset 
	move    r1,r5                            ;r5->Ai
	lea     (r1)+n1,r4                       ;r4->Bi
	move    #2,n4                            ;for pointer
	move    r4,r0                            ;r0->Br
	move         x:(r1),a  y:(r4)+,b         ;a=Ar,b=Bi,r4->nBi 
	do      n1,_no_more                      ;w=(0,-1), R2 butterfly 
	add     a,b  x:(r0),x0 y:(r4)-,y0        ;b=Ar+Bi=Ar',x0=Br,y0=nBi
	subl    b,a  b,x:(r1)+ y:(r5),b          ;a=Ar-Bi=Br',save Ar',b=Ai
	add     x0,b a,x:(r0)+ y:(r5),a          ;b=Ai+Br=Bi',save Br',a=Ai again
	subl    b,a  y0,b      b,y:(r4)+n4       ;a=Ai-Br=Ai',save Bi',b=nBi
	move         x:(r1),a  a,y:(r5)+         ;a=nAr,save Ai'  
_no_more
	move    n0,a
	asr     a     n3,r0                      ;r0->IDATA  
	asr     a     a,r2                       ;r2 has number of groups in a new pass
	move    a,n0                             ;(points in a group)/4 after a radix 4 pass
	move    x:(r2)-,b                        ;dec r2
	move    r2,m0
_end_trivial
	move    r0,r4                            ;output pointer
	move    n1,r1                            ;r1->Br
	move    r1,r5                            ;r5->Bi
	move         x:(r0),a                    ;a=Ar
	move         x:(r1),b                    ;b=Br
	do      n1,_extra                        ;w=(1,0) in pass 6 first group
	add     a,b            y:(r5),y0         ;b=Ar+Br=Ar', y0=Bi 
	subl    b,a  b,x:(r0)+ y:(r4),b          ;a=Ar-Br=Br',save Ar',b=Ai
	add     y0,b a,x:(r1)+ y:(r4),a          ;b=Ai+Bi=Ai',save Br',a=Ai
	sub     y0,a x:(r1),b  b,y:(r4)+         ;a=Ai-Bi=Bi',save Ai',b=nBr
	move         x:(r0),a  a,y:(r5)+         ;a=nAr,      save Bi' 
_extra
	
;----------------------------------------------------------------------------------------;
;                                                                                        ;
; In each pass, first two groups are trivial twiddle factors and have been calculated    ;
; in above _END_TRIVAL section. Remaining groups use complex twiddle factors.            ;
; Since two consecutive DIT butterfly groups use twiddle factors different by -j, to save;
; twiddle factor memory, two groups are calculated in one loop. _END_BFLY1 uses (Wr,Wi)  ;
; and _END_BFLY2 use -j*(Wr,Wi).                                                         ;
; Only four inner passes (index 2,3,4, and 5) are processed in this section. Pass 6 and 7;
; are calculated in _NEXT_LAST_PASS and _LAST, respectively.                             ;
;                                                                                        ;
; Radix 2, Decimation In Time Cooley-Tukey FFT algorithm                                 ;
;                  ___________                                                           ;
;                 |           |      Ar' = Ar + Wr*Br - Wi*Bi                            ;
;      Ar,Ai ---->|  Radix 2  |----> Ai' = Ai + Wi*Br + Wr*Bi                            ;
;      Br,Bi ---->| Butterfly |----> Br' = Ar - Wr*Br + Wi*Bi = 2*Ar - Ar'               ;
;                 |___________|      Bi' = Ai - Wi*Br - Wr*Bi = 2*Ai - Ai'               ;
;                       ^                                                                ;
;                       |                                                                ;
;                   W= Wr-jWi                                                            ;
;                                                                                        ;
; r0->A,r1->B,r4->A',r5->B',r6->TF,n0=offset for B pointer,n2=numberof bflies in a group ;
; n3=number of groups in a pass, m3=number of pass. r2=n3 or n3+1                        ;
;----------------------------------------------------------------------------------------;
	move    m2,m0                     ;linear address 
	move    m2,m1
	move    m2,m4
	move    m2,m5
	move    #$80,r0                   ;start location of a pass, chged from POINTS/4
	move    #4,m3                     ;4 passes in first 256-point
	move    #POINTS/8,n0              ;offset to point to Br and Bi,chged fr.POINTS/16
	move    n0,n1
	move    n0,n4
	move    n0,n5           
	move    #COEF+OFFSET,r6           ;r6->COEF+1, OFFSET pt to bit reversed table
	move    n0,n2                     ;number of bflies in the first pass=R2 bfies/4
	lea     (r0)+n0,r1                ;r1->Br
	move    r0,r4                     ;r4->Ai
	lea     (r1)-,r5                  ;r5->Bi-1 for pointer reason

	move    #1,n3                     ;number of groups in a pass
	move    n3,r2                     ;copy of n3
	move    #2,r2
_set_grp
	do      m3,_inner_loop
	jsr     <_inner_pass
	move    n0,a
	asr     a      y:IDATA,r0          ;r0=IDATA   
	move    a,n0                      ;n0=offset of B
	move    r2,a
	asl     a      n0,n1 
	move    a,r2                      ;r2=r2*2
	asr     b      r2,n3              ;n3=number of groups in a new pass 
	lea     (r2)-,n3                  ;n3=number of groups -1 in a new pass 
	move    n4,a
	asl     a      #COEF+OFFSET,r6    ;for 1st 256, TF always starts at first location
	move    a,r0                      ;r0=start location of first 256-point
_inner_set
	move    n0,n4  
	move    n0,n5
	lea     (r0)+n0,r1                ;r1->B
	move    r0,r4                     ;r4->A'
	lea     (r1)-,r5                  ;r5->B'
_inner_loop

;----------------------------------------------------------------------------------------;
;  End inner loop
;----------------------------------------------------------------------------------------;
	move    #8,r0                     ;r0 point to Ar in _NEXT_LAST_PASS
	move    #31,n2                    ;n2=number of groups in _NEXT_LAST_PASS 
	move    #10,r1                    ;r1 -> Br
	move    r0,r4                     ;r4->Ai
_no_set 
	move    #7,r5                    ;r5->Bi, now pre-set to Bi-n5 for pointer reason
	move    #3,n0
	move    n0,n1
	move    n0,n4
	move    n0,n5
	jsr     <_next_last               ;do the pass next to last
;----------------------------------------------------------------------------------------;
;  End _next_last pass
;----------------------------------------------------------------------------------------;
	move    y:IDATA,r0                 ;r0->IDATA 
	move    r3,r4                     ;r4->A',output ptr -> external memory ODATA
_add_offset
	lea     (r0)+,r1                  ;r1->B
	lea     (r4)-,r5                  ;r5->B'
	move    #64,n2                    ;number of bflies in the last pass
	move    #2,n0
	move    n0,n1
	move    n0,n4
	move    n0,n5
	move    #COEF,r6                  ;r6=COEF,not COEF+OFFSET, chged from n6
	jsr     <_last
;----------------------------------------------------------------------------------------;
;  End _last pass
;----------------------------------------------------------------------------------------;
	jmp     <_end_FFT


;----------------------------------------------------------------------------------------;
; All subroutines 
;----------------------------------------------------------------------------------------;
_inner_pass
	do      n3,_end_grp                     ;do groups in a pass
	move x:(r5),a                           ;for pointer reason,a=something  
	move x:(r6),x0 y:(r0),b                 ;x0=Wr,b=Ai
	move x:(r1),x1 y:(r6)+n6,y0             ;x1=Br,y0=Wi 
	do   n0,_end_bfy1        ;Radix 2 DIT butterfly kernel with y0=Wi,x0=Wr
	mac  -x1,y0,b                y:(r1)+,y1 ;b=Ai-BrWi,y1=Bi, r1->nBi
	macr x0,y1,b   a,x:(r5)+     y:(r0),a   ;b=Ai-BrWi+BiWr=Ai',save prev. Br',a=Ai
	subl b,a       x:(r0),b      b,y:(r4)   ;a=2Ai-Ai'=Bi',b=Ar,save Ai'
	mac  x1,x0,b   x:(r0)+,a     a,y:(r5)   ;b=Ar+BrWr,a=Ar,save Bi',r0->nAi
	macr y1,y0,b   x:(r1),x1                ;b=Ar+BrWr+BiWi=Ar',x1=nBr
	subl b,a       b,x:(r4)+     y:(r0),b   ;a=2Ar-Ar'=Br',save Ar',b=nAi,r4->nAr
_end_bfy1
	move         a,x:(r5)+n5   y:(r1)+n1,b  ;save preve. Br' inc r5 and r1
	move         x:(r4)+n4,a   y:(r0)+n0,b  ;inc r0,r4  
	move         x:(r1),x1                  ;x1=nGBr
	move         x:(r5),a      y:(r0),b     ;for pointer reason,a=something,b=nGAr 

	do   n0,_end_bfy2       ;W=-j*W
	mac  -x1,x0,b                y:(r1)+,y1 ;b=Ai-BrWr,y1=Bi,r1->nBi
	macr -y0,y1,b  a,x:(r5)+     y:(r0),a   ;b=Ai-BrWr-BiWi=Ai',save prev. Br',a=Ai
	subl b,a       x:(r0),b      b,y:(r4)   ;a=2Ai-Ai'=Bi',b=Ar,save Ai'
	mac  -x1,y0,b  x:(r0)+,a     a,y:(r5)   ;b=Ar-BrWi,a=Ar,save Bi',r0->nAi
	macr y1,x0,b   x:(r1),x1                ;b=Ar-BrWi+BiWr=Ar',x1=nBr
	subl b,a       b,x:(r4)+     y:(r0),b   ;a=2Ar-Ar'=Br',save Ar',b=nAi,r4->nAr
_end_bfy2
	move           a,x:(r5)+n5   y:(r1)+n1,b    ;save preve. Br' inc r5 and r1
	move           x:(r4)+n4,a   y:(r0)+n0,b    ;inc r0,r4  
_end_grp
	rts

_next_last
	move    x:(r5),a  y:(r0),b               ;a=something,b=Ai
	move    x:(r1),x1 y:(r6),y0              ;x1=Br,y0=Wi 
	do   n2,_n_last        ;do the pass next to last, internal to internal
	mac  -x1,y0,b  x:(r6)+n6,x0 y:(r1)+,y1   ;b=Ai-BrWi,x0=Wr,y1=Bi, r1->nBi
	macr x0,y1,b   a,x:(r5)+n5  y:(r0),a     ;b=Ai-BrWi+BiWr=Ai',save prev. Br',a=Ai
	subl b,a       x:(r0),b     b,y:(r4)     ;a=2Ai-Ai'=Bi',b=Ar,save Ai'
	mac  x1,x0,b   x:(r0)+,a    a,y:(r5)     ;b=Ar+BrWr,a=Ar,save Bi',r0->nAi
	macr y1,y0,b   x:(r1),x1                 ;b=Ar+BrWr+BiWi=Ar',x1=nBr
	subl b,a       b,x:(r4)+    y:(r0),b     ;a=2Ar-Ar'=Br',save Ar',b=nAi,r4->nAr

	mac  -x1,y0,b               y:(r1)+n1,y1 ;b=Ai-BrWi,y1=Bi, r1->nGBi
	macr x0,y1,b   a,x:(r5)+    y:(r0),a     ;b=Ai-BrWi+BiWr=Ai',save prev. Br',a=Ai
	subl b,a       x:(r0),b     b,y:(r4)     ;a=2Ai-Ai'=Bi',b=Ar,save Ai'
	mac  x1,x0,b   x:(r0)+n0,a  a,y:(r5)     ;b=Ar+BrWr,a=Ar,save Bi',r0->nGAi
	macr y1,y0,b   x:(r1),x1                    ;b=Ar+BrWr+BiWi=Ar',x1=nGBr
	subl b,a       b,x:(r4)+n4  y:(r0),b     ;a=2Ar-Ar'=Br',save Ar',b=nGAi,r4->nGAr

	mac  -x1,x0,b               y:(r1)+,y1   ;b=Ai-BrWr,y1=Bi,r1->nGBi
	macr -y0,y1,b  a,x:(r5)+n5  y:(r0),a     ;b=Ai-BrWr-BiWi=Ai',save prev.Br',a=Ai,r5->nGBi 
	subl b,a       x:(r0),b     b,y:(r4)     ;a=2Ai-Ai'=Bi',b=Ar,save Ai'
	mac  -x1,y0,b  x:(r0)+,a    a,y:(r5)     ;b=Ar-BrWi,a=Ar,save Bi',r0->nGAi
	macr y1,x0,b   x:(r1),x1                 ;b=Ar-BrWi+BiWr=Ar',x1=nBr
	subl b,a       b,x:(r4)+    y:(r0),b     ;a=2Ar-Ar'=Br',save Ar',b=nAi,r4->nGAr

	mac  -x1,x0,b               y:(r1)+n1,y1 ;b=Ai-BrWr,y1=Bi,r1->nGBi
	macr -y0,y1,b  a,x:(r5)+    y:(r0),a     ;b=Ai-BrWi-BiWr=Ai',save prev. Br',a=Ai,r5->Bi 
	subl b,a       x:(r0),b     b,y:(r4)     ;a=2Ai-Ai'=Bi',b=Ar,save Ai'
	mac  -x1,y0,b  x:(r0)+n0,a  a,y:(r5)     ;b=Ar-BrWi,a=Ar,save Bi',r0->nGAi
	macr y1,x0,b   x:(r1),x1    y:(r6),y0    ;b=Ar-BrWi+BiWr=Ar',x1=nBr,y0=nWi
	subl b,a       b,x:(r4)+n4  y:(r0),b     ;a=2Ar-Ar'=Br',save Ar',b=nAi,r4->nGAr

_n_last
	move           a,x:(r5)
	rts

_last
	move           x:(r5),a     y:(r0),b     ;a=something,b=Ai
	move           x:(r1),x1    y:(r6),y0    ;x1=Br,y0=Wi 
	do   n2,_end_last                        ; do last pass, internal to external
	mac  -x1,y0,b  x:(r6)+n6,x0   y:(r1)+n1,y1 ;b=Ai-BrWi,x0=Wr,y1=Bi, r1->nGBi
	macr x0,y1,b   a,x:(r5)+n5  y:(r0),a     ;b=Ai-BrWi+BiWr=Ai',save prev. Br',a=Ai
	subl b,a       x:(r0),b     b,y:(r4)     ;a=2Ai-Ai'=Bi',b=Ar,save Ai'
	mac  x1,x0,b   x:(r0)+n0,a  a,y:(r5)     ;b=Ar+BrWr,a=Ar,save Bi',r0->nGAi
	macr y1,y0,b   x:(r1),x1                 ;b=Ar+BrWr+BiWi=Ar',x1=nGBr
	subl b,a       b,x:(r4)+n4  y:(r0),b     ;a=2Ar-Ar'=Br',save Ar',b=nGAi,r4->nGAr

	mac  -x1,x0,b               y:(r1)+n1,y1 ;b=Ai-BrWr,y1=Bi,r1->nGBi
	macr -y0,y1,b  a,x:(r5)+n5  y:(r0),a     ;b=Ai-BrWr-BiWi=Ai',save prev.Br',a=Ai,r5->Bi 
	subl b,a       x:(r0),b     b,y:(r4)     ;a=2Ai-Ai'=Bi',b=Ar,save Ai'
	mac  -x1,y0,b  x:(r0)+n0,a  a,y:(r5)     ;b=Ar-BrWi,a=Ar,save Bi',r0->nGAi
	macr y1,x0,b   x:(r1),x1    y:(r6),y0    ;b=Ar-BrWi+BiWr=Ar',x1=nBr,y0=nWi
	subl b,a       b,x:(r4)+n4  y:(r0),b     ;a=2Ar-Ar'=Br',save Ar',b=nAi,r4->nGAr
_end_last
	move           a,x:(r5)
	rts
_end_FFT
	endm

;**************************************************************************
;
; Split N/2 Complex FFT(Hn) for N real FFT(Fn)
;
SPLIT56         macro    IDATA,COEF,POINTS,ODATA
SPLIT56         ident    1,0
;
;
; Fi=0.5(Hi+Hn/2-i*)-0.5j(Hi-Hn/2-i*)W i=0,1,,,N-1
;
; Bit reverse input, Normal order output
; This macro amplifies coefficients of FFT by 2.
; If absolute values of spectrum are desired, then scaling up factor is 2^(N-1),
; assuming inputs are scaled by 2^N before complex FFT.
; POINTS is the number of (real input data)/2
; COEF is twiddle factor location 
; but COEF-POINTS/2+1 is the real location for SPLIT calculation (see sincosr)
;
;
OFFS  equ     1                                   ;OFFSET for 256 complex table, OFFS=1 for 128 table
      move    #POINTS-1,n0                        ;number of complex FFT input data -1
      move    #POINTS/2-1,n3                      ;loop counter
      move    y:ODATA,r0                           ;r0 ptr to Ar=Hi
      move    #COEF+1,r2                          ;Wr, twiddle factor start location, 
      move    r2,r6                               ;r6 -> Wi
      move    #-1,m6                              ;linear address
      move    m6,m2                               ;linear address
      move    #<OFFS,n2                           ;
      move    n2,n6                               ;
      lea     (r0)+n0,r5                          ;r5 ptr to Br & Bi
      move    y:IDATA,r3                          ;r3 pointer for A'
      move    n0,r1                               ;r1 ptr for B', r1=B
      move    #POINTS/2,n0
      move    n0,n5
      move    m5,m3                               ;m3 and m1 linear address
      move    m5,m1
      move    #0,m0                               ;bit reverse address
      move    m0,m5                               ;bit reverse address
      move              x:(r0),b                  ;b=Ar0
      move              x:(r5),x1   y:(r0),a      ;a=Ai0,x1=Br 
      sub     a,b       x:(r0),x0                 ;b=Ar0-Ai0=Niquist,x0=Ar0 
      asl     b         r3,r4                     ;b=Niquist/2, r4 ptr to IDATA location 
      add     x0,a                  b,y:(r0)+n0   ;a=Ar0+Ai0=DC, save Niq/2 in temp location 
      asl     a         x:(r1)+,x0  y:(r5),b      ;a=DC/2, DC,b=Bi, inc r1   
      move              a,x:(r4)    y:(r1),a      ;a=something  ;save DC/2 
      move                          y:(r0),y0     ;y0=Ai
      move              x:(r3)+,x0                ;inc r3            
      do      n3,_end_split
      add     y0,b      y0,a        a,y:(r1)-     ;b=Ai+Bi=H2r,a=Ai, save prev. Bi' 
      subl    b,a       x:(r0),b    b,y1          ;a=Ai-Bi=H1i, b=Ar, y1=H2r
      sub     x1,b      x:(r0)+n0,a a,y:(r4)      ;b=Ar-Br=H2i,a=Ar again,save H1i temp,r0->nA
      subl    b,a       x:(r2)+n2,x1  y:(r6)+n6,y0    ;a=Ar+Br=H1r,x1=Wr,y0=Wi 
      mac     x1,y1,a   b,x0        a,y:(r5)      ;a=H1r+Wr*H2r,x0=H2i,save H1r temp
      macr    -y0,x0,a              y:(r5)-n5,b   ;a=H1r+Wr*H2r-Wi*H2i=Ar', b=H1r
      subl    a,b       a,x:(r3)    y:(r4),a      ;b=H1r-(Wr*H2r-Wi*H2i)=Br',a=H1i,save Ar' 
      mac     -x1,x0,a  b,x:(r1)    y:(r5),b      ;a=H1i-Wr*H2i,save Br',b=nBi
      macr    -y0,y1,a              y:(r4),y0     ;a=Wi*H2r-Wr*H2i+H1i=Ai',y0=H1i again  
      sub     y0,a      x:(r5),x1   a,y:(r3)+     ;a=Wi*H2r-Wr*H2i, x1=nBr,save Ai'
      sub     y0,a                  y:(r0),y0     ;a=Wi*H2r-Wr*H2i-H1i=Bi',y0=nAi
_end_split
      move              y0,a        a,y:(r1)      ;save last Bi',conjugate last Ai
      neg     a                     y:ODATA,r5
      move                          y:IDATA,r0   
      move                          a,y:(r4)
      move                          y:(r5),a
      move                          a,y:(r0)      ;move Niq. back

      endm ;split56


;
;
; Latest revision - Nov. 11 92
; Update Jan. 24, 94 for more comments
; Update March 7, 94 
;  1) changes Wi=-sin to +sin in SPLIT56 
;  2) SPLIT56 and CFFT256 shares one 256 complex on-chip sin/cos table
;     though only 64 and 128 complex sin/cos are used in CFFT256 and
;     SPLIT56, respectively.


reset   equ     0
start   equ     $40
POINTS  equ     512                        ;number of real FFT
;IDATA   equ     $000                       ;input data in internal
;ODATA   equ     $400                       ;temp data in external 
COEF    equ     $900                       ;twiddle factor Wr and Wi

	sincosr POINTS/2,COEF              ;generates POINTS/4 & POINTS/8 for SPLIT and FFT
	gen56r  POINTS,IDATA               ;generate 512-point sine wave

	org     Y:$0400
IDATA	dc      $0000
ODATA   dc      $0200


	opt     mex
	org     p:reset
	jmp     start

	org     p:start
;	movep   #0,x:$fffe                 ;0 wait states
;       bitrevtwd56 POINTS/2,COEF          ;assumes POINTS/8 twiddle factors are used
	CFFT256 IDATA,COEF,POINTS/2,ODATA  ;call 256-point complex FFT
	SPLIT56 IDATA,COEF,POINTS/2,ODATA  ;create 256-point real FFT output

	rts
	end
