/* This source file is part of the ATMEL AVR32-UC3-SoftwareFramework-1.6.0 Release */

/*This file is prepared for Doxygen automatic documentation generation.*/
/*! \file *********************************************************************
 *
 * \brief 16-bit rectangular window function optimized for the at32uc
 *
 * This file contains the code that generates a rectangular window.
 *
 * - Compiler:           IAR EWAVR32 and GNU GCC for AVR32
 * - Supported devices:  All AVR32 devices.
 * - AppNote:
 *
 * \author               Atmel Corporation: http://www.atmel.com \n
 *                       Support and FAQ: http://support.atmel.no/
 *
 ******************************************************************************/

/* Copyright (c) 2009 Atmel Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. The name of Atmel may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * 4. This software may only be redistributed and used in connection with an Atmel
 * AVR product.
 *
 * THIS SOFTWARE IS PROVIDED BY ATMEL "AS IS" AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT ARE
 * EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE
 *
 */
#include "preprocessor.h"

#if !defined(FORCE_ALL_GENERICS) && \
    !defined(FORCE_GENERIC_WIN16_RECT) && \
    defined(TARGET_SPECIFIC_WIN16_RECT)

/*
 * Compiler-specific glue, grouped by toolchain.
 *
 *   DSP16_RECT_END_KERNEL_X_FCT(x_num, data)
 *     Emits one tail function through DSP16_RECT_END_KERNEL_X_FCT__; GCC
 *     additionally gets the naked attribute in front of the definition.
 *   ASM_INSTRUCT_COMPACKED(str) / ASM_INSTRUCT_EXTENDED(str)
 *     Wrap an instruction string. GCC leaves it untouched; IAR appends ":C" /
 *     ":E" (presumably the compact / extended encoding selectors -- confirm
 *     in the IAR assembler documentation).
 *   CST_ONE
 *     Text inserted in the assembly for the constant "one". GCC refers to the
 *     named asm operand CST_ONE__; IAR gets a stringified expression:
 *     (1 << 15) - 1 when DSP16_QB >= 15, 1 << DSP16_QB otherwise.
 */
#if __GNUC__
#  define DSP16_RECT_END_KERNEL_X_FCT(x_num, data)  __attribute__((__naked__)) DSP16_RECT_END_KERNEL_X_FCT__(x_num, data)
#  define ASM_INSTRUCT_COMPACKED(str) str
#  define ASM_INSTRUCT_EXTENDED(str)  str
#  define CST_ONE                     "%[CST_ONE__]"
#elif __ICCAVR32__
#  define DSP16_RECT_END_KERNEL_X_FCT(x_num, data)  DSP16_RECT_END_KERNEL_X_FCT__(x_num, data)
#  define ASM_INSTRUCT_COMPACKED(str) str":C"
#  define ASM_INSTRUCT_EXTENDED(str)  str":E"
#  if DSP16_QB >= 15
#    define CST_ONE                   ASTRINGZ((1 << 15) - 1)
#  else
#    define CST_ONE                   ASTRINGZ(1 << DSP16_QB)
#  endif
#endif


/*********************************************************************************************
 * Macro name: DSP16_RECTANGLE_X
 * Used registers:
 *   r8, r9 and the register named by r_vect1 (r12 in every expansion of this file)
 * Pre:
 *   r_vect1 names the register that holds the address of the first item to write.
 * Description:
 *   Expands to the assembly text performing
 *     vect1(0) = CST_ONE
 *     vect1(1) = CST_ONE
 *     ... X times ...
 *   where X is the number of items (0 to 15). Only r8/r9 are written to memory, using the
 *   widest stores first: st.d (4 items), then st.w (2 items), then st.h (1 item).
 *
 *   The sixteen variants are assembled from the private fragments below so that every
 *   instruction string is spelled exactly once.
 *********************************************************************************************/

// r8 = CST_ONE
#define DSP16_RECT_LOAD_ONE \
  "mov    r8, " CST_ONE "\n\t"

// r8 = (CST_ONE << 16) | CST_ONE
#define DSP16_RECT_LOAD_PAIR \
  DSP16_RECT_LOAD_ONE \
  "or     r8, r8, r8 << 16\n\t"

// r8 = r9 = (CST_ONE << 16) | CST_ONE
#define DSP16_RECT_LOAD_QUAD \
  DSP16_RECT_LOAD_PAIR \
  "mov    r9, r8\n\t"

// One store of r8 (r8:r9 for the "d" width) at byte offset `offset` from r_vect1
#define DSP16_RECT_ST(width, r_vect1, offset) \
  "st." width "   " ASTRINGZ(r_vect1) "[" ASTRINGZ(offset) "], r8\n\t"

#define DSP16_RECT_ST_H(r_vect1, offset)  DSP16_RECT_ST("h", r_vect1, offset)
#define DSP16_RECT_ST_W(r_vect1, offset)  DSP16_RECT_ST("w", r_vect1, offset)
#define DSP16_RECT_ST_D(r_vect1, offset)  DSP16_RECT_ST("d", r_vect1, offset)

// 0 item: nothing to emit
#define DSP16_RECTANGLE_0(r_vect1)

// 1 to 3 items: no double-word store, r9 is left untouched
#define DSP16_RECTANGLE_1(r_vect1) \
  DSP16_RECT_LOAD_ONE \
  DSP16_RECT_ST_H(r_vect1, 0)

#define DSP16_RECTANGLE_2(r_vect1) \
  DSP16_RECT_LOAD_PAIR \
  DSP16_RECT_ST_W(r_vect1, 0)

#define DSP16_RECTANGLE_3(r_vect1) \
  DSP16_RECTANGLE_2(r_vect1) \
  DSP16_RECT_ST_H(r_vect1, 4)

// 4 to 7 items: one double-word store plus the remainder
#define DSP16_RECTANGLE_4(r_vect1) \
  DSP16_RECT_LOAD_QUAD \
  DSP16_RECT_ST_D(r_vect1, 0)

#define DSP16_RECTANGLE_5(r_vect1) \
  DSP16_RECTANGLE_4(r_vect1) \
  DSP16_RECT_ST_H(r_vect1, 8)

#define DSP16_RECTANGLE_6(r_vect1) \
  DSP16_RECTANGLE_4(r_vect1) \
  DSP16_RECT_ST_W(r_vect1, 8)

#define DSP16_RECTANGLE_7(r_vect1) \
  DSP16_RECTANGLE_6(r_vect1) \
  DSP16_RECT_ST_H(r_vect1, 12)

// 8 to 11 items: two double-word stores plus the remainder
#define DSP16_RECTANGLE_8(r_vect1) \
  DSP16_RECTANGLE_4(r_vect1) \
  DSP16_RECT_ST_D(r_vect1, 8)

#define DSP16_RECTANGLE_9(r_vect1) \
  DSP16_RECTANGLE_8(r_vect1) \
  DSP16_RECT_ST_H(r_vect1, 16)

#define DSP16_RECTANGLE_10(r_vect1) \
  DSP16_RECTANGLE_8(r_vect1) \
  DSP16_RECT_ST_W(r_vect1, 16)

#define DSP16_RECTANGLE_11(r_vect1) \
  DSP16_RECTANGLE_10(r_vect1) \
  DSP16_RECT_ST_H(r_vect1, 20)

// 12 to 15 items: three double-word stores plus the remainder
#define DSP16_RECTANGLE_12(r_vect1) \
  DSP16_RECTANGLE_8(r_vect1) \
  DSP16_RECT_ST_D(r_vect1, 16)

#define DSP16_RECTANGLE_13(r_vect1) \
  DSP16_RECTANGLE_12(r_vect1) \
  DSP16_RECT_ST_H(r_vect1, 24)

#define DSP16_RECTANGLE_14(r_vect1) \
  DSP16_RECTANGLE_12(r_vect1) \
  DSP16_RECT_ST_W(r_vect1, 24)

#define DSP16_RECTANGLE_15(r_vect1) \
  DSP16_RECTANGLE_14(r_vect1) \
  DSP16_RECT_ST_H(r_vect1, 28)

/*********************************************************************************************/


/*********************************************************************************************
 * Generator for the ending functions of the rectangular window. They are used to set the
 * last items of a vector, i.e. the ones left over by the main kernel.
 *
 * DSP16_RECT_END_KERNEL_X_FCT__(x_num, data) defines the static function
 * dsp16_win_rect_end_kernel_x<x_num>(vect1). Its body is the DSP16_RECTANGLE_<x_num>
 * sequence applied to r12, followed by "mov pc, lr" (jump to the link register).
 *   x_num: number of items written by the generated function.
 *   data:  not referenced by the expansion.
 *
 * The C parameter vect1 is never named in the body: the assembly addresses the destination
 * through r12 (assumes the first argument is passed in r12 -- TODO confirm against the ABI).
 * With GCC, CST_ONE__ is bound to the immediate DSP16_Q(1.) through the asm input operand list.
 *********************************************************************************************/
#if __GNUC__
# define DSP16_RECT_END_KERNEL_X_FCT__(x_num, data) \
static void TPASTE2(dsp16_win_rect_end_kernel_x, x_num)(dsp16_t *vect1) \
{ \
  __asm__ __volatile__ ( \
    TPASTE2(DSP16_RECTANGLE_, x_num)(r12) \
    "mov  pc, lr\n\t" \
    : \
    : \
    [CST_ONE__] "i" (DSP16_Q(1.)) \
  ); \
}
#elif __ICCAVR32__
# define DSP16_RECT_END_KERNEL_X_FCT__(x_num, data) \
static void TPASTE2(dsp16_win_rect_end_kernel_x, x_num)(dsp16_t *vect1) \
{ \
  __asm__ __volatile__ ( \
    TPASTE2(DSP16_RECTANGLE_, x_num)(r12) \
    "mov  pc, lr\n\t" \
  ); \
}
#endif
/*********************************************************************************************/

/*********************************************************************************************
 * Main kernel of the rectangular window: sets the items by groups of 16.
 *
 * Algorithm:
 * for(n=0; n<size-15; n+=16)
 * {
 *   vect1[n] = DSP16_Q(1.);
 *   vect1[n+1] = DSP16_Q(1.);
 *   vect1[n+2] = DSP16_Q(1.);
 *   vect1[n+3] = DSP16_Q(1.);
 *   vect1[n+4] = DSP16_Q(1.);
 *   vect1[n+5] = DSP16_Q(1.);
 *   vect1[n+6] = DSP16_Q(1.);
 *   vect1[n+7] = DSP16_Q(1.);
 *   vect1[n+8] = DSP16_Q(1.);
 *   vect1[n+9] = DSP16_Q(1.);
 *   vect1[n+10] = DSP16_Q(1.);
 *   vect1[n+11] = DSP16_Q(1.);
 *   vect1[n+12] = DSP16_Q(1.);
 *   vect1[n+13] = DSP16_Q(1.);
 *   vect1[n+14] = DSP16_Q(1.);
 *   vect1[n+15] = DSP16_Q(1.);
 * }
 *
 * Parameters (never named in the body; the assembly reads r12 and r11 directly, so it
 * assumes vect1 is passed in r12 and size in r11 -- TODO confirm against the ABI):
 *   vect1: destination buffer.
 *   size:  number of items of the buffer.
 * Return (through "retal r12"): the address following the last item written, or vect1
 * unchanged when the loop is skipped. The caller hands it to one of the ending functions.
 *********************************************************************************************/
//! avr32-uc3 16 bit version
#if __GNUC__
__attribute__((__naked__))
__attribute__((__noinline__))
#elif __ICCAVR32__
# pragma shadow_registers=full
# pragma optimize=none no_inline
#endif
static dsp16_t *dsp16_win_rect_kernel_ext(dsp16_t *vect1, int size)
{
  __asm__ __volatile__ (
      // r11 = size - 15
      "sub    r11, 15\n\t"
      // r11 = &vect1[size - 15]: the loop runs while a full group of 16 items fits below it
      "add    r11, r12, r11 << 1\n"
      // Skip the loop when r12 >= r11 (signed comparison), i.e. fewer than 16 items requested
      "cp.w   r12, r11\n\t"
      ASM_INSTRUCT_COMPACKED("brge __dsp16_rect_ext_end_loop")"\n\t"

      // r8 = r9 = (DSP16_Q(1.) << 16) | DSP16_Q(1.)
      "mov    r8, "CST_ONE"\n\t"
      "or     r8, r8, r8 << 16\n\t"
      "mov    r9, r8\n"

    "__dsp16_rect_ext_loop:\n\t"

      // Four double-word stores = 32 bytes = 16 items
      "st.d   r12[0], r8\n\t"
      "st.d   r12[8], r8\n\t"
      "st.d   r12[16], r8\n\t"
      "st.d   r12[24], r8\n\t"

      // r12 += 32 bytes (subtracting -32), i.e. move on to the next group of 16 items
      "sub    r12, -32\n\t"

      // Loop again while r12 < r11
      "cp.w   r12, r11\n\t"
      ASM_INSTRUCT_COMPACKED("brlt __dsp16_rect_ext_loop")"\n"

    "__dsp16_rect_ext_end_loop:\n\t"

      // Return r12, the current write position
      "retal  r12\n\t"
#if __GNUC__
    :
    :
    [CST_ONE__] "i" (DSP16_Q(1.))
#endif
  );

  // The assembly above ends with its own return instruction, so this statement is not
  // expected to be reached; presumably it only gives the non-void function a C-level return.
  return (dsp16_t *) 0;
}
/*********************************************************************************************/

// Instantiate the sixteen ending functions dsp16_win_rect_end_kernel_x0 to
// dsp16_win_rect_end_kernel_x15, one per possible number of leftover items (0 to 15).
// The second argument is not referenced by the generator macro.
DSP16_RECT_END_KERNEL_X_FCT(0, "")
DSP16_RECT_END_KERNEL_X_FCT(1, "")
DSP16_RECT_END_KERNEL_X_FCT(2, "")
DSP16_RECT_END_KERNEL_X_FCT(3, "")
DSP16_RECT_END_KERNEL_X_FCT(4, "")
DSP16_RECT_END_KERNEL_X_FCT(5, "")
DSP16_RECT_END_KERNEL_X_FCT(6, "")
DSP16_RECT_END_KERNEL_X_FCT(7, "")
DSP16_RECT_END_KERNEL_X_FCT(8, "")
DSP16_RECT_END_KERNEL_X_FCT(9, "")
DSP16_RECT_END_KERNEL_X_FCT(10, "")
DSP16_RECT_END_KERNEL_X_FCT(11, "")
DSP16_RECT_END_KERNEL_X_FCT(12, "")
DSP16_RECT_END_KERNEL_X_FCT(13, "")
DSP16_RECT_END_KERNEL_X_FCT(14, "")
DSP16_RECT_END_KERNEL_X_FCT(15, "")

/*! \brief Fills a vector with a rectangular window, i.e. sets every item to DSP16_Q(1.).
 *
 * The work is split in two steps: dsp16_win_rect_kernel_ext() sets the items by groups
 * of 16 and returns the position where it stopped, then one of the sixteen
 * dsp16_win_rect_end_kernel_xN() functions sets the N = size % 16 remaining items.
 *
 * \param vect1 Destination buffer; must be able to hold \a size items.
 * \param size  Number of items to set. Nothing is written when it is zero or negative.
 */
void dsp16_win_rect(dsp16_t *vect1, int size)
{
  typedef void (*rect_end_kernel_opti_t)(dsp16_t *);
  // Ending functions indexed by the number of leftover items they write
  static const rect_end_kernel_opti_t rect_end_kernel_opti[16] = {
    dsp16_win_rect_end_kernel_x0,
    dsp16_win_rect_end_kernel_x1,
    dsp16_win_rect_end_kernel_x2,
    dsp16_win_rect_end_kernel_x3,
    dsp16_win_rect_end_kernel_x4,
    dsp16_win_rect_end_kernel_x5,
    dsp16_win_rect_end_kernel_x6,
    dsp16_win_rect_end_kernel_x7,
    dsp16_win_rect_end_kernel_x8,
    dsp16_win_rect_end_kernel_x9,
    dsp16_win_rect_end_kernel_x10,
    dsp16_win_rect_end_kernel_x11,
    dsp16_win_rect_end_kernel_x12,
    dsp16_win_rect_end_kernel_x13,
    dsp16_win_rect_end_kernel_x14,
    dsp16_win_rect_end_kernel_x15
  };

  // A negative size must not write anything. Without this guard, size & 0xF is non-zero
  // for most negative values and an ending function would write up to 15 items.
  if (size <= 0)
    return;

  // Set the items by groups of 16; vect1 then points just after the last group written
  vect1 = dsp16_win_rect_kernel_ext(vect1, size);

  // Jump on different functions depending on the number of items left (size modulo 16)
  rect_end_kernel_opti[size & 0xF](vect1);
}

#endif
