#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "bench.h"
#include "memalign.h"
#include "altivec2avx.h"

/*
   "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs" available at:
   http://arxiv.org/pdf/cs.MS/0406049
 */
/* Cosine/sine result pair filled in by FastSinCos (one value per vector lane). */
struct phase {
  vector float c;   /* cosine of each input lane */
  vector float s;   /* sine of each input lane */
};

typedef struct phase phase_t;

/*
 * Polynomial coefficients used by FastSinCos: ss1..ss4 feed the sine series,
 * cc1..cc3 feed the cosine series.
 *
 * Define FASTER_SINCOS for the slightly-less-accurate results in slightly
 * less time.  Note that the #undef directly below forces the default
 * (slower, more accurate) set even if FASTER_SINCOS was defined earlier,
 * e.g. on the compiler command line; remove it to make the switch usable.
 */
#undef FASTER_SINCOS
#if !defined(FASTER_SINCOS)
/*
 * Default set: these coefficients generate a badly un-normalized
 * sine-cosine pair, so FastSinCos must rescale the result by an explicit
 * reciprocal of its squared magnitude.
 * NOTE(review): the original comment was cut off after "but the angle";
 * presumably it continued "is accurate" -- confirm against the paper.
 */
#define ss1 1.5707963268
#define ss2 -0.6466386396
#define ss3 0.0679105987
#define ss4 -0.0011573807
#define cc1 -1.2341299769
#define cc2 0.2465220241
#define cc3 -0.0123926179
#else
/* use 20031003 coefficients for fast, normalized series */
#define ss1 1.5707963235
#define ss2 -0.645963615
#define ss3 0.0796819754
#define ss4 -0.0046075748
#define cc1 -1.2336977925
#define cc2 0.2536086171
#define cc3 -0.0204391631
#endif

/*
 * VEC_CONST(x): initializer for a "vector float" with all four lanes set to x.
 *
 * With ALTIVEC defined this expands to an AltiVec vector literal; otherwise
 * it expands to a brace initializer, so in that configuration it is only
 * usable as the initializer of a declaration.  x is expanded four times in
 * the brace form, so pass side-effect-free expressions only.
 *
 * The ALTIVEC form previously had an unmatched closing parenthesis, which
 * broke every use of the macro when ALTIVEC was defined.
 */
#ifdef ALTIVEC
#define VEC_CONST(x) ((vector float)(x))
#else
#define VEC_CONST(x) {(x), (x), (x), (x)}
#endif

/*
 * Per-lane reciprocal 1/v: hardware estimate (vec_re) followed by one
 * Newton-Raphson step.
 *
 * Declared static inline: a plain C99 "inline" definition provides no
 * external definition, so any call the compiler chooses not to inline
 * (e.g. at -O0) would fail to link.
 */
static inline vector float Reciprocal( vector float v )
{
  vector float vone = VEC_CONST(1.0);

  /* Get the reciprocal estimate */
  vector float estimate = vec_re( v );

  /* One round of Newton-Raphson refinement:
     estimate + estimate * (1 - estimate * v)
     (AltiVec vec_nmsub(a, b, c) yields c - a*b) */
  return vec_madd( vec_nmsub(estimate, v, vone), estimate, estimate );
}

/*
 * Compute the cosine and sine of each lane of v (angle in radians) and
 * store them in ph->c and ph->s.
 *
 * Method: scale the angle to turns, reduce by subtracting the rounded value,
 * evaluate short polynomial series for the sine and cosine of a quarter of
 * the angle, apply the angle-doubling identities twice, and finally rescale
 * the pair to compensate for magnitude drift.
 *
 * v  - four input angles
 * ph - output; must point to a valid struct phase
 *
 * static inline for the same linkage reason as Reciprocal.
 */
static inline void FastSinCos(vector float v, struct phase *ph)
{
  vector float s1, s2, c1, c2, fixmag1;
  vector float vzero = VEC_CONST(0.0);
  vector float vone = VEC_CONST(1.0);
  vector float vtwo = VEC_CONST(2.0);
  /* 1/(2*pi): converts radians to turns (previously misnamed "vhalfpi") */
  vector float v_inv_twopi = VEC_CONST(1.0/(2.0*3.1415926536));
  vector float v_ss1 = VEC_CONST(ss1);
  vector float v_ss2 = VEC_CONST(ss2);
  vector float v_ss3 = VEC_CONST(ss3);
  vector float v_ss4 = VEC_CONST(ss4);
  vector float v_cc1 = VEC_CONST(cc1);
  vector float v_cc2 = VEC_CONST(cc2);
  vector float v_cc3 = VEC_CONST(cc3);

  vector float x1 = vec_madd(v, v_inv_twopi, vzero);
  /* q1 = x/2pi minus its rounded value (the fractional turn), q2 = q1**2 */
  vector float q1=vec_nmsub(vec_round(x1), vone, x1);
  vector float q2=vec_madd(q1, q1, vzero);

  /* odd series in q1 for the sine: q1*(ss1 + q2*(ss2 + q2*(ss3 + q2*ss4))) */
  s1= vec_madd(q1,
               vec_madd(q2,
                        vec_madd(q2, vec_madd(q2, v_ss4, v_ss3), v_ss2),
                        v_ss1),
               vzero);
  /* even series in q1 for the cosine: 1 + q2*(cc1 + q2*(cc2 + q2*cc3)) */
  c1= vec_madd(q2,
               vec_madd(q2, vec_madd(q2, v_cc3, v_cc2), v_cc1),
               vone);

  /* first of two angle-doublings, giving sin & cos of theta/2:
     cos(2a) = cos(a)**2 - sin(a)**2,  sin(2a) = 2*sin(a)*cos(a) */
  c2 = vec_nmsub(s1, s1, vec_madd(c1, c1, vzero));
  s2 = vec_madd(vtwo, vec_madd(s1, c1, vzero), vzero);

  /* now, cheat on the correction for magnitude drift...
     if the pair has drifted to (1+e)*(cos, sin),
     the next iteration will be (1+e)**2*(cos, sin)
     which is, for small e, (1+2e)*(cos,sin).
     However, on the (1+e) error iteration,
     sin**2+cos**2=(1+e)**2=1+2e also,
     so the error in the square of this term
     will be exactly the error in the magnitude of the next term.
     Then, multiply final result by (1-e) to correct */
#if defined(FASTER_SINCOS)
  /* first-order correction 2 - c2**2 - s2**2; this works with properly
     normalized sine-cosine series, but the un-normalized series is the
     more accurate option */
  fixmag1=vec_nmsub(s2,s2, vec_nmsub(c2, c2, vtwo));
#else
  /* must use this method with un-normalized series, since magnitude error
     is large: scale by 1/(s2**2 + c2**2) */
  fixmag1 = Reciprocal(vec_madd(s2,s2,vec_madd(c2,c2,vzero)));
#endif

  /* second angle-doubling, giving sin & cos of theta before rescaling */
  c1=vec_nmsub(s2, s2, vec_madd(c2, c2, vzero));
  s1=vec_madd(vtwo, vec_madd(s2, c2, vzero),
              vzero);

  /* apply the magnitude correction and publish the pair */
  ph->c=vec_madd(c1, fixmag1, vzero);
  ph->s=vec_madd(s1, fixmag1, vzero);
}


static void vsin(float *x, float *out, int n)
{
  int i;
  for (i = 0; i < n; i += 4) {
    vector float Avf32, Rvf32;
    struct phase ph;

    Avf32 = vec_ld(0, x+i);
    FastSinCos(Avf32, &ph);
    Rvf32 = ph.s;
    vec_st(Rvf32, 0, out + i);
  }
}


static void cvsin(float *x, float *out, int n)
{
  int i;
  for (i = 0; i < n; i++) {
    out[i] = sin(x[i]);
  }
}

/* Fill the first n elements of x with the constant 0.1 (benchmark input). */
static void make_data(float *x, int n)
{
  float *cursor = x;
  int left = n;

  while (left > 0) {
    *cursor++ = 0.1;
    left--;
  }
}

/*
 * Benchmark driver.
 *
 * Usage: prog [count]
 *   count - number of floats to process (default 32); must be a positive
 *           decimal integer.
 *
 * Runs vsin and then cvsin BENCH_ITS times each over the same input and
 * prints the smallest get_cycles() difference observed for each.
 * Returns 0 on success, EXIT_FAILURE on a bad argument or failed allocation.
 */
int main(int nargs, char *argv[])
{
  int i, n = 32;
  int padded;
  long besttime = 0;
  float *a, *r;

  if (nargs > 1) {
    char *end;
    long parsed = strtol(argv[1], &end, 10);

    /* Reject empty/non-numeric text, trailing junk, non-positive counts and
       counts that cannot be rounded up to a multiple of 4 within an int
       (strtol saturates to LONG_MAX on overflow, which is caught here). */
    if (end == argv[1] || *end != '\0' || parsed < 1 || parsed > INT_MAX - 3) {
      fprintf(stderr, "usage: %s [count >= 1]\n", argv[0]);
      return EXIT_FAILURE;
    }
    n = (int)parsed;
  }

  /* vsin consumes four floats per vector operation, so size (and fill) the
     buffers up to the next multiple of 4. */
  padded = (n + 3) & ~3;
  if ((size_t)padded > (size_t)-1 / sizeof(float)) {
    fprintf(stderr, "count %d is too large\n", n);
    return EXIT_FAILURE;
  }

  /* NOTE(review): assumes MALLOC (memalign.h) returns NULL on failure and
     that its result may be released with free(), as the original code did
     -- confirm. */
  a = (float*)MALLOC((size_t)padded * sizeof(float));
  r = (float*)MALLOC((size_t)padded * sizeof(float));
  if (a == NULL || r == NULL) {
    fprintf(stderr, "failed to allocate buffers for %d floats\n", n);
    free(r);
    free(a);
    return EXIT_FAILURE;
  }
  make_data(a, padded);

  /* Vector version: keep the best (smallest) cycle count over all runs. */
  for (i = 0; i < BENCH_ITS; i++) {
    time_val t0, t1;
    long thistime;
    t0 = get_cycles();
    vsin(a, r, n);
    t1 = get_cycles();
    thistime = t1 - t0;
    if (i == 0 || thistime < besttime)
      besttime = thistime;
  }
  printf("Altivec cycles=%ld\n", besttime);
  
  /* Scalar reference: besttime is reset by the i == 0 case below. */
  for (i = 0; i < BENCH_ITS; i++) {
    time_val t0, t1;
    long thistime;
    t0 = get_cycles();
    cvsin(a, r, n);
    t1 = get_cycles();
    thistime = t1 - t0;
    if (i == 0 || thistime < besttime)
      besttime = thistime;
  }
  printf("      C cycles=%ld\n", besttime);

  free(r);
  free(a);
  return 0;
}
