#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "bench.h"
// #define IACA_MARKS_OFF // undef for analysis by iaca-lin32/bin/iaca.sh
// #include "iacaMarks.h"
#include "altivec2avx.h"



/*
 * Print a 4x4 row-major float matrix to stdout.
 *
 * matrix: pointer to 16 consecutive floats; element (row, col) is read
 *         from matrix[row * 4 + col].
 *
 * Each element is written with "%6.1f" and every row is terminated by a
 * newline.
 */
static void print_matrix(float *matrix)
{
  int idx;

  /* Walk the 16 elements linearly; break the line after every 4th one. */
  for (idx = 0; idx < 16; idx++) {
    printf("%6.1f", matrix[idx]);
    if (idx % 4 == 3) {
      printf("\n");
    }
  }
}


static void mat_trans(float *a, float *aT)
{
  __vector float *va  = (__vector float *) a;
  __vector float *vaT = (__vector float *) aT;

  __vector float vtmp[4];
  __vector unsigned char vpat1 =
    (__vector unsigned char) { 0x00, 0x01, 0x02, 0x03,
			       0x04, 0x05, 0x06, 0x07,
			       0x10, 0x11, 0x12, 0x13,
			       0x14, 0x15, 0x16, 0x17 };
  __vector unsigned char vpat2 =
    (__vector unsigned char) { 0x08, 0x09, 0x0a, 0x0b,
			       0x0c, 0x0d, 0x0e, 0x0f,
			       0x18, 0x19, 0x1a, 0x1b,
			       0x1c, 0x1d, 0x1e, 0x1f };
  __vector unsigned char vpat3 =
    (__vector unsigned char) { 0x00, 0x01, 0x02, 0x03,
			       0x10, 0x11, 0x12, 0x13,
			       0x08, 0x09, 0x0a, 0x0b,
			       0x18, 0x19, 0x1a, 0x1b };
  __vector unsigned char vpat4 =
    (__vector unsigned char) { 0x04, 0x05, 0x06, 0x07,
			       0x14, 0x15, 0x16, 0x17,
			       0x0c, 0x0d, 0x0e, 0x0f,
			       0x1c, 0x1d, 0x1e, 0x1f };
  /* vec_perm() part 1 */
  vtmp[0] = vec_perm(va[0], va[2], vpat1);
  vtmp[1] = vec_perm(va[1], va[3], vpat1);
  vtmp[2] = vec_perm(va[0], va[2], vpat2);
  vtmp[3] = vec_perm(va[1], va[3], vpat2);
  
  /* vec_perm() part 2 */
  vaT[0] = vec_perm(vtmp[0], vtmp[1], vpat3);
  vaT[1] = vec_perm(vtmp[0], vtmp[1], vpat4);
  vaT[2] = vec_perm(vtmp[2], vtmp[3], vpat3);
  vaT[3] = vec_perm(vtmp[2], vtmp[3], vpat4);
}


static void SSEmat_trans(float *a, float *aT)
{
  __vector float *va  = (__vector float *) a;
  __vector float *vaT = (__vector float *) aT;

  vaT[0] = va[0];
  vaT[1] = va[1];
  vaT[2] = va[2];
  vaT[3] = va[3];
  _MM_TRANSPOSE4_PS(vaT[0], vaT[1], vaT[2], vaT[3]);
}

/*
 * Benchmark driver: prints a fixed 4x4 matrix, times mat_trans() and
 * SSEmat_trans() over BENCH_ITS runs each, and reports the smallest
 * get_cycles() difference observed for each variant.  The transposed
 * matrix is printed after the first (vec_perm based) benchmark.
 *
 * argc/argv are not used.  Always returns 0.
 */
int main(int argc, char **argv)
{
  float a[16]  __attribute__((aligned(16)))
    = {  1,  2,  3,  4,
         5,  6,  7,  8,
         9, 10, 11, 12,
        13, 14, 15, 16 };
  float aT[16] __attribute__((aligned(16)));

  long altivec_best = 0;   /* minimum cycle count seen for mat_trans()    */
  long sse_best = 0;       /* minimum cycle count seen for SSEmat_trans() */
  int run;

  printf("--- original matrix ---\n");
  print_matrix(a);

  /* Time the vec_perm based transpose, keeping the fastest run. */
  for (run = 0; run < BENCH_ITS; run++) {
    time_val start, stop;
    long elapsed;

    start = get_cycles();
    mat_trans(a, aT);
    stop = get_cycles();

    elapsed = stop - start;
    /* The first run always seeds the minimum; later runs must beat it. */
    if (run != 0 && elapsed >= altivec_best)
      continue;
    altivec_best = elapsed;
  }

  printf("--- transformed matrix ---\n");
  print_matrix(aT);

  printf("Altivec cycles=%ld\n", altivec_best);

  /* Same measurement for the _MM_TRANSPOSE4_PS based transpose. */
  for (run = 0; run < BENCH_ITS; run++) {
    time_val start, stop;
    long elapsed;

    start = get_cycles();
    SSEmat_trans(a, aT);
    stop = get_cycles();

    elapsed = stop - start;
    if (run != 0 && elapsed >= sse_best)
      continue;
    sse_best = elapsed;
  }
  printf("SSE     cycles=%ld\n", sse_best);

  return 0;
}
