/* 
   gcc-4.4 -O3 -mtune=core2 -flax-vector-conversions vmatmul.c -msse4 && ./iaca.sh -32 a.out
   gcc-4.4 -O3 -mtune=core2 -flax-vector-conversions vmatmul.c -mavx && ./iaca.sh -32 a.out
 */

#include <stdio.h>
#include "altivec2avx.h"
#include "bench.h"


// http://developer.apple.com/hardwaredrivers/ve/algorithms.html#Matrix_Multiplication
// (presumably the origin of the vector routine below -- confirm against the link)
// Short alias for `vector float`; main() uses it to cast its 16-byte-aligned
// float arrays to the pointer type MultiplyMatrix4x4() expects.
typedef vector float vFloat;

/*
 * 4x4 single-precision matrix product using AltiVec-style intrinsics.
 *
 * A, B and C each point to four consecutive `vector float` values, one
 * vector per matrix row.  Output row i is built as
 *
 *     C[i] = A[i][0]*B[0] + A[i][1]*B[1] + A[i][2]*B[2] + A[i][3]*B[3]
 *
 * by splatting one element of A's row across a vector and feeding it to a
 * multiply-add against the matching row of B.
 *
 * Every input row is loaded before any output row is stored.
 */
void MultiplyMatrix4x4(const vector float *A,
                       const vector float *B,
                       vector float *C)
{
    /* Accumulator seed for the first multiply-add of each row. */
    vector float zero = (vector float) vec_splat_u32(0);

    /* Rows of the right-hand matrix (shared by every output row). */
    vector float rowB0 = vec_ld( 0, B );
    vector float rowB1 = vec_ld( 1 * sizeof( vector float), B );
    vector float rowB2 = vec_ld( 2 * sizeof( vector float), B );
    vector float rowB3 = vec_ld( 3 * sizeof( vector float), B );

    /* Rows of the left-hand matrix. */
    vector float rowA0 = vec_ld( 0, A );
    vector float rowA1 = vec_ld( 1 * sizeof( vector float), A );
    vector float rowA2 = vec_ld( 2 * sizeof( vector float), A );
    vector float rowA3 = vec_ld( 3 * sizeof( vector float), A );

    vector float out0, out1, out2, out3;

    /* Output row 0 */
    out0 = vec_madd( vec_splat( rowA0, 0 ), rowB0, zero );
    out0 = vec_madd( vec_splat( rowA0, 1 ), rowB1, out0 );
    out0 = vec_madd( vec_splat( rowA0, 2 ), rowB2, out0 );
    out0 = vec_madd( vec_splat( rowA0, 3 ), rowB3, out0 );

    /* Output row 1 */
    out1 = vec_madd( vec_splat( rowA1, 0 ), rowB0, zero );
    out1 = vec_madd( vec_splat( rowA1, 1 ), rowB1, out1 );
    out1 = vec_madd( vec_splat( rowA1, 2 ), rowB2, out1 );
    out1 = vec_madd( vec_splat( rowA1, 3 ), rowB3, out1 );

    /* Output row 2 */
    out2 = vec_madd( vec_splat( rowA2, 0 ), rowB0, zero );
    out2 = vec_madd( vec_splat( rowA2, 1 ), rowB1, out2 );
    out2 = vec_madd( vec_splat( rowA2, 2 ), rowB2, out2 );
    out2 = vec_madd( vec_splat( rowA2, 3 ), rowB3, out2 );

    /* Output row 3 */
    out3 = vec_madd( vec_splat( rowA3, 0 ), rowB0, zero );
    out3 = vec_madd( vec_splat( rowA3, 1 ), rowB1, out3 );
    out3 = vec_madd( vec_splat( rowA3, 2 ), rowB2, out3 );
    out3 = vec_madd( vec_splat( rowA3, 3 ), rowB3, out3 );

    /* Write the four finished rows back, one vector per row. */
    vec_st( out0, 0 * sizeof( vector float ), C );
    vec_st( out1, 1 * sizeof( vector float ), C );
    vec_st( out2, 2 * sizeof( vector float ), C );
    vec_st( out3, 3 * sizeof( vector float ), C );
}


/*
 * Plain-C 4x4 matrix product, used as the scalar baseline in the benchmark.
 *
 * a, b and product each point to 16 floats.  Element (row, col) of a matrix
 * lives at index (col << 2) + row, and the function computes
 *
 *     product(row, col) = a(row,0)*b(0,col) + a(row,1)*b(1,col)
 *                       + a(row,2)*b(2,col) + a(row,3)*b(3,col)
 *
 * The result is assembled in a local buffer and copied out only after all
 * inputs have been read, so product may point at the same storage as a or b
 * (in-place multiply).  MultiplyMatrix4x4() likewise loads all of its inputs
 * before storing anything.
 *
 * No helper macros are defined here, so nothing leaks into the rest of the
 * translation unit.
 */
void cMultiplyMatrix4x4(float *a, float *b, float *product)
{
  float result[16];
  int row, col, k;

  /* i-th row of the result */
  for (row = 0; row < 4; row++) {
    /* Row `row` of a, fetched once and reused for all four columns. */
    const float ai0 = a[(0 << 2) + row];
    const float ai1 = a[(1 << 2) + row];
    const float ai2 = a[(2 << 2) + row];
    const float ai3 = a[(3 << 2) + row];

    for (col = 0; col < 4; col++) {
      /* The four entries of column `col` of b are contiguous. */
      const float *bcol = b + (col << 2);

      result[(col << 2) + row] =
          ai0 * bcol[0] + ai1 * bcol[1] + ai2 * bcol[2] + ai3 * bcol[3];
    }
  }

  /* Publish the result only now, so aliasing inputs were read intact. */
  for (k = 0; k < 16; k++)
    product[k] = result[k];
}


/*
 * Benchmark driver: times MultiplyMatrix4x4() and then cMultiplyMatrix4x4()
 * on the same pair of 4x4 inputs, BENCH_ITS times each, and prints the
 * smallest get_cycles() difference observed for each routine.
 */
int main(void)
{
  int iter;
  long fastest = 0;

  /* 16-byte aligned so the arrays can be handed to the vector routine. */
  float lhs[16] __attribute__ ((__aligned__ (16))) = {
      1.1, 1.2, 1.3, 1.4,
      2.1, 2.2, 2.3, 2.4,
      3.1, 3.2, 3.3, 3.4,
      4.1, 4.2, 4.3, 4.4
  };
  float rhs[16] __attribute__ ((__aligned__ (16))) = {
      1.1, 1.2, 1.3, 1.4,
      2.1, 2.2, 2.3, 2.4,
      3.1, 3.2, 3.3, 3.4,
      4.1, 4.2, 4.3, 4.4
  };
  float out[16] __attribute__ ((__aligned__ (16))) = {
      0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0
  };

  /* Pass 1: vector implementation. */
  iter = 0;
  while (iter < BENCH_ITS) {
    time_val start, stop;
    long elapsed;

    start = get_cycles();
    MultiplyMatrix4x4((vFloat*)lhs, (vFloat*)rhs, (vFloat*)out);
    stop = get_cycles();

    elapsed = stop - start;
    /* The first iteration always replaces the running minimum. */
    fastest = (iter == 0 || elapsed < fastest) ? elapsed : fastest;
    iter++;
  }
  printf("Altivec cycles=%ld\n", fastest);

  /* Pass 2: scalar implementation; iteration 0 resets the minimum. */
  iter = 0;
  while (iter < BENCH_ITS) {
    time_val start, stop;
    long elapsed;

    start = get_cycles();
    cMultiplyMatrix4x4(lhs, rhs, out);
    stop = get_cycles();

    elapsed = stop - start;
    fastest = (iter == 0 || elapsed < fastest) ? elapsed : fastest;
    iter++;
  }
  printf("C       cycles=%ld\n", fastest);

  return 0;
}
