#include <limits.h>


/* Unsigned 16-bit add that clamps at USHRT_MAX instead of wrapping around. */
static inline unsigned short satadd16(unsigned short a, unsigned short b)
{
  /* Largest value that can still be added to b without passing USHRT_MAX. */
  const unsigned short headroom = USHRT_MAX - b;

  if (a > headroom)
    return USHRT_MAX;
  return a + b;
}

/* Signed 16-bit add that clamps to [SHRT_MIN, SHRT_MAX] instead of
   overflowing.  The range checks are done before the addition. */
static inline short ssatadd16(short a, short b)
{
  /* Only a non-negative b can push the sum above SHRT_MAX. */
  if (b >= 0 && a > SHRT_MAX - b)
    return SHRT_MAX;

  /* Only a negative b can push the sum below SHRT_MIN. */
  if (b < 0 && a < SHRT_MIN - b)
    return SHRT_MIN;

  return a + b;
}

/* Unsigned int add that clamps at UINT_MAX instead of wrapping around. */
static inline unsigned int satadd32(unsigned int a, unsigned int b)
{
  /* Unsigned addition wraps; a wrapped result is always smaller than
     either operand, which is how the overflow is detected. */
  const unsigned int wrapped = a + b;

  if (wrapped < a)
    return UINT_MAX;
  return wrapped;
}

/* Signed int add that clamps to [INT_MIN, INT_MAX] instead of overflowing.
   The range checks are done before the addition, so the addition itself
   never overflows. */
static inline int ssatadd32(int a, int b)
{
  /* Only a non-negative b can push the sum above INT_MAX. */
  if (b >= 0 && a > INT_MAX - b)
    return INT_MAX;

  /* Only a negative b can push the sum below INT_MIN. */
  if (b < 0 && a < INT_MIN - b)
    return INT_MIN;

  return a + b;
}

/* Unsigned int subtract that clamps at 0 instead of wrapping around. */
static inline unsigned int satsub32(unsigned int a, unsigned int b)
{
  if (b > a)
    return 0;
  return a - b;
}

/* Signed int subtract that clamps to [INT_MIN, INT_MAX] instead of
   overflowing.  The range checks are done before the subtraction. */
static inline int ssatsub32(int a, int b)
{
  /* Subtracting a non-negative b can only underflow. */
  if (b >= 0 && a < INT_MIN + b)
    return INT_MIN;

  /* Subtracting a negative b can only overflow. */
  if (b < 0 && a > INT_MAX + b)
    return INT_MAX;

  return a - b;
}

/* -------------------------------------------------------------------------- */

/*
d = vec_addc(a,b)
do i=0 to 3
  di = CarryOut(ai + bi)
end

 */

/* Per-lane carry-out of va + vb: each of the four result lanes is 1 when the
   sum of the corresponding input lanes exceeds UINT_MAX, 0 otherwise. */
static inline vector unsigned int 
__IA32_vaddcuw(vector unsigned int va, vector unsigned int vb)
{
  mix_u32 lhs, rhs, carry;
  int lane;

  lhs.v = va;
  rhs.v = vb;
  for (lane = 0; lane < 4; lane++) {
    /* Add in a wider type so the carry is not lost. */
    const unsigned long long wide =
      (unsigned long long)lhs.s[lane] + (unsigned long long)rhs.s[lane];

    if (wide > UINT_MAX)
      carry.s[lane] = 1;
    else
      carry.s[lane] = 0;
  }
  return carry.v;
}

/* 
d = vec_subc(a,b)
do i=0 to 3
  di = CarryOut(ai - bi)    (1 when no borrow occurs, 0 when a borrow occurs)
end
 */

/* Per-lane carry-out of va - vb: each of the four result lanes is 0 when the
   subtraction wraps (a borrow occurs) and 1 when it does not. */
static inline vector unsigned int 
__IA32_vsubcuw(vector unsigned int va, vector unsigned int vb)
{
  mix_u32 lhs, rhs, carry;
  int lane;

  lhs.v = va;
  rhs.v = vb;
  for (lane = 0; lane < 4; lane++) {
    /* Subtract in a wider unsigned type: a borrow shows up as a wrapped
       difference larger than UINT_MAX. */
    const unsigned long long wide =
      (unsigned long long)lhs.s[lane] - (unsigned long long)rhs.s[lane];

    if (wide > UINT_MAX)
      carry.s[lane] = 0;
    else
      carry.s[lane] = 1;
  }
  return carry.v;
}


/* -------------------------------------------------------------------------- */

/*
d = vec_adds(a,b)
n <- number of elements
do i=0 to n-1
  di <- Saturate(ai + bi)
end
 */

/* Lane-wise unsigned saturating add of four words (satadd32 per lane). */
static inline vector unsigned int
__IA32_vadduws(vector unsigned int va, vector unsigned int vb)
{
  mix_u32 lhs, rhs, out;
  int lane;

  lhs.v = va;
  rhs.v = vb;

  lane = 0;
  while (lane < 4) {
    out.s[lane] = satadd32(lhs.s[lane], rhs.s[lane]);
    lane++;
  }
  return out.v;
}

/* Lane-wise signed saturating add of four words (ssatadd32 per lane). */
static inline vector signed int
__IA32_vaddsws(vector signed int va, vector signed int vb)
{
  mix_s32 lhs, rhs, out;
  int lane;

  lhs.v = va;
  rhs.v = vb;

  lane = 0;
  while (lane < 4) {
    out.s[lane] = ssatadd32(lhs.s[lane], rhs.s[lane]);
    lane++;
  }
  return out.v;
}


/* Lane-wise unsigned saturating subtract of four words (satsub32 per lane). */
static inline vector unsigned int
__IA32_vsubuws(vector unsigned int va, vector unsigned int vb)
{
  mix_u32 lhs, rhs, out;
  int lane;

  lhs.v = va;
  rhs.v = vb;

  lane = 0;
  while (lane < 4) {
    out.s[lane] = satsub32(lhs.s[lane], rhs.s[lane]);
    lane++;
  }
  return out.v;
}

/* Lane-wise signed saturating subtract of four words (ssatsub32 per lane). */
static inline vector signed int
__IA32_vsubsws(vector signed int va, vector signed int vb)
{
  mix_s32 lhs, rhs, out;
  int lane;

  lhs.v = va;
  rhs.v = vb;

  lane = 0;
  while (lane < 4) {
    out.s[lane] = ssatsub32(lhs.s[lane], rhs.s[lane]);
    lane++;
  }
  return out.v;
}


/* -------------------------------------------------------------------------- */

/*
> · Vector Multiply Add Saturated
d = vec_madds(a,b,c)
do i=0 to 7
  di <- Saturate((ai * bi)/2^15 + ci)
end
 */

/* Multiply-high, add, saturate on eight signed halfword lanes:
   r[i] = clamp(((a[i] * b[i]) >> 15) + c[i], SHRT_MIN, SHRT_MAX).

   The whole expression is evaluated in int and clamped once at the end.
   (a*b) >> 15 reaches 32768 when both inputs are SHRT_MIN, which does not
   fit in a short; narrowing it before the add would flip it to -32768 and
   saturate the lane in the wrong direction. */
static inline vector signed short 
__IA32_vmhaddshs(vector signed short va, vector signed short vb, 
		 vector signed short vc)
{
  int i;
  mix_s16 a, b, c, r;

  a.v = va;
  b.v = vb;
  c.v = vc;

  for (i = 0; i < 8; i++) {
    int t;
    /* |a*b| <= 2^30, so the product and the following add fit in int. */
    t = (int)a.s[i] * (int)b.s[i];
    /* Relies on >> of a negative int being an arithmetic shift, as the
       original code did. */
    t = (t >> 15) + (int)c.s[i];
    if (t > SHRT_MAX)
      t = SHRT_MAX;
    else if (t < SHRT_MIN)
      t = SHRT_MIN;
    r.s[i] = (short)t;
  }
  return r.v;
}

/*
> · Vector Multiply Round and Add Saturated
d = vec_mradds(a,b,c)
do i=0 to 7
  di <- Saturate((ai * bi + 2^14)/2^15 + ci)
end
*/
/* Multiply-high with rounding, add, saturate on eight signed halfword lanes:
   r[i] = clamp(((a[i] * b[i] + 2^14) >> 15) + c[i], SHRT_MIN, SHRT_MAX).

   The whole expression is evaluated in int and clamped once at the end.
   The rounded, shifted product reaches 32768 when both inputs are SHRT_MIN,
   which does not fit in a short; narrowing it before the add would flip it
   to -32768 and saturate the lane in the wrong direction. */
static inline vector signed short 
__IA32_vmhraddshs(vector signed short va, vector signed short vb,
		  vector signed short vc)
{
  int i;
  mix_s16 a, b, c, r;

  a.v = va;
  b.v = vb;
  c.v = vc;

  for (i = 0; i < 8; i++) {
    int t;
    /* |a*b| <= 2^30, so adding the 2^14 rounding term still fits in int. */
    t = (int)a.s[i] * (int)b.s[i] + (1 << 14);
    /* Relies on >> of a negative int being an arithmetic shift, as the
       original code did. */
    t = (t >> 15) + (int)c.s[i];
    if (t > SHRT_MAX)
      t = SHRT_MAX;
    else if (t < SHRT_MIN)
      t = SHRT_MIN;
    r.s[i] = (short)t;
  }
  return r.v;
}


/* -------------------------------------------------------------------------- */

/* 
d = vec_mladd(a,b,c)

do i=0 to 7
  di <- (ai * bi) + ci
end
 */

/* Multiply-low and add on eight halfword lanes: each result lane is
   a[i] * b[i] + c[i], computed in int and then narrowed to the 16-bit lane
   (no saturation). */
static inline vector signed short 
__IA32_vmladduhm(vector signed short va, vector signed short vb, 
		 vector signed short vc)
{
  mix_s16 lhs, rhs, addend, out;
  int lane;

  lhs.v = va;
  rhs.v = vb;
  addend.v = vc;

  for (lane = 0; lane < 8; lane++) {
    const int product = lhs.s[lane] * rhs.s[lane];

    out.s[lane] = product + addend.s[lane];
  }

  return out.v;
}


/* -------------------------------------------------------------------------- */

/* 
> · Vector Multiply Sum Saturated
d = vec_msums(a,b,c)
do i=0 to 3
  di <- Saturate((a2i * b2i) + (a2i+1 * b2i+1) + ci)
end
 */

/* Unsigned multiply-sum, saturated.  For each of the four word lanes:
   r[i] = min(a[2i]*b[2i] + a[2i+1]*b[2i+1] + c[i], UINT_MAX).

   The sum is accumulated in unsigned long long and clamped once:
   - unsigned short operands promote to int, so multiplying them directly
     overflows int for large inputs (65535 * 65535 > INT_MAX);
   - adding the two products in unsigned int can wrap before the saturating
     add ever sees the true value. */
static inline vector unsigned int 
__IA32_vmsumuhs(vector unsigned short va, vector unsigned short vb,
		vector unsigned int vc)
{
  int i;
  mix_u16 a, b;
  mix_u32 c, r;

  a.v = va;
  b.v = vb;
  c.v = vc;

  for (i = 0; i < 4; i++) {
    unsigned long long sum;
    /* Worst case is below 2^34, far inside unsigned long long. */
    sum  = (unsigned long long)a.s[2*i]   * (unsigned long long)b.s[2*i];
    sum += (unsigned long long)a.s[2*i+1] * (unsigned long long)b.s[2*i+1];
    sum += (unsigned long long)c.s[i];
    r.s[i] = (sum > UINT_MAX) ? UINT_MAX : (unsigned int)sum;
  }
  return r.v;
}

/* Signed multiply-sum, saturated.  For each of the four word lanes:
   r[i] = clamp(a[2i]*b[2i] + a[2i+1]*b[2i+1] + c[i], INT_MIN, INT_MAX).

   The sum is accumulated in long long and clamped once.  Each product fits
   in int (|a*b| <= 2^30), but the sum of the two products reaches 2^31 when
   all four inputs are SHRT_MIN, which overflows int. */
static inline vector signed int 
__IA32_vmsumshs(vector signed short va, vector signed short vb,
		vector signed int vc)
{
  int i;
  mix_s16 a, b;
  mix_s32 c, r;

  a.v = va;
  b.v = vb;
  c.v = vc;

  for (i = 0; i < 4; i++) {
    long long sum;
    sum  = (long long)a.s[2*i]   * (long long)b.s[2*i];
    sum += (long long)a.s[2*i+1] * (long long)b.s[2*i+1];
    sum += (long long)c.s[i];
    if (sum > INT_MAX)
      sum = INT_MAX;
    else if (sum < INT_MIN)
      sum = INT_MIN;
    r.s[i] = (int)sum;
  }
  return r.v;
}


/* -------------------------------------------------------------------------- */



/* vec_sums */
/* 
d = vec_sums(a,b)
do i=0 to 2
  di <- 0
end
d3 <- Saturate(a0 + a1 + a2 + a3 + b3)

 */
/* Clamp x to the closed range [MIN, MAX].
   This is a function-like macro: x may be evaluated up to three times and
   MIN/MAX up to twice each, so pass only side-effect-free expressions.
   The result has the common type of the operands under the usual arithmetic
   conversions, not necessarily the type of x. */
#define SATURATE(x, MIN, MAX) ((x) < (MIN) ? (MIN) : (x) > (MAX) ? (MAX) : (x))

/* Sum across, saturated: lanes 0..2 of the result are 0 and lane 3 is
   a[0] + a[1] + a[2] + a[3] + b[3] clamped to [INT_MIN, INT_MAX]. */
static inline vector signed int 
__IA32_vsumsws (vector signed int va, vector signed int vb)
{
  mix_s32 src, acc, out;
  long long total;
  int lane;

  src.v = va;
  acc.v = vb;

  /* Accumulate in long long so the five-term sum cannot overflow. */
  total = (long long)acc.s[3];
  for (lane = 0; lane < 4; lane++)
    total += (long long)src.s[lane];

  for (lane = 0; lane < 3; lane++)
    out.s[lane] = 0;

  if (total < INT_MIN)
    out.s[3] = INT_MIN;
  else if (total > INT_MAX)
    out.s[3] = INT_MAX;
  else
    out.s[3] = total;
  return out.v;
}



/* vec_sum2s */
/* 
d = vec_sum2s(a,b)
do i=0 to 1
  d2i <- 0
  d2i+1 <- Saturate(a2i + a2i+1 + b2i+1)
end
 */

/* Sum across half, saturated: result lanes 0 and 2 are 0;
   lane 1 = clamp(a[0] + a[1] + b[1]) and lane 3 = clamp(a[2] + a[3] + b[3]),
   clamped to [INT_MIN, INT_MAX].

   Each three-term sum is formed exactly in long long and clamped once.
   Chaining two saturating adds would clamp the intermediate a+a sum and
   give the wrong answer when the third term pulls the total back into
   range (e.g. INT_MAX + 1 + (-1) must be INT_MAX, not INT_MAX - 1). */
static inline vector signed int 
__IA32_vsum2sws (vector signed int va, vector signed int vb)
{
  mix_s32 a, b, r;
  int i;

  a.v = va;
  b.v = vb;
  for (i = 0; i < 2; i++) {
    long long sum;
    sum = (long long)a.s[2*i] + (long long)a.s[2*i+1] + (long long)b.s[2*i+1];
    if (sum > INT_MAX)
      sum = INT_MAX;
    else if (sum < INT_MIN)
      sum = INT_MIN;
    r.s[2*i] = 0;
    r.s[2*i+1] = (int)sum;
  }
  return r.v;
}




/* vec_sum4s */
/*
d = vec_sum4s(a,b)
* For a with 8-bit elements:
do i=0 to 3
  di <- Saturate (a4i+ a4i+1 + a4i+2 + a4i+3 + bi)
end

* For a with 16-bit elements:
do i=0 to 3
  di <- Saturate(a2i+ a2i+1 + bi)
end
*/

/* Sum across quarter, unsigned bytes: each result word is the saturating
   (satadd32) sum of b[i] and the four consecutive bytes a[4i..4i+3]. */
static inline vector unsigned int
__IA32_vsum4ubs(vector unsigned char va, vector unsigned int vb)
{
  mix_u8 bytes;
  mix_u32 acc, out;
  int lane;

  bytes.v = va;
  acc.v = vb;
  for (lane = 0; lane < 4; lane++) {
    const int base = 4 * lane;
    /* Four bytes summed in int, as in the unrolled form. */
    const int quad = (int)bytes.s[base] + (int)bytes.s[base + 1]
                   + (int)bytes.s[base + 2] + (int)bytes.s[base + 3];

    out.s[lane] = satadd32(acc.s[lane], quad);
  }
  return out.v;
}


/* Sum across quarter, signed bytes: each result word is the saturating
   (ssatadd32) sum of b[i] and the four consecutive bytes a[4i..4i+3]. */
static inline vector signed int
__IA32_vsum4sbs(vector signed char va, vector signed int vb)
{
  mix_s8 bytes;
  mix_s32 acc, out;
  int lane;

  bytes.v = va;
  acc.v = vb;
  for (lane = 0; lane < 4; lane++) {
    const int base = 4 * lane;
    /* Four bytes summed in int, as in the unrolled form. */
    const int quad = (int)bytes.s[base] + (int)bytes.s[base + 1]
                   + (int)bytes.s[base + 2] + (int)bytes.s[base + 3];

    out.s[lane] = ssatadd32(acc.s[lane], quad);
  }
  return out.v;
}


/* Sum across quarter, signed halfwords: each result word is the saturating
   (ssatadd32) sum of b[i] and the halfword pair a[2i], a[2i+1]. */
static inline vector signed int
__IA32_vsum4shs(vector signed short va, vector signed int vb)
{
  mix_s16 halves;
  mix_s32 acc, out;
  int lane;

  halves.v = va;
  acc.v = vb;
  for (lane = 0; lane < 4; lane++) {
    /* Pair summed in int, as in the unrolled form. */
    const int pair = (int)halves.s[2 * lane] + (int)halves.s[2 * lane + 1];

    out.s[lane] = ssatadd32(acc.s[lane], pair);
  }
  return out.v;
}


/* -------------------------------------------------------------------------- */


/* 
> · Vector Convert to Unsigned Fixed-Point Word Saturated
d = vec_ctu(a,b)
do i=0 to 3
  di <- Saturate (ai * 2^b)
end
 */

/* Convert four floats to unsigned fixed-point words with saturation:
   r[i] = clamp(trunc(a[i] * 2^b), 0, UINT_MAX).

   b is the scale exponent and must be in 0..31 (the shift below is
   undefined outside that range).

   Saturation is done with explicit comparisons in double rather than with
   the SATURATE macro: with a float operand that macro yields
   (float)UINT_MAX, which rounds up to 2^32, and converting 2^32 (or a NaN)
   to unsigned int is undefined behaviour.  Here out-of-range values never
   reach the float-to-integer conversion; NaN and negative inputs give 0. */
static inline vector unsigned int 
__IA32_vctuxs(vector float va, int b)
{
  int i;
  mix_f32 a;
  mix_u32 r;
  /* Unsigned shift so that b == 31 is well defined. */
  const double scale = (double)(1u << b);
  /* First value that no longer fits in unsigned int. */
  const double limit = (double)UINT_MAX + 1.0;
  a.v = va;

  for (i = 0; i < 4; i++) {
    /* float * 2^b is exact in double for b in 0..31. */
    const double x = (double)a.s[i] * scale;

    if (x >= limit)
      r.s[i] = UINT_MAX;            /* too large, including +Inf */
    else if (x > 0.0)
      r.s[i] = (unsigned int)x;     /* in range: truncates toward zero */
    else
      r.s[i] = 0;                   /* zero, negative, or NaN */
  }
  return r.v;
}


/* -------------------------------------------------------------------------- */


/* 
> · pack/unpack pixels: vec_packpx, vec_vupkhpx, vec_vupklpx  
d = vec_packpx(a,b)

do i=0 to 3
  d[i] <- ai[7] || ai[8:12] || ai[16:20] || ai[24:28]
  d[i+4] <- bi[7] || bi[8:12] || bi[16:20] || bi[24:28]
end
 */
/* Overlay used to reach the individual elements of a `vector pixel`:
   write or read the vector through `v`, the eight 16-bit pixels through `s`. */
typedef union {
  vector pixel v;        /* the value as a whole vector */
  unsigned short s[8];   /* the same storage viewed as eight 16-bit elements */
} mix_pix;

/* Pack eight 32-bit pixels (four from va, then four from vb) into eight
   16-bit 1:5:5:5 pixels.  From each word it takes bit 24 and the top five
   bits of each of the three low bytes. */
static inline vector pixel 
__IA32_vpkpx(vector unsigned int va, vector unsigned int vb)
{
  mix_u32 first, second;
  mix_pix packed;
  int lane;

  first.v = va;
  second.v = vb;

  for (lane = 0; lane < 4; lane++) {
    const unsigned int wa = first.s[lane];
    const unsigned int wb = second.s[lane];

    /* Results from va fill elements 0..3. */
    packed.s[lane] = (((wa >> 24) & 0x1) << 15)
                   | (((wa >> 19) & 0x1f) << 10)
                   | (((wa >> 11) & 0x1f) << 5)
                   | ((wa >> 3) & 0x1f);

    /* Results from vb fill elements 4..7. */
    packed.s[lane + 4] = (((wb >> 24) & 0x1) << 15)
                       | (((wb >> 19) & 0x1f) << 10)
                       | (((wb >> 11) & 0x1f) << 5)
                       | ((wb >> 3) & 0x1f);
  }
  return packed.v;
}

/* 
vec_unpackh
Vector Unpack High Element
d = vec_unpackh(a)

Pixel value:
do i=0 to 3
  di <- SignExtend(ai[0]) || 000 || ai[1:5] || 000 || ai[6:10] || 000 || ai[11:15]
end

 */

/* Unpack the four high 1:5:5:5 pixels (elements 0..3 of va) into 32-bit
   words laid out as
     [31:24] the 1-bit field replicated into all eight bits (sign extension)
     [23:16] 000 || 5-bit field from pixel bits 14..10
     [15:8]  000 || 5-bit field from pixel bits 9..5
     [7:0]   000 || 5-bit field from pixel bits 4..0
   The sign-extension mask must be 0xff000000; the previous 0xf000 landed in
   the second byte, on top of the green field. */
static inline vector unsigned int 
__IA32_vupkhpx(vector pixel va)
{
  int i;
  mix_pix a;
  mix_u32 r;
  a.v = va;

  for (i = 0; i < 4; i++) {
    unsigned int A, R, G, B;
    A = (a.s[i] >> 15) & 0x1;
    R = (a.s[i] >> 10) & 0x1f;
    G = (a.s[i] >>  5) & 0x1f;
    B = a.s[i] & 0x1f;
    r.s[i] = (A ? 0xff000000u : 0u) | (R << 16) | (G << 8) | B;
  }
  return r.v;
}


/* 
vec_unpackl Vector Unpack Low Element
d = vec_unpackl(a)
Pixel value:
do i=0 to 3
  di <- SignExtend(ai+n[0]) || 000 || ai+n[1:5] || 000 || ai+n[6:10] || 000 || ai+n[11:15]
end
 */

/* Unpack the four low 1:5:5:5 pixels (elements 4..7 of va) into 32-bit
   words laid out as
     [31:24] the 1-bit field replicated into all eight bits (sign extension)
     [23:16] 000 || 5-bit field from pixel bits 14..10
     [15:8]  000 || 5-bit field from pixel bits 9..5
     [7:0]   000 || 5-bit field from pixel bits 4..0
   Two fixes relative to the previous version: it read elements 0..3 (making
   it identical to the "high" unpack) instead of 4..7, and it used 0xf000 as
   the sign-extension mask instead of 0xff000000. */
static inline vector unsigned int 
__IA32_vupklpx(vector pixel va)
{
  int i;
  mix_pix a;
  mix_u32 r;
  a.v = va;

  for (i = 0; i < 4; i++) {
    /* Low half: source elements 4..7, matching the layout used when
       packing, where the second operand fills elements 4..7. */
    const unsigned int px = a.s[i + 4];
    unsigned int A, R, G, B;
    A = (px >> 15) & 0x1;
    R = (px >> 10) & 0x1f;
    G = (px >>  5) & 0x1f;
    B = px & 0x1f;
    r.s[i] = (A ? 0xff000000u : 0u) | (R << 16) | (G << 8) | B;
  }
  return r.v;
}



/* -------------------------------------------------------------------------- */


/*
> · vector pack saturated unsigned: vec_vpkshus, vec_vpkswus, vec_vpkuwus  
d = vec_packsu(a,b)
n <- number of elements in a
do i=0 to n-1
  di <- Saturate(ai)
  di+n <- Saturate(bi)
end
 */

/* Pack sixteen signed halfwords (eight from va, then eight from vb) into
   unsigned bytes, clamping each value to [0, UCHAR_MAX]. */
static inline vector unsigned char 
__IA32_vpkshus(vector signed short va, vector signed short vb)
{
  mix_s16 first, second;
  mix_u8 packed;
  int lane;

  first.v = va;
  second.v = vb;

  for (lane = 0; lane < 8; lane++) {
    int lo = first.s[lane];
    int hi = second.s[lane];

    if (lo < 0)
      lo = 0;
    else if (lo > UCHAR_MAX)
      lo = UCHAR_MAX;

    if (hi < 0)
      hi = 0;
    else if (hi > UCHAR_MAX)
      hi = UCHAR_MAX;

    packed.s[lane] = lo;        /* va fills bytes 0..7  */
    packed.s[lane + 8] = hi;    /* vb fills bytes 8..15 */
  }

  return packed.v;
}


/* Pack eight signed words (four from va, then four from vb) into unsigned
   halfwords, clamping each value to [0, USHRT_MAX]. */
static inline vector unsigned short 
__IA32_vpkswus(vector signed int va, vector signed int vb)
{
  mix_s32 first, second;
  mix_u16 packed;
  int lane;

  first.v = va;
  second.v = vb;

  for (lane = 0; lane < 4; lane++) {
    int lo = first.s[lane];
    int hi = second.s[lane];

    if (lo < 0)
      lo = 0;
    else if (lo > USHRT_MAX)
      lo = USHRT_MAX;

    if (hi < 0)
      hi = 0;
    else if (hi > USHRT_MAX)
      hi = USHRT_MAX;

    packed.s[lane] = lo;        /* va fills halfwords 0..3 */
    packed.s[lane + 4] = hi;    /* vb fills halfwords 4..7 */
  }

  return packed.v;
}


/* Pack eight unsigned words (four from va, then four from vb) into unsigned
   halfwords, clamping each value at USHRT_MAX. */
static inline vector unsigned short 
__IA32_vpkuwus(vector unsigned int va, vector unsigned int vb)
{
  mix_u32 first, second;
  mix_u16 packed;
  int lane;

  first.v = va;
  second.v = vb;

  for (lane = 0; lane < 4; lane++) {
    unsigned int lo = first.s[lane];
    unsigned int hi = second.s[lane];

    if (lo > USHRT_MAX)
      lo = USHRT_MAX;
    if (hi > USHRT_MAX)
      hi = USHRT_MAX;

    packed.s[lane] = lo;        /* va fills halfwords 0..3 */
    packed.s[lane + 4] = hi;    /* vb fills halfwords 4..7 */
  }

  return packed.v;
}
