/******************************************************************************\
* cat_block_matching.c                                                         *
*                                                                              *
* CURSOR ACTIVITY TRACKING                                                     *
*                                                                              *
* Block matching algorithms for mouse cursor detection                         *
*                                                                              *
* Copyright 2005 Peppercon AG                                                  *
* Thomas Weber tweb@peppercon.de                                               *
\******************************************************************************/

#include <sys/types.h>

#include "iipptr_bitmap.h"
#include "cat_internal.h"
#include "cat_debug.h"

#if !defined(CAT_FB_DIRECT)

/***** MSE ********************************************************************/

/**
 * Locate a masked shape inside a rectangular search area of the screen.
 *
 * The comparison value is the sum of squared R/G/B differences, accumulated
 * only for shape pixels whose mask red component is non-zero. The search
 * runs in two passes:
 *  - coarse pass: candidate positions and shape pixels are both sampled
 *    every `iter` pixels; a candidate is dropped as soon as its sum exceeds
 *    the running minimum, which starts at mse_count * threshold^2 * 3 * 4.
 *  - fine pass (only if iter > 1): every position in the window
 *    [coarse - iter, coarse + iter), clipped to the search area, is compared
 *    pixel by pixel; the running minimum starts at
 *    mse_count * threshold^2 * 3.
 *
 * @param screen        bitmap that is searched
 * @param shape         bitmap that is looked for
 * @param mask          same dimensions as shape; a non-zero red component
 *                      marks a shape pixel that takes part in the comparison
 * @param search_left   left edge of the search area on screen
 * @param search_top    top edge of the search area on screen
 * @param search_width  width of the search area
 * @param search_height height of the search area
 * @param threshold     per-channel difference the acceptance limits are
 *                      derived from
 * @param found         receives the position of the match on success
 *
 * @return the final running minimum divided by (3 * mse_count), or PP_ERR
 *         if the dimensions do not fit, the mask selects no pixel, or no
 *         candidate stays within the limit.
 *         NOTE(review): when the fine pass is skipped (iter == 1) the running
 *         minimum has just been reset to the fine limit, so the returned
 *         value is derived from threshold rather than from the coarse match
 *         - confirm this is intended.
 */
int cat_mse32(t_bitmap *screen, t_bitmap *shape, t_bitmap *mask, 
              u_int32_t search_left, u_int32_t search_top,
              u_int32_t search_width, u_int32_t search_height, 
              u_int32_t threshold, t_point *found) {
    t_point match;
    u_int32_t mse_min, mse_count, mse, shape_offset, screen_offset, found1;
    u_int32_t x, y, ox, oy, iter, stopx, stopy, shape_tmp, screen_tmp;
    u_int32_t coarse_x, coarse_y;
    u_int32_t fine_left, fine_top, fine_right, fine_bottom;
    int32_t dr, dg, db;
    
    assert(screen);
    assert(screen->rgb);
    assert(shape);
    assert(shape->rgb);
    assert(mask);
    assert(mask->rgb);
    assert(shape->width == mask->width);
    assert(shape->height == mask->height);
    assert(found);
    
    if(shape->width > search_width || shape->height > search_height ||
       search_left + search_width > screen->width ||
       search_top + search_height > screen->height) {
        /* wrong dimensions */
        CDMSG(CAT_CDMSG_BLOCK_MATCHING_INFO,
              "wrong dimensions of shape or search area!\n");
        return PP_ERR;
    }
    
    /* count unmasked shape pixels */
    for(oy = 0, mse_count = 0; oy < shape->height; ++oy) {
        for(ox = 0; ox < shape->width; ++ox) {
            /* all positions of shape */
            if(mask->rgb[oy * mask->width + ox].r) {
                /* all valid position in mask */
                ++mse_count;
            }
        }
    }
    
    if(!mse_count) {
        /* nothing to compare; the normalization at the end would divide
           by zero */
        CDMSG(CAT_CDMSG_BLOCK_MATCHING_INFO,
              "mask does not select any shape pixel!\n");
        return PP_ERR;
    }
    
    /* init */
    stopx = search_left + search_width - shape->width + 1;
    stopy = search_top + search_height - shape->height + 1;
    mse_min = mse_count * threshold * threshold * 3 * 4;
    found1 = 0;
    
    /**
     * init coarse iterator...
     * the larger the unmasked area, the larger the iter... 
     * smaller side of shape is MIN(shape->width, shape->height)
     * iter should at least reach 3 times in min side
     * => iter = (MIN(shape->width, shape->height) / 3)
     * coverage = mse count / shape size
     * => iter = ? - (shape size / mse count)
     * we want iter to be at most CAT_BLOCK_MATCHING_MAX_ITER...
     * => iter = CAT_BLOCK_MATCHING_MAX_ITER
     */
    iter = MIN(MIN(shape->width, shape->height) / 3, 
               (u_int)((CAT_BLOCK_MATCHING_MAX_ITER * mse_count) /
                       (float)(shape->width * shape->height) + 0.5));
    if(iter < 1) {
        /* shapes with a side below 3 pixels or sparse masks yield 0, and a
           zero step would never advance the loops below */
        iter = 1;
    }

    /* coarse search */
    for(y = search_top; y < stopy; y += iter) {
        for(x = search_left; x < stopx; x += iter) {
            /* all possible positions (x; y) of shape in search space */
            
            mse = 0;
            for(oy = 0; oy < shape->height; oy += iter) {
                shape_tmp = oy * shape->width;
                screen_tmp = (y + oy) * screen->width + x;
                for(ox = 0; ox < shape->width; ox += iter) {
                    /* all positions of shape */
                    
                    shape_offset = shape_tmp + ox;
                    if(mask->rgb[shape_offset].r) {
                        /* all valid position in mask */
                        
                        assert((u_int16_t)(x + ox) < screen->width);
                        assert((u_int16_t)(y + oy) < screen->height);
                        screen_offset = screen_tmp + ox;
                        dr = screen->rgb[screen_offset].r -
                             shape->rgb[shape_offset].r;
                        dg = screen->rgb[screen_offset].g -
                             shape->rgb[shape_offset].g;
                        db = screen->rgb[screen_offset].b -
                             shape->rgb[shape_offset].b;
                        mse += dr * dr + dg * dg + db * db;
                        if(mse > mse_min) {
                            /* already worse than min, bail out */
                            goto bail_c;
                        }
                    }
                }
            }
            
            mse_min = mse;
            match.pos_x = x;
            match.pos_y = y;
            ++found1;
 bail_c:;
        }
    }

    if(!found1) {
        /* we didn't find pattern */
        CDMSG(CAT_CDMSG_BLOCK_MATCHING_INFO,
              "we didn't find pattern in coarse search!\n");
        return PP_ERR;
    } else {
        CDMSG(CAT_CDMSG_BLOCK_MATCHING_INFO, 
              "found pattern in coarse search @(%4d; %3d) with iter %d!\n",
              match.pos_x, match.pos_y, iter);
    }
    
    mse_min = mse_count * threshold * threshold * 3;
    found1 = 0;
   
    /* fine search */
    if(iter > 1) {
        /* if iter == 1 coarse search was fine enough ;-) */
        
        /* Fix the window around the coarse match before the loops start:
           match is overwritten while searching, so bounds derived from it
           inside the loop headers would move with every hit. */
        coarse_x = match.pos_x;
        coarse_y = match.pos_y;
        
        /* lower bounds without unsigned wrap-around near the origin */
        fine_left = (coarse_x > search_left + iter) ? coarse_x - iter
                                                    : search_left;
        fine_top = (coarse_y > search_top + iter) ? coarse_y - iter
                                                  : search_top;
        fine_right = MIN(stopx, coarse_x + iter);
        fine_bottom = MIN(stopy, coarse_y + iter);
        
        for(y = fine_top; y < fine_bottom; ++y) {
            for(x = fine_left; x < fine_right; ++x) {
                /* all possible positions (x; y) of shape in search space */
                
                mse = 0;
                for(oy = 0; oy < shape->height; ++oy) {
                    shape_tmp = oy * shape->width;
                    screen_tmp = (y + oy) * screen->width + x;
                    for(ox = 0; ox < shape->width; ++ox) {
                        /* all positions of shape */
                        
                        shape_offset = shape_tmp + ox;
                        if(mask->rgb[shape_offset].r) {
                            /* all valid position in mask */
                            
                            assert((u_int16_t)(x + ox) < screen->width);
                            assert((u_int16_t)(y + oy) < screen->height);
                            screen_offset = screen_tmp + ox;
                            dr = screen->rgb[screen_offset].r -
                                 shape->rgb[shape_offset].r;
                            dg = screen->rgb[screen_offset].g -
                                 shape->rgb[shape_offset].g;
                            db = screen->rgb[screen_offset].b -
                                 shape->rgb[shape_offset].b;
                            mse += dr * dr + dg * dg + db * db;
                            if(mse > mse_min) {
                                /* already worse than min, bail out */
                                goto bail_f;
                            }
                        }
                    }
                }
                
                mse_min = mse;
                match.pos_x = x;
                match.pos_y = y;
                ++found1;
     bail_f:;
            }
        }
        
        if(!found1) {
            /* we didn't find pattern */
            CDMSG(CAT_CDMSG_BLOCK_MATCHING_INFO,
                  "we didn't find pattern in fine search!\n");
            return PP_ERR;
        }
    }
        
    mse_min /= 3 * mse_count; /* normalize */
    found->pos_x = match.pos_x;
    found->pos_y = match.pos_y;
    
    CDMSG(CAT_CDMSG_BLOCK_MATCHING_INFO, "found shape @(%4d, %3d), mse = %d!\n",
          match.pos_x, match.pos_y, mse_min);

    return mse_min;
}
#if defined(CAT_BLOCK_MATCHING_LOCAL)
/*
#define MIN(x, y) ((x) > (y) ? (y) : (x))
#define MAX(x, y) ((x) < (y) ? (y) : (x))
#define SET_WITHIN(i, min, max) (MIN(max, (MAX(min, i))))
#define SET_WITHIN__(i, min, max) ((i) < (min) ? (min) : ((i) > (max) ? (max) : (i)))
*/
/***** MSE ********************************************************************/
/**
 * Exhaustive masked search using the sum of squared R/G/B differences
 * (16 bit loop counters and differences).
 *
 * Every position of the shape inside the search area is tried; only shape
 * pixels whose mask red component is non-zero are compared. A candidate is
 * abandoned as soon as its sum exceeds the best sum seen so far.
 *
 * @param screen        bitmap that is searched
 * @param shape         bitmap that is looked for
 * @param mask          same dimensions as shape, selects the compared pixels
 * @param search_left   left edge of the search area on screen
 * @param search_top    top edge of the search area on screen
 * @param search_width  width of the search area
 * @param search_height height of the search area
 * @param threshold     not used by this variant
 * @param found         receives the position of the best match
 *
 * @return the smallest sum divided by (3 * number of unmasked pixels), or
 *         PP_ERR if the shape does not fit into the search area or the mask
 *         selects no pixel
 */
int cat_mse(t_bitmap *screen, t_bitmap *shape, t_bitmap *mask, 
            u_int16_t search_left, u_int16_t search_top,
            u_int16_t search_width, u_int16_t search_height, 
            u_int32_t threshold , t_point *found) {
    t_point match;
    /* -1 wraps to the largest value, so the first candidate always wins */
    u_int32_t mse_min = -1, mse_count, mse, shape_offset, screen_offset;
    u_int16_t x, y, ox, oy;
    int16_t dr, dg, db;
    
    assert(screen);
    assert(screen->rgb);
    assert(shape);
    assert(shape->rgb);
    assert(mask);
    assert(mask->rgb);
    assert(shape->width == mask->width);
    assert(shape->height == mask->height);
    assert(found);
    
    if(shape->width > search_width ||shape->height > search_height) {
        return PP_ERR;
    }
    
    /* count unmasked shape pixels */
    for(oy = 0, mse_count = 0; oy < shape->height; ++oy) {
        for(ox = 0; ox < shape->width; ++ox) {
            /* all positions of shape */
            if(mask->rgb[oy * mask->width + ox].r) {
                /* all valid position in mask */
                ++mse_count;
            }
        }
    }
    
    if(!mse_count) {
        /* empty mask: nothing can be compared and the normalization below
           would divide by zero */
        return PP_ERR;
    }
    
    for(y = search_top; y <= search_top + search_height - shape->height; ++y) {
        for(x = search_left; 
            x <= search_left + search_width - shape->width; ++x) {
            /* all possible positions (x; y) of shape in search space */
            
            mse = 0;
            for(oy = 0; oy < shape->height; ++oy) {
                for(ox = 0; ox < shape->width; ++ox) {
                    /* all positions of shape */
                    
                    shape_offset = oy * shape->width + ox;
                    if(mask->rgb[shape_offset].r) {
                        /* all valid position in mask */
                        
                        assert((u_int16_t)(x + ox) < screen->width);
                        assert((u_int16_t)(y + oy) < screen->height);
                        screen_offset = (y + oy) * screen->width + (x + ox);
                        dr = screen->rgb[screen_offset].r -
                             shape->rgb[shape_offset].r;
                        dg = screen->rgb[screen_offset].g -
                             shape->rgb[shape_offset].g;
                        db = screen->rgb[screen_offset].b -
                             shape->rgb[shape_offset].b;
                        mse += dr * dr + dg * dg + db * db;
                        if(mse > mse_min) {
                            /* already worse than min, bail out */
                            goto bail;
                        }
                    }
                }
            }
            
            /* candidate is at least as good as the best one so far */
            mse_min = mse;
            match.pos_x = x;
            match.pos_y = y;
 bail:;
        }
    }
    
    mse_min /= 3 * mse_count; /* normalize */
    found->pos_x = match.pos_x;
    found->pos_y = match.pos_y;
    
    return mse_min;
}

/***** MSE ********************************************************************/
/**
 * Exhaustive masked search using the sum of squared R/G/B differences
 * (32 bit loop counters and differences).
 *
 * Every position of the shape inside the search area is tried; only shape
 * pixels whose mask red component is non-zero are compared. A candidate is
 * abandoned as soon as its sum exceeds the best sum seen so far.
 *
 * @param screen        bitmap that is searched
 * @param shape         bitmap that is looked for
 * @param mask          same dimensions as shape, selects the compared pixels
 * @param search_left   left edge of the search area on screen
 * @param search_top    top edge of the search area on screen
 * @param search_width  width of the search area
 * @param search_height height of the search area
 * @param threshold     not used by this variant
 * @param found         receives the position of the best match
 *
 * @return the smallest sum divided by (3 * number of unmasked pixels), or
 *         PP_ERR if the shape does not fit into the search area or the mask
 *         selects no pixel
 */
int cat_mse32_old(t_bitmap *screen, t_bitmap *shape, t_bitmap *mask, 
                  u_int16_t search_left, u_int16_t search_top,
                  u_int16_t search_width, u_int16_t search_height, 
                  u_int32_t threshold , t_point *found) {
    t_point match;
    /* -1 wraps to the largest value, so the first candidate always wins */
    u_int32_t mse_min = -1, mse_count, mse, shape_offset, screen_offset;
    u_int32_t x, y, ox, oy;
    int32_t dr, dg, db;
    
    assert(screen);
    assert(screen->rgb);
    assert(shape);
    assert(shape->rgb);
    assert(mask);
    assert(mask->rgb);
    assert(shape->width == mask->width);
    assert(shape->height == mask->height);
    assert(found);
    
    if(shape->width > search_width ||shape->height > search_height) {
        return PP_ERR;
    }
    
    /* count unmasked shape pixels */
    for(oy = 0, mse_count = 0; oy < shape->height; ++oy) {
        for(ox = 0; ox < shape->width; ++ox) {
            /* all positions of shape */
            if(mask->rgb[oy * mask->width + ox].r) {
                /* all valid position in mask */
                ++mse_count;
            }
        }
    }
    
    if(!mse_count) {
        /* empty mask: nothing can be compared and the normalization below
           would divide by zero */
        return PP_ERR;
    }
    
    for(y = search_top; y <= search_top + search_height - shape->height; ++y) {
        for(x = search_left; 
            x <= search_left + search_width - shape->width; ++x) {
            /* all possible positions (x; y) of shape in search space */
            
            mse = 0;
            for(oy = 0; oy < shape->height; ++oy) {
                for(ox = 0; ox < shape->width; ++ox) {
                    /* all positions of shape */
                    
                    shape_offset = oy * shape->width + ox;
                    if(mask->rgb[shape_offset].r) {
                        /* all valid position in mask */
                        
                        assert((u_int16_t)(x + ox) < screen->width);
                        assert((u_int16_t)(y + oy) < screen->height);
                        screen_offset = (y + oy) * screen->width + (x + ox);
                        dr = screen->rgb[screen_offset].r -
                             shape->rgb[shape_offset].r;
                        dg = screen->rgb[screen_offset].g -
                             shape->rgb[shape_offset].g;
                        db = screen->rgb[screen_offset].b -
                             shape->rgb[shape_offset].b;
                        mse += dr * dr + dg * dg + db * db;
                        if(mse > mse_min) {
                            /* already worse than min, bail out */
                            goto bail;
                        }
                    }
                }
            }
            
            /* candidate is at least as good as the best one so far */
            mse_min = mse;
            match.pos_x = x;
            match.pos_y = y;
 bail:;
        }
    }
    
    mse_min /= 3 * mse_count; /* normalize */
    found->pos_x = match.pos_x;
    found->pos_y = match.pos_y;
    
    return mse_min;
}

/**
 * Sum of squared R/G/B differences between the shape and the screen region
 * whose top-left corner is (x_offset; y_offset).
 *
 * Shape rows and columns are visited in steps of iter; only pixels whose
 * mask red component is non-zero contribute.
 *
 * @param screen   bitmap the shape is compared against
 * @param shape    bitmap that is looked for
 * @param mask     selects the shape pixels that are compared
 * @param x_offset column of the candidate position on screen
 * @param y_offset row of the candidate position on screen
 * @param ssd_min  abort limit; the walk stops once the sum exceeds it
 * @param iter     sampling step inside the shape
 *
 * @return the accumulated sum, or PP_ERR as soon as it exceeds ssd_min
 */
int cat_ssd32_sub(t_bitmap *screen, t_bitmap *shape, t_bitmap *mask, 
                  u_int32_t x_offset, u_int32_t y_offset, 
                  u_int32_t ssd_min, u_int32_t iter) {
    u_int32_t row, col;
    u_int32_t shape_row, screen_row;
    u_int32_t shape_idx, screen_idx;
    int32_t diff_r, diff_g, diff_b;
    int32_t total = 0;

    row = 0;
    while(row < shape->height) {
        /* index of the first pixel of this row in both bitmaps */
        shape_row = row * shape->width;
        screen_row = (y_offset + row) * screen->width + x_offset;

        for(col = 0; col < shape->width; col += iter) {
            shape_idx = shape_row + col;
            if(!mask->rgb[shape_idx].r) {
                /* masked out, does not take part in the comparison */
                continue;
            }

            assert((u_int16_t)(x_offset + col) < screen->width);
            assert((u_int16_t)(y_offset + row) < screen->height);

            screen_idx = screen_row + col;
            diff_r = screen->rgb[screen_idx].r - shape->rgb[shape_idx].r;
            diff_g = screen->rgb[screen_idx].g - shape->rgb[shape_idx].g;
            diff_b = screen->rgb[screen_idx].b - shape->rgb[shape_idx].b;
            total += diff_r * diff_r + diff_g * diff_g + diff_b * diff_b;

            if((u_int32_t)total > ssd_min) {
                /* cannot beat the current minimum any more */
                return PP_ERR;
            }
        }

        row += iter;
    }

    return total;
}

/**
 * Two-pass masked shape search built on cat_ssd32_sub().
 *
 *  - coarse pass: candidate positions and shape pixels are both sampled
 *    every `iter` pixels; the abort limit starts at
 *    4 * threshold^2 * 3 * mse_count.
 *  - fine pass (only if iter > 1): every position in the window
 *    [coarse - 2 * iter, coarse + 2 * iter), clipped to the search area, is
 *    compared pixel by pixel; the abort limit starts at
 *    threshold^2 * 3 * mse_count.
 *
 * @param screen        bitmap that is searched
 * @param shape         bitmap that is looked for
 * @param mask          same dimensions as shape; a non-zero red component
 *                      marks a shape pixel that takes part in the comparison
 * @param search_left   left edge of the search area on screen
 * @param search_top    top edge of the search area on screen
 * @param search_width  width of the search area
 * @param search_height height of the search area
 * @param threshold     per-channel difference the abort limits are derived
 *                      from
 * @param found         receives the position of the match on success
 *
 * @return the final minimum divided by (3 * mse_count), or PP_ERR if the
 *         dimensions do not fit, the mask selects no pixel, or no candidate
 *         stays within the limit.
 *         NOTE(review): when the fine pass is skipped (iter == 1) the minimum
 *         has just been reset to the fine limit, so the returned value is
 *         derived from threshold rather than from the coarse match - confirm
 *         this is intended.
 */
int cat_mse32_s(t_bitmap *screen, t_bitmap *shape, t_bitmap *mask, 
                u_int32_t search_left, u_int32_t search_top,
                u_int32_t search_width, u_int32_t search_height, 
                u_int32_t threshold, t_point *found) {
    t_point match;
    u_int32_t ssd_min, mse_count, mse, found1;
    u_int32_t x, y, ox, oy, iter, stopx, stopy;
    u_int32_t coarse_x, coarse_y, reach;
    u_int32_t fine_left, fine_top, fine_right, fine_bottom;
    int ssd;
    
    assert(screen);
    assert(screen->rgb);
    assert(shape);
    assert(shape->rgb);
    assert(mask);
    assert(mask->rgb);
    assert(shape->width == mask->width);
    assert(shape->height == mask->height);
    assert(found);
    
    /* shape must fit into the search area, which must lie on the screen */
    if(shape->width > search_width || shape->height > search_height ||
       search_left + search_width > screen->width ||
       search_top + search_height > screen->height) {
        return PP_ERR;
    }
    
    /* count unmasked shape pixels */
    for(oy = 0, mse_count = 0; oy < shape->height; ++oy) {
        for(ox = 0; ox < shape->width; ++ox) {
            /* all positions of shape */
            if(mask->rgb[oy * mask->width + ox].r) {
                /* all valid position in mask */
                ++mse_count;
            }
        }
    }
    
    if(!mse_count) {
        /* nothing to compare; the normalization at the end would divide
           by zero */
        return PP_ERR;
    }
    
    /* init */
    stopx = search_left + search_width - shape->width + 1;
    stopy = search_top + search_height - shape->height + 1;
    /* init min to double threshold squared for each color channel and pixel */
    ssd_min = 4 * threshold * threshold * 3 * mse_count;
    found1 = 0;
    
    /**
     * init coarse iterator...
     * the larger the unmasked area, the larger the iter... 
     * smaller side of shape is MIN(shape->width, shape->height)
     * iter should at least reach 3 times in min side
     * => iter = (MIN(shape->width, shape->height) / 3)
     * coverage = mse count / shape size
     * => iter = ? - (shape size / mse count)
     * we want iter to be at most CAT_BLOCK_MATCHING_MAX_ITER...
     * => iter = CAT_BLOCK_MATCHING_MAX_ITER
     */
    iter = MIN(MIN(shape->width, shape->height) / 3, 
               (u_int)((CAT_BLOCK_MATCHING_MAX_ITER * mse_count) /
                       (float)(shape->width * shape->height) + 0.5));
    if(iter < 1) {
        /* shapes with a side below 3 pixels or sparse masks yield 0, and a
           zero step would never advance the loops below */
        iter = 1;
    }
    
    /* coarse search */
    for(y = search_top; y < stopy; y += iter) {
        for(x = search_left; x < stopx; x += iter) {
            /* all possible positions (x; y) of shape in search space */
            
            if(PP_ERR != (ssd = cat_ssd32_sub(screen, shape, mask, 
                                              x, y, ssd_min, iter))) {
                ssd_min = ssd;
                match.pos_x = x;
                match.pos_y = y;
                ++found1;
            }
        }
    }

    if(!found1) {
        /* we didn't find pattern */
        return PP_ERR;
    }
    
    /* init min to threshold squared for each color channel and pixel */
    ssd_min = threshold * threshold * 3 * mse_count;
    found1 = 0;
   
    /* fine search */
    if(iter > 1) {
        /* if iter == 1 coarse search was fine enough ;-) */
        
        /* Fix the window around the coarse match before the loops start:
           match is overwritten while searching, so bounds derived from it
           inside the loop headers would move with every hit. */
        coarse_x = match.pos_x;
        coarse_y = match.pos_y;
        reach = iter * 2;
        
        /* lower bounds without unsigned wrap-around near the origin */
        fine_left = (coarse_x > search_left + reach) ? coarse_x - reach
                                                     : search_left;
        fine_top = (coarse_y > search_top + reach) ? coarse_y - reach
                                                   : search_top;
        fine_right = MIN(stopx, coarse_x + reach);
        fine_bottom = MIN(stopy, coarse_y + reach);
        
        for(y = fine_top; y < fine_bottom; ++y) {
            for(x = fine_left; x < fine_right; ++x) {
                /* all possible positions (x; y) of shape in search space */

                if(PP_ERR != (ssd = cat_ssd32_sub(screen, shape, mask, 
                                                  x, y, ssd_min, 1))) {
                    ssd_min = ssd;
                    match.pos_x = x;
                    match.pos_y = y;
                    ++found1;
                }
            }
        }

        if(!found1) {
            /* we didn't find pattern */
            return PP_ERR;
        }
    }
    
    mse = ssd_min / (3 * mse_count); /* normalize */
    found->pos_x = match.pos_x;
    found->pos_y = match.pos_y;
    
    return mse;
}

/***** SAD ********************************************************************/
/**
 * Exhaustive masked search using the sum of absolute R/G/B differences
 * (16 bit loop counters and differences).
 *
 * Every position of the shape inside the search area is tried; only shape
 * pixels whose mask red component is non-zero are compared. A candidate is
 * abandoned as soon as its sum exceeds the best sum seen so far.
 *
 * @param screen        bitmap that is searched
 * @param shape         bitmap that is looked for
 * @param mask          same dimensions as shape, selects the compared pixels
 * @param search_left   left edge of the search area on screen
 * @param search_top    top edge of the search area on screen
 * @param search_width  width of the search area
 * @param search_height height of the search area
 * @param threshold     not used by this variant
 * @param found         receives the position of the best match
 *
 * @return the smallest sum divided by 3, or PP_ERR if the shape does not
 *         fit into the search area
 */
int cat_sad(t_bitmap *screen, t_bitmap *shape, t_bitmap *mask, 
            u_int16_t search_left, u_int16_t search_top,
            u_int16_t search_width, u_int16_t search_height, 
            u_int32_t threshold , t_point *found) {
    t_point match;
    /* -1 wraps to the largest value, so the first candidate always wins */
    u_int32_t best = -1;
    u_int32_t sum, shape_idx, screen_idx;
    u_int16_t px, py, sx, sy;
    int16_t diff_r, diff_g, diff_b;
    int worse;
    
    assert(screen);
    assert(screen->rgb);
    assert(shape);
    assert(shape->rgb);
    assert(mask);
    assert(mask->rgb);
    assert(shape->width == mask->width);
    assert(shape->height == mask->height);
    assert(found);
    
    if(shape->height > search_height || shape->width > search_width) {
        return PP_ERR;
    }
    
    for(py = search_top;
        py <= search_top + search_height - shape->height; ++py) {
        for(px = search_left; 
            px <= search_left + search_width - shape->width; ++px) {
            /* candidate position (px; py) of the shape on screen */
            
            sum = 0;
            worse = 0;
            for(sy = 0; !worse && sy < shape->height; ++sy) {
                for(sx = 0; sx < shape->width; ++sx) {
                    shape_idx = sy * shape->width + sx;
                    if(!mask->rgb[shape_idx].r) {
                        /* masked out, not compared */
                        continue;
                    }
                    
                    assert((u_int16_t)(px + sx) < screen->width);
                    assert((u_int16_t)(py + sy) < screen->height);
                    screen_idx = (py + sy) * screen->width + (px + sx);
                    diff_r = screen->rgb[screen_idx].r -
                             shape->rgb[shape_idx].r;
                    diff_g = screen->rgb[screen_idx].g -
                             shape->rgb[shape_idx].g;
                    diff_b = screen->rgb[screen_idx].b -
                             shape->rgb[shape_idx].b;
                    sum += abs(diff_r) + abs(diff_g) + abs(diff_b);
                    
                    if(sum > best) {
                        /* already worse than the best candidate, give up */
                        worse = 1;
                        break;
                    }
                }
            }
            
            if(!worse) {
                /* at least as good as everything seen before */
                best = sum;
                match.pos_x = px;
                match.pos_y = py;
            }
        }
    }
    
    best /= 3;
    found->pos_x = match.pos_x;
    found->pos_y = match.pos_y;
    
    return best;
}

/**
 * Branch-free absolute value of a 32 bit signed integer.
 *
 * The sign bit is extracted with a logical shift of the unsigned
 * representation (0 for non-negative, 1 for negative input). For a negative
 * input the value is then bit-inverted and incremented, which is two's
 * complement negation; a non-negative input passes through unchanged.
 *
 * @param value  number to take the magnitude of
 * @return |value|; the most negative value has no positive counterpart and
 *         is returned unchanged on two's complement targets
 */
static inline int32_t abs32(int32_t value) {
    u_int32_t sign = (u_int32_t)value >> 31;  /* 1 if negative, else 0 */
    u_int32_t bitmask = 0 - sign;             /* all ones if negative */
    return (int32_t)(((u_int32_t)value ^ bitmask) + sign);
}
    
/***** SAD ********************************************************************/
/**
 * Exhaustive masked search using the sum of absolute R/G/B differences
 * (32 bit loop counters and differences, abs() from the C library).
 *
 * Every position of the shape inside the search area is tried; only shape
 * pixels whose mask red component is non-zero are compared. A candidate is
 * abandoned as soon as its sum exceeds the best sum seen so far.
 *
 * @param screen        bitmap that is searched
 * @param shape         bitmap that is looked for
 * @param mask          same dimensions as shape, selects the compared pixels
 * @param search_left   left edge of the search area on screen
 * @param search_top    top edge of the search area on screen
 * @param search_width  width of the search area
 * @param search_height height of the search area
 * @param threshold     not used by this variant
 * @param found         receives the position of the best match
 *
 * @return the smallest sum divided by 3, or PP_ERR if the shape does not
 *         fit into the search area
 */
int cat_sad32(t_bitmap *screen, t_bitmap *shape, t_bitmap *mask, 
              u_int16_t search_left, u_int16_t search_top,
              u_int16_t search_width, u_int16_t search_height, 
              u_int32_t threshold , t_point *found) {
    t_point match;
    /* -1 wraps to the largest value, so the first candidate always wins */
    u_int32_t best = -1;
    u_int32_t sum, shape_idx, screen_idx;
    u_int32_t px, py, sx, sy;
    int32_t diff_r, diff_g, diff_b;
    int worse;
    
    assert(screen);
    assert(screen->rgb);
    assert(shape);
    assert(shape->rgb);
    assert(mask);
    assert(mask->rgb);
    assert(shape->width == mask->width);
    assert(shape->height == mask->height);
    assert(found);
    
    if(shape->height > search_height || shape->width > search_width) {
        return PP_ERR;
    }
    
    for(py = search_top;
        py <= search_top + search_height - shape->height; ++py) {
        for(px = search_left; 
            px <= search_left + search_width - shape->width; ++px) {
            /* candidate position (px; py) of the shape on screen */
            
            sum = 0;
            worse = 0;
            for(sy = 0; !worse && sy < shape->height; ++sy) {
                for(sx = 0; sx < shape->width; ++sx) {
                    shape_idx = sy * shape->width + sx;
                    if(!mask->rgb[shape_idx].r) {
                        /* masked out, not compared */
                        continue;
                    }
                    
                    assert((u_int16_t)(px + sx) < screen->width);
                    assert((u_int16_t)(py + sy) < screen->height);
                    screen_idx = (py + sy) * screen->width + (px + sx);
                    diff_r = screen->rgb[screen_idx].r -
                             shape->rgb[shape_idx].r;
                    diff_g = screen->rgb[screen_idx].g -
                             shape->rgb[shape_idx].g;
                    diff_b = screen->rgb[screen_idx].b -
                             shape->rgb[shape_idx].b;
                    sum += abs(diff_r) + abs(diff_g) + abs(diff_b);
                    
                    if(sum > best) {
                        /* already worse than the best candidate, give up */
                        worse = 1;
                        break;
                    }
                }
            }
            
            if(!worse) {
                /* at least as good as everything seen before */
                best = sum;
                match.pos_x = px;
                match.pos_y = py;
            }
        }
    }
    
    best /= 3;
    found->pos_x = match.pos_x;
    found->pos_y = match.pos_y;
    
    return best;
}

/***** SAD ********************************************************************/
/*
 * Branch-free in-place absolute value helpers used by cat_sad32x().
 *
 * Both expect a variable `u_int32_t sign` in scope (ABS32X1 additionally a
 * `u_int32_t bitmask`). The sign bit is taken with a logical shift of the
 * unsigned representation so that it is exactly 0 or 1; shifting the signed
 * value itself may propagate the sign bit and then leaves negative inputs
 * negative.
 */
#define ABS32X1(__i__)      sign = (u_int32_t)(__i__) >> 31; \
                            bitmask = 0 - sign; \
                            __i__ ^= bitmask; \
                            __i__ += sign
#define ABS32X(__i__)       sign = (u_int32_t)(__i__) >> 31; \
                            __i__ = (__i__ ^ (0 - sign)) + sign

/**
 * Exhaustive masked search using the sum of absolute R/G/B differences,
 * with the absolute values computed branch-free by ABS32X.
 *
 * Every position of the shape inside the search area is tried; only shape
 * pixels whose mask red component is non-zero are compared. A candidate is
 * abandoned as soon as its sum exceeds the best sum seen so far.
 *
 * @param screen        bitmap that is searched
 * @param shape         bitmap that is looked for
 * @param mask          same dimensions as shape, selects the compared pixels
 * @param search_left   left edge of the search area on screen
 * @param search_top    top edge of the search area on screen
 * @param search_width  width of the search area
 * @param search_height height of the search area
 * @param threshold     not used by this variant
 * @param found         receives the position of the best match
 *
 * @return the smallest sum divided by 3, or PP_ERR if the shape does not
 *         fit into the search area
 */
int cat_sad32x(t_bitmap *screen, t_bitmap *shape, t_bitmap *mask, 
               u_int16_t search_left, u_int16_t search_top,
               u_int16_t search_width, u_int16_t search_height, 
               u_int32_t threshold , t_point *found) {
    t_point match;
    /* -1 wraps to the largest value, so the first candidate always wins */
    u_int32_t sad_min = -1, sad, shape_offset, screen_offset;
    u_int32_t x, y, ox, oy;
    int32_t dr, dg, db;
    
    assert(screen);
    assert(screen->rgb);
    assert(shape);
    assert(shape->rgb);
    assert(mask);
    assert(mask->rgb);
    assert(shape->width == mask->width);
    assert(shape->height == mask->height);
    assert(found);
    
    if(shape->width > search_width ||shape->height > search_height) {
        return PP_ERR;
    }
    
    for(y = search_top; y <= search_top + search_height - shape->height; ++y) {
        for(x = search_left; 
            x <= search_left + search_width - shape->width; ++x) {
            /* all possible positions (x; y) of shape in search space */
            
            sad = 0;
            for(oy = 0; oy < shape->height; ++oy) {
                for(ox = 0; ox < shape->width; ++ox) {
                    /* all positions of shape */
                    
                    shape_offset = oy * shape->width + ox;
                    if(mask->rgb[shape_offset].r) {
                        /* all valid position in mask */
                        
                        assert((u_int16_t)(x + ox) < screen->width);
                        assert((u_int16_t)(y + oy) < screen->height);
                        screen_offset = (y + oy) * screen->width + (x + ox);
                        dr = screen->rgb[screen_offset].r -
                             shape->rgb[shape_offset].r;
                        dg = screen->rgb[screen_offset].g -
                             shape->rgb[shape_offset].g;
                        db = screen->rgb[screen_offset].b -
                             shape->rgb[shape_offset].b;

                        {
                            /* scratch variable required by ABS32X */
                            u_int32_t sign;
                            
                            ABS32X(dr);
                            ABS32X(dg);
                            ABS32X(db);
                        }                            
                        /* all three differences are non-negative now */
                        sad += dr + dg + db;
                        
                        if(sad > sad_min) {
                            /* already worse than min, bail out */
                            goto bail;
                        }
                    }
                }
            }
            
            /* candidate is at least as good as the best one so far */
            sad_min = sad;
            match.pos_x = x;
            match.pos_y = y;
 bail:;
        }
    }
    
    sad_min /= 3;
    found->pos_x = match.pos_x;
    found->pos_y = match.pos_y;
    
    return sad_min;
}

/***** SSD ********************************************************************/
/**
 * Exhaustive masked search using the sum of squared R/G/B differences
 * (16 bit loop counters and differences).
 *
 * Every position of the shape inside the search area is tried; only shape
 * pixels whose mask red component is non-zero are compared. A candidate is
 * abandoned as soon as its sum exceeds the best sum seen so far.
 *
 * @param screen        bitmap that is searched
 * @param shape         bitmap that is looked for
 * @param mask          same dimensions as shape, selects the compared pixels
 * @param search_left   left edge of the search area on screen
 * @param search_top    top edge of the search area on screen
 * @param search_width  width of the search area
 * @param search_height height of the search area
 * @param threshold     not used by this variant
 * @param found         receives the position of the best match
 *
 * @return the smallest sum divided by 3, or PP_ERR if the shape does not
 *         fit into the search area
 */
int cat_ssd(t_bitmap *screen, t_bitmap *shape, t_bitmap *mask, 
            u_int16_t search_left, u_int16_t search_top,
            u_int16_t search_width, u_int16_t search_height, 
            u_int32_t threshold , t_point *found) {
    t_point match;
    /* -1 wraps to the largest value, so the first candidate always wins */
    u_int32_t best = -1;
    u_int32_t sum, shape_idx, screen_idx;
    u_int16_t px, py, sx, sy;
    int16_t diff_r, diff_g, diff_b;
    int worse;
    
    assert(screen);
    assert(screen->rgb);
    assert(shape);
    assert(shape->rgb);
    assert(mask);
    assert(mask->rgb);
    assert(shape->width == mask->width);
    assert(shape->height == mask->height);
    assert(found);
    
    if(shape->height > search_height || shape->width > search_width) {
        return PP_ERR;
    }
    
    for(py = search_top;
        py <= search_top + search_height - shape->height; ++py) {
        for(px = search_left; 
            px <= search_left + search_width - shape->width; ++px) {
            /* candidate position (px; py) of the shape on screen */
            
            sum = 0;
            worse = 0;
            for(sy = 0; !worse && sy < shape->height; ++sy) {
                for(sx = 0; sx < shape->width; ++sx) {
                    shape_idx = sy * shape->width + sx;
                    if(!mask->rgb[shape_idx].r) {
                        /* masked out, not compared */
                        continue;
                    }
                    
                    assert((u_int16_t)(px + sx) < screen->width);
                    assert((u_int16_t)(py + sy) < screen->height);
                    screen_idx = (py + sy) * screen->width + (px + sx);
                    diff_r = screen->rgb[screen_idx].r -
                             shape->rgb[shape_idx].r;
                    diff_g = screen->rgb[screen_idx].g -
                             shape->rgb[shape_idx].g;
                    diff_b = screen->rgb[screen_idx].b -
                             shape->rgb[shape_idx].b;
                    sum += diff_r * diff_r + diff_g * diff_g +
                           diff_b * diff_b;
                    
                    if(sum > best) {
                        /* already worse than the best candidate, give up */
                        worse = 1;
                        break;
                    }
                }
            }
            
            if(!worse) {
                /* at least as good as everything seen before */
                best = sum;
                match.pos_x = px;
                match.pos_y = py;
            }
        }
    }
    
    best /= 3;
    found->pos_x = match.pos_x;
    found->pos_y = match.pos_y;
    
    return best;
}

/***** SSD ********************************************************************/
/**
 * Exhaustive masked block matching by sum of squared differences (SSD).
 *
 * The shape is placed at every position of the search rectangle at which it
 * fits completely.  For each position the squared r, g and b differences
 * between screen and shape are summed over all pixels whose mask red channel
 * is non-zero.  The position with the smallest sum is reported; if several
 * positions share the smallest sum, the one visited last wins (rows are
 * scanned top to bottom, columns left to right).
 *
 * @param screen        bitmap that is searched
 * @param shape         pattern to look for
 * @param mask          same dimensions as shape; a pixel takes part in the
 *                      comparison only if its mask .r is non-zero
 * @param search_left   left edge of the search rectangle (screen coordinates)
 * @param search_top    top edge of the search rectangle (screen coordinates)
 * @param search_width  width of the search rectangle
 * @param search_height height of the search rectangle
 * @param threshold     not used by this function
 * @param found         receives the top left corner of the best position
 *
 * @return the smallest sum of squared differences divided by 3, or PP_ERR if
 *         the shape does not fit into the search rectangle or the search
 *         rectangle does not lie inside the screen (found is left untouched
 *         in that case)
 */
int cat_ssd32(t_bitmap *screen, t_bitmap *shape, t_bitmap *mask,
              u_int16_t search_left, u_int16_t search_top,
              u_int16_t search_width, u_int16_t search_height,
              u_int32_t threshold, t_point *found) {
    t_point match;
    u_int32_t ssd_min = (u_int32_t)-1, ssd, shape_offset, screen_offset;
    /* 32 bit counters on purpose: a 16 bit counter can never exceed a loop
       limit of 65535, which would make the search loop endless */
    u_int32_t x, y, ox, oy, stopx, stopy;
    int32_t dr, dg, db;
    int worse;

    (void)threshold;

    assert(screen);
    assert(screen->rgb);
    assert(shape);
    assert(shape->rgb);
    assert(mask);
    assert(mask->rgb);
    assert(shape->width == mask->width);
    assert(shape->height == mask->height);
    assert(found);

    /* reject a search rectangle that leaves the screen as well; otherwise
       builds without assertions would read beyond the screen buffer */
    if(shape->width > search_width || shape->height > search_height ||
       (u_int32_t)search_left + search_width > screen->width ||
       (u_int32_t)search_top + search_height > screen->height) {
        return PP_ERR;
    }

    /* first x / y at which the shape would no longer fit */
    stopx = (u_int32_t)search_left + search_width - shape->width + 1;
    stopy = (u_int32_t)search_top + search_height - shape->height + 1;

    /* the first candidate is always accepted because ssd_min starts at the
       largest value; the initialisation merely keeps match well defined */
    match.pos_x = search_left;
    match.pos_y = search_top;

    for(y = search_top; y < stopy; ++y) {
        for(x = search_left; x < stopx; ++x) {
            /* all possible positions (x; y) of shape in search space */

            ssd = 0;
            worse = 0;
            for(oy = 0; oy < shape->height && !worse; ++oy) {
                for(ox = 0; ox < shape->width; ++ox) {
                    /* all positions of shape */

                    shape_offset = oy * shape->width + ox;
                    if(!mask->rgb[shape_offset].r) {
                        /* masked out, does not contribute */
                        continue;
                    }

                    assert(x + ox < screen->width);
                    assert(y + oy < screen->height);
                    screen_offset = (y + oy) * screen->width + (x + ox);
                    dr = screen->rgb[screen_offset].r -
                         shape->rgb[shape_offset].r;
                    dg = screen->rgb[screen_offset].g -
                         shape->rgb[shape_offset].g;
                    db = screen->rgb[screen_offset].b -
                         shape->rgb[shape_offset].b;
                    ssd += dr * dr + dg * dg + db * db;
                    if(ssd > ssd_min) {
                        /* already worse than min, give up this position */
                        worse = 1;
                        break;
                    }
                }
            }

            if(!worse) {
                /* at least as good as the best position so far */
                ssd_min = ssd;
                match.pos_x = x;
                match.pos_y = y;
            }
        }
    }

    found->pos_x = match.pos_x;
    found->pos_y = match.pos_y;

    /* three colour channels were summed up, report the per channel value */
    return (int)(ssd_min / 3);
}

/***** PDC ********************************************************************/
/**
 * Pixel difference classification (PDC) of shape at one screen position.
 *
 * Every iter-th row and every iter-th column of shape is visited.  A visited
 * pixel whose mask red channel is non-zero counts as a mismatch if the sum of
 * its squared r, g and b differences to the screen pixel at
 * (x_offset + ox; y_offset + oy) is larger than 3 * threshold * threshold.
 *
 * @param screen    bitmap the shape is compared against
 * @param shape     pattern bitmap
 * @param mask      indexed like shape; selects the pixels that take part
 * @param x_offset  screen x coordinate of the left shape column
 * @param y_offset  screen y coordinate of the top shape row
 * @param pdc_min   largest number of mismatches that is still acceptable
 * @param threshold per channel difference threshold (enters squared)
 * @param iter      sampling step in both directions; 0 is treated as 1
 *
 * @return the number of mismatching pixels (never more than pdc_min), or
 *         PP_ERR as soon as the count exceeds pdc_min
 */
int cat_pdc32_sub(t_bitmap *screen, t_bitmap *shape, t_bitmap *mask,
                  u_int32_t x_offset, u_int32_t y_offset,
                  u_int32_t pdc_min, u_int32_t threshold, u_int32_t iter) {
    /* a pixel mismatches if its squared colour distance exceeds this */
    const u_int32_t limit = 3 * threshold * threshold;
    u_int32_t shape_row, screen_row, shape_offset, screen_offset, ox, oy;
    u_int32_t pdc = 0;
    int dr, dg, db;

    if(iter == 0) {
        /* a step of 0 would never advance the loops below */
        iter = 1;
    }

    for(oy = 0; oy < shape->height; oy += iter) {
        /* offsets of the first pixel of this row in shape and screen */
        shape_row = oy * shape->width;
        screen_row = (y_offset + oy) * screen->width + x_offset;

        for(ox = 0; ox < shape->width; ox += iter) {
            /* all sampled positions of shape */

            shape_offset = shape_row + ox;
            if(!mask->rgb[shape_offset].r) {
                /* masked out, does not take part */
                continue;
            }

            assert(x_offset + ox < screen->width);
            assert(y_offset + oy < screen->height);
            screen_offset = screen_row + ox;

            dr = screen->rgb[screen_offset].r - shape->rgb[shape_offset].r;
            dg = screen->rgb[screen_offset].g - shape->rgb[shape_offset].g;
            db = screen->rgb[screen_offset].b - shape->rgb[shape_offset].b;

            if((u_int32_t)(dr * dr + dg * dg + db * db) > limit) {
                /* the count only changes here, so this is the only place
                   where it can start to exceed pdc_min */
                if(++pdc > pdc_min) {
                    /* already worse than min, bail out */
                    return PP_ERR;
                }
            }
        }
    }

    return (int)pdc;
}

/**
 * Two stage (coarse, then fine) masked block matching using the pixel
 * difference classification of cat_pdc32_sub().
 *
 * Stage 1 walks the search rectangle with a step of iter pixels and lets
 * cat_pdc32_sub() sample the shape with the same step.  A position is
 * accepted if it has no more mismatches than the best one so far, starting
 * with a limit of a quarter of the unmasked shape pixels.
 * Stage 2 (only if iter > 1) compares at full resolution every position
 * within 2 * iter pixels of the stage 1 result, clipped to the search
 * rectangle, starting with a limit of half of the unmasked shape pixels.
 *
 * @param screen        bitmap that is searched
 * @param shape         pattern to look for
 * @param mask          same dimensions as shape; a pixel takes part in the
 *                      comparison only if its mask .r is non-zero
 * @param search_left   left edge of the search rectangle (screen coordinates)
 * @param search_top    top edge of the search rectangle (screen coordinates)
 * @param search_width  width of the search rectangle
 * @param search_height height of the search rectangle
 * @param threshold     per channel threshold handed to cat_pdc32_sub()
 * @param found         receives the top left corner of the best position;
 *                      only written on success
 *
 * @return the mismatch count of the reported position, or PP_ERR if the
 *         dimensions are invalid or no position stayed within the limits
 */
int cat_pdc32(t_bitmap *screen, t_bitmap *shape, t_bitmap *mask,
              u_int32_t search_left, u_int32_t search_top,
              u_int32_t search_width, u_int32_t search_height,
              u_int32_t threshold, t_point *found) {
    u_int32_t pdc_min, shape_count, found1;
    u_int32_t x, y, ox, oy, iter, stopx, stopy;
    u_int32_t best_x = 0, best_y = 0, reach;
    u_int32_t fine_left, fine_top, fine_right, fine_bottom;
    int pdc_sub;

    assert(screen);
    assert(screen->rgb);
    assert(shape);
    assert(shape->rgb);
    assert(mask);
    assert(mask->rgb);
    assert(shape->width == mask->width);
    assert(shape->height == mask->height);
    assert(found);

    /* an empty shape cannot be matched (and would divide by zero below);
       the search rectangle has to lie inside the screen, otherwise builds
       without assertions would read beyond the screen buffer */
    if(shape->width == 0 || shape->height == 0 ||
       shape->width > search_width || shape->height > search_height ||
       search_width > screen->width ||
       search_left > screen->width - search_width ||
       search_height > screen->height ||
       search_top > screen->height - search_height) {
        return PP_ERR;
    }

    /* count unmasked shape pixels */
    for(oy = 0, shape_count = 0; oy < shape->height; ++oy) {
        for(ox = 0; ox < shape->width; ++ox) {
            /* all positions of shape */
            if(mask->rgb[oy * mask->width + ox].r) {
                /* all valid position in mask */
                ++shape_count;
            }
        }
    }

    /* first x / y at which the shape would no longer fit */
    stopx = search_left + search_width - shape->width + 1;
    stopy = search_top + search_height - shape->height + 1;

    /**
     * coarse step width: the smaller of
     *  - a third of the shorter shape side, and
     *  - CAT_BLOCK_MATCHING_MAX_ITER scaled by the fraction of unmasked
     *    shape pixels (rounded to nearest),
     * but never less than 1: small shapes or sparse masks make both terms
     * drop to 0, and a step of 0 would never advance the search loops.
     */
    iter = MIN(MIN(shape->width, shape->height) / 3,
               (u_int)((CAT_BLOCK_MATCHING_MAX_ITER * shape_count) /
                       (float)(shape->width * shape->height) + 0.5));
    if(iter < 1) {
        iter = 1;
    }

    /* coarse search: accept at most a quarter of the unmasked pixels as
       mismatches */
    pdc_min = shape_count / 4;
    found1 = 0;
    for(y = search_top; y < stopy; y += iter) {
        for(x = search_left; x < stopx; x += iter) {
            /* all sampled positions (x; y) of shape in search space */

            if(PP_ERR != (pdc_sub = cat_pdc32_sub(screen, shape, mask,
                                                  x, y, pdc_min, threshold,
                                                  iter))) {
                pdc_min = pdc_sub;
                best_x = x;
                best_y = y;
                found1 = 1;
            }
        }
    }

    if(!found1) {
        /* we didn't find pattern */
        return PP_ERR;
    }

    /* fine search; with iter == 1 the coarse search already compared every
       pixel at every position, so its result and score are final */
    if(iter > 1) {
        /* Window around the coarse result, fixed before the search starts
           so that a better position found inside does not move it.  The
           coarse result is never left of / above the search rectangle, so
           the subtractions below cannot wrap around. */
        reach = iter * 2;
        fine_left = (best_x - search_left > reach) ? best_x - reach
                                                   : search_left;
        fine_top = (best_y - search_top > reach) ? best_y - reach
                                                 : search_top;
        fine_right = MIN(stopx, best_x + reach);
        fine_bottom = MIN(stopy, best_y + reach);

        /* accept at most half of the unmasked pixels as mismatches */
        pdc_min = shape_count / 2;
        found1 = 0;

        for(y = fine_top; y < fine_bottom; ++y) {
            for(x = fine_left; x < fine_right; ++x) {
                /* all possible positions (x; y) of shape in the window */

                if(PP_ERR != (pdc_sub = cat_pdc32_sub(screen, shape, mask,
                                                      x, y, pdc_min,
                                                      threshold, 1))) {
                    pdc_min = pdc_sub;
                    best_x = x;
                    best_y = y;
                    found1 = 1;
                }
            }
        }

        if(!found1) {
            /* we didn't find pattern */
            return PP_ERR;
        }
    }

    found->pos_x = best_x;
    found->pos_y = best_y;

    return pdc_min;
}

/**
 * Exhaustive masked block matching using the pixel difference classification
 * of cat_pdc32_sub() at full resolution.
 *
 * Every position of the search rectangle at which the shape fits completely
 * is compared.  A position is accepted if it has no more mismatches than the
 * best one so far, starting with a limit of half of the unmasked shape
 * pixels; on equal counts the position visited last wins.
 *
 * @param screen        bitmap that is searched
 * @param shape         pattern to look for
 * @param mask          same dimensions as shape; a pixel takes part in the
 *                      comparison only if its mask .r is non-zero
 * @param search_left   left edge of the search rectangle (screen coordinates)
 * @param search_top    top edge of the search rectangle (screen coordinates)
 * @param search_width  width of the search rectangle
 * @param search_height height of the search rectangle
 * @param threshold     per channel threshold handed to cat_pdc32_sub()
 * @param found         receives the top left corner of the best position;
 *                      only written on success
 *
 * @return the mismatch count of the reported position, or PP_ERR if the
 *         dimensions are invalid or no position stayed within the limit
 */
int cat_pdc32old(t_bitmap *screen, t_bitmap *shape, t_bitmap *mask,
              u_int32_t search_left, u_int32_t search_top,
              u_int32_t search_width, u_int32_t search_height,
              u_int32_t threshold, t_point *found) {
    t_point match;
    u_int32_t pdc_min, have_match = 0;
    u_int32_t x, y, stopx, stopy;
    int pdc;

    assert(screen);
    assert(screen->rgb);
    assert(shape);
    assert(shape->rgb);
    assert(mask);
    assert(mask->rgb);
    assert(shape->width == mask->width);
    assert(shape->height == mask->height);
    assert(found);

    /* the search rectangle has to lie inside the screen, otherwise builds
       without assertions would read beyond the screen buffer */
    if(shape->width > search_width || shape->height > search_height ||
       search_width > screen->width ||
       search_left > screen->width - search_width ||
       search_height > screen->height ||
       search_top > screen->height - search_height) {
        return PP_ERR;
    }

    /* first x / y at which the shape would no longer fit */
    stopx = search_left + search_width - shape->width + 1;
    stopy = search_top + search_height - shape->height + 1;

    /* count unmasked shape pixels */
    for(y = 0, pdc_min = 0; y < shape->height; ++y) {
        for(x = 0; x < shape->width; ++x) {
            /* all positions of shape */
            if(mask->rgb[y * mask->width + x].r) {
                /* all valid position in mask */
                ++pdc_min;
            }
        }
    }
    /* accept at most half of the unmasked pixels as mismatches */
    pdc_min /= 2;

    for(y = search_top; y < stopy; ++y) {
        for(x = search_left; x < stopx; ++x) {
            /* all possible positions (x; y) of shape in search space */

            if(PP_ERR != (pdc = cat_pdc32_sub(screen, shape, mask, x, y,
                                              pdc_min, threshold, 1))) {
                pdc_min = pdc;
                match.pos_x = x;
                match.pos_y = y;
                have_match = 1;
            }
        }
    }

    if(!have_match) {
        /* no position stayed within the limit; match was never written, so
           there is nothing valid to report */
        return PP_ERR;
    }

    found->pos_x = match.pos_x;
    found->pos_y = match.pos_y;

    return pdc_min;
}
#endif /* CAT_BLOCK_MATCHING_LOCAL */

#endif /* !CAT_FB_DIRECT */

