File names:
rsax_mod_exp_1024.asm, rsax_mod_exp_1024_mulx.asm, rsax_mod_exp_1024_mulx_adcox.asm

Function names: 

rsax_mod_exp_1024
rsax_mod_exp_1024_mulx
rsax_mod_exp_1024_mulx_adcox

APIs:

// 1024-bit modular exponentiation routine.
// NOTE(review): presumably computes result = g^exp mod m, with m and the
// other Montgomery constants taken from *data -- confirm against the
// assembly source.
// data points to the precomputed per-modulus structure (see the
// pre_compute_data sample in this file for how it is filled).
void rsax_mod_exp_1024(UINT64 *result, // 1024 bits, 16 qwords
                UINT64 *g,      // 1024 bits, 16 qwords
                UINT64 *exp,    // 1024 bits, 16 qwords
                MOD_EXP_1024_DATA *data);

// Same interface as rsax_mod_exp_1024.
// NOTE(review): the name suggests this variant relies on the MULX
// instruction -- confirm CPU requirements against the assembly source.
// data points to the precomputed per-modulus structure (see the
// pre_compute_data sample in this file for how it is filled).
void rsax_mod_exp_1024_mulx(UINT64 *result, // 1024 bits, 16 qwords
                UINT64 *g,      // 1024 bits, 16 qwords
                UINT64 *exp,    // 1024 bits, 16 qwords
                MOD_EXP_1024_DATA *data);

// Same interface as rsax_mod_exp_1024.
// NOTE(review): the name suggests this variant relies on the MULX and
// ADCX/ADOX instructions -- confirm CPU requirements against the assembly
// source.
// data points to the precomputed per-modulus structure (see the
// pre_compute_data sample in this file for how it is filled).
void rsax_mod_exp_1024_mulx_adcox(UINT64 *result, // 1024 bits, 16 qwords
                UINT64 *g,      // 1024 bits, 16 qwords
                UINT64 *exp,    // 1024 bits, 16 qwords
                MOD_EXP_1024_DATA *data);


Common Data Structure:

// Precomputed per-modulus constants passed as the last argument of the
// rsax_mod_exp_1024* routines. All multi-qword values are 16 x 64-bit words.
struct MOD_EXP_1024_DATA{
      UINT64 R[16];	// 2^1024 mod m; this is the value 1 in Montgomery space
      UINT64 R2[16];	// 2^2048 mod m; used to convert g into Montgomery space: MM(R2, g) = g*R
      UINT64 M[16];	// m, the modulus itself
      UINT64 m_1[1];	// (-1/m) mod 2^64, a single 64-bit word
};


Sample code for initializing the data structure:
For documentation purposes, we used Crypto++* Library functions to write our own precomputation code (http://www.cryptopp.com/).
Other multi-precision libraries could be utilized. 
*Other names and brands may be claimed as the property of others.

/*
 * Sample precomputation: fills *data with the constants derived from the
 * modulus m.
 *
 * m    - the modulus as 16 qwords. m[0] is the qword tested for oddness and
 *        m[15] is the qword whose top bit must be set, i.e. m must be odd
 *        and a full 1024 bits wide.
 * data - output; receives R = 2^1024 mod m, R2 = 2^2048 mod m, a copy of m,
 *        and m_1 = (-m^-1) mod 2^64.
 *
 * If either check on m fails, a message is printed to stderr and the
 * process exits with status 1; *data is left untouched in that case.
 */
void pre_compute_data(UINT64 *m, MOD_EXP_1024_DATA *data)
{
        large_int two_2048, two_1024, two_64;
        large_int tmp;
        large_int _m(16, m);
        int i;

        // 2^2048
        two_2048 = 1;
        two_2048 <<= 2048;

        // 2^1024
        two_1024 = 1;
        two_1024 <<= 1024;

        // 2^64
        two_64 = 1;
        two_64 <<= 64;

        // The most significant bit of the top qword must be set: the code is
        // optimised and extensively verified for a 1024-bit modulus.
        if (0 == (m[15] & 0x8000000000000000)){
                // Report the qword that was actually tested (m[15]).
                fprintf(stderr,"Invalid modulus: %I64u\n", m[15]);
                exit(1);
        }
        // An odd modulus is required for Montgomery reduction.
        if (0 == (m[0] & 0x1)) {
                fprintf(stderr,"Invalid modulus: %I64u\n", m[0]);
                exit(1);
        }

        // R = 2^1024 mod m
        // In Montgomery space, 1 is represented as 1*R = R. We store g^0 = 1 as R in Montgomery space.
        tmp = two_1024.Modulo(_m);
        tmp.extract(16, &data->R[0]);

        // R2 = 2^2048 mod m
        // We need R2 for converting g into the Montgomery space:
        // MM(R2, g) = g*R
        tmp = two_2048.Modulo(_m);
        tmp.extract(16, &data->R2[0]);

        // Insert the modulus into the data structure.
        for (i=0; i<16; i++)
                data->M[i] = m[i];

        // Precompute m_1, a 64-bit number = (-m^-1) mod 2^64; m_1 should be non-negative.
        tmp =  (_m.Times(-1)).InverseMod(two_64);
        tmp.extract(1, &data->m_1[0]);

}