/*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.
*	
*	DISCRETE COSINE TRANSFORM - 2D, 8x8, 16-BIT INPUT, NO ROUNDING
*
*	REVISION DATE: 05/30/97
*
*	USAGE This routine is C callable and can be called as
* 
*		void dct(short *d, short *r)
*		d = array of 8x8 inputs/outputs in raster scan order
*		r = set of coefficients used in the DCT
*
*		Where *r = 0xADFD, 0xC13B, 0xE333, 0xF384, 0x098E, 0x6254,
*			   0x41B3, 0x300B, 0x25A1, 0x187E, 0x1151, 0xC4DF
*
*		If the routine is not to be used as a C callable function,
*		then all instructions relating to stack should be removed.
*	  	Refer to comments of individual instructions. You will also
*	 	need to initialize values for all the values passed as these
*	 	are assumed to be in registers as defined by the calling
*		convention of the compiler, (refer to the C compiler reference
*		guide.)
*
*	C CODE
*		This is the C equivalent of the Assembly Code without the 
*		assumptions listed below. Note that the assembly code is hand
*		optimized and assumptions apply.
*
*		SOURCE - Independent JPEG Group, Thomas G. Lane
*
*/
		/*
		 * One in-place 8-point forward DCT pass over the samples
		 * v[0], v[stride], ..., v[7 * stride].
		 *
		 *   v        - first sample of the row/column to transform
		 *   stride   - distance, in elements, between consecutive samples
		 *   dc_shift - right shift applied to outputs 0 and 4
		 *   ac_shift - right shift applied to outputs 1, 2, 3, 5, 6 and 7
		 *   r        - coefficient table; entries r[0]..r[11] are read
		 *
		 * The (short) casts truncate each multiplicand to 16 bits before the
		 * multiply, so every product is formed from two 16-bit operands.
		 * NOTE: the results rely on >> of a negative int being an arithmetic
		 * shift, which is implementation-defined in C.
		 */
		static void dctac_pass(short *v, int stride, int dc_shift, int ac_shift,
		                       const short *r)
		{
		    int t[8];
		    int even_a, even_b, even_c, even_d;
		    int z, z_a, z_b, odd_a, odd_b;
		    int j;

		    /* Butterfly: t[0..3] hold the sums, t[4..7] the differences. */
		    for (j = 0; j < 4; j++) {
		        int lo = v[stride * j];
		        int hi = v[stride * (7 - j)];

		        t[j]     = lo + hi;
		        t[7 - j] = lo - hi;
		    }

		    /* Even-indexed outputs. */
		    even_a = t[0] + t[3];
		    even_b = t[0] - t[3];
		    even_c = t[1] + t[2];
		    even_d = t[1] - t[2];

		    v[0]          = (even_a + even_c) >> dc_shift;
		    v[4 * stride] = (even_a - even_c) >> dc_shift;

		    z = (short) (even_d + even_b) * r[10];
		    v[2 * stride] = (z + (short) even_b * r[9]) >> ac_shift;
		    v[6 * stride] = (z + (short) even_d * r[11]) >> ac_shift;

		    /* Odd-indexed outputs. */
		    odd_a = (short) (t[4] + t[7]) * r[2];
		    odd_b = (short) (t[5] + t[6]) * r[0];
		    z_a   = t[4] + t[6];
		    z_b   = t[5] + t[7];
		    z     = (short) (z_a + z_b) * r[8];
		    z_a   = (short) z_a * r[1] + z;
		    z_b   = (short) z_b * r[3] + z;

		    v[7 * stride] = ((short) t[4] * r[4] + odd_a + z_a) >> ac_shift;
		    v[5 * stride] = ((short) t[5] * r[6] + odd_b + z_b) >> ac_shift;
		    v[3 * stride] = ((short) t[6] * r[5] + odd_b + z_a) >> ac_shift;
		    v[1 * stride] = ((short) t[7] * r[7] + odd_a + z_b) >> ac_shift;
		}

		/*
		 * In-place 2-D forward DCT of one 8x8 block.
		 *
		 *   d - 64 samples in raster-scan order; overwritten with the result
		 *   r - 12 coefficients (see the table in the file header)
		 *
		 * All eight rows are transformed first (shifts 0 and 13), then all
		 * eight columns (shifts 3 and 16).  Rows and columns are addressed by
		 * index instead of by walking d, so no pointer outside the 64-element
		 * block is ever formed.
		 */
		void dctac(short *d, short *r)
		{
		    int row, col;

		    for (row = 0; row < 8; row++) {
		        dctac_pass(d + 8 * row, 1, 0, 13, r);
		    }

		    for (col = 0; col < 8; col++) {
		        dctac_pass(d + col, 8, 3, 16, r);
		    }
		}
/*
*
*	DESCRIPTION
*		This routine is used to compute the DCT of an 8x8 matrix of
*		pixels which have been arranged in raster order.  The data size
*		of the input pixels and coefficients is 16 bits.
*
*	TECHNIQUES
*		The outer loop (k loop) is unrolled giving two inner loops (i
*		loops).  The two inner loops are LOOP1 and LOOP2 which process
*		the rows and columns respectively.  These loops require two
*		passes to fully prime the loop, and thus execute 10 times each.
*		
*	ASSUMPTIONS
*		Coefficients, *r, must be aligned on a word boundary
*
*	MEMORY NOTE
*		This code has no memory hits regardless of where d and r are
*		located in memory.
*
*	CYCLES		226
*
*===============================================================================
*** BEGIN Benchmark Timing ***
*** END Benchmark Timing ***
*/