00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_mat_mult_f32.c 00009 * 00010 * Description: Floating-point matrix multiplication. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated. 00028 * 00029 * Version 0.0.5 2010/04/26 00030 * incorporated review comments and updated with latest CMSIS layer 00031 * 00032 * Version 0.0.3 2010/03/10 00033 * Initial version 00034 * -------------------------------------------------------------------- */ 00035 00036 #include "arm_math.h" 00037 00073 arm_status arm_mat_mult_f32( 00074 const arm_matrix_instance_f32 * pSrcA, 00075 const arm_matrix_instance_f32 * pSrcB, 00076 arm_matrix_instance_f32 * pDst) 00077 { 00078 float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */ 00079 float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */ 00080 float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */ 00081 float32_t *pOut = pDst->pData; /* output data matrix pointer */ 00082 float32_t *px; /* Temporary output data matrix pointer */ 00083 float32_t sum; /* Accumulator */ 00084 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00085 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00086 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00087 00088 #ifndef ARM_MATH_CM0 00089 00090 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00091 00092 uint16_t col, i = 0u, j, row = numRowsA, colCnt; /* loop counters */ 00093 arm_status status; /* status of matrix multiplication */ 00094 00095 #ifdef ARM_MATH_MATRIX_CHECK 00096 00097 00098 /* Check for matrix mismatch condition */ 00099 if((pSrcA->numCols != pSrcB->numRows) || 00100 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00101 { 00102 00103 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00104 status = ARM_MATH_SIZE_MISMATCH; 00105 } 00106 else 00107 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00108 00109 { 00110 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00111 /* row loop */ 00112 do 00113 { 00114 /* Output pointer is set to starting address of the row being processed */ 00115 px = pOut + i; 00116 00117 /* For every row wise process, the column loop counter is to be initiated */ 00118 col = numColsB; 00119 00120 /* For every row wise process, the pIn2 pointer is set 00121 ** to the starting address of the pSrcB data */ 00122 pIn2 = pSrcB->pData; 00123 00124 j = 0u; 00125 00126 /* column loop */ 00127 do 00128 { 00129 /* Set the variable sum, that acts as accumulator, to zero */ 00130 sum = 0.0f; 00131 00132 /* Initiate the pointer pIn1 to point to the starting address of the column being processed */ 00133 pIn1 = pInA; 00134 00135 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00136 colCnt = numColsA >> 2; 00137 00138 /* matrix multiplication */ 00139 while(colCnt > 0u) 00140 { 00141 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00142 sum += *pIn1++ * (*pIn2); 00143 pIn2 += numColsB; 00144 sum += *pIn1++ * (*pIn2); 00145 pIn2 += numColsB; 00146 sum += *pIn1++ * (*pIn2); 00147 pIn2 += numColsB; 00148 sum += *pIn1++ * (*pIn2); 00149 pIn2 += numColsB; 00150 00151 /* Decrement the loop count */ 00152 colCnt--; 00153 } 00154 00155 /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here. 00156 ** No loop unrolling is used. */ 00157 colCnt = numColsA % 0x4u; 00158 00159 while(colCnt > 0u) 00160 { 00161 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00162 sum += *pIn1++ * (*pIn2); 00163 pIn2 += numColsB; 00164 00165 /* Decrement the loop counter */ 00166 colCnt--; 00167 } 00168 00169 /* Store the result in the destination buffer */ 00170 *px++ = sum; 00171 00172 /* Update the pointer pIn2 to point to the starting address of the next column */ 00173 j++; 00174 pIn2 = pSrcB->pData + j; 00175 00176 /* Decrement the column loop counter */ 00177 col--; 00178 00179 } while(col > 0u); 00180 00181 #else 00182 00183 /* Run the below code for Cortex-M0 */ 00184 00185 float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */ 00186 uint16_t col, i = 0u, row = numRowsA, colCnt; /* loop counters */ 00187 arm_status status; /* status of matrix multiplication */ 00188 00189 #ifdef ARM_MATH_MATRIX_CHECK 00190 00191 /* Check for matrix mismatch condition */ 00192 if((pSrcA->numCols != pSrcB->numRows) || 00193 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00194 { 00195 00196 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00197 status = ARM_MATH_SIZE_MISMATCH; 00198 } 00199 else 00200 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00201 00202 { 00203 /* The following loop performs the dot-product of each row in pInA with each column in pInB */ 00204 /* row loop */ 00205 do 00206 { 00207 /* Output pointer is set to starting address of the row being processed */ 00208 px = pOut + i; 00209 00210 /* For every row wise process, the column loop counter is to be initiated */ 00211 col = numColsB; 00212 00213 /* For every row wise process, the pIn2 pointer is set 00214 ** to the starting address of the pSrcB data */ 00215 pIn2 = pSrcB->pData; 00216 00217 /* column loop */ 00218 do 00219 { 00220 /* Set the variable sum, that acts as accumulator, to zero */ 00221 sum = 0.0f; 00222 00223 /* Initialize the pointer pIn1 to point to the starting address of the row being processed */ 00224 pIn1 = pInA; 00225 00226 /* Matrix A columns number of MAC operations are to be performed */ 00227 colCnt = numColsA; 00228 00229 while(colCnt > 0u) 00230 { 00231 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00232 sum += *pIn1++ * (*pIn2); 00233 pIn2 += numColsB; 00234 00235 /* Decrement the loop counter */ 00236 colCnt--; 00237 } 00238 00239 /* Store the result in the destination buffer */ 00240 *px++ = sum; 00241 00242 /* Decrement the column loop counter */ 00243 col--; 00244 00245 /* Update the pointer pIn2 to point to the starting address of the next column */ 00246 pIn2 = pInB + (numColsB - col); 00247 00248 } while(col > 0u); 00249 00250 #endif /* #ifndef ARM_MATH_CM0 */ 00251 00252 /* Update the pointer pInA to point to the starting address of the next row */ 00253 i = i + numColsB; 00254 pInA = pInA + numColsA; 00255 00256 /* Decrement the row loop counter */ 00257 row--; 00258 00259 } while(row > 0u); 00260 /* Set status as ARM_MATH_SUCCESS */ 00261 status = ARM_MATH_SUCCESS; 00262 } 00263 00264 /* Return to application */ 00265 return (status); 00266 } 00267