/*

Copyright (c) 2009, 2010, 2011, 2012, 2013 STMicroelectronics
Written by Christophe Lyon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

*/

#ifndef _STM_ARM_NEON_REF_H_
#define _STM_ARM_NEON_REF_H_

#if defined(__cplusplus)
#include <cstdio>
#include <cinttypes>
#include <cstring>
#else
#include <stdio.h>
#if defined(_MSC_VER)
#include "msinttypes.h"
#include <float.h> /* for isnan() ... */
static int32_t _ptrNan[]={0x7fc00000L};
#define NAN (*(float*)_ptrNan)
static int32_t _ptrInf[]={0x7f800000L};
#define INFINITY (*(float*)_ptrInf)
#define HUGE_VALF INFINITY
#else
#include <inttypes.h>
#endif
#include <string.h>
#endif

#define xSTR(X) #X
#define STR(X) xSTR(X)

#define xNAME1(V,T) V ## _ ## T
#define xNAME(V,T) xNAME1(V,T)

#define VAR(V,T,W) xNAME(V,T##W)
#define VAR_DECL(V, T, W) T##W##_t VAR(V,T,W)

#define VECT_NAME(T, W, N) T##W##x##N
#define VECT_ARRAY_NAME(T, W, N, L) T##W##x##N##x##L
#define VECT_TYPE(T, W, N) xNAME(VECT_NAME(T,W,N),t)
#define VECT_ARRAY_TYPE(T, W, N, L) xNAME(VECT_ARRAY_NAME(T,W,N,L),t)

#define VECT_VAR(V,T,W,N) xNAME(V,VECT_NAME(T,W,N))
#define VECT_VAR_DECL(V, T, W, N) T##W##_t VECT_VAR(V,T,W,N)

/* This one is used for padding between input buffers. */
#define PAD(V, T, W, N) char VECT_VAR(V,T,W,N)=42;

/* Array declarations. */
#define ARRAY(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[N]
#define ARRAY4(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[4]
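
/* For illustration only (a sketch of how the token-pasting helpers above
   compose; "result" and "buffer" are simply the names used later in this
   header):

     VECT_NAME(int, 16, 4)        -> int16x4
     VECT_TYPE(int, 16, 4)        -> int16x4_t
     VECT_VAR(result, int, 16, 4) -> result_int16x4
     ARRAY(buffer, uint, 32, 2)   -> uint32_t buffer_uint32x2[2]

   so each test can derive one distinct variable name per base type, element
   width and number of lanes. */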

/* Arrays of vectors. */
#define VECT_ARRAY_VAR(V,T,W,N,L) xNAME(V,VECT_ARRAY_NAME(T,W,N,L))
#define VECT_ARRAY(V, T, W, N, L) T##W##_t VECT_ARRAY_VAR(V,T,W,N,L)[N*L]

static int result_idx = 0;
#define DUMP(MSG,T,W,N,FMT) \
  fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++, \
          STR(VECT_VAR(result, T, W, N))); \
  for(i=0; i<N ; i++) \
    { \
      fprintf(ref_file, "%" FMT ", ", VECT_VAR(result, T, W, N)[i]); \
    } \
  fprintf(ref_file, " }\n"); \
  DUMP4GCC(MSG,T,W,N,FMT);

/* Use casts to remove sign bits. */
#define DUMP_POLY(MSG,T,W,N,FMT) \
  fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++, \
          STR(VECT_VAR(result, T, W, N))); \
  for(i=0; i<N ; i++) \
    { \
      fprintf(ref_file, "%" FMT ", ", \
              (uint##W##_t)VECT_VAR(result, T, W, N)[i]); \
    } \
  fprintf(ref_file, " }\n"); \
  DUMP4GCC(MSG,T,W,N,FMT);

#define DUMP_FP(MSG,T,W,N,FMT) \
  fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++, \
          STR(VECT_VAR(result, T, W, N))); \
  for(i=0; i<N ; i++) \
    { \
      union fp_operand { \
        uint##W##_t i; \
        float##W##_t f; \
      } tmp; \
      tmp.f = VECT_VAR(result, T, W, N)[i]; \
      fprintf(ref_file, "%" FMT ", ", tmp.i); \
    } \
  fprintf(ref_file, " }\n"); \
  DUMP4GCC_FP(MSG,T,W,N,FMT);

#define DUMP4GCC(MSG,T,W,N,FMT) \
  fprintf(gcc_tests_file, "VECT_VAR_DECL(expected,%s,%d,%d) [] = { ", \
          STR(T), W, N); \
  for(i=0; i<(N-1) ; i++) \
    { \
      if (W < 32) { \
        uint32_t tmp = (uint##W##_t) VECT_VAR(result, T, W, N)[i]; \
        fprintf(gcc_tests_file, "0x%" FMT ", ", tmp); \
      } else { \
        fprintf(gcc_tests_file, "0x%" FMT ", ", VECT_VAR(result, T, W, N)[i]); \
      } \
    } \
  if (W < 32) { \
    uint32_t tmp = (uint##W##_t) VECT_VAR(result, T, W, N)[i]; \
    fprintf(gcc_tests_file, "0x%" FMT, tmp); \
  } else { \
    fprintf(gcc_tests_file, "0x%" FMT, VECT_VAR(result, T, W, N)[i]); \
  } \
  fprintf(gcc_tests_file, " };\n");

#define DUMP4GCC_FP(MSG,T,W,N,FMT) \
  { \
    union fp_operand { \
      uint##W##_t i; \
      float##W##_t f; \
    } tmp; \
    fprintf(gcc_tests_file, "VECT_VAR_DECL(expected,%s,%d,%d) [] = { ", \
            "hfloat", W, N); \
    for(i=0; i<(N-1) ; i++) \
      { \
        tmp.f = VECT_VAR(result, T, W, N)[i]; \
        fprintf(gcc_tests_file, "0x%" FMT ", ", tmp.i); \
      } \
    tmp.f = VECT_VAR(result, T, W, N)[i]; \
    fprintf(gcc_tests_file, "0x%" FMT, tmp.i); \
    fprintf(gcc_tests_file, " };\n"); \
  }

#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#define float16_t __fp16

#define DUMP_FP16(MSG,T,W,N,FMT) \
  fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++, \
          STR(VECT_VAR(result, T, W, N))); \
  for(i=0; i<N ; i++) \
    { \
      uint##W##_t tmp; \
      tmp = (uint##W##_t)VECT_VAR(result, T, W, N)[i]; \
      fprintf(ref_file, "%" FMT ", ", tmp); \
    } \
  fprintf(ref_file, " }\n"); \
  DUMP4GCC_FP16(MSG,T,W,N,FMT);

#define DUMP4GCC_FP16(MSG,T,W,N,FMT) \
  { \
    uint##W##_t tmp; \
    fprintf(gcc_tests_file, "VECT_VAR_DECL(expected,%s,%d,%d) [] = { ", \
            "hfloat", W, N); \
    for(i=0; i<(N-1) ; i++) \
      { \
        tmp = (uint##W##_t)VECT_VAR(result, T, W, N)[i]; \
        fprintf(gcc_tests_file, "0x%" FMT ", ", tmp); \
      } \
    tmp = (uint##W##_t)VECT_VAR(result, T, W, N)[i]; \
    fprintf(gcc_tests_file, "0x%" FMT, tmp); \
    fprintf(gcc_tests_file, " };\n"); \
  }
#endif
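
/* For illustration only (an output sketch, derived from the format strings
   above): inside dump_results() below, DUMP(test_name, int, 8, 8, PRId8)
   appends a line of the form

     <test_name>:<idx>:result_int8x8 [] = { v0, v1, ..., v7,  }

   to ref_file, while the chained DUMP4GCC writes the same values to
   gcc_tests_file in the form consumed by the GCC testsuite:

     VECT_VAR_DECL(expected,int,8,8) [] = { 0x.., ..., 0x.. };  */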

#define CLEAN_PATTERN_8 0x33
#define CLEAN_PATTERN_16 0x3333
#define CLEAN_PATTERN_32 0x33333333
#define CLEAN_PATTERN_64 0x3333333333333333

#define CLEAN(VAR,T,W,N) \
  memset(VECT_VAR(VAR, T, W, N), \
         CLEAN_PATTERN_8, \
         sizeof(VECT_VAR(VAR, T, W, N)));

#define CHECK_INIT(VAR,Q,T1,T2,W,N) \
  { \
    ARRAY(check_result, T1, W, N); \
    int i; \
    \
    vst1##Q##_##T2##W(VECT_VAR(check_result, T1, W, N), \
                      VECT_VAR(VAR, T1, W, N)); \
    for(i=0; i<N ; i++) \
      { \
        /*if (VECT_VAR(check_result, T1, W, N)[i] == CLEAN_PATTERN_##W)*/ { \
          fprintf(stdout, "%s:%d: %s[%d] uninitialized! %#x\n", \
                  __FUNCTION__, __LINE__, \
                  STR(VECT_VAR(VAR, T1, W, N)), i, \
                  VECT_VAR(check_result, T1, W, N)[i]); \
        } \
      } \
  }
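
/* Illustrative sketch of how the two macros above are meant to be combined
   (assuming the "result" buffers and a Neon variable named "vector" declared
   with the helpers further down):

     CLEAN(result, int, 16, 8);            // fill result_int16x8 with 0x33
     ...
     CHECK_INIT(vector, q, int, s, 16, 8); // store vector_int16x8 with
                                           // vst1q_s16 and dump each lane

   Note that the comparison against CLEAN_PATTERN_##W inside CHECK_INIT is
   commented out in this version, so it currently reports every lane rather
   than only the ones still holding the clean pattern. */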

/* Generic declarations: */
extern FILE* log_file;
extern FILE* ref_file;
extern FILE* gcc_tests_file;

/* Input buffers, one of each size */
extern ARRAY(buffer, int, 8, 8);
extern ARRAY(buffer, int, 16, 4);
extern ARRAY(buffer, int, 32, 2);
extern ARRAY(buffer, int, 64, 1);
extern ARRAY(buffer, uint, 8, 8);
extern ARRAY(buffer, uint, 16, 4);
extern ARRAY(buffer, uint, 32, 2);
extern ARRAY(buffer, uint, 64, 1);
extern ARRAY(buffer, poly, 8, 8);
extern ARRAY(buffer, poly, 16, 4);
extern ARRAY(buffer, float, 32, 2);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern ARRAY(buffer, float, 16, 4);
#endif
extern ARRAY(buffer, int, 8, 16);
extern ARRAY(buffer, int, 16, 8);
extern ARRAY(buffer, int, 32, 4);
extern ARRAY(buffer, int, 64, 2);
extern ARRAY(buffer, uint, 8, 16);
extern ARRAY(buffer, uint, 16, 8);
extern ARRAY(buffer, uint, 32, 4);
extern ARRAY(buffer, uint, 64, 2);
extern ARRAY(buffer, poly, 8, 16);
extern ARRAY(buffer, poly, 16, 8);
extern ARRAY(buffer, float, 32, 4);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern ARRAY(buffer, float, 16, 8);
#endif

/* The tests for vld1_dup and vdup expect at least 4 entries in the
   input buffer, so force 1- and 2-element initializers to have 4
   entries. */
extern ARRAY(buffer_dup, int, 8, 8);
extern ARRAY(buffer_dup, int, 16, 4);
extern ARRAY4(buffer_dup, int, 32, 2);
extern ARRAY4(buffer_dup, int, 64, 1);
extern ARRAY(buffer_dup, uint, 8, 8);
extern ARRAY(buffer_dup, uint, 16, 4);
extern ARRAY4(buffer_dup, uint, 32, 2);
extern ARRAY4(buffer_dup, uint, 64, 1);
extern ARRAY(buffer_dup, poly, 8, 8);
extern ARRAY(buffer_dup, poly, 16, 4);
extern ARRAY4(buffer_dup, float, 32, 2);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern ARRAY4(buffer_dup, float, 16, 4);
#endif
extern ARRAY(buffer_dup, int, 8, 16);
extern ARRAY(buffer_dup, int, 16, 8);
extern ARRAY(buffer_dup, int, 32, 4);
extern ARRAY4(buffer_dup, int, 64, 2);
extern ARRAY(buffer_dup, uint, 8, 16);
extern ARRAY(buffer_dup, uint, 16, 8);
extern ARRAY(buffer_dup, uint, 32, 4);
extern ARRAY4(buffer_dup, uint, 64, 2);
extern ARRAY(buffer_dup, poly, 8, 16);
extern ARRAY(buffer_dup, poly, 16, 8);
extern ARRAY(buffer_dup, float, 32, 4);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern ARRAY(buffer_dup, float, 16, 8);
#endif

/* Input buffers for vld2, one of each size */
extern VECT_ARRAY(buffer_vld2, int, 8, 8, 2);
extern VECT_ARRAY(buffer_vld2, int, 16, 4, 2);
extern VECT_ARRAY(buffer_vld2, int, 32, 2, 2);
extern VECT_ARRAY(buffer_vld2, int, 64, 1, 2);
extern VECT_ARRAY(buffer_vld2, uint, 8, 8, 2);
extern VECT_ARRAY(buffer_vld2, uint, 16, 4, 2);
extern VECT_ARRAY(buffer_vld2, uint, 32, 2, 2);
extern VECT_ARRAY(buffer_vld2, uint, 64, 1, 2);
extern VECT_ARRAY(buffer_vld2, poly, 8, 8, 2);
extern VECT_ARRAY(buffer_vld2, poly, 16, 4, 2);
extern VECT_ARRAY(buffer_vld2, float, 32, 2, 2);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_ARRAY(buffer_vld2, float, 16, 4, 2);
#endif
extern VECT_ARRAY(buffer_vld2, int, 8, 16, 2);
extern VECT_ARRAY(buffer_vld2, int, 16, 8, 2);
extern VECT_ARRAY(buffer_vld2, int, 32, 4, 2);
extern VECT_ARRAY(buffer_vld2, int, 64, 2, 2);
extern VECT_ARRAY(buffer_vld2, uint, 8, 16, 2);
extern VECT_ARRAY(buffer_vld2, uint, 16, 8, 2);
extern VECT_ARRAY(buffer_vld2, uint, 32, 4, 2);
extern VECT_ARRAY(buffer_vld2, uint, 64, 2, 2);
extern VECT_ARRAY(buffer_vld2, poly, 8, 16, 2);
extern VECT_ARRAY(buffer_vld2, poly, 16, 8, 2);
extern VECT_ARRAY(buffer_vld2, float, 32, 4, 2);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_ARRAY(buffer_vld2, float, 16, 8, 2);
#endif
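
/* For illustration only (a sketch of how the vld2 buffers above are sized
   and consumed; "v" is a hypothetical local): VECT_ARRAY(buffer_vld2, int,
   32, 2, 2) declares

     int32_t buffer_vld2_int32x2x2[2*2];

   i.e. N*L elements of interleaved input, which a vld2 test can load with

     int32x2x2_t v = vld2_s32(VECT_ARRAY_VAR(buffer_vld2, int, 32, 2, 2));  */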

/* Input buffers for vld3, one of each size */
extern VECT_ARRAY(buffer_vld3, int, 8, 8, 3);
extern VECT_ARRAY(buffer_vld3, int, 16, 4, 3);
extern VECT_ARRAY(buffer_vld3, int, 32, 2, 3);
extern VECT_ARRAY(buffer_vld3, int, 64, 1, 3);
extern VECT_ARRAY(buffer_vld3, uint, 8, 8, 3);
extern VECT_ARRAY(buffer_vld3, uint, 16, 4, 3);
extern VECT_ARRAY(buffer_vld3, uint, 32, 2, 3);
extern VECT_ARRAY(buffer_vld3, uint, 64, 1, 3);
extern VECT_ARRAY(buffer_vld3, poly, 8, 8, 3);
extern VECT_ARRAY(buffer_vld3, poly, 16, 4, 3);
extern VECT_ARRAY(buffer_vld3, float, 32, 2, 3);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_ARRAY(buffer_vld3, float, 16, 4, 3);
#endif
extern VECT_ARRAY(buffer_vld3, int, 8, 16, 3);
extern VECT_ARRAY(buffer_vld3, int, 16, 8, 3);
extern VECT_ARRAY(buffer_vld3, int, 32, 4, 3);
extern VECT_ARRAY(buffer_vld3, int, 64, 2, 3);
extern VECT_ARRAY(buffer_vld3, uint, 8, 16, 3);
extern VECT_ARRAY(buffer_vld3, uint, 16, 8, 3);
extern VECT_ARRAY(buffer_vld3, uint, 32, 4, 3);
extern VECT_ARRAY(buffer_vld3, uint, 64, 2, 3);
extern VECT_ARRAY(buffer_vld3, poly, 8, 16, 3);
extern VECT_ARRAY(buffer_vld3, poly, 16, 8, 3);
extern VECT_ARRAY(buffer_vld3, float, 32, 4, 3);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_ARRAY(buffer_vld3, float, 16, 8, 3);
#endif

/* Input buffers for vld4, one of each size */
extern VECT_ARRAY(buffer_vld4, int, 8, 8, 4);
extern VECT_ARRAY(buffer_vld4, int, 16, 4, 4);
extern VECT_ARRAY(buffer_vld4, int, 32, 2, 4);
extern VECT_ARRAY(buffer_vld4, int, 64, 1, 4);
extern VECT_ARRAY(buffer_vld4, uint, 8, 8, 4);
extern VECT_ARRAY(buffer_vld4, uint, 16, 4, 4);
extern VECT_ARRAY(buffer_vld4, uint, 32, 2, 4);
extern VECT_ARRAY(buffer_vld4, uint, 64, 1, 4);
extern VECT_ARRAY(buffer_vld4, poly, 8, 8, 4);
extern VECT_ARRAY(buffer_vld4, poly, 16, 4, 4);
extern VECT_ARRAY(buffer_vld4, float, 32, 2, 4);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_ARRAY(buffer_vld4, float, 16, 4, 4);
#endif
extern VECT_ARRAY(buffer_vld4, int, 8, 16, 4);
extern VECT_ARRAY(buffer_vld4, int, 16, 8, 4);
extern VECT_ARRAY(buffer_vld4, int, 32, 4, 4);
extern VECT_ARRAY(buffer_vld4, int, 64, 2, 4);
extern VECT_ARRAY(buffer_vld4, uint, 8, 16, 4);
extern VECT_ARRAY(buffer_vld4, uint, 16, 8, 4);
extern VECT_ARRAY(buffer_vld4, uint, 32, 4, 4);
extern VECT_ARRAY(buffer_vld4, uint, 64, 2, 4);
extern VECT_ARRAY(buffer_vld4, poly, 8, 16, 4);
extern VECT_ARRAY(buffer_vld4, poly, 16, 8, 4);
extern VECT_ARRAY(buffer_vld4, float, 32, 4, 4);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_ARRAY(buffer_vld4, float, 16, 8, 4);
#endif

/* Input buffers for vld2_lane */
extern VECT_VAR_DECL(buffer_vld2_lane, int, 8, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, int, 16, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, int, 32, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, int, 64, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, uint, 8, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, uint, 16, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, uint, 32, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, uint, 64, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, poly, 8, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, poly, 16, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, float, 32, 2)[2];
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_VAR_DECL(buffer_vld2_lane, float, 16, 2)[2];
#endif
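
/* For illustration only: VECT_VAR_DECL(buffer_vld2_lane, int, 32, 2)[2]
   above declares int32_t buffer_vld2_lane_int32x2[2], i.e. one element per
   vector of the int32x2x2_t that a vld2_lane test loads into a single lane
   (and likewise the [3] and [4] entries below for vld3_lane and
   vld4_lane).  */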

/* Input buffers for vld3_lane */
extern VECT_VAR_DECL(buffer_vld3_lane, int, 8, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, int, 16, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, int, 32, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, int, 64, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, uint, 8, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, uint, 16, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, uint, 32, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, uint, 64, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, poly, 8, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, poly, 16, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, float, 32, 3)[3];
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_VAR_DECL(buffer_vld3_lane, float, 16, 3)[3];
#endif

/* Input buffers for vld4_lane */
extern VECT_VAR_DECL(buffer_vld4_lane, int, 8, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, int, 16, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, int, 32, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, int, 64, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, uint, 8, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, uint, 16, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, uint, 32, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, uint, 64, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, poly, 8, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, poly, 16, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, float, 32, 4)[4];
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_VAR_DECL(buffer_vld4_lane, float, 16, 4)[4];
#endif

/* Output buffers, one of each size */
static ARRAY(result, int, 8, 8);
static ARRAY(result, int, 16, 4);
static ARRAY(result, int, 32, 2);
static ARRAY(result, int, 64, 1);
static ARRAY(result, uint, 8, 8);
static ARRAY(result, uint, 16, 4);
static ARRAY(result, uint, 32, 2);
static ARRAY(result, uint, 64, 1);
static ARRAY(result, poly, 8, 8);
static ARRAY(result, poly, 16, 4);
static ARRAY(result, float, 32, 2);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
static ARRAY(result, float, 16, 4);
#endif
static ARRAY(result, int, 8, 16);
static ARRAY(result, int, 16, 8);
static ARRAY(result, int, 32, 4);
static ARRAY(result, int, 64, 2);
static ARRAY(result, uint, 8, 16);
static ARRAY(result, uint, 16, 8);
static ARRAY(result, uint, 32, 4);
static ARRAY(result, uint, 64, 2);
static ARRAY(result, poly, 8, 16);
static ARRAY(result, poly, 16, 8);
static ARRAY(result, float, 32, 4);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
static ARRAY(result, float, 16, 8);
#endif
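
/* For illustration only (a sketch of how the result buffers above are
   filled; "vector_res" is a hypothetical test-local variable): a test
   computes into a Neon variable, stores it back with vst1, then calls one
   of the dump helpers below, e.g.

     vst1q_s32(VECT_VAR(result, int, 32, 4),
               VECT_VAR(vector_res, int, 32, 4));
     dump_results("some_test");  */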

/* Dump results (generic function) */
static void dump_results (const char *test_name)
{
  int i;

  fprintf(ref_file, "\n%s output:\n", test_name);
  fprintf(gcc_tests_file, "\n%s output:\n", test_name);

  DUMP(test_name, int, 8, 8, PRId8);
  DUMP(test_name, int, 16, 4, PRId16);
  DUMP(test_name, int, 32, 2, PRId32);
  DUMP(test_name, int, 64, 1, PRId64);
  DUMP(test_name, uint, 8, 8, PRIu8);
  DUMP(test_name, uint, 16, 4, PRIu16);
  DUMP(test_name, uint, 32, 2, PRIu32);
  DUMP(test_name, uint, 64, 1, PRIu64);
  DUMP_POLY(test_name, poly, 8, 8, PRIu8);
  DUMP_POLY(test_name, poly, 16, 4, PRIu16);
  DUMP_FP(test_name, float, 32, 2, PRIx32);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
  DUMP_FP16(test_name, float, 16, 4, PRIu16);
#endif

  DUMP(test_name, int, 8, 16, PRId8);
  DUMP(test_name, int, 16, 8, PRId16);
  DUMP(test_name, int, 32, 4, PRId32);
  DUMP(test_name, int, 64, 2, PRId64);
  DUMP(test_name, uint, 8, 16, PRIu8);
  DUMP(test_name, uint, 16, 8, PRIu16);
  DUMP(test_name, uint, 32, 4, PRIu32);
  DUMP(test_name, uint, 64, 2, PRIu64);
  DUMP_POLY(test_name, poly, 8, 16, PRIu8);
  DUMP_POLY(test_name, poly, 16, 8, PRIu16);
  DUMP_FP(test_name, float, 32, 4, PRIx32);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
  DUMP_FP16(test_name, float, 16, 8, PRIu16);
#endif
}

/* Dump results in hex (generic function) */
static void dump_results_hex2 (const char *test_name, const char* comment)
{
  int i;

  fprintf(ref_file, "\n%s%s output:\n", test_name, comment);
  fprintf(gcc_tests_file, "\n%s%s output:\n", test_name, comment);

  DUMP(test_name, int, 8, 8, PRIx8);
  DUMP(test_name, int, 16, 4, PRIx16);
  DUMP(test_name, int, 32, 2, PRIx32);
  DUMP(test_name, int, 64, 1, PRIx64);
  DUMP(test_name, uint, 8, 8, PRIx8);
  DUMP(test_name, uint, 16, 4, PRIx16);
  DUMP(test_name, uint, 32, 2, PRIx32);
  DUMP(test_name, uint, 64, 1, PRIx64);
  DUMP_POLY(test_name, poly, 8, 8, PRIx8);
  DUMP_POLY(test_name, poly, 16, 4, PRIx16);
  DUMP_FP(test_name, float, 32, 2, PRIx32);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
  DUMP_FP16(test_name, float, 16, 4, PRIx16);
#endif

  DUMP(test_name, int, 8, 16, PRIx8);
  DUMP(test_name, int, 16, 8, PRIx16);
  DUMP(test_name, int, 32, 4, PRIx32);
  DUMP(test_name, int, 64, 2, PRIx64);
  DUMP(test_name, uint, 8, 16, PRIx8);
  DUMP(test_name, uint, 16, 8, PRIx16);
  DUMP(test_name, uint, 32, 4, PRIx32);
  DUMP(test_name, uint, 64, 2, PRIx64);
  DUMP_POLY(test_name, poly, 8, 16, PRIx8);
  DUMP_POLY(test_name, poly, 16, 8, PRIx16);
  DUMP_FP(test_name, float, 32, 4, PRIx32);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
  DUMP_FP16(test_name, float, 16, 8, PRIx16);
#endif
}

static void dump_results_hex (const char *test_name)
{
  dump_results_hex2(test_name, "");
}
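
/* Usage note (illustrative only, argument strings hypothetical):
   dump_results() prints integer results in decimal (and float results as
   hex bit patterns), while dump_results_hex2() prints all buffers in hex
   and lets the caller tag the block, e.g.

     dump_results_hex2("vqadd", " (with saturation)");

   which labels the output "vqadd (with saturation) output:".  */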

#ifndef STM_ARM_NEON_MODELS

/* This hack copes with compilers/libcs which may not provide endian.h,
   and with cross-compilers such as llvm which include the host's
   endian.h. */
#ifndef __arm__
#include <endian.h>
#define THIS_ENDIAN __BYTE_ORDER
#else /* __arm__ */
#ifdef __ARMEL__
#define THIS_ENDIAN __LITTLE_ENDIAN
#else /* __ARMEL__ */
#define THIS_ENDIAN __BIG_ENDIAN
#endif
#endif /* __arm__ */

#if THIS_ENDIAN == __LITTLE_ENDIAN

typedef union {
  struct {
    int _xxx:27;
    unsigned int QC:1;
    int V:1;
    int C:1;
    int Z:1;
    int N:1;
  } b;
  unsigned int word;
} _ARM_FPSCR;

#else /* __BIG_ENDIAN */

typedef union {
  struct {
    int N:1;
    int Z:1;
    int C:1;
    int V:1;
    unsigned int QC:1;
    int _dnm:27;
  } b;
  unsigned int word;
} _ARM_FPSCR;

#endif /* __BIG_ENDIAN */

#ifdef __ARMCC_VERSION
register _ARM_FPSCR _afpscr_for_qc __asm("fpscr");
# define Neon_Cumulative_Sat _afpscr_for_qc.b.QC
# define Set_Neon_Cumulative_Sat(x, depend) {Neon_Cumulative_Sat = (x);}
#else
/* GCC/ARM does not know this register */
# define Neon_Cumulative_Sat __read_neon_cumulative_sat()
/* We need a fake dependency to ensure correct ordering of asm
   statements to preset the QC flag value, and Neon operators writing
   to QC. */
#define Set_Neon_Cumulative_Sat(x, depend) \
  __set_neon_cumulative_sat((x), (depend))

# if defined(__aarch64__)
static volatile int __read_neon_cumulative_sat (void) {
  _ARM_FPSCR _afpscr_for_qc;
  asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));
  return _afpscr_for_qc.b.QC;
}

#define __set_neon_cumulative_sat(x, depend) { \
    _ARM_FPSCR _afpscr_for_qc; \
    asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc)); \
    _afpscr_for_qc.b.QC = x; \
    asm volatile ("msr fpsr,%1" : "=X" (depend) : "r" (_afpscr_for_qc)); \
  }

# else
static volatile int __read_neon_cumulative_sat (void) {
  _ARM_FPSCR _afpscr_for_qc;
  asm volatile ("vmrs %0,fpscr" : "=r" (_afpscr_for_qc));
  return _afpscr_for_qc.b.QC;
}

#define __set_neon_cumulative_sat(x, depend) { \
    _ARM_FPSCR _afpscr_for_qc; \
    asm volatile ("vmrs %0,fpscr" : "=r" (_afpscr_for_qc)); \
    _afpscr_for_qc.b.QC = x; \
    asm volatile ("vmsr fpscr,%1" : "=X" (depend) : "r" (_afpscr_for_qc)); \
  }

# endif
#endif

#endif /* STM_ARM_NEON_MODELS */

static void dump_neon_cumulative_sat(const char* msg, const char *name,
                                     const char* t1, int w, int n)
{
  fprintf(ref_file, "%s:%d:%s Neon cumulative saturation %d\n", msg,
          result_idx++, name, Neon_Cumulative_Sat);
  fprintf(gcc_tests_file,
          "int VECT_VAR(expected_cumulative_sat,%s,%d,%d) = %d;\n",
          t1, w, n, Neon_Cumulative_Sat);
}

/* Clean output buffers before execution */
static void clean_results (void)
{
  result_idx = 0;
  CLEAN(result, int, 8, 8);
  CLEAN(result, int, 16, 4);
  CLEAN(result, int, 32, 2);
  CLEAN(result, int, 64, 1);
  CLEAN(result, uint, 8, 8);
  CLEAN(result, uint, 16, 4);
  CLEAN(result, uint, 32, 2);
  CLEAN(result, uint, 64, 1);
  CLEAN(result, poly, 8, 8);
  CLEAN(result, poly, 16, 4);
  CLEAN(result, float, 32, 2);

  CLEAN(result, int, 8, 16);
  CLEAN(result, int, 16, 8);
  CLEAN(result, int, 32, 4);
  CLEAN(result, int, 64, 2);
  CLEAN(result, uint, 8, 16);
  CLEAN(result, uint, 16, 8);
  CLEAN(result, uint, 32, 4);
  CLEAN(result, uint, 64, 2);
  CLEAN(result, poly, 8, 16);
  CLEAN(result, poly, 16, 8);
  CLEAN(result, float, 32, 4);
}
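
/* Illustrative sketch of how the QC helpers above are meant to be used in a
   saturating-operation test (variable names hypothetical):

     Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, int, 16, 8));
     VECT_VAR(vector_res, int, 16, 8) =
       vqaddq_s16(VECT_VAR(vector1, int, 16, 8),
                  VECT_VAR(vector2, int, 16, 8));
     dump_neon_cumulative_sat("vqaddq_s16", "int16x8", "int", 16, 8);

   The second argument to Set_Neon_Cumulative_Sat is the variable written by
   the saturating intrinsic; it only serves as the fake asm dependency
   described in the comment above. */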

/* Helpers to declare variables of various types */
#define DECL_VARIABLE(VAR, T1, W, N) \
  volatile VECT_TYPE(T1, W, N) VECT_VAR(VAR, T1, W, N)

#define DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR) \
  DECL_VARIABLE(VAR, int, 8, 8); \
  DECL_VARIABLE(VAR, int, 16, 4); \
  DECL_VARIABLE(VAR, int, 32, 2); \
  DECL_VARIABLE(VAR, int, 64, 1)

#define DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR) \
  DECL_VARIABLE(VAR, uint, 8, 8); \
  DECL_VARIABLE(VAR, uint, 16, 4); \
  DECL_VARIABLE(VAR, uint, 32, 2); \
  DECL_VARIABLE(VAR, uint, 64, 1)

#define DECL_VARIABLE_128BITS_SIGNED_VARIANTS(VAR) \
  DECL_VARIABLE(VAR, int, 8, 16); \
  DECL_VARIABLE(VAR, int, 16, 8); \
  DECL_VARIABLE(VAR, int, 32, 4); \
  DECL_VARIABLE(VAR, int, 64, 2)

#define DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR) \
  DECL_VARIABLE(VAR, uint, 8, 16); \
  DECL_VARIABLE(VAR, uint, 16, 8); \
  DECL_VARIABLE(VAR, uint, 32, 4); \
  DECL_VARIABLE(VAR, uint, 64, 2)

#define DECL_VARIABLE_64BITS_VARIANTS(VAR) \
  DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR); \
  DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR); \
  DECL_VARIABLE(VAR, poly, 8, 8); \
  DECL_VARIABLE(VAR, poly, 16, 4); \
  DECL_VARIABLE(VAR, float, 32, 2)

#define DECL_VARIABLE_128BITS_VARIANTS(VAR) \
  DECL_VARIABLE_128BITS_SIGNED_VARIANTS(VAR); \
  DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR); \
  DECL_VARIABLE(VAR, poly, 8, 16); \
  DECL_VARIABLE(VAR, poly, 16, 8); \
  DECL_VARIABLE(VAR, float, 32, 4)

#define DECL_VARIABLE_ALL_VARIANTS(VAR) \
  DECL_VARIABLE_64BITS_VARIANTS(VAR); \
  DECL_VARIABLE_128BITS_VARIANTS(VAR)

#define DECL_VARIABLE_SIGNED_VARIANTS(VAR) \
  DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR); \
  DECL_VARIABLE_128BITS_SIGNED_VARIANTS(VAR)

#define DECL_VARIABLE_UNSIGNED_VARIANTS(VAR) \
  DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR); \
  DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR)

/* Helpers to initialize vectors */
#define VDUP(VAR, Q, T1, T2, W, N, V) \
  VECT_VAR(VAR, T1, W, N) = vdup##Q##_n_##T2##W(V)

#define TEST_VSET_LANE(VAR, Q, T1, T2, W, N, L, V) \
  VECT_VAR(VAR, T1, W, N) = vset##Q##_lane_##T2##W(V, \
                                                   VECT_VAR(VAR, T1, W, N), \
                                                   L)

/* We need to load initial values first, so rely on VLD1 */
#define VLOAD(VAR, BUF, Q, T1, T2, W, N) \
  VECT_VAR(VAR, T1, W, N) = vld1##Q##_##T2##W(VECT_VAR(BUF, T1, W, N))
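
/* For illustration only (a sketch of how the declaration and initialization
   helpers above combine in a test; "vector" is a hypothetical name):

     DECL_VARIABLE_ALL_VARIANTS(vector);   // one Neon variable per variant
     clean_results();
     VLOAD(vector, buffer, , int, s, 16, 4);   // vector_int16x4 = vld1_s16(buffer_int16x4)
     VLOAD(vector, buffer, q, int, s, 16, 8);  // vector_int16x8 = vld1q_s16(buffer_int16x8)
     VDUP(vector, q, uint, u, 32, 4, 0x42);    // vector_uint32x4 = vdupq_n_u32(0x42)

   The TEST_MACRO_*_VARIANTS helpers below apply such a macro to every
   supported type at once, e.g.
   TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer). */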

/* Helpers for macros with 1 constant and 5 variable arguments */
#define TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR) \
  MACRO(VAR, , int, s, 8, 8); \
  MACRO(VAR, , int, s, 16, 4); \
  MACRO(VAR, , int, s, 32, 2); \
  MACRO(VAR, , int, s, 64, 1)

#define TEST_MACRO_64BITS_UNSIGNED_VARIANTS_1_5(MACRO, VAR) \
  MACRO(VAR, , uint, u, 8, 8); \
  MACRO(VAR, , uint, u, 16, 4); \
  MACRO(VAR, , uint, u, 32, 2); \
  MACRO(VAR, , uint, u, 64, 1)

#define TEST_MACRO_128BITS_SIGNED_VARIANTS_1_5(MACRO, VAR) \
  MACRO(VAR, q, int, s, 8, 16); \
  MACRO(VAR, q, int, s, 16, 8); \
  MACRO(VAR, q, int, s, 32, 4); \
  MACRO(VAR, q, int, s, 64, 2)

#define TEST_MACRO_128BITS_UNSIGNED_VARIANTS_1_5(MACRO, VAR) \
  MACRO(VAR, q, uint, u, 8, 16); \
  MACRO(VAR, q, uint, u, 16, 8); \
  MACRO(VAR, q, uint, u, 32, 4); \
  MACRO(VAR, q, uint, u, 64, 2)

#define TEST_MACRO_64BITS_VARIANTS_1_5(MACRO, VAR) \
  TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR); \
  TEST_MACRO_64BITS_UNSIGNED_VARIANTS_1_5(MACRO, VAR)

#define TEST_MACRO_128BITS_VARIANTS_1_5(MACRO, VAR) \
  TEST_MACRO_128BITS_SIGNED_VARIANTS_1_5(MACRO, VAR); \
  TEST_MACRO_128BITS_UNSIGNED_VARIANTS_1_5(MACRO, VAR)

#define TEST_MACRO_ALL_VARIANTS_1_5(MACRO, VAR) \
  TEST_MACRO_64BITS_VARIANTS_1_5(MACRO, VAR); \
  TEST_MACRO_128BITS_VARIANTS_1_5(MACRO, VAR)

#define TEST_MACRO_SIGNED_VARIANTS_1_5(MACRO, VAR) \
  TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR); \
  TEST_MACRO_128BITS_SIGNED_VARIANTS_1_5(MACRO, VAR)

/* Helpers for macros with 2 constant and 5 variable arguments */
#define TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2) \
  MACRO(VAR1, VAR2, , int, s, 8, 8); \
  MACRO(VAR1, VAR2, , int, s, 16, 4); \
  MACRO(VAR1, VAR2, , int, s, 32, 2); \
  MACRO(VAR1, VAR2, , int, s, 64, 1)

#define TEST_MACRO_64BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2) \
  MACRO(VAR1, VAR2, , uint, u, 8, 8); \
  MACRO(VAR1, VAR2, , uint, u, 16, 4); \
  MACRO(VAR1, VAR2, , uint, u, 32, 2); \
  MACRO(VAR1, VAR2, , uint, u, 64, 1)

#define TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2) \
  MACRO(VAR1, VAR2, q, int, s, 8, 16); \
  MACRO(VAR1, VAR2, q, int, s, 16, 8); \
  MACRO(VAR1, VAR2, q, int, s, 32, 4); \
  MACRO(VAR1, VAR2, q, int, s, 64, 2)

#define TEST_MACRO_128BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2) \
  MACRO(VAR1, VAR2, q, uint, u, 8, 16); \
  MACRO(VAR1, VAR2, q, uint, u, 16, 8); \
  MACRO(VAR1, VAR2, q, uint, u, 32, 4); \
  MACRO(VAR1, VAR2, q, uint, u, 64, 2)

#define TEST_MACRO_64BITS_VARIANTS_2_5(MACRO, VAR1, VAR2) \
  TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
  TEST_MACRO_64BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
  MACRO(VAR1, VAR2, , poly, p, 8, 8); \
  MACRO(VAR1, VAR2, , poly, p, 16, 4)

#define TEST_MACRO_128BITS_VARIANTS_2_5(MACRO, VAR1, VAR2) \
  TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
  TEST_MACRO_128BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
  MACRO(VAR1, VAR2, q, poly, p, 8, 16); \
  MACRO(VAR1, VAR2, q, poly, p, 16, 8)

#define TEST_MACRO_ALL_VARIANTS_2_5(MACRO, VAR1, VAR2) \
  TEST_MACRO_64BITS_VARIANTS_2_5(MACRO, VAR1, VAR2); \
  TEST_MACRO_128BITS_VARIANTS_2_5(MACRO, VAR1, VAR2)

#define TEST_MACRO_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2) \
  TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
  TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2)

#endif /* _STM_ARM_NEON_REF_H_ */