Lines Matching refs:lhs_offset
8604 std::int32_t lhs_offset, std::int32_t rhs_offset,
8624 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
8633 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
8659 std::int32_t lhs_offset, std::int32_t rhs_offset,
8679 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
8688 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
8714 std::int32_t lhs_offset, std::int32_t rhs_offset,
8734 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
8743 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
8769 std::int32_t lhs_offset, std::int32_t rhs_offset,
8789 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
8798 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
8824 std::int32_t lhs_offset, std::int32_t rhs_offset,
8844 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
8853 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
8879 std::int32_t lhs_offset, std::int32_t rhs_offset,
8899 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
8908 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
8934 std::int32_t lhs_offset, std::int32_t rhs_offset,
8954 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
8963 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
8989 std::int32_t lhs_offset, std::int32_t rhs_offset,
9009 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9018 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9044 std::int32_t lhs_offset, std::int32_t rhs_offset,
9064 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9073 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9077 zip_1x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9102 std::int32_t lhs_offset, std::int32_t rhs_offset,
9122 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9131 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9135 zip_1x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9160 std::int32_t lhs_offset, std::int32_t rhs_offset,
9180 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9189 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9193 zip_1x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9218 std::int32_t lhs_offset, std::int32_t rhs_offset,
9238 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9247 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9251 zip_1x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9276 std::int32_t lhs_offset, std::int32_t rhs_offset,
9296 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9305 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9309 zip_1x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9334 std::int32_t lhs_offset, std::int32_t rhs_offset,
9354 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9363 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9367 zip_1x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9392 std::int32_t lhs_offset, std::int32_t rhs_offset,
9412 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9421 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9425 zip_1x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9450 std::int32_t lhs_offset, std::int32_t rhs_offset,
9470 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9479 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9483 zip_1x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9508 std::int32_t lhs_offset, std::int32_t rhs_offset,
9528 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9537 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9541 zip_2x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9566 std::int32_t lhs_offset, std::int32_t rhs_offset,
9586 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9595 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9599 zip_2x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9624 std::int32_t lhs_offset, std::int32_t rhs_offset,
9644 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9653 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9657 zip_2x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9682 std::int32_t lhs_offset, std::int32_t rhs_offset,
9702 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9711 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9715 zip_2x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9740 std::int32_t lhs_offset, std::int32_t rhs_offset,
9760 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9769 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9773 zip_2x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9798 std::int32_t lhs_offset, std::int32_t rhs_offset,
9818 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9827 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9831 zip_2x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9856 std::int32_t lhs_offset, std::int32_t rhs_offset,
9876 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9885 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9889 zip_2x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9914 std::int32_t lhs_offset, std::int32_t rhs_offset,
9934 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
9943 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9947 zip_2x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
9972 std::int32_t lhs_offset, std::int32_t rhs_offset,
9994 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10003 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10042 std::int32_t lhs_offset, std::int32_t rhs_offset,
10064 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10073 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10112 std::int32_t lhs_offset, std::int32_t rhs_offset,
10134 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10143 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10182 std::int32_t lhs_offset, std::int32_t rhs_offset,
10204 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10213 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10252 std::int32_t lhs_offset, std::int32_t rhs_offset,
10274 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10283 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10322 std::int32_t lhs_offset, std::int32_t rhs_offset,
10344 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10353 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10392 std::int32_t lhs_offset, std::int32_t rhs_offset,
10414 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10423 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10462 std::int32_t lhs_offset, std::int32_t rhs_offset,
10484 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10493 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10532 std::int32_t lhs_offset, std::int32_t rhs_offset,
10554 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10563 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10567 zip_1x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10607 std::int32_t lhs_offset, std::int32_t rhs_offset,
10629 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10638 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10642 zip_1x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10682 std::int32_t lhs_offset, std::int32_t rhs_offset,
10704 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10713 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10717 zip_1x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10757 std::int32_t lhs_offset, std::int32_t rhs_offset,
10779 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10788 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10792 zip_1x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10832 std::int32_t lhs_offset, std::int32_t rhs_offset,
10854 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10863 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10867 zip_1x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10907 std::int32_t lhs_offset, std::int32_t rhs_offset,
10929 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
10938 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10942 zip_1x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
10982 std::int32_t lhs_offset, std::int32_t rhs_offset,
11004 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11013 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11017 zip_1x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11057 std::int32_t lhs_offset, std::int32_t rhs_offset,
11079 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11088 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11092 zip_1x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11132 std::int32_t lhs_offset, std::int32_t rhs_offset,
11154 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11163 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11167 zip_2x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11207 std::int32_t lhs_offset, std::int32_t rhs_offset,
11229 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11238 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11242 zip_2x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11282 std::int32_t lhs_offset, std::int32_t rhs_offset,
11304 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11313 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11317 zip_2x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11357 std::int32_t lhs_offset, std::int32_t rhs_offset,
11379 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11388 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11392 zip_2x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11432 std::int32_t lhs_offset, std::int32_t rhs_offset,
11454 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11463 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11467 zip_2x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11507 std::int32_t lhs_offset, std::int32_t rhs_offset,
11529 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11538 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11542 zip_2x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11582 std::int32_t lhs_offset, std::int32_t rhs_offset,
11604 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11613 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11617 zip_2x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11657 std::int32_t lhs_offset, std::int32_t rhs_offset,
11679 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11688 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11692 zip_2x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11732 std::int32_t lhs_offset, std::int32_t rhs_offset,
11754 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11763 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11802 std::int32_t lhs_offset, std::int32_t rhs_offset,
11824 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11833 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11872 std::int32_t lhs_offset, std::int32_t rhs_offset,
11894 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11903 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
11942 std::int32_t lhs_offset, std::int32_t rhs_offset,
11964 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
11973 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12012 std::int32_t lhs_offset, std::int32_t rhs_offset,
12034 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12043 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12082 std::int32_t lhs_offset, std::int32_t rhs_offset,
12104 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12113 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12152 std::int32_t lhs_offset, std::int32_t rhs_offset,
12174 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12183 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12222 std::int32_t lhs_offset, std::int32_t rhs_offset,
12244 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12253 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12292 std::int32_t lhs_offset, std::int32_t rhs_offset,
12314 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12323 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12327 zip_1x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12367 std::int32_t lhs_offset, std::int32_t rhs_offset,
12389 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12398 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12402 zip_1x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12442 std::int32_t lhs_offset, std::int32_t rhs_offset,
12464 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12473 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12477 zip_1x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12517 std::int32_t lhs_offset, std::int32_t rhs_offset,
12539 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12548 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12552 zip_1x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12592 std::int32_t lhs_offset, std::int32_t rhs_offset,
12614 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12623 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12627 zip_1x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12667 std::int32_t lhs_offset, std::int32_t rhs_offset,
12689 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12698 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12702 zip_1x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12742 std::int32_t lhs_offset, std::int32_t rhs_offset,
12764 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12773 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12777 zip_1x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12817 std::int32_t lhs_offset, std::int32_t rhs_offset,
12839 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12848 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12852 zip_1x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12892 std::int32_t lhs_offset, std::int32_t rhs_offset,
12914 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12923 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12927 zip_2x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
12967 std::int32_t lhs_offset, std::int32_t rhs_offset,
12989 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
12998 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13002 zip_2x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13042 std::int32_t lhs_offset, std::int32_t rhs_offset,
13064 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13073 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13077 zip_2x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13117 std::int32_t lhs_offset, std::int32_t rhs_offset,
13139 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13148 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13152 zip_2x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13192 std::int32_t lhs_offset, std::int32_t rhs_offset,
13214 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13223 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13227 zip_2x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13267 std::int32_t lhs_offset, std::int32_t rhs_offset,
13289 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13298 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13302 zip_2x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13342 std::int32_t lhs_offset, std::int32_t rhs_offset,
13364 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13373 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13377 zip_2x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13417 std::int32_t lhs_offset, std::int32_t rhs_offset,
13439 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13448 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13452 zip_2x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13491 std::int32_t k, std::int32_t lhs_offset,
13510 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13519 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13544 std::int32_t k, std::int32_t lhs_offset,
13563 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13572 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13597 std::int32_t k, std::int32_t lhs_offset,
13616 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13625 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13650 std::int32_t k, std::int32_t lhs_offset,
13669 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13678 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13703 std::int32_t k, std::int32_t lhs_offset,
13722 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13731 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13756 std::int32_t k, std::int32_t lhs_offset,
13775 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13784 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13809 std::int32_t k, std::int32_t lhs_offset,
13828 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13837 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13862 std::int32_t k, std::int32_t lhs_offset,
13881 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13890 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13915 std::int32_t k, std::int32_t lhs_offset,
13934 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13943 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13947 zip_1x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
13971 std::int32_t k, std::int32_t lhs_offset,
13990 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
13999 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14003 zip_1x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14027 std::int32_t k, std::int32_t lhs_offset,
14046 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14055 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14059 zip_1x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14083 std::int32_t k, std::int32_t lhs_offset,
14102 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14111 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14115 zip_1x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14139 std::int32_t k, std::int32_t lhs_offset,
14158 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14167 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14171 zip_1x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14195 std::int32_t k, std::int32_t lhs_offset,
14214 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14223 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14227 zip_1x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14251 std::int32_t k, std::int32_t lhs_offset,
14270 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14279 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14283 zip_1x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14307 std::int32_t k, std::int32_t lhs_offset,
14326 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14335 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14339 zip_1x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14363 std::int32_t k, std::int32_t lhs_offset,
14382 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14391 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14395 zip_2x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14419 std::int32_t k, std::int32_t lhs_offset,
14438 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14447 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14451 zip_2x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14475 std::int32_t k, std::int32_t lhs_offset,
14494 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14503 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14507 zip_2x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14531 std::int32_t k, std::int32_t lhs_offset,
14550 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14559 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14563 zip_2x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14587 std::int32_t k, std::int32_t lhs_offset,
14606 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14615 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14619 zip_2x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14643 std::int32_t k, std::int32_t lhs_offset,
14662 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14671 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14675 zip_2x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14699 std::int32_t k, std::int32_t lhs_offset,
14718 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14727 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14731 zip_2x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14755 std::int32_t k, std::int32_t lhs_offset,
14774 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14783 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14787 zip_2x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14811 std::int32_t k, std::int32_t lhs_offset,
14832 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14841 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14879 std::int32_t k, std::int32_t lhs_offset,
14900 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14909 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
14947 std::int32_t k, std::int32_t lhs_offset,
14968 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
14977 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15015 std::int32_t k, std::int32_t lhs_offset,
15036 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15045 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15083 std::int32_t k, std::int32_t lhs_offset,
15104 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15113 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15151 std::int32_t k, std::int32_t lhs_offset,
15172 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15181 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15219 std::int32_t k, std::int32_t lhs_offset,
15240 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15249 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15287 std::int32_t k, std::int32_t lhs_offset,
15308 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15317 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15355 std::int32_t k, std::int32_t lhs_offset,
15376 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15385 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15389 zip_1x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15428 std::int32_t k, std::int32_t lhs_offset,
15449 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15458 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15462 zip_1x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15501 std::int32_t k, std::int32_t lhs_offset,
15522 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15531 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15535 zip_1x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15574 std::int32_t k, std::int32_t lhs_offset,
15595 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15604 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15608 zip_1x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15647 std::int32_t k, std::int32_t lhs_offset,
15668 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15677 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15681 zip_1x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15720 std::int32_t k, std::int32_t lhs_offset,
15741 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15750 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15754 zip_1x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15793 std::int32_t k, std::int32_t lhs_offset,
15814 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15823 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15827 zip_1x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15866 std::int32_t k, std::int32_t lhs_offset,
15887 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15896 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15900 zip_1x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15939 std::int32_t k, std::int32_t lhs_offset,
15960 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
15969 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
15973 zip_2x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16012 std::int32_t k, std::int32_t lhs_offset,
16033 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16042 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16046 zip_2x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16085 std::int32_t k, std::int32_t lhs_offset,
16106 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16115 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16119 zip_2x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16158 std::int32_t k, std::int32_t lhs_offset,
16179 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16188 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16192 zip_2x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16231 std::int32_t k, std::int32_t lhs_offset,
16252 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16261 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16265 zip_2x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16304 std::int32_t k, std::int32_t lhs_offset,
16325 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16334 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16338 zip_2x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16377 std::int32_t k, std::int32_t lhs_offset,
16398 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16407 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16411 zip_2x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16450 std::int32_t k, std::int32_t lhs_offset,
16471 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16480 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16484 zip_2x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16523 std::int32_t k, std::int32_t lhs_offset,
16544 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16553 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16591 std::int32_t k, std::int32_t lhs_offset,
16612 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16621 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16659 std::int32_t k, std::int32_t lhs_offset,
16680 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16689 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16727 std::int32_t k, std::int32_t lhs_offset,
16748 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16757 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16795 std::int32_t k, std::int32_t lhs_offset,
16816 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16825 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16863 std::int32_t k, std::int32_t lhs_offset,
16884 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16893 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16931 std::int32_t k, std::int32_t lhs_offset,
16952 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
16961 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
16999 std::int32_t k, std::int32_t lhs_offset,
17020 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17029 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17067 std::int32_t k, std::int32_t lhs_offset,
17088 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17097 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17101 zip_1x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17140 std::int32_t k, std::int32_t lhs_offset,
17161 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17170 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17174 zip_1x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17213 std::int32_t k, std::int32_t lhs_offset,
17234 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17243 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17247 zip_1x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset
17286 std::int32_t k, std::int32_t lhs_offset,
17307 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17316 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17320 zip_1x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17359 std::int32_t k, std::int32_t lhs_offset,
17380 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17389 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17393 zip_1x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17432 std::int32_t k, std::int32_t lhs_offset,
17453 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17462 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17466 zip_1x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17505 std::int32_t k, std::int32_t lhs_offset,
17526 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17535 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17539 zip_1x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17578 std::int32_t k, std::int32_t lhs_offset,
17599 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17608 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17612 zip_1x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17651 std::int32_t k, std::int32_t lhs_offset,
17672 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17681 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17685 zip_2x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17724 std::int32_t k, std::int32_t lhs_offset,
17745 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17754 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17758 zip_2x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17797 std::int32_t k, std::int32_t lhs_offset,
17818 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17827 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17831 zip_2x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17870 std::int32_t k, std::int32_t lhs_offset,
17891 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17900 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17904 zip_2x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17943 std::int32_t k, std::int32_t lhs_offset,
17964 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
17973 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
17977 zip_2x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18016 std::int32_t k, std::int32_t lhs_offset,
18037 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
18046 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18050 zip_2x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18089 std::int32_t k, std::int32_t lhs_offset,
18110 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
18119 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18123 zip_2x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18162 std::int32_t k, std::int32_t lhs_offset,
18183 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset;
18192 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18196 zip_2x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18236 std::int32_t lhs_offset, std::int32_t rhs_offset,
18253 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18259 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18283 std::int32_t lhs_offset, std::int32_t rhs_offset,
18300 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18306 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18330 std::int32_t lhs_offset, std::int32_t rhs_offset,
18347 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18353 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18377 std::int32_t lhs_offset, std::int32_t rhs_offset,
18394 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18400 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18424 std::int32_t lhs_offset, std::int32_t rhs_offset,
18441 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18447 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18471 std::int32_t lhs_offset, std::int32_t rhs_offset,
18488 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18494 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18518 std::int32_t lhs_offset, std::int32_t rhs_offset,
18535 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18541 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18565 std::int32_t lhs_offset, std::int32_t rhs_offset,
18582 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18588 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18612 std::int32_t lhs_offset, std::int32_t rhs_offset,
18629 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18635 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18639 zip_1x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18663 std::int32_t lhs_offset, std::int32_t rhs_offset,
18680 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18686 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18690 zip_1x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18714 std::int32_t lhs_offset, std::int32_t rhs_offset,
18731 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18737 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18741 zip_1x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18765 std::int32_t lhs_offset, std::int32_t rhs_offset,
18782 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18788 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18792 zip_1x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18816 std::int32_t lhs_offset, std::int32_t rhs_offset,
18833 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18839 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18843 zip_1x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18867 std::int32_t lhs_offset, std::int32_t rhs_offset,
18884 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18890 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18894 zip_1x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18918 std::int32_t lhs_offset, std::int32_t rhs_offset,
18935 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18941 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18945 zip_1x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18969 std::int32_t lhs_offset, std::int32_t rhs_offset,
18986 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
18992 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
18996 zip_1x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19020 std::int32_t lhs_offset, std::int32_t rhs_offset,
19037 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19043 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19047 zip_2x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19071 std::int32_t lhs_offset, std::int32_t rhs_offset,
19088 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19094 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19098 zip_2x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19122 std::int32_t lhs_offset, std::int32_t rhs_offset,
19139 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19145 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19149 zip_2x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19173 std::int32_t lhs_offset, std::int32_t rhs_offset,
19190 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19196 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19200 zip_2x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19224 std::int32_t lhs_offset, std::int32_t rhs_offset,
19241 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19247 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19251 zip_2x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19275 std::int32_t lhs_offset, std::int32_t rhs_offset,
19292 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19298 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19302 zip_2x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19326 std::int32_t lhs_offset, std::int32_t rhs_offset,
19343 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19349 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19353 zip_2x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19377 std::int32_t lhs_offset, std::int32_t rhs_offset,
19394 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19400 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19404 zip_2x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19428 std::int32_t lhs_offset, std::int32_t rhs_offset,
19447 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19453 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19488 std::int32_t lhs_offset, std::int32_t rhs_offset,
19507 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19513 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19548 std::int32_t lhs_offset, std::int32_t rhs_offset,
19567 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19573 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19608 std::int32_t lhs_offset, std::int32_t rhs_offset,
19627 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19633 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19668 std::int32_t lhs_offset, std::int32_t rhs_offset,
19687 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19693 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19728 std::int32_t lhs_offset, std::int32_t rhs_offset,
19747 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19753 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19788 std::int32_t lhs_offset, std::int32_t rhs_offset,
19807 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19813 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19848 std::int32_t lhs_offset, std::int32_t rhs_offset,
19867 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19873 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19908 std::int32_t lhs_offset, std::int32_t rhs_offset,
19927 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
19933 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19937 zip_1x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
19975 std::int32_t lhs_offset, std::int32_t rhs_offset,
19994 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20000 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20004 zip_1x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20042 std::int32_t lhs_offset, std::int32_t rhs_offset,
20061 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20067 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20071 zip_1x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20109 std::int32_t lhs_offset, std::int32_t rhs_offset,
20128 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20134 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20138 zip_1x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20176 std::int32_t lhs_offset, std::int32_t rhs_offset,
20195 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20201 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20205 zip_1x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20243 std::int32_t lhs_offset, std::int32_t rhs_offset,
20262 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20268 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20272 zip_1x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20310 std::int32_t lhs_offset, std::int32_t rhs_offset,
20329 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20335 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20339 zip_1x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20377 std::int32_t lhs_offset, std::int32_t rhs_offset,
20396 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20402 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20406 zip_1x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20444 std::int32_t lhs_offset, std::int32_t rhs_offset,
20463 lhs_offset * rhs_offset * k;
20469 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20473 zip_2x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20511 std::int32_t lhs_offset, std::int32_t rhs_offset,
20530 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20536 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20540 zip_2x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20578 std::int32_t lhs_offset, std::int32_t rhs_offset,
20597 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20603 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20607 zip_2x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20645 std::int32_t lhs_offset, std::int32_t rhs_offset,
20664 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20670 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20674 zip_2x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20712 std::int32_t lhs_offset, std::int32_t rhs_offset,
20731 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20737 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20741 zip_2x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20779 std::int32_t lhs_offset, std::int32_t rhs_offset,
20798 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20804 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20808 zip_2x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20846 std::int32_t lhs_offset, std::int32_t rhs_offset,
20865 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20871 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20875 zip_2x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20913 std::int32_t lhs_offset, std::int32_t rhs_offset,
20932 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
20938 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20942 zip_2x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
20980 std::int32_t lhs_offset, std::int32_t rhs_offset,
20999 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21005 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21040 std::int32_t lhs_offset, std::int32_t rhs_offset,
21059 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21065 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21100 std::int32_t lhs_offset, std::int32_t rhs_offset,
21119 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21125 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21160 std::int32_t lhs_offset, std::int32_t rhs_offset,
21179 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21185 lhs_offset, 0);
21220 std::int32_t lhs_offset, std::int32_t rhs_offset,
21239 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21245 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21280 std::int32_t lhs_offset, std::int32_t rhs_offset,
21299 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21305 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21340 std::int32_t lhs_offset, std::int32_t rhs_offset,
21359 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21365 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21400 std::int32_t lhs_offset, std::int32_t rhs_offset,
21419 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21425 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21460 std::int32_t lhs_offset, std::int32_t rhs_offset,
21479 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21485 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21489 zip_1x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21527 std::int32_t lhs_offset, std::int32_t rhs_offset,
21546 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21552 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21556 zip_1x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21594 std::int32_t lhs_offset, std::int32_t rhs_offset,
21613 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21619 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21623 zip_1x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21661 std::int32_t lhs_offset, std::int32_t rhs_offset,
21680 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21686 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21690 zip_1x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21728 std::int32_t lhs_offset, std::int32_t rhs_offset,
21747 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21753 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21757 zip_1x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21795 std::int32_t lhs_offset, std::int32_t rhs_offset,
21814 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21820 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21824 zip_1x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21862 std::int32_t lhs_offset, std::int32_t rhs_offset,
21881 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21887 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21891 zip_1x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21929 std::int32_t lhs_offset, std::int32_t rhs_offset,
21948 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
21954 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21958 zip_1x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
21996 std::int32_t lhs_offset, std::int32_t rhs_offset,
22015 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22021 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22025 zip_2x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22063 std::int32_t lhs_offset, std::int32_t rhs_offset,
22082 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22088 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22092 zip_2x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22130 std::int32_t lhs_offset, std::int32_t rhs_offset,
22149 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22155 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22159 zip_2x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22197 std::int32_t lhs_offset, std::int32_t rhs_offset,
22216 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22222 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22226 zip_2x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22264 std::int32_t lhs_offset, std::int32_t rhs_offset,
22283 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22289 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22293 zip_2x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22331 std::int32_t lhs_offset, std::int32_t rhs_offset,
22350 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22356 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22360 zip_2x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22398 std::int32_t lhs_offset, std::int32_t rhs_offset,
22417 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22423 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22427 zip_2x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22465 std::int32_t lhs_offset, std::int32_t rhs_offset,
22484 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22490 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22494 zip_2x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22531 std::int32_t k, std::int32_t lhs_offset,
22549 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22555 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22578 std::int32_t k, std::int32_t lhs_offset,
22596 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22602 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22625 std::int32_t k, std::int32_t lhs_offset,
22643 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22649 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22672 std::int32_t k, std::int32_t lhs_offset,
22690 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22696 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22719 std::int32_t k, std::int32_t lhs_offset,
22737 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22743 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22766 std::int32_t k, std::int32_t lhs_offset,
22784 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22790 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22813 std::int32_t k, std::int32_t lhs_offset,
22831 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22837 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22860 std::int32_t k, std::int32_t lhs_offset,
22878 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22884 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22907 std::int32_t k, std::int32_t lhs_offset,
22925 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22931 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22935 zip_1x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22958 std::int32_t k, std::int32_t lhs_offset,
22976 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
22982 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
22986 zip_1x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23009 std::int32_t k, std::int32_t lhs_offset,
23027 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23033 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23037 zip_1x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23060 std::int32_t k, std::int32_t lhs_offset,
23078 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23084 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23088 zip_1x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23111 std::int32_t k, std::int32_t lhs_offset,
23129 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23135 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23139 zip_1x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23162 std::int32_t k, std::int32_t lhs_offset,
23180 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23186 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23190 zip_1x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23213 std::int32_t k, std::int32_t lhs_offset,
23231 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23237 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23241 zip_1x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23264 std::int32_t k, std::int32_t lhs_offset,
23282 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23288 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23292 zip_1x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23315 std::int32_t k, std::int32_t lhs_offset,
23333 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23339 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23343 zip_2x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23366 std::int32_t k, std::int32_t lhs_offset,
23384 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23390 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23394 zip_2x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23417 std::int32_t k, std::int32_t lhs_offset,
23435 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23441 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23445 zip_2x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23468 std::int32_t k, std::int32_t lhs_offset,
23486 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23492 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23496 zip_2x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23519 std::int32_t k, std::int32_t lhs_offset,
23537 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23543 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23547 zip_2x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23570 std::int32_t k, std::int32_t lhs_offset,
23588 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23594 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23598 zip_2x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23621 std::int32_t k, std::int32_t lhs_offset,
23639 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23645 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23649 zip_2x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23672 std::int32_t k, std::int32_t lhs_offset,
23690 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23696 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23700 zip_2x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23723 std::int32_t k, std::int32_t lhs_offset,
23743 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23749 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23783 std::int32_t k, std::int32_t lhs_offset,
23803 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23809 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23843 std::int32_t k, std::int32_t lhs_offset,
23863 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23869 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23903 std::int32_t k, std::int32_t lhs_offset,
23923 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23929 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
23963 std::int32_t k, std::int32_t lhs_offset,
23983 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
23989 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24023 std::int32_t k, std::int32_t lhs_offset,
24043 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24049 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24083 std::int32_t k, std::int32_t lhs_offset,
24103 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24109 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24143 lhs_offset,
24163 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24169 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24203 std::int32_t k, std::int32_t lhs_offset,
24223 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24229 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24233 zip_1x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24270 std::int32_t k, std::int32_t lhs_offset,
24290 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24296 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24300 zip_1x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24337 std::int32_t k, std::int32_t lhs_offset,
24357 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24363 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24367 zip_1x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24404 std::int32_t k, std::int32_t lhs_offset,
24424 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24430 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24434 zip_1x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24471 std::int32_t k, std::int32_t lhs_offset,
24491 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24497 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24501 zip_1x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24538 std::int32_t k, std::int32_t lhs_offset,
24558 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24564 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24568 zip_1x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24605 std::int32_t k, std::int32_t lhs_offset,
24625 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24631 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24635 zip_1x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24672 std::int32_t k, std::int32_t lhs_offset,
24692 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24698 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24702 zip_1x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24739 std::int32_t k, std::int32_t lhs_offset,
24759 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24765 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24769 zip_2x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24806 std::int32_t k, std::int32_t lhs_offset,
24826 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24832 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24836 zip_2x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24873 std::int32_t k, std::int32_t lhs_offset,
24893 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24899 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24903 zip_2x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24940 std::int32_t k, std::int32_t lhs_offset,
24960 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
24966 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
24970 zip_2x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25007 std::int32_t k, std::int32_t lhs_offset,
25027 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25033 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25037 zip_2x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25074 std::int32_t k, std::int32_t lhs_offset,
25094 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25100 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25104 zip_2x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25141 std::int32_t k, std::int32_t lhs_offset,
25161 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25167 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25171 zip_2x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25208 std::int32_t k, std::int32_t lhs_offset,
25228 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25234 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25238 zip_2x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25275 std::int32_t k, std::int32_t lhs_offset,
25295 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25301 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25335 std::int32_t k, std::int32_t lhs_offset,
25355 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25361 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25395 std::int32_t k, std::int32_t lhs_offset,
25415 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25421 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25455 std::int32_t k, std::int32_t lhs_offset,
25475 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25481 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25515 std::int32_t k, std::int32_t lhs_offset,
25535 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25541 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25575 std::int32_t k, std::int32_t lhs_offset,
25595 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25601 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25635 std::int32_t k, std::int32_t lhs_offset,
25655 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25661 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25695 std::int32_t k, std::int32_t lhs_offset,
25715 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25721 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25755 std::int32_t k, std::int32_t lhs_offset,
25775 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25781 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25785 zip_1x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25822 std::int32_t k, std::int32_t lhs_offset,
25842 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25848 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25852 zip_1x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25889 std::int32_t k, std::int32_t lhs_offset,
25909 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25915 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25919 zip_1x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25956 std::int32_t k, std::int32_t lhs_offset,
25976 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
25982 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
25986 zip_1x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26023 std::int32_t k, std::int32_t lhs_offset,
26043 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26049 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26053 zip_1x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26090 std::int32_t k, std::int32_t lhs_offset,
26110 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26116 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26120 zip_1x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26157 std::int32_t k, std::int32_t lhs_offset,
26177 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26183 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26187 zip_1x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26224 std::int32_t k, std::int32_t lhs_offset,
26244 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26250 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26254 zip_1x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26291 std::int32_t k, std::int32_t lhs_offset,
26311 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26317 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26321 zip_2x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26358 std::int32_t k, std::int32_t lhs_offset,
26378 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26384 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26388 zip_2x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26425 std::int32_t k, std::int32_t lhs_offset,
26445 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26451 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26455 zip_2x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26492 std::int32_t k, std::int32_t lhs_offset,
26512 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26518 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26522 zip_2x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26559 std::int32_t k, std::int32_t lhs_offset,
26579 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26585 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26589 zip_2x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26626 std::int32_t k, std::int32_t lhs_offset,
26646 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26652 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26656 zip_2x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26693 std::int32_t k, std::int32_t lhs_offset,
26713 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26719 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26723 zip_2x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26760 std::int32_t k, std::int32_t lhs_offset,
26780 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26786 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26790 zip_2x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26828 std::int32_t lhs_offset, std::int32_t rhs_offset,
26846 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26852 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26876 std::int32_t lhs_offset, std::int32_t rhs_offset,
26894 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26900 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26924 std::int32_t lhs_offset, std::int32_t rhs_offset,
26942 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26948 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
26972 std::int32_t lhs_offset, std::int32_t rhs_offset,
26990 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
26996 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27020 std::int32_t lhs_offset, std::int32_t rhs_offset,
27038 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27044 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27068 std::int32_t lhs_offset, std::int32_t rhs_offset,
27086 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27092 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27116 std::int32_t lhs_offset, std::int32_t rhs_offset,
27134 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27140 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27164 std::int32_t lhs_offset, std::int32_t rhs_offset,
27182 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27188 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27212 std::int32_t lhs_offset, std::int32_t rhs_offset,
27230 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27236 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27240 zip_1x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27264 std::int32_t lhs_offset, std::int32_t rhs_offset,
27282 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27288 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27292 zip_1x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27316 std::int32_t lhs_offset, std::int32_t rhs_offset,
27334 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27340 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27344 zip_1x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27368 std::int32_t lhs_offset, std::int32_t rhs_offset,
27386 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27392 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27396 zip_1x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27420 std::int32_t lhs_offset, std::int32_t rhs_offset,
27438 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27444 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27448 zip_1x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27472 std::int32_t lhs_offset, std::int32_t rhs_offset,
27490 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27496 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27500 zip_1x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27524 std::int32_t lhs_offset, std::int32_t rhs_offset,
27542 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27548 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27552 zip_1x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27576 std::int32_t lhs_offset, std::int32_t rhs_offset,
27594 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27600 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27604 zip_1x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27628 std::int32_t lhs_offset, std::int32_t rhs_offset,
27646 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27652 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27656 zip_2x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27680 std::int32_t lhs_offset, std::int32_t rhs_offset,
27698 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27704 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27708 zip_2x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27732 std::int32_t lhs_offset, std::int32_t rhs_offset,
27750 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27756 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27760 zip_2x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27784 std::int32_t lhs_offset, std::int32_t rhs_offset,
27802 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27808 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27812 zip_2x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27836 std::int32_t lhs_offset, std::int32_t rhs_offset,
27854 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27860 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27864 zip_2x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27888 std::int32_t lhs_offset, std::int32_t rhs_offset,
27906 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27912 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27916 zip_2x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27940 std::int32_t lhs_offset, std::int32_t rhs_offset,
27958 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
27964 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27968 zip_2x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
27992 std::int32_t lhs_offset, std::int32_t rhs_offset,
28010 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28016 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28020 zip_2x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28044 std::int32_t lhs_offset, std::int32_t rhs_offset,
28064 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28070 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28105 std::int32_t lhs_offset, std::int32_t rhs_offset,
28125 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28131 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28166 std::int32_t lhs_offset, std::int32_t rhs_offset,
28186 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28192 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28227 std::int32_t lhs_offset, std::int32_t rhs_offset,
28247 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28253 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28288 std::int32_t lhs_offset, std::int32_t rhs_offset,
28308 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28314 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28349 std::int32_t lhs_offset, std::int32_t rhs_offset,
28369 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28375 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28410 std::int32_t lhs_offset, std::int32_t rhs_offset,
28430 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28436 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28471 std::int32_t lhs_offset, std::int32_t rhs_offset,
28491 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28497 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28532 std::int32_t lhs_offset, std::int32_t rhs_offset,
28552 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28558 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28562 zip_1x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28600 std::int32_t lhs_offset, std::int32_t rhs_offset,
28620 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28626 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28630 zip_1x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28668 std::int32_t lhs_offset, std::int32_t rhs_offset,
28688 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28694 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28698 zip_1x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28736 std::int32_t lhs_offset, std::int32_t rhs_offset,
28756 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28762 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28766 zip_1x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28804 std::int32_t lhs_offset, std::int32_t rhs_offset,
28824 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28830 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28834 zip_1x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28872 std::int32_t lhs_offset, std::int32_t rhs_offset,
28892 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28898 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28902 zip_1x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28940 std::int32_t lhs_offset, std::int32_t rhs_offset,
28960 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
28966 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
28970 zip_1x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29008 std::int32_t lhs_offset, std::int32_t rhs_offset,
29028 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29034 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29038 zip_1x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29076 std::int32_t lhs_offset, std::int32_t rhs_offset,
29096 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29102 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29106 zip_2x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29144 std::int32_t lhs_offset, std::int32_t rhs_offset,
29164 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29170 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29174 zip_2x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29212 std::int32_t lhs_offset, std::int32_t rhs_offset,
29232 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29238 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29242 zip_2x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29280 std::int32_t lhs_offset, std::int32_t rhs_offset,
29300 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29306 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29310 zip_2x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29348 std::int32_t lhs_offset, std::int32_t rhs_offset,
29368 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29374 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29378 zip_2x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29416 std::int32_t lhs_offset, std::int32_t rhs_offset,
29436 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29442 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29446 zip_2x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29484 std::int32_t lhs_offset, std::int32_t rhs_offset,
29504 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29510 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29514 zip_2x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29552 std::int32_t lhs_offset, std::int32_t rhs_offset,
29572 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29578 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29582 zip_2x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29620 std::int32_t lhs_offset, std::int32_t rhs_offset,
29640 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29646 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29681 std::int32_t lhs_offset, std::int32_t rhs_offset,
29701 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29707 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29742 std::int32_t lhs_offset, std::int32_t rhs_offset,
29762 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29768 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29803 std::int32_t lhs_offset, std::int32_t rhs_offset,
29823 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29829 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29864 std::int32_t lhs_offset, std::int32_t rhs_offset,
29884 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29890 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29925 std::int32_t lhs_offset, std::int32_t rhs_offset,
29945 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
29951 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
29986 std::int32_t lhs_offset, std::int32_t rhs_offset,
30006 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30012 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30047 std::int32_t lhs_offset, std::int32_t rhs_offset,
30067 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30073 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30108 std::int32_t lhs_offset, std::int32_t rhs_offset,
30128 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30134 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30138 zip_1x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30176 std::int32_t lhs_offset, std::int32_t rhs_offset,
30196 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30202 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30206 zip_1x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30244 std::int32_t lhs_offset, std::int32_t rhs_offset,
30264 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30270 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30274 zip_1x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30312 std::int32_t lhs_offset, std::int32_t rhs_offset,
30332 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30338 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30342 zip_1x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30380 std::int32_t lhs_offset, std::int32_t rhs_offset,
30400 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30406 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30410 zip_1x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30448 std::int32_t lhs_offset, std::int32_t rhs_offset,
30468 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30474 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30478 zip_1x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30516 std::int32_t lhs_offset, std::int32_t rhs_offset,
30536 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30542 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30546 zip_1x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30584 std::int32_t lhs_offset, std::int32_t rhs_offset,
30604 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30610 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30614 zip_1x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30652 std::int32_t lhs_offset, std::int32_t rhs_offset,
30672 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30678 zip_3x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30682 zip_2x8_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30720 std::int32_t lhs_offset, std::int32_t rhs_offset,
30740 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30746 zip_3x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30750 zip_2x8_1_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30788 std::int32_t lhs_offset, std::int32_t rhs_offset,
30808 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30814 zip_3x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30818 zip_2x8_2_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30856 std::int32_t lhs_offset, std::int32_t rhs_offset,
30876 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30882 zip_3x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30886 zip_2x8_3_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30924 std::int32_t lhs_offset, std::int32_t rhs_offset,
30944 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
30950 zip_3x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30954 zip_2x8_4_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
30992 std::int32_t lhs_offset, std::int32_t rhs_offset,
31012 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31018 zip_3x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31022 zip_2x8_5_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31060 std::int32_t lhs_offset, std::int32_t rhs_offset,
31080 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31086 zip_3x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31090 zip_2x8_6_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31128 std::int32_t lhs_offset, std::int32_t rhs_offset,
31148 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31154 zip_3x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31158 zip_2x8_7_aligned(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31195 std::int32_t k, std::int32_t lhs_offset,
31213 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31219 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31242 std::int32_t k, std::int32_t lhs_offset,
31260 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31266 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31289 std::int32_t k, std::int32_t lhs_offset,
31307 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31313 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31336 std::int32_t k, std::int32_t lhs_offset,
31354 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31360 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31383 std::int32_t k, std::int32_t lhs_offset,
31401 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31407 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31430 std::int32_t k, std::int32_t lhs_offset,
31448 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31454 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31477 std::int32_t k, std::int32_t lhs_offset,
31495 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31501 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31524 std::int32_t k, std::int32_t lhs_offset,
31542 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31548 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31571 std::int32_t k, std::int32_t lhs_offset,
31589 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31595 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31599 zip_1x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31622 std::int32_t k, std::int32_t lhs_offset,
31640 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31646 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31650 zip_1x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31673 std::int32_t k, std::int32_t lhs_offset,
31691 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31697 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31701 zip_1x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31724 std::int32_t k, std::int32_t lhs_offset,
31742 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31748 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31752 zip_1x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31775 std::int32_t k, std::int32_t lhs_offset,
31793 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31799 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31803 zip_1x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31826 std::int32_t k, std::int32_t lhs_offset,
31844 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31850 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31854 zip_1x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31877 std::int32_t k, std::int32_t lhs_offset,
31895 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31901 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31905 zip_1x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31928 std::int32_t k, std::int32_t lhs_offset,
31946 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
31952 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31956 zip_1x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
31979 std::int32_t k, std::int32_t lhs_offset,
31997 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32003 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32007 zip_2x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32030 std::int32_t k, std::int32_t lhs_offset,
32048 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32054 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32058 zip_2x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32081 std::int32_t k, std::int32_t lhs_offset,
32099 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32105 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32109 zip_2x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32132 std::int32_t k, std::int32_t lhs_offset,
32150 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32156 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32160 zip_2x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32183 std::int32_t k, std::int32_t lhs_offset,
32201 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32207 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32211 zip_2x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32234 std::int32_t k, std::int32_t lhs_offset,
32252 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32258 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32262 zip_2x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32285 std::int32_t k, std::int32_t lhs_offset,
32303 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32309 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32313 zip_2x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32336 std::int32_t k, std::int32_t lhs_offset,
32354 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32360 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32364 zip_2x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32387 std::int32_t k, std::int32_t lhs_offset,
32407 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32413 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32447 std::int32_t k, std::int32_t lhs_offset,
32467 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32473 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32507 std::int32_t k, std::int32_t lhs_offset,
32527 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32533 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32567 std::int32_t k, std::int32_t lhs_offset,
32587 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32593 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32627 std::int32_t k, std::int32_t lhs_offset,
32647 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32653 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32687 std::int32_t k, std::int32_t lhs_offset,
32707 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32713 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32747 std::int32_t k, std::int32_t lhs_offset,
32767 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32773 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32807 std::int32_t k, std::int32_t lhs_offset,
32827 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32833 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32867 std::int32_t k, std::int32_t lhs_offset,
32887 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32893 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32897 zip_1x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32934 std::int32_t k, std::int32_t lhs_offset,
32954 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
32960 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
32964 zip_1x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33001 std::int32_t k, std::int32_t lhs_offset,
33021 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33027 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33031 zip_1x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33068 std::int32_t k, std::int32_t lhs_offset,
33088 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33094 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33098 zip_1x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33135 std::int32_t k, std::int32_t lhs_offset,
33155 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33161 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33165 zip_1x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33202 std::int32_t k, std::int32_t lhs_offset,
33222 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33228 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33232 zip_1x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33269 std::int32_t k, std::int32_t lhs_offset,
33289 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33295 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33299 zip_1x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33336 std::int32_t k, std::int32_t lhs_offset,
33356 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33362 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33366 zip_1x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33403 std::int32_t k, std::int32_t lhs_offset,
33423 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33429 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33433 zip_2x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33470 std::int32_t k, std::int32_t lhs_offset,
33490 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33496 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33500 zip_2x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33537 std::int32_t k, std::int32_t lhs_offset,
33557 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33563 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33567 zip_2x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33604 std::int32_t k, std::int32_t lhs_offset,
33624 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33630 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33634 zip_2x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33671 std::int32_t k, std::int32_t lhs_offset,
33691 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33697 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33701 zip_2x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33738 std::int32_t k, std::int32_t lhs_offset,
33758 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33764 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33768 zip_2x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33805 std::int32_t k, std::int32_t lhs_offset,
33825 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33831 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33835 zip_2x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33872 std::int32_t k, std::int32_t lhs_offset,
33892 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33898 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33902 zip_2x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33939 std::int32_t k, std::int32_t lhs_offset,
33959 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
33965 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
33999 std::int32_t k, std::int32_t lhs_offset,
34019 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34025 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34059 std::int32_t k, std::int32_t lhs_offset,
34079 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34085 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34119 std::int32_t k, std::int32_t lhs_offset,
34139 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34145 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34179 std::int32_t k, std::int32_t lhs_offset,
34199 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34205 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34239 std::int32_t k, std::int32_t lhs_offset,
34259 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34265 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34299 std::int32_t k, std::int32_t lhs_offset,
34319 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34325 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34359 std::int32_t k, std::int32_t lhs_offset,
34379 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34385 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34419 std::int32_t k, std::int32_t lhs_offset,
34439 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34445 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34449 zip_1x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34486 std::int32_t k, std::int32_t lhs_offset,
34506 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34512 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34516 zip_1x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34553 std::int32_t k, std::int32_t lhs_offset,
34573 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34579 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34583 zip_1x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34620 std::int32_t k, std::int32_t lhs_offset,
34640 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34646 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34650 zip_1x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34687 std::int32_t k, std::int32_t lhs_offset,
34707 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34713 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34717 zip_1x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34754 std::int32_t k, std::int32_t lhs_offset,
34774 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34780 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34784 zip_1x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34821 std::int32_t k, std::int32_t lhs_offset,
34841 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34847 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34851 zip_1x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34888 std::int32_t k, std::int32_t lhs_offset,
34908 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34914 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34918 zip_1x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34955 std::int32_t k, std::int32_t lhs_offset,
34975 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
34981 zip_3x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
34985 zip_2x8(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35022 std::int32_t k, std::int32_t lhs_offset,
35042 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
35048 zip_3x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35052 zip_2x8_1(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35089 std::int32_t k, std::int32_t lhs_offset,
35109 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
35115 zip_3x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35119 zip_2x8_2(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35156 std::int32_t k, std::int32_t lhs_offset,
35176 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
35182 zip_3x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35186 zip_2x8_3(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35223 std::int32_t k, std::int32_t lhs_offset,
35243 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
35249 zip_3x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35253 zip_2x8_4(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35290 std::int32_t k, std::int32_t lhs_offset,
35310 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
35316 zip_3x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35320 zip_2x8_5(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35357 std::int32_t k, std::int32_t lhs_offset,
35377 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
35383 zip_3x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35387 zip_2x8_6(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35424 std::int32_t k, std::int32_t lhs_offset,
35444 const std::int32_t const_offset = lhs_offset * rhs_offset * k;
35450 zip_3x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35454 zip_2x8_7(rhs_chunk, k, k, zipped_rhs_chunk, lhs_offset, 0);
35493 std::int32_t k, std::int32_t lhs_offset,
35513 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35519 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35525 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35531 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35537 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35543 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35549 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35555 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35565 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35571 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35577 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35583 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35589 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35595 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35601 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35607 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35617 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35623 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35629 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35635 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35641 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35647 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35653 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35659 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35673 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35679 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35685 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35691 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35697 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35703 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35709 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35715 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35725 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35731 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35737 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35743 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35749 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35755 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35761 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35767 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35777 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35783 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35789 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35795 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35801 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35807 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35813 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35819 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35833 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35839 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35845 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35851 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35857 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35863 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35869 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35875 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35885 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35891 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35897 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35903 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35909 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35915 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35921 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35927 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35937 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35943 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35949 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35955 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35961 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35967 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35973 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35979 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
35995 internal::gemm_q8_0_0_0(scratch, lhs, rhs, m, n, k, lhs_offset,
36001 internal::gemm_q8_0_0_1(scratch, lhs, rhs, m, n, k, lhs_offset,
36007 internal::gemm_q8_0_0_2(scratch, lhs, rhs, m, n, k, lhs_offset,
36013 internal::gemm_q8_0_0_3(scratch, lhs, rhs, m, n, k, lhs_offset,
36019 internal::gemm_q8_0_0_4(scratch, lhs, rhs, m, n, k, lhs_offset,
36025 internal::gemm_q8_0_0_5(scratch, lhs, rhs, m, n, k, lhs_offset,
36031 internal::gemm_q8_0_0_6(scratch, lhs, rhs, m, n, k, lhs_offset,
36037 internal::gemm_q8_0_0_7(scratch, lhs, rhs, m, n, k, lhs_offset,
36047 internal::gemm_q8_0_1_0(scratch, lhs, rhs, m, n, k, lhs_offset,
36053 internal::gemm_q8_0_1_1(scratch, lhs, rhs, m, n, k, lhs_offset,
36059 internal::gemm_q8_0_1_2(scratch, lhs, rhs, m, n, k, lhs_offset,
36065 internal::gemm_q8_0_1_3(scratch, lhs, rhs, m, n, k, lhs_offset,
36071 internal::gemm_q8_0_1_4(scratch, lhs, rhs, m, n, k, lhs_offset,
36077 internal::gemm_q8_0_1_5(scratch, lhs, rhs, m, n, k, lhs_offset,
36083 internal::gemm_q8_0_1_6(scratch, lhs, rhs, m, n, k, lhs_offset,
36089 internal::gemm_q8_0_1_7(scratch, lhs, rhs, m, n, k, lhs_offset,
36099 internal::gemm_q8_0_2_0(scratch, lhs, rhs, m, n, k, lhs_offset,
36105 internal::gemm_q8_0_2_1(scratch, lhs, rhs, m, n, k, lhs_offset,
36111 internal::gemm_q8_0_2_2(scratch, lhs, rhs, m, n, k, lhs_offset,
36117 internal::gemm_q8_0_2_3(scratch, lhs, rhs, m, n, k, lhs_offset,
36123 internal::gemm_q8_0_2_4(scratch, lhs, rhs, m, n, k, lhs_offset,
36129 internal::gemm_q8_0_2_5(scratch, lhs, rhs, m, n, k, lhs_offset,
36135 internal::gemm_q8_0_2_6(scratch, lhs, rhs, m, n, k, lhs_offset,
36141 internal::gemm_q8_0_2_7(scratch, lhs, rhs, m, n, k, lhs_offset,
36155 internal::gemm_q8_1_0_0(scratch, lhs, rhs, m, n, k, lhs_offset,
36161 internal::gemm_q8_1_0_1(scratch, lhs, rhs, m, n, k, lhs_offset,
36167 internal::gemm_q8_1_0_2(scratch, lhs, rhs, m, n, k, lhs_offset,
36173 internal::gemm_q8_1_0_3(scratch, lhs, rhs, m, n, k, lhs_offset,
36179 internal::gemm_q8_1_0_4(scratch, lhs, rhs, m, n, k, lhs_offset,
36185 internal::gemm_q8_1_0_5(scratch, lhs, rhs, m, n, k, lhs_offset,
36191 internal::gemm_q8_1_0_6(scratch, lhs, rhs, m, n, k, lhs_offset,
36197 internal::gemm_q8_1_0_7(scratch, lhs, rhs, m, n, k, lhs_offset,
36207 internal::gemm_q8_1_1_0(scratch, lhs, rhs, m, n, k, lhs_offset,
36213 internal::gemm_q8_1_1_1(scratch, lhs, rhs, m, n, k, lhs_offset,
36219 internal::gemm_q8_1_1_2(scratch, lhs, rhs, m, n, k, lhs_offset,
36225 internal::gemm_q8_1_1_3(scratch, lhs, rhs, m, n, k, lhs_offset,
36231 internal::gemm_q8_1_1_4(scratch, lhs, rhs, m, n, k, lhs_offset,
36237 internal::gemm_q8_1_1_5(scratch, lhs, rhs, m, n, k, lhs_offset,
36243 internal::gemm_q8_1_1_6(scratch, lhs, rhs, m, n, k, lhs_offset,
36249 internal::gemm_q8_1_1_7(scratch, lhs, rhs, m, n, k, lhs_offset,
36259 internal::gemm_q8_1_2_0(scratch, lhs, rhs, m, n, k, lhs_offset,
36265 internal::gemm_q8_1_2_1(scratch, lhs, rhs, m, n, k, lhs_offset,
36271 internal::gemm_q8_1_2_2(scratch, lhs, rhs, m, n, k, lhs_offset,
36277 internal::gemm_q8_1_2_3(scratch, lhs, rhs, m, n, k, lhs_offset,
36283 internal::gemm_q8_1_2_4(scratch, lhs, rhs, m, n, k, lhs_offset,
36289 internal::gemm_q8_1_2_5(scratch, lhs, rhs, m, n, k, lhs_offset,
36295 internal::gemm_q8_1_2_6(scratch, lhs, rhs, m, n, k, lhs_offset,
36301 internal::gemm_q8_1_2_7(scratch, lhs, rhs, m, n, k, lhs_offset,
36315 internal::gemm_q8_2_0_0(scratch, lhs, rhs, m, n, k, lhs_offset,
36321 internal::gemm_q8_2_0_1(scratch, lhs, rhs, m, n, k, lhs_offset,
36327 internal::gemm_q8_2_0_2(scratch, lhs, rhs, m, n, k, lhs_offset,
36333 internal::gemm_q8_2_0_3(scratch, lhs, rhs, m, n, k, lhs_offset,
36339 internal::gemm_q8_2_0_4(scratch, lhs, rhs, m, n, k, lhs_offset,
36345 internal::gemm_q8_2_0_5(scratch, lhs, rhs, m, n, k, lhs_offset,
36351 internal::gemm_q8_2_0_6(scratch, lhs, rhs, m, n, k, lhs_offset,
36357 internal::gemm_q8_2_0_7(scratch, lhs, rhs, m, n, k, lhs_offset,
36367 internal::gemm_q8_2_1_0(scratch, lhs, rhs, m, n, k, lhs_offset,
36373 internal::gemm_q8_2_1_1(scratch, lhs, rhs, m, n, k, lhs_offset,
36379 internal::gemm_q8_2_1_2(scratch, lhs, rhs, m, n, k, lhs_offset,
36385 internal::gemm_q8_2_1_3(scratch, lhs, rhs, m, n, k, lhs_offset,
36391 internal::gemm_q8_2_1_4(scratch, lhs, rhs, m, n, k, lhs_offset,
36397 internal::gemm_q8_2_1_5(scratch, lhs, rhs, m, n, k, lhs_offset,
36403 internal::gemm_q8_2_1_6(scratch, lhs, rhs, m, n, k, lhs_offset,
36409 internal::gemm_q8_2_1_7(scratch, lhs, rhs, m, n, k, lhs_offset,
36419 internal::gemm_q8_2_2_0(scratch, lhs, rhs, m, n, k, lhs_offset,
36425 internal::gemm_q8_2_2_1(scratch, lhs, rhs, m, n, k, lhs_offset,
36431 internal::gemm_q8_2_2_2(scratch, lhs, rhs, m, n, k, lhs_offset,
36437 internal::gemm_q8_2_2_3(scratch, lhs, rhs, m, n, k, lhs_offset,
36443 internal::gemm_q8_2_2_4(scratch, lhs, rhs, m, n, k, lhs_offset,
36449 internal::gemm_q8_2_2_5(scratch, lhs, rhs, m, n, k, lhs_offset,
36455 internal::gemm_q8_2_2_6(scratch, lhs, rhs, m, n, k, lhs_offset,
36461 internal::gemm_q8_2_2_7(scratch, lhs, rhs, m, n, k, lhs_offset,
36476 std::int32_t k, std::int32_t lhs_offset,
36491 lhs_offset, rhs_offset, result,
36496 lhs_offset, rhs_offset, result,
36501 lhs_offset, rhs_offset, result,
36506 lhs_offset, rhs_offset, result,
36511 lhs_offset, rhs_offset, result,
36516 lhs_offset, rhs_offset, result,
36521 lhs_offset, rhs_offset, result,
36526 lhs_offset, rhs_offset, result,
36535 lhs_offset, rhs_offset, result,
36540 lhs_offset, rhs_offset, result,
36545 lhs_offset, rhs_offset, result,
36550 lhs_offset, rhs_offset, result,
36555 lhs_offset, rhs_offset, result,
36560 lhs_offset, rhs_offset, result,
36565 lhs_offset, rhs_offset, result,
36570 lhs_offset, rhs_offset, result,
36579 lhs_offset, rhs_offset, result,
36584 lhs_offset, rhs_offset, result,
36589 lhs_offset, rhs_offset, result,
36594 lhs_offset, rhs_offset, result,
36599 lhs_offset, rhs_offset, result,
36604 lhs_offset, rhs_offset, result,
36609 lhs_offset, rhs_offset, result,
36614 lhs_offset, rhs_offset, result,
36627 lhs_offset, rhs_offset, result,
36632 lhs_offset, rhs_offset, result,
36637 lhs_offset, rhs_offset, result,
36642 lhs_offset, rhs_offset, result,
36647 lhs_offset, rhs_offset, result,
36652 lhs_offset, rhs_offset, result,
36657 lhs_offset, rhs_offset, result,
36662 lhs_offset, rhs_offset, result,
36671 lhs_offset, rhs_offset, result,
36676 lhs_offset, rhs_offset, result,
36681 lhs_offset, rhs_offset, result,
36686 lhs_offset, rhs_offset, result,
36691 lhs_offset, rhs_offset, result,
36696 lhs_offset, rhs_offset, result,
36701 lhs_offset, rhs_offset, result,
36706 lhs_offset, rhs_offset, result,
36715 lhs_offset, rhs_offset, result,
36720 lhs_offset, rhs_offset, result,
36725 lhs_offset, rhs_offset, result,
36730 lhs_offset, rhs_offset, result,
36735 lhs_offset, rhs_offset, result,
36740 lhs_offset, rhs_offset, result,
36745 lhs_offset, rhs_offset, result,
36750 lhs_offset, rhs_offset, result,
36763 lhs_offset, rhs_offset, result,
36768 lhs_offset, rhs_offset, result,
36773 lhs_offset, rhs_offset, result,
36778 lhs_offset, rhs_offset, result,
36783 lhs_offset, rhs_offset, result,
36788 lhs_offset, rhs_offset, result,
36793 lhs_offset, rhs_offset, result,
36798 lhs_offset, rhs_offset, result,
36807 lhs_offset, rhs_offset, result,
36812 lhs_offset, rhs_offset, result,
36817 lhs_offset, rhs_offset, result,
36822 lhs_offset, rhs_offset, result,
36827 lhs_offset, rhs_offset, result,
36832 lhs_offset, rhs_offset, result,
36837 lhs_offset, rhs_offset, result,
36842 lhs_offset, rhs_offset, result,
36851 lhs_offset, rhs_offset, result,
36856 lhs_offset, rhs_offset, result,
36861 lhs_offset, rhs_offset, result,
36866 lhs_offset, rhs_offset, result,
36871 lhs_offset, rhs_offset, result,
36876 lhs_offset, rhs_offset, result,
36881 lhs_offset, rhs_offset, result,
36886 lhs_offset, rhs_offset, result,
36901 internal::gemm_i32_0_0_0(scratch, lhs, rhs, m, n, k, lhs_offset,
36905 internal::gemm_i32_0_0_1(scratch, lhs, rhs, m, n, k, lhs_offset,
36909 lhs_offset,
36913 internal::gemm_i32_0_0_3(scratch, lhs, rhs, m, n, k, lhs_offset,
36917 internal::gemm_i32_0_0_4(scratch, lhs, rhs, m, n, k, lhs_offset,
36921 internal::gemm_i32_0_0_5(scratch, lhs, rhs, m, n, k, lhs_offset,
36925 internal::gemm_i32_0_0_6(scratch, lhs, rhs, m, n, k, lhs_offset,
36929 internal::gemm_i32_0_0_7(scratch, lhs, rhs, m, n, k, lhs_offset,
36937 internal::gemm_i32_0_1_0(scratch, lhs, rhs, m, n, k, lhs_offset,
36941 internal::gemm_i32_0_1_1(scratch, lhs, rhs, m, n, k, lhs_offset,
36945 internal::gemm_i32_0_1_2(scratch, lhs, rhs, m, n, k, lhs_offset,
36949 internal::gemm_i32_0_1_3(scratch, lhs, rhs, m, n, k, lhs_offset,
36953 internal::gemm_i32_0_1_4(scratch, lhs, rhs, m, n, k, lhs_offset,
36957 internal::gemm_i32_0_1_5(scratch, lhs, rhs, m, n, k, lhs_offset,
36961 internal::gemm_i32_0_1_6(scratch, lhs, rhs, m, n, k, lhs_offset,
36965 internal::gemm_i32_0_1_7(scratch, lhs, rhs, m, n, k, lhs_offset,
36973 internal::gemm_i32_0_2_0(scratch, lhs, rhs, m, n, k, lhs_offset,
36977 internal::gemm_i32_0_2_1(scratch, lhs, rhs, m, n, k, lhs_offset,
36981 internal::gemm_i32_0_2_2(scratch, lhs, rhs, m, n, k, lhs_offset,
36985 internal::gemm_i32_0_2_3(scratch, lhs, rhs, m, n, k, lhs_offset,
36989 internal::gemm_i32_0_2_4(scratch, lhs, rhs, m, n, k, lhs_offset,
36993 internal::gemm_i32_0_2_5(scratch, lhs, rhs, m, n, k, lhs_offset,
36997 internal::gemm_i32_0_2_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37001 internal::gemm_i32_0_2_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37013 internal::gemm_i32_1_0_0(scratch, lhs, rhs, m, n, k, lhs_offset,
37017 internal::gemm_i32_1_0_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37021 internal::gemm_i32_1_0_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37025 internal::gemm_i32_1_0_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37029 internal::gemm_i32_1_0_4(scratch, lhs, rhs, m, n, k, lhs_offset,
37033 internal::gemm_i32_1_0_5(scratch, lhs, rhs, m, n, k, lhs_offset,
37037 internal::gemm_i32_1_0_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37041 internal::gemm_i32_1_0_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37049 internal::gemm_i32_1_1_0(scratch, lhs, rhs, m, n, k, lhs_offset,
37053 internal::gemm_i32_1_1_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37057 internal::gemm_i32_1_1_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37061 internal::gemm_i32_1_1_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37065 internal::gemm_i32_1_1_4(scratch, lhs, rhs, m, n, k, lhs_offset,
37069 internal::gemm_i32_1_1_5(scratch, lhs, rhs, m, n, k, lhs_offset,
37073 internal::gemm_i32_1_1_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37077 internal::gemm_i32_1_1_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37085 internal::gemm_i32_1_2_0(scratch, lhs, rhs, m, n, k, lhs_offset,
37089 internal::gemm_i32_1_2_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37093 internal::gemm_i32_1_2_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37097 internal::gemm_i32_1_2_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37101 internal::gemm_i32_1_2_4(scratch, lhs, rhs, m, n, k, lhs_offset,
37105 internal::gemm_i32_1_2_5(scratch, lhs, rhs, m, n, k, lhs_offset,
37109 internal::gemm_i32_1_2_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37113 internal::gemm_i32_1_2_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37125 internal::gemm_i32_2_0_0(scratch, lhs, rhs, m, n, k, lhs_offset,
37129 internal::gemm_i32_2_0_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37133 internal::gemm_i32_2_0_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37137 internal::gemm_i32_2_0_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37141 internal::gemm_i32_2_0_4(scratch, lhs, rhs, m, n, k, lhs_offset,
37145 internal::gemm_i32_2_0_5(scratch, lhs, rhs, m, n, k, lhs_offset,
37149 internal::gemm_i32_2_0_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37153 internal::gemm_i32_2_0_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37161 internal::gemm_i32_2_1_0(scratch, lhs, rhs, m, n, k, lhs_offset,
37165 internal::gemm_i32_2_1_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37169 internal::gemm_i32_2_1_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37173 internal::gemm_i32_2_1_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37177 internal::gemm_i32_2_1_4(scratch, lhs, rhs, m, n, k, lhs_offset,
37181 internal::gemm_i32_2_1_5(scratch, lhs, rhs, m, n, k, lhs_offset,
37185 internal::gemm_i32_2_1_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37189 internal::gemm_i32_2_1_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37197 internal::gemm_i32_2_2_0(scratch, lhs, rhs, m, n, k, lhs_offset,
37201 internal::gemm_i32_2_2_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37205 internal::gemm_i32_2_2_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37209 internal::gemm_i32_2_2_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37213 internal::gemm_i32_2_2_4(scratch, lhs, rhs, m, n, k, lhs_offset,
37217 internal::gemm_i32_2_2_5(scratch, lhs, rhs, m, n, k, lhs_offset,
37221 internal::gemm_i32_2_2_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37225 internal::gemm_i32_2_2_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37238 std::int32_t k, std::int32_t lhs_offset,
37253 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37258 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37263 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37268 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37273 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37278 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37283 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37288 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37297 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37302 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37307 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37312 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37317 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37322 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37327 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37332 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37341 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37346 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37351 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37356 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37361 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37366 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37371 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37376 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37389 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37394 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37399 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37404 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37409 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37414 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37419 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37424 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37433 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37438 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37443 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37448 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37453 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37458 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37463 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37468 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37477 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37482 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37487 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37492 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37497 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37502 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37507 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37512 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37525 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37530 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37535 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37540 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37545 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37550 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37555 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37560 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37569 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37574 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37579 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37584 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37589 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37594 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37599 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37604 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37613 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37618 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37623 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37628 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37633 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37638 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37643 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37648 scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
37663 lhs_offset,
37668 internal::gemm_f_0_0_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37673 internal::gemm_f_0_0_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37678 internal::gemm_f_0_0_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37683 internal::gemm_f_0_0_4(scratch, lhs, rhs, m, n, k, lhs_offset,
37688 internal::gemm_f_0_0_5(scratch, lhs, rhs, m, n, k, lhs_offset,
37693 internal::gemm_f_0_0_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37698 internal::gemm_f_0_0_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37707 internal::gemm_f_0_1_0(scratch, lhs, rhs, m, n, k, lhs_offset,
37712 internal::gemm_f_0_1_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37717 internal::gemm_f_0_1_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37722 internal::gemm_f_0_1_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37727 internal::gemm_f_0_1_4(scratch, lhs, rhs, m, n, k, lhs_offset,
37732 internal::gemm_f_0_1_5(scratch, lhs, rhs, m, n, k, lhs_offset,
37737 internal::gemm_f_0_1_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37742 internal::gemm_f_0_1_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37751 internal::gemm_f_0_2_0(scratch, lhs, rhs, m, n, k, lhs_offset,
37756 internal::gemm_f_0_2_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37761 internal::gemm_f_0_2_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37766 internal::gemm_f_0_2_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37771 internal::gemm_f_0_2_4(scratch, lhs, rhs, m, n, k, lhs_offset,
37776 internal::gemm_f_0_2_5(scratch, lhs, rhs, m, n, k, lhs_offset,
37781 internal::gemm_f_0_2_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37786 internal::gemm_f_0_2_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37799 internal::gemm_f_1_0_0(scratch, lhs, rhs, m, n, k, lhs_offset,
37804 internal::gemm_f_1_0_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37809 internal::gemm_f_1_0_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37814 internal::gemm_f_1_0_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37819 internal::gemm_f_1_0_4(scratch, lhs, rhs, m, n, k, lhs_offset,
37824 internal::gemm_f_1_0_5(scratch, lhs, rhs, m, n, k, lhs_offset,
37829 internal::gemm_f_1_0_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37834 internal::gemm_f_1_0_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37843 internal::gemm_f_1_1_0(scratch, lhs, rhs, m, n, k, lhs_offset,
37848 internal::gemm_f_1_1_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37853 internal::gemm_f_1_1_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37858 internal::gemm_f_1_1_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37863 internal::gemm_f_1_1_4(scratch, lhs, rhs, m, n, k, lhs_offset,
37868 internal::gemm_f_1_1_5(scratch, lhs, rhs, m, n, k, lhs_offset,
37873 internal::gemm_f_1_1_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37878 internal::gemm_f_1_1_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37887 internal::gemm_f_1_2_0(scratch, lhs, rhs, m, n, k, lhs_offset,
37892 internal::gemm_f_1_2_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37897 internal::gemm_f_1_2_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37902 internal::gemm_f_1_2_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37907 internal::gemm_f_1_2_4(scratch, lhs, rhs, m, n, k, lhs_offset,
37912 internal::gemm_f_1_2_5(scratch, lhs, rhs, m, n, k, lhs_offset,
37917 internal::gemm_f_1_2_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37922 internal::gemm_f_1_2_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37935 internal::gemm_f_2_0_0(scratch, lhs, rhs, m, n, k, lhs_offset,
37940 internal::gemm_f_2_0_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37945 internal::gemm_f_2_0_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37950 internal::gemm_f_2_0_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37955 internal::gemm_f_2_0_4(scratch, lhs, rhs, m, n, k, lhs_offset,
37960 internal::gemm_f_2_0_5(scratch, lhs, rhs, m, n, k, lhs_offset,
37965 internal::gemm_f_2_0_6(scratch, lhs, rhs, m, n, k, lhs_offset,
37970 internal::gemm_f_2_0_7(scratch, lhs, rhs, m, n, k, lhs_offset,
37979 internal::gemm_f_2_1_0(scratch, lhs, rhs, m, n, k, lhs_offset,
37984 internal::gemm_f_2_1_1(scratch, lhs, rhs, m, n, k, lhs_offset,
37989 internal::gemm_f_2_1_2(scratch, lhs, rhs, m, n, k, lhs_offset,
37994 internal::gemm_f_2_1_3(scratch, lhs, rhs, m, n, k, lhs_offset,
37999 internal::gemm_f_2_1_4(scratch, lhs, rhs, m, n, k, lhs_offset,
38004 internal::gemm_f_2_1_5(scratch, lhs, rhs, m, n, k, lhs_offset,
38009 internal::gemm_f_2_1_6(scratch, lhs, rhs, m, n, k, lhs_offset,
38014 internal::gemm_f_2_1_7(scratch, lhs, rhs, m, n, k, lhs_offset,
38023 internal::gemm_f_2_2_0(scratch, lhs, rhs, m, n, k, lhs_offset,
38028 internal::gemm_f_2_2_1(scratch, lhs, rhs, m, n, k, lhs_offset,
38033 internal::gemm_f_2_2_2(scratch, lhs, rhs, m, n, k, lhs_offset,
38038 internal::gemm_f_2_2_3(scratch, lhs, rhs, m, n, k, lhs_offset,
38043 internal::gemm_f_2_2_4(scratch, lhs, rhs, m, n, k, lhs_offset,
38048 internal::gemm_f_2_2_5(scratch, lhs, rhs, m, n, k, lhs_offset,
38053 internal::gemm_f_2_2_6(scratch, lhs, rhs, m, n, k, lhs_offset,
38058 internal::gemm_f_2_2_7(scratch, lhs, rhs, m, n, k, lhs_offset,
38072 std::int32_t k, std::int32_t lhs_offset, std::int32_t rhs_offset,
38075 gemm_q8_strided(scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,
38081 std::int32_t k, std::int32_t lhs_offset, std::int32_t rhs_offset,
38083 gemm_i32_strided(scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset, result,
38089 std::int32_t k, std::int32_t lhs_offset, std::int32_t rhs_offset,
38091 gemm_f_strided(scratch, lhs, rhs, m, n, k, lhs_offset, rhs_offset,