      1 ; RUN: llc -march=hexagon < %s | FileCheck %s
      2 ; CHECK-NOT: setbit(r{{[0-9]+}},#1)
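; Verifies that no setbit(rN,#1) is emitted for this function; the likely
; candidates are the +1 byte offsets folded into the address arithmetic of the
; HVX loops below. The IR appears to be a Halide-generated pipeline (see the
; halide_* module flags and the "Halide buffer" TBAA metadata).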
      3 
      4 target triple = "hexagon-unknown--elf"
      5 
      6 %s.8 = type { i8*, i32, i32, i32, i32, %s.9*, %s.9*, %s.9* }
      7 %s.9 = type { %s.10 }
      8 %s.10 = type { i64 }
      9 %s.4 = type { i64, i8*, [4 x i32], [4 x i32], [4 x i32], i32, i8, i8, [6 x i8] }
     10 
     11 @g0 = private constant [6 x i8] c"input\00", align 32
     12 @g1 = private constant [11 x i8] c"gaussian11\00", align 32
     13 @g2 = private constant [2 x %s.8] [%s.8 { i8* getelementptr inbounds ([6 x i8], [6 x i8]* @g0, i32 0, i32 0), i32 1, i32 2, i32 1, i32 8, %s.9* null, %s.9* null, %s.9* null }, %s.8 { i8* getelementptr inbounds ([11 x i8], [11 x i8]* @g1, i32 0, i32 0), i32 2, i32 2, i32 1, i32 8, %s.9* null, %s.9* null, %s.9* null }]
     14 @g3 = private constant [53 x i8] c"hexagon-32-os_unknown-no_asserts-no_bounds_query-hvx\00", align 32
     15 
     16 ; Function Attrs: nounwind
     17 declare i8* @f0(i8*, i32) #0
     18 
     19 ; Function Attrs: nounwind
     20 declare void @f1(i8*, i8*) #0
     21 
     22 ; Function Attrs: nounwind
     23 declare noalias i8* @f2(i8*, i32) #0
     24 
     25 ; Function Attrs: nounwind
     26 declare void @f3(i8*, i8*) #0
     27 
     28 ; Function Attrs: nounwind
     29 declare void @f4() #0
     30 
     31 ; Function Attrs: nounwind
     32 declare void @f5() #0
     33 
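; @f6 appears to implement a separable blur over the buffer described by %a0,
; writing the result into the buffer described by %a1:
;   b1-b11   scalar copy of the input into a temporary allocated with @f2,
;            clamping out-of-range samples to 0 ("constant_exterior" TBAA)
;   b13-b19  first HVX pass: widen bytes to words (vzb/vzh), scale the
;            +1-offset samples with vmpyiwb, and accumulate 32-bit "rows"
;            sums into a second temporary
;   b21-b30  second HVX pass: combine adjacent rows, shift right by 20, and
;            saturate down to bytes stored into the output
; Both temporaries are handed back to @f3 (likely a free routine) when done.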
     34 ; Function Attrs: nounwind
     35 define i32 @f6(%s.4* noalias nocapture readonly %a0, %s.4* noalias nocapture readonly %a1) #0 {
     36 b0:
     37   %v0 = getelementptr inbounds %s.4, %s.4* %a0, i32 0, i32 1
     38   %v1 = load i8*, i8** %v0
     39   %v2 = getelementptr inbounds %s.4, %s.4* %a0, i32 0, i32 2, i32 0
     40   %v3 = load i32, i32* %v2
     41   %v4 = getelementptr inbounds %s.4, %s.4* %a0, i32 0, i32 2, i32 1
     42   %v5 = load i32, i32* %v4
     43   %v6 = getelementptr inbounds %s.4, %s.4* %a0, i32 0, i32 3, i32 1
     44   %v7 = load i32, i32* %v6
     45   %v8 = getelementptr inbounds %s.4, %s.4* %a0, i32 0, i32 4, i32 0
     46   %v9 = load i32, i32* %v8
     47   %v10 = getelementptr inbounds %s.4, %s.4* %a0, i32 0, i32 4, i32 1
     48   %v11 = load i32, i32* %v10
     49   %v12 = getelementptr inbounds %s.4, %s.4* %a1, i32 0, i32 1
     50   %v13 = load i8*, i8** %v12
     51   %v14 = getelementptr inbounds %s.4, %s.4* %a1, i32 0, i32 2, i32 0
     52   %v15 = load i32, i32* %v14
     53   %v16 = getelementptr inbounds %s.4, %s.4* %a1, i32 0, i32 2, i32 1
     54   %v17 = load i32, i32* %v16
     55   %v18 = getelementptr inbounds %s.4, %s.4* %a1, i32 0, i32 3, i32 1
     56   %v19 = load i32, i32* %v18
     57   %v20 = getelementptr inbounds %s.4, %s.4* %a1, i32 0, i32 4, i32 0
     58   %v21 = load i32, i32* %v20
     59   %v22 = getelementptr inbounds %s.4, %s.4* %a1, i32 0, i32 4, i32 1
     60   %v23 = load i32, i32* %v22
     61   %v24 = add nsw i32 %v21, %v15
     62   %v25 = add nsw i32 %v24, -64
     63   %v26 = icmp slt i32 %v21, %v25
     64   %v27 = select i1 %v26, i32 %v21, i32 %v25
     65   %v28 = add nsw i32 %v15, -1
     66   %v29 = and i32 %v28, -64
     67   %v30 = add i32 %v21, 63
     68   %v31 = add i32 %v30, %v29
     69   %v32 = add nsw i32 %v24, -1
     70   %v33 = icmp slt i32 %v31, %v32
     71   %v34 = select i1 %v33, i32 %v31, i32 %v32
     72   %v35 = sub nsw i32 %v34, %v27
     73   %v36 = icmp slt i32 %v24, %v34
     74   %v37 = select i1 %v36, i32 %v34, i32 %v24
     75   %v38 = add nsw i32 %v37, -1
     76   %v39 = icmp slt i32 %v38, %v34
     77   %v40 = select i1 %v39, i32 %v34, i32 %v38
     78   %v41 = add nsw i32 %v17, 1
     79   %v42 = sext i32 %v41 to i64
     80   %v43 = sub nsw i32 %v40, %v27
     81   %v44 = add nsw i32 %v43, 2
     82   %v45 = sext i32 %v44 to i64
     83   %v46 = mul nsw i64 %v45, %v42
     84   %v47 = trunc i64 %v46 to i32
     85   %v48 = tail call i8* @f2(i8* null, i32 %v47)
     86   %v49 = add nsw i32 %v23, -1
     87   %v50 = add i32 %v23, %v17
     88   %v51 = icmp sgt i32 %v23, %v50
     89   br i1 %v51, label %b12, label %b1, !prof !3
     90 
     91 b1:                                               ; preds = %b11, %b0
     92   %v52 = phi i32 [ %v220, %b11 ], [ %v49, %b0 ]
     93   %v53 = icmp slt i32 %v9, %v24
     94   %v54 = select i1 %v53, i32 %v9, i32 %v24
     95   %v55 = add nsw i32 %v21, -1
     96   %v56 = icmp slt i32 %v54, %v55
     97   %v57 = select i1 %v56, i32 %v55, i32 %v54
     98   %v58 = add nsw i32 %v9, %v3
     99   %v59 = icmp slt i32 %v58, %v24
    100   %v60 = select i1 %v59, i32 %v58, i32 %v24
    101   %v61 = icmp slt i32 %v60, %v57
    102   %v62 = select i1 %v61, i32 %v57, i32 %v60
    103   %v63 = icmp slt i32 %v57, %v21
    104   br i1 %v63, label %b7, label %b2, !prof !3
    105 
    106 b2:                                               ; preds = %b1
    107   %v64 = add nsw i32 %v11, %v5
    108   %v65 = add nsw i32 %v64, -1
    109   %v66 = icmp slt i32 %v52, %v65
    110   br i1 %v66, label %b3, label %b4
    111 
    112 b3:                                               ; preds = %b3, %b2
    113   %v67 = phi i32 [ %v96, %b3 ], [ %v55, %b2 ]
    114   %v68 = mul nsw i32 %v11, %v7
    115   %v69 = icmp slt i32 %v52, %v11
    116   %v70 = select i1 %v69, i32 %v11, i32 %v52
    117   %v71 = mul nsw i32 %v70, %v7
    118   %v72 = add nsw i32 %v58, -1
    119   %v73 = icmp slt i32 %v67, %v72
    120   %v74 = select i1 %v73, i32 %v67, i32 %v72
    121   %v75 = icmp slt i32 %v74, %v9
    122   %v76 = select i1 %v75, i32 %v9, i32 %v74
    123   %v77 = add i32 %v68, %v9
    124   %v78 = sub i32 %v71, %v77
    125   %v79 = add i32 %v78, %v76
    126   %v80 = getelementptr inbounds i8, i8* %v1, i32 %v79
    127   %v81 = load i8, i8* %v80, align 1, !tbaa !4
    128   %v82 = icmp sle i32 %v64, %v52
    129   %v83 = icmp sle i32 %v58, %v67
    130   %v84 = icmp slt i32 %v67, %v9
    131   %v85 = or i1 %v84, %v83
    132   %v86 = or i1 %v69, %v85
    133   %v87 = or i1 %v82, %v86
    134   %v88 = select i1 %v87, i8 0, i8 %v81
    135   %v89 = sub i32 1, %v23
    136   %v90 = add i32 %v89, %v52
    137   %v91 = mul nsw i32 %v90, %v44
    138   %v92 = sub i32 1, %v27
    139   %v93 = add i32 %v92, %v91
    140   %v94 = add i32 %v93, %v67
    141   %v95 = getelementptr inbounds i8, i8* %v48, i32 %v94
    142   store i8 %v88, i8* %v95, align 1, !tbaa !7
    143   %v96 = add nsw i32 %v67, 1
    144   %v97 = icmp eq i32 %v96, %v57
    145   br i1 %v97, label %b7, label %b3
    146 
    147 b4:                                               ; preds = %b2
    148   %v98 = icmp slt i32 %v5, 1
    149   br i1 %v98, label %b5, label %b6
    150 
    151 b5:                                               ; preds = %b5, %b4
    152   %v99 = phi i32 [ %v123, %b5 ], [ %v55, %b4 ]
    153   %v100 = add nsw i32 %v58, -1
    154   %v101 = icmp slt i32 %v99, %v100
    155   %v102 = select i1 %v101, i32 %v99, i32 %v100
    156   %v103 = icmp slt i32 %v102, %v9
    157   %v104 = select i1 %v103, i32 %v9, i32 %v102
    158   %v105 = sub i32 %v104, %v9
    159   %v106 = getelementptr inbounds i8, i8* %v1, i32 %v105
    160   %v107 = load i8, i8* %v106, align 1, !tbaa !4
    161   %v108 = icmp sle i32 %v64, %v52
    162   %v109 = icmp slt i32 %v52, %v11
    163   %v110 = icmp sle i32 %v58, %v99
    164   %v111 = icmp slt i32 %v99, %v9
    165   %v112 = or i1 %v111, %v110
    166   %v113 = or i1 %v109, %v112
    167   %v114 = or i1 %v108, %v113
    168   %v115 = select i1 %v114, i8 0, i8 %v107
    169   %v116 = sub i32 1, %v23
    170   %v117 = add i32 %v116, %v52
    171   %v118 = mul nsw i32 %v117, %v44
    172   %v119 = sub i32 1, %v27
    173   %v120 = add i32 %v119, %v118
    174   %v121 = add i32 %v120, %v99
    175   %v122 = getelementptr inbounds i8, i8* %v48, i32 %v121
    176   store i8 %v115, i8* %v122, align 1, !tbaa !7
    177   %v123 = add nsw i32 %v99, 1
    178   %v124 = icmp eq i32 %v123, %v57
    179   br i1 %v124, label %b7, label %b5
    180 
    181 b6:                                               ; preds = %b6, %b4
    182   %v125 = phi i32 [ %v153, %b6 ], [ %v55, %b4 ]
    183   %v126 = mul nsw i32 %v11, %v7
    184   %v127 = mul nsw i32 %v65, %v7
    185   %v128 = add nsw i32 %v58, -1
    186   %v129 = icmp slt i32 %v125, %v128
    187   %v130 = select i1 %v129, i32 %v125, i32 %v128
    188   %v131 = icmp slt i32 %v130, %v9
    189   %v132 = select i1 %v131, i32 %v9, i32 %v130
    190   %v133 = add i32 %v126, %v9
    191   %v134 = sub i32 %v127, %v133
    192   %v135 = add i32 %v134, %v132
    193   %v136 = getelementptr inbounds i8, i8* %v1, i32 %v135
    194   %v137 = load i8, i8* %v136, align 1, !tbaa !4
    195   %v138 = icmp sle i32 %v64, %v52
    196   %v139 = icmp slt i32 %v52, %v11
    197   %v140 = icmp sle i32 %v58, %v125
    198   %v141 = icmp slt i32 %v125, %v9
    199   %v142 = or i1 %v141, %v140
    200   %v143 = or i1 %v139, %v142
    201   %v144 = or i1 %v138, %v143
    202   %v145 = select i1 %v144, i8 0, i8 %v137
    203   %v146 = sub i32 1, %v23
    204   %v147 = add i32 %v146, %v52
    205   %v148 = mul nsw i32 %v147, %v44
    206   %v149 = sub i32 1, %v27
    207   %v150 = add i32 %v149, %v148
    208   %v151 = add i32 %v150, %v125
    209   %v152 = getelementptr inbounds i8, i8* %v48, i32 %v151
    210   store i8 %v145, i8* %v152, align 1, !tbaa !7
    211   %v153 = add nsw i32 %v125, 1
    212   %v154 = icmp eq i32 %v153, %v57
    213   br i1 %v154, label %b7, label %b6
    214 
    215 b7:                                               ; preds = %b6, %b5, %b3, %b1
    216   %v155 = icmp slt i32 %v57, %v62
    217   br i1 %v155, label %b8, label %b9, !prof !9
    218 
    219 b8:                                               ; preds = %b8, %b7
    220   %v156 = phi i32 [ %v181, %b8 ], [ %v57, %b7 ]
    221   %v157 = mul nsw i32 %v11, %v7
    222   %v158 = add nsw i32 %v11, %v5
    223   %v159 = add nsw i32 %v158, -1
    224   %v160 = icmp slt i32 %v52, %v159
    225   %v161 = select i1 %v160, i32 %v52, i32 %v159
    226   %v162 = icmp slt i32 %v161, %v11
    227   %v163 = select i1 %v162, i32 %v11, i32 %v161
    228   %v164 = mul nsw i32 %v163, %v7
    229   %v165 = add i32 %v157, %v9
    230   %v166 = sub i32 %v164, %v165
    231   %v167 = add i32 %v166, %v156
    232   %v168 = getelementptr inbounds i8, i8* %v1, i32 %v167
    233   %v169 = load i8, i8* %v168, align 1, !tbaa !4
    234   %v170 = icmp sle i32 %v158, %v52
    235   %v171 = icmp slt i32 %v52, %v11
    236   %v172 = or i1 %v171, %v170
    237   %v173 = select i1 %v172, i8 0, i8 %v169
    238   %v174 = sub i32 1, %v23
    239   %v175 = add i32 %v174, %v52
    240   %v176 = mul nsw i32 %v175, %v44
    241   %v177 = sub i32 1, %v27
    242   %v178 = add i32 %v177, %v176
    243   %v179 = add i32 %v178, %v156
    244   %v180 = getelementptr inbounds i8, i8* %v48, i32 %v179
    245   store i8 %v173, i8* %v180, align 1, !tbaa !7
    246   %v181 = add nsw i32 %v156, 1
    247   %v182 = icmp eq i32 %v181, %v62
    248   br i1 %v182, label %b9, label %b8
    249 
    250 b9:                                               ; preds = %b8, %b7
    251   %v183 = icmp slt i32 %v62, %v24
    252   br i1 %v183, label %b10, label %b11, !prof !9
    253 
    254 b10:                                              ; preds = %b10, %b9
    255   %v184 = phi i32 [ %v218, %b10 ], [ %v62, %b9 ]
    256   %v185 = mul nsw i32 %v11, %v7
    257   %v186 = add nsw i32 %v11, %v5
    258   %v187 = add nsw i32 %v186, -1
    259   %v188 = icmp slt i32 %v52, %v187
    260   %v189 = select i1 %v188, i32 %v52, i32 %v187
    261   %v190 = icmp slt i32 %v189, %v11
    262   %v191 = select i1 %v190, i32 %v11, i32 %v189
    263   %v192 = mul nsw i32 %v191, %v7
    264   %v193 = add nsw i32 %v58, -1
    265   %v194 = icmp slt i32 %v184, %v193
    266   %v195 = select i1 %v194, i32 %v184, i32 %v193
    267   %v196 = icmp slt i32 %v195, %v9
    268   %v197 = select i1 %v196, i32 %v9, i32 %v195
    269   %v198 = add i32 %v185, %v9
    270   %v199 = sub i32 %v192, %v198
    271   %v200 = add i32 %v199, %v197
    272   %v201 = getelementptr inbounds i8, i8* %v1, i32 %v200
    273   %v202 = load i8, i8* %v201, align 1, !tbaa !4
    274   %v203 = icmp sle i32 %v186, %v52
    275   %v204 = icmp slt i32 %v52, %v11
    276   %v205 = icmp sle i32 %v58, %v184
    277   %v206 = icmp slt i32 %v184, %v9
    278   %v207 = or i1 %v206, %v205
    279   %v208 = or i1 %v204, %v207
    280   %v209 = or i1 %v203, %v208
    281   %v210 = select i1 %v209, i8 0, i8 %v202
    282   %v211 = sub i32 1, %v23
    283   %v212 = add i32 %v211, %v52
    284   %v213 = mul nsw i32 %v212, %v44
    285   %v214 = sub i32 1, %v27
    286   %v215 = add i32 %v214, %v213
    287   %v216 = add i32 %v215, %v184
    288   %v217 = getelementptr inbounds i8, i8* %v48, i32 %v216
    289   store i8 %v210, i8* %v217, align 1, !tbaa !7
    290   %v218 = add nsw i32 %v184, 1
    291   %v219 = icmp eq i32 %v218, %v24
    292   br i1 %v219, label %b11, label %b10
    293 
    294 b11:                                              ; preds = %b10, %b9
    295   %v220 = add nsw i32 %v52, 1
    296   %v221 = icmp eq i32 %v220, %v50
    297   br i1 %v221, label %b12, label %b1
    298 
    299 b12:                                              ; preds = %b11, %b0
    300   %v222 = add nsw i32 %v35, 1
    301   %v223 = sext i32 %v222 to i64
    302   %v224 = shl nsw i64 %v42, 2
    303   %v225 = mul i64 %v224, %v223
    304   %v226 = trunc i64 %v225 to i32
    305   %v227 = tail call i8* @f2(i8* null, i32 %v226)
    306   br i1 %v51, label %b14, label %b13, !prof !3
    307 
    308 b13:                                              ; preds = %b19, %b12
    309   %v228 = phi i32 [ %v351, %b19 ], [ %v49, %b12 ]
    310   %v229 = ashr i32 %v15, 6
    311   %v230 = icmp slt i32 %v229, 0
    312   %v231 = select i1 %v230, i32 0, i32 %v229
    313   %v232 = icmp sgt i32 %v231, 0
    314   br i1 %v232, label %b16, label %b17, !prof !9
    315 
    316 b14:                                              ; preds = %b19, %b12
    317   %v233 = icmp eq i8* %v48, null
    318   br i1 %v233, label %b20, label %b15
    319 
    320 b15:                                              ; preds = %b14
    321   tail call void @f3(i8* null, i8* %v48) #2
    322   br label %b20
    323 
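; b16: first-pass vector body. Loads 64 bytes at offsets +0 and +1 from the
; padded copy, widens them to 32-bit lanes, scales the +1 samples with
; vmpyiwb(..., 0x0a0a0a0a), adds the two, and stores four 16-word vectors of
; partial sums into the i32 temporary. b18 repeats this for the right edge.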
    324 b16:                                              ; preds = %b16, %b13
    325   %v234 = phi i32 [ %v289, %b16 ], [ 0, %b13 ]
    326   %v235 = sub nsw i32 %v228, %v23
    327   %v236 = add nsw i32 %v235, 1
    328   %v237 = mul nsw i32 %v236, %v44
    329   %v238 = shl i32 %v234, 6
    330   %v239 = sub i32 %v21, %v27
    331   %v240 = add i32 %v239, %v238
    332   %v241 = add nsw i32 %v240, %v237
    333   %v242 = getelementptr inbounds i8, i8* %v48, i32 %v241
    334   %v243 = bitcast i8* %v242 to <16 x i32>*
    335   %v244 = load <16 x i32>, <16 x i32>* %v243, align 1, !tbaa !7
    336   %v245 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %v244)
    337   %v246 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v245)
    338   %v247 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v245)
    339   %v248 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v247)
    340   %v249 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v246)
    341   %v250 = add nsw i32 %v241, 1
    342   %v251 = getelementptr inbounds i8, i8* %v48, i32 %v250
    343   %v252 = bitcast i8* %v251 to <16 x i32>*
    344   %v253 = load <16 x i32>, <16 x i32>* %v252, align 1, !tbaa !7
    345   %v254 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %v253)
    346   %v255 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v254)
    347   %v256 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v255)
    348   %v257 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v256)
    349   %v258 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v256)
    350   %v259 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v257, i32 168430090)
    351   %v260 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v258, i32 168430090)
    352   %v261 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v259, <16 x i32> %v260)
    353   %v262 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v254)
    354   %v263 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v262)
    355   %v264 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v263)
    356   %v265 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v263)
    357   %v266 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v264, i32 168430090)
    358   %v267 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v265, i32 168430090)
    359   %v268 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v266, <16 x i32> %v267)
    360   %v269 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v248, <32 x i32> %v261)
    361   %v270 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v249, <32 x i32> %v268)
    362   %v271 = shufflevector <32 x i32> %v269, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    363   %v272 = mul nsw i32 %v236, %v222
    364   %v273 = add nsw i32 %v240, %v272
    365   %v274 = bitcast i8* %v227 to i32*
    366   %v275 = getelementptr inbounds i32, i32* %v274, i32 %v273
    367   %v276 = bitcast i32* %v275 to <16 x i32>*
    368   store <16 x i32> %v271, <16 x i32>* %v276, align 4, !tbaa !10
    369   %v277 = shufflevector <32 x i32> %v269, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    370   %v278 = add nsw i32 %v273, 16
    371   %v279 = getelementptr inbounds i32, i32* %v274, i32 %v278
    372   %v280 = bitcast i32* %v279 to <16 x i32>*
    373   store <16 x i32> %v277, <16 x i32>* %v280, align 4, !tbaa !10
    374   %v281 = shufflevector <32 x i32> %v270, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    375   %v282 = add nsw i32 %v273, 32
    376   %v283 = getelementptr inbounds i32, i32* %v274, i32 %v282
    377   %v284 = bitcast i32* %v283 to <16 x i32>*
    378   store <16 x i32> %v281, <16 x i32>* %v284, align 4, !tbaa !10
    379   %v285 = shufflevector <32 x i32> %v270, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    380   %v286 = add nsw i32 %v273, 48
    381   %v287 = getelementptr inbounds i32, i32* %v274, i32 %v286
    382   %v288 = bitcast i32* %v287 to <16 x i32>*
    383   store <16 x i32> %v285, <16 x i32>* %v288, align 4, !tbaa !10
    384   %v289 = add nuw nsw i32 %v234, 1
    385   %v290 = icmp eq i32 %v289, %v231
    386   br i1 %v290, label %b17, label %b16
    387 
    388 b17:                                              ; preds = %b16, %b13
    389   %v291 = add nsw i32 %v15, 63
    390   %v292 = ashr i32 %v291, 6
    391   %v293 = icmp slt i32 %v231, %v292
    392   br i1 %v293, label %b18, label %b19, !prof !9
    393 
    394 b18:                                              ; preds = %b18, %b17
    395   %v294 = phi i32 [ %v349, %b18 ], [ %v231, %b17 ]
    396   %v295 = sub nsw i32 %v228, %v23
    397   %v296 = add nsw i32 %v295, 1
    398   %v297 = mul nsw i32 %v296, %v44
    399   %v298 = sub nsw i32 %v24, %v27
    400   %v299 = add nsw i32 %v297, %v298
    401   %v300 = add nsw i32 %v299, -64
    402   %v301 = getelementptr inbounds i8, i8* %v48, i32 %v300
    403   %v302 = bitcast i8* %v301 to <16 x i32>*
    404   %v303 = load <16 x i32>, <16 x i32>* %v302, align 1, !tbaa !7
    405   %v304 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %v303)
    406   %v305 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v304)
    407   %v306 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v304)
    408   %v307 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v306)
    409   %v308 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v305)
    410   %v309 = add nsw i32 %v299, -63
    411   %v310 = getelementptr inbounds i8, i8* %v48, i32 %v309
    412   %v311 = bitcast i8* %v310 to <16 x i32>*
    413   %v312 = load <16 x i32>, <16 x i32>* %v311, align 1, !tbaa !7
    414   %v313 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %v312)
    415   %v314 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v313)
    416   %v315 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v314)
    417   %v316 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v315)
    418   %v317 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v315)
    419   %v318 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v316, i32 168430090)
    420   %v319 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v317, i32 168430090)
    421   %v320 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v318, <16 x i32> %v319)
    422   %v321 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v313)
    423   %v322 = tail call <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32> %v321)
    424   %v323 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v322)
    425   %v324 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v322)
    426   %v325 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v323, i32 168430090)
    427   %v326 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v324, i32 168430090)
    428   %v327 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v325, <16 x i32> %v326)
    429   %v328 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v307, <32 x i32> %v320)
    430   %v329 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v308, <32 x i32> %v327)
    431   %v330 = shufflevector <32 x i32> %v328, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    432   %v331 = mul nsw i32 %v296, %v222
    433   %v332 = add nsw i32 %v331, %v298
    434   %v333 = add nsw i32 %v332, -64
    435   %v334 = bitcast i8* %v227 to i32*
    436   %v335 = getelementptr inbounds i32, i32* %v334, i32 %v333
    437   %v336 = bitcast i32* %v335 to <16 x i32>*
    438   store <16 x i32> %v330, <16 x i32>* %v336, align 4, !tbaa !10
    439   %v337 = shufflevector <32 x i32> %v328, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    440   %v338 = add nsw i32 %v332, -48
    441   %v339 = getelementptr inbounds i32, i32* %v334, i32 %v338
    442   %v340 = bitcast i32* %v339 to <16 x i32>*
    443   store <16 x i32> %v337, <16 x i32>* %v340, align 4, !tbaa !10
    444   %v341 = shufflevector <32 x i32> %v329, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    445   %v342 = add nsw i32 %v332, -32
    446   %v343 = getelementptr inbounds i32, i32* %v334, i32 %v342
    447   %v344 = bitcast i32* %v343 to <16 x i32>*
    448   store <16 x i32> %v341, <16 x i32>* %v344, align 4, !tbaa !10
    449   %v345 = shufflevector <32 x i32> %v329, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    450   %v346 = add nsw i32 %v332, -16
    451   %v347 = getelementptr inbounds i32, i32* %v334, i32 %v346
    452   %v348 = bitcast i32* %v347 to <16 x i32>*
    453   store <16 x i32> %v345, <16 x i32>* %v348, align 4, !tbaa !10
    454   %v349 = add nuw nsw i32 %v294, 1
    455   %v350 = icmp eq i32 %v349, %v292
    456   br i1 %v350, label %b19, label %b18
    457 
    458 b19:                                              ; preds = %b18, %b17
    459   %v351 = add nsw i32 %v228, 1
    460   %v352 = icmp eq i32 %v351, %v50
    461   br i1 %v352, label %b14, label %b13
    462 
    463 b20:                                              ; preds = %b15, %b14
    464   %v353 = icmp sgt i32 %v17, 0
    465   br i1 %v353, label %b21, label %b31, !prof !9
    466 
    467 b21:                                              ; preds = %b20
    468   %v354 = ashr i32 %v15, 6
    469   %v355 = icmp slt i32 %v354, 0
    470   %v356 = select i1 %v355, i32 0, i32 %v354
    471   %v357 = icmp sgt i32 %v356, 0
    472   br i1 %v357, label %b25, label %b27
    473 
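; b22: second-pass vector body. Loads 64 words of partial sums from row n and
; row n+1, scales row n+1 with vmpyiwb(..., 0x0a0a0a0a), adds, shifts right by
; 20 (vlsrw), saturates words->halfwords->unsigned bytes (vsatwh/vsathub), and
; stores the 64 result bytes to the output. b23 and b28 handle the right-edge
; strips the same way.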
    474 b22:                                              ; preds = %b25, %b22
    475   %v358 = phi i32 [ %v442, %b22 ], [ 0, %b25 ]
    476   %v359 = sub nsw i32 %v525, %v23
    477   %v360 = mul nsw i32 %v359, %v222
    478   %v361 = shl nsw i32 %v358, 6
    479   %v362 = add nsw i32 %v361, %v21
    480   %v363 = sub nsw i32 %v362, %v27
    481   %v364 = add nsw i32 %v363, %v360
    482   %v365 = bitcast i8* %v227 to i32*
    483   %v366 = getelementptr inbounds i32, i32* %v365, i32 %v364
    484   %v367 = bitcast i32* %v366 to <16 x i32>*
    485   %v368 = load <16 x i32>, <16 x i32>* %v367, align 4, !tbaa !10
    486   %v369 = add nsw i32 %v364, 16
    487   %v370 = getelementptr inbounds i32, i32* %v365, i32 %v369
    488   %v371 = bitcast i32* %v370 to <16 x i32>*
    489   %v372 = load <16 x i32>, <16 x i32>* %v371, align 4, !tbaa !10
    490   %v373 = shufflevector <16 x i32> %v368, <16 x i32> %v372, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    491   %v374 = add nsw i32 %v359, 1
    492   %v375 = mul nsw i32 %v374, %v222
    493   %v376 = add nsw i32 %v363, %v375
    494   %v377 = getelementptr inbounds i32, i32* %v365, i32 %v376
    495   %v378 = bitcast i32* %v377 to <16 x i32>*
    496   %v379 = load <16 x i32>, <16 x i32>* %v378, align 4, !tbaa !10
    497   %v380 = add nsw i32 %v376, 16
    498   %v381 = getelementptr inbounds i32, i32* %v365, i32 %v380
    499   %v382 = bitcast i32* %v381 to <16 x i32>*
    500   %v383 = load <16 x i32>, <16 x i32>* %v382, align 4, !tbaa !10
    501   %v384 = shufflevector <16 x i32> %v379, <16 x i32> %v383, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    502   %v385 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v384)
    503   %v386 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v384)
    504   %v387 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v385, i32 168430090)
    505   %v388 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v386, i32 168430090)
    506   %v389 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v387, <16 x i32> %v388)
    507   %v390 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v373, <32 x i32> %v389)
    508   %v391 = shufflevector <32 x i32> %v390, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    509   %v392 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v391, i32 20)
    510   %v393 = shufflevector <32 x i32> %v390, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    511   %v394 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v393, i32 20)
    512   %v395 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v394, <16 x i32> %v392)
    513   %v396 = add nsw i32 %v364, 32
    514   %v397 = getelementptr inbounds i32, i32* %v365, i32 %v396
    515   %v398 = bitcast i32* %v397 to <16 x i32>*
    516   %v399 = load <16 x i32>, <16 x i32>* %v398, align 4, !tbaa !10
    517   %v400 = add nsw i32 %v364, 48
    518   %v401 = getelementptr inbounds i32, i32* %v365, i32 %v400
    519   %v402 = bitcast i32* %v401 to <16 x i32>*
    520   %v403 = load <16 x i32>, <16 x i32>* %v402, align 4, !tbaa !10
    521   %v404 = shufflevector <16 x i32> %v399, <16 x i32> %v403, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    522   %v405 = add nsw i32 %v376, 32
    523   %v406 = getelementptr inbounds i32, i32* %v365, i32 %v405
    524   %v407 = bitcast i32* %v406 to <16 x i32>*
    525   %v408 = load <16 x i32>, <16 x i32>* %v407, align 4, !tbaa !10
    526   %v409 = add nsw i32 %v376, 48
    527   %v410 = getelementptr inbounds i32, i32* %v365, i32 %v409
    528   %v411 = bitcast i32* %v410 to <16 x i32>*
    529   %v412 = load <16 x i32>, <16 x i32>* %v411, align 4, !tbaa !10
    530   %v413 = shufflevector <16 x i32> %v408, <16 x i32> %v412, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    531   %v414 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v413)
    532   %v415 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v413)
    533   %v416 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v414, i32 168430090)
    534   %v417 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v415, i32 168430090)
    535   %v418 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v416, <16 x i32> %v417)
    536   %v419 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v404, <32 x i32> %v418)
    537   %v420 = shufflevector <32 x i32> %v419, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    538   %v421 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v420, i32 20)
    539   %v422 = shufflevector <32 x i32> %v419, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    540   %v423 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v422, i32 20)
    541   %v424 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v423, <16 x i32> %v421)
    542   %v425 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v395)
    543   %v426 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v395)
    544   %v427 = tail call <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32> %v425, <16 x i32> %v426)
    545   %v428 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v424)
    546   %v429 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v424)
    547   %v430 = tail call <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32> %v428, <16 x i32> %v429)
    548   %v431 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v430, <16 x i32> %v427)
    549   %v432 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v431)
    550   %v433 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v431)
    551   %v434 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %v432, <16 x i32> %v433)
    552   %v435 = mul nsw i32 %v23, %v19
    553   %v436 = mul nsw i32 %v525, %v19
    554   %v437 = add i32 %v435, %v21
    555   %v438 = sub i32 %v436, %v437
    556   %v439 = add i32 %v438, %v362
    557   %v440 = getelementptr inbounds i8, i8* %v13, i32 %v439
    558   %v441 = bitcast i8* %v440 to <16 x i32>*
    559   store <16 x i32> %v434, <16 x i32>* %v441, align 1, !tbaa !12
    560   %v442 = add nuw nsw i32 %v358, 1
    561   %v443 = icmp eq i32 %v442, %v356
    562   br i1 %v443, label %b26, label %b22
    563 
    564 b23:                                              ; preds = %b26, %b23
    565   %v444 = phi i32 [ %v521, %b23 ], [ %v356, %b26 ]
    566   %v445 = sub nsw i32 %v24, %v27
    567   %v446 = add nsw i32 %v360, %v445
    568   %v447 = add nsw i32 %v446, -64
    569   %v448 = getelementptr inbounds i32, i32* %v365, i32 %v447
    570   %v449 = bitcast i32* %v448 to <16 x i32>*
    571   %v450 = load <16 x i32>, <16 x i32>* %v449, align 4, !tbaa !10
    572   %v451 = add nsw i32 %v446, -48
    573   %v452 = getelementptr inbounds i32, i32* %v365, i32 %v451
    574   %v453 = bitcast i32* %v452 to <16 x i32>*
    575   %v454 = load <16 x i32>, <16 x i32>* %v453, align 4, !tbaa !10
    576   %v455 = shufflevector <16 x i32> %v450, <16 x i32> %v454, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    577   %v456 = add nsw i32 %v375, %v445
    578   %v457 = add nsw i32 %v456, -64
    579   %v458 = getelementptr inbounds i32, i32* %v365, i32 %v457
    580   %v459 = bitcast i32* %v458 to <16 x i32>*
    581   %v460 = load <16 x i32>, <16 x i32>* %v459, align 4, !tbaa !10
    582   %v461 = add nsw i32 %v456, -48
    583   %v462 = getelementptr inbounds i32, i32* %v365, i32 %v461
    584   %v463 = bitcast i32* %v462 to <16 x i32>*
    585   %v464 = load <16 x i32>, <16 x i32>* %v463, align 4, !tbaa !10
    586   %v465 = shufflevector <16 x i32> %v460, <16 x i32> %v464, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    587   %v466 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v465)
    588   %v467 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v465)
    589   %v468 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v466, i32 168430090)
    590   %v469 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v467, i32 168430090)
    591   %v470 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v468, <16 x i32> %v469)
    592   %v471 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v455, <32 x i32> %v470)
    593   %v472 = shufflevector <32 x i32> %v471, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    594   %v473 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v472, i32 20)
    595   %v474 = shufflevector <32 x i32> %v471, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    596   %v475 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v474, i32 20)
    597   %v476 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v475, <16 x i32> %v473)
    598   %v477 = add nsw i32 %v446, -32
    599   %v478 = getelementptr inbounds i32, i32* %v365, i32 %v477
    600   %v479 = bitcast i32* %v478 to <16 x i32>*
    601   %v480 = load <16 x i32>, <16 x i32>* %v479, align 4, !tbaa !10
    602   %v481 = add nsw i32 %v446, -16
    603   %v482 = getelementptr inbounds i32, i32* %v365, i32 %v481
    604   %v483 = bitcast i32* %v482 to <16 x i32>*
    605   %v484 = load <16 x i32>, <16 x i32>* %v483, align 4, !tbaa !10
    606   %v485 = shufflevector <16 x i32> %v480, <16 x i32> %v484, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    607   %v486 = add nsw i32 %v456, -32
    608   %v487 = getelementptr inbounds i32, i32* %v365, i32 %v486
    609   %v488 = bitcast i32* %v487 to <16 x i32>*
    610   %v489 = load <16 x i32>, <16 x i32>* %v488, align 4, !tbaa !10
    611   %v490 = add nsw i32 %v456, -16
    612   %v491 = getelementptr inbounds i32, i32* %v365, i32 %v490
    613   %v492 = bitcast i32* %v491 to <16 x i32>*
    614   %v493 = load <16 x i32>, <16 x i32>* %v492, align 4, !tbaa !10
    615   %v494 = shufflevector <16 x i32> %v489, <16 x i32> %v493, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    616   %v495 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v494)
    617   %v496 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v494)
    618   %v497 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v495, i32 168430090)
    619   %v498 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v496, i32 168430090)
    620   %v499 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v497, <16 x i32> %v498)
    621   %v500 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v485, <32 x i32> %v499)
    622   %v501 = shufflevector <32 x i32> %v500, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    623   %v502 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v501, i32 20)
    624   %v503 = shufflevector <32 x i32> %v500, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    625   %v504 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v503, i32 20)
    626   %v505 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v504, <16 x i32> %v502)
    627   %v506 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v476)
    628   %v507 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v476)
    629   %v508 = tail call <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32> %v506, <16 x i32> %v507)
    630   %v509 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v505)
    631   %v510 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v505)
    632   %v511 = tail call <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32> %v509, <16 x i32> %v510)
    633   %v512 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v511, <16 x i32> %v508)
    634   %v513 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v512)
    635   %v514 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v512)
    636   %v515 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %v513, <16 x i32> %v514)
    637   %v516 = add i32 %v15, -64
    638   %v517 = sub i32 %v516, %v435
    639   %v518 = add i32 %v517, %v436
    640   %v519 = getelementptr inbounds i8, i8* %v13, i32 %v518
    641   %v520 = bitcast i8* %v519 to <16 x i32>*
    642   store <16 x i32> %v515, <16 x i32>* %v520, align 1, !tbaa !12
    643   %v521 = add nuw nsw i32 %v444, 1
    644   %v522 = icmp eq i32 %v521, %v527
    645   br i1 %v522, label %b24, label %b23
    646 
    647 b24:                                              ; preds = %b26, %b23
    648   %v523 = add nsw i32 %v525, 1
    649   %v524 = icmp eq i32 %v523, %v50
    650   br i1 %v524, label %b32, label %b25
    651 
    652 b25:                                              ; preds = %b24, %b21
    653   %v525 = phi i32 [ %v523, %b24 ], [ %v23, %b21 ]
    654   br label %b22
    655 
    656 b26:                                              ; preds = %b22
    657   %v526 = add nsw i32 %v15, 63
    658   %v527 = ashr i32 %v526, 6
    659   %v528 = icmp slt i32 %v356, %v527
    660   br i1 %v528, label %b23, label %b24, !prof !9
    661 
    662 b27:                                              ; preds = %b21
    663   %v529 = add nsw i32 %v15, 63
    664   %v530 = ashr i32 %v529, 6
    665   %v531 = icmp slt i32 %v356, %v530
    666   br i1 %v531, label %b29, label %b31
    667 
    668 b28:                                              ; preds = %b29, %b28
    669   %v532 = phi i32 [ %v616, %b28 ], [ %v356, %b29 ]
    670   %v533 = sub nsw i32 %v618, %v23
    671   %v534 = mul nsw i32 %v533, %v222
    672   %v535 = sub nsw i32 %v24, %v27
    673   %v536 = add nsw i32 %v534, %v535
    674   %v537 = add nsw i32 %v536, -64
    675   %v538 = bitcast i8* %v227 to i32*
    676   %v539 = getelementptr inbounds i32, i32* %v538, i32 %v537
    677   %v540 = bitcast i32* %v539 to <16 x i32>*
    678   %v541 = load <16 x i32>, <16 x i32>* %v540, align 4, !tbaa !10
    679   %v542 = add nsw i32 %v536, -48
    680   %v543 = getelementptr inbounds i32, i32* %v538, i32 %v542
    681   %v544 = bitcast i32* %v543 to <16 x i32>*
    682   %v545 = load <16 x i32>, <16 x i32>* %v544, align 4, !tbaa !10
    683   %v546 = shufflevector <16 x i32> %v541, <16 x i32> %v545, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    684   %v547 = add nsw i32 %v533, 1
    685   %v548 = mul nsw i32 %v547, %v222
    686   %v549 = add nsw i32 %v548, %v535
    687   %v550 = add nsw i32 %v549, -64
    688   %v551 = getelementptr inbounds i32, i32* %v538, i32 %v550
    689   %v552 = bitcast i32* %v551 to <16 x i32>*
    690   %v553 = load <16 x i32>, <16 x i32>* %v552, align 4, !tbaa !10
    691   %v554 = add nsw i32 %v549, -48
    692   %v555 = getelementptr inbounds i32, i32* %v538, i32 %v554
    693   %v556 = bitcast i32* %v555 to <16 x i32>*
    694   %v557 = load <16 x i32>, <16 x i32>* %v556, align 4, !tbaa !10
    695   %v558 = shufflevector <16 x i32> %v553, <16 x i32> %v557, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    696   %v559 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v558)
    697   %v560 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v558)
    698   %v561 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v559, i32 168430090)
    699   %v562 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v560, i32 168430090)
    700   %v563 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v561, <16 x i32> %v562)
    701   %v564 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v546, <32 x i32> %v563)
    702   %v565 = shufflevector <32 x i32> %v564, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    703   %v566 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v565, i32 20)
    704   %v567 = shufflevector <32 x i32> %v564, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    705   %v568 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v567, i32 20)
    706   %v569 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v568, <16 x i32> %v566)
    707   %v570 = add nsw i32 %v536, -32
    708   %v571 = getelementptr inbounds i32, i32* %v538, i32 %v570
    709   %v572 = bitcast i32* %v571 to <16 x i32>*
    710   %v573 = load <16 x i32>, <16 x i32>* %v572, align 4, !tbaa !10
    711   %v574 = add nsw i32 %v536, -16
    712   %v575 = getelementptr inbounds i32, i32* %v538, i32 %v574
    713   %v576 = bitcast i32* %v575 to <16 x i32>*
    714   %v577 = load <16 x i32>, <16 x i32>* %v576, align 4, !tbaa !10
    715   %v578 = shufflevector <16 x i32> %v573, <16 x i32> %v577, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    716   %v579 = add nsw i32 %v549, -32
    717   %v580 = getelementptr inbounds i32, i32* %v538, i32 %v579
    718   %v581 = bitcast i32* %v580 to <16 x i32>*
    719   %v582 = load <16 x i32>, <16 x i32>* %v581, align 4, !tbaa !10
    720   %v583 = add nsw i32 %v549, -16
    721   %v584 = getelementptr inbounds i32, i32* %v538, i32 %v583
    722   %v585 = bitcast i32* %v584 to <16 x i32>*
    723   %v586 = load <16 x i32>, <16 x i32>* %v585, align 4, !tbaa !10
    724   %v587 = shufflevector <16 x i32> %v582, <16 x i32> %v586, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    725   %v588 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v587)
    726   %v589 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v587)
    727   %v590 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v588, i32 168430090)
    728   %v591 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32> %v589, i32 168430090)
    729   %v592 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v590, <16 x i32> %v591)
    730   %v593 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v578, <32 x i32> %v592)
    731   %v594 = shufflevector <32 x i32> %v593, <32 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    732   %v595 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v594, i32 20)
    733   %v596 = shufflevector <32 x i32> %v593, <32 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    734   %v597 = tail call <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32> %v596, i32 20)
    735   %v598 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v597, <16 x i32> %v595)
    736   %v599 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v569)
    737   %v600 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v569)
    738   %v601 = tail call <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32> %v599, <16 x i32> %v600)
    739   %v602 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v598)
    740   %v603 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v598)
    741   %v604 = tail call <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32> %v602, <16 x i32> %v603)
    742   %v605 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %v604, <16 x i32> %v601)
    743   %v606 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v605)
    744   %v607 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %v605)
    745   %v608 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %v606, <16 x i32> %v607)
    746   %v609 = mul nsw i32 %v23, %v19
    747   %v610 = mul nsw i32 %v618, %v19
    748   %v611 = add i32 %v15, -64
    749   %v612 = sub i32 %v611, %v609
    750   %v613 = add i32 %v612, %v610
    751   %v614 = getelementptr inbounds i8, i8* %v13, i32 %v613
    752   %v615 = bitcast i8* %v614 to <16 x i32>*
    753   store <16 x i32> %v608, <16 x i32>* %v615, align 1, !tbaa !12
    754   %v616 = add nuw nsw i32 %v532, 1
    755   %v617 = icmp eq i32 %v616, %v530
    756   br i1 %v617, label %b30, label %b28
    757 
    758 b29:                                              ; preds = %b30, %b27
    759   %v618 = phi i32 [ %v619, %b30 ], [ %v23, %b27 ]
    760   br label %b28
    761 
    762 b30:                                              ; preds = %b28
    763   %v619 = add nsw i32 %v618, 1
    764   %v620 = icmp eq i32 %v619, %v50
    765   br i1 %v620, label %b32, label %b29
    766 
    767 b31:                                              ; preds = %b27, %b20
    768   %v621 = icmp eq i8* %v227, null
    769   br i1 %v621, label %b33, label %b32
    770 
    771 b32:                                              ; preds = %b31, %b30, %b24
    772   tail call void @f3(i8* null, i8* %v227) #2
    773   br label %b33
    774 
    775 b33:                                              ; preds = %b32, %b31
    776   ret i32 0
    777 }
    778 
    779 ; Function Attrs: nounwind readnone
    780 declare <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32>) #1
    781 
    782 ; Function Attrs: nounwind readnone
    783 declare <16 x i32> @llvm.hexagon.V6.hi(<32 x i32>) #1
    784 
    785 ; Function Attrs: nounwind readnone
    786 declare <16 x i32> @llvm.hexagon.V6.lo(<32 x i32>) #1
    787 
    788 ; Function Attrs: nounwind readnone
    789 declare <32 x i32> @llvm.hexagon.V6.vzh(<16 x i32>) #1
    790 
    791 ; Function Attrs: nounwind readnone
    792 declare <16 x i32> @llvm.hexagon.V6.vmpyiwb(<16 x i32>, i32) #1
    793 
    794 ; Function Attrs: nounwind readnone
    795 declare <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32>, <16 x i32>) #1
    796 
    797 ; Function Attrs: nounwind readnone
    798 declare <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32>, <32 x i32>) #1
    799 
    800 ; Function Attrs: nounwind readnone
    801 declare <16 x i32> @llvm.hexagon.V6.vlsrw(<16 x i32>, i32) #1
    802 
    803 ; Function Attrs: nounwind readnone
    804 declare <16 x i32> @llvm.hexagon.V6.vsatwh(<16 x i32>, <16 x i32>) #1
    805 
    806 ; Function Attrs: nounwind readnone
    807 declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1
    808 
    809 attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvxv60,+hvx-length64b" }
    810 attributes #1 = { nounwind readnone }
    811 attributes #2 = { nobuiltin nounwind }
    812 
    813 !llvm.module.flags = !{!0, !1, !2}
    814 
    815 !0 = !{i32 2, !"halide_use_soft_float_abi", i32 0}
    816 !1 = !{i32 2, !"halide_mcpu", !"hexagonv60"}
    817 !2 = !{i32 2, !"halide_mattrs", !"+hvx"}
    818 !3 = !{!"branch_weights", i32 0, i32 1073741824}
    819 !4 = !{!5, !5, i64 0}
    820 !5 = !{!"input", !6}
    821 !6 = !{!"Halide buffer"}
    822 !7 = !{!8, !8, i64 0}
    823 !8 = !{!"constant_exterior", !6}
    824 !9 = !{!"branch_weights", i32 1073741824, i32 0}
    825 !10 = !{!11, !11, i64 0}
    826 !11 = !{!"rows", !6}
    827 !12 = !{!13, !13, i64 0}
    828 !13 = !{!"gaussian11", !6}
    829