Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
      2 ; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s
      3 ; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
      4 
      5 ; We expect a two-digit VGPR count here, not a three-digit one.
      6 ; CHECK: NumVgprs: {{[0-9][0-9]$}}
      7 
      8 define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) {
      9 bb:
     10   %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
     11   %tmp2 = load float, float addrspace(3)* %tmp, align 4
     12   %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
     13   %tmp4 = load float, float addrspace(3)* %tmp3, align 4
     14   %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 3
     15   %tmp6 = load float, float addrspace(3)* %tmp5, align 4
     16   %tmp7 = tail call float @llvm.fmuladd.f32(float %tmp2, float %tmp4, float %tmp6)
     17   %tmp8 = getelementptr inbounds float, float addrspace(3)* %arg, i32 5
     18   %tmp9 = load float, float addrspace(3)* %tmp8, align 4
     19   %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6
     20   %tmp11 = load float, float addrspace(3)* %tmp10, align 4
     21   %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 7
     22   %tmp13 = load float, float addrspace(3)* %tmp12, align 4
     23   %tmp14 = tail call float @llvm.fmuladd.f32(float %tmp9, float %tmp11, float %tmp13)
     24   %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 9
     25   %tmp16 = load float, float addrspace(3)* %tmp15, align 4
     26   %tmp17 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10
     27   %tmp18 = load float, float addrspace(3)* %tmp17, align 4
     28   %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 11
     29   %tmp20 = load float, float addrspace(3)* %tmp19, align 4
     30   %tmp21 = tail call float @llvm.fmuladd.f32(float %tmp16, float %tmp18, float %tmp20)
     31   %tmp22 = getelementptr inbounds float, float addrspace(3)* %arg, i32 13
     32   %tmp23 = load float, float addrspace(3)* %tmp22, align 4
     33   %tmp24 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14
     34   %tmp25 = load float, float addrspace(3)* %tmp24, align 4
     35   %tmp26 = getelementptr inbounds float, float addrspace(3)* %arg, i32 15
     36   %tmp27 = load float, float addrspace(3)* %tmp26, align 4
     37   %tmp28 = tail call float @llvm.fmuladd.f32(float %tmp23, float %tmp25, float %tmp27)
     38   %tmp29 = getelementptr inbounds float, float addrspace(3)* %arg, i32 17
     39   %tmp30 = load float, float addrspace(3)* %tmp29, align 4
     40   %tmp31 = getelementptr inbounds float, float addrspace(3)* %arg, i32 18
     41   %tmp32 = load float, float addrspace(3)* %tmp31, align 4
     42   %tmp33 = getelementptr inbounds float, float addrspace(3)* %arg, i32 19
     43   %tmp34 = load float, float addrspace(3)* %tmp33, align 4
     44   %tmp35 = tail call float @llvm.fmuladd.f32(float %tmp30, float %tmp32, float %tmp34)
     45   %tmp36 = getelementptr inbounds float, float addrspace(3)* %arg, i32 21
     46   %tmp37 = load float, float addrspace(3)* %tmp36, align 4
     47   %tmp38 = getelementptr inbounds float, float addrspace(3)* %arg, i32 22
     48   %tmp39 = load float, float addrspace(3)* %tmp38, align 4
     49   %tmp40 = getelementptr inbounds float, float addrspace(3)* %arg, i32 23
     50   %tmp41 = load float, float addrspace(3)* %tmp40, align 4
     51   %tmp42 = tail call float @llvm.fmuladd.f32(float %tmp37, float %tmp39, float %tmp41)
     52   %tmp43 = getelementptr inbounds float, float addrspace(3)* %arg, i32 25
     53   %tmp44 = load float, float addrspace(3)* %tmp43, align 4
     54   %tmp45 = getelementptr inbounds float, float addrspace(3)* %arg, i32 26
     55   %tmp46 = load float, float addrspace(3)* %tmp45, align 4
     56   %tmp47 = getelementptr inbounds float, float addrspace(3)* %arg, i32 27
     57   %tmp48 = load float, float addrspace(3)* %tmp47, align 4
     58   %tmp49 = tail call float @llvm.fmuladd.f32(float %tmp44, float %tmp46, float %tmp48)
     59   %tmp50 = getelementptr inbounds float, float addrspace(3)* %arg, i32 29
     60   %tmp51 = load float, float addrspace(3)* %tmp50, align 4
     61   %tmp52 = getelementptr inbounds float, float addrspace(3)* %arg, i32 30
     62   %tmp53 = load float, float addrspace(3)* %tmp52, align 4
     63   %tmp54 = getelementptr inbounds float, float addrspace(3)* %arg, i32 31
     64   %tmp55 = load float, float addrspace(3)* %tmp54, align 4
     65   %tmp56 = tail call float @llvm.fmuladd.f32(float %tmp51, float %tmp53, float %tmp55)
     66   %tmp57 = getelementptr inbounds float, float addrspace(3)* %arg, i32 33
     67   %tmp58 = load float, float addrspace(3)* %tmp57, align 4
     68   %tmp59 = getelementptr inbounds float, float addrspace(3)* %arg, i32 34
     69   %tmp60 = load float, float addrspace(3)* %tmp59, align 4
     70   %tmp61 = getelementptr inbounds float, float addrspace(3)* %arg, i32 35
     71   %tmp62 = load float, float addrspace(3)* %tmp61, align 4
     72   %tmp63 = tail call float @llvm.fmuladd.f32(float %tmp58, float %tmp60, float %tmp62)
     73   %tmp64 = getelementptr inbounds float, float addrspace(3)* %arg, i32 37
     74   %tmp65 = load float, float addrspace(3)* %tmp64, align 4
     75   %tmp66 = getelementptr inbounds float, float addrspace(3)* %arg, i32 38
     76   %tmp67 = load float, float addrspace(3)* %tmp66, align 4
     77   %tmp68 = getelementptr inbounds float, float addrspace(3)* %arg, i32 39
     78   %tmp69 = load float, float addrspace(3)* %tmp68, align 4
     79   %tmp70 = tail call float @llvm.fmuladd.f32(float %tmp65, float %tmp67, float %tmp69)
     80   %tmp71 = getelementptr inbounds float, float addrspace(3)* %arg, i32 41
     81   %tmp72 = load float, float addrspace(3)* %tmp71, align 4
     82   %tmp73 = getelementptr inbounds float, float addrspace(3)* %arg, i32 42
     83   %tmp74 = load float, float addrspace(3)* %tmp73, align 4
     84   %tmp75 = getelementptr inbounds float, float addrspace(3)* %arg, i32 43
     85   %tmp76 = load float, float addrspace(3)* %tmp75, align 4
     86   %tmp77 = tail call float @llvm.fmuladd.f32(float %tmp72, float %tmp74, float %tmp76)
     87   %tmp78 = getelementptr inbounds float, float addrspace(3)* %arg, i32 45
     88   %tmp79 = load float, float addrspace(3)* %tmp78, align 4
     89   %tmp80 = getelementptr inbounds float, float addrspace(3)* %arg, i32 46
     90   %tmp81 = load float, float addrspace(3)* %tmp80, align 4
     91   %tmp82 = getelementptr inbounds float, float addrspace(3)* %arg, i32 47
     92   %tmp83 = load float, float addrspace(3)* %tmp82, align 4
     93   %tmp84 = tail call float @llvm.fmuladd.f32(float %tmp79, float %tmp81, float %tmp83)
     94   %tmp85 = getelementptr inbounds float, float addrspace(3)* %arg, i32 49
     95   %tmp86 = load float, float addrspace(3)* %tmp85, align 4
     96   %tmp87 = getelementptr inbounds float, float addrspace(3)* %arg, i32 50
     97   %tmp88 = load float, float addrspace(3)* %tmp87, align 4
     98   %tmp89 = getelementptr inbounds float, float addrspace(3)* %arg, i32 51
     99   %tmp90 = load float, float addrspace(3)* %tmp89, align 4
    100   %tmp91 = tail call float @llvm.fmuladd.f32(float %tmp86, float %tmp88, float %tmp90)
    101   %tmp92 = getelementptr inbounds float, float addrspace(3)* %arg, i32 53
    102   %tmp93 = load float, float addrspace(3)* %tmp92, align 4
    103   %tmp94 = getelementptr inbounds float, float addrspace(3)* %arg, i32 54
    104   %tmp95 = load float, float addrspace(3)* %tmp94, align 4
    105   %tmp96 = getelementptr inbounds float, float addrspace(3)* %arg, i32 55
    106   %tmp97 = load float, float addrspace(3)* %tmp96, align 4
    107   %tmp98 = tail call float @llvm.fmuladd.f32(float %tmp93, float %tmp95, float %tmp97)
    108   %tmp99 = getelementptr inbounds float, float addrspace(3)* %arg, i32 57
    109   %tmp100 = load float, float addrspace(3)* %tmp99, align 4
    110   %tmp101 = getelementptr inbounds float, float addrspace(3)* %arg, i32 58
    111   %tmp102 = load float, float addrspace(3)* %tmp101, align 4
    112   %tmp103 = getelementptr inbounds float, float addrspace(3)* %arg, i32 59
    113   %tmp104 = load float, float addrspace(3)* %tmp103, align 4
    114   %tmp105 = tail call float @llvm.fmuladd.f32(float %tmp100, float %tmp102, float %tmp104)
    115   %tmp106 = getelementptr inbounds float, float addrspace(3)* %arg, i32 61
    116   %tmp107 = load float, float addrspace(3)* %tmp106, align 4
    117   %tmp108 = getelementptr inbounds float, float addrspace(3)* %arg, i32 62
    118   %tmp109 = load float, float addrspace(3)* %tmp108, align 4
    119   %tmp110 = getelementptr inbounds float, float addrspace(3)* %arg, i32 63
    120   %tmp111 = load float, float addrspace(3)* %tmp110, align 4
    121   %tmp112 = tail call float @llvm.fmuladd.f32(float %tmp107, float %tmp109, float %tmp111)
    122   %tmp113 = getelementptr inbounds float, float addrspace(3)* %arg, i32 65
    123   %tmp114 = load float, float addrspace(3)* %tmp113, align 4
    124   %tmp115 = getelementptr inbounds float, float addrspace(3)* %arg, i32 66
    125   %tmp116 = load float, float addrspace(3)* %tmp115, align 4
    126   %tmp117 = getelementptr inbounds float, float addrspace(3)* %arg, i32 67
    127   %tmp118 = load float, float addrspace(3)* %tmp117, align 4
    128   %tmp119 = tail call float @llvm.fmuladd.f32(float %tmp114, float %tmp116, float %tmp118)
    129   %tmp120 = getelementptr inbounds float, float addrspace(3)* %arg, i32 69
    130   %tmp121 = load float, float addrspace(3)* %tmp120, align 4
    131   %tmp122 = getelementptr inbounds float, float addrspace(3)* %arg, i32 70
    132   %tmp123 = load float, float addrspace(3)* %tmp122, align 4
    133   %tmp124 = getelementptr inbounds float, float addrspace(3)* %arg, i32 71
    134   %tmp125 = load float, float addrspace(3)* %tmp124, align 4
    135   %tmp126 = tail call float @llvm.fmuladd.f32(float %tmp121, float %tmp123, float %tmp125)
    136   %tmp127 = getelementptr inbounds float, float addrspace(3)* %arg, i32 73
    137   %tmp128 = load float, float addrspace(3)* %tmp127, align 4
    138   %tmp129 = getelementptr inbounds float, float addrspace(3)* %arg, i32 74
    139   %tmp130 = load float, float addrspace(3)* %tmp129, align 4
    140   %tmp131 = getelementptr inbounds float, float addrspace(3)* %arg, i32 75
    141   %tmp132 = load float, float addrspace(3)* %tmp131, align 4
    142   %tmp133 = tail call float @llvm.fmuladd.f32(float %tmp128, float %tmp130, float %tmp132)
    143   %tmp134 = getelementptr inbounds float, float addrspace(3)* %arg, i32 77
    144   %tmp135 = load float, float addrspace(3)* %tmp134, align 4
    145   %tmp136 = getelementptr inbounds float, float addrspace(3)* %arg, i32 78
    146   %tmp137 = load float, float addrspace(3)* %tmp136, align 4
    147   %tmp138 = getelementptr inbounds float, float addrspace(3)* %arg, i32 79
    148   %tmp139 = load float, float addrspace(3)* %tmp138, align 4
    149   %tmp140 = tail call float @llvm.fmuladd.f32(float %tmp135, float %tmp137, float %tmp139)
    150   %tmp141 = getelementptr inbounds float, float addrspace(3)* %arg, i32 81
    151   %tmp142 = load float, float addrspace(3)* %tmp141, align 4
    152   %tmp143 = getelementptr inbounds float, float addrspace(3)* %arg, i32 82
    153   %tmp144 = load float, float addrspace(3)* %tmp143, align 4
    154   %tmp145 = getelementptr inbounds float, float addrspace(3)* %arg, i32 83
    155   %tmp146 = load float, float addrspace(3)* %tmp145, align 4
    156   %tmp147 = tail call float @llvm.fmuladd.f32(float %tmp142, float %tmp144, float %tmp146)
    157   %tmp148 = getelementptr inbounds float, float addrspace(3)* %arg, i32 85
    158   %tmp149 = load float, float addrspace(3)* %tmp148, align 4
    159   %tmp150 = getelementptr inbounds float, float addrspace(3)* %arg, i32 86
    160   %tmp151 = load float, float addrspace(3)* %tmp150, align 4
    161   %tmp152 = getelementptr inbounds float, float addrspace(3)* %arg, i32 87
    162   %tmp153 = load float, float addrspace(3)* %tmp152, align 4
    163   %tmp154 = tail call float @llvm.fmuladd.f32(float %tmp149, float %tmp151, float %tmp153)
    164   %tmp155 = getelementptr inbounds float, float addrspace(3)* %arg, i32 89
    165   %tmp156 = load float, float addrspace(3)* %tmp155, align 4
    166   %tmp157 = getelementptr inbounds float, float addrspace(3)* %arg, i32 90
    167   %tmp158 = load float, float addrspace(3)* %tmp157, align 4
    168   %tmp159 = getelementptr inbounds float, float addrspace(3)* %arg, i32 91
    169   %tmp160 = load float, float addrspace(3)* %tmp159, align 4
    170   %tmp161 = tail call float @llvm.fmuladd.f32(float %tmp156, float %tmp158, float %tmp160)
    171   %tmp162 = getelementptr inbounds float, float addrspace(3)* %arg, i32 93
    172   %tmp163 = load float, float addrspace(3)* %tmp162, align 4
    173   %tmp164 = getelementptr inbounds float, float addrspace(3)* %arg, i32 94
    174   %tmp165 = load float, float addrspace(3)* %tmp164, align 4
    175   %tmp166 = getelementptr inbounds float, float addrspace(3)* %arg, i32 95
    176   %tmp167 = load float, float addrspace(3)* %tmp166, align 4
    177   %tmp168 = tail call float @llvm.fmuladd.f32(float %tmp163, float %tmp165, float %tmp167)
    178   %tmp169 = getelementptr inbounds float, float addrspace(3)* %arg, i32 97
    179   %tmp170 = load float, float addrspace(3)* %tmp169, align 4
    180   %tmp171 = getelementptr inbounds float, float addrspace(3)* %arg, i32 98
    181   %tmp172 = load float, float addrspace(3)* %tmp171, align 4
    182   %tmp173 = getelementptr inbounds float, float addrspace(3)* %arg, i32 99
    183   %tmp174 = load float, float addrspace(3)* %tmp173, align 4
    184   %tmp175 = tail call float @llvm.fmuladd.f32(float %tmp170, float %tmp172, float %tmp174)
    185   %tmp176 = getelementptr inbounds float, float addrspace(3)* %arg, i32 101
    186   %tmp177 = load float, float addrspace(3)* %tmp176, align 4
    187   %tmp178 = getelementptr inbounds float, float addrspace(3)* %arg, i32 102
    188   %tmp179 = load float, float addrspace(3)* %tmp178, align 4
    189   %tmp180 = getelementptr inbounds float, float addrspace(3)* %arg, i32 103
    190   %tmp181 = load float, float addrspace(3)* %tmp180, align 4
    191   %tmp182 = tail call float @llvm.fmuladd.f32(float %tmp177, float %tmp179, float %tmp181)
    192   %tmp183 = getelementptr inbounds float, float addrspace(3)* %arg, i32 105
    193   %tmp184 = load float, float addrspace(3)* %tmp183, align 4
    194   %tmp185 = getelementptr inbounds float, float addrspace(3)* %arg, i32 106
    195   %tmp186 = load float, float addrspace(3)* %tmp185, align 4
    196   %tmp187 = getelementptr inbounds float, float addrspace(3)* %arg, i32 107
    197   %tmp188 = load float, float addrspace(3)* %tmp187, align 4
    198   %tmp189 = tail call float @llvm.fmuladd.f32(float %tmp184, float %tmp186, float %tmp188)
    199   %tmp190 = getelementptr inbounds float, float addrspace(3)* %arg, i32 109
    200   %tmp191 = load float, float addrspace(3)* %tmp190, align 4
    201   %tmp192 = getelementptr inbounds float, float addrspace(3)* %arg, i32 110
    202   %tmp193 = load float, float addrspace(3)* %tmp192, align 4
    203   %tmp194 = getelementptr inbounds float, float addrspace(3)* %arg, i32 111
    204   %tmp195 = load float, float addrspace(3)* %tmp194, align 4
    205   %tmp196 = tail call float @llvm.fmuladd.f32(float %tmp191, float %tmp193, float %tmp195)
    206   %tmp197 = getelementptr inbounds float, float addrspace(3)* %arg, i32 113
    207   %tmp198 = load float, float addrspace(3)* %tmp197, align 4
    208   %tmp199 = getelementptr inbounds float, float addrspace(3)* %arg, i32 114
    209   %tmp200 = load float, float addrspace(3)* %tmp199, align 4
    210   %tmp201 = getelementptr inbounds float, float addrspace(3)* %arg, i32 115
    211   %tmp202 = load float, float addrspace(3)* %tmp201, align 4
    212   %tmp203 = tail call float @llvm.fmuladd.f32(float %tmp198, float %tmp200, float %tmp202)
    213   %tmp204 = getelementptr inbounds float, float addrspace(3)* %arg, i32 117
    214   %tmp205 = load float, float addrspace(3)* %tmp204, align 4
    215   %tmp206 = getelementptr inbounds float, float addrspace(3)* %arg, i32 118
    216   %tmp207 = load float, float addrspace(3)* %tmp206, align 4
    217   %tmp208 = getelementptr inbounds float, float addrspace(3)* %arg, i32 119
    218   %tmp209 = load float, float addrspace(3)* %tmp208, align 4
    219   %tmp210 = tail call float @llvm.fmuladd.f32(float %tmp205, float %tmp207, float %tmp209)
    220   %tmp211 = getelementptr inbounds float, float addrspace(3)* %arg, i32 121
    221   %tmp212 = load float, float addrspace(3)* %tmp211, align 4
    222   %tmp213 = getelementptr inbounds float, float addrspace(3)* %arg, i32 122
    223   %tmp214 = load float, float addrspace(3)* %tmp213, align 4
    224   %tmp215 = getelementptr inbounds float, float addrspace(3)* %arg, i32 123
    225   %tmp216 = load float, float addrspace(3)* %tmp215, align 4
    226   %tmp217 = tail call float @llvm.fmuladd.f32(float %tmp212, float %tmp214, float %tmp216)
    227   %tmp218 = getelementptr inbounds float, float addrspace(3)* %arg, i32 125
    228   %tmp219 = load float, float addrspace(3)* %tmp218, align 4
    229   %tmp220 = getelementptr inbounds float, float addrspace(3)* %arg, i32 126
    230   %tmp221 = load float, float addrspace(3)* %tmp220, align 4
    231   %tmp222 = getelementptr inbounds float, float addrspace(3)* %arg, i32 127
    232   %tmp223 = load float, float addrspace(3)* %tmp222, align 4
    233   %tmp224 = tail call float @llvm.fmuladd.f32(float %tmp219, float %tmp221, float %tmp223)
    234   %tmp225 = getelementptr inbounds float, float addrspace(3)* %arg, i32 129
    235   %tmp226 = load float, float addrspace(3)* %tmp225, align 4
    236   %tmp227 = getelementptr inbounds float, float addrspace(3)* %arg, i32 130
    237   %tmp228 = load float, float addrspace(3)* %tmp227, align 4
    238   %tmp229 = getelementptr inbounds float, float addrspace(3)* %arg, i32 131
    239   %tmp230 = load float, float addrspace(3)* %tmp229, align 4
    240   %tmp231 = tail call float @llvm.fmuladd.f32(float %tmp226, float %tmp228, float %tmp230)
    241   %tmp232 = getelementptr inbounds float, float addrspace(3)* %arg, i32 133
    242   %tmp233 = load float, float addrspace(3)* %tmp232, align 4
    243   %tmp234 = getelementptr inbounds float, float addrspace(3)* %arg, i32 134
    244   %tmp235 = load float, float addrspace(3)* %tmp234, align 4
    245   %tmp236 = getelementptr inbounds float, float addrspace(3)* %arg, i32 135
    246   %tmp237 = load float, float addrspace(3)* %tmp236, align 4
    247   %tmp238 = tail call float @llvm.fmuladd.f32(float %tmp233, float %tmp235, float %tmp237)
    248   %tmp239 = getelementptr inbounds float, float addrspace(3)* %arg, i32 137
    249   %tmp240 = load float, float addrspace(3)* %tmp239, align 4
    250   %tmp241 = getelementptr inbounds float, float addrspace(3)* %arg, i32 138
    251   %tmp242 = load float, float addrspace(3)* %tmp241, align 4
    252   %tmp243 = getelementptr inbounds float, float addrspace(3)* %arg, i32 139
    253   %tmp244 = load float, float addrspace(3)* %tmp243, align 4
    254   %tmp245 = tail call float @llvm.fmuladd.f32(float %tmp240, float %tmp242, float %tmp244)
    255   %tmp246 = getelementptr inbounds float, float addrspace(3)* %arg, i32 141
    256   %tmp247 = load float, float addrspace(3)* %tmp246, align 4
    257   %tmp248 = getelementptr inbounds float, float addrspace(3)* %arg, i32 142
    258   %tmp249 = load float, float addrspace(3)* %tmp248, align 4
    259   %tmp250 = getelementptr inbounds float, float addrspace(3)* %arg, i32 143
    260   %tmp251 = load float, float addrspace(3)* %tmp250, align 4
    261   %tmp252 = tail call float @llvm.fmuladd.f32(float %tmp247, float %tmp249, float %tmp251)
    262   %tmp253 = getelementptr inbounds float, float addrspace(3)* %arg, i32 145
    263   %tmp254 = load float, float addrspace(3)* %tmp253, align 4
    264   %tmp255 = getelementptr inbounds float, float addrspace(3)* %arg, i32 146
    265   %tmp256 = load float, float addrspace(3)* %tmp255, align 4
    266   %tmp257 = getelementptr inbounds float, float addrspace(3)* %arg, i32 147
    267   %tmp258 = load float, float addrspace(3)* %tmp257, align 4
    268   %tmp259 = tail call float @llvm.fmuladd.f32(float %tmp254, float %tmp256, float %tmp258)
    269   %tmp260 = getelementptr inbounds float, float addrspace(3)* %arg, i32 149
    270   %tmp261 = load float, float addrspace(3)* %tmp260, align 4
    271   %tmp262 = getelementptr inbounds float, float addrspace(3)* %arg, i32 150
    272   %tmp263 = load float, float addrspace(3)* %tmp262, align 4
    273   %tmp264 = getelementptr inbounds float, float addrspace(3)* %arg, i32 151
    274   %tmp265 = load float, float addrspace(3)* %tmp264, align 4
    275   %tmp266 = tail call float @llvm.fmuladd.f32(float %tmp261, float %tmp263, float %tmp265)
    276   %tmp267 = getelementptr inbounds float, float addrspace(3)* %arg, i32 153
    277   %tmp268 = load float, float addrspace(3)* %tmp267, align 4
    278   %tmp269 = getelementptr inbounds float, float addrspace(3)* %arg, i32 154
    279   %tmp270 = load float, float addrspace(3)* %tmp269, align 4
    280   %tmp271 = getelementptr inbounds float, float addrspace(3)* %arg, i32 155
    281   %tmp272 = load float, float addrspace(3)* %tmp271, align 4
    282   %tmp273 = tail call float @llvm.fmuladd.f32(float %tmp268, float %tmp270, float %tmp272)
    283   %tmp274 = getelementptr inbounds float, float addrspace(3)* %arg, i32 157
    284   %tmp275 = load float, float addrspace(3)* %tmp274, align 4
    285   %tmp276 = getelementptr inbounds float, float addrspace(3)* %arg, i32 158
    286   %tmp277 = load float, float addrspace(3)* %tmp276, align 4
    287   %tmp278 = getelementptr inbounds float, float addrspace(3)* %arg, i32 159
    288   %tmp279 = load float, float addrspace(3)* %tmp278, align 4
    289   %tmp280 = tail call float @llvm.fmuladd.f32(float %tmp275, float %tmp277, float %tmp279)
    290   %tmp281 = getelementptr inbounds float, float addrspace(3)* %arg, i32 161
    291   %tmp282 = load float, float addrspace(3)* %tmp281, align 4
    292   %tmp283 = getelementptr inbounds float, float addrspace(3)* %arg, i32 162
    293   %tmp284 = load float, float addrspace(3)* %tmp283, align 4
    294   %tmp285 = getelementptr inbounds float, float addrspace(3)* %arg, i32 163
    295   %tmp286 = load float, float addrspace(3)* %tmp285, align 4
    296   %tmp287 = tail call float @llvm.fmuladd.f32(float %tmp282, float %tmp284, float %tmp286)
    297   %tmp288 = getelementptr inbounds float, float addrspace(3)* %arg, i32 165
    298   %tmp289 = load float, float addrspace(3)* %tmp288, align 4
    299   %tmp290 = getelementptr inbounds float, float addrspace(3)* %arg, i32 166
    300   %tmp291 = load float, float addrspace(3)* %tmp290, align 4
    301   %tmp292 = getelementptr inbounds float, float addrspace(3)* %arg, i32 167
    302   %tmp293 = load float, float addrspace(3)* %tmp292, align 4
    303   %tmp294 = tail call float @llvm.fmuladd.f32(float %tmp289, float %tmp291, float %tmp293)
    304   %tmp295 = getelementptr inbounds float, float addrspace(3)* %arg, i32 169
    305   %tmp296 = load float, float addrspace(3)* %tmp295, align 4
    306   %tmp297 = getelementptr inbounds float, float addrspace(3)* %arg, i32 170
    307   %tmp298 = load float, float addrspace(3)* %tmp297, align 4
    308   %tmp299 = getelementptr inbounds float, float addrspace(3)* %arg, i32 171
    309   %tmp300 = load float, float addrspace(3)* %tmp299, align 4
    310   %tmp301 = tail call float @llvm.fmuladd.f32(float %tmp296, float %tmp298, float %tmp300)
    311   %tmp302 = getelementptr inbounds float, float addrspace(3)* %arg, i32 173
    312   %tmp303 = load float, float addrspace(3)* %tmp302, align 4
    313   %tmp304 = getelementptr inbounds float, float addrspace(3)* %arg, i32 174
    314   %tmp305 = load float, float addrspace(3)* %tmp304, align 4
    315   %tmp306 = getelementptr inbounds float, float addrspace(3)* %arg, i32 175
    316   %tmp307 = load float, float addrspace(3)* %tmp306, align 4
    317   %tmp308 = tail call float @llvm.fmuladd.f32(float %tmp303, float %tmp305, float %tmp307)
    318   %tmp309 = getelementptr inbounds float, float addrspace(3)* %arg, i32 177
    319   %tmp310 = load float, float addrspace(3)* %tmp309, align 4
    320   %tmp311 = getelementptr inbounds float, float addrspace(3)* %arg, i32 178
    321   %tmp312 = load float, float addrspace(3)* %tmp311, align 4
    322   %tmp313 = getelementptr inbounds float, float addrspace(3)* %arg, i32 179
    323   %tmp314 = load float, float addrspace(3)* %tmp313, align 4
    324   %tmp315 = tail call float @llvm.fmuladd.f32(float %tmp310, float %tmp312, float %tmp314)
    325   %tmp316 = getelementptr inbounds float, float addrspace(3)* %arg, i32 181
    326   %tmp317 = load float, float addrspace(3)* %tmp316, align 4
    327   %tmp318 = getelementptr inbounds float, float addrspace(3)* %arg, i32 182
    328   %tmp319 = load float, float addrspace(3)* %tmp318, align 4
    329   %tmp320 = getelementptr inbounds float, float addrspace(3)* %arg, i32 183
    330   %tmp321 = load float, float addrspace(3)* %tmp320, align 4
    331   %tmp322 = tail call float @llvm.fmuladd.f32(float %tmp317, float %tmp319, float %tmp321)
    332   %tmp323 = getelementptr inbounds float, float addrspace(3)* %arg, i32 185
    333   %tmp324 = load float, float addrspace(3)* %tmp323, align 4
    334   %tmp325 = getelementptr inbounds float, float addrspace(3)* %arg, i32 186
    335   %tmp326 = load float, float addrspace(3)* %tmp325, align 4
    336   %tmp327 = getelementptr inbounds float, float addrspace(3)* %arg, i32 187
    337   %tmp328 = load float, float addrspace(3)* %tmp327, align 4
    338   %tmp329 = tail call float @llvm.fmuladd.f32(float %tmp324, float %tmp326, float %tmp328)
    339   %tmp330 = getelementptr inbounds float, float addrspace(3)* %arg, i32 189
    340   %tmp331 = load float, float addrspace(3)* %tmp330, align 4
    341   %tmp332 = getelementptr inbounds float, float addrspace(3)* %arg, i32 190
    342   %tmp333 = load float, float addrspace(3)* %tmp332, align 4
    343   %tmp334 = getelementptr inbounds float, float addrspace(3)* %arg, i32 191
    344   %tmp335 = load float, float addrspace(3)* %tmp334, align 4
    345   %tmp336 = tail call float @llvm.fmuladd.f32(float %tmp331, float %tmp333, float %tmp335)
    346   %tmp337 = getelementptr inbounds float, float addrspace(3)* %arg, i32 193
    347   %tmp338 = load float, float addrspace(3)* %tmp337, align 4
    348   %tmp339 = getelementptr inbounds float, float addrspace(3)* %arg, i32 194
    349   %tmp340 = load float, float addrspace(3)* %tmp339, align 4
    350   %tmp341 = getelementptr inbounds float, float addrspace(3)* %arg, i32 195
    351   %tmp342 = load float, float addrspace(3)* %tmp341, align 4
    352   %tmp343 = tail call float @llvm.fmuladd.f32(float %tmp338, float %tmp340, float %tmp342)
    353   %tmp344 = getelementptr inbounds float, float addrspace(3)* %arg, i32 197
    354   %tmp345 = load float, float addrspace(3)* %tmp344, align 4
    355   %tmp346 = getelementptr inbounds float, float addrspace(3)* %arg, i32 198
    356   %tmp347 = load float, float addrspace(3)* %tmp346, align 4
    357   %tmp348 = getelementptr inbounds float, float addrspace(3)* %arg, i32 199
    358   %tmp349 = load float, float addrspace(3)* %tmp348, align 4
    359   %tmp350 = tail call float @llvm.fmuladd.f32(float %tmp345, float %tmp347, float %tmp349)
    360   %tmp351 = getelementptr inbounds float, float addrspace(3)* %arg, i32 201
    361   %tmp352 = load float, float addrspace(3)* %tmp351, align 4
    362   %tmp353 = getelementptr inbounds float, float addrspace(3)* %arg, i32 202
    363   %tmp354 = load float, float addrspace(3)* %tmp353, align 4
    364   %tmp355 = getelementptr inbounds float, float addrspace(3)* %arg, i32 203
    365   %tmp356 = load float, float addrspace(3)* %tmp355, align 4
    366   %tmp357 = tail call float @llvm.fmuladd.f32(float %tmp352, float %tmp354, float %tmp356)
    367   %tmp358 = getelementptr inbounds float, float addrspace(3)* %arg, i32 205
    368   %tmp359 = load float, float addrspace(3)* %tmp358, align 4
    369   %tmp360 = getelementptr inbounds float, float addrspace(3)* %arg, i32 206
    370   %tmp361 = load float, float addrspace(3)* %tmp360, align 4
    371   %tmp362 = getelementptr inbounds float, float addrspace(3)* %arg, i32 207
    372   %tmp363 = load float, float addrspace(3)* %tmp362, align 4
    373   %tmp364 = tail call float @llvm.fmuladd.f32(float %tmp359, float %tmp361, float %tmp363)
    374   %tmp365 = getelementptr inbounds float, float addrspace(3)* %arg, i32 209
    375   %tmp366 = load float, float addrspace(3)* %tmp365, align 4
    376   %tmp367 = getelementptr inbounds float, float addrspace(3)* %arg, i32 210
    377   %tmp368 = load float, float addrspace(3)* %tmp367, align 4
    378   %tmp369 = getelementptr inbounds float, float addrspace(3)* %arg, i32 211
    379   %tmp370 = load float, float addrspace(3)* %tmp369, align 4
    380   %tmp371 = tail call float @llvm.fmuladd.f32(float %tmp366, float %tmp368, float %tmp370)
    381   %tmp372 = getelementptr inbounds float, float addrspace(3)* %arg, i32 213
    382   %tmp373 = load float, float addrspace(3)* %tmp372, align 4
    383   %tmp374 = getelementptr inbounds float, float addrspace(3)* %arg, i32 214
    384   %tmp375 = load float, float addrspace(3)* %tmp374, align 4
    385   %tmp376 = getelementptr inbounds float, float addrspace(3)* %arg, i32 215
    386   %tmp377 = load float, float addrspace(3)* %tmp376, align 4
    387   %tmp378 = tail call float @llvm.fmuladd.f32(float %tmp373, float %tmp375, float %tmp377)
    388   %tmp379 = getelementptr inbounds float, float addrspace(3)* %arg, i32 217
    389   %tmp380 = load float, float addrspace(3)* %tmp379, align 4
    390   %tmp381 = getelementptr inbounds float, float addrspace(3)* %arg, i32 218
    391   %tmp382 = load float, float addrspace(3)* %tmp381, align 4
    392   %tmp383 = getelementptr inbounds float, float addrspace(3)* %arg, i32 219
    393   %tmp384 = load float, float addrspace(3)* %tmp383, align 4
    394   %tmp385 = tail call float @llvm.fmuladd.f32(float %tmp380, float %tmp382, float %tmp384)
    395   %tmp386 = getelementptr inbounds float, float addrspace(3)* %arg, i32 221
    396   %tmp387 = load float, float addrspace(3)* %tmp386, align 4
    397   %tmp388 = getelementptr inbounds float, float addrspace(3)* %arg, i32 222
    398   %tmp389 = load float, float addrspace(3)* %tmp388, align 4
    399   %tmp390 = getelementptr inbounds float, float addrspace(3)* %arg, i32 223
    400   %tmp391 = load float, float addrspace(3)* %tmp390, align 4
    401   %tmp392 = tail call float @llvm.fmuladd.f32(float %tmp387, float %tmp389, float %tmp391)
    402   %tmp393 = getelementptr inbounds float, float addrspace(3)* %arg, i32 225
    403   %tmp394 = load float, float addrspace(3)* %tmp393, align 4
    404   %tmp395 = getelementptr inbounds float, float addrspace(3)* %arg, i32 226
    405   %tmp396 = load float, float addrspace(3)* %tmp395, align 4
    406   %tmp397 = getelementptr inbounds float, float addrspace(3)* %arg, i32 227
    407   %tmp398 = load float, float addrspace(3)* %tmp397, align 4
    408   %tmp399 = tail call float @llvm.fmuladd.f32(float %tmp394, float %tmp396, float %tmp398)
    409   %tmp400 = getelementptr inbounds float, float addrspace(3)* %arg, i32 229
    410   %tmp401 = load float, float addrspace(3)* %tmp400, align 4
    411   %tmp402 = getelementptr inbounds float, float addrspace(3)* %arg, i32 230
    412   %tmp403 = load float, float addrspace(3)* %tmp402, align 4
    413   %tmp404 = getelementptr inbounds float, float addrspace(3)* %arg, i32 231
    414   %tmp405 = load float, float addrspace(3)* %tmp404, align 4
    415   %tmp406 = tail call float @llvm.fmuladd.f32(float %tmp401, float %tmp403, float %tmp405)
    416   %tmp407 = getelementptr inbounds float, float addrspace(3)* %arg, i32 233
    417   %tmp408 = load float, float addrspace(3)* %tmp407, align 4
    418   %tmp409 = getelementptr inbounds float, float addrspace(3)* %arg, i32 234
    419   %tmp410 = load float, float addrspace(3)* %tmp409, align 4
    420   %tmp411 = getelementptr inbounds float, float addrspace(3)* %arg, i32 235
    421   %tmp412 = load float, float addrspace(3)* %tmp411, align 4
    422   %tmp413 = tail call float @llvm.fmuladd.f32(float %tmp408, float %tmp410, float %tmp412)
    423   %tmp414 = getelementptr inbounds float, float addrspace(3)* %arg, i32 237
    424   %tmp415 = load float, float addrspace(3)* %tmp414, align 4
    425   %tmp416 = getelementptr inbounds float, float addrspace(3)* %arg, i32 238
    426   %tmp417 = load float, float addrspace(3)* %tmp416, align 4
    427   %tmp418 = getelementptr inbounds float, float addrspace(3)* %arg, i32 239
    428   %tmp419 = load float, float addrspace(3)* %tmp418, align 4
    429   %tmp420 = tail call float @llvm.fmuladd.f32(float %tmp415, float %tmp417, float %tmp419)
    430   %tmp421 = getelementptr inbounds float, float addrspace(3)* %arg, i32 241
    431   %tmp422 = load float, float addrspace(3)* %tmp421, align 4
    432   %tmp423 = getelementptr inbounds float, float addrspace(3)* %arg, i32 242
    433   %tmp424 = load float, float addrspace(3)* %tmp423, align 4
    434   %tmp425 = getelementptr inbounds float, float addrspace(3)* %arg, i32 243
    435   %tmp426 = load float, float addrspace(3)* %tmp425, align 4
    436   %tmp427 = tail call float @llvm.fmuladd.f32(float %tmp422, float %tmp424, float %tmp426)
    437   %tmp428 = getelementptr inbounds float, float addrspace(3)* %arg, i32 245
    438   %tmp429 = load float, float addrspace(3)* %tmp428, align 4
    439   %tmp430 = getelementptr inbounds float, float addrspace(3)* %arg, i32 246
    440   %tmp431 = load float, float addrspace(3)* %tmp430, align 4
    441   %tmp432 = getelementptr inbounds float, float addrspace(3)* %arg, i32 247
    442   %tmp433 = load float, float addrspace(3)* %tmp432, align 4
    443   %tmp434 = tail call float @llvm.fmuladd.f32(float %tmp429, float %tmp431, float %tmp433)
    444   %tmp435 = getelementptr inbounds float, float addrspace(3)* %arg, i32 249
    445   %tmp436 = load float, float addrspace(3)* %tmp435, align 4
    446   %tmp437 = getelementptr inbounds float, float addrspace(3)* %arg, i32 250
    447   %tmp438 = load float, float addrspace(3)* %tmp437, align 4
    448   %tmp439 = getelementptr inbounds float, float addrspace(3)* %arg, i32 251
    449   %tmp440 = load float, float addrspace(3)* %tmp439, align 4
    450   %tmp441 = tail call float @llvm.fmuladd.f32(float %tmp436, float %tmp438, float %tmp440)
    451   %tmp442 = getelementptr inbounds float, float addrspace(3)* %arg, i32 253
    452   %tmp443 = load float, float addrspace(3)* %tmp442, align 4
    453   %tmp444 = getelementptr inbounds float, float addrspace(3)* %arg, i32 254
    454   %tmp445 = load float, float addrspace(3)* %tmp444, align 4
    455   %tmp446 = getelementptr inbounds float, float addrspace(3)* %arg, i32 255
    456   %tmp447 = load float, float addrspace(3)* %tmp446, align 4
    457   %tmp448 = tail call float @llvm.fmuladd.f32(float %tmp443, float %tmp445, float %tmp447)
    458   store float %tmp7, float addrspace(1)* %arg1, align 4
    459   %tmp449 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 1
    460   store float %tmp14, float addrspace(1)* %tmp449, align 4
    461   %tmp450 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 2
    462   store float %tmp21, float addrspace(1)* %tmp450, align 4
    463   %tmp451 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 3
    464   store float %tmp28, float addrspace(1)* %tmp451, align 4
    465   %tmp452 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 4
    466   store float %tmp35, float addrspace(1)* %tmp452, align 4
    467   %tmp453 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 5
    468   store float %tmp42, float addrspace(1)* %tmp453, align 4
    469   %tmp454 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 6
    470   store float %tmp49, float addrspace(1)* %tmp454, align 4
    471   %tmp455 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 7
    472   store float %tmp56, float addrspace(1)* %tmp455, align 4
    473   %tmp456 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 8
    474   store float %tmp63, float addrspace(1)* %tmp456, align 4
    475   %tmp457 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 9
    476   store float %tmp70, float addrspace(1)* %tmp457, align 4
    477   %tmp458 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 10
    478   store float %tmp77, float addrspace(1)* %tmp458, align 4
    479   %tmp459 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 11
    480   store float %tmp84, float addrspace(1)* %tmp459, align 4
    481   %tmp460 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 12
    482   store float %tmp91, float addrspace(1)* %tmp460, align 4
    483   %tmp461 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 13
    484   store float %tmp98, float addrspace(1)* %tmp461, align 4
    485   %tmp462 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 14
    486   store float %tmp105, float addrspace(1)* %tmp462, align 4
    487   %tmp463 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 15
    488   store float %tmp112, float addrspace(1)* %tmp463, align 4
    489   %tmp464 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 16
    490   store float %tmp119, float addrspace(1)* %tmp464, align 4
    491   %tmp465 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 17
    492   store float %tmp126, float addrspace(1)* %tmp465, align 4
    493   %tmp466 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 18
    494   store float %tmp133, float addrspace(1)* %tmp466, align 4
    495   %tmp467 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 19
    496   store float %tmp140, float addrspace(1)* %tmp467, align 4
    497   %tmp468 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 20
    498   store float %tmp147, float addrspace(1)* %tmp468, align 4
    499   %tmp469 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 21
    500   store float %tmp154, float addrspace(1)* %tmp469, align 4
    501   %tmp470 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 22
    502   store float %tmp161, float addrspace(1)* %tmp470, align 4
    503   %tmp471 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 23
    504   store float %tmp168, float addrspace(1)* %tmp471, align 4
    505   %tmp472 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 24
    506   store float %tmp175, float addrspace(1)* %tmp472, align 4
    507   %tmp473 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 25
    508   store float %tmp182, float addrspace(1)* %tmp473, align 4
    509   %tmp474 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 26
    510   store float %tmp189, float addrspace(1)* %tmp474, align 4
    511   %tmp475 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 27
    512   store float %tmp196, float addrspace(1)* %tmp475, align 4
    513   %tmp476 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 28
    514   store float %tmp203, float addrspace(1)* %tmp476, align 4
    515   %tmp477 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 29
    516   store float %tmp210, float addrspace(1)* %tmp477, align 4
    517   %tmp478 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 30
    518   store float %tmp217, float addrspace(1)* %tmp478, align 4
    519   %tmp479 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 31
    520   store float %tmp224, float addrspace(1)* %tmp479, align 4
    521   %tmp480 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 32
    522   store float %tmp231, float addrspace(1)* %tmp480, align 4
    523   %tmp481 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 33
    524   store float %tmp238, float addrspace(1)* %tmp481, align 4
    525   %tmp482 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 34
    526   store float %tmp245, float addrspace(1)* %tmp482, align 4
    527   %tmp483 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 35
    528   store float %tmp252, float addrspace(1)* %tmp483, align 4
    529   %tmp484 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 36
    530   store float %tmp259, float addrspace(1)* %tmp484, align 4
    531   %tmp485 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 37
    532   store float %tmp266, float addrspace(1)* %tmp485, align 4
    533   %tmp486 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 38
    534   store float %tmp273, float addrspace(1)* %tmp486, align 4
    535   %tmp487 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 39
    536   store float %tmp280, float addrspace(1)* %tmp487, align 4
    537   %tmp488 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 40
    538   store float %tmp287, float addrspace(1)* %tmp488, align 4
    539   %tmp489 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 41
    540   store float %tmp294, float addrspace(1)* %tmp489, align 4
    541   %tmp490 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 42
    542   store float %tmp301, float addrspace(1)* %tmp490, align 4
    543   %tmp491 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 43
    544   store float %tmp308, float addrspace(1)* %tmp491, align 4
    545   %tmp492 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 44
    546   store float %tmp315, float addrspace(1)* %tmp492, align 4
    547   %tmp493 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 45
    548   store float %tmp322, float addrspace(1)* %tmp493, align 4
    549   %tmp494 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 46
    550   store float %tmp329, float addrspace(1)* %tmp494, align 4
    551   %tmp495 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 47
    552   store float %tmp336, float addrspace(1)* %tmp495, align 4
    553   %tmp496 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 48
    554   store float %tmp343, float addrspace(1)* %tmp496, align 4
    555   %tmp497 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 49
    556   store float %tmp350, float addrspace(1)* %tmp497, align 4
    557   %tmp498 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 50
    558   store float %tmp357, float addrspace(1)* %tmp498, align 4
    559   %tmp499 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 51
    560   store float %tmp364, float addrspace(1)* %tmp499, align 4
    561   %tmp500 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 52
    562   store float %tmp371, float addrspace(1)* %tmp500, align 4
    563   %tmp501 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 53
    564   store float %tmp378, float addrspace(1)* %tmp501, align 4
    565   %tmp502 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 54
    566   store float %tmp385, float addrspace(1)* %tmp502, align 4
    567   %tmp503 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 55
    568   store float %tmp392, float addrspace(1)* %tmp503, align 4
    569   %tmp504 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 56
    570   store float %tmp399, float addrspace(1)* %tmp504, align 4
    571   %tmp505 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 57
    572   store float %tmp406, float addrspace(1)* %tmp505, align 4
    573   %tmp506 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 58
    574   store float %tmp413, float addrspace(1)* %tmp506, align 4
    575   %tmp507 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 59
    576   store float %tmp420, float addrspace(1)* %tmp507, align 4
    577   %tmp508 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 60
    578   store float %tmp427, float addrspace(1)* %tmp508, align 4
    579   %tmp509 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 61
    580   store float %tmp434, float addrspace(1)* %tmp509, align 4
    581   %tmp510 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 62
    582   store float %tmp441, float addrspace(1)* %tmp510, align 4
    583   %tmp511 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 63
    584   store float %tmp448, float addrspace(1)* %tmp511, align 4
    585   ret void
    586 }
    587 
    588 ; Function Attrs: nounwind readnone
        ; Declaration of the llvm.fmuladd.f32 intrinsic: takes three floats and
        ; returns one float. @load_fma_store calls it once per group of three
        ; loaded values and stores each result to the output pointer.
    589 declare float @llvm.fmuladd.f32(float, float, float) #0
    590 
        ; Attribute group #0, referenced by the intrinsic declaration above via
        ; its trailing "#0".
    591 attributes #0 = { nounwind readnone }
    592