1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/core/grappler/costs/virtual_placer.h" 17 #include "tensorflow/core/framework/node_def.pb.h" 18 #include "tensorflow/core/grappler/clusters/virtual_cluster.h" 19 #include "tensorflow/core/lib/strings/strcat.h" 20 #include "tensorflow/core/platform/test.h" 21 #include "tensorflow/core/protobuf/device_properties.pb.h" 22 23 namespace tensorflow { 24 namespace grappler { 25 26 TEST(VirtualPlacerTest, LocalDevices) { 27 // Create a virtual cluster with a local CPU and a local GPU 28 std::unordered_map<string, DeviceProperties> devices; 29 DeviceProperties cpu_device; 30 cpu_device.set_type("CPU"); 31 devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device; 32 DeviceProperties gpu_device; 33 gpu_device.set_type("GPU"); 34 devices["/job:localhost/replica:0/task:0/device:GPU:0"] = gpu_device; 35 VirtualCluster cluster(devices); 36 VirtualPlacer placer(&cluster); 37 38 NodeDef node; 39 node.set_op("Conv2D"); 40 // node.device() is empty, but GPU is default device if there is. 41 EXPECT_EQ("GPU", placer.get_device(node).type()); 42 EXPECT_EQ("/job:localhost/replica:0/task:0/device:GPU:0", 43 placer.get_canonical_device_name(node)); 44 45 node.set_device("CPU"); 46 EXPECT_EQ("CPU", placer.get_device(node).type()); 47 EXPECT_EQ("/job:localhost/replica:0/task:0/cpu:0", 48 placer.get_canonical_device_name(node)); 49 50 node.set_device("GPU:0"); 51 EXPECT_EQ("GPU", placer.get_device(node).type()); 52 EXPECT_EQ("/job:localhost/replica:0/task:0/device:GPU:0", 53 placer.get_canonical_device_name(node)); 54 } 55 56 TEST(VirtualPlacerTest, ShortNames) { 57 // Create a virtual cluster with a local CPU and a local GPU 58 std::unordered_map<string, DeviceProperties> devices; 59 DeviceProperties cpu_device; 60 cpu_device.set_type("CPU"); 61 devices["/CPU:0"] = cpu_device; 62 DeviceProperties gpu_device; 63 gpu_device.set_type("GPU"); 64 devices["/GPU:0"] = gpu_device; 65 VirtualCluster cluster(devices); 66 VirtualPlacer placer(&cluster); 67 68 NodeDef node; 69 node.set_op("Conv2D"); 70 // node.device() is empty, but GPU is default device if there is. 71 EXPECT_EQ("GPU", placer.get_device(node).type()); 72 EXPECT_EQ("/GPU:0", placer.get_canonical_device_name(node)); 73 74 node.set_device("CPU"); 75 EXPECT_EQ("CPU", placer.get_device(node).type()); 76 EXPECT_EQ("/CPU:0", placer.get_canonical_device_name(node)); 77 78 node.set_device("GPU:0"); 79 EXPECT_EQ("GPU", placer.get_device(node).type()); 80 EXPECT_EQ("/GPU:0", placer.get_canonical_device_name(node)); 81 } 82 83 TEST(VirtualPlacerTest, PlacementOnNonDefaultDevice) { 84 // Create a virtual cluster with a CPU and a device:TPU 85 // Test that placement on TPU works 86 // In contrast with GPU, TPU is not selected as default device at the moment. 87 88 std::unordered_map<string, DeviceProperties> devices; 89 DeviceProperties cpu_device; 90 cpu_device.set_type("CPU"); 91 devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device; 92 DeviceProperties tpu_device; 93 tpu_device.set_type("TPU"); 94 devices["/job:localhost/replica:0/task:0/device:TPU:0"] = tpu_device; 95 VirtualCluster cluster(devices); 96 VirtualPlacer placer(&cluster); 97 98 NodeDef node; 99 node.set_op("Conv2D"); 100 // node.device() is empty, and CPU is default device. 101 EXPECT_EQ("CPU", placer.get_device(node).type()); 102 EXPECT_EQ("/job:localhost/replica:0/task:0/cpu:0", 103 placer.get_canonical_device_name(node)); 104 105 node.set_device("/device:TPU:0"); 106 EXPECT_EQ("TPU", placer.get_device(node).type()); 107 EXPECT_EQ("/job:localhost/replica:0/task:0/device:TPU:0", 108 placer.get_canonical_device_name(node)); 109 } 110 111 TEST(VirtualPlacerTest, EmptyJobName) { 112 // Virtual placer choose job name from the devices in cluster if a device name 113 // of an op is empty. In case there are more than one kind of job name 114 // or job names are missin in the devices in cluster, we use local_host. 115 for (const string& job_name : {"localhost", "worker", "worker_train"}) { 116 std::unordered_map<string, DeviceProperties> devices; 117 DeviceProperties cpu_device; 118 cpu_device.set_type("CPU"); 119 devices[strings::StrCat("/job:", job_name, "/replica:0/task:0/cpu:0")] = 120 cpu_device; 121 DeviceProperties gpu_device; 122 gpu_device.set_type("GPU"); 123 devices[strings::StrCat("/job:", job_name, 124 "/replica:0/task:0/device:GPU:0")] = gpu_device; 125 VirtualCluster cluster(devices); 126 VirtualPlacer placer(&cluster); 127 128 NodeDef node; 129 node.set_op("Conv2D"); 130 node.set_device("/device:CPU:0"); 131 EXPECT_EQ(strings::StrCat("/job:", job_name, "/replica:0/task:0/cpu:0"), 132 placer.get_canonical_device_name(node)); 133 node.set_device("/device:GPU:0"); 134 EXPECT_EQ( 135 strings::StrCat("/job:", job_name, "/replica:0/task:0/device:GPU:0"), 136 placer.get_canonical_device_name(node)); 137 } 138 139 // When more than one job names are used, we use default "localhost" 140 // This may be improved later. 141 std::unordered_map<string, DeviceProperties> devices; 142 DeviceProperties cpu_device; 143 cpu_device.set_type("CPU"); 144 devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device; 145 devices["/job:ps/replica:0/task:0/cpu:0"] = cpu_device; 146 devices["/job:worker/replica:0/task:0/cpu:0"] = cpu_device; 147 VirtualCluster cluster(devices); 148 VirtualPlacer placer(&cluster); 149 150 NodeDef node; 151 node.set_op("Conv2D"); 152 node.set_device("/device:CPU:0"); 153 EXPECT_EQ("/job:localhost/replica:0/task:0/cpu:0", 154 placer.get_canonical_device_name(node)); 155 } 156 157 string GetDefaultDeviceName( 158 const std::unordered_map<string, DeviceProperties>& devices) { 159 VirtualCluster cluster(devices); 160 VirtualPlacer placer(&cluster); 161 NodeDef node; 162 node.set_op("Conv2D"); 163 // Device is not set to the node, so get_canonical_device_name() will return 164 // the default_device_. 165 return placer.get_canonical_device_name(node); 166 } 167 168 TEST(VirtualPlacerTest, DefaultDevice) { 169 std::unordered_map<string, DeviceProperties> devices; 170 DeviceProperties cpu_device; 171 cpu_device.set_type("CPU"); 172 devices["/job:worker/replica:0/task:0/cpu:0"] = cpu_device; 173 174 // CPU is default when there is only CPU. 175 EXPECT_EQ("/job:worker/replica:0/task:0/cpu:0", 176 GetDefaultDeviceName(devices)); 177 178 DeviceProperties gpu_device; 179 gpu_device.set_type("GPU"); 180 181 // If there is any GPU, then gpu:0 is default device. 182 for (int i = 0; i < 8; i++) { 183 devices[strings::StrCat("/job:worker/replica:0/task:0/gpu:", i)] = 184 gpu_device; 185 EXPECT_EQ("/job:worker/replica:0/task:0/gpu:0", 186 GetDefaultDeviceName(devices)); 187 } 188 } 189 190 TEST(VirtualPlacerTest, MultiReplica) { 191 // Create a cluster with 8 workers, each with 8 GPUs. 192 std::unordered_map<string, DeviceProperties> devices; 193 DeviceProperties cpu_device; 194 cpu_device.set_type("CPU"); 195 DeviceProperties gpu_device; 196 gpu_device.set_type("GPU"); 197 for (int i = 0; i < 8; i++) { 198 devices[strings::StrCat("/job:worker/replica:", i, "/task:0/cpu:0")] = 199 cpu_device; 200 for (int j = 0; j < 8; j++) { 201 devices[strings::StrCat("/job:worker/replica:", i, "/task:0/gpu:", j)] = 202 gpu_device; 203 } 204 } 205 206 std::unique_ptr<VirtualCluster> cluster(new VirtualCluster(devices)); 207 std::unique_ptr<VirtualPlacer> placer(new VirtualPlacer(cluster.get())); 208 209 auto get_device_name = [&placer](const string& device) -> string { 210 NodeDef node; 211 node.set_op("Conv2D"); 212 node.set_device(device); 213 return placer->get_canonical_device_name(node); 214 }; 215 216 // Validate device name is correct when we pass only replica ID and device 217 // name. 218 EXPECT_EQ("/job:worker/replica:0/task:0/cpu:0", 219 get_device_name("/replica:0/cpu:0")); 220 EXPECT_EQ("/job:worker/replica:2/task:0/cpu:0", 221 get_device_name("/replica:2/cpu:0")); 222 EXPECT_EQ("/job:worker/replica:7/task:0/cpu:0", 223 get_device_name("/replica:7/cpu:0")); 224 EXPECT_EQ("/job:worker/replica:3/task:0/gpu:0", 225 get_device_name("/replica:3/gpu:0")); 226 EXPECT_EQ("/job:worker/replica:5/task:0/gpu:3", 227 get_device_name("/replica:5/gpu:3")); 228 EXPECT_EQ("/job:worker/replica:4/task:0/gpu:7", 229 get_device_name("/replica:4/gpu:7")); 230 231 // Now add PS replicas; with multiple job names present in the cluster, 232 // device names in nodes should specify job names correctly. 233 for (int i = 0; i < 4; i++) { 234 devices[strings::StrCat("/job:ps/replica:", i, "/task:0/cpu:0")] = 235 cpu_device; 236 } 237 cluster.reset(new VirtualCluster(devices)); 238 placer.reset(new VirtualPlacer(cluster.get())); 239 EXPECT_EQ("/job:worker/replica:0/task:0/cpu:0", 240 get_device_name("/job:worker/replica:0/cpu:0")); 241 EXPECT_EQ("/job:worker/replica:7/task:0/gpu:3", 242 get_device_name("/job:worker/replica:7/gpu:3")); 243 EXPECT_EQ("/job:ps/replica:0/task:0/cpu:0", 244 get_device_name("/job:ps/replica:0/cpu:0")); 245 EXPECT_EQ("/job:ps/replica:1/task:0/cpu:0", 246 get_device_name("/job:ps/replica:1/cpu:0")); 247 EXPECT_EQ("/job:ps/replica:2/task:0/cpu:0", 248 get_device_name("/job:ps/replica:2/cpu:0")); 249 EXPECT_EQ("/job:ps/replica:3/task:0/cpu:0", 250 get_device_name("/job:ps/replica:3/cpu:0")); 251 } 252 253 TEST(VirtualPlacerTest, FallBackUnknown) { 254 // Virtual placer falls back to "UNKNOWN" only if there are no devices in the 255 // cluster. 256 std::unordered_map<string, DeviceProperties> devices; 257 VirtualCluster cluster(devices); 258 VirtualPlacer placer(&cluster); 259 260 NodeDef node; 261 node.set_op("Conv2D"); 262 263 // Device falls back to UNKNOWN since the cluster has no devices. 264 EXPECT_EQ("UNKNOWN", placer.get_device(node).type()); 265 EXPECT_EQ("UNKNOWN", placer.get_canonical_device_name(node)); 266 } 267 268 TEST(VirtualPlacerTest, FallBackCPU) { 269 std::unordered_map<string, DeviceProperties> devices; 270 DeviceProperties cpu_device; 271 cpu_device.set_type("CPU"); 272 devices["/job:my_job/replica:0/task:0/cpu:0"] = cpu_device; 273 VirtualCluster cluster(devices); 274 VirtualPlacer placer(&cluster); 275 276 NodeDef node; 277 node.set_op("Conv2D"); 278 279 // Device falls back to CPU since there is no GPU. 280 EXPECT_EQ("CPU", placer.get_device(node).type()); 281 EXPECT_EQ("/job:my_job/replica:0/task:0/cpu:0", 282 placer.get_canonical_device_name(node)); 283 } 284 285 TEST(VirtualPlacerTest, RemoteDevices) { 286 std::unordered_map<string, DeviceProperties> devices; 287 DeviceProperties cpu_device; 288 cpu_device.set_type("CPU"); 289 devices["/job:my_job/replica:0/task:0/cpu:0"] = cpu_device; 290 DeviceProperties gpu_device; 291 gpu_device.set_type("GPU"); 292 devices["/job:my_job/replica:0/task:0/device:GPU:0"] = gpu_device; 293 VirtualCluster cluster(devices); 294 VirtualPlacer placer(&cluster); 295 296 NodeDef node; 297 node.set_op("Conv2D"); 298 299 // Device falls back to GPU. 300 EXPECT_EQ("GPU", placer.get_device(node).type()); 301 EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0", 302 placer.get_canonical_device_name(node)); 303 304 node.set_device("/job:my_job/replica:0/task:0/cpu:0"); 305 EXPECT_EQ("CPU", placer.get_device(node).type()); 306 EXPECT_EQ("/job:my_job/replica:0/task:0/cpu:0", 307 placer.get_canonical_device_name(node)); 308 309 node.set_device("/job:my_job/replica:0/task:0/device:GPU:0"); 310 EXPECT_EQ("GPU", placer.get_device(node).type()); 311 EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0", 312 placer.get_canonical_device_name(node)); 313 314 // There is no local cpu available. Device falls back to GPU. 315 node.set_device("CPU"); 316 EXPECT_EQ("GPU", placer.get_device(node).type()); 317 EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0", 318 placer.get_canonical_device_name(node)); 319 320 node.set_device("GPU:0"); 321 // There is no local GPU available. Fall back to default GPU. 322 EXPECT_EQ("GPU", placer.get_device(node).type()); 323 EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0", 324 placer.get_canonical_device_name(node)); 325 326 // This isn't a valid name. Fall back to GPU. 327 node.set_device("/job:my_job/replica:0/task:0"); 328 EXPECT_EQ("GPU", placer.get_device(node).type()); 329 EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0", 330 placer.get_canonical_device_name(node)); 331 } 332 333 } // end namespace grappler 334 } // end namespace tensorflow 335