Home | History | Annotate | Download | only in costs
      1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/core/grappler/costs/virtual_placer.h"
     17 #include "tensorflow/core/framework/node_def.pb.h"
     18 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
     19 #include "tensorflow/core/lib/strings/strcat.h"
     20 #include "tensorflow/core/platform/test.h"
     21 #include "tensorflow/core/protobuf/device_properties.pb.h"
     22 
     23 namespace tensorflow {
     24 namespace grappler {
     25 
     26 TEST(VirtualPlacerTest, LocalDevices) {
     27   // Create a virtual cluster with a local CPU and a local GPU
     28   std::unordered_map<string, DeviceProperties> devices;
     29   DeviceProperties cpu_device;
     30   cpu_device.set_type("CPU");
     31   devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
     32   DeviceProperties gpu_device;
     33   gpu_device.set_type("GPU");
     34   devices["/job:localhost/replica:0/task:0/device:GPU:0"] = gpu_device;
     35   VirtualCluster cluster(devices);
     36   VirtualPlacer placer(&cluster);
     37 
     38   NodeDef node;
     39   node.set_op("Conv2D");
     40   // node.device() is empty, but GPU is default device if there is.
     41   EXPECT_EQ("GPU", placer.get_device(node).type());
     42   EXPECT_EQ("/job:localhost/replica:0/task:0/device:GPU:0",
     43             placer.get_canonical_device_name(node));
     44 
     45   node.set_device("CPU");
     46   EXPECT_EQ("CPU", placer.get_device(node).type());
     47   EXPECT_EQ("/job:localhost/replica:0/task:0/cpu:0",
     48             placer.get_canonical_device_name(node));
     49 
     50   node.set_device("GPU:0");
     51   EXPECT_EQ("GPU", placer.get_device(node).type());
     52   EXPECT_EQ("/job:localhost/replica:0/task:0/device:GPU:0",
     53             placer.get_canonical_device_name(node));
     54 }
     55 
     56 TEST(VirtualPlacerTest, ShortNames) {
     57   // Create a virtual cluster with a local CPU and a local GPU
     58   std::unordered_map<string, DeviceProperties> devices;
     59   DeviceProperties cpu_device;
     60   cpu_device.set_type("CPU");
     61   devices["/CPU:0"] = cpu_device;
     62   DeviceProperties gpu_device;
     63   gpu_device.set_type("GPU");
     64   devices["/GPU:0"] = gpu_device;
     65   VirtualCluster cluster(devices);
     66   VirtualPlacer placer(&cluster);
     67 
     68   NodeDef node;
     69   node.set_op("Conv2D");
     70   // node.device() is empty, but GPU is default device if there is.
     71   EXPECT_EQ("GPU", placer.get_device(node).type());
     72   EXPECT_EQ("/GPU:0", placer.get_canonical_device_name(node));
     73 
     74   node.set_device("CPU");
     75   EXPECT_EQ("CPU", placer.get_device(node).type());
     76   EXPECT_EQ("/CPU:0", placer.get_canonical_device_name(node));
     77 
     78   node.set_device("GPU:0");
     79   EXPECT_EQ("GPU", placer.get_device(node).type());
     80   EXPECT_EQ("/GPU:0", placer.get_canonical_device_name(node));
     81 }
     82 
     83 TEST(VirtualPlacerTest, PlacementOnNonDefaultDevice) {
     84   // Create a virtual cluster with a CPU and a device:TPU
     85   // Test that placement on TPU works
     86   // In contrast with GPU, TPU is not selected as default device at the moment.
     87 
     88   std::unordered_map<string, DeviceProperties> devices;
     89   DeviceProperties cpu_device;
     90   cpu_device.set_type("CPU");
     91   devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
     92   DeviceProperties tpu_device;
     93   tpu_device.set_type("TPU");
     94   devices["/job:localhost/replica:0/task:0/device:TPU:0"] = tpu_device;
     95   VirtualCluster cluster(devices);
     96   VirtualPlacer placer(&cluster);
     97 
     98   NodeDef node;
     99   node.set_op("Conv2D");
    100   // node.device() is empty, and CPU is default device.
    101   EXPECT_EQ("CPU", placer.get_device(node).type());
    102   EXPECT_EQ("/job:localhost/replica:0/task:0/cpu:0",
    103             placer.get_canonical_device_name(node));
    104 
    105   node.set_device("/device:TPU:0");
    106   EXPECT_EQ("TPU", placer.get_device(node).type());
    107   EXPECT_EQ("/job:localhost/replica:0/task:0/device:TPU:0",
    108             placer.get_canonical_device_name(node));
    109 }
    110 
    111 TEST(VirtualPlacerTest, EmptyJobName) {
    112   // Virtual placer choose job name from the devices in cluster if a device name
    113   // of an op is empty. In case there are more than one kind of job name
    114   // or job names are missin in the devices in cluster, we use local_host.
    115   for (const string& job_name : {"localhost", "worker", "worker_train"}) {
    116     std::unordered_map<string, DeviceProperties> devices;
    117     DeviceProperties cpu_device;
    118     cpu_device.set_type("CPU");
    119     devices[strings::StrCat("/job:", job_name, "/replica:0/task:0/cpu:0")] =
    120         cpu_device;
    121     DeviceProperties gpu_device;
    122     gpu_device.set_type("GPU");
    123     devices[strings::StrCat("/job:", job_name,
    124                             "/replica:0/task:0/device:GPU:0")] = gpu_device;
    125     VirtualCluster cluster(devices);
    126     VirtualPlacer placer(&cluster);
    127 
    128     NodeDef node;
    129     node.set_op("Conv2D");
    130     node.set_device("/device:CPU:0");
    131     EXPECT_EQ(strings::StrCat("/job:", job_name, "/replica:0/task:0/cpu:0"),
    132               placer.get_canonical_device_name(node));
    133     node.set_device("/device:GPU:0");
    134     EXPECT_EQ(
    135         strings::StrCat("/job:", job_name, "/replica:0/task:0/device:GPU:0"),
    136         placer.get_canonical_device_name(node));
    137   }
    138 
    139   // When more than one job names are used, we use default "localhost"
    140   // This may be improved later.
    141   std::unordered_map<string, DeviceProperties> devices;
    142   DeviceProperties cpu_device;
    143   cpu_device.set_type("CPU");
    144   devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
    145   devices["/job:ps/replica:0/task:0/cpu:0"] = cpu_device;
    146   devices["/job:worker/replica:0/task:0/cpu:0"] = cpu_device;
    147   VirtualCluster cluster(devices);
    148   VirtualPlacer placer(&cluster);
    149 
    150   NodeDef node;
    151   node.set_op("Conv2D");
    152   node.set_device("/device:CPU:0");
    153   EXPECT_EQ("/job:localhost/replica:0/task:0/cpu:0",
    154             placer.get_canonical_device_name(node));
    155 }
    156 
    157 string GetDefaultDeviceName(
    158     const std::unordered_map<string, DeviceProperties>& devices) {
    159   VirtualCluster cluster(devices);
    160   VirtualPlacer placer(&cluster);
    161   NodeDef node;
    162   node.set_op("Conv2D");
    163   // Device is not set to the node, so get_canonical_device_name() will return
    164   // the default_device_.
    165   return placer.get_canonical_device_name(node);
    166 }
    167 
    168 TEST(VirtualPlacerTest, DefaultDevice) {
    169   std::unordered_map<string, DeviceProperties> devices;
    170   DeviceProperties cpu_device;
    171   cpu_device.set_type("CPU");
    172   devices["/job:worker/replica:0/task:0/cpu:0"] = cpu_device;
    173 
    174   // CPU is default when there is only CPU.
    175   EXPECT_EQ("/job:worker/replica:0/task:0/cpu:0",
    176             GetDefaultDeviceName(devices));
    177 
    178   DeviceProperties gpu_device;
    179   gpu_device.set_type("GPU");
    180 
    181   // If there is any GPU, then gpu:0 is default device.
    182   for (int i = 0; i < 8; i++) {
    183     devices[strings::StrCat("/job:worker/replica:0/task:0/gpu:", i)] =
    184         gpu_device;
    185     EXPECT_EQ("/job:worker/replica:0/task:0/gpu:0",
    186               GetDefaultDeviceName(devices));
    187   }
    188 }
    189 
    190 TEST(VirtualPlacerTest, MultiReplica) {
    191   // Create a cluster with 8 workers, each with 8 GPUs.
    192   std::unordered_map<string, DeviceProperties> devices;
    193   DeviceProperties cpu_device;
    194   cpu_device.set_type("CPU");
    195   DeviceProperties gpu_device;
    196   gpu_device.set_type("GPU");
    197   for (int i = 0; i < 8; i++) {
    198     devices[strings::StrCat("/job:worker/replica:", i, "/task:0/cpu:0")] =
    199         cpu_device;
    200     for (int j = 0; j < 8; j++) {
    201       devices[strings::StrCat("/job:worker/replica:", i, "/task:0/gpu:", j)] =
    202           gpu_device;
    203     }
    204   }
    205 
    206   std::unique_ptr<VirtualCluster> cluster(new VirtualCluster(devices));
    207   std::unique_ptr<VirtualPlacer> placer(new VirtualPlacer(cluster.get()));
    208 
    209   auto get_device_name = [&placer](const string& device) -> string {
    210     NodeDef node;
    211     node.set_op("Conv2D");
    212     node.set_device(device);
    213     return placer->get_canonical_device_name(node);
    214   };
    215 
    216   // Validate device name is correct when we pass only replica ID and device
    217   // name.
    218   EXPECT_EQ("/job:worker/replica:0/task:0/cpu:0",
    219             get_device_name("/replica:0/cpu:0"));
    220   EXPECT_EQ("/job:worker/replica:2/task:0/cpu:0",
    221             get_device_name("/replica:2/cpu:0"));
    222   EXPECT_EQ("/job:worker/replica:7/task:0/cpu:0",
    223             get_device_name("/replica:7/cpu:0"));
    224   EXPECT_EQ("/job:worker/replica:3/task:0/gpu:0",
    225             get_device_name("/replica:3/gpu:0"));
    226   EXPECT_EQ("/job:worker/replica:5/task:0/gpu:3",
    227             get_device_name("/replica:5/gpu:3"));
    228   EXPECT_EQ("/job:worker/replica:4/task:0/gpu:7",
    229             get_device_name("/replica:4/gpu:7"));
    230 
    231   // Now add PS replicas; with multiple job names present in the cluster,
    232   // device names in nodes should specify job names correctly.
    233   for (int i = 0; i < 4; i++) {
    234     devices[strings::StrCat("/job:ps/replica:", i, "/task:0/cpu:0")] =
    235         cpu_device;
    236   }
    237   cluster.reset(new VirtualCluster(devices));
    238   placer.reset(new VirtualPlacer(cluster.get()));
    239   EXPECT_EQ("/job:worker/replica:0/task:0/cpu:0",
    240             get_device_name("/job:worker/replica:0/cpu:0"));
    241   EXPECT_EQ("/job:worker/replica:7/task:0/gpu:3",
    242             get_device_name("/job:worker/replica:7/gpu:3"));
    243   EXPECT_EQ("/job:ps/replica:0/task:0/cpu:0",
    244             get_device_name("/job:ps/replica:0/cpu:0"));
    245   EXPECT_EQ("/job:ps/replica:1/task:0/cpu:0",
    246             get_device_name("/job:ps/replica:1/cpu:0"));
    247   EXPECT_EQ("/job:ps/replica:2/task:0/cpu:0",
    248             get_device_name("/job:ps/replica:2/cpu:0"));
    249   EXPECT_EQ("/job:ps/replica:3/task:0/cpu:0",
    250             get_device_name("/job:ps/replica:3/cpu:0"));
    251 }
    252 
    253 TEST(VirtualPlacerTest, FallBackUnknown) {
    254   // Virtual placer falls back to "UNKNOWN" only if there are no devices in the
    255   // cluster.
    256   std::unordered_map<string, DeviceProperties> devices;
    257   VirtualCluster cluster(devices);
    258   VirtualPlacer placer(&cluster);
    259 
    260   NodeDef node;
    261   node.set_op("Conv2D");
    262 
    263   // Device falls back to UNKNOWN since the cluster has no devices.
    264   EXPECT_EQ("UNKNOWN", placer.get_device(node).type());
    265   EXPECT_EQ("UNKNOWN", placer.get_canonical_device_name(node));
    266 }
    267 
    268 TEST(VirtualPlacerTest, FallBackCPU) {
    269   std::unordered_map<string, DeviceProperties> devices;
    270   DeviceProperties cpu_device;
    271   cpu_device.set_type("CPU");
    272   devices["/job:my_job/replica:0/task:0/cpu:0"] = cpu_device;
    273   VirtualCluster cluster(devices);
    274   VirtualPlacer placer(&cluster);
    275 
    276   NodeDef node;
    277   node.set_op("Conv2D");
    278 
    279   // Device falls back to CPU since there is no GPU.
    280   EXPECT_EQ("CPU", placer.get_device(node).type());
    281   EXPECT_EQ("/job:my_job/replica:0/task:0/cpu:0",
    282             placer.get_canonical_device_name(node));
    283 }
    284 
    285 TEST(VirtualPlacerTest, RemoteDevices) {
    286   std::unordered_map<string, DeviceProperties> devices;
    287   DeviceProperties cpu_device;
    288   cpu_device.set_type("CPU");
    289   devices["/job:my_job/replica:0/task:0/cpu:0"] = cpu_device;
    290   DeviceProperties gpu_device;
    291   gpu_device.set_type("GPU");
    292   devices["/job:my_job/replica:0/task:0/device:GPU:0"] = gpu_device;
    293   VirtualCluster cluster(devices);
    294   VirtualPlacer placer(&cluster);
    295 
    296   NodeDef node;
    297   node.set_op("Conv2D");
    298 
    299   // Device falls back to GPU.
    300   EXPECT_EQ("GPU", placer.get_device(node).type());
    301   EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
    302             placer.get_canonical_device_name(node));
    303 
    304   node.set_device("/job:my_job/replica:0/task:0/cpu:0");
    305   EXPECT_EQ("CPU", placer.get_device(node).type());
    306   EXPECT_EQ("/job:my_job/replica:0/task:0/cpu:0",
    307             placer.get_canonical_device_name(node));
    308 
    309   node.set_device("/job:my_job/replica:0/task:0/device:GPU:0");
    310   EXPECT_EQ("GPU", placer.get_device(node).type());
    311   EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
    312             placer.get_canonical_device_name(node));
    313 
    314   // There is no local cpu available. Device falls back to GPU.
    315   node.set_device("CPU");
    316   EXPECT_EQ("GPU", placer.get_device(node).type());
    317   EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
    318             placer.get_canonical_device_name(node));
    319 
    320   node.set_device("GPU:0");
    321   // There is no local GPU available. Fall back to default GPU.
    322   EXPECT_EQ("GPU", placer.get_device(node).type());
    323   EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
    324             placer.get_canonical_device_name(node));
    325 
    326   // This isn't a valid name. Fall back to GPU.
    327   node.set_device("/job:my_job/replica:0/task:0");
    328   EXPECT_EQ("GPU", placer.get_device(node).type());
    329   EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
    330             placer.get_canonical_device_name(node));
    331 }
    332 
    333 }  // end namespace grappler
    334 }  // end namespace tensorflow
    335