Home | History | Annotate | Download | only in stream_executor

Lines Matching defs:Stream

16 // The Stream is used in conjunction with the StreamExecutor "parent" to
17 // perform actions with a linear stream of dependencies. Dependencies can also
83 // Represents a stream of dependent computations on a GPU device.
85 // The operations within a stream execute linearly and asynchronously until
87 // the execution of the stream.
89 // If any given operation fails when entraining work for the stream, ok() will
90 // indicate that an error has occurred. After initialization, once a stream is
94 class Stream {
96 // Instantiate a stream tied to parent as a platform executor. Work
97 // entrained onto this stream will be launched/managed on that
99 explicit Stream(StreamExecutor *parent);
102 // platform-specific stream implementation.
103 Stream(StreamExecutor *parent, internal::StreamInterface *implementation);
105 // Deallocates any stream resources that the parent StreamExecutor has
108 ~Stream();
111 // stream.
114 // Initialize the stream. This must be performed before entraining any other
116 Stream &Init() LOCKS_EXCLUDED(mu_);
119 Stream &InitTimer(Timer *t);
122 Stream &InitWithTimer(Timer *t);
124 // Get or create a sub-stream from this stream. If there is any sub-stream in
125 // the pool that can be reused then just return this sub-stream. Otherwise
126 // create a new sub-stream.
127 Stream *GetOrCreateSubStream() LOCKS_EXCLUDED(mu_);
129 // Return the sub-stream back to the host stream so that it can be reused
131 void ReturnSubStream(Stream *sub_stream) LOCKS_EXCLUDED(mu_);
133 // Allocate temporary memories. The stream will deallocate them when blocked
139 // Entrains onto the stream of operations: a kernel launch with the given
158 Stream &ThenLaunch(ThreadDim thread_dims, BlockDim block_dims,
162 // stream's
164 // the stream's execution). Streams may be started/stopped multiple times.
165 Stream &ThenStartTimer(Timer *t);
168 // stream's
169 // execution. See also Stream::ThenStartTimer.
170 Stream &ThenStopTimer(Timer *t);
172 // TODO(leary) If work is added to the stream that is being depended upon,
175 Stream &ThenWaitFor(Stream *other, Params... more_streams) {
179 // Create a dependency for this stream's next work on the other stream
183 // Checks that a stream does not wait for itself, and it is up to the
184 // user to guarantee that a stream does not come to wait on itself in a
189 Stream &ThenWaitFor(Stream *other);
195 Stream &ThenWaitFor(P others) {
196 for (auto &stream : *others) {
197 CHECK_NE(stream.get(), this);
198 ThenWaitFor(stream.get());
207 Stream &ThenWaitFor(Event *event);
209 // Inserts the specified event into the end of this stream. Once the stream
212 // The stream does not take ownership of event - meaning that event's lifetime
214 Stream &ThenRecordEvent(Event *event);
221 Stream &ThenBatchNormalizationForward(
234 Stream &ThenBatchNormalizationBackward(
242 Stream &ThenBatchNormalizationForward(
255 Stream &ThenBatchNormalizationBackward(
266 Stream &ThenFusedConvolve(
278 Stream &ThenConvolve(const dnn::BatchDescriptor &input_descriptor,
286 Stream &ThenConvolveQuantized(
296 Stream &ThenConvolveQuantized(
306 Stream &ThenFusedConvolveWithScratch(
318 Stream &ThenFusedConvolveWithScratch(
331 Stream &ThenFusedConvolveWithScratch(
343 Stream &ThenConvolveWithScratch(
352 Stream &ThenConvolveWithScratch(
361 Stream &ThenConvolveWithAlgorithm(
372 Stream &ThenConvolveWithAlgorithm(
383 Stream &ThenFusedConvolveWithAlgorithm(
397 Stream &ThenFusedConvolveWithAlgorithm(
411 Stream &ThenFusedConvolveWithAlgorithm(
426 Stream &ThenFusedConvolveWithAlgorithm(
440 Stream &ThenSeparableConvolve(
450 Stream &ThenConvolveBackwardData(
459 Stream &ThenConvolveBackwardDataWithScratch(
469 Stream &ThenConvolveBackwardDataWithScratch(
479 Stream &ThenConvolveBackwardDataWithAlgorithm(
491 Stream &ThenConvolveBackwardDataWithAlgorithm(
503 Stream &ThenConvolveBackwardFilter(
512 Stream &ThenConvolveBackwardFilterWithScratch(
522 Stream &ThenConvolveBackwardFilterWithScratch(
532 Stream &ThenConvolveBackwardFilterWithAlgorithm(
544 Stream &ThenConvolveBackwardFilterWithAlgorithm(
556 Stream &ThenConvolveBackwardBias(const dnn::BatchDescriptor &input_descriptor,
561 Stream &ThenConvolveBackwardBias(const dnn::BatchDescriptor &input_descriptor,
566 Stream &ThenConvolveBackwardBias(
572 Stream &ThenMatMul(const DeviceMemory<float> &input_data,
578 Stream &ThenMatMulQuantized(const DeviceMemory<float> &input_data,
585 Stream &ThenMatMulQuantized(const DeviceMemory<float> &input_data,
592 Stream &ThenBiasAdd(const DeviceMemory<float> &input_data,
597 Stream &ThenPoolForward(const dnn::PoolingDescriptor &pooling_dimensions,
603 Stream &ThenPoolForward(const dnn::PoolingDescriptor &pooling_dimensions,
609 Stream &ThenPoolForward(const dnn::PoolingDescriptor &pooling_dimensions,
615 Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
623 Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
631 Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
639 Stream &ThenNormalize(const dnn::NormalizeDescriptor &normalize_descriptor,
645 Stream &ThenNormalizeWithDimensions(
650 Stream &ThenNormalizeBackwardWithDimensions(
658 Stream &ThenActivate(dnn::ActivationMode activation_mode,
665 Stream &ThenActivateWithOptions(dnn::ActivationMode activation_mode,
671 Stream &ThenDepthConcatenate(
676 Stream &ThenSpaceConcatenate(
686 Stream &ThenReshape(const dnn::BatchDescriptor &input_dimensions,
697 Stream &ThenDepthToSpace(const dnn::BatchDescriptor &input_dimensions,
708 Stream &ThenSpaceToDepth(const dnn::BatchDescriptor &input_dimensions,
714 Stream &ThenElementwiseOperate(
721 Stream &ThenElementwiseOperateScaledQuantized(
729 Stream &ThenXYPad(const dnn::BatchDescriptor &dimensions,
734 Stream &ThenXYSlice(const dnn::BatchDescriptor &dimensions,
742 Stream &ThenXYBroadcast(const dnn::BatchDescriptor &dimensions,
748 Stream &ThenMemcpyD2HQuantized(const DeviceMemory<float> &gpu_unquantized_src,
756 Stream &ThenMemcpyD2HQuantized(
765 Stream &ThenMemcpyH2DQuantized(const void *host_src, uint64 size,
773 Stream &ThenMemcpyH2DQuantized(port::ArraySlice<ElementType> host_src,
781 Stream &ThenCopyHostBuffer2Device(HostBuffer *buffer_src,
785 Stream &ThenCopyDevice2HostBuffer(
792 Stream &ThenBlasAsum(uint64 elem_count, const DeviceMemory<float> &x,
794 Stream &ThenBlasAsum(uint64 elem_count, const DeviceMemory<double> &x,
796 Stream &ThenBlasAsum(uint64 elem_count,
799 Stream &ThenBlasAsum(uint64 elem_count,
806 // that the stream does not change or populate during the course of
807 // execution). The value is effectively captured at stream-enqueue time.
808 Stream &ThenBlasAxpy(uint64 elem_count, float alpha,
811 Stream &ThenBlasAxpy(uint64 elem_count, double alpha,
814 Stream &ThenBlasAxpy(uint64 elem_count, std::complex<float> alpha,
817 Stream &ThenBlasAxpy(uint64 elem_count, std::complex<double> alpha,
822 Stream &ThenBlasCopy(uint64 elem_count, const DeviceMemory<float> &x,
824 Stream &ThenBlasCopy(uint64 elem_count, const DeviceMemory<double> &x,
826 Stream &ThenBlasCopy(uint64 elem_count,
829 Stream &ThenBlasCopy(uint64 elem_count,
834 Stream &ThenBlasDot(uint64 elem_count, const DeviceMemory<float> &x, int incx,
837 Stream &ThenBlasDot(uint64 elem_count, const DeviceMemory<double> &x,
842 Stream &ThenBlasDotc(uint64 elem_count,
846 Stream &ThenBlasDotc(uint64 elem_count,
852 Stream &ThenBlasDotu(uint64 elem_count,
856 Stream &ThenBlasDotu(uint64 elem_count,
862 Stream &ThenBlasNrm2(uint64 elem_count, const DeviceMemory<float> &x,
864 Stream &ThenBlasNrm2(uint64 elem_count, const DeviceMemory<double> &x,
866 Stream &ThenBlasNrm2(uint64 elem_count,
869 Stream &ThenBlasNrm2(uint64 elem_count,
874 Stream &ThenBlasRot(uint64 elem_count, DeviceMemory<float> *x, int incx,
876 Stream &ThenBlasRot(uint64 elem_count, DeviceMemory<double> *x, int incx,
878 Stream &ThenBlasRot(uint64 elem_count, DeviceMemory<std::complex<float>> *x,
881 Stream &ThenBlasRot(uint64 elem_count, DeviceMemory<std::complex<double>> *x,
886 Stream &ThenBlasRotg(DeviceMemory<float> *a, DeviceMemory<float> *b,
888 Stream &ThenBlasRotg(DeviceMemory<double> *a, DeviceMemory<double> *b,
890 Stream &ThenBlasRotg(DeviceMemory<std::complex<float>> *a,
894 Stream &ThenBlasRotg(DeviceMemory<std::complex<double>> *a,
900 Stream &ThenBlasRotm(uint64 elem_count, DeviceMemory<float> *x, int incx,
903 Stream &ThenBlasRotm(uint64 elem_count, DeviceMemory<double> *x, int incx,
908 Stream &ThenBlasRotmg(DeviceMemory<float> *d1, DeviceMemory<float> *d2,
911 Stream &ThenBlasRotmg(DeviceMemory<double> *d1, DeviceMemory<double> *d2,
917 Stream &ThenBlasScal(uint64 elem_count, float alpha, DeviceMemory<float> *x,
919 Stream &ThenBlasScal(uint64 elem_count, double alpha, DeviceMemory<double> *x,
921 Stream &ThenBlasScal(uint64 elem_count, float alpha,
923 Stream &ThenBlasScal(uint64 elem_count, double alpha,
925 Stream &ThenBlasScal(uint64 elem_count, std::complex<float> alpha,
927 Stream &ThenBlasScal(uint64 elem_count, std::complex<double> alpha,
931 Stream &ThenBlasSwap(uint64 elem_count, DeviceMemory<float> *x, int incx,
933 Stream &ThenBlasSwap(uint64 elem_count, DeviceMemory<double> *x, int incx,
935 Stream &ThenBlasSwap(uint64 elem_count, DeviceMemory<std::complex<float>> *x,
938 Stream &ThenBlasSwap(uint64 elem_count, DeviceMemory<std::complex<double>> *x,
943 Stream &ThenBlasIamax(uint64 elem_count, const DeviceMemory<float> &x,
945 Stream &ThenBlasIamax(uint64 elem_count, const DeviceMemory<double> &x,
947 Stream &ThenBlasIamax(uint64 elem_count,
950 Stream &ThenBlasIamax(uint64 elem_count,
955 Stream &ThenBlasIamin(uint64 elem_count, const DeviceMemory<float> &x,
957 Stream &ThenBlasIamin(uint64 elem_count, const DeviceMemory<double> &x,
959 Stream &ThenBlasIamin(uint64 elem_count,
962 Stream &ThenBlasIamin(uint64 elem_count,
967 Stream &ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, uint64 kl,
971 Stream &ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, uint64 kl,
975 Stream &ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, uint64 kl,
981 Stream &ThenBlasGbmv(blas::Transpose trans, uint64 m, uint64 n, uint64 kl,
989 Stream &ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, float alpha,
993 Stream &ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n, double alpha,
997 Stream &ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n,
1003 Stream &ThenBlasGemv(blas::Transpose trans, uint64 m, uint64 n,
1010 Stream &ThenBlasGemvWithProfiling(blas::Transpose trans, uint64 m, uint64 n,
1016 Stream &ThenBlasGemvWithProfiling(blas::Transpose trans, uint64 m, uint64 n,
1022 Stream &ThenBlasGemvWithProfiling(
1028 Stream &ThenBlasGemvWithProfiling(
1036 Stream &ThenBlasGer(uint64 m, uint64 n, float alpha,
1040 Stream &ThenBlasGer(uint64 m, uint64 n, double alpha,
1046 Stream &ThenBlasGerc(uint64 m, uint64 n, std::complex<float> alpha,
1050 Stream &ThenBlasGerc(uint64 m, uint64 n, std::complex<double> alpha,
1056 Stream &ThenBlasGeru(uint64 m, uint64 n, std::complex<float> alpha,
1060 Stream &ThenBlasGeru(uint64 m, uint64 n, std::complex<double> alpha,
1066 Stream &ThenBlasHbmv(blas::UpperLower uplo, uint64 n, uint64 k,
1072 Stream &ThenBlasHbmv(blas::UpperLower uplo, uint64 n, uint64 k,
1080 Stream &ThenBlasHemv(blas::UpperLower uplo, uint64 n,
1086 Stream &ThenBlasHemv(blas::UpperLower uplo, uint64 n,
1094 Stream &ThenBlasHer(blas::UpperLower uplo, uint64 n, float alpha,
1097 Stream &ThenBlasHer(blas::UpperLower uplo, uint64 n, double alpha,
1102 Stream &ThenBlasHer2(blas::UpperLower uplo, uint64 n,
1107 Stream &ThenBlasHer2(blas::UpperLower uplo, uint64 n,
1114 Stream &ThenBlasHpmv(blas::UpperLower uplo, uint64 n,
1120 Stream &ThenBlasHpmv(blas::UpperLower uplo, uint64 n,
1128 Stream &ThenBlasHpr(blas::UpperLower uplo, uint64 n, float alpha,
1131 Stream &ThenBlasHpr(blas::UpperLower uplo, uint64 n, double alpha,
1136 Stream &ThenBlasHpr2(blas::UpperLower uplo, uint64 n,
1141 Stream &ThenBlasHpr2(blas::UpperLower uplo, uint64 n,
1148 Stream &ThenBlasSbmv(blas::UpperLower uplo, uint64 n, uint64 k, float alpha,
1152 Stream &ThenBlasSbmv(blas::UpperLower uplo, uint64 n, uint64 k, double alpha,
1158 Stream &ThenBlasSpmv(blas::UpperLower uplo, uint64 n, float alpha,
1162 Stream &ThenBlasSpmv(blas::UpperLower uplo, uint64 n, double alpha,
1168 Stream &ThenBlasSpr(blas::UpperLower uplo, uint64 n, float alpha,
1171 Stream &ThenBlasSpr(blas::UpperLower uplo, uint64 n, double alpha,
1176 Stream &ThenBlasSpr2(blas::UpperLower uplo, uint64 n, float alpha,
1180 Stream &ThenBlasSpr2(blas::UpperLower uplo, uint64 n, double alpha,
1186 Stream &ThenBlasSymv(blas::UpperLower uplo, uint64 n, float alpha,
1190 Stream &ThenBlasSymv(blas::UpperLower uplo, uint64 n, double alpha,
1196 Stream &ThenBlasSyr(blas::UpperLower uplo, uint64 n, float alpha,
1199 Stream &ThenBlasSyr(blas::UpperLower uplo, uint64 n, double alpha,
1204 Stream &ThenBlasSyr2(blas::UpperLower uplo, uint64 n, float alpha,
1208 Stream &ThenBlasSyr2(blas::UpperLower uplo, uint64 n, double alpha,
1214 Stream &ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans,
1218 Stream &ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans,
1222 Stream &ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans,
1226 Stream &ThenBlasTbmv(blas::UpperLower uplo, blas::Transpose trans,
1232 Stream &ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans,
1236 Stream &ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans,
1240 Stream &ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans,
1244 Stream &ThenBlasTbsv(blas::UpperLower uplo, blas::Transpose trans,
1250 Stream &ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans,
1254 Stream &ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans,
1258 Stream &ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans,
1262 Stream &ThenBlasTpmv(blas::UpperLower uplo, blas::Transpose trans,
1268 Stream &ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans,
1272 Stream &ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans,
1276 Stream &ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans,
1280 Stream &ThenBlasTpsv(blas::UpperLower uplo, blas::Transpose trans,
1286 Stream &ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans,
1290 Stream &ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans,
1294 Stream &ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans,
1298 Stream &ThenBlasTrmv(blas::UpperLower uplo, blas::Transpose trans,
1304 Stream &ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans,
1308 Stream &ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans,
1312 Stream &ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans,
1316 Stream &ThenBlasTrsv(blas::UpperLower uplo, blas::Transpose trans,
1322 Stream &ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m,
1327 Stream &ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m,
1332 Stream &ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m,
1337 Stream &ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m,
1343 Stream &ThenBlasGemm(blas::Transpose transa, blas::Transpose transb, uint64 m,
1350 Stream &ThenBlasGemmWithProfiling(blas::Transpose transa,
1358 Stream &ThenBlasGemmWithProfiling(blas::Transpose transa,
1365 Stream &ThenBlasGemmWithProfiling(blas::Transpose transa,
1373 Stream &ThenBlasGemmWithProfiling(
1380 Stream &ThenBlasGemmWithProfiling(
1389 Stream &ThenBlasGemmWithAlgorithm(
1396 Stream &ThenBlasGemmWithAlgorithm(blas::Transpose transa,
1405 Stream &ThenBlasGemmWithAlgorithm(blas::Transpose transa,
1414 Stream &ThenBlasGemmWithAlgorithm(
1421 Stream &ThenBlasGemmWithAlgorithm(
1429 Stream &ThenBlasGemmWithAlgorithm(
1439 Stream &ThenBlasGemmBatched(blas::Transpose transa, blas::Transpose transb,
1447 Stream &ThenBlasGemmBatched(blas::Transpose transa, blas::Transpose transb,
1455 Stream &ThenBlasGemmBatched(
1463 Stream &ThenBlasGemmBatched(
1471 Stream &ThenBlasGemmBatchedWithScratch(
1477 Stream &ThenBlasGemmBatchedWithScratch(
1483 Stream &ThenBlasGemmBatchedWithScratch(
1491 Stream &ThenBlasGemmBatchedWithScratch(
1501 Stream &ThenBlasHemm(blas::Side side, blas::UpperLower uplo, uint64 m,
1507 Stream &ThenBlasHemm(blas::Side side, blas::UpperLower uplo, uint64 m,
1515 Stream &ThenBlasHerk(blas::UpperLower uplo, blas::Transpose trans, uint64 n,
1520 Stream &ThenBlasHerk(blas::UpperLower uplo, blas::Transpose trans, uint64 n,
1527 Stream &ThenBlasHer2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n,
1533 Stream &ThenBlasHer2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n,
1541 Stream &ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m,
1545 Stream &ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m,
1549 Stream &ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m,
1555 Stream &ThenBlasSymm(blas::Side side, blas::UpperLower uplo, uint64 m,
1563 Stream &ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, uint64 n,
1566 Stream &ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, uint64 n,
1569 Stream &ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, uint64 n,
1574 Stream &ThenBlasSyrk(blas::UpperLower uplo, blas::Transpose trans, uint64 n,
1581 Stream &ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n,
1585 Stream &ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n,
1589 Stream &ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n,
1595 Stream &ThenBlasSyr2k(blas::UpperLower uplo, blas::Transpose trans, uint64 n,
1603 Stream &ThenBlasTrmm(blas::Side side, blas::UpperLower uplo,
1607 Stream &ThenBlasTrmm(blas::Side side, blas::UpperLower uplo,
1611 Stream &ThenBlasTrmm(blas::Side side, blas::UpperLower uplo,
1616 Stream &ThenBlasTrmm(blas::Side side, blas::UpperLower uplo,
1623 Stream &ThenBlasTrsm(blas::Side side, blas::UpperLower uplo,
1627 Stream &ThenBlasTrsm(blas::Side side, blas::UpperLower uplo,
1631 Stream &ThenBlasTrsm(blas::Side side, blas::UpperLower uplo,
1636 Stream &ThenBlasTrsm(blas::Side side, blas::UpperLower uplo,
1643 Stream &ThenFft(fft::Plan *plan,
1646 Stream &ThenFft(fft::Plan *plan,
1649 Stream &ThenFft(fft::Plan *plan, const DeviceMemory<float> &input,
1651 Stream &ThenFft(fft::Plan *plan, const DeviceMemory<double> &input,
1653 Stream &ThenFft(fft::Plan *plan,
1656 Stream &ThenFft(fft::Plan *plan,
1672 // stream.ThenSetRngSeed(seed_data, bytes_read);
1676 // stream.ThenSetRngSeed(seed_data, 16);
1677 Stream &ThenSetRngSeed(const uint8 *seed, uint64 seed_bytes);
1684 Stream &ThenPopulateRandUniform(DeviceMemory<float> *values);
1685 Stream &ThenPopulateRandUniform(DeviceMemory<double> *values);
1686 Stream &ThenPopulateRandUniform(DeviceMemory<std::complex<float>> *values);
1687 Stream &ThenPopulateRandUniform(DeviceMemory<std::complex<double>> *values);
1688 Stream &ThenPopulateRandGaussian(float mean, float stddev,
1690 Stream &ThenPopulateRandGaussian(double mean, double stddev,
1693 // Entrain onto the stream: a memcpy to a host destination from a GPU source
1697 Stream &ThenMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src,
1700 // Entrain onto the stream: a memcpy to a GPU destination from a host source
1704 Stream &ThenMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src,
1711 Stream &ThenMemcpyD2H(const DeviceMemory<T> &gpu_src,
1722 Stream &ThenMemcpyH2D(port::ArraySlice<T> host_src,
1729 // Entrain onto the stream: a memcpy to a GPU destination from a GPU source
1732 Stream &ThenMemcpy(DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src,
1738 Stream &ThenMemcpyD2D(DeviceMemoryBase *gpu_dst,
1743 // Entrain onto the stream: a memset of zero at a GPU location of size bytes.
1745 Stream &ThenMemZero(DeviceMemoryBase *location, uint64 size);
1747 // Entrain onto the stream: a memset of a 32-bit pattern at a GPU location of
1750 Stream &ThenMemset32(DeviceMemoryBase *location, uint32 pattern, uint64 size);
1752 // Enqueue a forward operation of the RNN model onto the stream.
1754 Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
1772 Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
1789 Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
1806 // Enqueue a backward operation of the RNN model onto the stream.
1808 Stream &ThenRnnBackward(
1833 Stream &ThenRnnBackward(const dnn::RnnDescriptor &rnn_desc,
1857 Stream &ThenRnnBackward(const dnn::RnnDescriptor &rnn_desc,
1881 // Enqueue onto the stream a operation that transforms a tensor.
1883 Stream &ThenTransformTensor(const dnn::BatchDescriptor &input_desc,
1893 Stream &ThenTransformTensor(const dnn::BatchDescriptor &input_desc,
1903 // entrained on the stream (enqueued to this point in program
1906 // Returns an OK status if the blocking was successful and the stream is ok().
1916 // Entrains onto the stream a function to be executed on the host at some
1918 // Async host callbacks DO NOT block the stream as device functions (or as
1924 // parameter will still be valid - this Stream may not be!
1926 Stream &ThenEnqueueOnBackgroundThread(
1933 // Entrains onto the stream a callback to the host (from the device).
1934 // Host callbacks block/occupy the stream just as device functions
1935 // (execute one at a time, block later stream operations).
1938 // them into any stream.
1941 Stream &ThenDoHostCallback(std::function<void()> callback);
1944 Stream &ThenDoHostCallbackForTest(std::function<void()> callback);
1946 // Returns the StreamExecutor (parent object) associated with this stream.
1953 // with this stream.
1970 // This is a useful shorthand for many stream routines.
1987 // The StreamExecutor that supports the operation of this stream.
1998 // Whether Init() was successfully called to allocate this stream on the
2007 // Sub-streams that are generated from this stream. Each element has a pointer
2008 // to sub-stream and a boolean value indicating if this substream is ready to
2010 std::vector<std::pair<std::unique_ptr<Stream>, bool>> sub_streams_
2021 Stream &ThenConvolveBackwardBiasImpl(
2027 SE_DISALLOW_COPY_AND_ASSIGN(Stream);
2035 Stream::AllocateTemporaryArray(uint64 element_count) {
2039 inline internal::TemporaryMemoryManager *Stream::temporary_memory_manager() {