#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {

/** \class TensorExecutor
  * \brief The tensor executor class.
  *
  * Launches the evaluation of a tensor expression on the specified
  * computing device.
  */
namespace internal {

// Default strategy: the expression is evaluated coefficient by coefficient
// with a single CPU thread.
template <typename Expression, typename Device, bool Vectorizable>
class TensorExecutor {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    // Evaluate sub-expressions that need to be materialized first. The
    // return value tells us whether the expression itself still needs to
    // be assigned coefficient by coefficient.
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const Index size = array_prod(evaluator.dimensions());
      for (Index i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
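// Illustrative usage (a sketch, not part of this header): a tensor
// assignment is what ultimately routes through TensorExecutor::run, e.g.
//
//   Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
//   a = b * 2.0f;  // internally: TensorExecutor<..., DefaultDevice, ...>::run(...)
//
// The Device and Vectorizable template arguments are chosen at compile time
// from the target device and the expression's packet access support.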
// Specialization for the default (single threaded) CPU device when the
// expression can be vectorized.
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, true> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const Index size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
      const Index VectorizedSize = (size / PacketSize) * PacketSize;

      // Evaluate full packets first, then mop up the remaining coefficients
      // one at a time.
      for (Index i = 0; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (Index i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
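// Worked example (illustrative numbers): with size = 103 and PacketSize = 4,
// VectorizedSize = (103 / 4) * 4 = 100, so indices [0, 100) are evaluated
// four at a time via evalPacket and the tail [100, 103) via evalScalar.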
// Multicore strategy: the index space is partitioned into blocks and each
// block is evaluated on a single core.
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator evaluator, const Index first, const Index last) {
    eigen_assert(last > first);
    for (Index i = first; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
  static void run(Evaluator evaluator, const Index first, const Index last) {
    eigen_assert(last > first);

    Index i = first;
    static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
      Index lastPacket = last - (last % PacketSize);
      for (; i < lastPacket; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }

    for (; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }
};
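// Alignment invariant (illustrative): a block [128, 231) with PacketSize = 4
// starts packet-aligned (128 % 4 == 0), evaluates packets up to
// lastPacket = 228, then scalars over [228, 231). The executor below always
// makes blocksize a multiple of PacketSize, which is what keeps the
// eigen_assert on `first` valid for every enqueued block.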
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const Index size = array_prod(evaluator.dimensions());
      static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;

      // Aim for roughly one block per thread, rounded to a multiple of
      // PacketSize so that every block starts packet-aligned.
      int blocksz = std::ceil<int>(static_cast<float>(size) / device.numThreads()) + PacketSize - 1;
      const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
      const Index numblocks = size / blocksize;
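      // Worked example (illustrative numbers): size = 1000, 8 threads,
      // PacketSize = 4 -> blocksz = ceil(1000 / 8) + 3 = 128, so
      // blocksize = 128 and numblocks = 7. The enqueued blocks cover
      // [0, 896); the 104-element tail is handled inline below.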
      std::vector<Notification*> results;
      results.reserve(numblocks);
      for (int i = 0; i < numblocks; ++i) {
        results.push_back(device.enqueue(&EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i * blocksize, (i + 1) * blocksize));
      }
      // Evaluate the tail that doesn't fill a whole block on the calling
      // thread while the enqueued blocks run.
      if (numblocks * blocksize < size) {
        EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
      }

      for (int i = 0; i < numblocks; ++i) {
        wait_until_ready(results[i]);
        delete results[i];
      }
    }
    evaluator.cleanup();
  }
};
#endif // EIGEN_USE_THREADS
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression>
class TensorExecutor<Expression, GpuDevice, false> {
 public:
  typedef typename Expression::Index Index;
  static void run(const Expression& expr, const GpuDevice& device);
};
template <typename Expression>
class TensorExecutor<Expression, GpuDevice, true> {
 public:
  typedef typename Expression::Index Index;
  static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(__CUDACC__)

template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel_NonVectorizable(Evaluator memcopied_eval, Index size) {
  // CUDA memcopies the kernel arguments. That's fine for POD, but for more
  // complex types such as evaluators we should conform to the C++ standard
  // and call a proper copy constructor.
  Evaluator eval(memcopied_eval);
  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  // Grid-stride loop over the scalar path.
  for (Index i = first_index; i < size; i += step_size) {
    eval.evalScalar(i);
  }
}
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
  // Copy the evaluator locally for the same reason as above.
  Evaluator eval(memcopied_eval);
  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  // Use the vector path on the packet-aligned prefix of the index space,
  // then the scalar path for the remainder.
  const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
  const Index vectorized_step_size = step_size * PacketSize;
  const Index vectorized_size = (size / PacketSize) * PacketSize;
  for (Index i = first_index * PacketSize; i < vectorized_size;
       i += vectorized_step_size) {
    eval.evalPacket(i);
  }
  for (Index i = vectorized_size + first_index; i < size; i += step_size) {
    eval.evalScalar(i);
  }
}
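// Walk-through (hypothetical launch geometry): with 2 blocks of 1024 threads,
// step_size = 2048. If PacketSize = 4 and size = 10006, then
// vectorized_size = 10004: thread f evaluates packets at 4*f, 4*f + 8192, ...
// below 10004, after which threads 0 and 1 pick up scalars 10004 and 10005.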
/*static*/
template <typename Expression>
inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
{
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign) {
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
    const Index size = array_prod(evaluator.dimensions());
    // Create at least one block so we won't crash when called on tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
    LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
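// Illustrative launch arithmetic (hypothetical device limits): with
// block_size = 1024, 16 multiprocessors and 2048 threads per multiprocessor,
// max_blocks = 16 * 2048 / 1024 = 32. A tensor of 10^6 coefficients gives
// num_blocks = min(32, ceil(10^6 / 1024)) = 32, each block grid-striding
// over the index space.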
/*static*/
template <typename Expression>
inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
{
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign) {
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
    const Index size = array_prod(evaluator.dimensions());
    // Create at least one block so we won't crash when called on tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
    LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
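// Sketch of how the GPU path is typically reached (assumes the usual Tensor
// device API, which is not defined in this file):
//
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   c.device(gpu_device) = a + b;  // dispatches to the executors above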
#endif // __CUDACC__
#endif // EIGEN_USE_GPU

} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H