LoopNest.h
/** This file defines the LoopNest, which is our
 * representation of a Halide schedule, and contains methods to
 * generate candidates for scheduling as well as extract a
 * featurization that can be used to cost each candidate. */

#ifndef LOOP_NEST_H
#define LOOP_NEST_H

#include "ASLog.h"
#include "CostModel.h"
#include "FunctionDAG.h"
#include "GPULoopInfo.h"
#include "GPUMemInfo.h"
#include "PerfectHashMap.h"
#include "SearchSpaceOptions.h"
#include "Statistics.h"
#include "ThreadInfo.h"
#include "Tiling.h"
#include <set>
#include <vector>

namespace Halide {
namespace Internal {
namespace Autoscheduler {

template<typename T>
using NodeMap = PerfectHashMap<FunctionDAG::Node, T>;

template<typename T>
using StageMap = PerfectHashMap<FunctionDAG::Node::Stage, T>;
enum class GPU_parallelism { Block,
                             Thread,
                             Serial,
                             Simd,
                             Parallelized,
                             None };

std::string stringify(GPU_parallelism label);

// inlined => the Func is inlined, so it has no memory store location
enum class GPUMemoryType { Global,
                           Shared,
                           Local,
                           Registers,
                           Inlined };

bool may_subtile(const Anderson2021Params &params);

int64_t get_shared_memory_limit(const Anderson2021Params &params);

int64_t get_active_block_hardware_limit(const Anderson2021Params &params);

int64_t get_active_warp_hardware_limit(const Anderson2021Params &params);

constexpr int64_t get_register_mem_alloc_limit() {
    return 128;
}

int get_unroll_limit(const Target &target);

bool in_range_zero_one(double x);

bool are_valid_thread_extents(const vector<int64_t> &counts);

double get_idle_lane_wastage_limit_env_var();
double get_idle_lane_wastage_limit();

bool all(const vector<int> &v);
bool accessed_at_constant_indices(const std::vector<int> &unrolled, const FunctionDAG::Edge *e);

// We're going to do a tree search over possible schedules to find an
// optimal one. A tree search requires a state, and a function that
// gives you children of the state (with costs). The following struct
// represents the state, which is a partial schedule.
//
// A partial schedule is a tree. Each node is some portion of the for
// loop nest of some Func. If there are no children, it's the
// innermost set of loops. If there are children, it's a loop over
// tiles of that Func.
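//
// For intuition only (the Funcs are hypothetical): a partial schedule for
// producer.compute_at(consumer, yo) might be drawn as
//
//   root
//   └── consumer: loop over tiles (yo, xo)
//       ├── producer: loops over y, x (innermost; SIMD over x)
//       └── consumer: loops over yi, xi (innermost; SIMD over xi)
//
// where every node in the tree is one LoopNest instance.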
struct LoopNest {
    mutable RefCount ref_count;

    // The extents of this loop. Put another way, the number of tiles,
    // not the size of each tile.
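    // For example (illustrative): a 512x512 loop tiled into 64x64 tiles
    // is an outer LoopNest with size {8, 8} whose children cover tiles
    // of size {64, 64}.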
    vector<int64_t> size;

    // The nodes inside the loop body
    vector<IntrusivePtr<const LoopNest>> children;

    // Funcs inlined into this inner loop, and the number of times
    // each is called. Only valid if children is empty.
    NodeMap<int64_t> inlined;

    // Funcs stored inside this loop
    std::set<const FunctionDAG::Node *> store_at;

    // The total bounds required of any given Func over all iterations
    // of this loop. In the paper, this is represented using the
    // little boxes to the left of the loop nest tree figures.
    mutable NodeMap<Bound> bounds;

    // The Func this loop nest belongs to
    const FunctionDAG::Node *node = nullptr;

    // The stage of the Func
    const FunctionDAG::Node::Stage *stage = nullptr;

    // Is this the innermost loop of this func (the SIMD loop)?
    bool innermost = false;

    // Are we permitted to tile this loop?
    bool tileable = false;

    // Is this the parallel outer loop?
    bool parallel = false;

    // What dimension is this Func vectorized over, in terms of the pure args of the Func?
    int vector_dim = -1;

    // Which loop corresponds to the innermost storage dimension and will be vectorized. -1 means none of them.
    int vectorized_loop_index = -1;

    // Apply gpu threads to this loop nest
    mutable GPU_parallelism gpu_label = GPU_parallelism::None;

    struct FeatureIntermediates {
        double inlined_calls;
        double num_vectors;
        double num_scalars;
        double vector_size;
        double innermost_pure_loop_extent;
        double outer_parallelism;
        double num_warps_per_block;
        double num_threads_per_block;
        double points_computed_per_thread;
    };

    mutable std::map<uint64_t, StageMap<StageMap<FeatureIntermediates>>> feature_intermediates;
    mutable std::map<uint64_t, StageMap<ScheduleFeatures>> features;

    bool is_gpu_serial(const Target &target) const {
        return target.has_gpu_feature() && gpu_label == GPU_parallelism::Serial;
    }

    bool is_gpu_thread(const Target &target) const {
        return target.has_gpu_feature() && gpu_label == GPU_parallelism::Thread;
    }

    bool is_gpu_block(const Target &target) const {
        return target.has_gpu_feature() && gpu_label == GPU_parallelism::Block;
    }

    bool is_scalar() const {
        return size.empty();
    }

    // Given a newly inserted node f into this LoopNest, get the union of
    // thread counts in each dimension across all siblings of f.
    vector<int64_t> get_union_thread_counts(const FunctionDAG::Node *f) const;

    // Given a newly inserted node f into this LoopNest, get the sizes of
    // all of f's stages and their pure_dim indices.
    void get_stage_sizes(const FunctionDAG::Node *f,
                         vector<vector<int64_t>> &stage_sizes,
                         vector<vector<int>> &pure_dims,
                         vector<int> &vectorized_indices) const;

    // Given the loop nest of a stage to parallelize at root, figure out if
    // using odd-sized tiles for the vectorized dimension will allow the
    // resulting thread tiles to be multiples of 32; if so, include those
    // sizes in the serial loop sizes.
    void generate_vec_dim_serial_tilings(vector<int> &serial_sizes) const;

    // Get the loop nests of a newly inserted node f that is marked GPU
    // threads. Tiles the newly inserted loop nests of f into a threads loop
    // outside a serial loop. v is the vectorized dimension of f. Adds the
    // loop nests created from each tiling option to result.
    bool add_gpu_thread_tilings(const FunctionDAG::Node *f,
                                const Anderson2021Params &params,
                                const Target &target,
                                int v,
                                vector<IntrusivePtr<const LoopNest>> &result,
                                const vector<int64_t> &max_size);

    void copy_from(const LoopNest &n);
    void copy_from_including_features(const LoopNest &n);

    static void hash_combine(uint64_t &h, uint64_t next) {
        // From Boost's hash_combine. The constant 0x9e3779b9 is the golden
        // ratio in 32-bit fixed point; the shifts spread each input bit's
        // influence before mixing.
        h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));
    }

    // Hash the loop structure and sizes up to a fixed depth. This is
    // used as the hash function for the coarse-to-fine beam search in
    // the paper.
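    //
    // For intuition only: beam-search deduplication might bucket states by
    // this hash, along the lines of this hypothetical snippet (names are
    // illustrative):
    //
    //   uint64_t h = 0;
    //   loop_nest->structural_hash(h, /*depth=*/pass);
    //   states_by_hash[h].push_back(state);
    //
    // Larger depths hash more of the tree, so buckets become finer as the
    // coarse-to-fine search progresses.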
    void structural_hash(uint64_t &h, int depth) const;

    // How many funcs are scheduled inside this loop level. Used in
    // the structural hash.
    size_t funcs_realized_or_inlined() const {
        size_t count = inlined.size() + store_at.size();
        for (const auto &c : children) {
            count += c->funcs_realized_or_inlined();
        }
        return count;
    }

    // All of a stage's interesting locations in the loop nest. Used to help compute the featurization of a stage.
    struct Sites {
        const LoopNest *compute = nullptr;    // Its containing compute_at site
        const LoopNest *store = nullptr;      // Its containing store_at site
        const LoopNest *produce = nullptr;    // Its own outermost node
        const LoopNest *innermost = nullptr;  // Its innermost node - usually a SIMD loop
        const LoopNest *task = nullptr;       // The parallel for loop it belongs to
        const LoopNest *thread = nullptr;     // Its containing gpu_thread loop
        GPUMemoryType gpu_store_memory_type;  // global, local, shared?
        int64_t allocation_size = 0;          // Allocation size in bytes
        bool is_constant_allocation = false;  // Does the allocation have constant size?
        int64_t num_realizations = 0;         // Number of times this stage is realized. Only valid for unscheduled producers
        bool inlined = false;                 // Is the Func inlined?
        std::vector<const LoopNest *> inlined_innermosts;  // The innermost loop nests into which this Func is inlined
        uint64_t hash_of_producers_stored_at_root;

        bool is_stored_in_global_mem() const {
            return gpu_store_memory_type == GPUMemoryType::Global;
        }
        bool is_stored_in_shared_mem() const {
            return gpu_store_memory_type == GPUMemoryType::Shared;
        }
        bool is_stored_in_local_mem() const {
            return gpu_store_memory_type == GPUMemoryType::Local;
        }
        bool is_stored_in_registers() const {
            return gpu_store_memory_type == GPUMemoryType::Registers;
        }
    };
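
    // For intuition only (names are illustrative): given
    // producer.store_at(consumer, yo).compute_at(consumer, yi), the
    // producer's Sites point `store` at the loop nest of yo, `compute` at
    // the nest of yi, and `innermost` at the producer's own SIMD loop.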

    GPUMemoryType get_gpu_memory_type(bool in_block,
                                      bool in_thread,
                                      bool is_inlined = false) const;

    std::vector<int> unrolled_loops(const Target &target,
                                    const LoopNest *parent,
                                    const LoopNest *grandparent) const;

    void get_allocs_that_can_be_promoted_to_registers(const Target &target,
                                                      StageMap<Sites> &sites,
                                                      NodeMap<bool> &can_be_promoted_to_registers,
                                                      const LoopNest *grandparent,
                                                      const LoopNest *parent) const;

    bool promote_allocs_to_registers(const Target &target,
                                     StageMap<Sites> &sites) const;

    // Compute all the sites of interest for each pipeline stage
    void get_sites(const Target &target,
                   StageMap<Sites> &sites,
                   StageMap<int64_t> &shared_mem_alloc_sizes,
                   const LoopNest *task = nullptr,
                   const LoopNest *parent = nullptr,
                   const LoopNest *current_thread_loop = nullptr) const;

    // A helper for the working_set_at_task feature. Most features are
    // computed in the recursive pass 'compute_features' below, but
    // this one must be done in a second separate recursive pass.
    void set_working_set_at_task_feature(int64_t working_set,
                                         StageMap<ScheduleFeatures> *features) const {
        for (const auto &c : children) {
            c->set_working_set_at_task_feature(working_set, features);
            features->get(c->stage).working_set_at_task = working_set;
        }
    }

    bool exceeds_serial_extents_limit(const Target &target,
                                      const LoopNest *parent,
                                      bool in_threads_loop) const;

    bool node_has_dynamic_region_computed(const FunctionDAG::Node *f) const;

    bool has_dynamic_allocation_inside_thread(bool in_thread_loop) const;

    const LoopNest *find_pure_stage_loop_nest(const FunctionDAG::Node *node) const;

    int get_pure_stage_vectorized_loop_index(const FunctionDAG::Node *node) const;

    int get_vectorized_loop_index_from_pure_stage(const LoopNest &root) const;

    // Get the stride over "node"'s storage for a unit increment in the
    // vectorized loop's index.
    double storage_stride(const LoadJacobian &jac,
                          int innermost_storage_dim,
                          const FunctionDAG::Node *storage_node,
                          const Bound &store_bounds,
                          const LoopNest &root) const;

    Strides compute_strides(const LoadJacobian &jac,
                            int innermost_storage_dim,
                            const FunctionDAG::Node *storage_node,
                            const Bound &store_bounds,
                            const ThreadInfo *thread_info,
                            bool verbose = false) const;

    bool all_strides_exist(const LoadJacobian &jac,
                           const FunctionDAG::Node *storage_node,
                           const LoopNest &root) const;

    int get_actual_vector_dim(const Bound &store_bounds) const;

    void compute_gpu_store_features(const LoadJacobian &jac,
                                    int consumer_innermost_dim,
                                    const FunctionDAG::Node *node,
                                    const Bound &consumer_store_bounds,
                                    const GPULoopInfo &gpu_loop_info,
                                    const std::vector<int64_t> &inner_serial_loop_extents,
                                    const Sites &consumer_site,
                                    ScheduleFeatures &feat,
                                    const LoopNest *parent,
                                    const LoopNest &root,
                                    GlobalMemInfo &global_mem_loads,
                                    SharedMemInfo &shared_mem_loads,
                                    LocalMemInfo &local_mem_loads,
                                    bool verbose = false) const;

    bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac,
                                                const FunctionDAG::Node *accessed,
                                                int innermost_dim,
                                                int loop_index) const;

    bool can_vectorize_store_access(const LoadJacobian &jac,
                                    const FunctionDAG::Node *accessed,
                                    bool accessed_has_been_scheduled,
                                    int innermost_dim,
                                    int loop_index,
                                    const GPUMemoryType &mem_type) const;

    int vectorized_load_access_size(const LoadJacobian &jac,
                                    const FunctionDAG::Node *accessed,
                                    bool accessed_has_been_scheduled,
                                    int innermost_dim,
                                    const GPUMemoryType &mem_type,
                                    bool verbose = false) const;

    int vectorized_access_size(size_t loop_index,
                               bool verbose = false) const;

    template<typename T>
    void compute_num_mem_accesses_per_block(const LoadJacobian &jac,
                                            const FunctionDAG::Node *node,
                                            const Bound &store_bounds,
                                            const ThreadInfo *thread_info,
                                            int innermost_dim,
                                            double num_requests_per_warp,
                                            MemInfoType<T> &mem_info,
                                            bool verbose = false) const;

    std::pair<double, double> compute_local_mem_store_features(const LoadJacobian &jac,
                                                               int consumer_innermost_dim,
                                                               const FunctionDAG::Node *node,
                                                               const Bound &consumer_store_bounds,
                                                               const LoopNest &root,
                                                               double serial_loop_extents) const;

    template<typename T>
    MemInfoType<T> compute_mem_store_info(const LoadJacobian &jac,
                                          int consumer_innermost_dim,
                                          const FunctionDAG::Node *node,
                                          const Bound &consumer_store_bounds,
                                          const ThreadInfo *thread_info,
                                          double serial_loop_extents,
                                          bool verbose) const;

    template<typename T>
    void compute_mem_load_features(const LoadJacobian &jac,
                                   int producer_innermost_dim,
                                   const FunctionDAG::Node *node,
                                   const Bound &producer_store_bounds,
                                   bool producer_has_been_scheduled,
                                   const ThreadInfo *thread_info,
                                   MemInfoType<T> &mem_info,
                                   double serial_loop_extents,
                                   bool verbose = false) const;

    double compute_local_mem_stride(double stride,
                                    double bytes) const;

    // Assumes block, serial, thread or block, thread nesting
    const LoopNest *get_enclosing_block(const LoopNest *parent,
                                        const LoopNest *grandparent) const;

    std::pair<int64_t, int64_t> get_block_and_serial_extents(const LoopNest *block) const;

    bool all_paths_to_leaves_have_thread_loop() const;

    bool has_thread_loop_descendant() const;

    void compute_warp_features(ScheduleFeatures &features,
                               const GPULoopInfo &gpu_loop_info) const;

    // Assume that when a block is active, all its warps are active
    void compute_warp_and_block_occupancy(const Anderson2021Params &params,
                                          ScheduleFeatures &feat,
                                          const GPULoopInfo &gpu_loop_info) const;

    void compute_shared_mem_occupancy(const Anderson2021Params &params,
                                      const Target &target,
                                      int64_t total_shared_mem_alloc_size,
                                      ScheduleFeatures &feat) const;

    std::pair<const LoopNest *, const LoopNest *> find_innermost_and_parent() const;

    int64_t points_accessed_per_thread(const Anderson2021Params &params,
                                       const Target &target,
                                       const GPULoopInfo &gpu_loop_info,
                                       const std::vector<const FunctionDAG::Edge *> &edge_chain,
                                       const LoadJacobian &jac,
                                       const LoopNest *parent,
                                       const LoopNest *grandparent,
                                       int64_t n,
                                       const ScheduleFeatures &feat,
                                       const LoadJacobian &serial_jac,
                                       bool producer_has_been_scheduled,
                                       int producer_innermost_dim,
                                       const GPUMemoryType &mem_type,
                                       bool verbose) const;

    int64_t compute_licm_amortization(const LoopNest *innermost,
                                      const LoopNest *parent,
                                      const ScheduleFeatures &feat,
                                      const LoadJacobian &jac,
                                      int producer_dims) const;

    void memoize_points_computed_minimum(StageMap<ScheduleFeatures> &memoized_features,
                                         const StageMap<ScheduleFeatures> *features) const;

    vector<pair<int, int>> collect_producers(const StageMap<Sites> &sites) const;

    uint64_t compute_hash_of_producers_stored_at_root(const StageMap<Sites> &sites) const;

    void collect_stages(std::set<const FunctionDAG::Node::Stage *> &stages) const;

    void memoize_features(StageMap<ScheduleFeatures> &memoized_features,
                          const StageMap<ScheduleFeatures> *features) const;

    void compute_working_set_from_features(int64_t *working_set,
                                           const StageMap<ScheduleFeatures> *features) const;

    void recompute_inlined_features(const StageMap<Sites> &sites,
                                    StageMap<ScheduleFeatures> *features) const;

    std::pair<int64_t, bool> compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const;

    // Do a recursive walk over the loop nest computing features to feed the cost model.
    void compute_features(const FunctionDAG &dag,
                          const Anderson2021Params &params,
                          const Target &target,
                          const StageMap<Sites> &sites,
                          int64_t instances,
                          int64_t parallelism,
                          const LoopNest *parent,
                          const LoopNest *grandparent,
                          const LoopNest &root,
                          GPULoopInfo gpu_loop_info,
                          bool use_memoized_features,
                          const StageMap<int64_t> &total_shared_mem_alloc_sizes,
                          int64_t *working_set,
                          int64_t *working_set_local_constant,
                          int64_t *working_set_local_dynamic,
                          StageMap<ScheduleFeatures> *features,
                          Statistics &stats,
                          bool verbose = false) const;

    bool is_root() const {
        // The root is the sole node without a Func associated with
        // it.
        return node == nullptr;
    }

    // Set the region required of a Func at this site.
    const Bound &set_bounds(const FunctionDAG::Node *f, BoundContents *b) const {
        return bounds.emplace(f, b);
    }

    // Get the region required of a Func at this site, from which we
    // know what region would be computed if it were scheduled here,
    // and what its loop nest would be.
    const Bound &get_bounds(const FunctionDAG::Node *f) const;
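
    // For intuition only: at the root LoopNest this is the entire region of
    // f required by the pipeline; deeper in the tree it shrinks to the
    // region needed by a single iteration of the enclosing loops.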

    // Get the region required of a Func at this site (but only to satisfy the
    // consumers along the given edge chain), from which we know what region
    // would be computed if it were scheduled here and what its loop nest
    // would be.
    Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f,
                                      const vector<const FunctionDAG::Edge *> &edge_chain) const;

    void dump() const;

    std::string to_string() const;

    // Recursively print a loop nest representation to stderr
    template<typename T>
    void dump(T &stream, string prefix, const LoopNest *parent) const;

    // Does this loop nest access the given Func
    bool calls(const FunctionDAG::Node *f) const;

    // What is the maximum number of inlined calls to a Func that
    // occur within this loop. Used to prune states that would
    // generate too much code.
    int64_t max_inlined_calls() const;

    // Does this loop nest access an input buffer? Used to select
    // trail strategies when splitting loops. We don't want to read
    // out of bounds on inputs, even if we don't intend to use the
    // values read. It could create annoying assertion failures for
    // the user. It's OK to read out of range of the values computed
    // on internal Funcs though. Allocation bounds inference just pads
    // out the bounds so that it won't fault.
    bool accesses_input_buffer() const;

    // Does this loop nest contain a computation of the given Func.
    bool computes(const FunctionDAG::Node *f) const;

    // Above here most methods query the loop nest. Below we have
    // methods that mutate the loop nest.

    // Inline a Func into all consumers within this loop.
    void inline_func(const FunctionDAG::Node *f);

    // Compute a Func at this site.
    bool compute_here(const FunctionDAG::Node *f,
                      bool tileable,
                      int v,
                      bool in_threads_loop,
                      const Anderson2021Params &params,
                      const Target &target);

    // Parallelize this loop according to the given tiling.
    IntrusivePtr<const LoopNest> parallelize_in_tiles(const vector<int64_t> &tiling,
                                                      const LoopNest *parent,
                                                      const Anderson2021Params &params,
                                                      const Target &target,
                                                      bool inner_tiling,
                                                      bool adjust_tiling,
                                                      bool move_all_rvars_inward = true,
                                                      const vector<int> &rvars_to_move_inward = {}) const;

    int64_t get_total_local_mem_alloc_size(bool constant_allocs_only = false,
                                           bool in_threads_loop = false) const;
    int64_t get_total_constant_local_mem_alloc_size() const;

    // All store_ats further in than the block level must be fixed-size
    // allocations. This method checks whether f will require a dynamic
    // allocation.
    bool requires_dynamic_allocation(const FunctionDAG::Node *f,
                                     const Target &target,
                                     bool in_threads_loop) const;

    // Return all possible ways to compute f in tiles somewhere within
    // this loop nest.
    // in_threads_loop tracks whether or not the function is going to be
    // placed inside a loop marked gpu_threads, in which case f's loops
    // cannot themselves be gpu_threads.
    vector<IntrusivePtr<const LoopNest>> compute_in_tiles(const FunctionDAG::Node *f,
                                                          const LoopNest *parent,
                                                          const Anderson2021Params &params,
                                                          const Target &target,
                                                          const SearchSpaceOptions &search_space_options,
                                                          int v,
                                                          bool in_realization,
                                                          bool in_threads_loop,
                                                          bool is_pre_pass,
                                                          vector<int64_t> union_counts = vector<int64_t>()) const;
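
    // For intuition only: the returned candidates correspond to the
    // granularities at which f could be computed, e.g. at root or inside
    // successively finer tile loops of a consumer.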

    // Below here we have methods that apply a schedule to a Halide pipeline.

    // A model of the state of the loop nest of a Func while applying
    // Halide's scheduling directives.

    // Note that StageScheduleState is movable-but-not-copyable thanks to its ostringstream member.
    struct StageScheduleState {
        // How much parallelism do we need to exploit with this Func?
        double num_cores = 0;

        // Which storage dimension is vectorized? We need to reorder it innermost
        int vector_dim = -1;
        int vectorized_loop_index = -1;

        // The various Vars and RVars used for scheduling a Func.
        struct FuncVar {
            // The top-level var or rvar this was split off from
            VarOrRVar orig;

            // This var.
            VarOrRVar var;

            // Source code to access this Var/RVar. Used for printing
            // valid Halide source for this schedule.
            string accessor;

            // Our estimate of the extent of this var. This is exact
            // when the constant_extent flag is true.
            int64_t extent = 0;

            // Which index in the symbolic loop nest does this var
            // belong to.
            size_t index = 0;

            // Some flags.
            bool innermost_pure_dim = false;
            bool outermost = false;
            bool parallel = false;
            bool exists = false;
            bool pure = false;
            bool constant_extent = false;

            bool vectorized = false;
            bool gpu_threads = false;

            FuncVar()
                : orig(Var()),
                  var(Var()) {
            }
        };
        const FunctionDAG::Node::Stage *stage = nullptr;

        bool parallel = false;
        bool vectorized = false;
        FuncVar vectorized_var;

        // In order from innermost to outermost. Each group of d is one tiling level.
        vector<FuncVar> vars;

        // In order from innermost to outermost. Each group of d is one tiling level.
        vector<FuncVar> ordered_vars;
        vector<int64_t> gpu_thread_extents;

        NodeMap<std::vector<std::pair<const LoopNest *, std::vector<const FunctionDAG::Edge *>>>> producers_to_be_staged;

        // From outermost in
        vector<StageScheduleState *> ancestors;

        std::ostringstream schedule_source;
    };
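
    // For intuition only (the directives shown are illustrative): after
    // apply() below runs, each stage's schedule_source holds Halide
    // scheduling calls such as
    //     out.split(x, xo, xi, 8).vectorize(xi);
    // which can be printed to reproduce this schedule as Halide source.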

    bool has_constant_region_computed(const FunctionDAG::Node *node) const;
    bool has_constant_region_required(const FunctionDAG::Node *node) const;
    bool other_stage_has_same_producer(const FunctionDAG::Node *producer) const;
    int num_serial_loops(const FunctionDAG::Node::Stage *stage) const;
    int num_serial_loops() const;
    bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const;

    void update_producers_to_be_staged(StageScheduleState &state,
                                       const NodeMap<bool> &all_inlined) const;
    bool region_computed_shrinks(const FunctionDAG::Node *f,
                                 const LoopNest *parent) const;

    // Apply the schedule represented by this loop nest to a Halide pipeline.
    void apply(LoopLevel here,
               StageMap<std::unique_ptr<StageScheduleState>> &state_map,
               double num_cores,
               int depth,
               const LoopNest *parent,
               const LoopNest *compute_site,
               const Target &target,
               std::vector<StageScheduleState *> &ancestors,
               const NodeMap<bool> &all_inlined) const;

    double max_idle_lane_wastage(const Target &target,
                                 GPULoopInfo gpu_loop_info) const;

    bool has_valid_thread_extents() const;

    void collect_nodes_that_should_be_inlined(const NodeMap<bool> &nodes_to_freeze,
                                              NodeMap<bool> &inlined_nodes) const;

    void collect_all_inlined(NodeMap<bool> &all_inlined) const;

    int64_t product_of_self_and_descendants(int loop_index) const;
    int64_t product_of_descendants(int loop_index) const;

    void get_stages_computed_in_each_compute_root_loop(StageMap<StageMap<bool>> &descendants,
                                                       const LoopNest *compute_root_loop_nest = nullptr) const;
};

struct Filter {
    const LoopNest *loop_nest;
    bool logging = false;

    explicit Filter(const LoopNest *loop_nest)
        : loop_nest{loop_nest},
          logging{enable_filter_printing()} {
        if (logging) {
            std::cerr << "\nState filtered: \n";
            loop_nest->dump();
            std::cerr << "Reason: ";
        }
    }

    template<typename T>
    Filter &operator<<(T &&x) {
        if (logging) {
            std::cerr << std::forward<T>(x);
        }
        return *this;
    }

    static bool enable_filter_printing();
};
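
// Illustrative use only (hypothetical call site): constructing a Filter while
// rejecting a candidate prints the loop nest and the reason whenever filter
// logging is enabled:
//
//   Filter(loop_nest) << "Thread extents are invalid\n";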

}  // namespace Autoscheduler
}  // namespace Internal
}  // namespace Halide

#endif  // LOOP_NEST_H