parallel_backend_tbb.h

  1. // -*- C++ -*-
  2. //===-- parallel_backend_tbb.h --------------------------------------------===//
  3. //
  4. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  5. // See https://llvm.org/LICENSE.txt for license information.
  6. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  7. //
  8. //===----------------------------------------------------------------------===//
  9. #ifndef _PSTL_PARALLEL_BACKEND_TBB_H
  10. #define _PSTL_PARALLEL_BACKEND_TBB_H
  11. #include <algorithm>
  12. #include <type_traits>
  13. #include "parallel_backend_utils.h"
  14. // Bring in minimal required subset of Intel TBB
  15. #include <tbb/blocked_range.h>
  16. #include <tbb/parallel_for.h>
  17. #include <tbb/parallel_reduce.h>
  18. #include <tbb/parallel_scan.h>
  19. #include <tbb/parallel_invoke.h>
  20. #include <tbb/task_arena.h>
  21. #include <tbb/tbb_allocator.h>
  22. #include <tbb/task.h>
  23. #if TBB_INTERFACE_VERSION < 10000
  24. # error Intel(R) Threading Building Blocks 2018 is required; older versions are not supported.
  25. #endif
  26. namespace __pstl
  27. {
  28. namespace __tbb_backend
  29. {
  30. //! Raw memory buffer with automatic freeing and no exceptions.
  31. /** Some of our algorithms need to start with a raw memory buffer,
  32. not an initialized array, because initialization/destruction
  33. would make the time span at least O(N). */
  34. // tbb::allocator can improve performance in some cases.
  35. template <typename _Tp>
  36. class __buffer
  37. {
  38. tbb::tbb_allocator<_Tp> _M_allocator;
  39. _Tp* _M_ptr;
  40. const std::size_t _M_buf_size;
  41. __buffer(const __buffer&) = delete;
  42. void
  43. operator=(const __buffer&) = delete;
  44. public:
  45. //! Try to obtain buffer of given size to store objects of _Tp type
  46. __buffer(std::size_t n) : _M_allocator(), _M_ptr(_M_allocator.allocate(n)), _M_buf_size(n) {}
  47. //! True if buffer was successfully obtained, false otherwise.
  48. operator bool() const { return _M_ptr != NULL; }
  49. //! Return pointer to buffer, or NULL if buffer could not be obtained.
  50. _Tp*
  51. get() const
  52. {
  53. return _M_ptr;
  54. }
  55. //! Destroy buffer
  56. ~__buffer() { _M_allocator.deallocate(_M_ptr, _M_buf_size); }
  57. };
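// Usage sketch (editorial addition, not part of the upstream header): __buffer hands out
// raw, uninitialized storage, so the caller checks the allocation and constructs/destroys
// elements explicitly; _ValueType and __n below are placeholders.
//
//   __tbb_backend::__buffer<_ValueType> __buf(__n);
//   if (__buf)                              // allocation succeeded
//   {
//       _ValueType* __p = __buf.get();      // uninitialized storage for __n objects
//       ::new (__p) _ValueType();           // caller constructs elements ...
//       __p->~_ValueType();                 // ... and destroys them before __buf frees the storage
//   }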
  58. // Wrapper for tbb::task
  59. inline void
  60. __cancel_execution()
  61. {
  62. #if TBB_INTERFACE_VERSION <= 12000
  63. tbb::task::self().group()->cancel_group_execution();
  64. #else
  65. tbb::task::current_context()->cancel_group_execution();
  66. #endif
  67. }
  68. //------------------------------------------------------------------------
  69. // parallel_for
  70. //------------------------------------------------------------------------
  71. template <class _Index, class _RealBody>
  72. class __parallel_for_body
  73. {
  74. public:
  75. __parallel_for_body(const _RealBody& __body) : _M_body(__body) {}
  76. __parallel_for_body(const __parallel_for_body& __body) : _M_body(__body._M_body) {}
  77. void
  78. operator()(const tbb::blocked_range<_Index>& __range) const
  79. {
  80. _M_body(__range.begin(), __range.end());
  81. }
  82. private:
  83. _RealBody _M_body;
  84. };
  85. //! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last)
  86. // wrapper over tbb::parallel_for
  87. template <class _ExecutionPolicy, class _Index, class _Fp>
  88. void
  89. __parallel_for(_ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f)
  90. {
  91. tbb::this_task_arena::isolate([=]() {
  92. tbb::parallel_for(tbb::blocked_range<_Index>(__first, __last), __parallel_for_body<_Index, _Fp>(__f));
  93. });
  94. }
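// Usage sketch (editorial addition): the "brick" __f is any callable taking a half-open
// index range [i, j); the execution policy argument is only forwarded and otherwise ignored
// here. __policy, __data, and __n are placeholders.
//
//   __tbb_backend::__parallel_for(__policy, _Index(0), _Index(__n),
//                                 [&](_Index __i, _Index __j) {
//                                     for (; __i != __j; ++__i)
//                                         __data[__i] += 1; // serial work on one subrange
//                                 });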
  95. //! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last), combining partial results with __reduction
  96. // wrapper over tbb::parallel_reduce
  97. template <class _ExecutionPolicy, class _Value, class _Index, typename _RealBody, typename _Reduction>
  98. _Value
  99. __parallel_reduce(_ExecutionPolicy&&, _Index __first, _Index __last, const _Value& __identity,
  100. const _RealBody& __real_body, const _Reduction& __reduction)
  101. {
  102. return tbb::this_task_arena::isolate([__first, __last, &__identity, &__real_body, &__reduction]() -> _Value {
  103. return tbb::parallel_reduce(
  104. tbb::blocked_range<_Index>(__first, __last), __identity,
  105. [__real_body](const tbb::blocked_range<_Index>& __r, const _Value& __value) -> _Value {
  106. return __real_body(__r.begin(), __r.end(), __value);
  107. },
  108. __reduction);
  109. });
  110. }
  111. //------------------------------------------------------------------------
  112. // parallel_transform_reduce
  113. //
  114. // Notation:
  115. // r(i,j,init) returns reduction of init with reduction over [i,j)
  116. // u(i) returns f(i,i+1,identity) for a hypothetical left identity element of r
  117. // c(x,y) combines values x and y that were the result of r or u
  118. //------------------------------------------------------------------------
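// Example mapping (editorial addition): for a sum of squares over a placeholder sequence __v,
// the notation above corresponds roughly to
//   u(i)          -> __v[i] * __v[i]
//   c(x, y)       -> x + y
//   r(i, j, init) -> init + __v[i]*__v[i] + ... + __v[j-1]*__v[j-1]
// so __parallel_transform_reduce(__policy, 0, __n, __u, _Tp(0), __c, __r) yields the total.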
  119. template <class _Index, class _Up, class _Tp, class _Cp, class _Rp>
  120. struct __par_trans_red_body
  121. {
  122. alignas(_Tp) char _M_sum_storage[sizeof(_Tp)]; // Holds generalized non-commutative sum when has_sum==true
  123. _Rp _M_brick_reduce; // Most likely to have non-empty layout
  124. _Up _M_u;
  125. _Cp _M_combine;
  126. bool _M_has_sum; // Put last to minimize size of class
  127. _Tp&
  128. sum()
  129. {
  130. _PSTL_ASSERT_MSG(_M_has_sum, "sum expected");
  131. return *(_Tp*)_M_sum_storage;
  132. }
  133. __par_trans_red_body(_Up __u, _Tp __init, _Cp __c, _Rp __r)
  134. : _M_brick_reduce(__r), _M_u(__u), _M_combine(__c), _M_has_sum(true)
  135. {
  136. new (_M_sum_storage) _Tp(__init);
  137. }
  138. __par_trans_red_body(__par_trans_red_body& __left, tbb::split)
  139. : _M_brick_reduce(__left._M_brick_reduce), _M_u(__left._M_u), _M_combine(__left._M_combine), _M_has_sum(false)
  140. {
  141. }
  142. ~__par_trans_red_body()
  143. {
  144. // 17.6.5.12 tells us to not worry about catching exceptions from destructors.
  145. if (_M_has_sum)
  146. sum().~_Tp();
  147. }
  148. void
  149. join(__par_trans_red_body& __rhs)
  150. {
  151. sum() = _M_combine(sum(), __rhs.sum());
  152. }
  153. void
  154. operator()(const tbb::blocked_range<_Index>& __range)
  155. {
  156. _Index __i = __range.begin();
  157. _Index __j = __range.end();
  158. if (!_M_has_sum)
  159. {
  160. _PSTL_ASSERT_MSG(__range.size() > 1, "there should be at least 2 elements");
  161. new (&_M_sum_storage)
  162. _Tp(_M_combine(_M_u(__i), _M_u(__i + 1))); // The condition i+1 < j is provided by the grain size of 3
  163. _M_has_sum = true;
  164. std::advance(__i, 2);
  165. if (__i == __j)
  166. return;
  167. }
  168. sum() = _M_brick_reduce(__i, __j, sum());
  169. }
  170. };
  171. template <class _ExecutionPolicy, class _Index, class _Up, class _Tp, class _Cp, class _Rp>
  172. _Tp
  173. __parallel_transform_reduce(_ExecutionPolicy&&, _Index __first, _Index __last, _Up __u, _Tp __init, _Cp __combine,
  174. _Rp __brick_reduce)
  175. {
  176. __tbb_backend::__par_trans_red_body<_Index, _Up, _Tp, _Cp, _Rp> __body(__u, __init, __combine, __brick_reduce);
  177. // The grain size of 3 is used in order to provide a minimum of 2 elements for each body
  178. tbb::this_task_arena::isolate(
  179. [__first, __last, &__body]() { tbb::parallel_reduce(tbb::blocked_range<_Index>(__first, __last, 3), __body); });
  180. return __body.sum();
  181. }
  182. //------------------------------------------------------------------------
  183. // parallel_scan
  184. //------------------------------------------------------------------------
  185. template <class _Index, class _Up, class _Tp, class _Cp, class _Rp, class _Sp>
  186. class __trans_scan_body
  187. {
  188. alignas(_Tp) char _M_sum_storage[sizeof(_Tp)]; // Holds generalized non-commutative sum when has_sum==true
  189. _Rp _M_brick_reduce; // Most likely to have non-empty layout
  190. _Up _M_u;
  191. _Cp _M_combine;
  192. _Sp _M_scan;
  193. bool _M_has_sum; // Put last to minimize size of class
  194. public:
  195. __trans_scan_body(_Up __u, _Tp __init, _Cp __combine, _Rp __reduce, _Sp __scan)
  196. : _M_brick_reduce(__reduce), _M_u(__u), _M_combine(__combine), _M_scan(__scan), _M_has_sum(true)
  197. {
  198. new (_M_sum_storage) _Tp(__init);
  199. }
  200. __trans_scan_body(__trans_scan_body& __b, tbb::split)
  201. : _M_brick_reduce(__b._M_brick_reduce), _M_u(__b._M_u), _M_combine(__b._M_combine), _M_scan(__b._M_scan),
  202. _M_has_sum(false)
  203. {
  204. }
  205. ~__trans_scan_body()
  206. {
  207. // 17.6.5.12 tells us to not worry about catching exceptions from destructors.
  208. if (_M_has_sum)
  209. sum().~_Tp();
  210. }
  211. _Tp&
  212. sum() const
  213. {
  214. _PSTL_ASSERT_MSG(_M_has_sum, "sum expected");
  215. return *const_cast<_Tp*>(reinterpret_cast<_Tp const*>(_M_sum_storage));
  216. }
  217. void
  218. operator()(const tbb::blocked_range<_Index>& __range, tbb::pre_scan_tag)
  219. {
  220. _Index __i = __range.begin();
  221. _Index __j = __range.end();
  222. if (!_M_has_sum)
  223. {
  224. new (&_M_sum_storage) _Tp(_M_u(__i));
  225. _M_has_sum = true;
  226. ++__i;
  227. if (__i == __j)
  228. return;
  229. }
  230. sum() = _M_brick_reduce(__i, __j, sum());
  231. }
  232. void
  233. operator()(const tbb::blocked_range<_Index>& __range, tbb::final_scan_tag)
  234. {
  235. sum() = _M_scan(__range.begin(), __range.end(), sum());
  236. }
  237. void
  238. reverse_join(__trans_scan_body& __a)
  239. {
  240. if (_M_has_sum)
  241. {
  242. sum() = _M_combine(__a.sum(), sum());
  243. }
  244. else
  245. {
  246. new (&_M_sum_storage) _Tp(__a.sum());
  247. _M_has_sum = true;
  248. }
  249. }
  250. void
  251. assign(__trans_scan_body& __b)
  252. {
  253. sum() = __b.sum();
  254. }
  255. };
  256. template <typename _Index>
  257. _Index
  258. __split(_Index __m)
  259. {
  260. _Index __k = 1;
  261. while (2 * __k < __m)
  262. __k *= 2;
  263. return __k;
  264. }
  265. //------------------------------------------------------------------------
  266. // __parallel_strict_scan
  267. //------------------------------------------------------------------------
  268. template <typename _Index, typename _Tp, typename _Rp, typename _Cp>
  269. void
  270. __upsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Rp __reduce, _Cp __combine)
  271. {
  272. if (__m == 1)
  273. __r[0] = __reduce(__i * __tilesize, __lastsize);
  274. else
  275. {
  276. _Index __k = __split(__m);
  277. tbb::parallel_invoke(
  278. [=] { __tbb_backend::__upsweep(__i, __k, __tilesize, __r, __tilesize, __reduce, __combine); },
  279. [=] {
  280. __tbb_backend::__upsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, __reduce, __combine);
  281. });
  282. if (__m == 2 * __k)
  283. __r[__m - 1] = __combine(__r[__k - 1], __r[__m - 1]);
  284. }
  285. }
  286. template <typename _Index, typename _Tp, typename _Cp, typename _Sp>
  287. void
  288. __downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine,
  289. _Sp __scan)
  290. {
  291. if (__m == 1)
  292. __scan(__i * __tilesize, __lastsize, __initial);
  293. else
  294. {
  295. const _Index __k = __split(__m);
  296. tbb::parallel_invoke(
  297. [=] { __tbb_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, __scan); },
  298. // Assumes that __combine never throws.
  299. //TODO: Consider adding a requirement for user functors to be constant.
  300. [=, &__combine] {
  301. __tbb_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize,
  302. __combine(__initial, __r[__k - 1]), __combine, __scan);
  303. });
  304. }
  305. }
  306. // Adapted from Intel(R) Cilk(TM) version from cilkpub.
  307. // Let i:len denote a counted interval of length len starting at i. s denotes a generalized-sum value.
  308. // Expected actions of the functors are:
  309. // reduce(i,len) -> s -- return reduction value of i:len.
  310. // combine(s1,s2) -> s -- return merged sum
  311. // apex(s) -- do any processing necessary between reduce and scan.
  312. // scan(i,len,initial) -- perform scan over i:len starting with initial.
  313. // The initial range 0:n is partitioned into consecutive subranges.
  314. // reduce and scan are each called exactly once per subrange.
  315. // Thus callers can rely upon side effects in reduce.
  316. // combine must not throw an exception.
  317. // apex is called exactly once, after all calls to reduce and before all calls to scan.
  318. // For example, it's useful for allocating a __buffer used by scan but whose size is the sum of all reduction values.
  319. // T must have a trivial constructor and destructor.
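// Usage sketch (editorial addition): a count-then-write pass in the style of pack/copy_if.
// __flags and __out are placeholders; __apex observes the grand total before any scan call,
// e.g. to size an output buffer.
//
//   __tbb_backend::__parallel_strict_scan(__policy, __n, _Size(0),
//       /*reduce */ [&](_Size __i, _Size __len) -> _Size
//                   { return std::count(__flags + __i, __flags + __i + __len, true); },
//       /*combine*/ [](_Size __a, _Size __b) { return __a + __b; },
//       /*scan   */ [&](_Size __i, _Size __len, _Size __initial) {
//                       for (_Size __k = 0; __k != __len; ++__k)
//                           if (__flags[__i + __k])
//                               __out[__initial++] = __i + __k;
//                   },
//       /*apex   */ [&](_Size __total) { /* e.g. allocate __out with __total slots */ });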
  320. template <class _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp, typename _Ap>
  321. void
  322. __parallel_strict_scan(_ExecutionPolicy&&, _Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan,
  323. _Ap __apex)
  324. {
  325. tbb::this_task_arena::isolate([=, &__combine]() {
  326. if (__n > 1)
  327. {
  328. _Index __p = tbb::this_task_arena::max_concurrency();
  329. const _Index __slack = 4;
  330. _Index __tilesize = (__n - 1) / (__slack * __p) + 1;
  331. _Index __m = (__n - 1) / __tilesize;
  332. __buffer<_Tp> __buf(__m + 1);
  333. _Tp* __r = __buf.get();
  334. __tbb_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __reduce,
  335. __combine);
  336. // When __apex is a no-op and __combine has no side effects, a good optimizer
  337. // should be able to eliminate all code between here and __apex.
  338. // Alternatively, provide a default value for __apex that can be
  339. // recognized by metaprogramming that conditionally executes the following.
  340. size_t __k = __m + 1;
  341. _Tp __t = __r[__k - 1];
  342. while ((__k &= __k - 1))
  343. __t = __combine(__r[__k - 1], __t);
  344. __apex(__combine(__initial, __t));
  345. __tbb_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __initial,
  346. __combine, __scan);
  347. return;
  348. }
  349. // Fewer than 2 elements in the sequence, or out of memory. Handle as a single block.
  350. _Tp __sum = __initial;
  351. if (__n)
  352. __sum = __combine(__sum, __reduce(_Index(0), __n));
  353. __apex(__sum);
  354. if (__n)
  355. __scan(_Index(0), __n, __initial);
  356. });
  357. }
  358. template <class _ExecutionPolicy, class _Index, class _Up, class _Tp, class _Cp, class _Rp, class _Sp>
  359. _Tp
  360. __parallel_transform_scan(_ExecutionPolicy&&, _Index __n, _Up __u, _Tp __init, _Cp __combine, _Rp __brick_reduce,
  361. _Sp __scan)
  362. {
  363. __trans_scan_body<_Index, _Up, _Tp, _Cp, _Rp, _Sp> __body(__u, __init, __combine, __brick_reduce, __scan);
  364. auto __range = tbb::blocked_range<_Index>(0, __n);
  365. tbb::this_task_arena::isolate([__range, &__body]() { tbb::parallel_scan(__range, __body); });
  366. return __body.sum();
  367. }
  368. //------------------------------------------------------------------------
  369. // parallel_stable_sort
  370. //------------------------------------------------------------------------
  371. //------------------------------------------------------------------------
  372. // stable_sort utilities
  373. //
  374. // These are used by parallel implementations but do not depend on them.
  375. //------------------------------------------------------------------------
  376. #define _PSTL_MERGE_CUT_OFF 2000
  377. template <typename _Func>
  378. class __func_task;
  379. template <typename _Func>
  380. class __root_task;
  381. #if TBB_INTERFACE_VERSION <= 12000
  382. class __task : public tbb::task
  383. {
  384. public:
  385. template <typename _Fn>
  386. __task*
  387. make_continuation(_Fn&& __f)
  388. {
  389. return new (allocate_continuation()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
  390. }
  391. template <typename _Fn>
  392. __task*
  393. make_child_of(__task* parent, _Fn&& __f)
  394. {
  395. return new (parent->allocate_child()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
  396. }
  397. template <typename _Fn>
  398. __task*
  399. make_additional_child_of(tbb::task* parent, _Fn&& __f)
  400. {
  401. return new (tbb::task::allocate_additional_child_of(*parent))
  402. __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
  403. }
  404. inline void
  405. recycle_as_continuation()
  406. {
  407. tbb::task::recycle_as_continuation();
  408. }
  409. inline void
  410. recycle_as_child_of(__task* parent)
  411. {
  412. tbb::task::recycle_as_child_of(*parent);
  413. }
  414. inline void
  415. spawn(__task* __t)
  416. {
  417. tbb::task::spawn(*__t);
  418. }
  419. template <typename _Fn>
  420. static inline void
  421. spawn_root_and_wait(__root_task<_Fn>& __root)
  422. {
  423. tbb::task::spawn_root_and_wait(*__root._M_task);
  424. }
  425. };
  426. template <typename _Func>
  427. class __func_task : public __task
  428. {
  429. _Func _M_func;
  430. tbb::task*
  431. execute()
  432. {
  433. return _M_func(this);
  434. };
  435. public:
  436. template <typename _Fn>
  437. __func_task(_Fn&& __f) : _M_func{std::forward<_Fn>(__f)}
  438. {
  439. }
  440. _Func&
  441. body()
  442. {
  443. return _M_func;
  444. }
  445. };
  446. template <typename _Func>
  447. class __root_task
  448. {
  449. tbb::task* _M_task;
  450. public:
  451. template <typename... Args>
  452. __root_task(Args&&... args)
  453. : _M_task{new (tbb::task::allocate_root()) __func_task<_Func>{_Func(std::forward<Args>(args)...)}}
  454. {
  455. }
  456. friend class __task;
  457. friend class __func_task<_Func>;
  458. };
  459. #else // TBB_INTERFACE_VERSION <= 12000
  460. class __task : public tbb::detail::d1::task
  461. {
  462. protected:
  463. tbb::detail::d1::small_object_allocator _M_allocator{};
  464. tbb::detail::d1::execution_data* _M_execute_data{};
  465. __task* _M_parent{};
  466. std::atomic<int> _M_refcount{};
  467. bool _M_recycle{};
  468. template <typename _Fn>
  469. __task*
  470. allocate_func_task(_Fn&& __f)
  471. {
  472. _PSTL_ASSERT(_M_execute_data != nullptr);
  473. tbb::detail::d1::small_object_allocator __alloc{};
  474. auto __t =
  475. __alloc.new_object<__func_task<typename std::decay<_Fn>::type>>(*_M_execute_data, std::forward<_Fn>(__f));
  476. __t->_M_allocator = __alloc;
  477. return __t;
  478. }
  479. public:
  480. __task*
  481. parent()
  482. {
  483. return _M_parent;
  484. }
  485. void
  486. set_ref_count(int __n)
  487. {
  488. _M_refcount.store(__n, std::memory_order_release);
  489. }
  490. template <typename _Fn>
  491. __task*
  492. make_continuation(_Fn&& __f)
  493. {
  494. auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
  495. __t->_M_parent = _M_parent;
  496. _M_parent = nullptr;
  497. return __t;
  498. }
  499. template <typename _Fn>
  500. __task*
  501. make_child_of(__task* __parent, _Fn&& __f)
  502. {
  503. auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
  504. __t->_M_parent = __parent;
  505. return __t;
  506. }
  507. template <typename _Fn>
  508. __task*
  509. make_additional_child_of(__task* __parent, _Fn&& __f)
  510. {
  511. auto __t = make_child_of(__parent, std::forward<_Fn>(__f));
  512. _PSTL_ASSERT(__parent->_M_refcount.load(std::memory_order_relaxed) > 0);
  513. ++__parent->_M_refcount;
  514. return __t;
  515. }
  516. inline void
  517. recycle_as_continuation()
  518. {
  519. _M_recycle = true;
  520. }
  521. inline void
  522. recycle_as_child_of(__task* parent)
  523. {
  524. _M_recycle = true;
  525. _M_parent = parent;
  526. }
  527. inline void
  528. spawn(__task* __t)
  529. {
  530. _PSTL_ASSERT(_M_execute_data != nullptr);
  531. tbb::detail::d1::spawn(*__t, *_M_execute_data->context);
  532. }
  533. template <typename _Fn>
  534. static inline void
  535. spawn_root_and_wait(__root_task<_Fn>& __root)
  536. {
  537. tbb::detail::d1::execute_and_wait(*__root._M_func_task, __root._M_context, __root._M_wait_object,
  538. __root._M_context);
  539. }
  540. template <typename _Func>
  541. friend class __func_task;
  542. };
  543. template <typename _Func>
  544. class __func_task : public __task
  545. {
  546. _Func _M_func;
  547. __task*
  548. execute(tbb::detail::d1::execution_data& __ed) override
  549. {
  550. _M_execute_data = &__ed;
  551. _M_recycle = false;
  552. __task* __next = _M_func(this);
  553. return finalize(__next);
  554. };
  555. __task*
  556. cancel(tbb::detail::d1::execution_data& __ed) override
  557. {
  558. return finalize(nullptr);
  559. }
  560. __task*
  561. finalize(__task* __next)
  562. {
  563. bool __recycle = _M_recycle;
  564. _M_recycle = false;
  565. if (__recycle)
  566. {
  567. return __next;
  568. }
  569. auto __parent = _M_parent;
  570. auto __alloc = _M_allocator;
  571. auto __ed = _M_execute_data;
  572. this->~__func_task();
  573. _PSTL_ASSERT(__parent != nullptr);
  574. _PSTL_ASSERT(__parent->_M_refcount.load(std::memory_order_relaxed) > 0);
  575. if (--__parent->_M_refcount == 0)
  576. {
  577. _PSTL_ASSERT(__next == nullptr);
  578. __alloc.deallocate(this, *__ed);
  579. return __parent;
  580. }
  581. return __next;
  582. }
  583. friend class __root_task<_Func>;
  584. public:
  585. template <typename _Fn>
  586. __func_task(_Fn&& __f) : _M_func(std::forward<_Fn>(__f))
  587. {
  588. }
  589. _Func&
  590. body()
  591. {
  592. return _M_func;
  593. }
  594. };
  595. template <typename _Func>
  596. class __root_task : public __task
  597. {
  598. __task*
  599. execute(tbb::detail::d1::execution_data& __ed) override
  600. {
  601. _M_wait_object.release();
  602. return nullptr;
  603. };
  604. __task*
  605. cancel(tbb::detail::d1::execution_data& __ed) override
  606. {
  607. _M_wait_object.release();
  608. return nullptr;
  609. }
  610. __func_task<_Func>* _M_func_task{};
  611. tbb::detail::d1::wait_context _M_wait_object{0};
  612. tbb::task_group_context _M_context{};
  613. public:
  614. template <typename... Args>
  615. __root_task(Args&&... args) : _M_wait_object{1}
  616. {
  617. tbb::detail::d1::small_object_allocator __alloc{};
  618. _M_func_task = __alloc.new_object<__func_task<_Func>>(_Func(std::forward<Args>(args)...));
  619. _M_func_task->_M_allocator = __alloc;
  620. _M_func_task->_M_parent = this;
  621. _M_refcount.store(1, std::memory_order_relaxed);
  622. }
  623. friend class __task;
  624. };
  625. #endif // TBB_INTERFACE_VERSION <= 12000
  626. template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _Cleanup,
  627. typename _LeafMerge>
  628. class __merge_func
  629. {
  630. typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
  631. typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
  632. typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
  633. typedef typename std::iterator_traits<_RandomAccessIterator1>::value_type _ValueType;
  634. _RandomAccessIterator1 _M_x_beg;
  635. _RandomAccessIterator2 _M_z_beg;
  636. _SizeType _M_xs, _M_xe;
  637. _SizeType _M_ys, _M_ye;
  638. _SizeType _M_zs;
  639. _Compare _M_comp;
  640. _LeafMerge _M_leaf_merge;
  641. _SizeType _M_nsort; //number of elements to be sorted for the partial_sort algorithm
  642. static const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
  643. bool _root; //"true" means this task is the root merging task
  644. bool _x_orig; //"true" means the X (left)  subrange is in the original container; "false" - in the buffer
  645. bool _y_orig; //"true" means the Y (right) subrange is in the original container; "false" - in the buffer
  646. bool _split; //"true" means this merge task is a split task for parallel merging; the execution logic differs
  647. bool
  648. is_partial() const
  649. {
  650. return _M_nsort > 0;
  651. }
  652. struct __move_value
  653. {
  654. template <typename Iterator1, typename Iterator2>
  655. void
  656. operator()(Iterator1 __x, Iterator2 __z)
  657. {
  658. *__z = std::move(*__x);
  659. }
  660. };
  661. struct __move_value_construct
  662. {
  663. template <typename Iterator1, typename Iterator2>
  664. void
  665. operator()(Iterator1 __x, Iterator2 __z)
  666. {
  667. ::new (std::addressof(*__z)) _ValueType(std::move(*__x));
  668. }
  669. };
  670. struct __move_range
  671. {
  672. template <typename Iterator1, typename Iterator2>
  673. Iterator2
  674. operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
  675. {
  676. if (__last1 - __first1 < __merge_cut_off)
  677. return std::move(__first1, __last1, __first2);
  678. auto __n = __last1 - __first1;
  679. tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
  680. [__first1, __first2](const tbb::blocked_range<_SizeType>& __range) {
  681. std::move(__first1 + __range.begin(), __first1 + __range.end(),
  682. __first2 + __range.begin());
  683. });
  684. return __first2 + __n;
  685. }
  686. };
  687. struct __move_range_construct
  688. {
  689. template <typename Iterator1, typename Iterator2>
  690. Iterator2
  691. operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
  692. {
  693. if (__last1 - __first1 < __merge_cut_off)
  694. {
  695. for (; __first1 != __last1; ++__first1, ++__first2)
  696. __move_value_construct()(__first1, __first2);
  697. return __first2;
  698. }
  699. auto __n = __last1 - __first1;
  700. tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
  701. [__first1, __first2](const tbb::blocked_range<_SizeType>& __range) {
  702. for (auto i = __range.begin(); i != __range.end(); ++i)
  703. __move_value_construct()(__first1 + i, __first2 + i);
  704. });
  705. return __first2 + __n;
  706. }
  707. };
  708. struct __cleanup_range
  709. {
  710. template <typename Iterator>
  711. void
  712. operator()(Iterator __first, Iterator __last)
  713. {
  714. if (__last - __first < __merge_cut_off)
  715. _Cleanup()(__first, __last);
  716. else
  717. {
  718. auto __n = __last - __first;
  719. tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
  720. [__first](const tbb::blocked_range<_SizeType>& __range) {
  721. _Cleanup()(__first + __range.begin(), __first + __range.end());
  722. });
  723. }
  724. }
  725. };
  726. public:
  727. __merge_func(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp,
  728. _Cleanup, _LeafMerge __leaf_merge, _SizeType __nsort, _RandomAccessIterator1 __x_beg,
  729. _RandomAccessIterator2 __z_beg, bool __x_orig, bool __y_orig, bool __root)
  730. : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_x_beg(__x_beg), _M_z_beg(__z_beg),
  731. _M_comp(__comp), _M_leaf_merge(__leaf_merge), _M_nsort(__nsort), _root(__root),
  732. _x_orig(__x_orig), _y_orig(__y_orig), _split(false)
  733. {
  734. }
  735. bool
  736. is_left(_SizeType __idx) const
  737. {
  738. return _M_xs == __idx;
  739. }
  740. template <typename IndexType>
  741. void
  742. set_odd(IndexType __idx, bool __on_off)
  743. {
  744. if (is_left(__idx))
  745. _x_orig = __on_off;
  746. else
  747. _y_orig = __on_off;
  748. }
  749. __task*
  750. operator()(__task* __self);
  751. private:
  752. __merge_func*
  753. parent_merge(__task* __self) const
  754. {
  755. return _root ? nullptr : &static_cast<__func_task<__merge_func>*>(__self->parent())->body();
  756. }
  757. bool
  758. x_less_y()
  759. {
  760. const auto __nx = (_M_xe - _M_xs);
  761. const auto __ny = (_M_ye - _M_ys);
  762. _PSTL_ASSERT(__nx > 0 && __ny > 0);
  763. _PSTL_ASSERT(_x_orig == _y_orig);
  764. _PSTL_ASSERT(!is_partial());
  765. if (_x_orig)
  766. {
  767. _PSTL_ASSERT(std::is_sorted(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_comp));
  768. _PSTL_ASSERT(std::is_sorted(_M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_comp));
  769. return !_M_comp(*(_M_x_beg + _M_ys), *(_M_x_beg + _M_xe - 1));
  770. }
  771. _PSTL_ASSERT(std::is_sorted(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_comp));
  772. _PSTL_ASSERT(std::is_sorted(_M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_comp));
  773. return !_M_comp(*(_M_z_beg + _M_zs + __nx), *(_M_z_beg + _M_zs + __nx - 1));
  774. }
  775. void
  776. move_x_range()
  777. {
  778. const auto __nx = (_M_xe - _M_xs);
  779. const auto __ny = (_M_ye - _M_ys);
  780. _PSTL_ASSERT(__nx > 0 && __ny > 0);
  781. if (_x_orig)
  782. __move_range_construct()(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_z_beg + _M_zs);
  783. else
  784. {
  785. __move_range()(_M_z_beg + _M_zs, _M_z_beg + _M_zs + __nx, _M_x_beg + _M_xs);
  786. __cleanup_range()(_M_z_beg + _M_zs, _M_z_beg + _M_zs + __nx);
  787. }
  788. _x_orig = !_x_orig;
  789. }
  790. void
  791. move_y_range()
  792. {
  793. const auto __nx = (_M_xe - _M_xs);
  794. const auto __ny = (_M_ye - _M_ys);
  795. if (_y_orig)
  796. __move_range_construct()(_M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs + __nx);
  797. else
  798. {
  799. __move_range()(_M_z_beg + _M_zs + __nx, _M_z_beg + _M_zs + __nx + __ny, _M_x_beg + _M_ys);
  800. __cleanup_range()(_M_z_beg + _M_zs + __nx, _M_z_beg + _M_zs + __nx + __ny);
  801. }
  802. _y_orig = !_y_orig;
  803. }
  804. __task*
  805. merge_ranges(__task* __self)
  806. {
  807. _PSTL_ASSERT(_x_orig == _y_orig); //the two merged subranges must lie in the same buffer
  808. const auto __nx = (_M_xe - _M_xs);
  809. const auto __ny = (_M_ye - _M_ys);
  810. const auto __n = __nx + __ny;
  811. // need to merge {x} and {y}
  812. if (__n > __merge_cut_off)
  813. return split_merging(__self);
  814. //merge to buffer
  815. if (_x_orig)
  816. {
  817. _M_leaf_merge(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs,
  818. _M_comp, __move_value_construct(), __move_value_construct(), __move_range_construct(),
  819. __move_range_construct());
  820. _PSTL_ASSERT(parent_merge(__self)); //not root merging task
  821. }
  822. //merge to "origin"
  823. else
  824. {
  825. _PSTL_ASSERT(_x_orig == _y_orig);
  826. _PSTL_ASSERT(is_partial() || std::is_sorted(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_comp));
  827. _PSTL_ASSERT(is_partial() || std::is_sorted(_M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_comp));
  828. const auto __nx = (_M_xe - _M_xs);
  829. const auto __ny = (_M_ye - _M_ys);
  830. _M_leaf_merge(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_x_beg + _M_zs,
  831. _M_comp, __move_value(), __move_value(), __move_range(), __move_range());
  832. __cleanup_range()(_M_z_beg + _M_xs, _M_z_beg + _M_xe);
  833. __cleanup_range()(_M_z_beg + _M_ys, _M_z_beg + _M_ye);
  834. }
  835. return nullptr;
  836. }
  837. __task*
  838. process_ranges(__task* __self)
  839. {
  840. _PSTL_ASSERT(_x_orig == _y_orig);
  841. _PSTL_ASSERT(!_split);
  842. auto p = parent_merge(__self);
  843. if (!p)
  844. { //root merging task
  845. //optimization, just for the sort algorithm: {x} <= {y}
  846. if (!is_partial() && x_less_y()) //we have a solution
  847. {
  848. if (!_x_orig)
  849. { //we have to move the solution to the origin
  850. move_x_range(); //parallel moving
  851. move_y_range(); //parallel moving
  852. }
  853. return nullptr;
  854. }
  855. //else: if we have data in the origin,
  856. //we have to move data to the buffer for final merging into the origin.
  857. if (_x_orig)
  858. {
  859. move_x_range(); //parallel moving
  860. move_y_range(); //parallel moving
  861. }
  862. // need to merge {x} and {y}.
  863. return merge_ranges(__self);
  864. }
  865. //else: not the root merging task (parent_merge() != nullptr)
  866. //optimization, just for the sort algorithm: {x} <= {y}
  867. if (!is_partial() && x_less_y())
  868. {
  869. const auto id_range = _M_zs;
  870. p->set_odd(id_range, _x_orig);
  871. return nullptr;
  872. }
  873. //else: we have to revert "_x(y)_orig" flag of the parent merging task
  874. const auto id_range = _M_zs;
  875. p->set_odd(id_range, !_x_orig);
  876. return merge_ranges(__self);
  877. }
  878. //split the merge task into 2 tasks of the same level
  879. __task*
  880. split_merging(__task* __self)
  881. {
  882. _PSTL_ASSERT(_x_orig == _y_orig);
  883. const auto __nx = (_M_xe - _M_xs);
  884. const auto __ny = (_M_ye - _M_ys);
  885. _SizeType __xm{};
  886. _SizeType __ym{};
  887. if (__nx < __ny)
  888. {
  889. __ym = _M_ys + __ny / 2;
  890. if (_x_orig)
  891. __xm = std::upper_bound(_M_x_beg + _M_xs, _M_x_beg + _M_xe, *(_M_x_beg + __ym), _M_comp) - _M_x_beg;
  892. else
  893. __xm = std::upper_bound(_M_z_beg + _M_xs, _M_z_beg + _M_xe, *(_M_z_beg + __ym), _M_comp) - _M_z_beg;
  894. }
  895. else
  896. {
  897. __xm = _M_xs + __nx / 2;
  898. if (_y_orig)
  899. __ym = std::lower_bound(_M_x_beg + _M_ys, _M_x_beg + _M_ye, *(_M_x_beg + __xm), _M_comp) - _M_x_beg;
  900. else
  901. __ym = std::lower_bound(_M_z_beg + _M_ys, _M_z_beg + _M_ye, *(_M_z_beg + __xm), _M_comp) - _M_z_beg;
  902. }
  903. auto __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
  904. __merge_func __right_func(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort,
  905. _M_x_beg, _M_z_beg, _x_orig, _y_orig, _root);
  906. __right_func._split = true;
  907. auto __merge_task = __self->make_additional_child_of(__self->parent(), std::move(__right_func));
  908. __self->spawn(__merge_task);
  909. __self->recycle_as_continuation();
  910. _M_xe = __xm;
  911. _M_ye = __ym;
  912. _split = true;
  913. return __self;
  914. }
  915. };
  916. template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename __M_Compare, typename _Cleanup,
  917. typename _LeafMerge>
  918. __task*
  919. __merge_func<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>::
  920. operator()(__task* __self)
  921. {
  922. //a. split the merge task into 2 tasks of the same level; special logic,
  923. //without processing (process_ranges) the adjacent sub-ranges x and y
  924. if (_split)
  925. return merge_ranges(__self);
  926. //b. General merging of adjacent sub-ranges x and y (with optimization in case of {x} <= {y} )
  927. //1. x and y are in the even buffer
  928. //2. x and y are in the odd buffer
  929. if (_x_orig == _y_orig)
  930. return process_ranges(__self);
  931. //3. x is in even buffer, y is in the odd buffer
  932. //4. x is in odd buffer, y is in the even buffer
  933. if (!parent_merge(__self))
  934. { //root merge task
  935. if (_x_orig)
  936. move_x_range();
  937. else
  938. move_y_range();
  939. }
  940. else
  941. {
  942. const _SizeType __nx = (_M_xe - _M_xs);
  943. const _SizeType __ny = (_M_ye - _M_ys);
  944. _PSTL_ASSERT(__nx > 0);
  945. _PSTL_ASSERT(__ny > 0);
  946. if (__nx < __ny)
  947. move_x_range();
  948. else
  949. move_y_range();
  950. }
  951. return process_ranges(__self);
  952. }
  953. template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
  954. class __stable_sort_func
  955. {
  956. public:
  957. typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
  958. typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
  959. typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
  960. private:
  961. _RandomAccessIterator1 _M_xs, _M_xe, _M_x_beg;
  962. _RandomAccessIterator2 _M_zs, _M_z_beg;
  963. _Compare _M_comp;
  964. _LeafSort _M_leaf_sort;
  965. bool _M_root;
  966. _SizeType _M_nsort; //zero or number of elements to be sorted for the partial_sort algorithm
  967. public:
  968. __stable_sort_func(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs,
  969. bool __root, _Compare __comp, _LeafSort __leaf_sort, _SizeType __nsort,
  970. _RandomAccessIterator1 __x_beg, _RandomAccessIterator2 __z_beg)
  971. : _M_xs(__xs), _M_xe(__xe), _M_x_beg(__x_beg), _M_zs(__zs), _M_z_beg(__z_beg), _M_comp(__comp),
  972. _M_leaf_sort(__leaf_sort), _M_root(__root), _M_nsort(__nsort)
  973. {
  974. }
  975. __task*
  976. operator()(__task* __self);
  977. };
  978. #define _PSTL_STABLE_SORT_CUT_OFF 500
  979. template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
  980. __task*
  981. __stable_sort_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::operator()(__task* __self)
  982. {
  983. typedef __merge_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy,
  984. __utils::__serial_move_merge>
  985. _MergeTaskType;
  986. const _SizeType __n = _M_xe - _M_xs;
  987. const _SizeType __nmerge = _M_nsort > 0 ? _M_nsort : __n;
  988. const _SizeType __sort_cut_off = _PSTL_STABLE_SORT_CUT_OFF;
  989. if (__n <= __sort_cut_off)
  990. {
  991. _M_leaf_sort(_M_xs, _M_xe, _M_comp);
  992. _PSTL_ASSERT(!_M_root);
  993. return nullptr;
  994. }
  995. const _RandomAccessIterator1 __xm = _M_xs + __n / 2;
  996. const _RandomAccessIterator2 __zm = _M_zs + (__xm - _M_xs);
  997. const _RandomAccessIterator2 __ze = _M_zs + __n;
  998. _MergeTaskType __m(_MergeTaskType(_M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg,
  999. _M_zs - _M_z_beg, _M_comp, __utils::__serial_destroy(),
  1000. __utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg,
  1001. /*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root));
  1002. auto __parent = __self->make_continuation(std::move(__m));
  1003. __parent->set_ref_count(2);
  1004. auto __right = __self->make_child_of(
  1005. __parent, __stable_sort_func(__xm, _M_xe, __zm, false, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg));
  1006. __self->spawn(__right);
  1007. __self->recycle_as_child_of(__parent);
  1008. _M_root = false;
  1009. _M_xe = __xm;
  1010. return __self;
  1011. }
  1012. template <class _ExecutionPolicy, typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
  1013. void
  1014. __parallel_stable_sort(_ExecutionPolicy&&, _RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp,
  1015. _LeafSort __leaf_sort, std::size_t __nsort = 0)
  1016. {
  1017. tbb::this_task_arena::isolate([=, &__nsort]() {
  1018. //sorting based on task tree and parallel merge
  1019. typedef typename std::iterator_traits<_RandomAccessIterator>::value_type _ValueType;
  1020. typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type _DifferenceType;
  1021. const _DifferenceType __n = __xe - __xs;
  1022. if (__nsort == __n)
  1023. __nsort = 0; // 'partial_sort' becomes 'sort'
  1024. const _DifferenceType __sort_cut_off = _PSTL_STABLE_SORT_CUT_OFF;
  1025. if (__n > __sort_cut_off)
  1026. {
  1027. __buffer<_ValueType> __buf(__n);
  1028. __root_task<__stable_sort_func<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>> __root{
  1029. __xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get()};
  1030. __task::spawn_root_and_wait(__root);
  1031. return;
  1032. }
  1033. //serial sort
  1034. __leaf_sort(__xs, __xe, __comp);
  1035. });
  1036. }
  1037. //------------------------------------------------------------------------
  1038. // parallel_merge
  1039. //------------------------------------------------------------------------
  1040. template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
  1041. typename _Compare, typename _LeafMerge>
  1042. class __merge_func_static
  1043. {
  1044. _RandomAccessIterator1 _M_xs, _M_xe;
  1045. _RandomAccessIterator2 _M_ys, _M_ye;
  1046. _RandomAccessIterator3 _M_zs;
  1047. _Compare _M_comp;
  1048. _LeafMerge _M_leaf_merge;
  1049. public:
  1050. __merge_func_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys,
  1051. _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp,
  1052. _LeafMerge __leaf_merge)
  1053. : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_comp(__comp), _M_leaf_merge(__leaf_merge)
  1054. {
  1055. }
  1056. __task*
  1057. operator()(__task* __self);
  1058. };
  1059. //TODO: consider usage of parallel_for with a custom blocked_range
  1060. template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
  1061. typename __M_Compare, typename _LeafMerge>
  1062. __task*
  1063. __merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare, _LeafMerge>::
  1064. operator()(__task* __self)
  1065. {
  1066. typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
  1067. typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
  1068. typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
  1069. const _SizeType __n = (_M_xe - _M_xs) + (_M_ye - _M_ys);
  1070. const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
  1071. if (__n <= __merge_cut_off)
  1072. {
  1073. _M_leaf_merge(_M_xs, _M_xe, _M_ys, _M_ye, _M_zs, _M_comp);
  1074. return nullptr;
  1075. }
  1076. _RandomAccessIterator1 __xm;
  1077. _RandomAccessIterator2 __ym;
  1078. if (_M_xe - _M_xs < _M_ye - _M_ys)
  1079. {
  1080. __ym = _M_ys + (_M_ye - _M_ys) / 2;
  1081. __xm = std::upper_bound(_M_xs, _M_xe, *__ym, _M_comp);
  1082. }
  1083. else
  1084. {
  1085. __xm = _M_xs + (_M_xe - _M_xs) / 2;
  1086. __ym = std::lower_bound(_M_ys, _M_ye, *__xm, _M_comp);
  1087. }
  1088. const _RandomAccessIterator3 __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
  1089. auto __right = __self->make_additional_child_of(
  1090. __self->parent(), __merge_func_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge));
  1091. __self->spawn(__right);
  1092. __self->recycle_as_continuation();
  1093. _M_xe = __xm;
  1094. _M_ye = __ym;
  1095. return __self;
  1096. }
  1097. template <class _ExecutionPolicy, typename _RandomAccessIterator1, typename _RandomAccessIterator2,
  1098. typename _RandomAccessIterator3, typename _Compare, typename _LeafMerge>
  1099. void
  1100. __parallel_merge(_ExecutionPolicy&&, _RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe,
  1101. _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp,
  1102. _LeafMerge __leaf_merge)
  1103. {
  1104. typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
  1105. typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
  1106. typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
  1107. const _SizeType __n = (__xe - __xs) + (__ye - __ys);
  1108. const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
  1109. if (__n <= __merge_cut_off)
  1110. {
  1111. // Fall back on serial merge
  1112. __leaf_merge(__xs, __xe, __ys, __ye, __zs, __comp);
  1113. }
  1114. else
  1115. {
  1116. tbb::this_task_arena::isolate([=]() {
  1117. typedef __merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3,
  1118. _Compare, _LeafMerge>
  1119. _TaskType;
  1120. __root_task<_TaskType> __root{__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge};
  1121. __task::spawn_root_and_wait(__root);
  1122. });
  1123. }
  1124. }
  1125. //------------------------------------------------------------------------
  1126. // parallel_invoke
  1127. //------------------------------------------------------------------------
  1128. template <class _ExecutionPolicy, typename _F1, typename _F2>
  1129. void
  1130. __parallel_invoke(_ExecutionPolicy&&, _F1&& __f1, _F2&& __f2)
  1131. {
  1132. //TODO: a version of tbb::this_task_arena::isolate with variadic arguments pack should be added in the future
  1133. tbb::this_task_arena::isolate([&]() { tbb::parallel_invoke(std::forward<_F1>(__f1), std::forward<_F2>(__f2)); });
  1134. }
  1135. } // namespace __tbb_backend
  1136. } // namespace __pstl
  1137. #endif /* _PSTL_PARALLEL_BACKEND_TBB_H */