unseq_backend_simd.h 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856
  1. // -*- C++ -*-
  2. //===-- unseq_backend_simd.h ----------------------------------------------===//
  3. //
  4. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  5. // See https://llvm.org/LICENSE.txt for license information.
  6. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  7. //
  8. //===----------------------------------------------------------------------===//
  9. #ifndef _PSTL_UNSEQ_BACKEND_SIMD_H
  10. #define _PSTL_UNSEQ_BACKEND_SIMD_H
  11. #include <type_traits>
  12. #include "utils.h"
  13. // This header defines the minimum set of vector routines required
  14. // to support parallel STL.
  15. namespace __pstl
  16. {
  17. namespace __unseq_backend
  18. {
  19. // Expect vector width up to 64 (or 512 bit)
  20. const std::size_t __lane_size = 64;
  21. template <class _Iterator, class _DifferenceType, class _Function>
  22. _Iterator
  23. __simd_walk_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept
  24. {
  25. _PSTL_PRAGMA_SIMD
  26. for (_DifferenceType __i = 0; __i < __n; ++__i)
  27. __f(__first[__i]);
  28. return __first + __n;
  29. }
  30. template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
  31. _Iterator2
  32. __simd_walk_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept
  33. {
  34. _PSTL_PRAGMA_SIMD
  35. for (_DifferenceType __i = 0; __i < __n; ++__i)
  36. __f(__first1[__i], __first2[__i]);
  37. return __first2 + __n;
  38. }
  39. template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
  40. _Iterator3
  41. __simd_walk_3(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3,
  42. _Function __f) noexcept
  43. {
  44. _PSTL_PRAGMA_SIMD
  45. for (_DifferenceType __i = 0; __i < __n; ++__i)
  46. __f(__first1[__i], __first2[__i], __first3[__i]);
  47. return __first3 + __n;
  48. }
  49. // TODO: check whether __simd_first() can be used here
  50. template <class _Index, class _DifferenceType, class _Pred>
  51. bool
  52. __simd_or(_Index __first, _DifferenceType __n, _Pred __pred) noexcept
  53. {
  54. #if _PSTL_EARLYEXIT_PRESENT
  55. _DifferenceType __i;
  56. _PSTL_PRAGMA_VECTOR_UNALIGNED
  57. _PSTL_PRAGMA_SIMD_EARLYEXIT
  58. for (__i = 0; __i < __n; ++__i)
  59. if (__pred(__first[__i]))
  60. break;
  61. return __i < __n;
  62. #else
  63. _DifferenceType __block_size = 4 < __n ? 4 : __n;
  64. const _Index __last = __first + __n;
  65. while (__last != __first)
  66. {
  67. int32_t __flag = 1;
  68. _PSTL_PRAGMA_SIMD_REDUCTION(& : __flag)
  69. for (_DifferenceType __i = 0; __i < __block_size; ++__i)
  70. if (__pred(*(__first + __i)))
  71. __flag = 0;
  72. if (!__flag)
  73. return true;
  74. __first += __block_size;
  75. if (__last - __first >= __block_size << 1)
  76. {
  77. // Double the block _Size. Any unnecessary iterations can be amortized against work done so far.
  78. __block_size <<= 1;
  79. }
  80. else
  81. {
  82. __block_size = __last - __first;
  83. }
  84. }
  85. return false;
  86. #endif
  87. }
  88. template <class _Index, class _DifferenceType, class _Compare>
  89. _Index
  90. __simd_first(_Index __first, _DifferenceType __begin, _DifferenceType __end, _Compare __comp) noexcept
  91. {
  92. #if _PSTL_EARLYEXIT_PRESENT
  93. _DifferenceType __i = __begin;
  94. _PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part
  95. _PSTL_PRAGMA_SIMD_EARLYEXIT for (; __i < __end; ++__i)
  96. {
  97. if (__comp(__first, __i))
  98. {
  99. break;
  100. }
  101. }
  102. return __first + __i;
  103. #else
  104. // Experiments show good block sizes like this
  105. const _DifferenceType __block_size = 8;
  106. alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
  107. while (__end - __begin >= __block_size)
  108. {
  109. _DifferenceType __found = 0;
  110. _PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part
  111. _PSTL_PRAGMA_SIMD_REDUCTION(|
  112. : __found) for (_DifferenceType __i = __begin; __i < __begin + __block_size;
  113. ++__i)
  114. {
  115. const _DifferenceType __t = __comp(__first, __i);
  116. __lane[__i - __begin] = __t;
  117. __found |= __t;
  118. }
  119. if (__found)
  120. {
  121. _DifferenceType __i;
  122. // This will vectorize
  123. for (__i = 0; __i < __block_size; ++__i)
  124. {
  125. if (__lane[__i])
  126. {
  127. break;
  128. }
  129. }
  130. return __first + __begin + __i;
  131. }
  132. __begin += __block_size;
  133. }
  134. //Keep remainder scalar
  135. while (__begin != __end)
  136. {
  137. if (__comp(__first, __begin))
  138. {
  139. return __first + __begin;
  140. }
  141. ++__begin;
  142. }
  143. return __first + __end;
  144. #endif //_PSTL_EARLYEXIT_PRESENT
  145. }
  146. template <class _Index1, class _DifferenceType, class _Index2, class _Pred>
  147. std::pair<_Index1, _Index2>
  148. __simd_first(_Index1 __first1, _DifferenceType __n, _Index2 __first2, _Pred __pred) noexcept
  149. {
  150. #if _PSTL_EARLYEXIT_PRESENT
  151. _DifferenceType __i = 0;
  152. _PSTL_PRAGMA_VECTOR_UNALIGNED
  153. _PSTL_PRAGMA_SIMD_EARLYEXIT
  154. for (; __i < __n; ++__i)
  155. if (__pred(__first1[__i], __first2[__i]))
  156. break;
  157. return std::make_pair(__first1 + __i, __first2 + __i);
  158. #else
  159. const _Index1 __last1 = __first1 + __n;
  160. const _Index2 __last2 = __first2 + __n;
  161. // Experiments show good block sizes like this
  162. const _DifferenceType __block_size = 8;
  163. alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
  164. while (__last1 - __first1 >= __block_size)
  165. {
  166. _DifferenceType __found = 0;
  167. _DifferenceType __i;
  168. _PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part
  169. _PSTL_PRAGMA_SIMD_REDUCTION(|
  170. : __found) for (__i = 0; __i < __block_size; ++__i)
  171. {
  172. const _DifferenceType __t = __pred(__first1[__i], __first2[__i]);
  173. __lane[__i] = __t;
  174. __found |= __t;
  175. }
  176. if (__found)
  177. {
  178. _DifferenceType __i2;
  179. // This will vectorize
  180. for (__i2 = 0; __i2 < __block_size; ++__i2)
  181. {
  182. if (__lane[__i2])
  183. break;
  184. }
  185. return std::make_pair(__first1 + __i2, __first2 + __i2);
  186. }
  187. __first1 += __block_size;
  188. __first2 += __block_size;
  189. }
  190. //Keep remainder scalar
  191. for (; __last1 != __first1; ++__first1, ++__first2)
  192. if (__pred(*(__first1), *(__first2)))
  193. return std::make_pair(__first1, __first2);
  194. return std::make_pair(__last1, __last2);
  195. #endif //_PSTL_EARLYEXIT_PRESENT
  196. }
  197. template <class _Index, class _DifferenceType, class _Pred>
  198. _DifferenceType
  199. __simd_count(_Index __index, _DifferenceType __n, _Pred __pred) noexcept
  200. {
  201. _DifferenceType __count = 0;
  202. _PSTL_PRAGMA_SIMD_REDUCTION(+ : __count)
  203. for (_DifferenceType __i = 0; __i < __n; ++__i)
  204. if (__pred(*(__index + __i)))
  205. ++__count;
  206. return __count;
  207. }
  208. template <class _InputIterator, class _DifferenceType, class _OutputIterator, class _BinaryPredicate>
  209. _OutputIterator
  210. __simd_unique_copy(_InputIterator __first, _DifferenceType __n, _OutputIterator __result,
  211. _BinaryPredicate __pred) noexcept
  212. {
  213. if (__n == 0)
  214. return __result;
  215. _DifferenceType __cnt = 1;
  216. __result[0] = __first[0];
  217. _PSTL_PRAGMA_SIMD
  218. for (_DifferenceType __i = 1; __i < __n; ++__i)
  219. {
  220. _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
  221. if (!__pred(__first[__i], __first[__i - 1]))
  222. {
  223. __result[__cnt] = __first[__i];
  224. ++__cnt;
  225. }
  226. }
  227. return __result + __cnt;
  228. }
  229. template <class _InputIterator, class _DifferenceType, class _OutputIterator, class _Assigner>
  230. _OutputIterator
  231. __simd_assign(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, _Assigner __assigner) noexcept
  232. {
  233. _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
  234. _PSTL_PRAGMA_SIMD
  235. for (_DifferenceType __i = 0; __i < __n; ++__i)
  236. __assigner(__first + __i, __result + __i);
  237. return __result + __n;
  238. }
  239. template <class _InputIterator, class _DifferenceType, class _OutputIterator, class _UnaryPredicate>
  240. _OutputIterator
  241. __simd_copy_if(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, _UnaryPredicate __pred) noexcept
  242. {
  243. _DifferenceType __cnt = 0;
  244. _PSTL_PRAGMA_SIMD
  245. for (_DifferenceType __i = 0; __i < __n; ++__i)
  246. {
  247. _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
  248. if (__pred(__first[__i]))
  249. {
  250. __result[__cnt] = __first[__i];
  251. ++__cnt;
  252. }
  253. }
  254. return __result + __cnt;
  255. }
  256. template <class _InputIterator, class _DifferenceType, class _BinaryPredicate>
  257. _DifferenceType
  258. __simd_calc_mask_2(_InputIterator __first, _DifferenceType __n, bool* __mask, _BinaryPredicate __pred) noexcept
  259. {
  260. _DifferenceType __count = 0;
  261. _PSTL_PRAGMA_SIMD_REDUCTION(+ : __count)
  262. for (_DifferenceType __i = 0; __i < __n; ++__i)
  263. {
  264. __mask[__i] = !__pred(__first[__i], __first[__i - 1]);
  265. __count += __mask[__i];
  266. }
  267. return __count;
  268. }
  269. template <class _InputIterator, class _DifferenceType, class _UnaryPredicate>
  270. _DifferenceType
  271. __simd_calc_mask_1(_InputIterator __first, _DifferenceType __n, bool* __mask, _UnaryPredicate __pred) noexcept
  272. {
  273. _DifferenceType __count = 0;
  274. _PSTL_PRAGMA_SIMD_REDUCTION(+ : __count)
  275. for (_DifferenceType __i = 0; __i < __n; ++__i)
  276. {
  277. __mask[__i] = __pred(__first[__i]);
  278. __count += __mask[__i];
  279. }
  280. return __count;
  281. }
  282. template <class _InputIterator, class _DifferenceType, class _OutputIterator, class _Assigner>
  283. void
  284. __simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, bool* __mask,
  285. _Assigner __assigner) noexcept
  286. {
  287. _DifferenceType __cnt = 0;
  288. _PSTL_PRAGMA_SIMD
  289. for (_DifferenceType __i = 0; __i < __n; ++__i)
  290. {
  291. if (__mask[__i])
  292. {
  293. _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
  294. {
  295. __assigner(__first + __i, __result + __cnt);
  296. ++__cnt;
  297. }
  298. }
  299. }
  300. }
  301. template <class _InputIterator, class _DifferenceType, class _OutputIterator1, class _OutputIterator2>
  302. void
  303. __simd_partition_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator1 __out_true,
  304. _OutputIterator2 __out_false, bool* __mask) noexcept
  305. {
  306. _DifferenceType __cnt_true = 0, __cnt_false = 0;
  307. _PSTL_PRAGMA_SIMD
  308. for (_DifferenceType __i = 0; __i < __n; ++__i)
  309. {
  310. _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC_2ARGS(__cnt_true : 1, __cnt_false : 1)
  311. if (__mask[__i])
  312. {
  313. __out_true[__cnt_true] = __first[__i];
  314. ++__cnt_true;
  315. }
  316. else
  317. {
  318. __out_false[__cnt_false] = __first[__i];
  319. ++__cnt_false;
  320. }
  321. }
  322. }
  323. template <class _Index, class _DifferenceType, class _Tp>
  324. _Index
  325. __simd_fill_n(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept
  326. {
  327. _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
  328. _PSTL_PRAGMA_SIMD
  329. for (_DifferenceType __i = 0; __i < __n; ++__i)
  330. __first[__i] = __value;
  331. return __first + __n;
  332. }
  333. template <class _Index, class _DifferenceType, class _Generator>
  334. _Index
  335. __simd_generate_n(_Index __first, _DifferenceType __size, _Generator __g) noexcept
  336. {
  337. _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
  338. _PSTL_PRAGMA_SIMD
  339. for (_DifferenceType __i = 0; __i < __size; ++__i)
  340. __first[__i] = __g();
  341. return __first + __size;
  342. }
  343. template <class _Index, class _BinaryPredicate>
  344. _Index
  345. __simd_adjacent_find(_Index __first, _Index __last, _BinaryPredicate __pred, bool __or_semantic) noexcept
  346. {
  347. if (__last - __first < 2)
  348. return __last;
  349. typedef typename std::iterator_traits<_Index>::difference_type _DifferenceType;
  350. _DifferenceType __i = 0;
  351. #if _PSTL_EARLYEXIT_PRESENT
  352. //Some compiler versions fail to compile the following loop when iterators are used. Indices are used instead
  353. const _DifferenceType __n = __last - __first - 1;
  354. _PSTL_PRAGMA_VECTOR_UNALIGNED
  355. _PSTL_PRAGMA_SIMD_EARLYEXIT
  356. for (; __i < __n; ++__i)
  357. if (__pred(__first[__i], __first[__i + 1]))
  358. break;
  359. return __i < __n ? __first + __i : __last;
  360. #else
  361. // Experiments show good block sizes like this
  362. //TODO: to consider tuning block_size for various data types
  363. const _DifferenceType __block_size = 8;
  364. alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
  365. while (__last - __first >= __block_size)
  366. {
  367. _DifferenceType __found = 0;
  368. _PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part
  369. _PSTL_PRAGMA_SIMD_REDUCTION(|
  370. : __found) for (__i = 0; __i < __block_size - 1; ++__i)
  371. {
  372. //TODO: to improve SIMD vectorization
  373. const _DifferenceType __t = __pred(*(__first + __i), *(__first + __i + 1));
  374. __lane[__i] = __t;
  375. __found |= __t;
  376. }
  377. //Process a pair of elements on a boundary of a data block
  378. if (__first + __block_size < __last && __pred(*(__first + __i), *(__first + __i + 1)))
  379. __lane[__i] = __found = 1;
  380. if (__found)
  381. {
  382. if (__or_semantic)
  383. return __first;
  384. // This will vectorize
  385. for (__i = 0; __i < __block_size; ++__i)
  386. if (__lane[__i])
  387. break;
  388. return __first + __i; //As far as found is true a __result (__lane[__i] is true) is guaranteed
  389. }
  390. __first += __block_size;
  391. }
  392. //Process the rest elements
  393. for (; __last - __first > 1; ++__first)
  394. if (__pred(*__first, *(__first + 1)))
  395. return __first;
  396. return __last;
  397. #endif
  398. }
  399. // It was created to reduce the code inside std::enable_if
  400. template <typename _Tp, typename _BinaryOperation>
  401. using is_arithmetic_plus = std::integral_constant<bool, std::is_arithmetic<_Tp>::value &&
  402. std::is_same<_BinaryOperation, std::plus<_Tp>>::value>;
  403. template <typename _DifferenceType, typename _Tp, typename _BinaryOperation, typename _UnaryOperation>
  404. typename std::enable_if<is_arithmetic_plus<_Tp, _BinaryOperation>::value, _Tp>::type
  405. __simd_transform_reduce(_DifferenceType __n, _Tp __init, _BinaryOperation, _UnaryOperation __f) noexcept
  406. {
  407. _PSTL_PRAGMA_SIMD_REDUCTION(+ : __init)
  408. for (_DifferenceType __i = 0; __i < __n; ++__i)
  409. __init += __f(__i);
  410. return __init;
  411. }
  412. template <typename _Size, typename _Tp, typename _BinaryOperation, typename _UnaryOperation>
  413. typename std::enable_if<!is_arithmetic_plus<_Tp, _BinaryOperation>::value, _Tp>::type
  414. __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _UnaryOperation __f) noexcept
  415. {
  416. const _Size __block_size = __lane_size / sizeof(_Tp);
  417. if (__n > 2 * __block_size && __block_size > 1)
  418. {
  419. alignas(__lane_size) char __lane_[__lane_size];
  420. _Tp* __lane = reinterpret_cast<_Tp*>(__lane_);
  421. // initializer
  422. _PSTL_PRAGMA_SIMD
  423. for (_Size __i = 0; __i < __block_size; ++__i)
  424. {
  425. ::new (__lane + __i) _Tp(__binary_op(__f(__i), __f(__block_size + __i)));
  426. }
  427. // main loop
  428. _Size __i = 2 * __block_size;
  429. const _Size last_iteration = __block_size * (__n / __block_size);
  430. for (; __i < last_iteration; __i += __block_size)
  431. {
  432. _PSTL_PRAGMA_SIMD
  433. for (_Size __j = 0; __j < __block_size; ++__j)
  434. {
  435. __lane[__j] = __binary_op(__lane[__j], __f(__i + __j));
  436. }
  437. }
  438. // remainder
  439. _PSTL_PRAGMA_SIMD
  440. for (_Size __j = 0; __j < __n - last_iteration; ++__j)
  441. {
  442. __lane[__j] = __binary_op(__lane[__j], __f(last_iteration + __j));
  443. }
  444. // combiner
  445. for (_Size __j = 0; __j < __block_size; ++__j)
  446. {
  447. __init = __binary_op(__init, __lane[__j]);
  448. }
  449. // destroyer
  450. _PSTL_PRAGMA_SIMD
  451. for (_Size __j = 0; __j < __block_size; ++__j)
  452. {
  453. __lane[__j].~_Tp();
  454. }
  455. }
  456. else
  457. {
  458. for (_Size __i = 0; __i < __n; ++__i)
  459. {
  460. __init = __binary_op(__init, __f(__i));
  461. }
  462. }
  463. return __init;
  464. }
  465. // Exclusive scan for "+" and arithmetic types
  466. template <class _InputIterator, class _Size, class _OutputIterator, class _UnaryOperation, class _Tp,
  467. class _BinaryOperation>
  468. typename std::enable_if<is_arithmetic_plus<_Tp, _BinaryOperation>::value, std::pair<_OutputIterator, _Tp>>::type
  469. __simd_scan(_InputIterator __first, _Size __n, _OutputIterator __result, _UnaryOperation __unary_op, _Tp __init,
  470. _BinaryOperation, /*Inclusive*/ std::false_type)
  471. {
  472. _PSTL_PRAGMA_SIMD_SCAN(+ : __init)
  473. for (_Size __i = 0; __i < __n; ++__i)
  474. {
  475. __result[__i] = __init;
  476. _PSTL_PRAGMA_SIMD_EXCLUSIVE_SCAN(__init)
  477. __init += __unary_op(__first[__i]);
  478. }
  479. return std::make_pair(__result + __n, __init);
  480. }
  481. // As soon as we cannot call __binary_op in "combiner" we create a wrapper over _Tp to encapsulate __binary_op
  482. template <typename _Tp, typename _BinaryOp>
  483. struct _Combiner
  484. {
  485. _Tp __value;
  486. _BinaryOp* __bin_op; // Here is a pointer to function because of default ctor
  487. _Combiner() : __value{}, __bin_op(nullptr) {}
  488. _Combiner(const _Tp& value, const _BinaryOp* bin_op) : __value(value), __bin_op(const_cast<_BinaryOp*>(bin_op)) {}
  489. _Combiner(const _Combiner& __obj) : __value{}, __bin_op(__obj.__bin_op) {}
  490. void
  491. operator()(const _Combiner& __obj)
  492. {
  493. __value = (*__bin_op)(__value, __obj.__value);
  494. }
  495. };
  496. // Exclusive scan for other binary operations and types
  497. template <class _InputIterator, class _Size, class _OutputIterator, class _UnaryOperation, class _Tp,
  498. class _BinaryOperation>
  499. typename std::enable_if<!is_arithmetic_plus<_Tp, _BinaryOperation>::value, std::pair<_OutputIterator, _Tp>>::type
  500. __simd_scan(_InputIterator __first, _Size __n, _OutputIterator __result, _UnaryOperation __unary_op, _Tp __init,
  501. _BinaryOperation __binary_op, /*Inclusive*/ std::false_type)
  502. {
  503. typedef _Combiner<_Tp, _BinaryOperation> _CombinerType;
  504. _CombinerType __init_{__init, &__binary_op};
  505. _PSTL_PRAGMA_DECLARE_REDUCTION(__bin_op, _CombinerType)
  506. _PSTL_PRAGMA_SIMD_SCAN(__bin_op : __init_)
  507. for (_Size __i = 0; __i < __n; ++__i)
  508. {
  509. __result[__i] = __init_.__value;
  510. _PSTL_PRAGMA_SIMD_EXCLUSIVE_SCAN(__init_)
  511. _PSTL_PRAGMA_FORCEINLINE
  512. __init_.__value = __binary_op(__init_.__value, __unary_op(__first[__i]));
  513. }
  514. return std::make_pair(__result + __n, __init_.__value);
  515. }
  516. // Inclusive scan for "+" and arithmetic types
  517. template <class _InputIterator, class _Size, class _OutputIterator, class _UnaryOperation, class _Tp,
  518. class _BinaryOperation>
  519. typename std::enable_if<is_arithmetic_plus<_Tp, _BinaryOperation>::value, std::pair<_OutputIterator, _Tp>>::type
  520. __simd_scan(_InputIterator __first, _Size __n, _OutputIterator __result, _UnaryOperation __unary_op, _Tp __init,
  521. _BinaryOperation, /*Inclusive*/ std::true_type)
  522. {
  523. _PSTL_PRAGMA_SIMD_SCAN(+ : __init)
  524. for (_Size __i = 0; __i < __n; ++__i)
  525. {
  526. __init += __unary_op(__first[__i]);
  527. _PSTL_PRAGMA_SIMD_INCLUSIVE_SCAN(__init)
  528. __result[__i] = __init;
  529. }
  530. return std::make_pair(__result + __n, __init);
  531. }
  532. // Inclusive scan for other binary operations and types
  533. template <class _InputIterator, class _Size, class _OutputIterator, class _UnaryOperation, class _Tp,
  534. class _BinaryOperation>
  535. typename std::enable_if<!is_arithmetic_plus<_Tp, _BinaryOperation>::value, std::pair<_OutputIterator, _Tp>>::type
  536. __simd_scan(_InputIterator __first, _Size __n, _OutputIterator __result, _UnaryOperation __unary_op, _Tp __init,
  537. _BinaryOperation __binary_op, std::true_type)
  538. {
  539. typedef _Combiner<_Tp, _BinaryOperation> _CombinerType;
  540. _CombinerType __init_{__init, &__binary_op};
  541. _PSTL_PRAGMA_DECLARE_REDUCTION(__bin_op, _CombinerType)
  542. _PSTL_PRAGMA_SIMD_SCAN(__bin_op : __init_)
  543. for (_Size __i = 0; __i < __n; ++__i)
  544. {
  545. _PSTL_PRAGMA_FORCEINLINE
  546. __init_.__value = __binary_op(__init_.__value, __unary_op(__first[__i]));
  547. _PSTL_PRAGMA_SIMD_INCLUSIVE_SCAN(__init_)
  548. __result[__i] = __init_.__value;
  549. }
  550. return std::make_pair(__result + __n, __init_.__value);
  551. }
  552. // [restriction] - std::iterator_traits<_ForwardIterator>::value_type should be DefaultConstructible.
  553. // complexity [violation] - We will have at most (__n-1 + number_of_lanes) comparisons instead of at most __n-1.
  554. template <typename _ForwardIterator, typename _Size, typename _Compare>
  555. _ForwardIterator
  556. __simd_min_element(_ForwardIterator __first, _Size __n, _Compare __comp) noexcept
  557. {
  558. if (__n == 0)
  559. {
  560. return __first;
  561. }
  562. typedef typename std::iterator_traits<_ForwardIterator>::value_type _ValueType;
  563. struct _ComplexType
  564. {
  565. _ValueType __min_val;
  566. _Size __min_ind;
  567. _Compare* __min_comp;
  568. _ComplexType() : __min_val{}, __min_ind{}, __min_comp(nullptr) {}
  569. _ComplexType(const _ValueType& val, const _Compare* comp)
  570. : __min_val(val), __min_ind(0), __min_comp(const_cast<_Compare*>(comp))
  571. {
  572. }
  573. _ComplexType(const _ComplexType& __obj)
  574. : __min_val(__obj.__min_val), __min_ind(__obj.__min_ind), __min_comp(__obj.__min_comp)
  575. {
  576. }
  577. _PSTL_PRAGMA_DECLARE_SIMD
  578. void
  579. operator()(const _ComplexType& __obj)
  580. {
  581. if (!(*__min_comp)(__min_val, __obj.__min_val) &&
  582. ((*__min_comp)(__obj.__min_val, __min_val) || __obj.__min_ind - __min_ind < 0))
  583. {
  584. __min_val = __obj.__min_val;
  585. __min_ind = __obj.__min_ind;
  586. }
  587. }
  588. };
  589. _ComplexType __init{*__first, &__comp};
  590. _PSTL_PRAGMA_DECLARE_REDUCTION(__min_func, _ComplexType)
  591. _PSTL_PRAGMA_SIMD_REDUCTION(__min_func : __init)
  592. for (_Size __i = 1; __i < __n; ++__i)
  593. {
  594. const _ValueType __min_val = __init.__min_val;
  595. const _ValueType __current = __first[__i];
  596. if (__comp(__current, __min_val))
  597. {
  598. __init.__min_val = __current;
  599. __init.__min_ind = __i;
  600. }
  601. }
  602. return __first + __init.__min_ind;
  603. }
  604. // [restriction] - std::iterator_traits<_ForwardIterator>::value_type should be DefaultConstructible.
  605. // complexity [violation] - We will have at most (2*(__n-1) + 4*number_of_lanes) comparisons instead of at most [1.5*(__n-1)].
  606. template <typename _ForwardIterator, typename _Size, typename _Compare>
  607. std::pair<_ForwardIterator, _ForwardIterator>
  608. __simd_minmax_element(_ForwardIterator __first, _Size __n, _Compare __comp) noexcept
  609. {
  610. if (__n == 0)
  611. {
  612. return std::make_pair(__first, __first);
  613. }
  614. typedef typename std::iterator_traits<_ForwardIterator>::value_type _ValueType;
  615. struct _ComplexType
  616. {
  617. _ValueType __min_val;
  618. _ValueType __max_val;
  619. _Size __min_ind;
  620. _Size __max_ind;
  621. _Compare* __minmax_comp;
  622. _ComplexType() : __min_val{}, __max_val{}, __min_ind{}, __max_ind{}, __minmax_comp(nullptr) {}
  623. _ComplexType(const _ValueType& min_val, const _ValueType& max_val, const _Compare* comp)
  624. : __min_val(min_val), __max_val(max_val), __min_ind(0), __max_ind(0),
  625. __minmax_comp(const_cast<_Compare*>(comp))
  626. {
  627. }
  628. _ComplexType(const _ComplexType& __obj)
  629. : __min_val(__obj.__min_val), __max_val(__obj.__max_val), __min_ind(__obj.__min_ind),
  630. __max_ind(__obj.__max_ind), __minmax_comp(__obj.__minmax_comp)
  631. {
  632. }
  633. void
  634. operator()(const _ComplexType& __obj)
  635. {
  636. // min
  637. if ((*__minmax_comp)(__obj.__min_val, __min_val))
  638. {
  639. __min_val = __obj.__min_val;
  640. __min_ind = __obj.__min_ind;
  641. }
  642. else if (!(*__minmax_comp)(__min_val, __obj.__min_val))
  643. {
  644. __min_val = __obj.__min_val;
  645. __min_ind = (__min_ind - __obj.__min_ind < 0) ? __min_ind : __obj.__min_ind;
  646. }
  647. // max
  648. if ((*__minmax_comp)(__max_val, __obj.__max_val))
  649. {
  650. __max_val = __obj.__max_val;
  651. __max_ind = __obj.__max_ind;
  652. }
  653. else if (!(*__minmax_comp)(__obj.__max_val, __max_val))
  654. {
  655. __max_val = __obj.__max_val;
  656. __max_ind = (__max_ind - __obj.__max_ind < 0) ? __obj.__max_ind : __max_ind;
  657. }
  658. }
  659. };
  660. _ComplexType __init{*__first, *__first, &__comp};
  661. _PSTL_PRAGMA_DECLARE_REDUCTION(__min_func, _ComplexType);
  662. _PSTL_PRAGMA_SIMD_REDUCTION(__min_func : __init)
  663. for (_Size __i = 1; __i < __n; ++__i)
  664. {
  665. auto __min_val = __init.__min_val;
  666. auto __max_val = __init.__max_val;
  667. auto __current = __first + __i;
  668. if (__comp(*__current, __min_val))
  669. {
  670. __init.__min_val = *__current;
  671. __init.__min_ind = __i;
  672. }
  673. else if (!__comp(*__current, __max_val))
  674. {
  675. __init.__max_val = *__current;
  676. __init.__max_ind = __i;
  677. }
  678. }
  679. return std::make_pair(__first + __init.__min_ind, __first + __init.__max_ind);
  680. }
  681. template <class _InputIterator, class _DifferenceType, class _OutputIterator1, class _OutputIterator2,
  682. class _UnaryPredicate>
  683. std::pair<_OutputIterator1, _OutputIterator2>
  684. __simd_partition_copy(_InputIterator __first, _DifferenceType __n, _OutputIterator1 __out_true,
  685. _OutputIterator2 __out_false, _UnaryPredicate __pred) noexcept
  686. {
  687. _DifferenceType __cnt_true = 0, __cnt_false = 0;
  688. _PSTL_PRAGMA_SIMD
  689. for (_DifferenceType __i = 0; __i < __n; ++__i)
  690. {
  691. _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC_2ARGS(__cnt_true : 1, __cnt_false : 1)
  692. if (__pred(__first[__i]))
  693. {
  694. __out_true[__cnt_true] = __first[__i];
  695. ++__cnt_true;
  696. }
  697. else
  698. {
  699. __out_false[__cnt_false] = __first[__i];
  700. ++__cnt_false;
  701. }
  702. }
  703. return std::make_pair(__out_true + __cnt_true, __out_false + __cnt_false);
  704. }
  705. template <class _ForwardIterator1, class _ForwardIterator2, class _BinaryPredicate>
  706. _ForwardIterator1
  707. __simd_find_first_of(_ForwardIterator1 __first, _ForwardIterator1 __last, _ForwardIterator2 __s_first,
  708. _ForwardIterator2 __s_last, _BinaryPredicate __pred) noexcept
  709. {
  710. typedef typename std::iterator_traits<_ForwardIterator1>::difference_type _DifferencType;
  711. const _DifferencType __n1 = __last - __first;
  712. const _DifferencType __n2 = __s_last - __s_first;
  713. if (__n1 == 0 || __n2 == 0)
  714. {
  715. return __last; // according to the standard
  716. }
  717. // Common case
  718. // If first sequence larger than second then we'll run simd_first with parameters of first sequence.
  719. // Otherwise, vice versa.
  720. if (__n1 < __n2)
  721. {
  722. for (; __first != __last; ++__first)
  723. {
  724. if (__unseq_backend::__simd_or(
  725. __s_first, __n2,
  726. __internal::__equal_value_by_pred<decltype(*__first), _BinaryPredicate>(*__first, __pred)))
  727. {
  728. return __first;
  729. }
  730. }
  731. }
  732. else
  733. {
  734. for (; __s_first != __s_last; ++__s_first)
  735. {
  736. const auto __result = __unseq_backend::__simd_first(
  737. __first, _DifferencType(0), __n1, [__s_first, &__pred](_ForwardIterator1 __it, _DifferencType __i) {
  738. return __pred(__it[__i], *__s_first);
  739. });
  740. if (__result != __last)
  741. {
  742. return __result;
  743. }
  744. }
  745. }
  746. return __last;
  747. }
  748. template <class _RandomAccessIterator, class _DifferenceType, class _UnaryPredicate>
  749. _RandomAccessIterator
  750. __simd_remove_if(_RandomAccessIterator __first, _DifferenceType __n, _UnaryPredicate __pred) noexcept
  751. {
  752. // find first element we need to remove
  753. auto __current = __unseq_backend::__simd_first(
  754. __first, _DifferenceType(0), __n,
  755. [&__pred](_RandomAccessIterator __it, _DifferenceType __i) { return __pred(__it[__i]); });
  756. __n -= __current - __first;
  757. // if we have in sequence only one element that pred(__current[1]) != false we can exit the function
  758. if (__n < 2)
  759. {
  760. return __current;
  761. }
  762. _DifferenceType __cnt = 0;
  763. _PSTL_PRAGMA_SIMD
  764. for (_DifferenceType __i = 1; __i < __n; ++__i)
  765. {
  766. _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
  767. if (!__pred(__current[__i]))
  768. {
  769. __current[__cnt] = std::move(__current[__i]);
  770. ++__cnt;
  771. }
  772. }
  773. return __current + __cnt;
  774. }
  775. } // namespace __unseq_backend
  776. } // namespace __pstl
  777. #endif /* _PSTL_UNSEQ_BACKEND_SIMD_H */