  1. /* IEEE-754 single-precision functions for Xtensa
  2. Copyright (C) 2006-2022 Free Software Foundation, Inc.
  3. Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
  4. This file is part of GCC.
  5. GCC is free software; you can redistribute it and/or modify it
  6. under the terms of the GNU General Public License as published by
  7. the Free Software Foundation; either version 3, or (at your option)
  8. any later version.
  9. GCC is distributed in the hope that it will be useful, but WITHOUT
  10. ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  11. or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
  12. License for more details.
  13. Under Section 7 of GPL version 3, you are granted additional
  14. permissions described in the GCC Runtime Library Exception, version
  15. 3.1, as published by the Free Software Foundation.
  16. You should have received a copy of the GNU General Public License and
  17. a copy of the GCC Runtime Library Exception along with this program;
  18. see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  19. <http://www.gnu.org/licenses/>. */
  20. #ifdef __XTENSA_EB__
  21. #define xh a2
  22. #define xl a3
  23. #define yh a4
  24. #define yl a5
  25. #else
  26. #define xh a3
  27. #define xl a2
  28. #define yh a5
  29. #define yl a4
  30. #endif
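/* For reference only: the macros above mirror how a 64-bit value sits in a
   register pair on Xtensa, where the word order follows the target's
   endianness. A minimal C sketch of the same mapping (illustrative names,
   not part of this file):

   #include <stdint.h>

   typedef struct { uint32_t r2, r3; } reg_pair;   // e.g. the a2/a3 pair

   static reg_pair split64(uint64_t v)
   {
   #ifdef __XTENSA_EB__                  // big endian: high word in a2
       return (reg_pair){ (uint32_t)(v >> 32), (uint32_t)v };
   #else                                 // little endian: low word in a2
       return (reg_pair){ (uint32_t)v, (uint32_t)(v >> 32) };
   #endif
   }
*/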
  31. /* Warning! The branch displacements for some Xtensa branch instructions
  32. are quite small, and this code has been carefully laid out to keep
  33. branch targets in range. If you change anything, be sure to check that
  34. the assembler is not relaxing anything to branch over a jump. */
  35. #ifdef L_negsf2
  36. .align 4
  37. .global __negsf2
  38. .type __negsf2, @function
  39. __negsf2:
  40. leaf_entry sp, 16
  41. movi a4, 0x80000000
  42. xor a2, a2, a4
  43. leaf_return
  44. #endif /* L_negsf2 */
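/* Equivalent C view of __negsf2 (a sketch, not part of the build): negation
   only toggles the sign bit, so it behaves identically for zeros,
   subnormals, Infinities and NaNs.

   #include <stdint.h>
   #include <string.h>

   static float negsf(float x)
   {
       uint32_t bits;
       memcpy(&bits, &x, sizeof bits);   // same bit pattern as register a2
       bits ^= 0x80000000u;              // flip the sign bit
       memcpy(&x, &bits, sizeof bits);
       return x;
   }
*/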
  45. #ifdef L_addsubsf3
  46. .literal_position
  47. /* Addition */
  48. __addsf3_aux:
  49. /* Handle NaNs and Infinities. (This code is placed before the
  50. start of the function just to keep it in range of the limited
  51. branch displacements.) */
  52. .Ladd_xnan_or_inf:
  53. /* If y is neither Infinity nor NaN, return x. */
  54. bnall a3, a6, .Ladd_return_nan_or_inf
  55. /* If x is a NaN, return it. Otherwise, return y. */
  56. slli a7, a2, 9
  57. bnez a7, .Ladd_return_nan
  58. .Ladd_ynan_or_inf:
  59. /* Return y. */
  60. mov a2, a3
  61. .Ladd_return_nan_or_inf:
  62. slli a7, a2, 9
  63. bnez a7, .Ladd_return_nan
  64. leaf_return
  65. .Ladd_return_nan:
  66. movi a6, 0x400000 /* make it a quiet NaN */
  67. or a2, a2, a6
  68. leaf_return
  69. .Ladd_opposite_signs:
  70. /* Operand signs differ. Do a subtraction. */
  71. slli a7, a6, 8
  72. xor a3, a3, a7
  73. j .Lsub_same_sign
  74. .align 4
  75. .global __addsf3
  76. .type __addsf3, @function
  77. __addsf3:
  78. leaf_entry sp, 16
  79. movi a6, 0x7f800000
  80. /* Check if the two operands have the same sign. */
  81. xor a7, a2, a3
  82. bltz a7, .Ladd_opposite_signs
  83. .Ladd_same_sign:
  84. /* Check if either exponent == 0x7f8 (i.e., NaN or Infinity). */
  85. ball a2, a6, .Ladd_xnan_or_inf
  86. ball a3, a6, .Ladd_ynan_or_inf
  87. /* Compare the exponents. The smaller operand will be shifted
  88. right by the exponent difference and added to the larger
  89. one. */
  90. extui a7, a2, 23, 9
  91. extui a8, a3, 23, 9
  92. bltu a7, a8, .Ladd_shiftx
  93. .Ladd_shifty:
  94. /* Check if the smaller (or equal) exponent is zero. */
  95. bnone a3, a6, .Ladd_yexpzero
  96. /* Replace y sign/exponent with 0x008. */
  97. or a3, a3, a6
  98. slli a3, a3, 8
  99. srli a3, a3, 8
  100. .Ladd_yexpdiff:
  101. /* Compute the exponent difference. */
  102. sub a10, a7, a8
  103. /* Exponent difference > 32 -- just return the bigger value. */
  104. bgeui a10, 32, 1f
  105. /* Shift y right by the exponent difference. Any bits that are
  106. shifted out of y are saved in a9 for rounding the result. */
  107. ssr a10
  108. movi a9, 0
  109. src a9, a3, a9
  110. srl a3, a3
  111. /* Do the addition. */
  112. add a2, a2, a3
  113. /* Check if the add overflowed into the exponent. */
  114. extui a10, a2, 23, 9
  115. beq a10, a7, .Ladd_round
  116. mov a8, a7
  117. j .Ladd_carry
  118. .Ladd_yexpzero:
  119. /* y is a subnormal value. Replace its sign/exponent with zero,
  120. i.e., no implicit "1.0", and increment the apparent exponent
  121. because subnormals behave as if they had the minimum (nonzero)
  122. exponent. Test for the case when both exponents are zero. */
  123. slli a3, a3, 9
  124. srli a3, a3, 9
  125. bnone a2, a6, .Ladd_bothexpzero
  126. addi a8, a8, 1
  127. j .Ladd_yexpdiff
  128. .Ladd_bothexpzero:
  129. /* Both exponents are zero. Handle this as a special case. There
  130. is no need to shift or round, and the normal code for handling
  131. a carry into the exponent field will not work because it
  132. assumes there is an implicit "1.0" that needs to be added. */
  133. add a2, a2, a3
  134. 1: leaf_return
  135. .Ladd_xexpzero:
  136. /* Same as "yexpzero" except skip handling the case when both
  137. exponents are zero. */
  138. slli a2, a2, 9
  139. srli a2, a2, 9
  140. addi a7, a7, 1
  141. j .Ladd_xexpdiff
  142. .Ladd_shiftx:
  143. /* Same thing as the "shifty" code, but with x and y swapped. Also,
  144. because the exponent difference is always nonzero in this version,
  145. the shift sequence can use SLL and skip loading a constant zero. */
  146. bnone a2, a6, .Ladd_xexpzero
  147. or a2, a2, a6
  148. slli a2, a2, 8
  149. srli a2, a2, 8
  150. .Ladd_xexpdiff:
  151. sub a10, a8, a7
  152. bgeui a10, 32, .Ladd_returny
  153. ssr a10
  154. sll a9, a2
  155. srl a2, a2
  156. add a2, a2, a3
  157. /* Check if the add overflowed into the exponent. */
  158. extui a10, a2, 23, 9
  159. bne a10, a8, .Ladd_carry
  160. .Ladd_round:
  161. /* Round up if the leftover fraction is >= 1/2. */
  162. bgez a9, 1f
  163. addi a2, a2, 1
  164. /* Check if the leftover fraction is exactly 1/2. */
  165. slli a9, a9, 1
  166. beqz a9, .Ladd_exactlyhalf
  167. 1: leaf_return
  168. .Ladd_returny:
  169. mov a2, a3
  170. leaf_return
  171. .Ladd_carry:
  172. /* The addition has overflowed into the exponent field, so the
  173. value needs to be renormalized. The mantissa of the result
  174. can be recovered by subtracting the original exponent and
  175. adding 0x800000 (which is the explicit "1.0" for the
  176. mantissa of the non-shifted operand -- the "1.0" for the
  177. shifted operand was already added). The mantissa can then
  178. be shifted right by one bit. The explicit "1.0" of the
  179. shifted mantissa then needs to be replaced by the exponent,
  180. incremented by one to account for the normalizing shift.
  181. It is faster to combine these operations: do the shift first
  182. and combine the additions and subtractions. If x is the
  183. original exponent, the result is:
  184. shifted mantissa - (x << 22) + (1 << 22) + (x << 23)
  185. or:
  186. shifted mantissa + ((x + 1) << 22)
  187. Note that the exponent is incremented here by leaving the
  188. explicit "1.0" of the mantissa in the exponent field. */
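/* The combined form above can be checked algebraically; a small C sketch
   (illustrative only, x is the original biased exponent):

   #include <stdint.h>

   static uint32_t renorm(uint32_t shifted_mantissa, uint32_t x)
   {
       // -(x << 22) + (1 << 22) + (x << 23)
       //   == (1 << 22) + x * ((1 << 23) - (1 << 22))
       //   == (x + 1) << 22
       return shifted_mantissa + ((x + 1) << 22);
   }
*/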
  189. /* Shift x right by one bit. Save the lsb. */
  190. mov a10, a2
  191. srli a2, a2, 1
  192. /* See explanation above. The original exponent is in a8. */
  193. addi a8, a8, 1
  194. slli a8, a8, 22
  195. add a2, a2, a8
  196. /* Return an Infinity if the exponent overflowed. */
  197. ball a2, a6, .Ladd_infinity
  198. /* Same thing as the "round" code except the msb of the leftover
  199. fraction is bit 0 of a10, with the rest of the fraction in a9. */
  200. bbci.l a10, 0, 1f
  201. addi a2, a2, 1
  202. beqz a9, .Ladd_exactlyhalf
  203. 1: leaf_return
  204. .Ladd_infinity:
  205. /* Clear the mantissa. */
  206. srli a2, a2, 23
  207. slli a2, a2, 23
  208. /* The sign bit may have been lost in a carry-out. Put it back. */
  209. slli a8, a8, 1
  210. or a2, a2, a8
  211. leaf_return
  212. .Ladd_exactlyhalf:
  213. /* Round down to the nearest even value. */
  214. srli a2, a2, 1
  215. slli a2, a2, 1
  216. leaf_return
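/* The rounding convention used by .Ladd_round/.Ladd_exactlyhalf (and by the
   other routines in this file) keeps the shifted-out bits left-aligned in a
   register: bit 31 is the "half" bit and the remaining bits act as sticky
   bits. A C sketch of the same round-to-nearest-even step (illustrative
   names, not part of this file):

   #include <stdint.h>

   static uint32_t round_nearest_even(uint32_t value, uint32_t leftover)
   {
       if (leftover & 0x80000000u) {     // leftover >= 1/2: round up
           value += 1;
           if ((leftover << 1) == 0)     // exactly 1/2: round down to even
               value &= ~1u;
       }
       return value;
   }
*/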
  217. /* Subtraction */
  218. __subsf3_aux:
  219. /* Handle NaNs and Infinities. (This code is placed before the
  220. start of the function just to keep it in range of the limited
  221. branch displacements.) */
  222. .Lsub_xnan_or_inf:
  223. /* If y is neither Infinity nor NaN, return x. */
  224. bnall a3, a6, .Lsub_return_nan_or_inf
  225. /* Both x and y are either NaN or Inf, so the result is NaN. */
  226. .Lsub_return_nan:
  227. movi a4, 0x400000 /* make it a quiet NaN */
  228. or a2, a2, a4
  229. leaf_return
  230. .Lsub_ynan_or_inf:
  231. /* Negate y and return it. */
  232. slli a7, a6, 8
  233. xor a2, a3, a7
  234. .Lsub_return_nan_or_inf:
  235. slli a7, a2, 9
  236. bnez a7, .Lsub_return_nan
  237. leaf_return
  238. .Lsub_opposite_signs:
  239. /* Operand signs differ. Do an addition. */
  240. slli a7, a6, 8
  241. xor a3, a3, a7
  242. j .Ladd_same_sign
  243. .align 4
  244. .global __subsf3
  245. .type __subsf3, @function
  246. __subsf3:
  247. leaf_entry sp, 16
  248. movi a6, 0x7f800000
  249. /* Check if the two operands have the same sign. */
  250. xor a7, a2, a3
  251. bltz a7, .Lsub_opposite_signs
  252. .Lsub_same_sign:
  253. /* Check if either exponent == 0x7f8 (i.e., NaN or Infinity). */
  254. ball a2, a6, .Lsub_xnan_or_inf
  255. ball a3, a6, .Lsub_ynan_or_inf
  256. /* Compare the operands. In contrast to addition, the entire
  257. value matters here. */
  258. extui a7, a2, 23, 8
  259. extui a8, a3, 23, 8
  260. bltu a2, a3, .Lsub_xsmaller
  261. .Lsub_ysmaller:
  262. /* Check if the smaller (or equal) exponent is zero. */
  263. bnone a3, a6, .Lsub_yexpzero
  264. /* Replace y sign/exponent with 0x008. */
  265. or a3, a3, a6
  266. slli a3, a3, 8
  267. srli a3, a3, 8
  268. .Lsub_yexpdiff:
  269. /* Compute the exponent difference. */
  270. sub a10, a7, a8
  271. /* Exponent difference > 32 -- just return the bigger value. */
  272. bgeui a10, 32, 1f
  273. /* Shift y right by the exponent difference. Any bits that are
  274. shifted out of y are saved in a9 for rounding the result. */
  275. ssr a10
  276. movi a9, 0
  277. src a9, a3, a9
  278. srl a3, a3
  279. sub a2, a2, a3
  280. /* Subtract the leftover bits in a9 from zero and propagate any
  281. borrow from a2. */
  282. neg a9, a9
  283. addi a10, a2, -1
  284. movnez a2, a10, a9
  285. /* Check if the subtract underflowed into the exponent. */
  286. extui a10, a2, 23, 8
  287. beq a10, a7, .Lsub_round
  288. j .Lsub_borrow
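/* The neg/addi/movnez sequence above performs a 64-bit subtraction with
   32-bit operations: the guard word of x is zero, so the low result is
   -y_guard, and a borrow propagates into the high word whenever y_guard is
   nonzero. A C sketch (hypothetical names):

   #include <stdint.h>

   static void sub_with_guard(uint32_t x_hi, uint32_t y_hi, uint32_t y_guard,
                              uint32_t *r_hi, uint32_t *r_guard)
   {
       *r_hi    = x_hi - y_hi;
       *r_guard = 0u - y_guard;          // neg a9, a9
       if (y_guard != 0)                 // movnez: propagate the borrow
           *r_hi -= 1;
   }
*/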
  289. .Lsub_yexpzero:
  290. /* Return zero if the inputs are equal. (For the non-subnormal
  291. case, subtracting the "1.0" will cause a borrow from the exponent
  292. and this case can be detected when handling the borrow.) */
  293. beq a2, a3, .Lsub_return_zero
  294. /* y is a subnormal value. Replace its sign/exponent with zero,
  295. i.e., no implicit "1.0". Unless x is also a subnormal, increment
  296. y's apparent exponent because subnormals behave as if they had
  297. the minimum (nonzero) exponent. */
  298. slli a3, a3, 9
  299. srli a3, a3, 9
  300. bnone a2, a6, .Lsub_yexpdiff
  301. addi a8, a8, 1
  302. j .Lsub_yexpdiff
  303. .Lsub_returny:
  304. /* Negate and return y. */
  305. slli a7, a6, 8
  306. xor a2, a3, a7
  307. 1: leaf_return
  308. .Lsub_xsmaller:
  309. /* Same thing as the "ysmaller" code, but with x and y swapped and
  310. with y negated. */
  311. bnone a2, a6, .Lsub_xexpzero
  312. or a2, a2, a6
  313. slli a2, a2, 8
  314. srli a2, a2, 8
  315. .Lsub_xexpdiff:
  316. sub a10, a8, a7
  317. bgeui a10, 32, .Lsub_returny
  318. ssr a10
  319. movi a9, 0
  320. src a9, a2, a9
  321. srl a2, a2
  322. /* Negate y. */
  323. slli a11, a6, 8
  324. xor a3, a3, a11
  325. sub a2, a3, a2
  326. neg a9, a9
  327. addi a10, a2, -1
  328. movnez a2, a10, a9
  329. /* Check if the subtract underflowed into the exponent. */
  330. extui a10, a2, 23, 8
  331. bne a10, a8, .Lsub_borrow
  332. .Lsub_round:
  333. /* Round up if the leftover fraction is >= 1/2. */
  334. bgez a9, 1f
  335. addi a2, a2, 1
  336. /* Check if the leftover fraction is exactly 1/2. */
  337. slli a9, a9, 1
  338. beqz a9, .Lsub_exactlyhalf
  339. 1: leaf_return
  340. .Lsub_xexpzero:
  341. /* Same as "yexpzero". */
  342. beq a2, a3, .Lsub_return_zero
  343. slli a2, a2, 9
  344. srli a2, a2, 9
  345. bnone a3, a6, .Lsub_xexpdiff
  346. addi a7, a7, 1
  347. j .Lsub_xexpdiff
  348. .Lsub_return_zero:
  349. movi a2, 0
  350. leaf_return
  351. .Lsub_borrow:
  352. /* The subtraction has underflowed into the exponent field, so the
  353. value needs to be renormalized. Shift the mantissa left as
  354. needed to remove any leading zeros and adjust the exponent
  355. accordingly. If the exponent is not large enough to remove
  356. all the leading zeros, the result will be a subnormal value. */
  357. slli a8, a2, 9
  358. beqz a8, .Lsub_xzero
  359. do_nsau a6, a8, a7, a11
  360. srli a8, a8, 9
  361. bge a6, a10, .Lsub_subnormal
  362. addi a6, a6, 1
  363. .Lsub_normalize_shift:
  364. /* Shift the mantissa (a8/a9) left by a6. */
  365. ssl a6
  366. src a8, a8, a9
  367. sll a9, a9
  368. /* Combine the shifted mantissa with the sign and exponent,
  369. decrementing the exponent by a6. (The exponent has already
  370. been decremented by one due to the borrow from the subtraction,
  371. but adding the mantissa will increment the exponent by one.) */
  372. srli a2, a2, 23
  373. sub a2, a2, a6
  374. slli a2, a2, 23
  375. add a2, a2, a8
  376. j .Lsub_round
  377. .Lsub_exactlyhalf:
  378. /* Round down to the nearest even value. */
  379. srli a2, a2, 1
  380. slli a2, a2, 1
  381. leaf_return
  382. .Lsub_xzero:
  383. /* If there was a borrow from the exponent, and the mantissa and
  384. guard digits are all zero, then the inputs were equal and the
  385. result should be zero. */
  386. beqz a9, .Lsub_return_zero
  387. /* Only the guard digit is nonzero. Shift by min(24, a10). */
  388. addi a11, a10, -24
  389. movi a6, 24
  390. movltz a6, a10, a11
  391. j .Lsub_normalize_shift
  392. .Lsub_subnormal:
  393. /* The exponent is too small to shift away all the leading zeros.
  394. Set a6 to the current exponent (which has already been
  395. decremented by the borrow) so that the exponent of the result
  396. will be zero. Do not add 1 to a6 in this case, because: (1)
  397. adding the mantissa will not increment the exponent, so there is
  398. no need to subtract anything extra from the exponent to
  399. compensate, and (2) the effective exponent of a subnormal is 1
  400. not 0 so the shift amount must be 1 smaller than normal. */
  401. mov a6, a10
  402. j .Lsub_normalize_shift
  403. #endif /* L_addsubsf3 */
  404. #ifdef L_mulsf3
  405. /* Multiplication */
  406. #if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
  407. #define XCHAL_NO_MUL 1
  408. #endif
  409. .literal_position
  410. __mulsf3_aux:
  411. /* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
  412. (This code is placed before the start of the function just to
  413. keep it in range of the limited branch displacements.) */
  414. .Lmul_xexpzero:
  415. /* Clear the sign bit of x. */
  416. slli a2, a2, 1
  417. srli a2, a2, 1
  418. /* If x is zero, return zero. */
  419. beqz a2, .Lmul_return_zero
  420. /* Normalize x. Adjust the exponent in a8. */
  421. do_nsau a10, a2, a11, a12
  422. addi a10, a10, -8
  423. ssl a10
  424. sll a2, a2
  425. movi a8, 1
  426. sub a8, a8, a10
  427. j .Lmul_xnormalized
  428. .Lmul_yexpzero:
  429. /* Clear the sign bit of y. */
  430. slli a3, a3, 1
  431. srli a3, a3, 1
  432. /* If y is zero, return zero. */
  433. beqz a3, .Lmul_return_zero
  434. /* Normalize y. Adjust the exponent in a9. */
  435. do_nsau a10, a3, a11, a12
  436. addi a10, a10, -8
  437. ssl a10
  438. sll a3, a3
  439. movi a9, 1
  440. sub a9, a9, a10
  441. j .Lmul_ynormalized
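/* Both of the paths above normalize a subnormal input the same way: count
   the leading zeros, shift the fraction up until its leading 1 reaches the
   implicit-bit position (bit 23), and start from the subnormal's effective
   exponent of 1. A C sketch using __builtin_clz in place of do_nsau
   (illustrative, not part of this file):

   #include <stdint.h>

   static void normalize_subnormal(uint32_t frac,   // nonzero, bits 0..22
                                   uint32_t *mant, int *exp)
   {
       int shift = __builtin_clz(frac) - 8;  // leading 1 sits 'shift' bits below bit 23
       *mant = frac << shift;                // leading 1 now at bit 23
       *exp  = 1 - shift;                    // zero or negative
   }
*/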
  442. .Lmul_return_zero:
  443. /* Return zero with the appropriate sign bit. */
  444. srli a2, a7, 31
  445. slli a2, a2, 31
  446. j .Lmul_done
  447. .Lmul_xnan_or_inf:
  448. /* If y is zero, return NaN. */
  449. slli a8, a3, 1
  450. beqz a8, .Lmul_return_nan
  451. /* If y is NaN, return y. */
  452. bnall a3, a6, .Lmul_returnx
  453. slli a8, a3, 9
  454. beqz a8, .Lmul_returnx
  455. .Lmul_returny:
  456. mov a2, a3
  457. .Lmul_returnx:
  458. slli a8, a2, 9
  459. bnez a8, .Lmul_return_nan
  460. /* Set the sign bit and return. */
  461. extui a7, a7, 31, 1
  462. slli a2, a2, 1
  463. ssai 1
  464. src a2, a7, a2
  465. j .Lmul_done
  466. .Lmul_ynan_or_inf:
  467. /* If x is zero, return NaN. */
  468. slli a8, a2, 1
  469. bnez a8, .Lmul_returny
  470. mov a2, a3
  471. .Lmul_return_nan:
  472. movi a4, 0x400000 /* make it a quiet NaN */
  473. or a2, a2, a4
  474. j .Lmul_done
  475. .align 4
  476. .global __mulsf3
  477. .type __mulsf3, @function
  478. __mulsf3:
  479. #if __XTENSA_CALL0_ABI__
  480. leaf_entry sp, 32
  481. addi sp, sp, -32
  482. s32i a12, sp, 16
  483. s32i a13, sp, 20
  484. s32i a14, sp, 24
  485. s32i a15, sp, 28
  486. #elif XCHAL_NO_MUL
  487. /* This is not really a leaf function; allocate enough stack space
  488. to allow CALL12s to a helper function. */
  489. leaf_entry sp, 64
  490. #else
  491. leaf_entry sp, 32
  492. #endif
  493. movi a6, 0x7f800000
  494. /* Get the sign of the result. */
  495. xor a7, a2, a3
  496. /* Check for NaN and infinity. */
  497. ball a2, a6, .Lmul_xnan_or_inf
  498. ball a3, a6, .Lmul_ynan_or_inf
  499. /* Extract the exponents. */
  500. extui a8, a2, 23, 8
  501. extui a9, a3, 23, 8
  502. beqz a8, .Lmul_xexpzero
  503. .Lmul_xnormalized:
  504. beqz a9, .Lmul_yexpzero
  505. .Lmul_ynormalized:
  506. /* Add the exponents. */
  507. add a8, a8, a9
  508. /* Replace sign/exponent fields with explicit "1.0". */
  509. movi a10, 0xffffff
  510. or a2, a2, a6
  511. and a2, a2, a10
  512. or a3, a3, a6
  513. and a3, a3, a10
  514. /* Multiply 32x32 to 64 bits. The result ends up in a2/a6. */
  515. #if XCHAL_HAVE_MUL32_HIGH
  516. mull a6, a2, a3
  517. muluh a2, a2, a3
  518. #else
  519. /* Break the inputs into 16-bit chunks and compute 4 32-bit partial
  520. products. These partial products are:
  521. 0 xl * yl
  522. 1 xl * yh
  523. 2 xh * yl
  524. 3 xh * yh
  525. If using the Mul16 or Mul32 multiplier options, these input
  526. chunks must be stored in separate registers. For Mac16, the
  527. UMUL.AA.* opcodes can specify that the inputs come from either
  528. half of the registers, so there is no need to shift them out
  529. ahead of time. If there is no multiply hardware, the 16-bit
  530. chunks can be extracted when setting up the arguments to the
  531. separate multiply function. */
  532. #if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
  533. /* Calling a separate multiply function will clobber a0 and requires
  534. use of a8 as a temporary, so save those values now. (The function
  535. uses a custom ABI so nothing else needs to be saved.) */
  536. s32i a0, sp, 0
  537. s32i a8, sp, 4
  538. #endif
  539. #if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32
  540. #define a2h a4
  541. #define a3h a5
  542. /* Get the high halves of the inputs into registers. */
  543. srli a2h, a2, 16
  544. srli a3h, a3, 16
  545. #define a2l a2
  546. #define a3l a3
  547. #if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
  548. /* Clear the high halves of the inputs. This does not matter
  549. for MUL16 because the high bits are ignored. */
  550. extui a2, a2, 0, 16
  551. extui a3, a3, 0, 16
  552. #endif
  553. #endif /* MUL16 || MUL32 */
  554. #if XCHAL_HAVE_MUL16
  555. #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
  556. mul16u dst, xreg ## xhalf, yreg ## yhalf
  557. #elif XCHAL_HAVE_MUL32
  558. #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
  559. mull dst, xreg ## xhalf, yreg ## yhalf
  560. #elif XCHAL_HAVE_MAC16
  561. /* The preprocessor insists on inserting a space when concatenating after
  562. a period in the definition of do_mul below. These macros are a workaround
  563. using underscores instead of periods when doing the concatenation. */
  564. #define umul_aa_ll umul.aa.ll
  565. #define umul_aa_lh umul.aa.lh
  566. #define umul_aa_hl umul.aa.hl
  567. #define umul_aa_hh umul.aa.hh
  568. #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
  569. umul_aa_ ## xhalf ## yhalf xreg, yreg; \
  570. rsr dst, ACCLO
  571. #else /* no multiply hardware */
  572. #define set_arg_l(dst, src) \
  573. extui dst, src, 0, 16
  574. #define set_arg_h(dst, src) \
  575. srli dst, src, 16
  576. #if __XTENSA_CALL0_ABI__
  577. #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
  578. set_arg_ ## xhalf (a13, xreg); \
  579. set_arg_ ## yhalf (a14, yreg); \
  580. call0 .Lmul_mulsi3; \
  581. mov dst, a12
  582. #else
  583. #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
  584. set_arg_ ## xhalf (a14, xreg); \
  585. set_arg_ ## yhalf (a15, yreg); \
  586. call12 .Lmul_mulsi3; \
  587. mov dst, a14
  588. #endif /* __XTENSA_CALL0_ABI__ */
  589. #endif /* no multiply hardware */
  590. /* Add pp1 and pp2 into a6 with carry-out in a9. */
  591. do_mul(a6, a2, l, a3, h) /* pp 1 */
  592. do_mul(a11, a2, h, a3, l) /* pp 2 */
  593. movi a9, 0
  594. add a6, a6, a11
  595. bgeu a6, a11, 1f
  596. addi a9, a9, 1
  597. 1:
  598. /* Shift the high half of a9/a6 into position in a9. Note that
  599. this value can be safely incremented without any carry-outs. */
  600. ssai 16
  601. src a9, a9, a6
  602. /* Compute the low word into a6. */
  603. do_mul(a11, a2, l, a3, l) /* pp 0 */
  604. sll a6, a6
  605. add a6, a6, a11
  606. bgeu a6, a11, 1f
  607. addi a9, a9, 1
  608. 1:
  609. /* Compute the high word into a2. */
  610. do_mul(a2, a2, h, a3, h) /* pp 3 */
  611. add a2, a2, a9
  612. #if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
  613. /* Restore values saved on the stack during the multiplication. */
  614. l32i a0, sp, 0
  615. l32i a8, sp, 4
  616. #endif
  617. #endif /* ! XCHAL_HAVE_MUL32_HIGH */
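/* The partial-product scheme above computes a 32x32->64 multiply. A C sketch
   of the same decomposition, written with 64-bit arithmetic for clarity (the
   assembly keeps everything in 32-bit registers and tracks the carries
   explicitly):

   #include <stdint.h>

   static void mul32x32(uint32_t x, uint32_t y, uint32_t *hi, uint32_t *lo)
   {
       uint32_t xl = x & 0xffff, xh = x >> 16;
       uint32_t yl = y & 0xffff, yh = y >> 16;
       uint64_t pp0 = (uint64_t)xl * yl;                // bits  0..31
       uint64_t pp1 = (uint64_t)xl * yh;                // bits 16..47
       uint64_t pp2 = (uint64_t)xh * yl;                // bits 16..47
       uint64_t pp3 = (uint64_t)xh * yh;                // bits 32..63
       uint64_t r = pp0 + (pp1 << 16) + (pp2 << 16) + (pp3 << 32);
       *hi = (uint32_t)(r >> 32);
       *lo = (uint32_t)r;
   }
*/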
  618. /* Shift left by 9 bits, unless there was a carry-out from the
  619. multiply, in which case, shift by 8 bits and increment the
  620. exponent. */
  621. movi a4, 9
  622. srli a5, a2, 24 - 9
  623. beqz a5, 1f
  624. addi a4, a4, -1
  625. addi a8, a8, 1
  626. 1: ssl a4
  627. src a2, a2, a6
  628. sll a6, a6
  629. /* Subtract the extra bias from the exponent sum (plus one to account
  630. for the explicit "1.0" of the mantissa that will be added to the
  631. exponent in the final result). */
  632. movi a4, 0x80
  633. sub a8, a8, a4
  634. /* Check for over/underflow. The value in a8 is one less than the
  635. final exponent, so values in the range 0..fd are OK here. */
  636. movi a4, 0xfe
  637. bgeu a8, a4, .Lmul_overflow
  638. .Lmul_round:
  639. /* Round. */
  640. bgez a6, .Lmul_rounded
  641. addi a2, a2, 1
  642. slli a6, a6, 1
  643. beqz a6, .Lmul_exactlyhalf
  644. .Lmul_rounded:
  645. /* Add the exponent to the mantissa. */
  646. slli a8, a8, 23
  647. add a2, a2, a8
  648. .Lmul_addsign:
  649. /* Add the sign bit. */
  650. srli a7, a7, 31
  651. slli a7, a7, 31
  652. or a2, a2, a7
  653. .Lmul_done:
  654. #if __XTENSA_CALL0_ABI__
  655. l32i a12, sp, 16
  656. l32i a13, sp, 20
  657. l32i a14, sp, 24
  658. l32i a15, sp, 28
  659. addi sp, sp, 32
  660. #endif
  661. leaf_return
  662. .Lmul_exactlyhalf:
  663. /* Round down to the nearest even value. */
  664. srli a2, a2, 1
  665. slli a2, a2, 1
  666. j .Lmul_rounded
  667. .Lmul_overflow:
  668. bltz a8, .Lmul_underflow
  669. /* Return +/- Infinity. */
  670. movi a8, 0xff
  671. slli a2, a8, 23
  672. j .Lmul_addsign
  673. .Lmul_underflow:
  674. /* Create a subnormal value, where the exponent field contains zero,
  675. but the effective exponent is 1. The value of a8 is one less than
  676. the actual exponent, so just negate it to get the shift amount. */
  677. neg a8, a8
  678. mov a9, a6
  679. ssr a8
  680. bgeui a8, 32, .Lmul_flush_to_zero
  681. /* Shift a2 right. Any bits that are shifted out of a2 are saved
  682. in a6 (combined with the shifted-out bits currently in a6) for
  683. rounding the result. */
  684. sll a6, a2
  685. srl a2, a2
  686. /* Set the exponent to zero. */
  687. movi a8, 0
  688. /* Pack any nonzero bits shifted out into a6. */
  689. beqz a9, .Lmul_round
  690. movi a9, 1
  691. or a6, a6, a9
  692. j .Lmul_round
  693. .Lmul_flush_to_zero:
  694. /* Return zero with the appropriate sign bit. */
  695. srli a2, a7, 31
  696. slli a2, a2, 31
  697. j .Lmul_done
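/* For reference, the exponent bookkeeping above in C form (a sketch with
   illustrative names): the value kept in a8 is one less than the final
   biased exponent, because packing the mantissa with its explicit 1.0 in
   bit 23 bumps the exponent field by one.

   static int mul_exponent_in_range(int ex, int ey, int *work)
   {
       *work = ex + ey - 0x80;               // (ex + ey - 127) - 1
       return *work >= 0 && *work <= 0xfd;   // else overflow or underflow
   }
*/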
  698. #if XCHAL_NO_MUL
  699. /* For Xtensa processors with no multiply hardware, this simplified
  700. version of _mulsi3 is used for multiplying 16-bit chunks of
  701. the floating-point mantissas. When using CALL0, this function
  702. uses a custom ABI: the inputs are passed in a13 and a14, the
  703. result is returned in a12, and a8 and a15 are clobbered. */
  704. .align 4
  705. .Lmul_mulsi3:
  706. leaf_entry sp, 16
  707. .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
  708. movi \dst, 0
  709. 1: add \tmp1, \src2, \dst
  710. extui \tmp2, \src1, 0, 1
  711. movnez \dst, \tmp1, \tmp2
  712. do_addx2 \tmp1, \src2, \dst, \tmp1
  713. extui \tmp2, \src1, 1, 1
  714. movnez \dst, \tmp1, \tmp2
  715. do_addx4 \tmp1, \src2, \dst, \tmp1
  716. extui \tmp2, \src1, 2, 1
  717. movnez \dst, \tmp1, \tmp2
  718. do_addx8 \tmp1, \src2, \dst, \tmp1
  719. extui \tmp2, \src1, 3, 1
  720. movnez \dst, \tmp1, \tmp2
  721. srli \src1, \src1, 4
  722. slli \src2, \src2, 4
  723. bnez \src1, 1b
  724. .endm
  725. #if __XTENSA_CALL0_ABI__
  726. mul_mulsi3_body a12, a13, a14, a15, a8
  727. #else
  728. /* The result will be written into a2, so save that argument in a4. */
  729. mov a4, a2
  730. mul_mulsi3_body a2, a4, a3, a5, a6
  731. #endif
  732. leaf_return
  733. #endif /* XCHAL_NO_MUL */
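/* C sketch of .Lmul_mulsi3/mul_mulsi3_body (illustrative only): a
   shift-and-add multiply that consumes four multiplier bits per iteration,
   using the conditional adds of src2, 2*src2, 4*src2 and 8*src2 that
   addx2/addx4/addx8 provide above.

   #include <stdint.h>

   static uint32_t mulsi3(uint32_t src1, uint32_t src2)
   {
       uint32_t dst = 0;
       while (src1 != 0) {
           if (src1 & 1) dst += src2;
           if (src1 & 2) dst += src2 << 1;
           if (src1 & 4) dst += src2 << 2;
           if (src1 & 8) dst += src2 << 3;
           src1 >>= 4;
           src2 <<= 4;
       }
       return dst;
   }
*/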
  734. #endif /* L_mulsf3 */
  735. #ifdef L_divsf3
  736. /* Division */
  737. #if XCHAL_HAVE_FP_DIV
  738. .align 4
  739. .global __divsf3
  740. .type __divsf3, @function
  741. __divsf3:
  742. leaf_entry sp, 16
  743. wfr f1, a2 /* dividend */
  744. wfr f2, a3 /* divisor */
  745. div0.s f3, f2
  746. nexp01.s f4, f2
  747. const.s f5, 1
  748. maddn.s f5, f4, f3
  749. mov.s f6, f3
  750. mov.s f7, f2
  751. nexp01.s f2, f1
  752. maddn.s f6, f5, f6
  753. const.s f5, 1
  754. const.s f0, 0
  755. neg.s f8, f2
  756. maddn.s f5, f4, f6
  757. maddn.s f0, f8, f3
  758. mkdadj.s f7, f1
  759. maddn.s f6, f5, f6
  760. maddn.s f8, f4, f0
  761. const.s f3, 1
  762. maddn.s f3, f4, f6
  763. maddn.s f0, f8, f6
  764. neg.s f2, f2
  765. maddn.s f6, f3, f6
  766. maddn.s f2, f4, f0
  767. addexpm.s f0, f7
  768. addexp.s f6, f7
  769. divn.s f0, f2, f6
  770. rfr a2, f0
  771. leaf_return
  772. #else
  773. .literal_position
  774. __divsf3_aux:
  775. /* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
  776. (This code is placed before the start of the function just to
  777. keep it in range of the limited branch displacements.) */
  778. .Ldiv_yexpzero:
  779. /* Clear the sign bit of y. */
  780. slli a3, a3, 1
  781. srli a3, a3, 1
  782. /* Check for division by zero. */
  783. beqz a3, .Ldiv_yzero
  784. /* Normalize y. Adjust the exponent in a9. */
  785. do_nsau a10, a3, a4, a5
  786. addi a10, a10, -8
  787. ssl a10
  788. sll a3, a3
  789. movi a9, 1
  790. sub a9, a9, a10
  791. j .Ldiv_ynormalized
  792. .Ldiv_yzero:
  793. /* y is zero. Return NaN if x is also zero; otherwise, infinity. */
  794. slli a4, a2, 1
  795. srli a4, a4, 1
  796. srli a2, a7, 31
  797. slli a2, a2, 31
  798. or a2, a2, a6
  799. bnez a4, 1f
  800. movi a4, 0x400000 /* make it a quiet NaN */
  801. or a2, a2, a4
  802. 1: leaf_return
  803. .Ldiv_xexpzero:
  804. /* Clear the sign bit of x. */
  805. slli a2, a2, 1
  806. srli a2, a2, 1
  807. /* If x is zero, return zero. */
  808. beqz a2, .Ldiv_return_zero
  809. /* Normalize x. Adjust the exponent in a8. */
  810. do_nsau a10, a2, a4, a5
  811. addi a10, a10, -8
  812. ssl a10
  813. sll a2, a2
  814. movi a8, 1
  815. sub a8, a8, a10
  816. j .Ldiv_xnormalized
  817. .Ldiv_return_zero:
  818. /* Return zero with the appropriate sign bit. */
  819. srli a2, a7, 31
  820. slli a2, a2, 31
  821. leaf_return
  822. .Ldiv_xnan_or_inf:
  823. /* Set the sign bit of the result. */
  824. srli a7, a3, 31
  825. slli a7, a7, 31
  826. xor a2, a2, a7
  827. /* If y is NaN or Inf, return NaN. */
  828. ball a3, a6, .Ldiv_return_nan
  829. slli a7, a2, 9
  830. bnez a7, .Ldiv_return_nan
  831. leaf_return
  832. .Ldiv_ynan_or_inf:
  833. /* If y is Infinity, return zero. */
  834. slli a8, a3, 9
  835. beqz a8, .Ldiv_return_zero
  836. /* y is NaN; return it. */
  837. mov a2, a3
  838. .Ldiv_return_nan:
  839. movi a4, 0x400000 /* make it a quiet NaN */
  840. or a2, a2, a4
  841. leaf_return
  842. .align 4
  843. .global __divsf3
  844. .type __divsf3, @function
  845. __divsf3:
  846. leaf_entry sp, 16
  847. movi a6, 0x7f800000
  848. /* Get the sign of the result. */
  849. xor a7, a2, a3
  850. /* Check for NaN and infinity. */
  851. ball a2, a6, .Ldiv_xnan_or_inf
  852. ball a3, a6, .Ldiv_ynan_or_inf
  853. /* Extract the exponents. */
  854. extui a8, a2, 23, 8
  855. extui a9, a3, 23, 8
  856. beqz a9, .Ldiv_yexpzero
  857. .Ldiv_ynormalized:
  858. beqz a8, .Ldiv_xexpzero
  859. .Ldiv_xnormalized:
  860. /* Subtract the exponents. */
  861. sub a8, a8, a9
  862. /* Replace sign/exponent fields with explicit "1.0". */
  863. movi a10, 0xffffff
  864. or a2, a2, a6
  865. and a2, a2, a10
  866. or a3, a3, a6
  867. and a3, a3, a10
  868. /* The first digit of the mantissa division must be a one.
  869. Shift x (and adjust the exponent) as needed to make this true. */
  870. bltu a3, a2, 1f
  871. slli a2, a2, 1
  872. addi a8, a8, -1
  873. 1:
  874. /* Do the first subtraction and shift. */
  875. sub a2, a2, a3
  876. slli a2, a2, 1
  877. /* Put the quotient into a10. */
  878. movi a10, 1
  879. /* Divide one bit at a time for 23 bits. */
  880. movi a9, 23
  881. #if XCHAL_HAVE_LOOPS
  882. loop a9, .Ldiv_loopend
  883. #endif
  884. .Ldiv_loop:
  885. /* Shift the quotient << 1. */
  886. slli a10, a10, 1
  887. /* Is this digit a 0 or 1? */
  888. bltu a2, a3, 1f
  889. /* Output a 1 and subtract. */
  890. addi a10, a10, 1
  891. sub a2, a2, a3
  892. /* Shift the dividend << 1. */
  893. 1: slli a2, a2, 1
  894. #if !XCHAL_HAVE_LOOPS
  895. addi a9, a9, -1
  896. bnez a9, .Ldiv_loop
  897. #endif
  898. .Ldiv_loopend:
  899. /* Add the exponent bias (less one to account for the explicit "1.0"
  900. of the mantissa that will be added to the exponent in the final
  901. result). */
  902. addi a8, a8, 0x7e
  903. /* Check for over/underflow. The value in a8 is one less than the
  904. final exponent, so values in the range 0..fd are OK here. */
  905. movi a4, 0xfe
  906. bgeu a8, a4, .Ldiv_overflow
  907. .Ldiv_round:
  908. /* Round. The remainder (<< 1) is in a2. */
  909. bltu a2, a3, .Ldiv_rounded
  910. addi a10, a10, 1
  911. beq a2, a3, .Ldiv_exactlyhalf
  912. .Ldiv_rounded:
  913. /* Add the exponent to the mantissa. */
  914. slli a8, a8, 23
  915. add a2, a10, a8
  916. .Ldiv_addsign:
  917. /* Add the sign bit. */
  918. srli a7, a7, 31
  919. slli a7, a7, 31
  920. or a2, a2, a7
  921. leaf_return
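/* C sketch of the mantissa division above (illustrative names): both
   mantissas carry their explicit 1.0 in bit 23, x has been pre-shifted so
   the first quotient digit is 1, and 23 further digits are produced by the
   loop. The remainder, already shifted left once, is what .Ldiv_round
   compares against the divisor.

   #include <stdint.h>

   static uint32_t div_mantissas(uint32_t x, uint32_t y, uint32_t *rem2)
   {
       uint32_t q = 1;
       x = (x - y) << 1;                 // first digit is known to be 1
       for (int i = 0; i < 23; i++) {
           q <<= 1;
           if (x >= y) {                 // next quotient digit is 1
               q |= 1;
               x -= y;
           }
           x <<= 1;
       }
       *rem2 = x;                        // remainder << 1, used for rounding
       return q;                         // 24-bit quotient with bit 23 set
   }
*/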
  922. .Ldiv_overflow:
  923. bltz a8, .Ldiv_underflow
  924. /* Return +/- Infinity. */
  925. addi a8, a4, 1 /* 0xff */
  926. slli a2, a8, 23
  927. j .Ldiv_addsign
  928. .Ldiv_exactlyhalf:
  929. /* Remainder is exactly half the divisor. Round even. */
  930. srli a10, a10, 1
  931. slli a10, a10, 1
  932. j .Ldiv_rounded
  933. .Ldiv_underflow:
  934. /* Create a subnormal value, where the exponent field contains zero,
  935. but the effective exponent is 1. The value of a8 is one less than
  936. the actual exponent, so just negate it to get the shift amount. */
  937. neg a8, a8
  938. ssr a8
  939. bgeui a8, 32, .Ldiv_flush_to_zero
  940. /* Shift a10 right. Any bits that are shifted out of a10 are
  941. saved in a6 for rounding the result. */
  942. sll a6, a10
  943. srl a10, a10
  944. /* Set the exponent to zero. */
  945. movi a8, 0
  946. /* Pack any nonzero remainder (in a2) into a6. */
  947. beqz a2, 1f
  948. movi a9, 1
  949. or a6, a6, a9
  950. /* Round a10 based on the bits shifted out into a6. */
  951. 1: bgez a6, .Ldiv_rounded
  952. addi a10, a10, 1
  953. slli a6, a6, 1
  954. bnez a6, .Ldiv_rounded
  955. srli a10, a10, 1
  956. slli a10, a10, 1
  957. j .Ldiv_rounded
  958. .Ldiv_flush_to_zero:
  959. /* Return zero with the appropriate sign bit. */
  960. srli a2, a7, 31
  961. slli a2, a2, 31
  962. leaf_return
  963. #endif /* XCHAL_HAVE_FP_DIV */
  964. #endif /* L_divsf3 */
  965. #ifdef L_cmpsf2
  966. /* Equal and Not Equal */
  967. .align 4
  968. .global __eqsf2
  969. .global __nesf2
  970. .set __nesf2, __eqsf2
  971. .type __eqsf2, @function
  972. __eqsf2:
  973. leaf_entry sp, 16
  974. bne a2, a3, 4f
  975. /* The values are equal but NaN != NaN. Check the exponent. */
  976. movi a6, 0x7f800000
  977. ball a2, a6, 3f
  978. /* Equal. */
  979. movi a2, 0
  980. leaf_return
  981. /* Not equal. */
  982. 2: movi a2, 1
  983. leaf_return
  984. /* Check if the mantissas are nonzero. */
  985. 3: slli a7, a2, 9
  986. j 5f
  987. /* Check if x and y are zero with different signs. */
  988. 4: or a7, a2, a3
  989. slli a7, a7, 1
  990. /* Equal if a7 == 0, where a7 is either abs(x | y) or the mantissa
  991. or x when exponent(x) = 0x7f8 and x == y. */
  992. 5: movi a2, 0
  993. movi a3, 1
  994. movnez a2, a3, a7
  995. leaf_return
  996. /* Greater Than */
  997. .align 4
  998. .global __gtsf2
  999. .type __gtsf2, @function
  1000. __gtsf2:
  1001. leaf_entry sp, 16
  1002. movi a6, 0x7f800000
  1003. ball a2, a6, 2f
  1004. 1: bnall a3, a6, .Lle_cmp
  1005. /* Check if y is a NaN. */
  1006. slli a7, a3, 9
  1007. beqz a7, .Lle_cmp
  1008. movi a2, 0
  1009. leaf_return
  1010. /* Check if x is a NaN. */
  1011. 2: slli a7, a2, 9
  1012. beqz a7, 1b
  1013. movi a2, 0
  1014. leaf_return
  1015. /* Less Than or Equal */
  1016. .align 4
  1017. .global __lesf2
  1018. .type __lesf2, @function
  1019. __lesf2:
  1020. leaf_entry sp, 16
  1021. movi a6, 0x7f800000
  1022. ball a2, a6, 2f
  1023. 1: bnall a3, a6, .Lle_cmp
  1024. /* Check if y is a NaN. */
  1025. slli a7, a3, 9
  1026. beqz a7, .Lle_cmp
  1027. movi a2, 1
  1028. leaf_return
  1029. /* Check if x is a NaN. */
  1030. 2: slli a7, a2, 9
  1031. beqz a7, 1b
  1032. movi a2, 1
  1033. leaf_return
  1034. .Lle_cmp:
  1035. /* Check if x and y have different signs. */
  1036. xor a7, a2, a3
  1037. bltz a7, .Lle_diff_signs
  1038. /* Check if x is negative. */
  1039. bltz a2, .Lle_xneg
  1040. /* Check if x <= y. */
  1041. bltu a3, a2, 5f
  1042. 4: movi a2, 0
  1043. leaf_return
  1044. .Lle_xneg:
  1045. /* Check if y <= x. */
  1046. bgeu a2, a3, 4b
  1047. 5: movi a2, 1
  1048. leaf_return
  1049. .Lle_diff_signs:
  1050. bltz a2, 4b
  1051. /* Check if both x and y are zero. */
  1052. or a7, a2, a3
  1053. slli a7, a7, 1
  1054. movi a2, 1
  1055. movi a3, 0
  1056. moveqz a2, a3, a7
  1057. leaf_return
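/* The ordering test reached through .Lle_cmp (NaNs have been filtered out
   before this point) is the usual sign-magnitude comparison of the raw bit
   patterns, with +0.0 and -0.0 comparing equal. A C sketch (illustrative;
   returns nonzero when a < b):

   #include <stdint.h>

   static int float_bits_lt(uint32_t a, uint32_t b)
   {
       if ((int32_t)(a ^ b) < 0)                          // signs differ
           return (int32_t)a < 0 && ((a | b) << 1) != 0;  // a negative, not both zero
       if ((int32_t)a < 0)                                // both negative: order reverses
           return b < a;
       return a < b;                                      // both non-negative
   }
*/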
  1058. /* Greater Than or Equal */
  1059. .align 4
  1060. .global __gesf2
  1061. .type __gesf2, @function
  1062. __gesf2:
  1063. leaf_entry sp, 16
  1064. movi a6, 0x7f800000
  1065. ball a2, a6, 2f
  1066. 1: bnall a3, a6, .Llt_cmp
  1067. /* Check if y is a NaN. */
  1068. slli a7, a3, 9
  1069. beqz a7, .Llt_cmp
  1070. movi a2, -1
  1071. leaf_return
  1072. /* Check if x is a NaN. */
  1073. 2: slli a7, a2, 9
  1074. beqz a7, 1b
  1075. movi a2, -1
  1076. leaf_return
  1077. /* Less Than */
  1078. .align 4
  1079. .global __ltsf2
  1080. .type __ltsf2, @function
  1081. __ltsf2:
  1082. leaf_entry sp, 16
  1083. movi a6, 0x7f800000
  1084. ball a2, a6, 2f
  1085. 1: bnall a3, a6, .Llt_cmp
  1086. /* Check if y is a NaN. */
  1087. slli a7, a3, 9
  1088. beqz a7, .Llt_cmp
  1089. movi a2, 0
  1090. leaf_return
  1091. /* Check if x is a NaN. */
  1092. 2: slli a7, a2, 9
  1093. beqz a7, 1b
  1094. movi a2, 0
  1095. leaf_return
  1096. .Llt_cmp:
  1097. /* Check if x and y have different signs. */
  1098. xor a7, a2, a3
  1099. bltz a7, .Llt_diff_signs
  1100. /* Check if x is negative. */
  1101. bltz a2, .Llt_xneg
  1102. /* Check if x < y. */
  1103. bgeu a2, a3, 5f
  1104. 4: movi a2, -1
  1105. leaf_return
  1106. .Llt_xneg:
  1107. /* Check if y < x. */
  1108. bltu a3, a2, 4b
  1109. 5: movi a2, 0
  1110. leaf_return
  1111. .Llt_diff_signs:
  1112. bgez a2, 5b
  1113. /* Check if both x and y are nonzero. */
  1114. or a7, a2, a3
  1115. slli a7, a7, 1
  1116. movi a2, 0
  1117. movi a3, -1
  1118. movnez a2, a3, a7
  1119. leaf_return
  1120. /* Unordered */
  1121. .align 4
  1122. .global __unordsf2
  1123. .type __unordsf2, @function
  1124. __unordsf2:
  1125. leaf_entry sp, 16
  1126. movi a6, 0x7f800000
  1127. ball a2, a6, 3f
  1128. 1: ball a3, a6, 4f
  1129. 2: movi a2, 0
  1130. leaf_return
  1131. 3: slli a7, a2, 9
  1132. beqz a7, 1b
  1133. movi a2, 1
  1134. leaf_return
  1135. 4: slli a7, a3, 9
  1136. beqz a7, 2b
  1137. movi a2, 1
  1138. leaf_return
  1139. #endif /* L_cmpsf2 */
  1140. #ifdef L_fixsfsi
  1141. .align 4
  1142. .global __fixsfsi
  1143. .type __fixsfsi, @function
  1144. __fixsfsi:
  1145. leaf_entry sp, 16
  1146. /* Check for NaN and Infinity. */
  1147. movi a6, 0x7f800000
  1148. ball a2, a6, .Lfixsfsi_nan_or_inf
  1149. /* Extract the exponent and check if 0 < (exp - 0x7e) < 32. */
  1150. extui a4, a2, 23, 8
  1151. addi a4, a4, -0x7e
  1152. bgei a4, 32, .Lfixsfsi_maxint
  1153. blti a4, 1, .Lfixsfsi_zero
  1154. /* Add explicit "1.0" and shift << 8. */
  1155. or a7, a2, a6
  1156. slli a5, a7, 8
  1157. /* Shift back to the right, based on the exponent. */
  1158. ssl a4 /* shift by 32 - a4 */
  1159. srl a5, a5
  1160. /* Negate the result if sign != 0. */
  1161. neg a2, a5
  1162. movgez a2, a5, a7
  1163. leaf_return
  1164. .Lfixsfsi_nan_or_inf:
  1165. /* Handle Infinity and NaN. */
  1166. slli a4, a2, 9
  1167. beqz a4, .Lfixsfsi_maxint
  1168. /* Translate NaN to +maxint. */
  1169. movi a2, 0
  1170. .Lfixsfsi_maxint:
  1171. slli a4, a6, 8 /* 0x80000000 */
  1172. addi a5, a4, -1 /* 0x7fffffff */
  1173. movgez a4, a5, a2
  1174. mov a2, a4
  1175. leaf_return
  1176. .Lfixsfsi_zero:
  1177. movi a2, 0
  1178. leaf_return
  1179. #endif /* L_fixsfsi */
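/* C sketch of the conversion strategy used by __fixsfsi (illustrative only):
   NaN becomes +maxint, Infinities and out-of-range values saturate, values
   with magnitude below 1.0 become 0, and otherwise the mantissa with its
   implicit 1.0 at bit 31 is shifted right according to the exponent.

   #include <stdint.h>

   static int32_t fixsfsi(uint32_t bits)
   {
       uint32_t exp = (bits >> 23) & 0xff;
       if (exp == 0xff)                              // NaN or Infinity
           return (bits << 9) != 0 ? INT32_MAX
                  : ((int32_t)bits < 0 ? INT32_MIN : INT32_MAX);
       int shift = (int)exp - 0x7e;
       if (shift >= 32)                              // too large for int32
           return (int32_t)bits < 0 ? INT32_MIN : INT32_MAX;
       if (shift < 1)                                // |value| < 1.0
           return 0;
       uint32_t mant = (bits | 0x00800000u) << 8;    // implicit 1.0 at bit 31
       uint32_t val  = mant >> (32 - shift);         // truncate toward zero
       return (int32_t)bits < 0 ? -(int32_t)val : (int32_t)val;
   }
*/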
  1180. #ifdef L_fixsfdi
  1181. .align 4
  1182. .global __fixsfdi
  1183. .type __fixsfdi, @function
  1184. __fixsfdi:
  1185. leaf_entry sp, 16
  1186. /* Check for NaN and Infinity. */
  1187. movi a6, 0x7f800000
  1188. ball a2, a6, .Lfixsfdi_nan_or_inf
  1189. /* Extract the exponent and check if 0 < (exp - 0x7e) < 64. */
  1190. extui a4, a2, 23, 8
  1191. addi a4, a4, -0x7e
  1192. bgei a4, 64, .Lfixsfdi_maxint
  1193. blti a4, 1, .Lfixsfdi_zero
  1194. /* Add explicit "1.0" and shift << 8. */
  1195. or a7, a2, a6
  1196. slli xh, a7, 8
  1197. /* Shift back to the right, based on the exponent. */
  1198. ssl a4 /* shift by 64 - a4 */
  1199. bgei a4, 32, .Lfixsfdi_smallshift
  1200. srl xl, xh
  1201. movi xh, 0
  1202. .Lfixsfdi_shifted:
  1203. /* Negate the result if sign != 0. */
  1204. bgez a7, 1f
  1205. neg xl, xl
  1206. neg xh, xh
  1207. beqz xl, 1f
  1208. addi xh, xh, -1
  1209. 1: leaf_return
  1210. .Lfixsfdi_smallshift:
  1211. movi xl, 0
  1212. sll xl, xh
  1213. srl xh, xh
  1214. j .Lfixsfdi_shifted
  1215. .Lfixsfdi_nan_or_inf:
  1216. /* Handle Infinity and NaN. */
  1217. slli a4, a2, 9
  1218. beqz a4, .Lfixsfdi_maxint
  1219. /* Translate NaN to +maxint. */
  1220. movi a2, 0
  1221. .Lfixsfdi_maxint:
  1222. slli a7, a6, 8 /* 0x80000000 */
  1223. bgez a2, 1f
  1224. mov xh, a7
  1225. movi xl, 0
  1226. leaf_return
  1227. 1: addi xh, a7, -1 /* 0x7fffffff */
  1228. movi xl, -1
  1229. leaf_return
  1230. .Lfixsfdi_zero:
  1231. movi xh, 0
  1232. movi xl, 0
  1233. leaf_return
  1234. #endif /* L_fixsfdi */
  1235. #ifdef L_fixunssfsi
  1236. .align 4
  1237. .global __fixunssfsi
  1238. .type __fixunssfsi, @function
  1239. __fixunssfsi:
  1240. leaf_entry sp, 16
  1241. /* Check for NaN and Infinity. */
  1242. movi a6, 0x7f800000
  1243. ball a2, a6, .Lfixunssfsi_nan_or_inf
  1244. /* Extract the exponent and check if 0 <= (exp - 0x7f) < 32. */
  1245. extui a4, a2, 23, 8
  1246. addi a4, a4, -0x7f
  1247. bgei a4, 32, .Lfixunssfsi_maxint
  1248. bltz a4, .Lfixunssfsi_zero
  1249. /* Add explicit "1.0" and shift << 8. */
  1250. or a7, a2, a6
  1251. slli a5, a7, 8
  1252. /* Shift back to the right, based on the exponent. */
  1253. addi a4, a4, 1
  1254. beqi a4, 32, .Lfixunssfsi_bigexp
  1255. ssl a4 /* shift by 32 - a4 */
  1256. srl a5, a5
  1257. /* Negate the result if sign != 0. */
  1258. neg a2, a5
  1259. movgez a2, a5, a7
  1260. leaf_return
  1261. .Lfixunssfsi_nan_or_inf:
  1262. /* Handle Infinity and NaN. */
  1263. slli a4, a2, 9
  1264. beqz a4, .Lfixunssfsi_maxint
  1265. /* Translate NaN to 0xffffffff. */
  1266. movi a2, -1
  1267. leaf_return
  1268. .Lfixunssfsi_maxint:
  1269. slli a4, a6, 8 /* 0x80000000 */
  1270. movi a5, -1 /* 0xffffffff */
  1271. movgez a4, a5, a2
  1272. mov a2, a4
  1273. leaf_return
  1274. .Lfixunssfsi_zero:
  1275. movi a2, 0
  1276. leaf_return
  1277. .Lfixunssfsi_bigexp:
  1278. /* Handle unsigned maximum exponent case. */
  1279. bltz a2, 1f
  1280. mov a2, a5 /* no shift needed */
  1281. leaf_return
  1282. /* Return 0x80000000 if negative. */
  1283. 1: slli a2, a6, 8
  1284. leaf_return
  1285. #endif /* L_fixunssfsi */
  1286. #ifdef L_fixunssfdi
  1287. .align 4
  1288. .global __fixunssfdi
  1289. .type __fixunssfdi, @function
  1290. __fixunssfdi:
  1291. leaf_entry sp, 16
  1292. /* Check for NaN and Infinity. */
  1293. movi a6, 0x7f800000
  1294. ball a2, a6, .Lfixunssfdi_nan_or_inf
  1295. /* Extract the exponent and check if 0 <= (exp - 0x7f) < 64. */
  1296. extui a4, a2, 23, 8
  1297. addi a4, a4, -0x7f
  1298. bgei a4, 64, .Lfixunssfdi_maxint
  1299. bltz a4, .Lfixunssfdi_zero
  1300. /* Add explicit "1.0" and shift << 8. */
  1301. or a7, a2, a6
  1302. slli xh, a7, 8
  1303. /* Shift back to the right, based on the exponent. */
  1304. addi a4, a4, 1
  1305. beqi a4, 64, .Lfixunssfdi_bigexp
  1306. ssl a4 /* shift by 64 - a4 */
  1307. bgei a4, 32, .Lfixunssfdi_smallshift
  1308. srl xl, xh
  1309. movi xh, 0
  1310. .Lfixunssfdi_shifted:
  1311. /* Negate the result if sign != 0. */
  1312. bgez a7, 1f
  1313. neg xl, xl
  1314. neg xh, xh
  1315. beqz xl, 1f
  1316. addi xh, xh, -1
  1317. 1: leaf_return
  1318. .Lfixunssfdi_smallshift:
  1319. movi xl, 0
  1320. src xl, xh, xl
  1321. srl xh, xh
  1322. j .Lfixunssfdi_shifted
  1323. .Lfixunssfdi_nan_or_inf:
  1324. /* Handle Infinity and NaN. */
  1325. slli a4, a2, 9
  1326. beqz a4, .Lfixunssfdi_maxint
  1327. /* Translate NaN to 0xffffffff.... */
  1328. 1: movi xh, -1
  1329. movi xl, -1
  1330. leaf_return
  1331. .Lfixunssfdi_maxint:
  1332. bgez a2, 1b
  1333. 2: slli xh, a6, 8 /* 0x80000000 */
  1334. movi xl, 0
  1335. leaf_return
  1336. .Lfixunssfdi_zero:
  1337. movi xh, 0
  1338. movi xl, 0
  1339. leaf_return
  1340. .Lfixunssfdi_bigexp:
  1341. /* Handle unsigned maximum exponent case. */
  1342. bltz a7, 2b
  1343. movi xl, 0
  1344. leaf_return /* no shift needed */
  1345. #endif /* L_fixunssfdi */
  1346. #ifdef L_floatsisf
  1347. .align 4
  1348. .global __floatunsisf
  1349. .type __floatunsisf, @function
  1350. __floatunsisf:
  1351. leaf_entry sp, 16
  1352. beqz a2, .Lfloatsisf_return
  1353. /* Set the sign to zero and jump to the floatsisf code. */
  1354. movi a7, 0
  1355. j .Lfloatsisf_normalize
  1356. .align 4
  1357. .global __floatsisf
  1358. .type __floatsisf, @function
  1359. __floatsisf:
  1360. leaf_entry sp, 16
  1361. /* Check for zero. */
  1362. beqz a2, .Lfloatsisf_return
  1363. /* Save the sign. */
  1364. extui a7, a2, 31, 1
  1365. /* Get the absolute value. */
  1366. #if XCHAL_HAVE_ABS
  1367. abs a2, a2
  1368. #else
  1369. neg a4, a2
  1370. movltz a2, a4, a2
  1371. #endif
  1372. .Lfloatsisf_normalize:
  1373. /* Normalize with the first 1 bit in the msb. */
  1374. do_nsau a4, a2, a5, a6
  1375. ssl a4
  1376. sll a5, a2
  1377. /* Shift the mantissa into position, with rounding bits in a6. */
  1378. srli a2, a5, 8
  1379. slli a6, a5, (32 - 8)
  1380. /* Set the exponent. */
  1381. movi a5, 0x9d /* 0x7e + 31 */
  1382. sub a5, a5, a4
  1383. slli a5, a5, 23
  1384. add a2, a2, a5
  1385. /* Add the sign. */
  1386. slli a7, a7, 31
  1387. or a2, a2, a7
  1388. /* Round up if the leftover fraction is >= 1/2. */
  1389. bgez a6, .Lfloatsisf_return
  1390. addi a2, a2, 1 /* Overflow to the exponent is OK. */
  1391. /* Check if the leftover fraction is exactly 1/2. */
  1392. slli a6, a6, 1
  1393. beqz a6, .Lfloatsisf_exactlyhalf
  1394. .Lfloatsisf_return:
  1395. leaf_return
  1396. .Lfloatsisf_exactlyhalf:
  1397. /* Round down to the nearest even value. */
  1398. srli a2, a2, 1
  1399. slli a2, a2, 1
  1400. leaf_return
  1401. #endif /* L_floatsisf */
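/* C sketch of __floatsisf (illustrative only): normalize |n| so its leading
   1 is at bit 31, take the top 24 bits as the mantissa (implicit 1.0
   included), keep the low 8 bits for round-to-nearest-even, and derive the
   exponent from the normalization count. Adding the mantissa, whose bit 23
   is set, supplies the final +1 to the exponent field, which is why the
   constant is 0x9d rather than 0x9e.

   #include <stdint.h>

   static uint32_t floatsisf(int32_t n)
   {
       if (n == 0)
           return 0;
       uint32_t sign = n < 0 ? 0x80000000u : 0;
       uint32_t mag  = n < 0 ? 0u - (uint32_t)n : (uint32_t)n;
       int clz = __builtin_clz(mag);
       uint32_t norm = mag << clz;                     // leading 1 at bit 31
       uint32_t bits = (norm >> 8)                     // 24-bit mantissa
                     + ((uint32_t)(0x9d - clz) << 23)  // 0x7e + 31 - clz
                     + sign;
       uint32_t leftover = norm << 24;                 // guard and sticky bits
       if (leftover & 0x80000000u) {
           bits += 1;                    // carry into the exponent is fine
           if ((leftover << 1) == 0)
               bits &= ~1u;              // exactly half: round to even
       }
       return bits;
   }
*/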
  1402. #ifdef L_floatdisf
  1403. .align 4
  1404. .global __floatundisf
  1405. .type __floatundisf, @function
  1406. __floatundisf:
  1407. leaf_entry sp, 16
  1408. /* Check for zero. */
  1409. or a4, xh, xl
  1410. beqz a4, 2f
  1411. /* Set the sign to zero and jump to the floatdisf code. */
  1412. movi a7, 0
  1413. j .Lfloatdisf_normalize
  1414. .align 4
  1415. .global __floatdisf
  1416. .type __floatdisf, @function
  1417. __floatdisf:
  1418. leaf_entry sp, 16
  1419. /* Check for zero. */
  1420. or a4, xh, xl
  1421. beqz a4, 2f
  1422. /* Save the sign. */
  1423. extui a7, xh, 31, 1
  1424. /* Get the absolute value. */
  1425. bgez xh, .Lfloatdisf_normalize
  1426. neg xl, xl
  1427. neg xh, xh
  1428. beqz xl, .Lfloatdisf_normalize
  1429. addi xh, xh, -1
  1430. .Lfloatdisf_normalize:
  1431. /* Normalize with the first 1 bit in the msb of xh. */
  1432. beqz xh, .Lfloatdisf_bigshift
  1433. do_nsau a4, xh, a5, a6
  1434. ssl a4
  1435. src xh, xh, xl
  1436. sll xl, xl
  1437. .Lfloatdisf_shifted:
  1438. /* Shift the mantissa into position, with rounding bits in a6. */
  1439. ssai 8
  1440. sll a5, xl
  1441. src a6, xh, xl
  1442. srl xh, xh
  1443. beqz a5, 1f
  1444. movi a5, 1
  1445. or a6, a6, a5
  1446. 1:
  1447. /* Set the exponent. */
  1448. movi a5, 0xbd /* 0x7e + 63 */
  1449. sub a5, a5, a4
  1450. slli a5, a5, 23
  1451. add a2, xh, a5
  1452. /* Add the sign. */
  1453. slli a7, a7, 31
  1454. or a2, a2, a7
  1455. /* Round up if the leftover fraction is >= 1/2. */
  1456. bgez a6, 2f
  1457. addi a2, a2, 1 /* Overflow to the exponent is OK. */
  1458. /* Check if the leftover fraction is exactly 1/2. */
  1459. slli a6, a6, 1
  1460. beqz a6, .Lfloatdisf_exactlyhalf
  1461. 2: leaf_return
  1462. .Lfloatdisf_bigshift:
  1463. /* xh is zero. Normalize with first 1 bit of xl in the msb of xh. */
  1464. do_nsau a4, xl, a5, a6
  1465. ssl a4
  1466. sll xh, xl
  1467. movi xl, 0
  1468. addi a4, a4, 32
  1469. j .Lfloatdisf_shifted
  1470. .Lfloatdisf_exactlyhalf:
  1471. /* Round down to the nearest even value. */
  1472. srli a2, a2, 1
  1473. slli a2, a2, 1
  1474. leaf_return
  1475. #endif /* L_floatdisf */
  1476. #if XCHAL_HAVE_FP_SQRT
  1477. #ifdef L_sqrtf
  1478. /* Square root */
  1479. .align 4
  1480. .global __ieee754_sqrtf
  1481. .type __ieee754_sqrtf, @function
  1482. __ieee754_sqrtf:
  1483. leaf_entry sp, 16
  1484. wfr f1, a2
  1485. sqrt0.s f2, f1
  1486. const.s f3, 0
  1487. maddn.s f3, f2, f2
  1488. nexp01.s f4, f1
  1489. const.s f0, 3
  1490. addexp.s f4, f0
  1491. maddn.s f0, f3, f4
  1492. nexp01.s f3, f1
  1493. neg.s f5, f3
  1494. maddn.s f2, f0, f2
  1495. const.s f0, 0
  1496. const.s f6, 0
  1497. const.s f7, 0
  1498. maddn.s f0, f5, f2
  1499. maddn.s f6, f2, f4
  1500. const.s f4, 3
  1501. maddn.s f7, f4, f2
  1502. maddn.s f3, f0, f0
  1503. maddn.s f4, f6, f2
  1504. neg.s f2, f7
  1505. maddn.s f0, f3, f2
  1506. maddn.s f7, f4, f7
  1507. mksadj.s f2, f1
  1508. nexp01.s f1, f1
  1509. maddn.s f1, f0, f0
  1510. neg.s f3, f7
  1511. addexpm.s f0, f2
  1512. addexp.s f3, f2
  1513. divn.s f0, f1, f3
  1514. rfr a2, f0
  1515. leaf_return
  1516. #endif /* L_sqrtf */
  1517. #endif /* XCHAL_HAVE_FP_SQRT */
  1518. #if XCHAL_HAVE_FP_RECIP
  1519. #ifdef L_recipsf2
  1520. /* Reciprocal */
  1521. .align 4
  1522. .global __recipsf2
  1523. .type __recipsf2, @function
  1524. __recipsf2:
  1525. leaf_entry sp, 16
  1526. wfr f1, a2
  1527. recip0.s f0, f1
  1528. const.s f2, 1
  1529. msub.s f2, f1, f0
  1530. maddn.s f0, f0, f2
  1531. const.s f2, 1
  1532. msub.s f2, f1, f0
  1533. maddn.s f0, f0, f2
  1534. rfr a2, f0
  1535. leaf_return
  1536. #endif /* L_recipsf2 */
  1537. #endif /* XCHAL_HAVE_FP_RECIP */
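/* The sequence in __recipsf2 above refines the seed from recip0.s with two
   Newton-Raphson steps. Each step has the shape sketched below (the
   mathematical iteration only; the actual opcodes keep extra precision in
   the intermediate terms):

   static float recip_step(float d, float r)
   {
       float e = 1.0f - d * r;    // error of the current estimate
       return r + r * e;          // r * (2 - d*r)
   }
*/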
  1538. #if XCHAL_HAVE_FP_RSQRT
  1539. #ifdef L_rsqrtsf2
  1540. /* Reciprocal square root */
  1541. .align 4
  1542. .global __rsqrtsf2
  1543. .type __rsqrtsf2, @function
  1544. __rsqrtsf2:
  1545. leaf_entry sp, 16
  1546. wfr f1, a2
  1547. rsqrt0.s f0, f1
  1548. mul.s f2, f1, f0
  1549. const.s f3, 3;
  1550. mul.s f4, f3, f0
  1551. const.s f5, 1
  1552. msub.s f5, f2, f0
  1553. maddn.s f0, f4, f5
  1554. mul.s f2, f1, f0
  1555. mul.s f1, f3, f0
  1556. const.s f3, 1
  1557. msub.s f3, f2, f0
  1558. maddn.s f0, f1, f3
  1559. rfr a2, f0
  1560. leaf_return
  1561. #endif /* L_rsqrtsf2 */
  1562. #endif /* XCHAL_HAVE_FP_RSQRT */