arith_arm64.s 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520
  1. // +build arm64,!noasm
  2. #include "textflag.h"
  // fp751ConditionalSwap(x, y, choice)
  //
  // Exchanges the 96 bytes at x with the 96 bytes at y when choice is
  // non-zero and leaves both unchanged when choice is zero.  There is no
  // branch on choice: every 16-byte pair is loaded, selected with CSEL and
  // written back regardless of the outcome.
  //
  // Register use: R0 = x, R1 = y, R2 = choice, R3-R4 = pair from x,
  // R5-R6 = pair from y, R7-R8 = pair written to x, R9-R10 = pair written
  // to y.
  TEXT ·fp751ConditionalSwap(SB), NOSPLIT, $0-17
      MOVD    x+0(FP), R0
      MOVD    y+8(FP), R1
      MOVB    choice+16(FP), R2

      // EQ is true for the rest of the routine iff choice == 0; none of the
      // LDP/CSEL/STP instructions below write the flags.  Any non-zero
      // choice (not only 1) therefore swaps completely.
      CMP     $0, R2

      // Bytes 0-15
      LDP     0(R0), (R3, R4)
      LDP     0(R1), (R5, R6)
      CSEL    EQ, R3, R5, R7          // new x = (choice == 0) ? x : y
      CSEL    EQ, R4, R6, R8
      CSEL    EQ, R5, R3, R9          // new y = (choice == 0) ? y : x
      CSEL    EQ, R6, R4, R10
      STP     (R7, R8), 0(R0)
      STP     (R9, R10), 0(R1)

      // Bytes 16-31
      LDP     16(R0), (R3, R4)
      LDP     16(R1), (R5, R6)
      CSEL    EQ, R3, R5, R7
      CSEL    EQ, R4, R6, R8
      CSEL    EQ, R5, R3, R9
      CSEL    EQ, R6, R4, R10
      STP     (R7, R8), 16(R0)
      STP     (R9, R10), 16(R1)

      // Bytes 32-47
      LDP     32(R0), (R3, R4)
      LDP     32(R1), (R5, R6)
      CSEL    EQ, R3, R5, R7
      CSEL    EQ, R4, R6, R8
      CSEL    EQ, R5, R3, R9
      CSEL    EQ, R6, R4, R10
      STP     (R7, R8), 32(R0)
      STP     (R9, R10), 32(R1)

      // Bytes 48-63
      LDP     48(R0), (R3, R4)
      LDP     48(R1), (R5, R6)
      CSEL    EQ, R3, R5, R7
      CSEL    EQ, R4, R6, R8
      CSEL    EQ, R5, R3, R9
      CSEL    EQ, R6, R4, R10
      STP     (R7, R8), 48(R0)
      STP     (R9, R10), 48(R1)

      // Bytes 64-79
      LDP     64(R0), (R3, R4)
      LDP     64(R1), (R5, R6)
      CSEL    EQ, R3, R5, R7
      CSEL    EQ, R4, R6, R8
      CSEL    EQ, R5, R3, R9
      CSEL    EQ, R6, R4, R10
      STP     (R7, R8), 64(R0)
      STP     (R9, R10), 64(R1)

      // Bytes 80-95
      LDP     80(R0), (R3, R4)
      LDP     80(R1), (R5, R6)
      CSEL    EQ, R3, R5, R7
      CSEL    EQ, R4, R6, R8
      CSEL    EQ, R5, R3, R9
      CSEL    EQ, R6, R4, R10
      STP     (R7, R8), 80(R0)
      STP     (R9, R10), 80(R1)

      RET
  // fp751AddReduced(z, x, y)
  //
  // Adds the 12-limb values at x and y, subtracts the 12-limb constant
  // ·p751x2 and, if that subtraction borrowed, adds ·p751x2 back.  The
  // 12-limb result is stored at z.
  //
  // Only limbs 0, 1 and 5-11 of ·p751x2 are loaded; the register holding
  // limb 1 is also used for limbs 2-4.
  //
  // Register use: R0 = x, R1 = y, R2 = z, R3-R14 = running result,
  // R15-R17 and R19-R24 = operand limbs, later ·p751x2 limbs,
  // R25 = borrow mask.
  TEXT ·fp751AddReduced(SB), NOSPLIT, $0-24
      MOVD    z+0(FP), R2
      MOVD    x+8(FP), R0
      MOVD    y+16(FP), R1

      // Limbs 0-3 of x + y
      LDP     0(R0), (R3, R4)
      LDP     16(R0), (R5, R6)
      LDP     0(R1), (R15, R16)
      LDP     16(R1), (R17, R19)
      ADDS    R15, R3
      ADCS    R16, R4
      ADCS    R17, R5
      ADCS    R19, R6

      // Limbs 4-7
      LDP     32(R0), (R7, R8)
      LDP     48(R0), (R9, R10)
      LDP     32(R1), (R15, R16)
      LDP     48(R1), (R17, R19)
      ADCS    R15, R7
      ADCS    R16, R8
      ADCS    R17, R9
      ADCS    R19, R10

      // Limbs 8-11; the carry out of the top limb is not kept
      LDP     64(R0), (R11, R12)
      LDP     80(R0), (R13, R14)
      LDP     64(R1), (R15, R16)
      LDP     80(R1), (R17, R19)
      ADCS    R15, R11
      ADCS    R16, R12
      ADCS    R17, R13
      ADC     R19, R14

      // Fetch the ·p751x2 limbs that are needed
      LDP     ·p751x2+0(SB), (R15, R16)
      LDP     ·p751x2+40(SB), (R17, R19)
      LDP     ·p751x2+56(SB), (R20, R21)
      LDP     ·p751x2+72(SB), (R22, R23)
      MOVD    ·p751x2+88(SB), R24

      // R3-R14 -= ·p751x2
      SUBS    R15, R3
      SBCS    R16, R4
      SBCS    R16, R5
      SBCS    R16, R6
      SBCS    R16, R7
      SBCS    R17, R8
      SBCS    R19, R9
      SBCS    R20, R10
      SBCS    R21, R11
      SBCS    R22, R12
      SBCS    R23, R13
      SBCS    R24, R14

      // R25 = 0 - 0 - borrow: all ones if x + y - ·p751x2 went below zero,
      // zero otherwise.
      SBC     ZR, ZR, R25

      // Keep ·p751x2 only when it has to be added back
      AND     R25, R15
      AND     R25, R16
      AND     R25, R17
      AND     R25, R19
      AND     R25, R20
      AND     R25, R21
      AND     R25, R22
      AND     R25, R23
      AND     R25, R24

      // R3-R14 += masked ·p751x2
      ADDS    R15, R3
      ADCS    R16, R4
      ADCS    R16, R5
      ADCS    R16, R6
      ADCS    R16, R7
      ADCS    R17, R8
      ADCS    R19, R9
      ADCS    R20, R10
      ADCS    R21, R11
      ADCS    R22, R12
      ADCS    R23, R13
      ADC     R24, R14

      // Write the result
      STP     (R3, R4), 0(R2)
      STP     (R5, R6), 16(R2)
      STP     (R7, R8), 32(R2)
      STP     (R9, R10), 48(R2)
      STP     (R11, R12), 64(R2)
      STP     (R13, R14), 80(R2)
      RET
  // fp751SubReduced(z, x, y)
  //
  // Subtracts the 12-limb value at y from the 12-limb value at x and, if
  // the subtraction borrowed, adds the 12-limb constant ·p751x2.  The
  // 12-limb result is stored at z.
  //
  // Only limbs 0, 1 and 5-11 of ·p751x2 are loaded; the register holding
  // limb 1 is also used for limbs 2-4.
  //
  // Register use: R0 = x, R1 = y, R2 = z, R3-R14 = running result,
  // R15 = borrow mask (after the subtraction), R16-R17 and R19-R25 =
  // operand limbs, later ·p751x2 limbs.
  TEXT ·fp751SubReduced(SB), NOSPLIT, $0-24
      MOVD    z+0(FP), R2
      MOVD    x+8(FP), R0
      MOVD    y+16(FP), R1

      // Limbs 0-3 of x - y
      LDP     0(R0), (R3, R4)
      LDP     16(R0), (R5, R6)
      LDP     0(R1), (R15, R16)
      LDP     16(R1), (R17, R19)
      SUBS    R15, R3
      SBCS    R16, R4
      SBCS    R17, R5
      SBCS    R19, R6

      // Limbs 4-7
      LDP     32(R0), (R7, R8)
      LDP     48(R0), (R9, R10)
      LDP     32(R1), (R15, R16)
      LDP     48(R1), (R17, R19)
      SBCS    R15, R7
      SBCS    R16, R8
      SBCS    R17, R9
      SBCS    R19, R10

      // Limbs 8-11
      LDP     64(R0), (R11, R12)
      LDP     80(R0), (R13, R14)
      LDP     64(R1), (R15, R16)
      LDP     80(R1), (R17, R19)
      SBCS    R15, R11
      SBCS    R16, R12
      SBCS    R17, R13
      SBCS    R19, R14

      // R15 = 0 - 0 - borrow: all ones if x - y went below zero, zero
      // otherwise.
      SBC     ZR, ZR, R15

      // Fetch the ·p751x2 limbs that are needed and keep them only when
      // they have to be added
      LDP     ·p751x2+0(SB), (R16, R17)
      LDP     ·p751x2+40(SB), (R19, R20)
      LDP     ·p751x2+56(SB), (R21, R22)
      LDP     ·p751x2+72(SB), (R23, R24)
      MOVD    ·p751x2+88(SB), R25
      AND     R15, R16
      AND     R15, R17
      AND     R15, R19
      AND     R15, R20
      AND     R15, R21
      AND     R15, R22
      AND     R15, R23
      AND     R15, R24
      AND     R15, R25

      // R3-R14 += masked ·p751x2
      ADDS    R16, R3
      ADCS    R17, R4
      ADCS    R17, R5
      ADCS    R17, R6
      ADCS    R17, R7
      ADCS    R19, R8
      ADCS    R20, R9
      ADCS    R21, R10
      ADCS    R22, R11
      ADCS    R23, R12
      ADCS    R24, R13
      ADC     R25, R14

      // Write the result
      STP     (R3, R4), 0(R2)
      STP     (R5, R6), 16(R2)
      STP     (R7, R8), 32(R2)
      STP     (R9, R10), 48(R2)
      STP     (R11, R12), 64(R2)
      STP     (R13, R14), 80(R2)
      RET
  // fp751AddLazy(z, x, y)
  //
  // Stores at z the 12-limb sum of the 12-limb values at x and y.  No
  // reduction is performed and the carry out of the top limb is not kept.
  //
  // Register use: R0 = x, R1 = y, R2 = z, R3-R14 = limbs of x and of the
  // sum, R15-R17 and R19 = limbs of y.
  TEXT ·fp751AddLazy(SB), NOSPLIT, $0-24
      MOVD    z+0(FP), R2
      MOVD    x+8(FP), R0
      MOVD    y+16(FP), R1

      // Limbs 0-3
      LDP     0(R0), (R3, R4)
      LDP     16(R0), (R5, R6)
      LDP     0(R1), (R15, R16)
      LDP     16(R1), (R17, R19)
      ADDS    R15, R3
      ADCS    R16, R4
      ADCS    R17, R5
      ADCS    R19, R6
      STP     (R3, R4), 0(R2)
      STP     (R5, R6), 16(R2)

      // Limbs 4-7 (loads and stores leave the carry flag untouched)
      LDP     32(R0), (R7, R8)
      LDP     48(R0), (R9, R10)
      LDP     32(R1), (R15, R16)
      LDP     48(R1), (R17, R19)
      ADCS    R15, R7
      ADCS    R16, R8
      ADCS    R17, R9
      ADCS    R19, R10
      STP     (R7, R8), 32(R2)
      STP     (R9, R10), 48(R2)

      // Limbs 8-11
      LDP     64(R0), (R11, R12)
      LDP     80(R0), (R13, R14)
      LDP     64(R1), (R15, R16)
      LDP     80(R1), (R17, R19)
      ADCS    R15, R11
      ADCS    R16, R12
      ADCS    R17, R13
      ADC     R19, R14
      STP     (R11, R12), 64(R2)
      STP     (R13, R14), 80(R2)
      RET
  // fp751X2AddLazy(z, x, y)
  //
  // Stores at z the 24-limb sum of the 24-limb values at x and y.  The
  // carry is propagated through all 24 limbs as one chain; no reduction is
  // performed and the carry out of the top limb is not kept.
  //
  // Register use: R0 = x, R1 = y, R2 = z, R3-R14 = limbs of x and of the
  // sum (reused for the upper 12 limbs), R15-R17 and R19 = limbs of y.
  TEXT ·fp751X2AddLazy(SB), NOSPLIT, $0-24
      MOVD    z+0(FP), R2
      MOVD    x+8(FP), R0
      MOVD    y+16(FP), R1

      // Limbs 0-3
      LDP     0(R0), (R3, R4)
      LDP     16(R0), (R5, R6)
      LDP     0(R1), (R15, R16)
      LDP     16(R1), (R17, R19)
      ADDS    R15, R3
      ADCS    R16, R4
      ADCS    R17, R5
      ADCS    R19, R6
      STP     (R3, R4), 0(R2)
      STP     (R5, R6), 16(R2)

      // Limbs 4-7
      LDP     32(R0), (R7, R8)
      LDP     48(R0), (R9, R10)
      LDP     32(R1), (R15, R16)
      LDP     48(R1), (R17, R19)
      ADCS    R15, R7
      ADCS    R16, R8
      ADCS    R17, R9
      ADCS    R19, R10
      STP     (R7, R8), 32(R2)
      STP     (R9, R10), 48(R2)

      // Limbs 8-11
      LDP     64(R0), (R11, R12)
      LDP     80(R0), (R13, R14)
      LDP     64(R1), (R15, R16)
      LDP     80(R1), (R17, R19)
      ADCS    R15, R11
      ADCS    R16, R12
      ADCS    R17, R13
      ADCS    R19, R14
      STP     (R11, R12), 64(R2)
      STP     (R13, R14), 80(R2)

      // Limbs 12-15
      LDP     96(R0), (R3, R4)
      LDP     112(R0), (R5, R6)
      LDP     96(R1), (R15, R16)
      LDP     112(R1), (R17, R19)
      ADCS    R15, R3
      ADCS    R16, R4
      ADCS    R17, R5
      ADCS    R19, R6
      STP     (R3, R4), 96(R2)
      STP     (R5, R6), 112(R2)

      // Limbs 16-19
      LDP     128(R0), (R7, R8)
      LDP     144(R0), (R9, R10)
      LDP     128(R1), (R15, R16)
      LDP     144(R1), (R17, R19)
      ADCS    R15, R7
      ADCS    R16, R8
      ADCS    R17, R9
      ADCS    R19, R10
      STP     (R7, R8), 128(R2)
      STP     (R9, R10), 144(R2)

      // Limbs 20-23
      LDP     160(R0), (R11, R12)
      LDP     176(R0), (R13, R14)
      LDP     160(R1), (R15, R16)
      LDP     176(R1), (R17, R19)
      ADCS    R15, R11
      ADCS    R16, R12
      ADCS    R17, R13
      ADC     R19, R14
      STP     (R11, R12), 160(R2)
      STP     (R13, R14), 176(R2)
      RET
  // fp751X2SubLazy(z, x, y)
  //
  // Subtracts the 24-limb value at y from the 24-limb value at x.  The low
  // 12 limbs of the difference are stored at z directly.  If the whole
  // 24-limb subtraction borrowed, the 12-limb constant ·p751 is added to
  // the upper 12 limbs before they are stored at z+96.
  //
  // Only limbs 0 and 5-11 of ·p751 are loaded; the register holding limb 0
  // is also used for limbs 1-4.
  //
  // Register use: R0 = x, R1 = y, R2 = z, R3-R14 = limbs of x and of the
  // difference (reused for the upper half), R15 = borrow mask (after the
  // subtraction), R16-R17 and R19-R24 = limbs of y, later ·p751 limbs.
  TEXT ·fp751X2SubLazy(SB), NOSPLIT, $0-24
      MOVD    z+0(FP), R2
      MOVD    x+8(FP), R0
      MOVD    y+16(FP), R1

      // Limbs 0-3
      LDP     0(R0), (R3, R4)
      LDP     16(R0), (R5, R6)
      LDP     0(R1), (R15, R16)
      LDP     16(R1), (R17, R19)
      SUBS    R15, R3
      SBCS    R16, R4
      SBCS    R17, R5
      SBCS    R19, R6
      STP     (R3, R4), 0(R2)
      STP     (R5, R6), 16(R2)

      // Limbs 4-7
      LDP     32(R0), (R7, R8)
      LDP     48(R0), (R9, R10)
      LDP     32(R1), (R15, R16)
      LDP     48(R1), (R17, R19)
      SBCS    R15, R7
      SBCS    R16, R8
      SBCS    R17, R9
      SBCS    R19, R10
      STP     (R7, R8), 32(R2)
      STP     (R9, R10), 48(R2)

      // Limbs 8-11
      LDP     64(R0), (R11, R12)
      LDP     80(R0), (R13, R14)
      LDP     64(R1), (R15, R16)
      LDP     80(R1), (R17, R19)
      SBCS    R15, R11
      SBCS    R16, R12
      SBCS    R17, R13
      SBCS    R19, R14
      STP     (R11, R12), 64(R2)
      STP     (R13, R14), 80(R2)

      // Limbs 12-15; the upper half stays in R3-R14 until it is corrected
      LDP     96(R0), (R3, R4)
      LDP     112(R0), (R5, R6)
      LDP     96(R1), (R15, R16)
      LDP     112(R1), (R17, R19)
      SBCS    R15, R3
      SBCS    R16, R4
      SBCS    R17, R5
      SBCS    R19, R6

      // Limbs 16-19
      LDP     128(R0), (R7, R8)
      LDP     144(R0), (R9, R10)
      LDP     128(R1), (R15, R16)
      LDP     144(R1), (R17, R19)
      SBCS    R15, R7
      SBCS    R16, R8
      SBCS    R17, R9
      SBCS    R19, R10

      // Limbs 20-23
      LDP     160(R0), (R11, R12)
      LDP     176(R0), (R13, R14)
      LDP     160(R1), (R15, R16)
      LDP     176(R1), (R17, R19)
      SBCS    R15, R11
      SBCS    R16, R12
      SBCS    R17, R13
      SBCS    R19, R14

      // R15 = 0 - 0 - borrow: all ones if x - y went below zero, zero
      // otherwise.
      SBC     ZR, ZR, R15

      // Fetch the ·p751 limbs that are needed and keep them only when they
      // have to be added
      MOVD    ·p751+0(SB), R16
      LDP     ·p751+40(SB), (R17, R19)
      LDP     ·p751+56(SB), (R20, R21)
      LDP     ·p751+72(SB), (R22, R23)
      MOVD    ·p751+88(SB), R24
      AND     R15, R16
      AND     R15, R17
      AND     R15, R19
      AND     R15, R20
      AND     R15, R21
      AND     R15, R22
      AND     R15, R23
      AND     R15, R24

      // Upper half += masked ·p751
      ADDS    R16, R3
      ADCS    R16, R4
      ADCS    R16, R5
      ADCS    R16, R6
      ADCS    R16, R7
      ADCS    R17, R8
      ADCS    R19, R9
      ADCS    R20, R10
      ADCS    R21, R11
      ADCS    R22, R12
      ADCS    R23, R13
      ADC     R24, R14

      // Write the upper half
      STP     (R3, R4), 96(R2)
      STP     (R5, R6), 112(R2)
      STP     (R7, R8), 128(R2)
      STP     (R9, R10), 144(R2)
      STP     (R11, R12), 160(R2)
      STP     (R13, R14), 176(R2)
      RET
  396. // Expects that X0*Y0 is already in Z0(low),Z3(high) and X0*Y1 in Z1(low),Z2(high)
  397. // Z0 is not actually touched
  398. // Result of (X0-X2) * (Y0-Y2) will be in Z0-Z5
  399. // Inputs remain intact
  //
  // Additional notes:
  //   - Multiplies the three limbs X0-X2 by the three limbs Y0-Y2.  The
  //     seven partial products not supplied by the caller (X1*Y0, X0*Y2,
  //     X1*Y1, X2*Y0, X1*Y2, X2*Y1, X2*Y2) are formed with MUL/UMULH and
  //     accumulated into Z1-Z5 with carry propagation.
  //   - Z3 is first consumed as the high word of X0*Y0 and is then reused
  //     as an output limb.  Z4 and Z5 are written (from the carry) before
  //     they are read, so the caller does not have to initialise them.
  //   - T0-T3 are scratch registers.  The condition flags are overwritten.
  //   - Each MUL/UMULH pair is issued one step ahead of the ADDS/ADCS/ADC
  //     group that consumes it, alternating between (T0, T1) and (T2, T3).
  400. #define mul192x192comba(X0, X1, X2, Y0, Y1, Y2, Z0, Z1, Z2, Z3, Z4, Z5, T0, T1, T2, T3) \
  401. MUL X1, Y0, T2 \
  402. UMULH X1, Y0, T3 \
  403. \
  404. ADDS Z3, Z1 \
  405. ADCS ZR, Z2 \
  406. ADC ZR, ZR, Z3 \
  407. \
  408. MUL X0, Y2, T0 \
  409. UMULH X0, Y2, T1 \
  410. \
  411. ADDS T2, Z1 \
  412. ADCS T3, Z2 \
  413. ADC ZR, Z3 \
  414. \
  415. MUL X1, Y1, T2 \
  416. UMULH X1, Y1, T3 \
  417. \
  418. ADDS T0, Z2 \
  419. ADCS T1, Z3 \
  420. ADC ZR, ZR, Z4 \
  421. \
  422. MUL X2, Y0, T0 \
  423. UMULH X2, Y0, T1 \
  424. \
  425. ADDS T2, Z2 \
  426. ADCS T3, Z3 \
  427. ADC ZR, Z4 \
  428. \
  429. MUL X1, Y2, T2 \
  430. UMULH X1, Y2, T3 \
  431. \
  432. ADDS T0, Z2 \
  433. ADCS T1, Z3 \
  434. ADC ZR, Z4 \
  435. \
  436. MUL X2, Y1, T0 \
  437. UMULH X2, Y1, T1 \
  438. \
  439. ADDS T2, Z3 \
  440. ADCS T3, Z4 \
  441. ADC ZR, ZR, Z5 \
  442. \
  443. MUL X2, Y2, T2 \
  444. UMULH X2, Y2, T3 \
  445. \
  446. ADDS T0, Z3 \
  447. ADCS T1, Z4 \
  448. ADC ZR, Z5 \
  449. \
  450. ADDS T2, Z4 \
  451. ADC T3, Z5
  452. // Expects that X points to (X4-X6), Y to (Y4-Y6)
  453. // Result of (X0-X5) * (Y0-Y5) will be in (0(Z), 8(Z), 16(Z), T0-T8)
  454. // Inputs get overwritten
  //
  // Additional notes:
  //   - X, Y and Z are memory operands; the macro addresses them as
  //     0+X / 16+X, 0+Y / 16+Y and 0+Z / 16+Z.
  //   - In the macro's own 0-based parameter names, the limbs reloaded from
  //     X and Y are X3-X5 and Y3-Y5: those registers are overwritten with
  //     the sums xH + xL and yH + yL at the start, and the memory copies
  //     are used later to form xH * yH.
  //   - Z is used as three limbs of scratch space while the first product
  //     is computed and holds the three lowest result limbs on exit; the
  //     remaining nine result limbs are left in T0-T8.
  //   - T9 is only passed on to mul192x192comba as a scratch register.
  //     T10 collects the carries and borrows of the intermediate steps and
  //     is folded into the upper limbs by the final ADCS/ADC group.
  //   - The three 192x192 products are computed with mul192x192comba, whose
  //     first two partial products are prepared here with MUL/UMULH just
  //     before each invocation.
  455. #define mul384x384karatsuba(X, Y, Z, X0, X1, X2, X3, X4, X5, Y0, Y1, Y2, Y3, Y4, Y5, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10)\
  456. ADDS X0, X3 \ // xH + xL, destroys xH
  457. ADCS X1, X4 \
  458. ADCS X2, X5 \
  459. ADC ZR, ZR, T10 \
  460. \
  461. ADDS Y0, Y3 \ // yH + yL, destroys yH
  462. ADCS Y1, Y4 \
  463. ADCS Y2, Y5 \
  464. ADC ZR, ZR, T6 \
  465. \
  466. SUB T10, ZR, T7 \
  467. SUB T6, ZR, T8 \
  468. AND T6, T10 \ // combined carry
  469. \
  470. AND T7, Y3, T0 \ // masked(yH + yL)
  471. AND T7, Y4, T1 \
  472. AND T7, Y5, T2 \
  473. \
  474. AND T8, X3, T3 \ // masked(xH + xL)
  475. AND T8, X4, T4 \
  476. AND T8, X5, T5 \
  477. \
  478. ADDS T3, T0 \
  479. ADCS T4, T1 \
  480. STP (T0, T1), 0+Z \
  481. \
  482. MUL X3, Y3, T0 \
  483. MUL X3, Y4, T1 \
  484. \
  485. ADCS T5, T2 \
  486. MOVD T2, 16+Z \
  487. \
  488. UMULH X3, Y4, T2 \
  489. UMULH X3, Y3, T3 \
  490. \
  491. ADC ZR, T10 \
  492. \ // (xH + xL) * (yH + yL)
  493. mul192x192comba(X3, X4, X5, Y3, Y4, Y5, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
  494. \
  495. MUL X0, Y0, X3 \
  496. LDP 0+Z, (T6, T7) \
  497. MOVD 16+Z, T8 \
  498. \
  499. UMULH X0, Y0, Y3 \
  500. ADDS T6, T3 \
  501. ADCS T7, T4 \
  502. MUL X0, Y1, X4 \
  503. ADCS T8, T5 \
  504. ADC ZR, T10 \
  505. UMULH X0, Y1, X5 \
  506. \ // xL * yL
  507. mul192x192comba(X0, X1, X2, Y0, Y1, Y2, X3, X4, X5, Y3, Y4, Y5, T6, T7, T8, T9)\
  508. \
  509. STP (X3, X4), 0+Z \
  510. MOVD X5, 16+Z \
  511. \
  512. SUBS X3, T0 \ // (xH + xL) * (yH + yL) - xL * yL
  513. SBCS X4, T1 \
  514. LDP 0+X, (X3, X4) \
  515. SBCS X5, T2 \
  516. MOVD 16+X, X5 \
  517. SBCS Y3, T3 \
  518. SBCS Y4, T4 \
  519. SBCS Y5, T5 \
  520. SBC ZR, T10 \
  521. \
  522. ADDS Y3, T0 \ // ((xH + xL) * (yH + yL) - xL * yL) * 2^192 + xL * yL
  523. ADCS Y4, T1 \
  524. LDP 0+Y, (Y3, Y4) \
  525. MUL X3, Y3, X0 \
  526. ADCS Y5, T2 \
  527. UMULH X3, Y3, Y0 \
  528. MOVD 16+Y, Y5 \
  529. MUL X3, Y4, X1 \
  530. ADCS ZR, T3 \
  531. UMULH X3, Y4, X2 \
  532. ADCS ZR, T4 \
  533. ADCS ZR, T5 \
  534. ADC ZR, T10 \
  535. \ // xH * yH, overwrite xLow, yLow
  536. mul192x192comba(X3, X4, X5, Y3, Y4, Y5, X0, X1, X2, Y0, Y1, Y2, T6, T7, T8, T9)\
  537. \
  538. SUBS X0, T0 \ // ((xH + xL) * (yH + yL) - xL * yL - xH * yH)
  539. SBCS X1, T1 \
  540. SBCS X2, T2 \
  541. SBCS Y0, T3 \
  542. SBCS Y1, T4 \
  543. SBCS Y2, T5 \
  544. SBC ZR, T10 \
  545. \
  546. ADDS X0, T3 \
  547. ADCS X1, T4 \
  548. ADCS X2, T5 \
  549. ADCS T10, Y0, T6 \
  550. ADCS ZR, Y1, T7 \
  551. ADC ZR, Y2, T8
  // fp751Mul(z, x, y)
  //
  // Multiplies the 12-limb values at x and y and stores the 24-limb product
  // at z.  The operands are split into 6-limb halves (xL, xH, yL, yH) and
  // three 384x384-bit products are formed with mul384x384karatsuba:
  // (xH + xL) * (yH + yL), xL * yL and xH * yH.
  //
  // z is used as scratch space from the start, and x and y are read again
  // after the first stores to z, so z must not overlap x or y.
  //
  // Register use: R0 = x, R1 = y, R2 = z, R3-R17 and R19-R26 = operands,
  // results and macro scratch, R29 = carry kept across the three macro
  // invocations.
  //
  // Frame size: R29 is the frame pointer in Go's arm64 assembly and this
  // routine overwrites it, because every other usable register is taken.
  // The frame is therefore declared non-zero so that the assembler-generated
  // prologue and epilogue take care of the frame pointer; with a $0 frame
  // the caller would get R29 back clobbered.
  TEXT ·fp751Mul(SB), NOSPLIT, $16-24
      MOVD    z+0(FP), R2
      MOVD    x+8(FP), R0
      MOVD    y+16(FP), R1

      // Load xL in R3-R8, xH in R9-R14
      // (xH + xL) in R3-R8, carry out in R22
      LDP     0(R0), (R3, R4)
      LDP     48(R0), (R9, R10)
      ADDS    R9, R3
      ADCS    R10, R4
      LDP     16(R0), (R5, R6)
      LDP     64(R0), (R11, R12)
      ADCS    R11, R5
      ADCS    R12, R6
      LDP     32(R0), (R7, R8)
      LDP     80(R0), (R13, R14)
      ADCS    R13, R7
      ADCS    R14, R8
      ADC     ZR, ZR, R22

      // Load yL in R9-R14, yH in R15-R17, R19-R21
      // (yH + yL) in R9-R14, carry out in R23
      LDP     0(R1), (R9, R10)
      LDP     48(R1), (R15, R16)
      ADDS    R15, R9
      ADCS    R16, R10
      LDP     16(R1), (R11, R12)
      LDP     64(R1), (R17, R19)
      ADCS    R17, R11
      ADCS    R19, R12
      LDP     32(R1), (R13, R14)
      LDP     80(R1), (R20, R21)
      ADCS    R20, R13
      ADCS    R21, R14
      ADC     ZR, ZR, R23

      // Compute masks and combined carry:
      // R24 = -carry(xH + xL), R25 = -carry(yH + yL), R22 = both carries set
      SUB     R22, ZR, R24
      SUB     R23, ZR, R25
      AND     R23, R22

      // Store the upper three limbs of (xH + xL) at z+0 and of (yH + yL) at
      // z+24 so mul384x384karatsuba can reload them from memory; it doesn't
      // have enough registers to keep them.
      // Meanwhile compute (xH + xL) masked by the y carry in R15-R17, R19-R21
      STP     (R6, R7), 0(R2)
      AND     R25, R3, R15
      AND     R25, R4, R16
      STP     (R8, R12), 16(R2)
      AND     R25, R5, R17
      AND     R25, R6, R19
      STP     (R13, R14), 32(R2)
      AND     R25, R7, R20
      AND     R25, R8, R21

      // Add (yH + yL) masked by the x carry; the sum of the two masked
      // values is parked at z+96 .. z+143 until the product is available
      AND     R24, R9, R25
      AND     R24, R10, R26
      ADDS    R25, R15
      ADCS    R26, R16
      STP     (R15, R16), 96(R2)
      AND     R24, R11, R25
      AND     R24, R12, R26
      ADCS    R25, R17
      ADCS    R26, R19
      STP     (R17, R19), 112(R2)
      AND     R24, R13, R25
      AND     R24, R14, R26
      ADCS    R25, R20
      ADCS    R26, R21
      STP     (R20, R21), 128(R2)

      // Keep the carry in R29 so it can remain there across the macros
      ADC     ZR, R22, R29

      // (xH + xL) * (yH + yL): lowest three limbs at z+48, rest in R15-R24
      mul384x384karatsuba(0(R2), 24(R2), 48(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)

      // Load the parked masked sum and add it to the top half of the product
      // Store the result back in z+72 .. z+143
      STP     (R15, R16), 72(R2)
      LDP     96(R2), (R3, R4)
      ADDS    R3, R19
      STP     (R17, R19), 88(R2)
      ADCS    R4, R20
      LDP     112(R2), (R5, R6)
      ADCS    R5, R21
      STP     (R20, R21), 104(R2)
      ADCS    R6, R22
      LDP     128(R2), (R7, R8)
      ADCS    R7, R23
      STP     (R22, R23), 120(R2)
      ADCS    R8, R24
      MOVD    R24, 136(R2)
      ADC     ZR, R29

      // Load xL, yL
      LDP     0(R0), (R3, R4)
      LDP     16(R0), (R5, R6)
      LDP     32(R0), (R7, R8)
      LDP     0(R1), (R9, R10)
      LDP     16(R1), (R11, R12)
      LDP     32(R1), (R13, R14)

      // xL * yL: lowest three limbs at z+0, rest in R15-R24
      mul384x384karatsuba(24(R0), 24(R1), 0(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)

      // (xH + xL) * (yH + yL) - xL * yL in R3-R14
      LDP     0(R2), (R12, R13)
      LDP     48(R2), (R3, R4)
      SUBS    R12, R3
      LDP     64(R2), (R5, R6)
      MOVD    16(R2), R14
      SBCS    R13, R4
      SBCS    R14, R5
      LDP     80(R2), (R7, R8)
      SBCS    R15, R6
      SBCS    R16, R7
      LDP     96(R2), (R9, R10)
      SBCS    R17, R8
      SBCS    R19, R9
      LDP     112(R2), (R11, R12)
      SBCS    R20, R10
      SBCS    R21, R11
      LDP     128(R2), (R13, R14)
      SBCS    R22, R12
      SBCS    R23, R13
      SBCS    R24, R14
      SBC     ZR, R29

      // Limbs 3-5 of xL * yL are final
      STP     (R15, R16), 24(R2)
      MOVD    R17, 40(R2)

      // ((xH + xL) * (yH + yL) - xL * yL) * 2^384 + xL * yL and store back in z
      ADDS    R19, R3
      ADCS    R20, R4
      STP     (R3, R4), 48(R2)
      ADCS    R21, R5
      ADCS    R22, R6
      STP     (R5, R6), 64(R2)
      ADCS    R23, R7
      ADCS    R24, R8
      STP     (R7, R8), 80(R2)
      ADCS    ZR, R9
      ADCS    ZR, R10
      STP     (R9, R10), 96(R2)
      ADCS    ZR, R11
      ADCS    ZR, R12
      STP     (R11, R12), 112(R2)
      ADCS    ZR, R13
      ADCS    ZR, R14
      STP     (R13, R14), 128(R2)
      ADC     ZR, R29

      // Load xH, yH
      LDP     48(R0), (R3, R4)
      LDP     64(R0), (R5, R6)
      LDP     80(R0), (R7, R8)
      LDP     48(R1), (R9, R10)
      LDP     64(R1), (R11, R12)
      LDP     80(R1), (R13, R14)

      // xH * yH: lowest three limbs at z+144, rest in R15-R24
      mul384x384karatsuba(72(R0), 72(R1), 144(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)

      LDP     144(R2), (R12, R13)
      MOVD    160(R2), R14

      // (xH + xL) * (yH + yL) - xL * yL - xH * yH in R3-R14
      // Store lower half in z, that's done
      LDP     48(R2), (R3, R4)
      SUBS    R12, R3
      LDP     64(R2), (R5, R6)
      SBCS    R13, R4
      SBCS    R14, R5
      LDP     80(R2), (R7, R8)
      SBCS    R15, R6
      SBCS    R16, R7
      LDP     96(R2), (R9, R10)
      SBCS    R17, R8
      SBCS    R19, R9
      LDP     112(R2), (R11, R12)
      SBCS    R20, R10
      SBCS    R21, R11
      LDP     128(R2), (R13, R14)
      SBCS    R22, R12
      SBCS    R23, R13
      STP     (R3, R4), 48(R2)
      SBCS    R24, R14
      STP     (R5, R6), 64(R2)
      SBC     ZR, R29
      STP     (R7, R8), 80(R2)

      // (xH * yH) * 2^768 + ((xH + xL) * (yH + yL) - xL * yL - xH * yH) * 2^384 + xL * yL
      // Store remaining limbs in z
      LDP     144(R2), (R3, R4)
      MOVD    160(R2), R5
      ADDS    R3, R9
      ADCS    R4, R10
      STP     (R9, R10), 96(R2)
      ADCS    R5, R11
      ADCS    R15, R12
      STP     (R11, R12), 112(R2)
      ADCS    R16, R13
      ADCS    R17, R14
      STP     (R13, R14), 128(R2)
      ADCS    R29, R19
      ADCS    ZR, R20
      STP     (R19, R20), 144(R2)
      ADCS    ZR, R21
      ADCS    ZR, R22
      STP     (R21, R22), 160(R2)
      ADCS    ZR, R23
      ADC     ZR, R24
      STP     (R23, R24), 176(R2)
      RET
  751. TEXT ·fp751MontgomeryReduce(SB), NOSPLIT, $0-16
  752. MOVD z+0(FP), R0
  753. MOVD x+8(FP), R1
  754. // Load p751+1 in R14-R17, R29, R19-R20, spread over arithmetic
  755. LDP ·p751p1+40(SB), (R14, R15)
  756. // z0-z11 will be R2-R13
  757. // Load x0-x4 to z0-z4 and x5, spread over arithmetic
  758. LDP 0(R1), (R2, R3)
  759. // x5 iteration
  760. MUL R2, R14, R22
  761. LDP 32(R1), (R6, R21)
  762. UMULH R2, R14, R23
  763. ADDS R21, R22, R7 // Set z5
  764. ADC ZR, R23, R25
  765. // x6 iteration
  766. MUL R2, R15, R22
  767. MOVD 48(R1), R21
  768. UMULH R2, R15, R23
  769. ADDS R22, R25
  770. ADC R23, ZR, R26
  771. MUL R3, R14, R22
  772. LDP ·p751p1+56(SB), (R16, R17)
  773. UMULH R3, R14, R23
  774. ADDS R22, R25
  775. ADCS R23, R26
  776. ADC ZR, ZR, R24
  777. ADDS R21, R25, R8 // Set z6
  778. ADCS ZR, R26
  779. ADC ZR, R24
  780. // x7 iteration
  781. MUL R2, R16, R22
  782. MOVD 56(R1), R21
  783. UMULH R2, R16, R23
  784. ADDS R22, R26
  785. ADCS R23, R24
  786. ADC ZR, ZR, R25
  787. MUL R3, R15, R22
  788. LDP 16(R1), (R4, R5)
  789. UMULH R3, R15, R23
  790. ADDS R22, R26
  791. ADCS R23, R24
  792. ADC ZR, R25
  793. MUL R4, R14, R22
  794. LDP ·p751p1+72(SB), (R29, R19)
  795. UMULH R4, R14, R23
  796. ADDS R22, R26
  797. ADCS R23, R24
  798. ADC ZR, R25
  799. ADDS R21, R26, R9 // Set z7
  800. ADCS ZR, R24
  801. ADC ZR, R25
  802. // x8 iteration
  803. MUL R2, R17, R22
  804. MOVD 64(R1), R21
  805. UMULH R2, R17, R23
  806. ADDS R22, R24
  807. ADCS R23, R25
  808. ADC ZR, ZR, R26
  809. MUL R3, R16, R22
  810. MOVD ·p751p1+88(SB), R20
  811. UMULH R3, R16, R23
  812. ADDS R22, R24
  813. ADCS R23, R25
  814. ADC ZR, R26
  815. MUL R4, R15, R22
  816. UMULH R4, R15, R23
  817. ADDS R22, R24
  818. ADCS R23, R25
  819. ADC ZR, R26
  820. MUL R5, R14, R22
  821. UMULH R5, R14, R23
  822. ADDS R22, R24
  823. ADCS R23, R25
  824. ADC ZR, R26
  825. ADDS R24, R21, R10 // Set z8
  826. ADCS ZR, R25
  827. ADC ZR, R26
  828. // x9 iteration
  829. MUL R2, R29, R22
  830. MOVD 72(R1), R21
  831. UMULH R2, R29, R23
  832. ADDS R22, R25
  833. ADCS R23, R26
  834. ADC ZR, ZR, R24
  835. MUL R3, R17, R22
  836. UMULH R3, R17, R23
  837. ADDS R22, R25
  838. ADCS R23, R26
  839. ADC ZR, R24
  840. MUL R4, R16, R22
  841. UMULH R4, R16, R23
  842. ADDS R22, R25
  843. ADCS R23, R26
  844. ADC ZR, R24
  845. MUL R5, R15, R22
  846. UMULH R5, R15, R23
  847. ADDS R22, R25
  848. ADCS R23, R26
  849. ADC ZR, R24
  850. MUL R6, R14, R22
  851. UMULH R6, R14, R23
  852. ADDS R22, R25
  853. ADCS R23, R26
  854. ADC ZR, R24
  855. ADDS R21, R25, R11 // Set z9
  856. ADCS ZR, R26
  857. ADC ZR, R24
  858. // x10 iteration
  859. MUL R2, R19, R22
  860. MOVD 80(R1), R21
  861. UMULH R2, R19, R23
  862. ADDS R22, R26
  863. ADCS R23, R24
  864. ADC ZR, ZR, R25
  865. MUL R3, R29, R22
  866. UMULH R3, R29, R23
  867. ADDS R22, R26
  868. ADCS R23, R24
  869. ADC ZR, R25
  870. MUL R4, R17, R22
  871. UMULH R4, R17, R23
  872. ADDS R22, R26
  873. ADCS R23, R24
  874. ADC ZR, R25
  875. MUL R5, R16, R22
  876. UMULH R5, R16, R23
  877. ADDS R22, R26
  878. ADCS R23, R24
  879. ADC ZR, R25
  880. MUL R6, R15, R22
  881. UMULH R6, R15, R23
  882. ADDS R22, R26
  883. ADCS R23, R24
  884. ADC ZR, R25
  885. MUL R7, R14, R22
  886. UMULH R7, R14, R23
  887. ADDS R22, R26
  888. ADCS R23, R24
  889. ADC ZR, R25
  890. ADDS R21, R26, R12 // Set z10
  891. ADCS ZR, R24
  892. ADC ZR, R25
  893. // x11 iteration
  894. MUL R2, R20, R22
  895. MOVD 88(R1), R21
  896. UMULH R2, R20, R23
  897. ADDS R22, R24
  898. ADCS R23, R25
  899. ADC ZR, ZR, R26
  900. MUL R3, R19, R22
  901. UMULH R3, R19, R23
  902. ADDS R22, R24
  903. ADCS R23, R25
  904. ADC ZR, R26
  905. MUL R4, R29, R22
  906. UMULH R4, R29, R23
  907. ADDS R22, R24
  908. ADCS R23, R25
  909. ADC ZR, R26
  910. MUL R5, R17, R22
  911. UMULH R5, R17, R23
  912. ADDS R22, R24
  913. ADCS R23, R25
  914. ADC ZR, R26
  915. MUL R6, R16, R22
  916. UMULH R6, R16, R23
  917. ADDS R22, R24
  918. ADCS R23, R25
  919. ADC ZR, R26
  920. MUL R7, R15, R22
  921. UMULH R7, R15, R23
  922. ADDS R22, R24
  923. ADCS R23, R25
  924. ADC ZR, R26
  925. MUL R8, R14, R22
  926. UMULH R8, R14, R23
  927. ADDS R22, R24
  928. ADCS R23, R25
  929. ADC ZR, R26
  930. ADDS R21, R24, R13 // Set z11
  931. ADCS ZR, R25
  932. ADC ZR, R26
  933. // x12 iteration
  934. MUL R3, R20, R22
  935. MOVD 96(R1), R21
  936. UMULH R3, R20, R23
  937. ADDS R22, R25
  938. ADCS R23, R26
  939. ADC ZR, ZR, R24
  940. MUL R4, R19, R22
  941. UMULH R4, R19, R23
  942. ADDS R22, R25
  943. ADCS R23, R26
  944. ADC ZR, R24
  945. MUL R5, R29, R22
  946. UMULH R5, R29, R23
  947. ADDS R22, R25
  948. ADCS R23, R26
  949. ADC ZR, R24
  950. MUL R6, R17, R22
  951. UMULH R6, R17, R23
  952. ADDS R22, R25
  953. ADCS R23, R26
  954. ADC ZR, R24
  955. MUL R7, R16, R22
  956. UMULH R7, R16, R23
  957. ADDS R22, R25
  958. ADCS R23, R26
  959. ADC ZR, R24
  960. MUL R8, R15, R22
  961. UMULH R8, R15, R23
  962. ADDS R22, R25
  963. ADCS R23, R26
  964. ADC ZR, R24
  965. MUL R9, R14, R22
  966. UMULH R9, R14, R23
  967. ADDS R22, R25
  968. ADCS R23, R26
  969. ADC ZR, R24
  970. ADDS R21, R25, R2 // Set z0
  971. ADCS ZR, R26
  972. ADC ZR, R24
  973. // x13 iteration
  974. MUL R4, R20, R22
  975. MOVD 104(R1), R21
  976. UMULH R4, R20, R23
  977. ADDS R22, R26
  978. ADCS R23, R24
  979. ADC ZR, ZR, R25
  980. MUL R5, R19, R22
  981. UMULH R5, R19, R23
  982. ADDS R22, R26
  983. ADCS R23, R24
  984. ADC ZR, R25
  985. MUL R6, R29, R22
  986. UMULH R6, R29, R23
  987. ADDS R22, R26
  988. ADCS R23, R24
  989. ADC ZR, R25
  990. MUL R7, R17, R22
  991. UMULH R7, R17, R23
  992. ADDS R22, R26
  993. ADCS R23, R24
  994. ADC ZR, R25
  995. MUL R8, R16, R22
  996. UMULH R8, R16, R23
  997. ADDS R22, R26
  998. ADCS R23, R24
  999. ADC ZR, R25
  1000. MUL R9, R15, R22
  1001. UMULH R9, R15, R23
  1002. ADDS R22, R26
  1003. ADCS R23, R24
  1004. ADC ZR, R25
  1005. MUL R10, R14, R22
  1006. UMULH R10, R14, R23
  1007. ADDS R22, R26
  1008. ADCS R23, R24
  1009. ADC ZR, R25
  1010. ADDS R21, R26, R3 // Set z1
  1011. STP (R2, R3), 0(R0)
  1012. ADCS ZR, R24
  1013. ADC ZR, R25
  1014. // x14 iteration
  1015. MUL R5, R20, R22
  1016. MOVD 112(R1), R21
  1017. UMULH R5, R20, R23
  1018. ADDS R22, R24
  1019. ADCS R23, R25
  1020. ADC ZR, ZR, R26
  1021. MUL R6, R19, R22
  1022. UMULH R6, R19, R23
  1023. ADDS R22, R24
  1024. ADCS R23, R25
  1025. ADC ZR, R26
  1026. MUL R7, R29, R22
  1027. UMULH R7, R29, R23
  1028. ADDS R22, R24
  1029. ADCS R23, R25
  1030. ADC ZR, R26
  1031. MUL R8, R17, R22
  1032. UMULH R8, R17, R23
  1033. ADDS R22, R24
  1034. ADCS R23, R25
  1035. ADC ZR, R26
  1036. MUL R9, R16, R22
  1037. UMULH R9, R16, R23
  1038. ADDS R22, R24
  1039. ADCS R23, R25
  1040. ADC ZR, R26
  1041. MUL R10, R15, R22
  1042. UMULH R10, R15, R23
  1043. ADDS R22, R24
  1044. ADCS R23, R25
  1045. ADC ZR, R26
  1046. MUL R11, R14, R22
  1047. UMULH R11, R14, R23
  1048. ADDS R22, R24
  1049. ADCS R23, R25
  1050. ADC ZR, R26
  1051. ADDS R21, R24, R4 // Set z2
  1052. ADCS ZR, R25
  1053. ADC ZR, R26
  1054. // x15 iteration
  1055. MUL R6, R20, R22
  1056. MOVD 120(R1), R21
  1057. UMULH R6, R20, R23
  1058. ADDS R22, R25
  1059. ADCS R23, R26
  1060. ADC ZR, ZR, R24
  1061. MUL R7, R19, R22
  1062. UMULH R7, R19, R23
  1063. ADDS R22, R25
  1064. ADCS R23, R26
  1065. ADC ZR, R24
  1066. MUL R8, R29, R22
  1067. UMULH R8, R29, R23
  1068. ADDS R22, R25
  1069. ADCS R23, R26
  1070. ADC ZR, R24
  1071. MUL R9, R17, R22
  1072. UMULH R9, R17, R23
  1073. ADDS R22, R25
  1074. ADCS R23, R26
  1075. ADC ZR, R24
  1076. MUL R10, R16, R22
  1077. UMULH R10, R16, R23
  1078. ADDS R22, R25
  1079. ADCS R23, R26
  1080. ADC ZR, R24
  1081. MUL R11, R15, R22
  1082. UMULH R11, R15, R23
  1083. ADDS R22, R25
  1084. ADCS R23, R26
  1085. ADC ZR, R24
  1086. MUL R12, R14, R22
  1087. UMULH R12, R14, R23
  1088. ADDS R22, R25
  1089. ADCS R23, R26
  1090. ADC ZR, R24
  1091. ADDS R21, R25, R5 // Set z3
  1092. STP (R4, R5), 16(R0)
  1093. ADCS ZR, R26
  1094. ADC ZR, R24
  1095. // x16 iteration
  1096. MUL R7, R20, R22
  1097. MOVD 128(R1), R21
  1098. UMULH R7, R20, R23
  1099. ADDS R22, R26
  1100. ADCS R23, R24
  1101. ADC ZR, ZR, R25
  1102. MUL R8, R19, R22
  1103. UMULH R8, R19, R23
  1104. ADDS R22, R26
  1105. ADCS R23, R24
  1106. ADC ZR, R25
  1107. MUL R9, R29, R22
  1108. UMULH R9, R29, R23
  1109. ADDS R22, R26
  1110. ADCS R23, R24
  1111. ADC ZR, R25
  1112. MUL R10, R17, R22
  1113. UMULH R10, R17, R23
  1114. ADDS R22, R26
  1115. ADCS R23, R24
  1116. ADC ZR, R25
  1117. MUL R11, R16, R22
  1118. UMULH R11, R16, R23
  1119. ADDS R22, R26
  1120. ADCS R23, R24
  1121. ADC ZR, R25
  1122. MUL R12, R15, R22
  1123. UMULH R12, R15, R23
  1124. ADDS R22, R26
  1125. ADCS R23, R24
  1126. ADC ZR, R25
  1127. MUL R13, R14, R22
  1128. UMULH R13, R14, R23
  1129. ADDS R22, R26
  1130. ADCS R23, R24
  1131. ADC ZR, R25
  1132. ADDS R21, R26, R6 // Set z4
  1133. ADCS ZR, R24
  1134. ADC ZR, R25
  1135. // x17 iteration
  1136. MUL R8, R20, R22
  1137. MOVD 136(R1), R21
  1138. UMULH R8, R20, R23
  1139. ADDS R22, R24
  1140. ADCS R23, R25
  1141. ADC ZR, ZR, R26
  1142. MUL R9, R19, R22
  1143. UMULH R9, R19, R23
  1144. ADDS R22, R24
  1145. ADCS R23, R25
  1146. ADC ZR, R26
  1147. MUL R10, R29, R22
  1148. UMULH R10, R29, R23
  1149. ADDS R22, R24
  1150. ADCS R23, R25
  1151. ADC ZR, R26
  1152. MUL R11, R17, R22
  1153. UMULH R11, R17, R23
  1154. ADDS R22, R24
  1155. ADCS R23, R25
  1156. ADC ZR, R26
  1157. MUL R12, R16, R22
  1158. UMULH R12, R16, R23
  1159. ADDS R22, R24
  1160. ADCS R23, R25
  1161. ADC ZR, R26
  1162. MUL R13, R15, R22
  1163. UMULH R13, R15, R23
  1164. ADDS R22, R24
  1165. ADCS R23, R25
  1166. ADC ZR, R26
  1167. ADDS R21, R24, R7 // Set z5
  1168. STP (R6, R7), 32(R0)
  1169. ADCS ZR, R25
  1170. ADC ZR, R26
  1171. // x18 iteration
  1172. MUL R9, R20, R22
  1173. MOVD 144(R1), R21
  1174. UMULH R9, R20, R23
  1175. ADDS R22, R25
  1176. ADCS R23, R26
  1177. ADC ZR, ZR, R24
  1178. MUL R10, R19, R22
  1179. UMULH R10, R19, R23
  1180. ADDS R22, R25
  1181. ADCS R23, R26
  1182. ADC ZR, R24
  1183. MUL R11, R29, R22
  1184. UMULH R11, R29, R23
  1185. ADDS R22, R25
  1186. ADCS R23, R26
  1187. ADC ZR, R24
  1188. MUL R12, R17, R22
  1189. UMULH R12, R17, R23
  1190. ADDS R22, R25
  1191. ADCS R23, R26
  1192. ADC ZR, R24
  1193. MUL R13, R16, R22
  1194. UMULH R13, R16, R23
  1195. ADDS R22, R25
  1196. ADCS R23, R26
  1197. ADC ZR, R24
  1198. ADDS R21, R25, R8 // Set z6
  1199. ADCS ZR, R26
  1200. ADC ZR, R24
  1201. // x19 iteration
  1202. MUL R10, R20, R22
  1203. MOVD 152(R1), R21
  1204. UMULH R10, R20, R23
  1205. ADDS R22, R26
  1206. ADCS R23, R24
  1207. ADC ZR, ZR, R25
  1208. MUL R11, R19, R22
  1209. UMULH R11, R19, R23
  1210. ADDS R22, R26
  1211. ADCS R23, R24
  1212. ADC ZR, R25
  1213. MUL R12, R29, R22
  1214. UMULH R12, R29, R23
  1215. ADDS R22, R26
  1216. ADCS R23, R24
  1217. ADC ZR, R25
  1218. MUL R13, R17, R22
  1219. UMULH R13, R17, R23
  1220. ADDS R22, R26
  1221. ADCS R23, R24
  1222. ADC ZR, R25
  1223. ADDS R21, R26, R9 // Set z7
  1224. STP (R8, R9), 48(R0)
  1225. ADCS ZR, R24
  1226. ADC ZR, R25
  1227. // x20 iteration
  1228. MUL R11, R20, R22
  1229. MOVD 160(R1), R21
  1230. UMULH R11, R20, R23
  1231. ADDS R22, R24
  1232. ADCS R23, R25
  1233. ADC ZR, ZR, R26
  1234. MUL R12, R19, R22
  1235. UMULH R12, R19, R23
  1236. ADDS R22, R24
  1237. ADCS R23, R25
  1238. ADC ZR, R26
  1239. MUL R13, R29, R22
  1240. UMULH R13, R29, R23
  1241. ADDS R22, R24
  1242. ADCS R23, R25
  1243. ADC ZR, R26
  1244. ADDS R21, R24, R10 // Set z8
  1245. ADCS ZR, R25
  1246. ADC ZR, R26
  1247. // x21 iteration
  1248. MUL R12, R20, R22
  1249. MOVD 168(R1), R21
  1250. UMULH R12, R20, R23
  1251. ADDS R22, R25
  1252. ADCS R23, R26
  1253. ADC ZR, ZR, R24
  1254. MUL R13, R19, R22
  1255. UMULH R13, R19, R23
  1256. ADDS R22, R25
  1257. ADCS R23, R26
  1258. ADC ZR, R24
  1259. ADDS R21, R25, R11 // Set z9
  1260. STP (R10, R11), 64(R0)
  1261. ADCS ZR, R26
  1262. ADC ZR, R24
  1263. // x22 iteration
  1264. MUL R13, R20, R22
  1265. MOVD 176(R1), R21
  1266. UMULH R13, R20, R23
  1267. ADDS R22, R26
  1268. ADC R23, R24
  1269. ADDS R21, R26, R12 // Set z10
  1270. MOVD 184(R1), R21
  1271. ADC R21, R24, R13 // Set z11
  1272. STP (R12, R13), 80(R0)
  1273. RET
  // fp751StrongReduce conditionally subtracts p751 from the value at x.
  //
  // In:  x+0(FP) = pointer to twelve 64-bit limbs, least-significant limb
  //      first; the limbs are read and then overwritten in place.
  // Out: nothing is returned; the result is written back through x.
  //
  // The routine is straight-line (no branches). It computes x - p751 across
  // all twelve limbs, converts the final borrow into an all-zero/all-ones
  // mask, and adds (p751 AND mask) back. The stored value is therefore
  // x - p751 when the subtraction did not borrow, and x (mod 2^768) otherwise.
  // NOTE(review): presumably callers guarantee x < 2*p751 so the result is
  // fully reduced; confirm against the Go-side callers.
  //
  // Register use:
  //   R0            pointer to x
  //   R1-R12        limbs 0..11 of x
  //   R13           word at ·p751+0, used as the p751 word for limbs 0..4
  //   R14-R17       words at ·p751+40 .. ·p751+64 (limbs 5..8)
  //   R19-R21       words at ·p751+72 .. ·p751+88 (limbs 9..11)
  //   R22           borrow mask
  // R18 is left untouched.
  // NOTE(review): using R13 for limbs 0..4 assumes the first five words of
  // p751 are equal; p751 is defined outside this view, so verify there.
  TEXT ·fp751StrongReduce(SB), NOSPLIT, $0-8
      MOVD    x+0(FP), R0

      // Fetch the twelve limbs of x.
      LDP     0(R0), (R1, R2)
      LDP     16(R0), (R3, R4)
      LDP     32(R0), (R5, R6)
      LDP     48(R0), (R7, R8)
      LDP     64(R0), (R9, R10)
      LDP     80(R0), (R11, R12)

      // Fetch the p751 words that are needed.
      MOVD    ·p751+0(SB), R13
      LDP     ·p751+40(SB), (R14, R15)
      LDP     ·p751+56(SB), (R16, R17)
      LDP     ·p751+72(SB), (R19, R20)
      MOVD    ·p751+88(SB), R21

      // x -= p751; the borrow ripples through the carry flag, so nothing
      // that writes the flags may be placed inside this chain.
      SUBS    R13, R1, R1
      SBCS    R13, R2, R2
      SBCS    R13, R3, R3
      SBCS    R13, R4, R4
      SBCS    R13, R5, R5
      SBCS    R14, R6, R6
      SBCS    R15, R7, R7
      SBCS    R16, R8, R8
      SBCS    R17, R9, R9
      SBCS    R19, R10, R10
      SBCS    R20, R11, R11
      SBCS    R21, R12, R12

      // R22 = 0 - 0 - borrow: zero if x >= p751, all ones if it borrowed.
      SBC     ZR, ZR, R22

      // Keep p751 only when the subtraction borrowed.
      AND     R22, R13, R13
      AND     R22, R14, R14
      AND     R22, R15, R15
      AND     R22, R16, R16
      AND     R22, R17, R17
      AND     R22, R19, R19
      AND     R22, R20, R20
      AND     R22, R21, R21

      // x += (p751 AND mask); again a single uninterrupted carry chain.
      ADDS    R13, R1, R1
      ADCS    R13, R2, R2
      ADCS    R13, R3, R3
      ADCS    R13, R4, R4
      ADCS    R13, R5, R5
      ADCS    R14, R6, R6
      ADCS    R15, R7, R7
      ADCS    R16, R8, R8
      ADCS    R17, R9, R9
      ADCS    R19, R10, R10
      ADCS    R20, R11, R11
      ADC     R21, R12, R12

      // Write the twelve result limbs back over x.
      STP     (R1, R2), 0(R0)
      STP     (R3, R4), 16(R0)
      STP     (R5, R6), 32(R0)
      STP     (R7, R8), 48(R0)
      STP     (R9, R10), 64(R0)
      STP     (R11, R12), 80(R0)
      RET