// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Extended Montgomery reduce in 8-digit blocks, using extra storage to
// temporarily cache multiplied differences appearing in ADK.
// Results are stored in the input-output buffer (z).
// k must be divisible by 8 and not smaller than 16.
// Inputs z[2*k], m[k], w;
// Outputs function return (extra result bit) and z[2*k]
// Temporary buffer m_precalc[12*(k/4-1)]
//
//    extern uint64_t bignum_emontredc_8n_cdiff(uint64_t k, uint64_t *z,
//                                              const uint64_t *m, uint64_t w,
//                                              uint64_t *m_precalc);
//
// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = w, X4 = m_precalc
//                   returns X0
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum_arm.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n_cdiff)
        S2N_BN_FUNCTION_TYPE_DIRECTIVE(bignum_emontredc_8n_cdiff)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n_cdiff)
        .text
        .balign 4

// Loop counter for the modulus precomputation loop
#define count x27

// Helper macro for the pre-computations: absolute difference plus sign mask.
//   t = |x - y|   (subs computes x - y; cneg negates it if that borrowed)
//   c = all-ones if x < y as unsigned values, zero otherwise (csetm on cc)
// The expansion is three instructions on one line separated by ';' and it
// overwrites the condition flags.
#define cdiff(t, c, x, y) subs t, x, y; cneg t, t, cc; csetm c, cc

// Some immediate offsets for cached differences+carry used
// in the inner ADK multiplications. Every entry is one 16-byte register
// pair (difference, mask) as written by an stp.
//
// cache_aIJ are byte offsets from sp. The first 32 bytes at sp are used for
// other saved values (w and the m_precalc pointer at [sp], k/4 at [sp, #16]),
// so these six slots start at offset 32.
#define cache_a01 (32+0*16)
#define cache_a02 (32+1*16)
#define cache_a03 (32+2*16)
#define cache_a12 (32+3*16)
#define cache_a13 (32+4*16)
#define cache_a23 (32+5*16)
// cache_mIJ are byte offsets from the current position in the precomputed
// modulus buffer (addressed through x30). Each group of six entries belongs
// to one 4-word chunk of the modulus; entry mIJ holds cdiff(word I, word J)
// of that chunk, i.e. |mI - mJ| and the mask for mI < mJ.
#define cache_m10 (0*16)
#define cache_m20 (1*16)
#define cache_m30 (2*16)
#define cache_m21 (3*16)
#define cache_m31 (4*16)
#define cache_m32 (5*16)

// Four consecutive modulus words loaded in the precomputation loop.
// Note that a0 is x4, the register in which m_precalc is passed; the
// pointer is copied to x24 and x30 before the loop overwrites x4.
#define a0 x4
#define a1 x5
#define a2 x6
#define a3 x7

// Registers for precalculation (vector register aliases)
#define vpre00 v30
#define vpre01 v28
#define vpre02 v17
#define vpre10 v18
#define vpre11 v19
#define vpre12 v20

// Modulus pointer (argument m, passed in X2)
#define m x2

S2N_BN_SYMBOL(bignum_emontredc_8n_cdiff):
        CFI_START

        CFI_DEC_SP((10*16))
        CFI_STACKSAVE2(x19,x20,(9*16))
        CFI_STACKSAVE2(x21,x22,(8*16))
        CFI_STACKSAVE2(x23,x24,(7*16))
        CFI_STACKSAVE2(x25,x26,(6*16))
        CFI_STACKSAVE2(x27,x28,(5*16))
        CFI_STACKSAVE2(x29,x30,(4*16))
        CFI_STACKSAVE2(d14,d15,(3*16))
        CFI_STACKSAVE2(d12,d13,(2*16))
        CFI_STACKSAVE2(d10,d11,(1*16))
        CFI_STACKSAVE2(d8,d9,(0*16))

        // Leave space for cached differences of words of a in inner loop
        CFI_DEC_SP((6*16))

        CFI_DEC_SP(32)
        lsr x0, x0, #2
        mov x26, x0
        subs x12, x0, #1
        bcc Lbignum_emontredc_8n_cdiff_end

        // x30 = buffer holding precomputed ADK carry-differences for modulus

        //
        // Start of precomputation
        //
        // Precompute and cache signed differences of modulus components
        // used in the ADK multiplication in the inner loop.
        //

        // Number of extra limbs required:
        // 6 * (number of limbs / 4 - 1) * 2 = 12 * (number_of_limbs/4 - 1)
        //
        mov x24, x4
        mov x30, x4

        // Save modulus pointer
        mov x25, m

        mov count, x12

Lbignum_emontredc_8n_cdiff_precomp:
        ldp a0, a1, [m, #32]!
        ldp a2, a3, [m, #16]

#define t x28
#define c x29

        cdiff(t, c, a1, a0)
        stp   t, c, [x30, #cache_m10]
        cdiff(t, c, a2, a0)
        stp   t, c, [x30, #cache_m20]
        cdiff(t, c, a3, a0)
        stp   t, c, [x30, #cache_m30]
        cdiff(t, c, a2, a1)
        stp   t, c, [x30, #cache_m21]
        cdiff(t, c, a3, a1)
        stp   t, c, [x30, #cache_m31]
        cdiff(t, c, a3, a2)
        stp   t, c, [x30, #cache_m32]

        add x30, x30, #(6*16)

        subs count, count, #1
        cbnz count, Lbignum_emontredc_8n_cdiff_precomp

        // Set modulus pointer and buffer pointer back to their original values
        mov m, x25
        mov x30, x24

        //
        // End of precomputation
        //

        stp x3, x30, [sp]
        stp x26, xzr, [sp, #16]
        mov x28, xzr
        lsl x0, x12, #5

        movi    v29.2d, #0x000000ffffffff

Lbignum_emontredc_8n_cdiff_outerloop:
        ldp x9, x13, [x1, #0]                      // .*..................................................................................................................................................................................................................
        ldr x3, [sp]                               // *...................................................................................................................................................................................................................
        lsr x27, x0, #5                            // ......................................................................................................................................*.............................................................................
        sub x27, x27, #1                           // ...................................................................................................................................................................................................................*
        ldp x10, x12, [x1, #16]                    // ..*.................................................................................................................................................................................................................
        ldp x4, x15, [x2, #0]                      // ...*................................................................................................................................................................................................................
        ldr q1, [x2, #16]                          // .....*..............................................................................................................................................................................................................
        mul x11, x9, x3                            // ......*.............................................................................................................................................................................................................
        uzp2 v18.4S, v1.4S, v1.4S                  // ........*...........................................................................................................................................................................................................
        dup v27.2D, x11                            // .......*............................................................................................................................................................................................................
        xtn v13.2S, v1.2D                          // ..........*.........................................................................................................................................................................................................
        rev64 v9.4S, v1.4S                         // ...........*........................................................................................................................................................................................................
        mul x7, x11, x4                            // ...........................*........................................................................................................................................................................................
        rev64 v2.4S, v1.4S                         // ....................................................................................*...............................................................................................................................
        uzp2 v20.4S, v1.4S, v1.4S                  // .................................................................................*..................................................................................................................................
        mul v31.4S, v9.4S, v27.4S                  // ...............*....................................................................................................................................................................................................
        xtn v14.2S, v1.2D                          // ..............................................*.....................................................................................................................................................................
        uzp2 v21.4S, v1.4S, v1.4S                  // ............................................*.......................................................................................................................................................................
        umulh x22, x11, x15                        // ................................*...................................................................................................................................................................................
        xtn v17.2S, v27.2D                         // .........*..........................................................................................................................................................................................................
        adds x19, x9, x7                           // ............................*.......................................................................................................................................................................................
        umull v28.2D, v17.2S, v13.2S               // ............*.......................................................................................................................................................................................................
        umull v26.2D, v17.2S, v18.2S               // .............*......................................................................................................................................................................................................
        uaddlp v8.2D, v31.4S                       // ..................*.................................................................................................................................................................................................
        umulh x8, x11, x4                          // .............................*......................................................................................................................................................................................
        shl v7.2D, v8.2D, #32                      // .....................*..............................................................................................................................................................................................
        uzp2 v30.4S, v27.4S, v27.4S                // ..............*.....................................................................................................................................................................................................
        umlal v7.2D, v17.2S, v13.2S                // .......................*............................................................................................................................................................................................
        mul x14, x11, x15                          // ..............................*.....................................................................................................................................................................................
        usra v26.2D, v28.2D, #32                   // ................*...................................................................................................................................................................................................
        umull v12.2D, v30.2S, v18.2S               // .................*..................................................................................................................................................................................................
        mov x24, v7.d[0]                           // .........................*..........................................................................................................................................................................................
        adcs x29, x13, x14                         // ...............................*....................................................................................................................................................................................
        and v4.16B, v26.16B, v29.16B               // ...................*................................................................................................................................................................................................
        mov x17, v7.d[1]                           // ..........................*.........................................................................................................................................................................................
        rev64 v27.4S, v1.4S                        // ...............................................*....................................................................................................................................................................
        adcs x5, x10, x24                          // .................................*..................................................................................................................................................................................
        umlal v4.2D, v30.2S, v13.2S                // ....................*...............................................................................................................................................................................................
        usra v12.2D, v26.2D, #32                   // ......................*.............................................................................................................................................................................................
        adcs x14, x12, x17                         // ..................................*.................................................................................................................................................................................
        adc x23, xzr, xzr                          // .....................................*..............................................................................................................................................................................
        adds x8, x29, x8                           // ......................................*.............................................................................................................................................................................
        adcs x7, x5, x22                           // .......................................*............................................................................................................................................................................
        mul x25, x8, x3                            // ..........................................*.........................................................................................................................................................................
        usra v12.2D, v4.2D, #32                    // ........................*...........................................................................................................................................................................................
        dup v8.2D, x25                             // ...........................................*........................................................................................................................................................................
        stp x11, x25, [x1, #0]                     // ..............................................................................*.....................................................................................................................................
        mul x22, x25, x4                           // ...............................................................*....................................................................................................................................................
        mov x16, v12.d[1]                          // ....................................*...............................................................................................................................................................................
        ldr q16, [x1, #0]                          // .......................................................................................................................................*............................................................................
        mov x21, v12.d[0]                          // ...................................*................................................................................................................................................................................
        mul v31.4S, v27.4S, v8.4S                  // ...................................................*................................................................................................................................................................
        adcs x20, x14, x21                         // ........................................*...........................................................................................................................................................................
        xtn v27.2S, v8.2D                          // .............................................*......................................................................................................................................................................
        adc x10, x23, x16                          // .........................................*..........................................................................................................................................................................
        subs x14, x11, x25                         // .........................................................................................................................................*..........................................................................
        rev64 v17.4S, v16.4S                       // ...................................................................................................................................................................*................................................
        cneg x17, x14, cc                          // ..........................................................................................................................................*.........................................................................
        csetm x26, cc                              // ...........................................................................................................................................*........................................................................
        uaddlp v26.2D, v31.4S                      // ......................................................*.............................................................................................................................................................
        mul x6, x25, x15                           // ..................................................................*.................................................................................................................................................
        stp x17, x26, [sp, #cache_a01]             // ............................................................................................................................................*.......................................................................
        umull v24.2D, v27.2S, v14.2S               // ................................................*...................................................................................................................................................................
        uzp2 v30.4S, v16.4S, v16.4S                // .................................................................................................................................................................*..................................................
        shl v4.2D, v26.2D, #32                     // .........................................................*..........................................................................................................................................................
        uzp2 v5.4S, v8.4S, v8.4S                   // ..................................................*.................................................................................................................................................................
        umulh x17, x25, x4                         // .................................................................*..................................................................................................................................................
        umlal v4.2D, v27.2S, v14.2S                // ...........................................................*........................................................................................................................................................
        umull v8.2D, v27.2S, v21.2S                // .................................................*..................................................................................................................................................................
        mov x21, v4.d[0]                           // .............................................................*......................................................................................................................................................
        adds x8, x8, x22                           // ................................................................*...................................................................................................................................................
        mov x12, v4.d[1]                           // ..............................................................*.....................................................................................................................................................
        ldp x23, x14, [x2, #16]                    // ....*...............................................................................................................................................................................................................
        adcs x29, x7, x6                           // ...................................................................*................................................................................................................................................
        umulh x13, x25, x15                        // ....................................................................*...............................................................................................................................................
        usra v8.2D, v24.2D, #32                    // ....................................................*...............................................................................................................................................................
        ldp x8, x24, [x30, #cache_m20]             // ...........................................................................................................................................................................................................*........
        adcs x9, x20, x21                          // .....................................................................*..............................................................................................................................................
        ldr q9, [x2, #32]!                         // .......................................................................................................................................................................*............................................
        xtn v28.2S, v16.2D                         // ..................................................................................................................................................................*.................................................
        adcs x19, x10, x12                         // ......................................................................*.............................................................................................................................................
        ldr q13, [x2, #16]                         // ........................................................................................................................................................................*...........................................
        umull v18.2D, v5.2S, v21.2S                // .....................................................*..............................................................................................................................................................
        adc x7, xzr, xzr                           // .........................................................................*..........................................................................................................................................
        adds x5, x29, x17                          // ..........................................................................*.........................................................................................................................................
        xtn v21.2S, v1.2D                          // ...................................................................................*................................................................................................................................
        mul x12, x5, x3                            // ...............................................................................*....................................................................................................................................
        and v4.16B, v8.16B, v29.16B                // .......................................................*............................................................................................................................................................
        adcs x21, x9, x13                          // ...........................................................................*........................................................................................................................................
        uzp2 v31.4S, v9.4S, v9.4S                  // ...............................................................................................................................................................................*....................................
        xtn v23.2S, v9.2D                          // .........................................................................................................................................................................*..........................................
        usra v18.2D, v8.2D, #32                    // ..........................................................*.........................................................................................................................................................
        umlal v4.2D, v5.2S, v14.2S                 // ........................................................*...........................................................................................................................................................
        dup v5.2D, x12                             // ................................................................................*...................................................................................................................................
        umull v16.2D, v23.2S, v30.2S               // ............................................................................................................................................................................*.......................................
        umull v1.2D, v23.2S, v28.2S                // ..............................................................................................................................................................................*.....................................
        // NOTE(review): this stretch ends right before the maddloop_neon
        // label and contains no branches. Scalar and NEON instructions from
        // independent dependency chains are interleaved, so adjacent lines
        // are often unrelated. The order still matters: every adds/adcs/adc
        // chain and every subs -> cneg/csetm pair relies on no other
        // flag-setting instruction being placed between its members.
        // The trailing dot/`*` column on each line is preserved verbatim; it
        // looks like a scheduler-generated position marker (TODO confirm).
        //
        // Recurring scalar idioms in the lines below:
        //  * mul/umulh on the same operands give the low/high 64-bit halves
        //    of one product.
        //  * subs t,x,y; cneg t,t,cc; csetm c,cc is the same sequence as the
        //    cdiff macro at the top of the file: t = |x - y| and c = all-ones
        //    if x < y (unsigned), else 0. Each (t, c) pair is stored with stp
        //    into one of the sp-relative cache_aNN slots.
        //  * "mov xN, vM.d[i]" copies a 64-bit NEON lane into a scalar
        //    register so it can take part in a scalar carry chain.
        // The NEON stream (xtn/uzp2/rev64, umull/umlal/usra, mul .4S, uaddlp,
        // shl #32) appears to assemble 64x64-bit products from 32-bit pieces.
        // v29 is used as an AND mask and is set outside this view
        // (presumably a low-32-bit lane mask; TODO confirm).
        umulh x29, x12, x15                        // .........................................................................................................*..........................................................................................................
        umull v8.2D, v31.2S, v30.2S                // ....................................................................................................................................................................................*...............................
        xtn v24.2S, v13.2D                         // ..........................................................................................................................................................................*.........................................
        mul v25.4S, v2.4S, v5.4S                   // ........................................................................................*...........................................................................................................................
        usra v18.2D, v4.2D, #32                    // ............................................................*.......................................................................................................................................................
        xtn v3.2S, v5.2D                           // ..................................................................................*.................................................................................................................................
        uzp2 v19.4S, v5.4S, v5.4S                  // .......................................................................................*............................................................................................................................
        mul x10, x12, x15                          // .......................................................................................................*............................................................................................................
        umull v26.2D, v3.2S, v20.2S                // ......................................................................................*.............................................................................................................................
        mov x22, v18.d[0]                          // .......................................................................*............................................................................................................................................
        umull v10.2D, v3.2S, v21.2S                // .....................................................................................*..............................................................................................................................
        uaddlp v11.2D, v25.4S                      // ...........................................................................................*........................................................................................................................
        mov x6, v18.d[1]                           // ........................................................................*...........................................................................................................................................
        mul x16, x12, x4                           // ....................................................................................................*...............................................................................................................
        umull v4.2D, v19.2S, v20.2S                // ..........................................................................................*.........................................................................................................................
        usra v16.2D, v1.2D, #32                    // ...................................................................................................................................................................................*................................
        // These adcs/adc continue a carry chain whose first instruction lies
        // before this view; they add the two lanes of v18.
        adcs x13, x19, x22                         // ............................................................................*.......................................................................................................................................
        shl v11.2D, v11.2D, #32                    // ..............................................................................................*.....................................................................................................................
        adc x6, x7, x6                             // .............................................................................*......................................................................................................................................
        // |x11 - x12| and its borrow mask, stored to the cache_a02 slot below.
        subs x7, x11, x12                          // .............................................................................................................................................*......................................................................
        usra v26.2D, v10.2D, #32                   // .........................................................................................*..........................................................................................................................
        csetm x26, cc                              // ...............................................................................................................................................*....................................................................
        cneg x20, x7, cc                           // ..............................................................................................................................................*.....................................................................
        // |x25 - x12| and its borrow mask for the cache_a12 slot. The csetm
        // for this subs comes a few lines later, still before the next adds.
        subs x19, x25, x12                         // .....................................................................................................................................................*..............................................................
        umlal v11.2D, v3.2S, v21.2S                // ................................................................................................*...................................................................................................................
        cneg x9, x19, cc                           // ......................................................................................................................................................*.............................................................
        stp x20, x26, [sp, #cache_a02]             // ................................................................................................................................................*...................................................................
        umulh x7, x12, x4                          // ......................................................................................................*.............................................................................................................
        usra v8.2D, v16.2D, #32                    // ........................................................................................................................................................................................*...........................
        mul v7.4S, v17.4S, v9.4S                   // ................................................................................................................................................................................................*...................
        csetm x26, cc                              // .......................................................................................................................................................*............................................................
        // Start of a carry chain. Only the carry-out of this adds is used:
        // x19 is overwritten by a later adcs before anything reads it.
        adds x19, x5, x16                          // .....................................................................................................*..............................................................................................................
        and v1.16B, v16.16B, v29.16B               // .......................................................................................................................................................................................*............................
        adcs x21, x21, x10                         // ........................................................................................................*...........................................................................................................
        stp x9, x26, [sp, #cache_a12]              // ........................................................................................................................................................*...........................................................
        // Reload the (difference, mask) pair stored in the cache_a02 slot above.
        ldp x17, x20, [sp, #cache_a02]             // .....................................................................................................................................................................................................*..............
        usra v4.2D, v26.2D, #32                    // ...............................................................................................*....................................................................................................................
        and v18.16B, v26.16B, v29.16B              // ............................................................................................*.......................................................................................................................
        umlal v1.2D, v31.2S, v28.2S                // ..........................................................................................................................................................................................*.........................
        mov x22, v11.d[0]                          // ..................................................................................................*.................................................................................................................
        mov x16, v11.d[1]                          // ...................................................................................................*................................................................................................................
        umlal v18.2D, v19.2S, v21.2S               // .............................................................................................*......................................................................................................................
        adcs x19, x13, x22                         // ..........................................................................................................*.........................................................................................................
        // x22 = low half of (reloaded difference x17) * x8; the matching
        // umulh appears near the end of this stretch.
        mul x22, x17, x8                           // ...............................................................................................................................................................................................................*....
        uaddlp v5.2D, v7.4S                        // ..................................................................................................................................................................................................*.................
        adcs x13, x6, x16                          // ...........................................................................................................*........................................................................................................
        usra v8.2D, v1.2D, #32                     // ..............................................................................................................................................................................................*.....................
        // Capture the carry-out of the chain in x9. The next chain then adds
        // the umulh high halves (x7, x29) and the two lanes of v4.
        adc x9, xzr, xzr                           // ..............................................................................................................*.....................................................................................................
        adds x5, x21, x7                           // ...............................................................................................................*....................................................................................................
        usra v4.2D, v18.2D, #32                    // .................................................................................................*..................................................................................................................
        adcs x6, x19, x29                          // .................................................................................................................*..................................................................................................
        // x19 = low 64 bits of x5 * x3. NOTE(review): x3 is w on entry per
        // the file header; presumably it still holds w here (confirm against
        // the code before this view). x3 is reused for a NEON lane just below.
        mul x19, x5, x3                            // ................................................................................................................*...................................................................................................
        shl v15.2D, v5.2D, #32                     // ......................................................................................................................................................................................................*.............
        mov x3, v8.d[1]                            // ...................................................................................................................................................................................................*................
        umlal v15.2D, v23.2S, v28.2S               // .......................................................................................................................................................................................................*............
        mov x21, v4.d[0]                           // ............................................................................................................*.......................................................................................................
        mul x7, x19, x23                           // .......................................................................................................................*............................................................................................
        // Store x12 and x19 at [x1, #16]; the same 16 bytes are reloaded as
        // q9 two lines further down, making both words available as NEON lanes.
        stp x12, x19, [x1, #16]                    // ....................................................................................................................*...............................................................................................
        mov x10, v4.d[1]                           // .............................................................................................................*......................................................................................................
        ldr q9, [x1, #16]                          // ........................................................................................................................................*...........................................................................
        adcs x13, x13, x21                         // ..................................................................................................................*.................................................................................................
        mov x21, v15.d[1]                          // ............................................................................................................................................................................................................*.......
        mul x16, x19, x4                           // .....................................................................................................................*..............................................................................................
        adc x9, x9, x10                            // ...................................................................................................................*................................................................................................
        // |x25 - x19| and its borrow mask, stored to the cache_a13 slot below.
        subs x29, x25, x19                         // .........................................................................................................................................................*..........................................................
        csetm x26, cc                              // ...........................................................................................................................................................*........................................................
        cneg x10, x29, cc                          // ..........................................................................................................................................................*.........................................................
        // |x12 - x19| and its borrow mask for the cache_a23 slot. x12 is
        // reused as a product destination right after this subs.
        subs x29, x12, x19                         // .............................................................................................................................................................*......................................................
        stp x10, x26, [sp, #cache_a13]             // ............................................................................................................................................................*.......................................................
        uzp2 v18.4S, v9.4S, v9.4S                  // ....................................................................................................................................................................*...............................................
        mul x12, x19, x15                          // ......................................................................................................................*.............................................................................................
        rev64 v20.4S, v9.4S                        // ......................................................................................................................................................................*.............................................
        xtn v19.2S, v9.2D                          // .....................................................................................................................................................................*..............................................
        umull v25.2D, v24.2S, v18.2S               // .............................................................................................................................................................................*......................................
        csetm x26, cc                              // ...............................................................................................................................................................*....................................................
        umull v14.2D, v24.2S, v19.2S               // ................................................................................................................................................................................*...................................
        cneg x29, x29, cc                          // ..............................................................................................................................................................*.....................................................
        umulh x10, x19, x23                        // ..............................................................................................................................*.....................................................................................
        // Start of another carry chain. Only the carry-out of this adds is
        // used: x25 is overwritten by the adcs below before it is read.
        adds x25, x5, x16                          // .........................................................................................................................*..........................................................................................
        mul v7.4S, v20.4S, v13.4S                  // .................................................................................................................................................................................*..................................
        adcs x12, x6, x12                          // ...........................................................................................................................*........................................................................................
        // Load the (difference, mask) pair from the cache_a01 slot into x6, x5.
        ldp x6, x5, [sp, #cache_a01]               // ..........................................................................................................................................................................................................*.........
        mov x16, v8.d[0]                           // .........................................................................................................................................................................................................*..........
        adcs x25, x13, x7                          // .............................................................................................................................*......................................................................................
        stp x29, x26, [sp, #cache_a23]             // ................................................................................................................................................................*...................................................
        usra v25.2D, v14.2D, #32                   // .....................................................................................................................................................................................*..............................
        mul x29, x19, x14                          // ........................................................................................................................*...........................................................................................
        uzp2 v1.4S, v13.4S, v13.4S                 // ...........................................................................................................................................................................*........................................
        uaddlp v7.2D, v7.4S                        // ......................................................................................................................................................................................*.............................
        umull v0.2D, v1.2S, v18.2S                 // ..................................................................................................................................................................................*.................................
        umulh x13, x19, x4                         // ..........................................................................................................................*.........................................................................................
        and v10.16B, v25.16B, v29.16B              // .........................................................................................................................................................................................*..........................
        shl v13.2D, v7.2D, #32                     // ............................................................................................................................................................................................*.......................
        adcs x4, x9, x29                           // ...............................................................................................................................*....................................................................................
        umlal v10.2D, v1.2S, v19.2S                // .............................................................................................................................................................................................*......................
        // Capture the carry-out in x9. Then |x11 - x19| and its borrow mask
        // are formed and stored to the cache_a03 slot below.
        adc x9, xzr, xzr                           // .................................................................................................................................*..................................................................................
        subs x29, x11, x19                         // .................................................................................................................................................*..................................................................
        usra v0.2D, v25.2D, #32                    // ...........................................................................................................................................................................................*........................
        // x11 = x20 ^ x24: x20 is the mask word reloaded from the cache_a02
        // slot; x24 is set outside this view. eor leaves the flags from the
        // subs above intact for the cneg/csetm that follow.
        eor x11, x20, x24                          // ................................................................................................................................................................................................................*...
        umulh x15, x19, x15                        // ............................................................................................................................*.......................................................................................
        umlal v13.2D, v24.2S, v19.2S               // ...............................................................................................................................................................................................*....................
        cneg x7, x29, cc                           // ..................................................................................................................................................*.................................................................
        // Pre-indexed load: x1 is advanced by 32 bytes and two 64-bit words
        // are loaded from the new address.
        ldp x20, x29, [x1, #32]!                   // .................................................................................................................................................................................................................*..
        csetm x26, cc                              // ...................................................................................................................................................*................................................................
        usra v0.2D, v10.2D, #32                    // .................................................................................................................................................................................................*..................
        umulh x19, x19, x14                        // ................................................................................................................................*...................................................................................
        mov x23, v13.d[1]                          // .............................................................................................................................................................................................................*......
        stp x7, x26, [sp, #cache_a03]              // ....................................................................................................................................................*...............................................................
        // Last carry chain of this stretch: the umulh high halves
        // (x13, x15, x10, x19) are accumulated, leaving the result in x12..x15.
        adds x12, x12, x13                         // ..................................................................................................................................*.................................................................................
        adcs x13, x25, x15                         // ...................................................................................................................................*................................................................................
        mov x26, v0.d[0]                           // ....................................................................................................................................................................................................*...............
        umulh x8, x17, x8                          // ..................................................................................................................................................................................................................*.
        adcs x14, x4, x10                          // ....................................................................................................................................*...............................................................................
        mov x17, v13.d[0]                          // ........................................................................................................................................................................................................*...........
        adc x15, x9, x19                           // .....................................................................................................................................*..............................................................................
        // Post-indexed load: read the 16-byte pair at [x30] into x24, x10,
        // then advance x30 by 96 bytes (six 16-byte slots, the span covered
        // by the cache_m10..cache_m32 offsets).
        // NOTE(review): x30 serves as a data pointer here, presumably into
        // m_precalc; confirm against the code outside this view.
        ldp x24, x10, [x30], #96                   // ..............................................................................................................................................................................................................*.....

// Entry of the code labelled maddloop_neon. The rest of its body, including
// any loop-back branch, lies beyond the lines shown here.
// Loads, scalar carry chains and NEON multiplies from independent dependency
// chains are interleaved; the adds/adcs/adc sequences rely on no other
// flag-setting instruction being placed between their members.
// The trailing dot/`*`/`e` column on each line is preserved verbatim; it
// looks like a scheduler-generated position marker (TODO confirm).
Lbignum_emontredc_8n_cdiff_maddloop_neon:

        // Pre-indexed load: x2 (m) is advanced by 32 bytes and q14 is loaded
        // from the new address; q25 is loaded from the following 16 bytes.
        ldr q14, [x2, #32]!                          // e....................................................................................................................................................
        ldr q25, [x2, #16]                           // .e...................................................................................................................................................
        // x19 = x5 ^ x10. On entry from the code above the label, x5 is the
        // mask word of the cache_a01 pair and x10 the second word of the
        // pair loaded through x30.
        eor x19, x5, x10                             // .................................................................................*...................................................................
        // Carry chain over words that, on entry from the code above the
        // label, were moved out of NEON lanes with "mov xN, vM.d[i]".
        // x16 is read by the adds and then refreshed from v0.d[1] before the
        // closing adc uses it.
        adds x25, x21, x16                           // .....................................*...............................................................................................................
        mov x16, v0.d[1]                             // .................................*...................................................................................................................
        ldp x4, x7, [x1, #16]                        // .............................................*.......................................................................................................
        adcs x21, x17, x3                            // ......................................*..............................................................................................................
        eor x22, x22, x11                            // .................................................................................................................*...................................
        adcs x23, x23, x26                           // .......................................*.............................................................................................................
        adc x17, x16, xzr                            // ........................................*............................................................................................................
        // New carry chain: x12..x15 are added to four 64-bit words read
        // through x1 (x20, x29 loaded before the label on first entry;
        // x4, x7 from [x1, #16] above). The final carry is captured in x6.
        adds x16, x12, x20                           // ...........................................*.........................................................................................................
        // x5 = low half of x6 * x24; the umulh further down puts the high
        // half of the same product in x20.
        mul x5, x6, x24                              // ..................................................................................*..................................................................
        xtn v21.2S, v14.2D                           // ..e..................................................................................................................................................
        xtn v31.2S, v25.2D                           // ................e....................................................................................................................................
        adcs x9, x13, x29                            // ............................................*........................................................................................................
        uzp2 v24.4S, v25.4S, v25.4S                  // ...................e.................................................................................................................................
        mov x29, v15.d[0]                            // .........................................*...........................................................................................................
        adcs x4, x14, x4                             // ..............................................*......................................................................................................
        // Load the (difference, mask) pair from the cache_a23 stack slot.
        ldp x10, x13, [sp, #cache_a23]               // ....................................................................*................................................................................
        umull v5.2D, v21.2S, v30.2S                  // ....e................................................................................................................................................
        umulh x20, x6, x24                           // ...................................................................................*.................................................................
        adcs x24, x15, x7                            // ...............................................*.....................................................................................................
        // The "- 96" compensates for a 96-byte post-increment of x30 (visible
        // just before the label on first entry), so this reads the cache_m32
        // slot of the block x30 pointed to before that advance.
        ldp x12, x7, [x30, #cache_m32 - 96]          // .....................................................................*...............................................................................
        umull v16.2D, v31.2S, v18.2S                 // ..................e..................................................................................................................................
        adc x6, xzr, xzr                             // ................................................*....................................................................................................
        // Start of the next carry chain; it continues past the lines shown here.
        adds x14, x25, x29                           // .................................................*...................................................................................................
        umull v13.2D, v21.2S, v28.2S                 // ...e.................................................................................................................................................
        uzp2 v10.4S, v14.4S, v14.4S                  // .....e...............................................................................................................................................
        eor x15, x8, x11                             // ...................................................................................................................*.................................
        adcs x25, x21, x25                           // ..................................................*..................................................................................................
        umull v1.2D, v31.2S, v19.2S                  // .................e...................................................................................................................................
        adcs x8, x23, x21                            // ...................................................*.................................................................................................
        mul v6.4S, v20.4S, v25.4S                    // ....................e................................................................................................................................
        eor x7, x13, x7                              // ......................................................................*..............................................................................
        adcs x23, x17, x23                           // ....................................................*................................................................................................
        eor x21, x5, x19                             // .....................................................................................*...............................................................
        adc x13, xzr, x17                            // .....................................................*...............................................................................................
        adds x17, x25, x29                           // ......................................................*..............................................................................................
        umull v0.2D, v24.2S, v18.2S                  // ......................e..............................................................................................................................
        usra v5.2D, v13.2D, #32                      // .......e.............................................................................................................................................
        adcs x5, x8, x14                             // .......................................................*.............................................................................................
        umull v2.2D, v10.2S, v30.2S                  // ........e............................................................................................................................................
        adcs x25, x23, x25                           // ........................................................*............................................................................................
        usra v16.2D, v1.2D, #32                      // .....................e...............................................................................................................................
        adcs x8, x13, x8                             // .........................................................*...........................................................................................
        uaddlp v13.2D, v6.4S                         // .......................e.............................................................................................................................
        adcs x23, xzr, x23                           // ..........................................................*..........................................................................................
        and v7.16B, v5.16B, v29.16B                  // ..........e..........................................................................................................................................
        adc x13, xzr, x13                            // ...........................................................*.........................................................................................
        adds x29, x29, x16                           // ............................................................*........................................................................................
        mul x16, x10, x12                            // .......................................................................*.............................................................................
        usra v2.2D, v5.2D, #32                       // .............e.......................................................................................................................................
        adcs x9, x14, x9                             // .............................................................*.......................................................................................
        and v25.16B, v16.16B, v29.16B                // ........................e............................................................................................................................
        adcs x17, x17, x4                            // ..............................................................*......................................................................................
        umlal v7.2D, v10.2S, v28.2S                  // ...........e.........................................................................................................................................
        umulh x12, x10, x12                          // ........................................................................*............................................................................
        adcs x10, x5, x24                            // ...............................................................*.....................................................................................
        usra v0.2D, v16.2D, #32                      // ...........................e.........................................................................................................................
        eor x5, x16, x7                              // ..........................................................................*..........................................................................
        ldp x16, x14, [x30, #cache_m31 - 96]         // ................................................................................................*....................................................
        adcs x6, x25, x6                             // ................................................................*....................................................................................
        shl v16.2D, v13.2D, #32                      // ..........................e..........................................................................................................................
        eor x24, x20, x19                            // .......................................................................................*.............................................................
        adcs x4, x8, xzr                             // .................................................................*...................................................................................
        ldp x20, x25, [sp, #cache_a13]               // ...............................................................................................*.....................................................
        umlal v25.2D, v24.2S, v19.2S                 // .........................e...........................................................................................................................
        adcs x23, x23, xzr                           // ..................................................................*..................................................................................
        usra v2.2D, v7.2D, #32                       // ...............e.....................................................................................................................................
        umlal v16.2D, v31.2S, v19.2S                 // ............................e........................................................................................................................
        adc x8, x13, xzr                             // ...................................................................*.................................................................................
        adds xzr, x7, #1                             // .........................................................................*...........................................................................
        mul v7.4S, v17.4S, v14.4S                    // ......e..............................................................................................................................................
        adcs x4, x4, x5                              // ...........................................................................*.........................................................................
        eor x5, x12, x7                              // ............................................................................*........................................................................
        adcs x23, x23, x5                            // .............................................................................*.......................................................................
        mul x12, x20, x16                            // ..................................................................................................*..................................................
        adc x5, x8, x7                               // ..............................................................................*......................................................................
        adds xzr, x19, #1                            // ....................................................................................*................................................................
        adcs x21, x9, x21                            // ......................................................................................*..............................................................
        eor x8, x25, x14                             // .................................................................................................*...................................................
        usra v0.2D, v25.2D, #32                      // .............................e.......................................................................................................................
        adcs x13, x17, x24                           // ........................................................................................*............................................................
        stp x29, x21, [x1, #0]                       // ..............................................................................................*......................................................
        umulh x20, x20, x16                          // ...................................................................................................*.................................................
        uaddlp v10.2D, v7.4S                         // .........e...........................................................................................................................................
        adcs x17, x10, x19                           // .........................................................................................*...........................................................
        mov x3, v2.d[1]                              // ................................e....................................................................................................................
        ldp x29, x24, [sp, #cache_a03]               // .........................................................................................................................*...........................
        adcs x25, x6, x19                            // ..........................................................................................*..........................................................
        ldp x6, x21, [x30, #cache_m30 - 96]          // ..........................................................................................................................*..........................
        eor x10, x12, x8                             // .....................................................................................................*...............................................
        adcs x9, x4, x19                             // ...........................................................................................*.........................................................
        mov x26, v0.d[0]                             // ...............................e.....................................................................................................................
        ldp x4, x16, [x30, #cache_m21 - 96]          // .......................................................................................................................................*.............
        adcs x12, x23, x19                           // ............................................................................................*........................................................
        adc x5, x5, x19                              // .............................................................................................*.......................................................
        adds xzr, x8, #1                             // ....................................................................................................*................................................
        ldp x7, x19, [sp, #cache_a12]                // ......................................................................................................................................*..............
        adcs x14, x25, x10                           // ......................................................................................................*..............................................
        mul x25, x29, x6                             // ............................................................................................................................*........................
        eor x20, x20, x8                             // .......................................................................................................*.............................................
        adcs x23, x9, x20                            // ........................................................................................................*............................................
        ldp x9, x20, [sp, #cache_a02]                // ...........................................................................................................e.........................................
        eor x24, x24, x21                            // ...........................................................................................................................*.........................
        adcs x12, x12, x8                            // .........................................................................................................*...........................................
        adc x10, x5, x8                              // ..........................................................................................................*..........................................
        adds xzr, x11, #1                            // ................................................................................................................*....................................
        umulh x5, x29, x6                            // .............................................................................................................................*.......................
        shl v15.2D, v10.2D, #32                      // ............e........................................................................................................................................
        adcs x8, x13, x22                            // ..................................................................................................................*..................................
        eor x13, x25, x24                            // ...............................................................................................................................*.....................
        adcs x29, x17, x15                           // ....................................................................................................................*................................
        umlal v15.2D, v21.2S, v28.2S                 // ..............e......................................................................................................................................
        adcs x22, x14, x11                           // .....................................................................................................................*...............................
        mov x17, v16.d[0]                            // ...................................e.................................................................................................................
        adcs x21, x23, x11                           // ......................................................................................................................*..............................
        mul x23, x7, x4                              // .........................................................................................................................................*...........
        adcs x14, x12, x11                           // .......................................................................................................................*.............................
        eor x12, x19, x16                            // ........................................................................................................................................*............
        mov x16, v2.d[0]                             // ..............................e......................................................................................................................
        adc x15, x10, x11                            // ........................................................................................................................*............................
        adds xzr, x24, #1                            // ..............................................................................................................................*......................
        eor x19, x5, x24                             // .................................................................................................................................*...................
        adcs x11, x29, x13                           // ................................................................................................................................*....................
        umulh x29, x7, x4                            // ..........................................................................................................................................*..........
        adcs x13, x22, x19                           // ..................................................................................................................................*..................
        ldp x6, x5, [sp, #cache_a01]                 // ...............................................................................e.....................................................................
        ldp x7, x25, [x30, #cache_m20]               // ............................................................................................................e........................................
        adcs x19, x21, x24                           // ...................................................................................................................................*.................
        mov x21, v15.d[1]                            // ..................................e..................................................................................................................
        eor x22, x23, x12                            // ............................................................................................................................................*........
        adcs x14, x14, x24                           // ....................................................................................................................................*................
        mov x23, v16.d[1]                            // ....................................e................................................................................................................
        adc x15, x15, x24                            // .....................................................................................................................................*...............
        adds xzr, x12, #1                            // ...........................................................................................................................................*.........
        ldp x24, x10, [x30], #96                     // ................................................................................e....................................................................
        adcs x11, x11, x22                           // .............................................................................................................................................*.......
        mul x22, x9, x7                              // ..............................................................................................................e......................................
        eor x4, x29, x12                             // ...............................................................................................................................................*.....
        adcs x4, x13, x4                             // ................................................................................................................................................*....
        stp x8, x11, [x1, #16]                       // ..............................................................................................................................................*......
        adcs x13, x19, x12                           // .................................................................................................................................................*...
        eor x11, x20, x25                            // .............................................................................................................e.......................................
        ldp x20, x29, [x1, #32]!                     // ..........................................e..........................................................................................................
        adcs x14, x14, x12                           // ..................................................................................................................................................*..
        adc x15, x15, x12                            // ...................................................................................................................................................*.
        mov x12, x4                                  // ....................................................................................................................................................*
        umulh x8, x9, x7                             // ...............................................................................................................e.....................................

        // Loop control for the multiply-add loop: count (x27, see the
        // #define at the top of the file) is decremented by one per pass.
        // While it is still nonzero, branch back to the loop head label;
        // once it reaches zero, execution falls through to the postamble
        // label immediately below. Plain sub (not subs) and cbnz are used,
        // so neither instruction writes the condition flags.
        sub count, count, #1
        cbnz count, Lbignum_emontredc_8n_cdiff_maddloop_neon
Lbignum_emontredc_8n_cdiff_inner_loop_postamble:
        umulh x19, x6, x24                           // ..............*...........................................................................................................
        ldp x7, x9, [sp, #cache_a23]                 // .............*............................................................................................................
        adds x4, x21, x16                            // .*........................................................................................................................
        mov x25, v0.d[1]                             // ..*.......................................................................................................................
        eor x5, x5, x10                              // *.........................................................................................................................
        adcs x17, x17, x3                            // ....*.....................................................................................................................
        ldp x16, x10, [x1, #16]                      // ...*......................................................................................................................
        adcs x21, x23, x26                           // ......*...................................................................................................................
        eor x8, x8, x11                              // ...................*......................................................................................................
        adc x23, x25, xzr                            // .......*..................................................................................................................
        adds x20, x12, x20                           // ........*.................................................................................................................
        adcs x12, x13, x29                           // ..........*...............................................................................................................
        mov x25, v15.d[0]                            // ...........*..............................................................................................................
        adcs x13, x14, x16                           // ............*.............................................................................................................
        eor x16, x19, x5                             // .........................................*................................................................................
        adcs x29, x15, x10                           // ...............*..........................................................................................................
        ldp x14, x19, [x30, #cache_m32 - 96]         // ................*.........................................................................................................
        mul x15, x6, x24                             // .........*................................................................................................................
        adc x24, xzr, xzr                            // .................*........................................................................................................
        adds x6, x4, x25                             // ..................*.......................................................................................................
        adcs x10, x17, x4                            // ....................*.....................................................................................................
        eor x4, x22, x11                             // .....*....................................................................................................................
        adcs x17, x21, x17                           // .....................*....................................................................................................
        eor x22, x9, x19                             // ......................*...................................................................................................
        adcs x9, x23, x21                            // .......................*..................................................................................................
        adc x21, xzr, x23                            // .........................*................................................................................................
        adds x23, x10, x25                           // ..........................*...............................................................................................
        eor x15, x15, x5                             // ........................*.................................................................................................
        adcs x19, x17, x6                            // ...........................*..............................................................................................
        ldp x26, xzr, [sp, #16]                      // ...........................................................................................................*..............
        sub x2, x2, x0                               // .....................................................................................................................*....
        adcs x10, x9, x10                            // ............................*.............................................................................................
        adcs x17, x21, x17                           // .............................*............................................................................................
        adcs x9, xzr, x9                             // ..............................*...........................................................................................
        adc x21, xzr, x21                            // ...............................*..........................................................................................
        adds x25, x25, x20                           // ................................*.........................................................................................
        mul x20, x7, x14                             // .................................*........................................................................................
        adcs x6, x6, x12                             // ..................................*.......................................................................................
        adcs x23, x23, x13                           // ...................................*......................................................................................
        adcs x13, x19, x29                           // .....................................*....................................................................................
        umulh x19, x7, x14                           // ....................................*.....................................................................................
        ldp x14, x29, [x30, #cache_m31 - 96]         // .......................................*..................................................................................
        adcs x10, x10, x24                           // ........................................*.................................................................................
        ldp x12, x7, [sp, #cache_a13]                // ...........................................*..............................................................................
        adcs x17, x17, xzr                           // ..........................................*...............................................................................
        adcs x24, x9, xzr                            // ............................................*.............................................................................
        adc x9, x21, xzr                             // .............................................*............................................................................
        adds xzr, x22, #1                            // ..............................................*...........................................................................
        eor x21, x20, x22                            // ......................................*...................................................................................
        adcs x20, x17, x21                           // ...............................................*..........................................................................
        eor x17, x19, x22                            // ................................................*.........................................................................
        mul x19, x12, x14                            // ..................................................*.......................................................................
        eor x29, x7, x29                             // ......................................................*...................................................................
        adcs x17, x24, x17                           // .................................................*........................................................................
        adc x9, x9, x22                              // ...................................................*......................................................................
        adds xzr, x5, #1                             // ....................................................*.....................................................................
        umulh x21, x12, x14                          // .........................................................*................................................................
        ldp x14, x24, [sp, #cache_a03]               // ...........................................................*..............................................................
        adcs x6, x6, x15                             // .....................................................*....................................................................
        adcs x7, x23, x16                            // .......................................................*..................................................................
        ldp x15, x12, [x30, #cache_m30 - 96]         // .............................................................*............................................................
        eor x19, x19, x29                            // ..............................................................*...........................................................
        adcs x13, x13, x5                            // ..........................................................*...............................................................
        stp x25, x6, [x1, #0]                        // ........................................................*.................................................................
        adcs x16, x10, x5                            // ............................................................*.............................................................
        ldp x10, x6, [x30, #cache_m21 - 96]          // ................................................................*.........................................................
        adcs x23, x20, x5                            // ...............................................................*..........................................................
        adcs x25, x17, x5                            // .................................................................*........................................................
        umulh x30, x14, x15                          // .............................................................................*............................................
        ldp x17, x22, [sp, #cache_a12]               // ....................................................................*.....................................................
        adc x9, x9, x5                               // ..................................................................*.......................................................
        adds xzr, x29, #1                            // ...................................................................*......................................................
        eor x21, x21, x29                            // .......................................................................*..................................................
        adcs x16, x16, x19                           // .....................................................................*....................................................
        adcs x5, x23, x21                            // ........................................................................*.................................................
        eor x23, x24, x12                            // .........................................................................*................................................
        mul x12, x17, x10                            // ...................................................................................*......................................
        adcs x19, x25, x29                           // ..........................................................................*...............................................
        adc x29, x9, x29                             // ...........................................................................*..............................................
        adds xzr, x11, #1                            // ............................................................................*.............................................
        adcs x7, x7, x4                              // ..............................................................................*...........................................
        adcs x21, x13, x8                            // ................................................................................*.........................................
        mul x4, x14, x15                             // ......................................................................*...................................................
        eor x9, x30, x23                             // ........................................................................................*.................................
        adcs x30, x16, x11                           // .................................................................................*........................................
        adcs x24, x5, x11                            // ..................................................................................*.......................................
        adcs x16, x19, x11                           // ....................................................................................*.....................................
        adc x19, x29, x11                            // ......................................................................................*...................................
        adds xzr, x23, #1                            // .......................................................................................*..................................
        eor x8, x4, x23                              // ...............................................................................*..........................................
        adcs x4, x21, x8                             // .........................................................................................*................................
        umulh x21, x17, x10                          // ..........................................................................................*...............................
        adcs x8, x30, x9                             // ...........................................................................................*..............................
        ldp x10, x30, [x1, #32]                      // .........................................................................................................*................
        adcs x25, x24, x23                           // ............................................................................................*.............................
        eor x11, x22, x6                             // .....................................................................................*....................................
        adcs x22, x16, x23                           // ..............................................................................................*...........................
        eor x5, x12, x11                             // .............................................................................................*............................
        adc x29, x19, x23                            // ...............................................................................................*..........................
        adds xzr, x11, #1                            // ................................................................................................*.........................
        eor x14, x21, x11                            // ..................................................................................................*.......................
        adcs x9, x4, x5                              // .................................................................................................*........................
        stp x7, x9, [x1, #16]                        // ....................................................................................................*.....................
        adcs x9, x8, x14                             // ...................................................................................................*......................
        adcs x19, x25, x11                           // .....................................................................................................*....................
        ldp x8, x21, [x1, #48]                       // ..........................................................................................................*...............
        mov x24, x9                                  // ........................................................................................................*.................
        adcs x5, x22, x11                            // ......................................................................................................*...................
        adc x16, x29, x11                            // .......................................................................................................*..................
        adds xzr, x28, x28                           // ............................................................................................................*.............
        adcs x17, x10, x24                           // .............................................................................................................*............
        adcs x14, x30, x19                           // ..............................................................................................................*...........
        ldr x30, [sp, #8]                            // .........................................................................................................................*
        adcs x8, x8, x5                              // ...............................................................................................................*..........
        stp x17, x14, [x1, #32]                      // ..................................................................................................................*.......
        adcs x9, x21, x16                            // ................................................................................................................*.........
        csetm x28, cs                                // .................................................................................................................*........
        stp x8, x9, [x1, #48]                        // ...................................................................................................................*......
        sub x26, x26, #1                             // .......................................................................................................................*..
        sub x1, x1, x0                               // ....................................................................................................................*.....
        stp x26, xzr, [sp, #16]                      // ........................................................................................................................*.
        add x1, x1, #32                              // ......................................................................................................................*...

// ----------------------------------------------------------------------------
// End of one outer-loop pass. x26 is the outer iteration counter: the code
// above reloads it from the stack slot at [sp, #16], decrements it by one
// and writes it back, so a nonzero value here means more passes remain.
// ----------------------------------------------------------------------------
Lbignum_emontredc_8n_cdiff_outer_loop_end:

        cbnz x26, Lbignum_emontredc_8n_cdiff_outerloop
        // x28 is a carry mask produced by the csetm above: all ones if the
        // final adcs of the pass carried out, zero otherwise. Negating it
        // gives 1 or 0 in x0, the "extra result bit" named in the header.
        neg x0, x28

// ----------------------------------------------------------------------------
// Function exit: release the local stack area, reload the saved register
// pairs and return with the result in x0.
// NOTE(review): the CFI_* macros come from _internal_s2n_bignum_arm.h, which
// is not visible here. Presumably CFI_INC_SP adds to sp and CFI_STACKLOAD2
// is a pair load from [sp, #offset], each with unwind info - confirm.
// NOTE(review): branches to this label from earlier in the function, if any,
// are outside the visible section - confirm x0 is set on those paths.
// ----------------------------------------------------------------------------
Lbignum_emontredc_8n_cdiff_end:
        // Local area is 32 + 6*16 bytes. The loop body above uses [sp, #8]
        // and [sp, #16] within the first 32 bytes, and the six 16-byte
        // cache_a* slots at offsets 32 .. 32+5*16 in the remainder.
        CFI_INC_SP(32)
        CFI_INC_SP((6*16))
        // Register save area: ten 16-byte pairs. d8-d15 and x19-x30 are the
        // callee-saved registers under AAPCS64 (x29 = FP, x30 = LR); x30 is
        // used as a general pointer in the loop body, so it must be reloaded
        // here before the return.
        CFI_STACKLOAD2(d8,d9,(0*16))
        CFI_STACKLOAD2(d10,d11,(1*16))
        CFI_STACKLOAD2(d12,d13,(2*16))
        CFI_STACKLOAD2(d14,d15,(3*16))
        CFI_STACKLOAD2(x29,x30,(4*16))
        CFI_STACKLOAD2(x27,x28,(5*16))
        CFI_STACKLOAD2(x25,x26,(6*16))
        CFI_STACKLOAD2(x23,x24,(7*16))
        CFI_STACKLOAD2(x21,x22,(8*16))
        CFI_STACKLOAD2(x19,x20,(9*16))
        // Release the 10*16-byte save area itself.
        CFI_INC_SP((10*16))

        CFI_RET

S2N_BN_SIZE_DIRECTIVE(bignum_emontredc_8n_cdiff)
