This patch contains files extracted from the GMP development tree as of
2004-09-13. It includes (hopefully) all the files and changes needed to
build the new subquadratic gcd and gcdext code, based on Schönhage's
algorithm.

To build, unpack gmp-4.1.3.tar.gz and apply this patch. Note that I patch
both configure.in and configure, and both mpn/Makefile.am and
mpn/Makefile.in (to regenerate these files properly, you need fairly old
versions of autoconf and automake). Then build with the usual
./configure && make && make check.

Caveats:

* The code in the development tree is fairly well tested and believed to
  be correct, but it is possible that it depends on other changes and
  that I forgot to include some crucial change when making this gcd
  pre-release.

* The tuning code is not included. You have to set sensible values for
  HGCD_SCHOENHAGE_THRESHOLD, GCD_SCHOENHAGE_THRESHOLD and
  GCDEXT_SCHOENHAGE_THRESHOLD in gmp-impl.h or in your gmp-mparam.h.
  Tuned values from the GMP daily builds are included below, and a
  sketch of corresponding gmp-mparam.h settings follows the table.

If you use this pre-release code, please let me know and tell me how it
works out.

Happy hacking,
/Niels Möller

powerpc630-ibm-aix5.1.0.0  HGCD_SCHOENHAGE  56
ultrasparc2-sun-solaris2.7  HGCD_SCHOENHAGE  58
ultrasparc2-sun-solaris2.7  HGCD_SCHOENHAGE  63
alpha-dec-osf4.0e  HGCD_SCHOENHAGE  81
mips64-sgi-irix6.5  HGCD_SCHOENHAGE  89
powerpc750-apple-darwin5.5  HGCD_SCHOENHAGE  93
supersparc-sun-solaris2.7  HGCD_SCHOENHAGE  93
alphaev56-dec-osf4.0e  HGCD_SCHOENHAGE  97
powerpc64-unknown-linux-gnu  HGCD_SCHOENHAGE  97
supersparc-sun-solaris2.7  HGCD_SCHOENHAGE  97
viac32-unknown-freebsd5.2.1  HGCD_SCHOENHAGE  121
pentium4-pc-linux-gnu  HGCD_SCHOENHAGE  133
ultrasparc2i-unknown-linux-gnu  HGCD_SCHOENHAGE  133
itanium2-unknown-linux-gnu  HGCD_SCHOENHAGE  145
pentium3-unknown-freebsd4.9  HGCD_SCHOENHAGE  152
itanium2-hp-hpux11.23  HGCD_SCHOENHAGE  159
pentium3-unknown-freebsd4.9  HGCD_SCHOENHAGE  166
x86_64-unknown-freebsd4.9  HGCD_SCHOENHAGE  166
pentium4-unknown-freebsd5.2.1  HGCD_SCHOENHAGE  174
pentiumpro-pc-linux-gnu  HGCD_SCHOENHAGE  174
pentium3-pc-linux-gnu  HGCD_SCHOENHAGE  182
pentium4-pc-linux-gnu  HGCD_SCHOENHAGE  191
x86_64-unknown-linux-gnu  HGCD_SCHOENHAGE  191
ultrasparc2i-unknown-linux-gnu  HGCD_SCHOENHAGE  200
x86_64-unknown-linux-gnu  HGCD_SCHOENHAGE  200
ultrasparc-sun-solaris2.7  HGCD_SCHOENHAGE  210
k62-unknown-freebsd4.9  HGCD_SCHOENHAGE  242
k63-unknown-freebsd4.7  HGCD_SCHOENHAGE  242
hppa2.0w-hp-hpux11.11  HGCD_SCHOENHAGE  254
itanium2-unknown-linux-gnu  HGCD_SCHOENHAGE  337
alphaev6-dec-osf4.0f  HGCD_SCHOENHAGE  517
alphaev6-unknown-linux-gnu  HGCD_SCHOENHAGE  517
alphaev67-unknown-linux-gnu  HGCD_SCHOENHAGE  517
alphaev68-dec-osf5.1  HGCD_SCHOENHAGE  517
alphaev68-dec-osf5.1b  HGCD_SCHOENHAGE  517
alphaev7-dec-osf5.1b  HGCD_SCHOENHAGE  517
pentium4-pc-linux-gnu  GCD_SCHOENHAGE  372
ultrasparc2-sun-solaris2.7  GCD_SCHOENHAGE  394
powerpc630-ibm-aix5.1.0.0  GCD_SCHOENHAGE  426
powerpc64-unknown-linux-gnu  GCD_SCHOENHAGE  427
ultrasparc2-sun-solaris2.7  GCD_SCHOENHAGE  433
itanium2-unknown-linux-gnu  GCD_SCHOENHAGE  445
ultrasparc2i-unknown-linux-gnu  GCD_SCHOENHAGE  449
alpha-dec-osf4.0e  GCD_SCHOENHAGE  469
mips64-sgi-irix6.5  GCD_SCHOENHAGE  469
pentium4-pc-linux-gnu  GCD_SCHOENHAGE  489
itanium2-unknown-linux-gnu  GCD_SCHOENHAGE  491
supersparc-sun-solaris2.7  GCD_SCHOENHAGE  499
x86_64-unknown-linux-gnu  GCD_SCHOENHAGE  514
alphaev56-dec-osf4.0e  GCD_SCHOENHAGE  515
supersparc-sun-solaris2.7  GCD_SCHOENHAGE  515
pentium4-unknown-freebsd5.2.1  GCD_SCHOENHAGE  537
x86_64-unknown-linux-gnu  GCD_SCHOENHAGE  537
powerpc750-apple-darwin5.5  GCD_SCHOENHAGE  602
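For reference, here is a minimal sketch (not part of the patch) of what
such settings might look like in a gmp-mparam.h. The table parameters
HGCD_SCHOENHAGE, GCD_SCHOENHAGE and GCDEXT_SCHOENHAGE correspond to the
*_THRESHOLD macros; the defaults added to gmp-impl.h are wrapped in
#ifndef, so a definition in gmp-mparam.h (which gmp-impl.h pulls in
first) should take precedence. The numbers below are taken from one of
the pentium4-pc-linux-gnu rows and are illustrative only.

  /* Illustrative values only, from a pentium4-pc-linux-gnu row of the
     table above; substitute the figures for your own machine. */
  #define HGCD_SCHOENHAGE_THRESHOLD    133
  #define GCD_SCHOENHAGE_THRESHOLD     372
  #define GCDEXT_SCHOENHAGE_THRESHOLD  409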
pentiumpro-pc-linux-gnu  GCD_SCHOENHAGE  649
viac32-unknown-freebsd5.2.1  GCD_SCHOENHAGE  655
pentium3-pc-linux-gnu  GCD_SCHOENHAGE  751
x86_64-unknown-freebsd4.9  GCD_SCHOENHAGE  751
pentium3-unknown-freebsd4.9  GCD_SCHOENHAGE  826
pentium3-unknown-freebsd4.9  GCD_SCHOENHAGE  827
alphaev7-dec-osf5.1b  GCD_SCHOENHAGE  911
itanium2-hp-hpux11.23  GCD_SCHOENHAGE  948
ultrasparc2i-unknown-linux-gnu  GCD_SCHOENHAGE  998
ultrasparc-sun-solaris2.7  GCD_SCHOENHAGE  1042
k62-unknown-freebsd4.9  GCD_SCHOENHAGE  1097
k63-unknown-freebsd4.7  GCD_SCHOENHAGE  1097
alphaev6-dec-osf4.0f  GCD_SCHOENHAGE  1102
alphaev68-dec-osf5.1  GCD_SCHOENHAGE  1102
alphaev68-dec-osf5.1b  GCD_SCHOENHAGE  1102
alphaev6-unknown-linux-gnu  GCD_SCHOENHAGE  1212
alphaev67-unknown-linux-gnu  GCD_SCHOENHAGE  1333
hppa2.0w-hp-hpux11.11  GCD_SCHOENHAGE  1386
powerpc630-ibm-aix5.1.0.0  GCDEXT_SCHOENHAGE  182
alpha-dec-osf4.0e  GCDEXT_SCHOENHAGE  221
ultrasparc2i-unknown-linux-gnu  GCDEXT_SCHOENHAGE  256
ultrasparc2-sun-solaris2.7  GCDEXT_SCHOENHAGE  271
alphaev56-dec-osf4.0e  GCDEXT_SCHOENHAGE  293
mips64-sgi-irix6.5  GCDEXT_SCHOENHAGE  293
ultrasparc2-sun-solaris2.7  GCDEXT_SCHOENHAGE  298
powerpc750-apple-darwin5.5  GCDEXT_SCHOENHAGE  311
powerpc64-unknown-linux-gnu  GCDEXT_SCHOENHAGE  322
supersparc-sun-solaris2.7  GCDEXT_SCHOENHAGE  322
supersparc-sun-solaris2.7  GCDEXT_SCHOENHAGE  342
viac32-unknown-freebsd5.2.1  GCDEXT_SCHOENHAGE  372
pentium4-pc-linux-gnu  GCDEXT_SCHOENHAGE  409
pentium3-unknown-freebsd4.9  GCDEXT_SCHOENHAGE  469
pentiumpro-pc-linux-gnu  GCDEXT_SCHOENHAGE  489
pentium3-pc-linux-gnu  GCDEXT_SCHOENHAGE  514
pentium3-unknown-freebsd4.9  GCDEXT_SCHOENHAGE  514
ultrasparc2i-unknown-linux-gnu  GCDEXT_SCHOENHAGE  565
x86_64-unknown-freebsd4.9  GCDEXT_SCHOENHAGE  565
pentium4-pc-linux-gnu  GCDEXT_SCHOENHAGE  590
ultrasparc-sun-solaris2.7  GCDEXT_SCHOENHAGE  590
itanium2-hp-hpux11.23  GCDEXT_SCHOENHAGE  649
pentium4-unknown-freebsd5.2.1  GCDEXT_SCHOENHAGE  649
x86_64-unknown-linux-gnu  GCDEXT_SCHOENHAGE  649
k63-unknown-freebsd4.7  GCDEXT_SCHOENHAGE  683
itanium2-unknown-linux-gnu  GCDEXT_SCHOENHAGE  713
k62-unknown-freebsd4.9  GCDEXT_SCHOENHAGE  751
x86_64-unknown-linux-gnu  GCDEXT_SCHOENHAGE  751
hppa2.0w-hp-hpux11.11  GCDEXT_SCHOENHAGE  862
itanium2-unknown-linux-gnu  GCDEXT_SCHOENHAGE  1048
alphaev6-dec-osf4.0f  GCDEXT_SCHOENHAGE  1212
alphaev68-dec-osf5.1  GCDEXT_SCHOENHAGE  1212
alphaev68-dec-osf5.1b  GCDEXT_SCHOENHAGE  1212
alphaev7-dec-osf5.1b  GCDEXT_SCHOENHAGE  1212
alphaev6-unknown-linux-gnu  GCDEXT_SCHOENHAGE  1333
alphaev67-unknown-linux-gnu  GCDEXT_SCHOENHAGE  1333

diff -x *~ -N -u -r gmp-4.1.3/configure.in gmp-4.1.3-gcdext/configure.in
--- gmp-4.1.3/configure.in Thu Apr 22 19:02:59 2004
+++ gmp-4.1.3-gcdext/configure.in Tue Sep 14 19:55:08 2004
@@ -1767,6 +1767,7 @@
   mul mul_fft mul_n mul_basecase sqr_basecase random random2 pow_1 \
   rootrem sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \
   bdivmod gcd_1 gcd gcdext tdiv_qr dc_divrem_n sb_divrem_mn jacbase \
+  hgcd2 hgcd qstack \
   $gmp_mpn_functions_optional"

 # the list of all object files used by mpn/Makefile.in and the
diff -x *~ -N -u -r gmp-4.1.3/configure gmp-4.1.3-gcdext/configure
--- gmp-4.1.3/configure Wed Apr 28 02:29:27 2004
+++ gmp-4.1.3-gcdext/configure Tue Sep 14 19:55:18 2004
@@ -18879,6 +18879,7 @@
   mul mul_fft mul_n mul_basecase sqr_basecase random random2 pow_1 \
   rootrem sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \
   bdivmod gcd_1 gcd gcdext tdiv_qr dc_divrem_n sb_divrem_mn jacbase \
+  hgcd2 hgcd qstack \
   $gmp_mpn_functions_optional"

 # the list of all object files used by mpn/Makefile.in and the
diff -x *~ -N
-u -r gmp-4.1.3/gmp-impl.h gmp-4.1.3-gcdext/gmp-impl.h --- gmp-4.1.3/gmp-impl.h Fri Apr 23 01:53:26 2004 +++ gmp-4.1.3-gcdext/gmp-impl.h Tue Sep 14 19:37:17 2004 @@ -1028,11 +1028,11 @@ #ifndef MPN_NORMALIZE #define MPN_NORMALIZE(DST, NLIMBS) \ do { \ - while (NLIMBS > 0) \ + while ((NLIMBS) > 0) \ { \ if ((DST)[(NLIMBS) - 1] != 0) \ break; \ - NLIMBS--; \ + (NLIMBS)--; \ } \ } while (0) #endif @@ -1044,7 +1044,7 @@ { \ if ((DST)[(NLIMBS) - 1] != 0) \ break; \ - NLIMBS--; \ + (NLIMBS)--; \ } \ } while (0) #endif @@ -2745,6 +2745,198 @@ } \ } while (0) + +/* HGCD definitions */ + +/* Limited by 2 + twice the bitsize of mp_size_t */ +#define QSTACK_MAX_QUOTIENTS 82 + +/* Name mangling */ +#define qstack_itch __gmpn_qstack_itch +#define qstack_init __gmpn_qstack_init +#define qstack_reset __gmpn_qstack_reset +#define qstack_rotate __gmpn_qstack_rotate + +#define mpn_hgcd2 __gmpn_hgcd2 +#define mpn_hgcd2_fix __gmpn_hgcd2_fix +#define mpn_hgcd2_lehmer_step __gmpn_hgcd2_lehmer_step +#define mpn_hgcd_max_recursion __gmpn_hgcd_max_recursion +#define mpn_hgcd_init_itch __gmpn_hgcd_init_itch +#define mpn_hgcd_init __gmpn_hgcd_init +#define mpn_hgcd_lehmer_itch __gmpn_hgcd_lehmer_itch +#define mpn_hgcd_lehmer __gmpn_hgcd_lehmer +#define mpn_hgcd_itch __gmpn_hgcd_itch +#define mpn_hgcd __gmpn_hgcd +#define mpn_hgcd_equal __gmpn_hgcd_equal +#define mpn_hgcd_fix __gmpn_hgcd_fix + +struct qstack +{ + /* Throughout the code we represent q = 1 with qsize = 0. */ + mp_size_t size[QSTACK_MAX_QUOTIENTS]; + mp_ptr limb; + mp_size_t limb_alloc; + + /* Number of quotients to keep when we discard old quotients */ + unsigned nkeep; + + /* Top quotient is of size size[size_next-1], and starts at + limb+limb_next - size[size_next-1]. We use size_next == 0 for an + empty stack.*/ + unsigned size_next; + mp_size_t limb_next; +}; + +mp_size_t +qstack_itch __GMP_PROTO ((mp_size_t size)); + +void +qstack_init __GMP_PROTO ((struct qstack *stack, + mp_size_t asize, + mp_limb_t *limbs, mp_size_t alloc)); + +void +qstack_reset __GMP_PROTO ((struct qstack *stack, + mp_size_t asize)); + +void +qstack_rotate __GMP_PROTO ((struct qstack *stack, + mp_size_t size)); + +#if WANT_ASSERT +void +__gmpn_qstack_sanity __GMP_PROTO ((struct qstack *stack)); +#define ASSERT_QSTACK __gmpn_qstack_sanity +#else +#define ASSERT_QSTACK(stack) +#endif + +struct hgcd2_row +{ + /* r = (-)u a + (-)v b */ + mp_limb_t u; + mp_limb_t v; +}; + +struct hgcd2 +{ + /* Sign of the first row, sign >= 0 implies that u >= 0 and v <= 0, + sign < 0 implies u <= 0, v >= 0 */ + int sign; + + struct hgcd2_row row[4]; +}; + +int +mpn_hgcd2 __GMP_PROTO ((struct hgcd2 *hgcd, + mp_limb_t ah, mp_limb_t al, + mp_limb_t bh, mp_limb_t bl, + struct qstack *quotients)); + +mp_size_t +mpn_hgcd2_fix __GMP_PROTO ((mp_ptr rp, mp_size_t ralloc, + int sign, + mp_limb_t u, mp_srcptr ap, mp_size_t asize, + mp_limb_t v, mp_srcptr bp, mp_size_t bsize)); + +int +mpn_hgcd2_lehmer_step __GMP_PROTO ((struct hgcd2 *hgcd, + mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize, + struct qstack *quotients)); + +unsigned +mpn_hgcd_max_recursion __GMP_PROTO ((mp_size_t n)); + +struct hgcd_row +{ + /* [rp, rsize] should always be normalized. */ + mp_ptr rp; mp_size_t rsize; + mp_ptr uvp[2]; +}; + +struct hgcd +{ + int sign; + + /* Space allocated for the uv entries, for sanity checking */ + mp_size_t alloc; + + /* Size of the largest u,v entry, usually row[3].uvp[1]. This + element should be normalized. Smaller elements must be zero + padded, and all unused limbs (i.e. 
between size and alloc) must + be zero. */ + mp_size_t size; + struct hgcd_row row[4]; +}; + +mp_size_t +mpn_hgcd_init_itch __GMP_PROTO ((mp_size_t size)); + +void +mpn_hgcd_init __GMP_PROTO ((struct hgcd *hgcd, + mp_size_t asize, + mp_limb_t *limbs)); + +mp_size_t +mpn_hgcd_lehmer_itch __GMP_PROTO ((mp_size_t asize)); + +int +mpn_hgcd_lehmer __GMP_PROTO ((struct hgcd *hgcd, + mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize, + struct qstack *quotients, + mp_ptr tp, mp_size_t talloc)); + +mp_size_t +mpn_hgcd_itch __GMP_PROTO ((mp_size_t size)); + +int +mpn_hgcd __GMP_PROTO ((struct hgcd *hgcd, + mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize, + struct qstack *quotients, + mp_ptr tp, mp_size_t talloc)); + +#if WANT_ASSERT +void +__gmpn_hgcd_sanity __GMP_PROTO ((const struct hgcd *hgcd, + mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize, + unsigned start, unsigned end)); +#define ASSERT_HGCD __gmpn_hgcd_sanity +#else +#define ASSERT_HGCD(hgcd, ap, asize, bp, bsize, start, end) +#endif + +int +mpn_hgcd_equal __GMP_PROTO ((const struct hgcd *A, const struct hgcd *B)); + +mp_size_t +mpn_hgcd_fix __GMP_PROTO ((mp_size_t k, + mp_ptr rp, mp_size_t ralloc, + int sign, mp_size_t uvsize, + const struct hgcd_row *s, + mp_srcptr ap, mp_srcptr bp, + mp_ptr tp, mp_size_t talloc)); + +#ifndef HGCD_SCHOENHAGE_THRESHOLD +#define HGCD_SCHOENHAGE_THRESHOLD 150 +#endif + +#if 0 +#ifndef GCD_LEHMER_THRESHOLD +#define GCD_LEHMER_THRESHOLD 200 +#endif +#endif + +#ifndef GCD_SCHOENHAGE_THRESHOLD +#define GCD_SCHOENHAGE_THRESHOLD 1000 +#endif + +#ifndef GCDEXT_SCHOENHAGE_THRESHOLD +#define GCDEXT_SCHOENHAGE_THRESHOLD 600 +#endif /* __GMPF_BITS_TO_PREC applies a minimum 53 bits, rounds upwards to a whole limb and adds an extra limb. 
__GMPF_PREC_TO_BITS drops that extra limb, diff -x *~ -N -u -r gmp-4.1.3/mpn/Makefile.am gmp-4.1.3-gcdext/mpn/Makefile.am --- gmp-4.1.3/mpn/Makefile.am Thu May 9 00:55:30 2002 +++ gmp-4.1.3-gcdext/mpn/Makefile.am Tue Sep 14 19:54:38 2004 @@ -40,11 +40,11 @@ cmp.c com_n.c copyd.c copyi.c \ dc_divrem_n.c dive_1.c diveby3.c divis.c divrem.c divrem_1.c divrem_2.c \ dump.c fib2_ui.c gcd.c \ - gcd_finda.c gcd_1.c gcdext.c get_str.c hamdist.c invert_limb.c \ + gcd_finda.c gcd_1.c gcdext.c get_str.c hamdist.c hgcd.c hgcd2.c invert_limb.c \ ior_n.c iorn_n.c jacbase.c lshift.c mod_1.c mod_34lsub1.c mode1o.c \ mul.c mul_1.c mul_2.c mul_fft.c mul_n.c mul_basecase.c \ nand_n.c nior_n.c perfsqr.c popcount.c \ - pre_divrem_1.c pre_mod_1.c pow_1.c random.c random2.c rshift.c \ + pre_divrem_1.c pre_mod_1.c pow_1.c qstack.c random.c random2.c rshift.c \ rootrem.c sb_divrem_mn.c scan0.c scan1.c set_str.c \ sqr_basecase.c sqr_diagonal.c \ sqrtrem.c sub.c sub_1.c sub_n.c submul_1.c \ diff -x *~ -N -u -r gmp-4.1.3/mpn/Makefile.in gmp-4.1.3-gcdext/mpn/Makefile.in --- gmp-4.1.3/mpn/Makefile.in Wed Apr 28 02:29:47 2004 +++ gmp-4.1.3-gcdext/mpn/Makefile.in Tue Sep 14 19:54:51 2004 @@ -178,11 +178,11 @@ cmp.c com_n.c copyd.c copyi.c \ dc_divrem_n.c dive_1.c diveby3.c divis.c divrem.c divrem_1.c divrem_2.c \ dump.c fib2_ui.c gcd.c \ - gcd_finda.c gcd_1.c gcdext.c get_str.c hamdist.c invert_limb.c \ + gcd_finda.c gcd_1.c gcdext.c get_str.c hamdist.c hgcd.c hgcd2.c invert_limb.c \ ior_n.c iorn_n.c jacbase.c lshift.c mod_1.c mod_34lsub1.c mode1o.c \ mul.c mul_1.c mul_2.c mul_fft.c mul_n.c mul_basecase.c \ nand_n.c nior_n.c perfsqr.c popcount.c \ - pre_divrem_1.c pre_mod_1.c pow_1.c random.c random2.c rshift.c \ + pre_divrem_1.c pre_mod_1.c pow_1.c qstack.c random.c random2.c rshift.c \ rootrem.c sb_divrem_mn.c scan0.c scan1.c set_str.c \ sqr_basecase.c sqr_diagonal.c \ sqrtrem.c sub.c sub_1.c sub_n.c submul_1.c \ diff -x *~ -N -u -r gmp-4.1.3/mpn/generic/gcd.c gmp-4.1.3-gcdext/mpn/generic/gcd.c --- gmp-4.1.3/mpn/generic/gcd.c Tue May 14 18:59:48 2002 +++ gmp-4.1.3-gcdext/mpn/generic/gcd.c Mon Sep 13 21:46:24 2004 @@ -1,7 +1,7 @@ /* mpn/gcd.c: mpn_gcd for gcd of two odd integers. -Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000, 2001, 2002 Free -Software Foundation, Inc. +Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000, 2001, 2002, 2003, +2004 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -44,10 +44,13 @@ K. Weber, The accelerated integer GCD algorithm, ACM Transactions on Mathematical Software, v. 21 (March), 1995, pp. 111-122. */ +#include /* for NULL */ + #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" + /* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated algorithm is used, otherwise the binary algorithm is used. This may be adjusted for different architectures. */ @@ -124,7 +127,6 @@ #if HAVE_NATIVE_mpn_gcd_finda #define find_a(cp) mpn_gcd_finda (cp) - #else static #if ! 
defined (__i386__) @@ -181,9 +183,9 @@ } #endif - -mp_size_t -mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) +/* v must be odd */ +static mp_size_t +gcd_binary_odd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) { mp_ptr orig_vp = vp; mp_size_t orig_vsize = vsize; @@ -439,4 +441,431 @@ MPN_COPY_INCR (gp, vp, vsize); TMP_FREE (marker); return vsize; +} + +#define EVEN_P(x) (((x) & 1) == 0) + +/* Allows an even v */ +static mp_size_t +gcd_binary (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) +{ + mp_size_t zero_words = 0; + mp_size_t gsize; + unsigned shift = 0; + + ASSERT (usize > 0); + ASSERT (vsize > 0); + + if (up[0] == 0 && vp[0] == 0) + { + do + gp[zero_words++] = 0; + while (up[zero_words] == 0 && vp[zero_words] == 0); + + up += zero_words; usize -= zero_words; + vp += zero_words; vsize -= zero_words; + gp += zero_words; + } + + /* Now u and v can have a common power of two < 2^GMP_NUMB_BITS */ + if (up[0] == 0) + { + ASSERT (vp[0] != 0); + if (EVEN_P (vp[0])) + { + count_trailing_zeros (shift, vp[0]); + ASSERT (shift > 0); + ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, shift)); + if (vp[vsize - 1] == 0) + vsize--; + } + } + else if (vp[0] == 0) + { + if (EVEN_P (up[0])) + { + count_trailing_zeros (shift, up[0]); + ASSERT (shift > 0); + } + while (vp[0] == 0) + { + vp++; + vsize--; + } + + if (EVEN_P (vp[0])) + { + unsigned vcount; + + count_trailing_zeros (vcount, vp[0]); + ASSERT (vcount > 0); + ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, vcount)); + if (vp[vsize - 1] == 0) + vsize--; + } + } + else if (EVEN_P (vp[0])) + { + unsigned vcount; + count_trailing_zeros (vcount, vp[0]); + ASSERT (vcount > 0); + ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, vcount)); + if (vp[vsize - 1] == 0) + vsize--; + + if (EVEN_P (up[0])) + { + unsigned ucount; + count_trailing_zeros (ucount, up[0]); + ASSERT (ucount > 0); + shift = MIN (ucount, vcount); + } + } + + gsize = gcd_binary_odd (gp, up, usize, vp, vsize); + if (shift) + { + mp_limb_t cy = mpn_lshift (gp, gp, gsize, shift); + if (cy) + gp[gsize++] = cy; + } + return gsize + zero_words; +} + +#define MPN_LEQ_P(ap, asize, bp, bsize) \ +((asize) < (bsize) || ((asize) == (bsize) \ + && mpn_cmp ((ap), (bp), (asize)) <= 0)) + +/* Sets (a, b, c, d) <-- (c, d, a, b) */ +#define NHGCD_SWAP4_2(row) \ +do { \ + struct hgcd_row __nhgcd_swap4_2_tmp; \ + __nhgcd_swap4_2_tmp = row[0]; \ + row[0] = row[2]; \ + row[2] = __nhgcd_swap4_2_tmp; \ + __nhgcd_swap4_2_tmp = row[1]; \ + row[1] = row[3]; \ + row[3] = __nhgcd_swap4_2_tmp; \ +} while (0) + +/* Sets (a, b, c) <-- (b, c, a) */ +#define NHGCD_SWAP3_LEFT(row) \ +do { \ + struct hgcd_row __nhgcd_swap4_left_tmp; \ + __nhgcd_swap4_left_tmp = row[0]; \ + row[0] = row[1]; \ + row[1] = row[2]; \ + row[2] = __nhgcd_swap4_left_tmp; \ +} while (0) + +static mp_size_t +hgcd_tdiv (mp_ptr qp, + mp_ptr rp, mp_size_t *rsizep, + mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize) +{ + mp_size_t qsize; + mp_size_t rsize; + + mpn_tdiv_qr (qp, rp, 0, ap, asize, bp, bsize); + + rsize = bsize; + MPN_NORMALIZE (rp, rsize); + *rsizep = rsize; + + qsize = asize - bsize + 1; + qsize -= (qp[qsize - 1] == 0); + + if (qsize == 1 && qp[0] == 1) + return 0; + + return qsize; +} + + +#if 0 +#define GCD_LEHMER_ITCH(asize) (5*((asize) + 1)) + +static mp_size_t +gcd_lehmer (mp_ptr gp, mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize, + mp_ptr tp, mp_size_t talloc) +{ + struct hgcd_row r[4]; + mp_ptr qp; + mp_size_t qsize; + mp_size_t ralloc = asize + 
1; + + ASSERT (asize >= bsize); + ASSERT (bsize > 0); + +#if 0 + if (BELOW_THRESHOLD (asize, MPN_GCD_LEHMER_THRESHOLD)) + { + ASSERT (asize + bsize + 2 <= talloc); + + MPN_COPY (tp, ap, asize); + MPN_COPY (tp + asize + 1, bp, bsize); + return nhgcd_gcd_binary (gp, tp, asize, tp + asize + 1, bsize); + } +#endif + + ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); + ASSERT (5 * asize + 4 <= talloc); + + r[0].rp = tp; tp += ralloc; talloc -= ralloc; + r[1].rp = tp; tp += ralloc; talloc -= ralloc; + r[2].rp = tp; tp += ralloc; talloc -= ralloc; + r[3].rp = tp; tp += ralloc; talloc -= ralloc; + qp = tp; tp += asize; talloc -= asize; + + MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize; + MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize; + +#if 0 + /* u and v fields aren't used, but zero them out so that we can call + trace_nhgcd_row */ + r[0].uvp[0] = r[0].uvp[1] = NULL; + r[1].uvp[0] = r[1].uvp[1] = NULL; + r[2].uvp[0] = r[2].uvp[1] = NULL; + r[3].uvp[0] = r[3].uvp[1] = NULL; +#endif + + while (ABOVE_THRESHOLD (r[0].rsize, GCD_LEHMER_THRESHOLD) && r[1].rsize > 0) + { + struct hgcd2 hgcd; + int res = mpn_hgcd2_lehmer_step (&hgcd, + r[0].rp, r[0].rsize, + r[1].rp, r[1].rsize, + NULL); + + if (!res || (res == 2 && hgcd.row[0].v == 0)) + { + qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize, + r[0].rp, r[0].rsize, + r[1].rp, r[1].rsize); + NHGCD_SWAP3_LEFT (r); + } + else + { + const struct hgcd2_row *s = hgcd.row + (res - 2); + int sign = hgcd.sign; + if (res == 3) + sign = ~sign; + + /* s[0] and s[1] correct. */ + r[2].rsize + = mpn_hgcd2_fix (r[2].rp, ralloc, + sign, + s[0].u, r[0].rp, r[0].rsize, + s[0].v, r[1].rp, r[1].rsize); + + r[3].rsize + = mpn_hgcd2_fix (r[3].rp, ralloc, + ~sign, + s[1].u, r[0].rp, r[0].rsize, + s[1].v, r[1].rp, r[1].rsize); + + NHGCD_SWAP4_2 (r); + } + } + + if (r[1].rsize == 0) + { + MPN_COPY (gp, r[0].rp, r[0].rsize); + return r[0].rsize; + } + + return gcd_binary (gp, r[0].rp, r[0].rsize, r[1].rp, r[1].rsize); +} +#endif + +static mp_size_t +gcd_schoenhage_itch (mp_size_t asize) +{ + /* Size for hgcd calls */ + mp_size_t ralloc = asize + 1; + mp_size_t hgcd_size = (asize + 1) / 2; + return (4 * ralloc /* Remainder storage */ + + mpn_hgcd_init_itch (hgcd_size) /* hgcd storage */ + + qstack_itch (hgcd_size) + + mpn_hgcd_itch (hgcd_size) /* nhgcd call */ + + 1+ 3 * asize / 4); /* hgcd_fix */ +} + +static mp_size_t +gcd_schoenhage (mp_ptr gp, mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize, + mp_ptr tp, mp_size_t talloc) +{ + mp_size_t scratch; + struct hgcd hgcd; + struct qstack quotients; + struct hgcd_row r[4]; + + mp_size_t ralloc = asize + 1; + + ASSERT (asize >= bsize); + ASSERT (bsize > 0); + + ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); + + ASSERT (4 * ralloc <= talloc); + tp += ralloc; talloc -= ralloc; + r[0].rp = tp; tp += ralloc; talloc -= ralloc; + r[1].rp = tp; tp += ralloc; talloc -= ralloc; + r[2].rp = tp; tp += ralloc; talloc -= ralloc; + r[3].rp = tp; tp += ralloc; talloc -= ralloc; + + MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize; + MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize; + +#if 0 + /* We don't use the u and v fields, but zero them out so that we can + call trace_nhgcd_row while debugging. 
*/ + r[0].uvp[0] = r[0].uvp[1] = NULL; + r[1].uvp[0] = r[1].uvp[1] = NULL; + r[2].uvp[0] = r[2].uvp[1] = NULL; + r[3].uvp[0] = r[3].uvp[1] = NULL; +#endif + + scratch = mpn_hgcd_init_itch ((asize + 1)/2); + ASSERT (scratch <= talloc); + mpn_hgcd_init (&hgcd, (asize + 1)/2, tp); + tp += scratch; talloc -= scratch; + + { + mp_size_t nlimbs = qstack_itch ((asize + 1)/2); + + ASSERT (nlimbs <= talloc); + + qstack_init ("ients, (asize + 1) / 2, tp, nlimbs); + + tp += nlimbs; + talloc -= nlimbs; + } + + while (ABOVE_THRESHOLD (r[0].rsize, GCD_SCHOENHAGE_THRESHOLD) + && r[1].rsize > 0) + { + mp_size_t k = r[0].rsize / 2; + int res; + +#if 0 + trace ("nhgcd_gcd_schoenhage\n"); + trace_nhgcd_row (r); + trace_nhgcd_row (r + 1); +#endif + if (r[1].rsize <= k) + goto euclid; + + qstack_reset ("ients, r[0].rsize - k); + + res = mpn_hgcd (&hgcd, + r[0].rp + k, r[0].rsize - k, + r[1].rp + k, r[1].rsize - k, + "ients, + tp, talloc); + + if (res == 0 || res == 1) + { + euclid: + ASSERT (r[0].rsize - r[1].rsize + 1 <= talloc); + hgcd_tdiv (tp, r[2].rp, &r[2].rsize, + r[0].rp, r[0].rsize, + r[1].rp, r[1].rsize); + + NHGCD_SWAP3_LEFT (r); + } + else + { + const struct hgcd_row *s = hgcd.row + (res - 2); + int sign = hgcd.sign; + if (res == 3) + sign = ~sign; + + /* s[0] and s[1] are correct */ + r[2].rsize + = mpn_hgcd_fix (k, r[2].rp, ralloc, + sign, hgcd.size, s, + r[0].rp, r[1].rp, + tp, talloc); + + r[3].rsize + = mpn_hgcd_fix (k, r[3].rp, ralloc, + ~sign, hgcd.size, s+1, + r[0].rp, r[1].rp, + tp, talloc); + + NHGCD_SWAP4_2 (r); + } + } + +#if 0 + trace ("nhgcd_gcd_schoenhage after loop\n"); + trace_nhgcd_row (r); + trace_nhgcd_row (r + 1); +#endif + + if (r[1].rsize == 0) + { + MPN_COPY (gp, r[0].rp, r[0].rsize); + return r[0].rsize; + } +#if 0 + else if (ABOVE_THRESHOLD (r[0].rsize, GCD_LEHMER_THRESHOLD)) + return gcd_lehmer (gp, + r[0].rp, r[0].rsize, + r[1].rp, r[1].rsize, + tp, talloc); +#endif + else + return gcd_binary (gp, + r[0].rp, r[0].rsize, + r[1].rp, r[1].rsize); +} + +/* Should we perform an initial division? */ +mp_size_t +mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) +{ + if (BELOW_THRESHOLD (usize, GCD_SCHOENHAGE_THRESHOLD)) + return gcd_binary_odd (gp, up, usize, vp, vsize); + + /* The algorithms below require U >= V, while mpn_gcd is long documented as + requiring only that the position of U's msb >= V's msb. */ + if (usize == vsize && mpn_cmp (up, vp, usize) < 0) + MP_PTR_SWAP (up, vp); + +#if 0 + if (BELOW_THRESHOLD (usize, GCD_SCHOENHAGE_THRESHOLD)) + { + mp_size_t scratch; + mp_ptr tp; + mp_size_t gsize; + TMP_DECL (marker); + + TMP_MARK (marker); + + scratch = GCD_LEHMER_ITCH (usize); + tp = TMP_ALLOC_LIMBS (scratch); + + gsize = gcd_lehmer (gp, up, usize, vp, vsize, tp, scratch); + TMP_FREE (marker); + return gsize; + } + else +#endif + { + mp_size_t scratch; + mp_ptr tp; + mp_size_t gsize; + + scratch = gcd_schoenhage_itch (usize); + tp = __GMP_ALLOCATE_FUNC_LIMBS (scratch); + + gsize = gcd_schoenhage (gp, up, usize, vp, vsize, tp, scratch); + __GMP_FREE_FUNC_LIMBS (tp, scratch); + return gsize; + } } diff -x *~ -N -u -r gmp-4.1.3/mpn/generic/gcdext.c gmp-4.1.3-gcdext/mpn/generic/gcdext.c --- gmp-4.1.3/mpn/generic/gcdext.c Tue May 14 18:59:48 2002 +++ gmp-4.1.3-gcdext/mpn/generic/gcdext.c Mon Sep 6 19:11:13 2004 @@ -1,6 +1,7 @@ /* mpn_gcdext -- Extended Greatest Common Divisor. -Copyright 1996, 1998, 2000, 2001, 2002 Free Software Foundation, Inc. +Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004 Free Software Foundation, +Inc. 
This file is part of the GNU MP Library. @@ -19,759 +20,1289 @@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#define WANT_TRACE 0 + +/* Default to binary gcdext_1, since it is best on most current machines. + We should teach tuneup to choose the right gcdext_1. */ +#define GCDEXT_1_USE_BINARY 1 + +#if WANT_TRACE +# include +# include +#endif + #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" -#ifndef GCDEXT_THRESHOLD -#define GCDEXT_THRESHOLD 17 +#ifndef NULL +# define NULL ((void *) 0) #endif -#ifndef EXTEND -#define EXTEND 1 +#if WANT_TRACE +static void +trace (const char *format, ...) +{ + va_list args; + va_start (args, format); + gmp_vfprintf (stderr, format, args); + va_end (args); +} #endif -#if STAT -int arr[GMP_LIMB_BITS + 1]; -#endif +/* Comparison of _normalized_ numbers. */ +#define MPN_EQUAL_P(ap, asize, bp, bsize) \ +((asize) == (bsize) && mpn_cmp ((ap), (bp), (asize)) == 0) -/* mpn_gcdext (GP, SP, SSIZE, UP, USIZE, VP, VSIZE) +#define MPN_LEQ_P(ap, asize, bp, bsize) \ +((asize) < (bsize) || ((asize) == (bsize) \ + && mpn_cmp ((ap), (bp), (asize)) <= 0)) - Compute the extended GCD of {UP,USIZE} and {VP,VSIZE} and store the - greatest common divisor at GP (unless it is 0), and the first cofactor at - SP. Write the size of the cofactor through the pointer SSIZE. Return the - size of the value at GP. Note that SP might be a negative number; this is - denoted by storing the negative of the size through SSIZE. - - {UP,USIZE} and {VP,VSIZE} are both clobbered. - - The space allocation for all four areas needs to be USIZE+1. - - Preconditions: 1) U >= V. - 2) V > 0. */ - -/* We use Lehmer's algorithm. The idea is to extract the most significant - bits of the operands, and compute the continued fraction for them. We then - apply the gathered cofactors to the full operands. - - Idea 1: After we have performed a full division, don't shift operands back, - but instead account for the extra factors-of-2 thus introduced. - Idea 2: Simple generalization to use divide-and-conquer would give us an - algorithm that runs faster than O(n^2). - Idea 3: The input numbers need less space as the computation progresses, - while the s0 and s1 variables need more space. To save memory, we - could make them share space, and have the latter variables grow - into the former. - Idea 4: We should not do double-limb arithmetic from the start. Instead, - do things in single-limb arithmetic until the quotients differ, - and then switch to double-limb arithmetic. */ +/* Returns g, u and v such that g = u A - v B. There are three + different cases for the result: + g = u A - v B, 0 < u < b, 0 < v < a + g = A u = 1, v = 0 + g = B u = B, v = A - 1 + + We always return with 0 < u <= b, 0 <= v < a. +*/ +#if GCDEXT_1_USE_BINARY -/* One-limb division optimized for small quotients. */ static mp_limb_t -div1 (mp_limb_t n0, mp_limb_t d0) +gcdext_1_odd (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) { - if ((mp_limb_signed_t) n0 < 0) + mp_limb_t u0; + mp_limb_t v0; + mp_limb_t v1; + mp_limb_t u1; + + mp_limb_t B = b; + mp_limb_t A = a; + + /* Through out this function maintain + + a = u0 A - v0 B + b = u1 A - v1 B + + where A and B are odd. 
*/ + + u0 = 1; v0 = 0; + u1 = b; v1 = a-1; + + if (A == 1) { - mp_limb_t q; - int cnt; - for (cnt = 1; (mp_limb_signed_t) d0 >= 0; cnt++) + *up = u0; *vp = v0; + return 1; + } + else if (B == 1) + { + *up = u1; *vp = v1; + return 1; + } + + while (a != b) + { + mp_limb_t mask; + + ASSERT (a % 2 == 1); + ASSERT (b % 2 == 1); + + ASSERT (0 < u0); ASSERT (u0 <= B); + ASSERT (0 < u1); ASSERT (u1 <= B); + + ASSERT (0 <= v0); ASSERT (v0 < A); + ASSERT (0 <= v1); ASSERT (v1 < A); + + if (a > b) { - d0 = d0 << 1; + MP_LIMB_T_SWAP (a, b); + MP_LIMB_T_SWAP (u0, u1); + MP_LIMB_T_SWAP (v0, v1); + } + + ASSERT (a < b); + + /* Makes b even */ + b -= a; + + mask = - (mp_limb_t) (u1 < u0); + u1 += B & mask; + v1 += A & mask; + u1 -= u0; + v1 -= v0; + + ASSERT (b % 2 == 0); + + do + { + /* As b = u1 A + v1 B is even, while A and B are odd, + either both or none of u1, v1 is even */ + + ASSERT (u1 % 2 == v1 % 2); + + mask = -(u1 & 1); + u1 = u1 / 2 + ((B / 2) & mask) - mask; + v1 = v1 / 2 + ((A / 2) & mask) - mask; + + b /= 2; } + while (b % 2 == 0); + } + + /* Now g = a = b */ + ASSERT (a == b); + ASSERT (u1 <= B); + ASSERT (v1 < A); + + ASSERT (A % a == 0); + ASSERT (B % a == 0); + ASSERT (u0 % (B/a) == u1 % (B/a)); + ASSERT (v0 % (A/a) == v1 % (A/a)); + + *up = u0; *vp = v0; + + return a; +} + +static mp_limb_t +gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) +{ + unsigned shift = 0; + mp_limb_t g; + mp_limb_t u; + mp_limb_t v; + + /* We use unsigned values in the range 0, ... B - 1. As the values + are uniquely determined only modulo B, we can add B at will, to + get numbers in range or flip the least significant bit. */ + /* Deal with powers of two */ + while ((a | b) % 2 == 0) + { + a /= 2; b /= 2; shift++; + } + + if (b % 2 == 0) + { + unsigned k = 0; - q = 0; - while (cnt) + do { + b /= 2; k++; + } while (b % 2 == 0); + + g = gcdext_1_odd (&u, &v, a, b); + + while (k--) { - q <<= 1; - if (n0 >= d0) + /* We have g = u a + v b, and need to construct + g = u'a + v'(2b). + + If v is even, we can just set u' = u, v' = v/2 + If v is odd, we can set v' = (v + a)/2, u' = u + b + */ + + if (v % 2 == 0) + v /= 2; + else { - n0 = n0 - d0; - q |= 1; + u = u + b; + v = v/2 + a/2 + 1; } - d0 = d0 >> 1; - cnt--; + b *= 2; } + } + else if (a % 2 == 0) + { + unsigned k = 0; + + do { + a /= 2; k++; + } while (a % 2 == 0); - return q; + g = gcdext_1_odd (&u, &v, a, b); + + while (k--) + { + /* We have g = u a + v b, and need to construct + g = u'(2a) + v'b. + + If u is even, we can just set u' = u/2, v' = v. + If u is odd, we can set u' = (u + b)/2 + */ + + if (u % 2 == 0) + u /= 2; + else + { + u = u/2 + b/2 + 1; + v = v + a; + } + a *= 2; + } } else + /* Ok, both are odd */ + g = gcdext_1_odd (&u, &v, a, b); + + *up = u; + *vp = v; + + return g << shift; +} + +#else /* ! GCDEXT_1_USE_BINARY */ +static mp_limb_t +gcdext_1_u (mp_limb_t *up, mp_limb_t a, mp_limb_t b) +{ + /* Maintain + + a = u0 A mod B + b = - u1 A mod B + */ + mp_limb_t u0 = 1; + mp_limb_t u1 = 0; + mp_limb_t B = b; + + ASSERT (a >= b); + ASSERT (b > 0); + + for (;;) { mp_limb_t q; - int cnt; - for (cnt = 0; n0 >= d0; cnt++) + + q = a / b; + a -= q * b; + + if (a == 0) { - d0 = d0 << 1; + *up = B - u1; + return b; } + u0 += q * u1; - q = 0; - while (cnt) + q = b / a; + b -= q * a; + + if (b == 0) { - d0 = d0 >> 1; - q <<= 1; - if (n0 >= d0) - { - n0 = n0 - d0; - q |= 1; - } - cnt--; + *up = u0; + return a; } - - return q; + u1 += q * u0; } } -/* Two-limb division optimized for small quotients. 
*/ static mp_limb_t -div2 (mp_limb_t n1, mp_limb_t n0, mp_limb_t d1, mp_limb_t d0) +gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) { - if ((mp_limb_signed_t) n1 < 0) + /* Maintain + + a = u0 A - v0 B + b = - u1 A + v1 B = (B - u1) A - (A - v1) B + */ + mp_limb_t u0 = 1; + mp_limb_t v0 = 0; + mp_limb_t u1 = 0; + mp_limb_t v1 = 1; + + mp_limb_t A = a; + mp_limb_t B = b; + + ASSERT (a >= b); + ASSERT (b > 0); + + for (;;) { mp_limb_t q; - int cnt; - for (cnt = 1; (mp_limb_signed_t) d1 >= 0; cnt++) + + q = a / b; + a -= q * b; + + if (a == 0) { - d1 = (d1 << 1) | (d0 >> (GMP_LIMB_BITS - 1)); - d0 = d0 << 1; + *up = B - u1; + *vp = A - v1; + return b; } + u0 += q * u1; + v0 += q * v1; - q = 0; - while (cnt) + q = b / a; + b -= q * a; + + if (b == 0) { - q <<= 1; - if (n1 > d1 || (n1 == d1 && n0 >= d0)) - { - sub_ddmmss (n1, n0, n1, n0, d1, d0); - q |= 1; - } - d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1); - d1 = d1 >> 1; - cnt--; + *up = u0; + *vp = v0; + return a; } + u1 += q * u0; + v1 += q * v0; + } +} +#endif /* ! GCDEXT_1_USE_BINARY */ + +/* FIXME: Duplicated in gcd.c */ +static mp_size_t +hgcd_tdiv (mp_ptr qp, + mp_ptr rp, mp_size_t *rsizep, + mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize) +{ + mp_size_t qsize; + mp_size_t rsize; + + mpn_tdiv_qr (qp, rp, 0, ap, asize, bp, bsize); + + rsize = bsize; + MPN_NORMALIZE (rp, rsize); + *rsizep = rsize; + + qsize = asize - bsize + 1; + qsize -= (qp[qsize - 1] == 0); + + if (qsize == 1 && qp[0] == 1) + return 0; + + return qsize; +} + +/* FIXME: Duplicated in hgcd.c */ +static mp_limb_t +mpn_addmul2_n_1 (mp_ptr rp, mp_size_t n, + mp_ptr ap, mp_limb_t u, + mp_ptr bp, mp_limb_t v) +{ + mp_limb_t h; + mp_limb_t cy; + + h = mpn_mul_1 (rp, ap, n, u); + cy = mpn_addmul_1 (rp, bp, n, v); + h += cy; +#if GMP_NAIL_BITS == 0 + rp[n] = h; + return (h < cy); +#else /* GMP_NAIL_BITS > 0 */ + rp[n] = h & GMP_NUMB_MASK; + return h >> GMP_NUMB_BITS; +#endif /* GMP_NAIL_BITS > 0 */ +} + + +/* Computes u2 = u0 + q u1 + + Returns new size. + + FIXME: Notation in the function not quite consistent + FIXME: Severe code duplication with hgcd_update_uv */ + +static mp_size_t +hgcd_update_u (struct hgcd_row *r, mp_size_t usize, + mp_srcptr qp, mp_size_t qsize, + /* Limbs allocated for the new u, for sanity + checking */ + mp_size_t alloc) +{ + mp_srcptr u0p = r[0].uvp[0]; + mp_srcptr u1p = r[1].uvp[0]; + mp_ptr u2p = r[2].uvp[0]; + + ASSERT (usize < alloc); + + /* u1 = 0 is an exceptional case. Except for this, u1 should be + normalized. */ + + ASSERT ((usize == 1 && u1p[0] == 0) || u1p[usize - 1] != 0); + + /* Compute u2 = u0 + q u1 */ + + if (usize == 1 && u1p[0] == 0) + { + /* u1 == 0 is a special case, then q might be large, but it + doesn't matter. Can happen only when u0 = v1 = 1, u1 = v0 = + 0, and hence usize == 1. 
*/ + MPN_COPY (u2p, u0p, usize); + } + else if (qsize == 0) + /* Represents a unit quotient */ + { + mp_limb_t cy = mpn_add_n (u2p, u0p, u1p, usize); + u2p[usize] = cy; + usize += (cy != 0); + } + else if (qsize == 1) + { + mp_limb_t cy; - return q; + cy = mpn_mul_1 (u2p, u1p, usize, qp[0]); + cy += mpn_add_n (u2p, u2p, u0p, usize); + + u2p[usize] = cy; + usize += (cy != 0); } else { - mp_limb_t q; - int cnt; - for (cnt = 0; n1 > d1 || (n1 == d1 && n0 >= d0); cnt++) + if (qsize <= usize) + mpn_mul (u2p, u1p, usize, qp, qsize); + else + mpn_mul (u2p, qp, qsize, u1p, usize); + + ASSERT_NOCARRY (mpn_add (u2p, + u2p, usize + qsize, + u0p, usize)); + + usize += qsize; + usize -= (u2p[usize - 1] == 0); + } + ASSERT (mpn_cmp (r[1].uvp[0], r[2].uvp[0], usize) <= 0); + ASSERT (r[2].uvp[0][usize - 1] != 0); + + return usize; +} + + +/* Computes Y = R * X. No overlap allowed. */ +static mp_size_t +hgcd2_mul_vector (struct hgcd_row *Y, + mp_size_t alloc, + const struct hgcd2_row *R, + const struct hgcd_row *X, mp_size_t n) +{ + unsigned i; + int grow = 0; + mp_limb_t h = 0; + + ASSERT (n < alloc); + + for (i = 0; i < 2; i++) + { + /* Set Y[i] = R[i, 0] X[0] + R[i,1] X[1] + = u X[0] + v X[0] */ + mp_limb_t cy; + + cy = mpn_addmul2_n_1 (Y[i].uvp[0], n, + X[0].uvp[0], R[i].u, + X[1].uvp[0], R[i].v); + + if (cy) { - d1 = (d1 << 1) | (d0 >> (GMP_LIMB_BITS - 1)); - d0 = d0 << 1; + ASSERT (n + 2 <= alloc); + Y[i].uvp[0][n+1] = cy; + grow = 1; } + else + h |= Y[i].uvp[0][n]; + } + if (grow) + return n + 2; + else + /* Don't add redundant zeroes */ + return n + (h != 0); +} + +/* Sets (a, b, c) <-- (b, c, a) */ +#define HGCD_SWAP3_LEFT(row) \ +do { \ + struct hgcd_row __hgcd_swap4_left_tmp = row[0]; \ + row[0] = row[1]; \ + row[1] = row[2]; \ + row[2] = __hgcd_swap4_left_tmp; \ +} while (0) + +/* Sets (a, b, c, d) <-- (c, d, a, b) */ +#define HGCD_SWAP4_2(row) \ +do { \ + struct hgcd_row __hgcd_swap4_2_tmp = row[0]; \ + row[0] = row[2]; \ + row[2] = __hgcd_swap4_2_tmp; \ + __hgcd_swap4_2_tmp = row[1]; \ + row[1] = row[3]; \ + row[3] = __hgcd_swap4_2_tmp; \ +} while (0) + +static mp_size_t +gcdext_lehmer_itch (mp_size_t asize, mp_size_t bsize) +{ + mp_size_t ralloc = asize + 1; + mp_size_t ualloc = bsize + 1; + + return 4 * ralloc + 4 * ualloc + asize; +} + +static mp_size_t +gcdext_lehmer (mp_ptr gp, mp_ptr up, mp_size_t *usize, + mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize, + mp_ptr tp, mp_size_t talloc) +{ + struct hgcd_row r[4]; + /* Size and sign of u fields. The largest u should be normalized to + this size, and except for the case u1 = 0, that is the latest + u. */ + int rsize; + int rsign; + + mp_ptr qp; + mp_size_t qsize; + mp_size_t ralloc = asize + 1; + mp_size_t ualloc = bsize + 1; + + struct hgcd2 hgcd; + int res; + + ASSERT (asize >= bsize); + ASSERT (asize > 1); + ASSERT (bsize > 0); + + ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); + + ASSERT (4 * ralloc + 4*ualloc + asize <= talloc); + + r[0].rp = tp; tp += ralloc; talloc -= ralloc; + r[1].rp = tp; tp += ralloc; talloc -= ralloc; + r[2].rp = tp; tp += ralloc; talloc -= ralloc; + r[3].rp = tp; tp += ralloc; talloc -= ralloc; + + /* Must zero out the u fields. We don't use the v fields. 
*/ + MPN_ZERO (tp, 4 * ualloc); + + r[0].uvp[0] = tp; tp += ualloc; talloc -= ualloc; + r[1].uvp[0] = tp; tp += ualloc; talloc -= ualloc; + r[2].uvp[0] = tp; tp += ualloc; talloc -= ualloc; + r[3].uvp[0] = tp; tp += ualloc; talloc -= ualloc; + + qp = tp; tp += asize; talloc -= asize; + + res = mpn_hgcd2_lehmer_step (&hgcd, + ap, asize, + bp, bsize, + NULL); + + if (res == 0 || (res == 2 && hgcd.row[0].v == 0)) + { + qsize = hgcd_tdiv (qp, r[1].rp, &r[1].rsize, + ap, asize, + bp, bsize); + MPN_COPY (r[0].rp, bp, bsize); + r[0].rsize = bsize; + + r[0].uvp[0][0] = 0; + r[1].uvp[0][0] = 1; + rsign = -1; + } + else + { + const struct hgcd2_row *s = hgcd.row + (res - 2); + rsign = hgcd.sign; + if (res == 3) + rsign = ~rsign; + + /* s[0] and s[1] correct. */ + r[0].rsize + = mpn_hgcd2_fix (r[0].rp, ralloc, + rsign, + s[0].u, ap, asize, + s[0].v, bp, bsize); + + r[1].rsize + = mpn_hgcd2_fix (r[1].rp, ralloc, + ~rsign, + s[1].u, ap, asize, + s[1].v, bp, bsize); + + r[0].uvp[0][0] = s[0].u; + r[1].uvp[0][0] = s[1].u; + } + rsize = 1; - q = 0; - while (cnt) + while (r[0].rsize >= 2 && r[1].rsize > 0) + { + res = mpn_hgcd2_lehmer_step (&hgcd, + r[0].rp, r[0].rsize, + r[1].rp, r[1].rsize, + NULL); + + if (res == 0 || (res == 2 && hgcd.row[0].v == 0)) { - d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1); - d1 = d1 >> 1; - q <<= 1; - if (n1 > d1 || (n1 == d1 && n0 >= d0)) - { - sub_ddmmss (n1, n0, n1, n0, d1, d0); - q |= 1; - } - cnt--; + qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize, + r[0].rp, r[0].rsize, + r[1].rp, r[1].rsize); + rsize = hgcd_update_u (r, rsize, qp, qsize, ualloc); + HGCD_SWAP3_LEFT (r); + rsign = ~rsign; } + else + { + const struct hgcd2_row *s = hgcd.row + (res - 2); + int sign = hgcd.sign; + if (res == 3) + sign = ~sign; + + /* s[0] and s[1] correct. */ + r[2].rsize + = mpn_hgcd2_fix (r[2].rp, ralloc, + sign, + s[0].u, r[0].rp, r[0].rsize, + s[0].v, r[1].rp, r[1].rsize); + + r[3].rsize + = mpn_hgcd2_fix (r[3].rp, ralloc, + ~sign, + s[1].u, r[0].rp, r[0].rsize, + s[1].v, r[1].rp, r[1].rsize); + + rsize = hgcd2_mul_vector (r + 2, ralloc, s, r, rsize); + rsign ^= sign; + HGCD_SWAP4_2 (r); + } + } - return q; + if (r[1].rsize == 0) + { + MPN_NORMALIZE (r[0].uvp[0], rsize); + MPN_COPY (gp, r[0].rp, r[0].rsize); + MPN_COPY (up, r[0].uvp[0], rsize); + + *usize = (rsign >= 0) ? rsize : -rsize; + return r[0].rsize; + } + else + { + mp_limb_t cy; + mp_limb_t u; + mp_limb_t v; + + gp[0] = gcdext_1 (&u, &v, r[0].rp[0], r[1].rp[0]); + cy = mpn_addmul2_n_1 (up, rsize, + r[0].uvp[0], u, + r[1].uvp[0], v); + rsize++; + if (cy) + up[rsize++] = cy; + else + MPN_NORMALIZE (up, rsize); + + *usize = (rsign >= 0) ? rsize : -rsize; + return 1; } } -mp_size_t -#if EXTEND -mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size, - mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize) -#else -mpn_gcd (mp_ptr gp, - mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize) -#endif -{ - mp_limb_t A, B, C, D; - int cnt; - mp_ptr tp, wp; -#if RECORD - mp_limb_t max = 0; -#endif -#if EXTEND - mp_ptr s1p; - mp_ptr orig_s0p = s0p; - mp_size_t ssize; - int sign = 1; -#endif - int use_double_flag; - TMP_DECL (mark); +/* Computes Y = R * X. No overlap allowed. - ASSERT (size >= vsize); - ASSERT (vsize >= 1); - ASSERT (up[size-1] != 0); - ASSERT (vp[vsize-1] != 0); - ASSERT (! MPN_OVERLAP_P (up, size+1, vp, vsize+1)); -#if EXTEND - ASSERT (! MPN_OVERLAP_P (s0p, size, up, size+1)); - ASSERT (! 
MPN_OVERLAP_P (s0p, size, vp, vsize+1)); -#endif - ASSERT (MPN_SAME_OR_SEPARATE_P (gp, up, size)); - ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, size, vp, vsize)); + Temporary space is needed for two numbers smaller than the + resulting matrix elements, i.e. bounded by 2*L <= N. - TMP_MARK (mark); + FIXME: Severe code duplication with hgcd.c: hgcd_mul. */ - tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); - wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); -#if EXTEND - s1p = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); - -#if ! WANT_GCDEXT_ONE_STEP - MPN_ZERO (s0p, size); - MPN_ZERO (s1p, size); -#endif +static mp_size_t +hgcd_mul_vector (struct hgcd_row *Y, mp_size_t alloc, + const struct hgcd_row *R, mp_size_t rsize, + const struct hgcd_row *X, mp_size_t xsize, + mp_ptr tp, mp_size_t talloc) +{ + unsigned i; - s0p[0] = 1; - s1p[0] = 0; - ssize = 1; -#endif + mp_size_t ysize; + mp_limb_t h; + int grow; + + MPN_NORMALIZE (R[1].uvp[1], rsize); + /* u1 = 0 is an exceptional case. Except for this, u1 should be + normalized. */ + ASSERT ((xsize == 1 && X[1].uvp[0][0] == 0) + || X[1].uvp[0][xsize - 1] != 0); - if (size > vsize) + if (xsize == 1 && X[1].uvp[0][0] == 0) { - mpn_tdiv_qr (tp, up, (mp_size_t) 0, up, size, vp, vsize); + /* Special case. Set Y[i, 0] = R[i, 0] */ + ASSERT (X[0].uvp[0][0] == 1); -#if EXTEND - /* This is really what it boils down to in this case... */ - s0p[0] = 0; - s1p[0] = 1; - sign = -sign; -#endif - size = vsize; - MP_PTR_SWAP (up, vp); + if (rsize > 1) + MPN_NORMALIZE (R[1].uvp[0], rsize); + MPN_COPY (Y[0].uvp[0], R[0].uvp[0], rsize); + MPN_COPY (Y[1].uvp[0], R[1].uvp[0], rsize); + + return rsize; } - use_double_flag = ABOVE_THRESHOLD (size, GCDEXT_THRESHOLD); + ysize = rsize + xsize; + ASSERT (ysize <= talloc); - for (;;) + h = 0; grow = 0; + + if (rsize >= xsize) { - mp_limb_t asign; - /* Figure out exact size of V. */ - vsize = size; - MPN_NORMALIZE (vp, vsize); - if (vsize <= 1) - break; - - if (use_double_flag) - { - mp_limb_t uh, vh, ul, vl; - /* Let UH,UL be the most significant limbs of U, and let VH,VL be - the corresponding bits from V. */ - uh = up[size - 1]; - vh = vp[size - 1]; - ul = up[size - 2]; - vl = vp[size - 2]; - count_leading_zeros (cnt, uh); -#if GMP_NAIL_BITS == 0 - if (cnt != 0) - { - uh = (uh << cnt) | (ul >> (GMP_LIMB_BITS - cnt)); - vh = (vh << cnt) | (vl >> (GMP_LIMB_BITS - cnt)); - vl <<= cnt; - ul <<= cnt; - if (size >= 3) - { - ul |= (up[size - 3] >> (GMP_LIMB_BITS - cnt)); - vl |= (vp[size - 3] >> (GMP_LIMB_BITS - cnt)); - } - } -#else - uh = uh << cnt; - vh = vh << cnt; - if (cnt < GMP_NUMB_BITS) - { /* GMP_NAIL_BITS <= cnt < GMP_NUMB_BITS */ - uh |= ul >> (GMP_NUMB_BITS - cnt); - vh |= vl >> (GMP_NUMB_BITS - cnt); - ul <<= cnt + GMP_NAIL_BITS; - vl <<= cnt + GMP_NAIL_BITS; - if (size >= 3) - { - if (cnt + GMP_NAIL_BITS > GMP_NUMB_BITS) - { - ul |= up[size - 3] << cnt + GMP_NAIL_BITS - GMP_NUMB_BITS; - vl |= vp[size - 3] << cnt + GMP_NAIL_BITS - GMP_NUMB_BITS; - if (size >= 4) - { - ul |= up[size - 4] >> 2 * GMP_NUMB_BITS - GMP_NAIL_BITS - cnt; - vl |= vp[size - 4] >> 2 * GMP_NUMB_BITS - GMP_NAIL_BITS - cnt; - } - } - else - { - ul |= up[size - 3] >> (GMP_LIMB_BITS - cnt - 2 * GMP_NAIL_BITS); - vl |= vp[size - 3] >> (GMP_LIMB_BITS - cnt - 2 * GMP_NAIL_BITS); - } - } - } - else - { /* GMP_NUMB_BITS <= cnt <= GMP_LIMB_BITS-1 */ - uh |= ul << cnt - GMP_NUMB_BITS; /* 0 <= c <= GMP_NAIL_BITS-1 */ - vh |= vl << cnt - GMP_NUMB_BITS; - if (size >= 3) - { - if (cnt - GMP_NUMB_BITS != 0) - { /* uh/vh need yet more bits! 
*/ - uh |= up[size - 3] >> 2 * GMP_NUMB_BITS - cnt; - vh |= vp[size - 3] >> 2 * GMP_NUMB_BITS - cnt; - ul = up[size - 3] << cnt + GMP_NAIL_BITS - GMP_NUMB_BITS; - vl = vp[size - 3] << cnt + GMP_NAIL_BITS - GMP_NUMB_BITS; - if (size >= 4) - { - ul |= up[size - 4] >> 2 * GMP_NUMB_BITS - GMP_NAIL_BITS - cnt; - vl |= vp[size - 4] >> 2 * GMP_NUMB_BITS - GMP_NAIL_BITS - cnt; - } - } - else - { - ul = up[size - 3] << GMP_LIMB_BITS - cnt; - vl = vp[size - 3] << GMP_LIMB_BITS - cnt; - if (size >= 4) - { - ul |= up[size - 4] >> GMP_NUMB_BITS - (GMP_LIMB_BITS - cnt); - vl |= vp[size - 4] >> GMP_NUMB_BITS - (GMP_LIMB_BITS - cnt); - } - } - } - else - { - ul = 0; - vl = 0; - } - } -#endif + for (i = 0; i < 2; i++) + { + /* Set Y[i, 0] = R[i, 0] X[0, 0] + R[i,1] X[1, 0] */ + mp_limb_t cy; - A = 1; - B = 0; - C = 0; - D = 1; + mpn_mul (Y[i].uvp[0], R[i].uvp[0], rsize, X[0].uvp[0], xsize); + mpn_mul (tp, R[i].uvp[1], rsize, X[1].uvp[0], xsize); - asign = 0; - for (;;) - { - mp_limb_t Tac, Tbd; - mp_limb_t q1, q2; - mp_limb_t nh, nl, dh, dl; - mp_limb_t t1, t0; - mp_limb_t Th, Tl; - - sub_ddmmss (dh, dl, vh, vl, 0, C); - if (dh == 0) - break; - add_ssaaaa (nh, nl, uh, ul, 0, A); - q1 = div2 (nh, nl, dh, dl); - - add_ssaaaa (dh, dl, vh, vl, 0, D); - if (dh == 0) - break; - sub_ddmmss (nh, nl, uh, ul, 0, B); - q2 = div2 (nh, nl, dh, dl); - - if (q1 != q2) - break; - - Tac = A + q1 * C; - if (GMP_NAIL_BITS != 0 && Tac > GMP_NUMB_MAX) - break; - Tbd = B + q1 * D; - if (GMP_NAIL_BITS != 0 && Tbd > GMP_NUMB_MAX) - break; - A = C; - C = Tac; - B = D; - D = Tbd; - umul_ppmm (t1, t0, q1, vl); - t1 += q1 * vh; - sub_ddmmss (Th, Tl, uh, ul, t1, t0); - uh = vh, ul = vl; - vh = Th, vl = Tl; - - asign = ~asign; - - add_ssaaaa (dh, dl, vh, vl, 0, C); -/* if (dh == 0) should never happen - break; */ - sub_ddmmss (nh, nl, uh, ul, 0, A); - q1 = div2 (nh, nl, dh, dl); - - sub_ddmmss (dh, dl, vh, vl, 0, D); - if (dh == 0) - break; - add_ssaaaa (nh, nl, uh, ul, 0, B); - q2 = div2 (nh, nl, dh, dl); - - if (q1 != q2) - break; - - Tac = A + q1 * C; - if (GMP_NAIL_BITS != 0 && Tac > GMP_NUMB_MAX) - break; - Tbd = B + q1 * D; - if (GMP_NAIL_BITS != 0 && Tbd > GMP_NUMB_MAX) - break; - A = C; - C = Tac; - B = D; - D = Tbd; - umul_ppmm (t1, t0, q1, vl); - t1 += q1 * vh; - sub_ddmmss (Th, Tl, uh, ul, t1, t0); - uh = vh, ul = vl; - vh = Th, vl = Tl; + cy = mpn_add_n (Y[i].uvp[0], Y[i].uvp[0], tp, ysize); - asign = ~asign; + if (cy) + { + ASSERT (ysize + 1 < alloc); + Y[i].uvp[0][ysize] = cy; + grow = 1; } -#if EXTEND - if (asign) - sign = -sign; -#endif + else + h |= Y[i].uvp[0][ysize - 1]; } - else /* Same, but using single-limb calculations. */ + } + else + { + for (i = 0; i < 2; i++) { - mp_limb_t uh, vh; - /* Make UH be the most significant limb of U, and make VH be - corresponding bits from V. 
*/ - uh = up[size - 1]; - vh = vp[size - 1]; - count_leading_zeros (cnt, uh); -#if GMP_NAIL_BITS == 0 - if (cnt != 0) - { - uh = (uh << cnt) | (up[size - 2] >> (GMP_LIMB_BITS - cnt)); - vh = (vh << cnt) | (vp[size - 2] >> (GMP_LIMB_BITS - cnt)); - } -#else - uh <<= cnt; - vh <<= cnt; - if (cnt < GMP_NUMB_BITS) + /* Set Y[i, 0] = R[i, 0] X[0, 0] + R[i,1] X[1, 0] */ + mp_limb_t cy; + + mpn_mul (Y[i].uvp[0], X[0].uvp[0], xsize, R[i].uvp[0], rsize); + mpn_mul (tp, X[1].uvp[0], xsize, R[i].uvp[1], rsize); + + cy = mpn_add_n (Y[i].uvp[0], Y[i].uvp[0], tp, ysize); + + if (cy) { - uh |= up[size - 2] >> (GMP_NUMB_BITS - cnt); - vh |= vp[size - 2] >> (GMP_NUMB_BITS - cnt); + ASSERT (ysize + 1 < alloc); + Y[i].uvp[0][ysize] = cy; + grow = 1; } else - { - uh |= up[size - 2] << cnt - GMP_NUMB_BITS; - vh |= vp[size - 2] << cnt - GMP_NUMB_BITS; - if (size >= 3) - { - uh |= up[size - 3] >> 2 * GMP_NUMB_BITS - cnt; - vh |= vp[size - 3] >> 2 * GMP_NUMB_BITS - cnt; - } - } -#endif + h |= Y[i].uvp[0][ysize - 1]; + } + } - A = 1; - B = 0; - C = 0; - D = 1; + if (grow) + ysize++; + else + ysize -= (h == 0); - asign = 0; - for (;;) - { - mp_limb_t q, T; - if (vh - C == 0 || vh + D == 0) - break; - - q = (uh + A) / (vh - C); - if (q != (uh - B) / (vh + D)) - break; - - T = A + q * C; - A = C; - C = T; - T = B + q * D; - B = D; - D = T; - T = uh - q * vh; - uh = vh; - vh = T; - - asign = ~asign; - - if (vh - D == 0) - break; - - q = (uh - A) / (vh + C); - if (q != (uh + B) / (vh - D)) - break; - - T = A + q * C; - A = C; - C = T; - T = B + q * D; - B = D; - D = T; - T = uh - q * vh; - uh = vh; - vh = T; + ASSERT ((ysize == 1 && Y[1].uvp[0][0] == 0) || Y[1].uvp[0][ysize - 1] != 0); - asign = ~asign; - } -#if EXTEND - if (asign) - sign = -sign; + return ysize; +} + +#define COMPUTE_V_ITCH(asize, bsize, usize) \ + ((usize) + (asize) + 1 + (bsize)) + +/* Computes |v| = |(c - u a)| / b, where u may be positive or negative, + and v is of the opposite sign. Requires that b, c, |u| <= a. */ +static mp_size_t +compute_v (mp_ptr vp, mp_size_t valloc, + mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize, + mp_srcptr cp, mp_size_t csize, + mp_srcptr up, mp_size_t usize, + mp_ptr tp, mp_size_t talloc) +{ + mp_size_t size; + mp_size_t vsize; + mp_ptr rp; + + ASSERT (asize); + ASSERT (bsize); + ASSERT (csize); + ASSERT (asize >= bsize); + +#if 0 + trace ("compute_v: a = %Nd\n" + " b = %Nd\n" + " c = %Nd\n" + " u = %Nd\n", + ap, asize, bp, bsize, cp, csize, up, usize); #endif + + ASSERT (usize); + + size = ABS (usize); + + ASSERT (size <= asize); + ASSERT (asize + size <= talloc); + + mpn_mul (tp, ap, asize, up, size); + size += asize; + + ASSERT (csize <= size); + + if (usize > 0) + { + /* |v| = -v = (u a - c) / b */ + + ASSERT_NOCARRY (mpn_sub (tp, tp, size, cp, csize)); + MPN_NORMALIZE (tp, size); + if (size == 0) + return 0; + } + else + { /* usize < 0 */ + /* |v| = v = (c - u a) / b = (c + |u| a) / b */ + mp_limb_t cy = mpn_add (tp, tp, size, cp, csize); + if (cy) + { + ASSERT (size < talloc); + tp[size++] = cy; } + } -#if RECORD - max = MAX (A, max); max = MAX (B, max); - max = MAX (C, max); max = MAX (D, max); + /* Now divide t / b. 
There must be no remainder */ + + ASSERT (size >= bsize); + ASSERT (size + bsize <= talloc); + rp = tp + size; + + vsize = size + 1 - bsize; + ASSERT (vsize <= valloc); + + mpn_tdiv_qr (vp, rp, 0, tp, size, bp, bsize); + MPN_NORMALIZE (vp, vsize); + + /* Remainder must be zero */ +#if WANT_ASSERT + { + mp_size_t i; + for (i = 0; i < bsize; i++) + { + ASSERT (rp[i] == 0); + } + } #endif + return vsize; +} + +static mp_size_t +gcdext_schoenhage_itch (mp_size_t asize, mp_size_t bsize) +{ + mp_size_t itch; + + mp_size_t ralloc = asize + 1; + mp_size_t ualloc = bsize + 1; + /* Input size for hgcd calls */ + mp_size_t halloc = (asize + 1) / 2; + + /* Storage for the rows and quotient */ + mp_size_t rstorage = 4 * ralloc + 4 * ualloc + asize; + + /* Storage for hgcd calls */ + mp_size_t tstorage = mpn_hgcd_init_itch (halloc) + + qstack_itch (halloc) + + mpn_hgcd_itch (halloc); + + /* Storage needed for final gcdext_lehmer */ + mp_size_t lstorage + = gcdext_lehmer_itch (GCDEXT_SCHOENHAGE_THRESHOLD, + GCDEXT_SCHOENHAGE_THRESHOLD); + + /* Storage needed after final nhgcd_gcdext_lehmer */ + mp_size_t fstorage + = COMPUTE_V_ITCH (GCDEXT_SCHOENHAGE_THRESHOLD, + GCDEXT_SCHOENHAGE_THRESHOLD, + ualloc); + + /* We need rstorage + MAX (tstorage, lstorage, fstorage) */ + + itch = tstorage; + if (lstorage > tstorage) + itch = lstorage; + if (fstorage > itch) + itch = fstorage; + + return rstorage + itch; +} + +#if WANT_ASSERT +static void +sanity_check_row (mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize, + int sign, mp_size_t usize, + const struct hgcd_row *r) +{ + /* Check that x = u * a + v * b, for some v, i.e. that + x - u*a is divisible by b. */ + mp_srcptr up = r->uvp[0]; + mp_srcptr xp = r->rp; + mp_size_t xsize = r->rsize; + mp_ptr tp; + mp_size_t tsize; + mp_ptr qp; + mp_size_t qsize; + mp_ptr rp; + mp_size_t i; + + TMP_DECL (marker); + + ASSERT (asize > 0 && ap[asize - 1] != 0); + ASSERT (bsize > 0 && bp[bsize - 1] != 0); + ASSERT (xsize == 0 || xp[xsize - 1] != 0); + ASSERT (MPN_LEQ_P (xp, xsize, ap, asize)); + ASSERT (MPN_LEQ_P (up, usize, bp, bsize)); + + MPN_NORMALIZE (up, usize); + if (usize == 0) + { + ASSERT (MPN_EQUAL_P (xp, xsize, bp, bsize)); + return; + } + + tp = TMP_ALLOC_LIMBS (usize + asize + 1); + qp = TMP_ALLOC_LIMBS (usize + asize + 2 - bsize); + rp = TMP_ALLOC_LIMBS (bsize); + + mpn_mul (tp, ap, asize, up, usize); + tsize = asize + usize; + tsize -= (tp[tsize - 1] == 0); + + if (sign >= 0) + { + ASSERT_NOCARRY (mpn_sub (tp, tp, tsize, xp, xsize)); + MPN_NORMALIZE (tp, tsize); + } + else + { + mp_limb_t cy = mpn_add (tp, tp, tsize, xp, xsize); + tp[tsize] = cy; + tsize += (cy != 0); + } + + if (tsize > 0) + { + mpn_tdiv_qr (qp, rp, 0, tp, tsize, bp, bsize); + for (i = 0; i < bsize; i++) + ASSERT (rp[i] == 0); + qsize = tsize - bsize; + qsize += (qp[qsize] != 0); + ASSERT (MPN_LEQ_P (qp, qsize, ap, asize)); + } + TMP_FREE (marker); +} +# define ASSERT_ROW(ap, asize, bp, bsize, sign, usize, r) \ +sanity_check_row (ap, asize, bp, bsize, sign, usize, r) + +#else /* !WANT_ASSERT */ +# define ASSERT_ROW(ap, asize, bp, bsize, sign, usize, r) +#endif /* !WANT_ASSERT */ + +static mp_size_t +gcdext_schoenhage (mp_ptr gp, mp_ptr up, mp_size_t *usizep, + mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize, + mp_ptr tp, mp_size_t talloc) +{ + mp_size_t scratch; + struct hgcd hgcd; + struct qstack quotients; + struct hgcd_row r[4]; + + /* Size and sign of u fields. 
The largest u should be normalized to + this size, and except for the case u1 = 0, that is the latest + u. */ + int rsize; + int rsign; + + mp_ptr qp; + mp_size_t qsize; + mp_size_t ralloc = asize + 1; + mp_size_t ualloc = bsize + 1; + + ASSERT (asize >= bsize); + ASSERT (bsize > 0); + + ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); - if (B == 0) + ASSERT (4 * ralloc + 4*ualloc + asize <= talloc); + + r[0].rp = tp; tp += ralloc; talloc -= ralloc; + r[1].rp = tp; tp += ralloc; talloc -= ralloc; + r[2].rp = tp; tp += ralloc; talloc -= ralloc; + r[3].rp = tp; tp += ralloc; talloc -= ralloc; + + /* Must zero out the u fields */ + MPN_ZERO (tp, 4 * ualloc); + + r[0].uvp[0] = tp; tp += ualloc; talloc -= ualloc; + r[1].uvp[0] = tp; tp += ualloc; talloc -= ualloc; + r[2].uvp[0] = tp; tp += ualloc; talloc -= ualloc; + r[3].uvp[0] = tp; tp += ualloc; talloc -= ualloc; + + qp = tp; tp += asize; talloc -= asize; + + ASSERT (asize >= bsize); + ASSERT (bsize > 0); + MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize; + MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize; + + r[0].uvp[0][0] = 1; + r[1].uvp[0][0] = 0; + + /* We don't use the v fields. */ + rsize = 1; + rsign = 0; + + scratch = mpn_hgcd_init_itch ((asize + 1) / 2); + ASSERT (scratch <= talloc); + mpn_hgcd_init (&hgcd, (asize + 1) / 2, tp); + tp += scratch; talloc -= scratch; + + { + mp_size_t nlimbs = qstack_itch ((asize + 1) / 2); + + ASSERT (nlimbs <= talloc); + qstack_init ("ients, (asize + 1) / 2, tp, nlimbs); + + tp += nlimbs; + talloc -= nlimbs; + } + + while (ABOVE_THRESHOLD (r[0].rsize, GCDEXT_SCHOENHAGE_THRESHOLD) + && r[1].rsize > 0) + { + mp_size_t k = r[0].rsize / 2; + int res; + + ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r); + ASSERT_ROW (ap, asize, bp, bsize, ~rsign, rsize, r + 1); + + if (r[1].rsize <= k) + goto euclid; + + qstack_reset ("ients, r[0].rsize - k); + + res = mpn_hgcd (&hgcd, + r[0].rp + k, r[0].rsize - k, + r[1].rp + k, r[1].rsize - k, + "ients, + tp, talloc); + + if (res == 0 || res == 1) { - /* This is quite rare. I.e., optimize something else! 
*/ + euclid: + qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize, + r[0].rp, r[0].rsize, + r[1].rp, r[1].rsize); + rsize = hgcd_update_u (r, rsize, qp, qsize, ualloc); + ASSERT (rsize < ualloc); - mpn_tdiv_qr (wp, up, (mp_size_t) 0, up, size, vp, vsize); + ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r + 2); -#if EXTEND - MPN_COPY (tp, s0p, ssize); - { - mp_size_t qsize; - mp_size_t i; - - qsize = size - vsize + 1; /* size of stored quotient from division */ - MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ - - for (i = 0; i < qsize; i++) - { - mp_limb_t cy; - cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]); - tp[ssize + i] = cy; - } - - ssize += qsize; - ssize -= tp[ssize - 1] == 0; - } - - sign = -sign; - MP_PTR_SWAP (s0p, s1p); - MP_PTR_SWAP (s1p, tp); -#endif - size = vsize; - MP_PTR_SWAP (up, vp); + HGCD_SWAP3_LEFT (r); + rsign = ~rsign; } else { -#if EXTEND - mp_size_t tsize, wsize; -#endif - /* T = U*A + V*B - W = U*C + V*D - U = T - V = W */ - -#if STAT - { mp_limb_t x; x = A | B | C | D; count_leading_zeros (cnt, x); - arr[GMP_LIMB_BITS - cnt]++; } -#endif - if (A == 0) - { - /* B == 1 and C == 1 (D is arbitrary) */ - mp_limb_t cy; - MPN_COPY (tp, vp, size); - MPN_COPY (wp, up, size); - mpn_submul_1 (wp, vp, size, D); - MP_PTR_SWAP (tp, up); - MP_PTR_SWAP (wp, vp); -#if EXTEND - MPN_COPY (tp, s1p, ssize); - tsize = ssize; - tp[ssize] = 0; /* must zero since wp might spill below */ - MPN_COPY (wp, s0p, ssize); - cy = mpn_addmul_1 (wp, s1p, ssize, D); - wp[ssize] = cy; - wsize = ssize + (cy != 0); - MP_PTR_SWAP (tp, s0p); - MP_PTR_SWAP (wp, s1p); - ssize = MAX (wsize, tsize); -#endif - } - else - { - mp_limb_t cy, cy1, cy2; + const struct hgcd_row *s = hgcd.row + (res - 2); + int sign = hgcd.sign; + if (res == 3) + sign = ~sign; + + /* s[0] and s[1] are correct */ + r[2].rsize + = mpn_hgcd_fix (k, r[2].rp, ralloc, + sign, hgcd.size, s, + r[0].rp, r[1].rp, + tp, talloc); + + r[3].rsize + = mpn_hgcd_fix (k, r[3].rp, ralloc, + ~sign, hgcd.size, s+1, + r[0].rp, r[1].rp, + tp, talloc); + + rsize = hgcd_mul_vector (r + 2, ualloc, s, hgcd.size, + r, rsize, tp, talloc); + ASSERT (rsize < ualloc); + + rsign ^= sign; + ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r + 2); + ASSERT_ROW (ap, asize, bp, bsize, ~rsign, rsize, r + 3); - if (asign) - { - mpn_mul_1 (tp, vp, size, B); - mpn_submul_1 (tp, up, size, A); - mpn_mul_1 (wp, up, size, C); - mpn_submul_1 (wp, vp, size, D); - } - else - { - mpn_mul_1 (tp, up, size, A); - mpn_submul_1 (tp, vp, size, B); - mpn_mul_1 (wp, vp, size, D); - mpn_submul_1 (wp, up, size, C); - } - MP_PTR_SWAP (tp, up); - MP_PTR_SWAP (wp, vp); -#if EXTEND - /* Compute new s0 */ - cy1 = mpn_mul_1 (tp, s0p, ssize, A); - cy2 = mpn_addmul_1 (tp, s1p, ssize, B); - cy = cy1 + cy2; - tp[ssize] = cy & GMP_NUMB_MASK; - tsize = ssize + (cy != 0); -#if GMP_NAIL_BITS == 0 - if (cy < cy1) -#else - if (cy > GMP_NUMB_MAX) -#endif - { - tp[tsize] = 1; - wp[tsize] = 0; - tsize++; - /* This happens just for nails, since we get more work done - per numb there. 
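The cofactor updates here store only magnitudes and keep the sign in rsign, flipping it after every step. A toy of that bookkeeping with word-sized numbers, just to show why u2 = u0 + q*u1 on magnitudes is the right update (illustration only, not the mpn code):

    #include <assert.h>

    int main (void)
    {
      long a = 1071, b = 462;
      long r0 = a, r1 = b;
      long u0 = 1, u1 = 0;            /* magnitudes of the a-cofactors */
      int sign = 1;                   /* sign of u0; u1 carries the other sign */

      while (r1 != 0)
        {
          long q = r0 / r1;
          long r2 = r0 - q * r1;
          long u2 = u0 + q * u1;      /* signs alternate, so magnitudes add */

          r0 = r1; r1 = r2;
          u0 = u1; u1 = u2;
          sign = -sign;               /* plays the role of rsign = ~rsign */
        }

      assert (r0 == 21);                        /* gcd (1071, 462) */
      assert ((sign > 0 ? u0 : -u0) == -3);     /* -3*1071 + 7*462 = 21 */
      return 0;
    }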
*/ - } - - /* Compute new s1 */ - cy1 = mpn_mul_1 (wp, s1p, ssize, D); - cy2 = mpn_addmul_1 (wp, s0p, ssize, C); - cy = cy1 + cy2; - wp[ssize] = cy & GMP_NUMB_MASK; - wsize = ssize + (cy != 0); -#if GMP_NAIL_BITS == 0 - if (cy < cy1) -#else - if (cy > GMP_NUMB_MAX) -#endif - { - wp[wsize] = 1; - if (wsize >= tsize) - tp[wsize] = 0; - wsize++; - } - - MP_PTR_SWAP (tp, s0p); - MP_PTR_SWAP (wp, s1p); - ssize = MAX (wsize, tsize); -#endif - } - size -= up[size - 1] == 0; -#if GMP_NAIL_BITS != 0 - size -= up[size - 1] == 0; -#endif + HGCD_SWAP4_2 (r); } - -#if WANT_GCDEXT_ONE_STEP - TMP_FREE (mark); - return 0; -#endif } - -#if RECORD - printf ("max: %lx\n", max); -#endif - -#if STAT - {int i; for (i = 0; i <= GMP_LIMB_BITS; i++) printf ("%d:%d\n", i, arr[i]);} -#endif - - if (vsize == 0) + if (r[1].rsize == 0) { - if (gp != up && gp != 0) - MPN_COPY (gp, up, size); -#if EXTEND - MPN_NORMALIZE (s0p, ssize); - if (orig_s0p != s0p) - MPN_COPY (orig_s0p, s0p, ssize); - *s0size = sign >= 0 ? ssize : -ssize; -#endif - TMP_FREE (mark); - return size; + MPN_COPY (gp, r[0].rp, r[0].rsize); + MPN_NORMALIZE (r[0].uvp[0], rsize); + MPN_COPY (up, r[0].uvp[0], rsize); + + *usizep = (rsign >= 0) ? rsize : - rsize; + return r[0].rsize; } else { - mp_limb_t vl, ul, t; -#if EXTEND - mp_size_t qsize, i; -#endif - vl = vp[0]; -#if EXTEND - t = mpn_divmod_1 (wp, up, size, vl); - - MPN_COPY (tp, s0p, ssize); - - qsize = size - (wp[size - 1] == 0); /* size of quotient from division */ - if (ssize < qsize) - { - MPN_ZERO (tp + ssize, qsize - ssize); - MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ - for (i = 0; i < ssize; i++) - { - mp_limb_t cy; - cy = mpn_addmul_1 (tp + i, wp, qsize, s1p[i]); - tp[qsize + i] = cy; - } + /* We have r0 = u0 a + v0 b, + r1 = u1 a + v1 b + + Compute g = u r0 + v r1 = (u u0 + v u1) a + (...) b + In the expression (u u0 + v u1), we have + + u <= r1, + u0 <= b/r0 (except if r0 = a, which should never be the case here) + v <= r0 + u1 <= b/r0 + */ + + mp_size_t gsize; + mp_size_t usize; + mp_size_t vsize; + + /* u1 should be non-zero, and normalized */ + ASSERT (rsize); + ASSERT (r[1].uvp[0][rsize - 1] != 0); +#if WANT_TRACE + trace ("gcdext: \n" + "r0 = %Nd\n" + "r1 = %Nd\n" + "u0 = %Nd\n" + "u1 = %Nd\n", + r[0].rp, r[0].rsize, r[1].rp, r[1].rsize, + r[0].uvp[0], rsize, r[1].uvp[0], rsize); +#endif + /* We don't need the space for hgcd and the quotient stack any more */ + tp -= scratch; talloc += scratch; + + /* Stores u in r[2] and v in r[3] */ + gsize = gcdext_lehmer (gp, r[2].uvp[0], &usize, + r[0].rp, r[0].rsize, + r[1].rp, r[1].rsize, + tp, talloc); + + if (usize == 0) + { + /* u == 0 ==> v = g / b == 1 ==> g = u1 a + (...) b */ + + MPN_NORMALIZE (r[1].uvp[0], rsize); + MPN_COPY (up, r[1].uvp[0], rsize); + *usizep = (rsign >= 0) ? - rsize : rsize; + + return gsize; + } + + /* Compute v = (g - s r0) / r1, storing it in r[3] */ + vsize = compute_v (r[3].uvp[0], ualloc, + r[0].rp, r[0].rsize, r[1].rp, r[1].rsize, + gp, gsize, + r[2].uvp[0], usize, + tp, talloc); + + if (usize < 0) + { + usize = - usize; + rsign = ~rsign; + } + + /* It's possible that u0 = 0, u1 = 1 */ + if (rsize == 1 && r[0].uvp[0][0] == 0) + { + /* u0 == 0 ==> u u0 + v u1 = v */ + MPN_COPY (up, r[3].uvp[0], vsize); + *usizep = (rsign >= 0) ? vsize : - vsize; + + return gsize; } + + /* Ok, now u0, u1, u are non-zero. 
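The combination carried out below is a plain change of basis: if r0 = u0*a + v0*b and r1 = u1*a + v1*b, and the final Lehmer call yields g = u*r0 + v*r1, then u*u0 + v*u1 is the cofactor of a. A small numeric check with hand-picked values:

    #include <assert.h>

    int main (void)
    {
      long a = 1071, b = 462;
      long u0 = 1, v0 = -2;           /* r0 = a - 2b = 147 */
      long u1 = -3, v1 = 7;           /* r1 = -3a + 7b = 21 */
      long r0 = u0 * a + v0 * b;
      long r1 = u1 * a + v1 * b;
      long u = 1, v = -6;             /* g = r0 - 6*r1 = 21 */
      long g = u * r0 + v * r1;

      assert (g == 21);
      assert ((u * u0 + v * u1) * a + (u * v0 + v * v1) * b == g);
      return 0;
    }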
We may still have v == 0 */ + ASSERT (usize + rsize <= ualloc); + ASSERT (vsize + rsize <= ualloc); + + /* Compute u u0 */ + if (usize <= rsize) + /* Should be the common case */ + mpn_mul (up, + r[0].uvp[0], rsize, + r[2].uvp[0], usize); else + mpn_mul (up, + r[2].uvp[0], usize, + r[0].uvp[0], rsize); + + usize += rsize; + + /* There may be more than one zero limb, if #u0 < #u1 */ + MPN_NORMALIZE (up, usize); + ASSERT (usize < ualloc); + + if (vsize) { - MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ - for (i = 0; i < qsize; i++) + mp_limb_t cy; + + /* Overwrites old r[2].uvp[0] value */ + if (vsize <= rsize) + /* Should be the common case */ + cy = mpn_mul (r[2].uvp[0], + r[1].uvp[0], rsize, + r[3].uvp[0], vsize); + else + cy = mpn_mul (r[2].uvp[0], + r[3].uvp[0], vsize, + r[1].uvp[0], rsize); + + vsize += rsize - (cy == 0); + ASSERT (vsize < ualloc); + + if (vsize <= usize) + cy = mpn_add (up, up, usize, r[2].uvp[0], vsize); + else { - mp_limb_t cy; - cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]); - tp[ssize + i] = cy; + cy = mpn_add (up, r[2].uvp[0], vsize, up, usize); + usize = vsize; } + up[usize] = cy; + usize += (cy != 0); + + ASSERT (usize < ualloc); } - ssize += qsize; - ssize -= tp[ssize - 1] == 0; + *usizep = (rsign >= 0) ? usize : -usize; - sign = -sign; - MP_PTR_SWAP (s0p, s1p); - MP_PTR_SWAP (s1p, tp); -#else - t = mpn_mod_1 (up, size, vl); -#endif - ul = vl; - vl = t; - while (vl != 0) - { - mp_limb_t t; -#if EXTEND - mp_limb_t q; - q = ul / vl; - t = ul - q * vl; - - MPN_COPY (tp, s0p, ssize); - - MPN_ZERO (s1p + ssize, 1); /* zero s1 too */ - - { - mp_limb_t cy; - cy = mpn_addmul_1 (tp, s1p, ssize, q); - tp[ssize] = cy; - } - - ssize += 1; - ssize -= tp[ssize - 1] == 0; - - sign = -sign; - MP_PTR_SWAP (s0p, s1p); - MP_PTR_SWAP (s1p, tp); + return gsize; + } +} + +mp_size_t +mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep, + mp_ptr ap, mp_size_t asize, mp_ptr bp, mp_size_t bsize) +{ + ASSERT (asize >= bsize); + ASSERT (bsize > 0); + + if (asize == 1) + { +#if GCDEXT_1_USE_BINARY + mp_limb_t v; + *gp = gcdext_1 (up, &v, ap[0], bp[0]); #else - t = ul % vl; -#endif - ul = vl; - vl = t; - } - if (gp != 0) - gp[0] = ul; -#if EXTEND - MPN_NORMALIZE (s0p, ssize); - if (orig_s0p != s0p) - MPN_COPY (orig_s0p, s0p, ssize); - *s0size = sign >= 0 ? ssize : -ssize; + *gp = gcdext_1_u (up, ap[0], bp[0]); #endif - TMP_FREE (mark); + *usizep = (up[0] != 0); + ASSERT(gp[0] != 0); return 1; + } + else if (BELOW_THRESHOLD (asize, GCDEXT_SCHOENHAGE_THRESHOLD)) + { + mp_size_t gsize; + mp_ptr tp; + mp_size_t talloc = gcdext_lehmer_itch (asize, bsize); + TMP_DECL (marker); + + tp = TMP_ALLOC_LIMBS (talloc); + gsize = gcdext_lehmer (gp, up, usizep, ap, asize, bp, bsize, + tp, talloc); + TMP_FREE (marker); + return gsize; + } + else + { + mp_size_t gsize; + mp_ptr tp; + mp_size_t talloc = gcdext_schoenhage_itch (asize, bsize); + TMP_DECL (marker); + + tp = TMP_ALLOC_LIMBS (talloc); + gsize = gcdext_schoenhage (gp, up, usizep, ap, asize, bp, bsize, + tp, talloc); + TMP_FREE (marker); + return gsize; } } diff -x *~ -N -u -r gmp-4.1.3/mpn/generic/hgcd.c gmp-4.1.3-gcdext/mpn/generic/hgcd.c --- gmp-4.1.3/mpn/generic/hgcd.c Thu Jan 1 01:00:00 1970 +++ gmp-4.1.3-gcdext/mpn/generic/hgcd.c Sat Jan 31 15:05:10 2004 @@ -0,0 +1,2129 @@ +/* hgcd.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 
+ +Copyright 2003, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define WANT_TRACE 0 + +#if WANT_TRACE +# include +# include +#endif + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#if WANT_TRACE +static void +trace (const char *format, ...) +{ + va_list args; + va_start (args, format); + gmp_vfprintf (stderr, format, args); + va_end (args); +} +#endif + +/* Comparison of _normalized_ numbers. */ + +#define MPN_EQUAL_P(ap, asize, bp, bsize) \ +((asize) == (bsize) && mpn_cmp ((ap), (bp), (asize)) == 0) + +#define MPN_LEQ_P(ap, asize, bp, bsize) \ +((asize) < (bsize) || ((asize) == (bsize) \ + && mpn_cmp ((ap), (bp), (asize)) <= 0)) + +#define MPN_LESS_P(ap, asize, bp, bsize) \ +((asize) < (bsize) || ((asize) == (bsize) \ + && mpn_cmp ((ap), (bp), (asize)) < 0)) + +/* Extract one limb, shifting count bits left + ________ ________ + |___xh___||___xl___| + |____r____| + >count < + + The count includes any nail bits, so it should work fine if + count is computed using count_leading_zeros. +*/ + +#define MPN_EXTRACT_LIMB(count, xh, xl) \ + ((((xh) << ((count) - GMP_NAIL_BITS)) & GMP_NUMB_MASK) | \ + ((xl) >> (GMP_LIMB_BITS - (count)))) + + +/* Return -1 if a < x + y + z, + 0 if a = x + y + z, + 1 if a > x + y + z. */ +static int +mpn_cmp_sum3 (mp_srcptr ap, mp_size_t an, + mp_srcptr xp, mp_size_t xn, + mp_srcptr yp, mp_size_t yn, + mp_srcptr zp, mp_size_t zn) +{ + mp_limb_t cy; + + /* Check that all limbs beyond an are zero. This should be slightly + cheaper than fully normalizing all the input numbers. */ + + while (xn > an) + if (xp[--xn] > 0) return -1; + while (yn > an) + if (yp[--yn] > 0) return -1; + while (zn > an) + if (zp[--zn] > 0) return -1; + + /* Start by sorting so that xn >= yn >= zn. Six permutations, so we + can't get away with less than three comparisons, at least not for + the worst case. */ + + if (xn < yn) + MPN_SRCPTR_SWAP (xp, xn, yp, yn); + if (yn < zn) + MPN_SRCPTR_SWAP (yp, yn, zp, zn); + if (xn < yn) + MPN_SRCPTR_SWAP (xp, xn, yp, yn); + + ASSERT (an >= xn && xn >= yn && yn >= zn); + + /* Assume that a = x + y + z, and write the addition limb by limb. + + (c[1], a[0]) = x[0] + y[0] + z[0] + c[0] + (c[2], a[1]) = x[1] + y[1] + z[1] + c[1] + (c[k+1], a[k]) = x[k] + y[k] + z[k] + c[2] + ... + (c[n], a[n-1]) = x[n-1] + y[n-1] + z[n-1] + c[n-1] + + where the start and stop conditions are that c[0] = c[n] = 0. + Then we can start at the high end, iterating + + c[k] = (c[k+1], a[k]) - x[k] - y[k] - z[k] + + If equality holds, then 0 <= c[k] <= 2 for all k (since for + example 0xf + 0xf + 0xf + 2 = 0x2f). If we find c[k] < 0, then we + know that a < x + y + z, and if we find c[k] > 2, then we know a + > x + y + z. 
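A toy version of this comparison, with 32-bit limbs, all four numbers of the same length and a signed 64-bit carry, shows the 0 <= c[k] <= 2 bound in action (illustrative code, not the mpn routine):

    #include <assert.h>
    #include <stdint.h>

    /* Compare a against x + y + z, high limb first, little-endian limbs. */
    static int
    cmp_sum3 (const uint32_t *a, const uint32_t *x,
              const uint32_t *y, const uint32_t *z, int n)
    {
      int64_t c = 0;                  /* c[k+1] in the notation above */
      int i;

      for (i = n - 1; i >= 0; i--)
        {
          if (c > 2)
            return 1;                 /* a > x + y + z, whatever follows */
          c = (c << 32) + a[i];
          c -= (int64_t) x[i] + y[i] + z[i];
          if (c < 0)
            return -1;                /* a < x + y + z */
        }
      return c > 0;
    }

    int main (void)
    {
      uint32_t x[2] = { 0xffffffffu, 1 };
      uint32_t y[2] = { 2, 3 };
      uint32_t z[2] = { 5, 0 };
      uint32_t a[2] = { 6, 5 };       /* a = x + y + z */

      assert (cmp_sum3 (a, x, y, z, 2) == 0);
      a[0] = 7;
      assert (cmp_sum3 (a, x, y, z, 2) == 1);
      a[0] = 5;
      assert (cmp_sum3 (a, x, y, z, 2) == -1);
      return 0;
    }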
*/ + + cy = 0; + + while (an > xn) + { + /* c[k] = (c[k+1], a[k]) */ + if (cy > 0) + return 1; + + cy = ap[--an]; + } + +#if GMP_NAIL_BITS >= 2 + while (an > yn) + { + if (cy > 1) + return 1; + + cy = (cy << GMP_NUMB_BITS) + ap[--an]; + if (cy < xp[an]) + return -1; + cy -= xp[an]; + } + while (an > zn) + { + mp_limb_t s; + + if (cy > 2) + return 1; + + cy = (cy << GMP_NUMB_BITS ) + ap[--an]; + s = xp[an] + yp[an]; + if (cy < s) + return -1; + cy -= s; + } + while (an > 0) + { + mp_limb_t s; + + if (cy > 2) + return 1; + + cy = (cy << GMP_NUMB_BITS ) + ap[--an]; + s = xp[an] + yp[an] + zp[an]; + if (cy < s) + return -1; + cy -= s; + } +#else /* GMP_NAIL_BITS < 2 */ +#if GMP_NAIL_BITS == 1 +loselose +#endif + while (an > yn) + { + /* c[k] = (c[k+1], a[k]) - x[k] */ + if (cy > 1) + return 1; + + --an; + + if (cy == 1) + { + if (ap[an] >= xp[an]) + return 1; + cy = (ap[an] - xp[an]) & GMP_NUMB_MASK; + } + else + { + /* cy == 0 */ + if (ap[an] < xp[an]) + return -1; + else + cy = ap[an] - xp[an]; + } + } + + while (an > zn) + { + mp_limb_t sh, sl; + + /* c[k] = (c[k+1], a[k]) - x[k] - y[k] */ + if (cy > 2) + return 1; + + --an; + + sl = xp[an] + yp[an]; + sh = (sl < xp[an]); + + if (cy < sh || (cy == sh && ap[an] < sl)) + return -1; + + sl = ap[an] - sl; /* Monkey business */ + sh = cy - sh - (sl > ap[an]); + if (sh > 0) + return 1; + cy = sl; + } + while (an > 0) + { + mp_limb_t sh, sl; + if (cy > 2) + return 1; + + --an; + + sl = xp[an] + yp[an]; + sh = (sl < xp[an]); + + sl += zp[an]; + sh += sl < zp[an]; + + if (cy < sh || (cy == sh && ap[an] < sl)) + return -1; + sl = ap[an] - sl; /* Monkey business */ + sh = cy - sh - (sl > ap[an]); + if (sh > 0) + return 1; + cy = sl; + } +#endif /* GMP_NAIL_BITS < 2 */ + return cy > 0; +} + +/* Only the first row has v = 0, a = 1 * a + 0 * b */ +static inline int +hgcd_start_row_p (const struct hgcd_row *r, mp_size_t n) +{ + mp_size_t i; + mp_srcptr vp = r->uvp[1]; + + for (i = 0; i < n; i++) + if (vp[i] != 0) + return 0; + + return 1; +} + +/* Called when r[0, 1, 2] >= W^M, r[3] < W^M. Returns the number of + remainders that satisfy Jebelean's criterion, i.e. find the largest k + such that + + r[k+1] >= max (-u[k+1], - v[k+1]) + + r[k] - r[k-1] >= max (u[k+1] - u[k], v[k+1] - v[k]) + + Return 0 on failure, i.e. if B or A mod B < W^M. Return 1 in case + r0 and r1 are correct, but we still make no progress because r0 = + A, r1 = B. + + Otherwise return 2, 3 or 4, the number of r:s that are correct. 
+ */ +static int +hgcd_jebelean (const struct hgcd *hgcd, mp_size_t M) +{ + mp_size_t L; + unsigned bit; + + ASSERT (hgcd->row[0].rsize > M); + ASSERT (hgcd->row[1].rsize > M); + ASSERT (hgcd->row[2].rsize > M); + ASSERT (hgcd->row[3].rsize <= M); + + ASSERT (MPN_LESS_P (hgcd->row[1].rp, hgcd->row[1].rsize, + hgcd->row[0].rp, hgcd->row[0].rsize)); + ASSERT (MPN_LESS_P (hgcd->row[2].rp, hgcd->row[2].rsize, + hgcd->row[1].rp, hgcd->row[1].rsize)); + ASSERT (MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize, + hgcd->row[2].rp, hgcd->row[2].rsize)); + + ASSERT (mpn_cmp (hgcd->row[0].uvp[1], hgcd->row[1].uvp[1], hgcd->size) <= 0); + ASSERT (mpn_cmp (hgcd->row[1].uvp[1], hgcd->row[2].uvp[1], hgcd->size) <= 0); + ASSERT (mpn_cmp (hgcd->row[2].uvp[1], hgcd->row[3].uvp[1], hgcd->size) <= 0); + + /* The bound is really floor (N/2), which is <= M = ceil (N/2) */ + L = hgcd->size; + ASSERT (L <= M); + + ASSERT (L > 0); + ASSERT (hgcd->row[3].uvp[1][L - 1] != 0); + + bit = hgcd->sign < 0; + + /* Check r1 - r2 >= max (u2 - u1, v2 - v1) = {|u1| + |u2|, |v1| + |v2|}[bit] */ + + if (mpn_cmp_sum3 (hgcd->row[1].rp, hgcd->row[1].rsize, + hgcd->row[2].rp, hgcd->row[2].rsize, + hgcd->row[1].uvp[bit], L, + hgcd->row[2].uvp[bit], L) < 0) + return 2 - (hgcd_start_row_p (hgcd->row, hgcd->size)); + + /* Ok, r2 is correct */ + + /* Check r3 >= max (-u3, -v3) = (|u3|, |v3|)[bit] */ + if (hgcd->row[3].rsize > L) + /* Condition satisfied */ + ; + else + { + mp_size_t size; + for (size = L; size > hgcd->row[3].rsize; size--) + { + if (hgcd->row[3].uvp[bit][size-1] != 0) + return 3; + } + if (mpn_cmp (hgcd->row[3].rp, hgcd->row[3].uvp[bit], size) < 0) + return 3; + } + + /* Check r3 - r2 >= max(u3-u2, v3-v2) = {|u2| + |u3|, |v2| +|v3|}[1-bit] */ + + if (mpn_cmp_sum3 (hgcd->row[2].rp, hgcd->row[2].rsize, + hgcd->row[3].rp, hgcd->row[3].rsize, + hgcd->row[2].uvp[bit ^ 1], L, + hgcd->row[3].uvp[bit ^ 1], L) < 0) + return 3; + + /* Ok, r3 is correct */ + return 4; +} + + +/* Compute au + bv. u and v are single limbs, a and b are n limbs each. + Stores n+1 limbs in rp, and returns the (n+2)'nd limb. */ +/* FIXME: With nails, we can instead return limb n+1, possibly including + one non-zero nail bit. 
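The shape of the au + bv primitive is easiest to see with narrower limbs, where one 64-bit accumulator can absorb both products and the running carry; this is only a model of the interface described above, not the mpn code:

    #include <assert.h>
    #include <stdint.h>

    /* r{0..n} = a*u + b*v with 16-bit "limbs" and single-limb u, v;
       returns the (n+2)'nd limb, mirroring the interface above. */
    static uint16_t
    addmul2_n_1 (uint16_t *rp, int n,
                 const uint16_t *ap, uint16_t u,
                 const uint16_t *bp, uint16_t v)
    {
      uint64_t acc = 0;
      int i;

      for (i = 0; i < n; i++)
        {
          acc += (uint64_t) ap[i] * u + (uint64_t) bp[i] * v;
          rp[i] = (uint16_t) acc;     /* low 16 bits */
          acc >>= 16;                 /* carry into the next limb */
        }
      rp[n] = (uint16_t) acc;
      return (uint16_t) (acc >> 16);
    }

    int main (void)
    {
      uint16_t a[2] = { 0xffff, 0xffff };   /* a = 2^32 - 1 */
      uint16_t b[2] = { 0x0001, 0x8000 };   /* b = 2^31 + 1 */
      uint16_t r[3];
      uint16_t high = addmul2_n_1 (r, 2, a, 0xffff, b, 3);
      uint64_t want = (uint64_t) 0xffffffffu * 0xffff
                      + (uint64_t) 3 * 0x80000001u;
      uint64_t got = ((uint64_t) high << 48) | ((uint64_t) r[2] << 32)
                     | ((uint64_t) r[1] << 16) | r[0];

      assert (got == want);
      return 0;
    }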
*/ +static mp_limb_t +mpn_addmul2_n_1 (mp_ptr rp, mp_size_t n, + mp_srcptr ap, mp_limb_t u, + mp_srcptr bp, mp_limb_t v) +{ + mp_limb_t h; + mp_limb_t cy; + + h = mpn_mul_1 (rp, ap, n, u); + cy = mpn_addmul_1 (rp, bp, n, v); + h += cy; +#if GMP_NAIL_BITS == 0 + rp[n] = h; + return (h < cy); +#else /* GMP_NAIL_BITS > 0 */ + rp[n] = h & GMP_NUMB_MASK; + return h >> GMP_NUMB_BITS; +#endif /* GMP_NAIL_BITS > 0 */ +} + + +static inline void +qstack_drop (struct qstack *stack) +{ + ASSERT (stack->size_next); + stack->limb_next -= stack->size[--stack->size_next]; +} + +/* Get top element */ +static inline mp_size_t +qstack_get_0 (const struct qstack *stack, + mp_srcptr *qp) +{ + mp_size_t qsize; + ASSERT (stack->size_next); + + qsize = stack->size[stack->size_next - 1]; + *qp = stack->limb + stack->limb_next - qsize; + + return qsize; +} + +/* Get element just below the top */ +static inline mp_size_t +qstack_get_1 (const struct qstack *stack, + mp_srcptr *qp) +{ + mp_size_t qsize; + ASSERT (stack->size_next >= 2); + + qsize = stack->size[stack->size_next - 2]; + *qp = stack->limb + stack->limb_next + - stack->size[stack->size_next - 1] + - qsize; + + return qsize; +} + +/* Adds d to the element on top of the stack */ +static void +qstack_adjust (struct qstack *stack) +{ + mp_size_t qsize; + mp_ptr qp; + + ASSERT (stack->size_next); + + ASSERT_QSTACK (stack); + + if (stack->limb_next >= stack->limb_alloc) + { + qstack_rotate (stack, 1); + } + + ASSERT (stack->limb_next < stack->limb_alloc); + + qsize = stack->size[stack->size_next - 1]; + qp = stack->limb + stack->limb_next - qsize; + + if (qsize == 0) + { + qp[0] = 2; + stack->size[stack->size_next - 1] = 1; + stack->limb_next++; + } + else + { + mp_limb_t cy = mpn_add_1 (qp, qp, qsize, 1); + if (cy) + { + qp[qsize] = cy; + stack->size[stack->size_next - 1]++; + stack->limb_next++; + } + } + + ASSERT_QSTACK (stack); +} + +/* hgcd2 operations */ + +/* Computes P = R * S. No overlap allowed. */ +static mp_size_t +hgcd2_mul (struct hgcd_row *P, mp_size_t alloc, + const struct hgcd2_row *R, + const struct hgcd_row *S, mp_size_t n) +{ + int grow = 0; + mp_limb_t h = 0; + unsigned i; + unsigned j; + + ASSERT (n < alloc); + + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + { + /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] + = u_i s0j + v_i s1j */ + mp_limb_t cy; + + cy = mpn_addmul2_n_1 (P[i].uvp[j], n, + S[0].uvp[j], R[i].u, + S[1].uvp[j], R[i].v); + if (cy) + { + ASSERT (n + 2 <= alloc); + P[i].uvp[j][n+1] = cy; + grow = 1; + } + else + h |= P[i].uvp[j][n]; + } + if (grow) + return n + 2; + else + /* Don't add redundant zeroes */ + return n + (h != 0); +} + +unsigned +mpn_hgcd_max_recursion (mp_size_t n) +{ + int count; + + count_leading_zeros (count, (mp_limb_t) + (1 + n / (HGCD_SCHOENHAGE_THRESHOLD - 5))); + + return GMP_LIMB_BITS - count; +} + +mp_size_t +mpn_hgcd_init_itch (mp_size_t size) +{ + /* r0 <= a, r1, r2, r3 <= b, but for simplicity, we allocate asize + + 1 for all of them. The size of the uv:s are limited to asize / 2, + but we allocate one extra limb. 
*/ + + return 4 * (size + 1) + 8 * ((size / 2) + 1); +} + +void +mpn_hgcd_init (struct hgcd *hgcd, + mp_size_t asize, + mp_limb_t *limbs) +{ + unsigned i; + unsigned j; + mp_size_t alloc = (asize / 2) + 1; + + hgcd->sign = 0; + + for (i = 0; i < 4; i++) + { + hgcd->row[i].rp = limbs; + hgcd->row[i].rsize = asize + 1; limbs += asize + 1; + } + + hgcd->alloc = alloc; + hgcd->size = alloc; + + for (i = 0; i < 4; i++) + for (j = 0; j < 2; j++) + { + hgcd->row[i].uvp[j] = limbs; + limbs += alloc; + } +} + +#if WANT_ASSERT +void +__gmpn_hgcd_sanity (const struct hgcd *hgcd, + mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize, + unsigned start, unsigned end) +{ + int sign; + unsigned i; + mp_size_t L = hgcd->size; + mp_ptr tp; + mp_size_t talloc; + mp_ptr t1p; + mp_ptr t2p; + const struct hgcd_row *r; + + ASSERT (asize >= bsize); + + ASSERT (L <= asize / 2); + ASSERT (L); + + ASSERT (L <= asize); + ASSERT (L <= bsize); + + /* NOTE: We really need only asize + bsize + 2*L, but since we're + * swapping the pointers around, we allocate 2*(asize + L). */ + talloc = 2*(asize + L); + tp = __GMP_ALLOCATE_FUNC_LIMBS (talloc); + t1p = tp; + t2p = t1p + (asize + L); + + sign = hgcd->sign; + if (start % 2) + sign = ~sign; + for (i = start, r = &hgcd->row[start]; i < end; i++, sign = ~sign, r++) + { + mp_size_t t1size = asize + L; + mp_size_t t2size = bsize + L; + + mp_size_t k; + for (k = hgcd->size; k < hgcd->alloc; k++) + { + ASSERT (r->uvp[0][k] == 0); + ASSERT (r->uvp[1][k] == 0); + } + + mpn_mul (t1p, ap, asize, r->uvp[0], L); + mpn_mul (t2p, bp, bsize, r->uvp[1], L); + + if (sign < 0) + MPN_PTR_SWAP (t1p, t1size, t2p, t2size); + + MPN_NORMALIZE (t2p, t2size); + ASSERT (t2size <= t1size); + ASSERT_NOCARRY (mpn_sub (t1p, t1p, t1size, t2p, t2size)); + + MPN_NORMALIZE (t1p, t1size); + ASSERT (MPN_EQUAL_P (t1p, t1size, r->rp, r->rsize)); + } + __GMP_FREE_FUNC_LIMBS (tp, talloc); + for (i = start; i < end - 1; i++) + { + /* We should have strict inequality after each reduction step, + but we allow equal values for input. */ + ASSERT (MPN_LEQ_P (hgcd->row[i+1].rp, hgcd->row[i+1].rsize, + hgcd->row[i].rp, hgcd->row[i].rsize)); + } +} +#endif /* WANT_ASSERT */ + +/* Helper functions for hgcd */ +/* Sets (a, b, c, d) <-- (b, c, d, a) */ +#define HGCD_SWAP4_LEFT(row) \ +do { \ + struct hgcd_row __hgcd_swap4_left_tmp; \ + __hgcd_swap4_left_tmp = row[0]; \ + row[0] = row[1]; \ + row[1] = row[2]; \ + row[2] = row[3]; \ + row[3] = __hgcd_swap4_left_tmp; \ +} while (0) + +/* Sets (a, b, c, d) <-- (d, a, b, c) */ +#define HGCD_SWAP4_RIGHT(row) \ +do { \ + struct hgcd_row __hgcd_swap4_right_tmp; \ + __hgcd_swap4_right_tmp = row[3]; \ + row[3] = row[2]; \ + row[2] = row[1]; \ + row[1] = row[0]; \ + row[0] = __hgcd_swap4_right_tmp; \ +} while (0) + +/* Sets (a, b, c, d) <-- (c, d, a, b) */ +#define HGCD_SWAP4_2(row) \ +do { \ + struct hgcd_row __hgcd_swap4_2_tmp; \ + __hgcd_swap4_2_tmp = row[0]; \ + row[0] = row[2]; \ + row[2] = __hgcd_swap4_2_tmp; \ + __hgcd_swap4_2_tmp = row[1]; \ + row[1] = row[3]; \ + row[3] = __hgcd_swap4_2_tmp; \ +} while (0) + +/* Sets (a, b, c) <-- (b, c, a) */ +#define HGCD_SWAP3_LEFT(row) \ +do { \ + struct hgcd_row __hgcd_swap4_left_tmp; \ + __hgcd_swap4_left_tmp = row[0]; \ + row[0] = row[1]; \ + row[1] = row[2]; \ + row[2] = __hgcd_swap4_left_tmp; \ +} while (0) + +/* Computes P = R * S. No overlap allowed. + + Temporary space is needed for two numbers smaller than the + resulting matrix elements, i.e. bounded by 2*L <= N. 
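Composing two reductions is an ordinary 2x2 matrix product, which is all hgcd2_mul and hgcd_mul compute; the real code keeps magnitudes plus a separate sign, while the toy below just uses signed words:

    #include <assert.h>
    #include <stdint.h>

    struct mat2 { int64_t m[2][2]; };

    static struct mat2
    mat2_mul (struct mat2 r, struct mat2 s)
    {
      struct mat2 p;
      int i, j;
      for (i = 0; i < 2; i++)
        for (j = 0; j < 2; j++)
          p.m[i][j] = r.m[i][0] * s.m[0][j] + r.m[i][1] * s.m[1][j];
      return p;
    }

    int main (void)
    {
      int64_t a = 1071, b = 462;
      /* One Euclid step 1071 = 2*462 + 147 as a matrix acting on (a, b),
         then a second step 462 = 3*147 + 21. */
      struct mat2 s = { { { 0, 1 }, { 1, -2 } } };
      struct mat2 r = { { { 0, 1 }, { 1, -3 } } };
      struct mat2 p = mat2_mul (r, s);

      int64_t r0 = p.m[0][0] * a + p.m[0][1] * b;
      int64_t r1 = p.m[1][0] * a + p.m[1][1] * b;
      assert (r0 == 147 && r1 == 21);   /* same as doing the two steps in turn */
      return 0;
    }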
*/ +static mp_size_t +hgcd_mul (struct hgcd_row *P, mp_size_t alloc, + const struct hgcd_row *R, mp_size_t rsize, + const struct hgcd_row *S, mp_size_t ssize, + mp_ptr tp, mp_size_t talloc) +{ + unsigned i; + unsigned j; + + mp_size_t psize; + mp_limb_t h = 0; + int grow = 0; + + MPN_NORMALIZE (R[1].uvp[1], rsize); + ASSERT (S[1].uvp[1][ssize - 1] != 0); + + psize = rsize + ssize; + ASSERT (psize <= talloc); + + if (rsize >= ssize) + { + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + { + /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] */ + mp_limb_t cy; + + mpn_mul (P[i].uvp[j], R[i].uvp[0], rsize, S[0].uvp[j], ssize); + mpn_mul (tp, R[i].uvp[1], rsize, S[1].uvp[j], ssize); + + cy = mpn_add_n (P[i].uvp[j], P[i].uvp[j], tp, psize); + + if (cy) + { + ASSERT (psize + 1 < alloc); + P[i].uvp[j][psize] = cy; + grow = 1; + } + else + h |= P[i].uvp[j][psize - 1]; + } + } + else + { + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + { + /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] */ + mp_limb_t cy; + + mpn_mul (P[i].uvp[j], S[0].uvp[j], ssize, R[i].uvp[0], rsize); + mpn_mul (tp, S[1].uvp[j], ssize, R[i].uvp[1], rsize); + + cy = mpn_add_n (P[i].uvp[j], P[i].uvp[j], tp, psize); + + if (cy) + { + ASSERT (psize + 1 < alloc); + P[i].uvp[j][psize] = cy; + grow = 1; + } + else + h |= P[i].uvp[j][psize - 1]; + } + } + + if (grow) + return psize + 1; + else + return psize - (h == 0); +} + +/* Computes R = W^k s->r + s->u A' - s->v B', which must be + non-negative. W denotes 2^(GMP_NUMB_BITS). Temporary space needed + is k + uvsize <= M + L = N. + + Must have v > 0, v >= u. */ + +mp_size_t +mpn_hgcd_fix (mp_size_t k, + mp_ptr rp, mp_size_t ralloc, + int sign, mp_size_t uvsize, + const struct hgcd_row *s, + mp_srcptr ap, + mp_srcptr bp, + mp_ptr tp, mp_size_t talloc) +{ + mp_size_t tsize; + mp_limb_t cy; + mp_size_t rsize; + mp_srcptr up; + mp_srcptr vp; + + up = s->uvp[0]; vp = s->uvp[1]; + MPN_NORMALIZE (vp, uvsize); + ASSERT (uvsize > 0); + + if (sign < 0) + { + MP_SRCPTR_SWAP (up, vp); + MP_SRCPTR_SWAP (ap, bp); + } + + tsize = k + uvsize; + + ASSERT (k + s->rsize <= ralloc); + ASSERT (tsize <= talloc); + ASSERT (tsize <= ralloc); + + ASSERT (rp != s->rp); + + /* r = W^k s + u a */ + if (uvsize <= k) + mpn_mul (rp, ap, k, up, uvsize); + else + mpn_mul (rp, up, uvsize, ap, k); + + if (uvsize <= s->rsize) + { + cy = mpn_add (rp + k, s->rp, s->rsize, rp + k, uvsize); + rsize = k + s->rsize; + } + else + { + cy = mpn_add (rp + k, rp + k, uvsize, s->rp, s->rsize); + rsize = k + uvsize; + } + + if (cy) + { + ASSERT (rsize < ralloc); + rp[rsize++] = cy; + } + + /* r -= v b */ + + if (uvsize <= k) + mpn_mul (tp, bp, k, vp, uvsize); + else + mpn_mul (tp, vp, uvsize, bp, k); + + ASSERT_NOCARRY (mpn_sub (rp, rp, rsize, tp, tsize)); + MPN_NORMALIZE (rp, rsize); + + return rsize; +} + +/* Compute r2 = r0 - q r1 */ +static void +hgcd_update_r (struct hgcd_row *r, mp_srcptr qp, mp_size_t qsize) +{ + mp_srcptr r0p = r[0].rp; + mp_srcptr r1p = r[1].rp; + mp_ptr r2p = r[2].rp; + mp_size_t r0size = r[0].rsize; + mp_size_t r1size = r[1].rsize; + + ASSERT (MPN_LESS_P (r1p, r1size, r0p, r0size)); + + if (qsize == 0) + { + ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r1p, r1size)); + } + else if (qsize == 1) + { + mp_size_t size; + mp_limb_t cy = mpn_mul_1 (r2p, r1p, r1size, qp[0]); + size = r1size; + + if (cy) + { + ASSERT (size < r0size); + r2p[size++] = cy; + } + + ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r2p, size)); + } + else + { + mp_size_t size = r1size + qsize; + ASSERT (size <= r0size + 1); + + if 
(qsize <= r1size) + mpn_mul (r2p, r1p, r1size, qp, qsize); + else + mpn_mul (r2p, qp, qsize, r1p, r1size); + + if (size > r0size) + { + ASSERT (size == r0size + 1); + size--; + ASSERT (r2p[size] == 0); + } + + ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r2p, size)); + } + + MPN_NORMALIZE (r[2].rp, r0size); + r[2].rsize = r0size; + + ASSERT (MPN_LESS_P (r2p, r0size, r1p, r1size)); +} + +/* Compute (u2, v2) = (u0, v0) + q (u1, v1) + Return the size of the largest u,v element. + Caller must ensure that usize + qsize <= available storage */ +static mp_size_t +hgcd_update_uv (struct hgcd_row *r, mp_size_t usize, + mp_srcptr qp, mp_size_t qsize) +{ + unsigned i; + mp_size_t grow; + + ASSERT (r[1].uvp[1][usize - 1] != 0); + + /* Compute u2 = u0 + q u1 */ + + if (qsize == 0) + { + /* Represents a unit quotient */ + mp_limb_t cy; + + cy = mpn_add_n (r[2].uvp[0], r[0].uvp[0], r[1].uvp[0], usize); + r[2].uvp[0][usize] = cy; + + cy = mpn_add_n (r[2].uvp[1], r[0].uvp[1], r[1].uvp[1], usize); + r[2].uvp[1][usize] = cy; + grow = cy; + } + else if (qsize == 1) + { + mp_limb_t q = qp[0]; + for (i = 0; i < 2; i++) + { + mp_srcptr u0p = r[0].uvp[i]; + mp_srcptr u1p = r[1].uvp[i]; + mp_ptr u2p = r[2].uvp[i]; + mp_limb_t cy; + + /* Too bad we don't have an addmul_1 with distinct source and + destination */ + cy = mpn_mul_1 (u2p, u1p, usize, q); + cy += mpn_add_n (u2p, u2p, u0p, usize); + + u2p[usize] = cy; + grow = cy != 0; + } + } + else + { + for (i = 0; i < 2; i++) + { + mp_srcptr u0p = r[0].uvp[i]; + mp_srcptr u1p = r[1].uvp[i]; + mp_ptr u2p = r[2].uvp[i]; + + if (qsize <= usize) + mpn_mul (u2p, u1p, usize, qp, qsize); + else + mpn_mul (u2p, qp, qsize, u1p, usize); + + ASSERT_NOCARRY (mpn_add (u2p, u2p, usize + qsize, u0p, usize)); + grow = qsize - ((u2p[usize + qsize - 1]) == 0); + } + } + + usize += grow; + + /* The values should be allocated with one limb margin */ + ASSERT (mpn_cmp (r[1].uvp[0], r[2].uvp[0], usize) <= 0); + ASSERT (mpn_cmp (r[1].uvp[1], r[2].uvp[1], usize) <= 0); + ASSERT (r[2].uvp[1][usize - 1] != 0); + + return usize; +} + +/* Compute r0 = r2 + q r1, and the corresponding uv */ +static void +hgcd_backup (struct hgcd_row *r, mp_size_t usize, + mp_srcptr qp, mp_size_t qsize) +{ + mp_ptr r0p = r[0].rp; + mp_srcptr r1p = r[1].rp; + mp_srcptr r2p = r[2].rp; + mp_size_t r0size; + mp_size_t r1size = r[1].rsize; + mp_size_t r2size = r[2].rsize; + + mp_ptr u0p = r[0].uvp[0]; + mp_ptr v0p = r[0].uvp[1]; + mp_srcptr u1p = r[1].uvp[0]; + mp_srcptr v1p = r[1].uvp[1]; + mp_srcptr u2p = r[2].uvp[0]; + mp_srcptr v2p = r[2].uvp[1]; + + ASSERT (MPN_LESS_P (r2p, r2size, r1p, r1size)); + + if (qsize == 0) + { + /* r0 = r2 + r1 */ + mp_limb_t cy = mpn_add (r0p, r1p, r1size, r2p, r2size); + r0size = r1size; + if (cy) + r0p[r0size++] = cy; + + /* (u0,v0) = (u2,v2) - (u1, v1) */ + + ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u1p, usize)); + ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v1p, usize)); + } + else if (qsize == 1) + { + /* r0 = r2 + q r1 + + Just like for mpn_addmul_1, the result is the same size as r1, or + one limb larger. 
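The reconstruction hgcd_backup performs is the forward step run in reverse: r0 = r2 + q*r1 recovers exactly what r2 = r0 - q*r1 removed. With word-sized numbers the round trip is one line each way (toy code):

    #include <assert.h>

    int main (void)
    {
      long r0 = 1071, r1 = 462;
      long q = r0 / r1;               /* forward: one division step */
      long r2 = r0 - q * r1;          /* what hgcd_update_r computes */
      long back = r2 + q * r1;        /* backward: rebuild r0 from q, r1, r2 */

      assert (back == r0);            /* the identity hgcd_backup relies on */
      return 0;
    }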
*/ + + mp_limb_t cy; + + cy = mpn_mul_1 (r0p, r1p, r1size, qp[0]); + cy += mpn_add (r0p, r0p, r1size, r2p, r2size); + + r0size = r1size; + if (cy) + r0p[r0size++] = cy; + + /* (u0,v0) = (u2,v2) - q (u1, v1) */ + + ASSERT_NOCARRY (mpn_mul_1 (u0p, u1p, usize, qp[0])); + ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u0p, usize)); + + ASSERT_NOCARRY (mpn_mul_1 (v0p, v1p, usize, qp[0])); + ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v0p, usize)); + } + else + { + /* r0 = r2 + q r1 + + Result must be of size r1size + q1size - 1, or one limb + larger. */ + + mp_size_t size; + + r0size = r1size + qsize; + if (r1size >= qsize) + mpn_mul (r0p, r1p, r1size, qp, qsize); + else + mpn_mul (r0p, qp, qsize, r1p, r1size); + + ASSERT_NOCARRY (mpn_add (r0p, r0p, r0size, r2p, r2size)); + + r0size -= (r0p[r0size-1] == 0); + + /* (u0,v0) = (u2,v2) - q (u1, v1) */ + + /* We must have + + usize >= #(q u1) >= qsize + #u1 - 1 + + which means that u1 must have at least + + usize - #u1 >= qsize - 1 + + zero limbs at the high end, and similarly for v1. */ + + ASSERT (qsize <= usize); + size = usize - qsize + 1; +#if WANT_ASSERT + { + mp_size_t i; + for (i = size; i < usize; i++) + { + ASSERT (u1p[i] == 0); + ASSERT (v1p[i] == 0); + } + } +#endif + /* NOTE: Needs an extra limb for the u,v values */ + + if (qsize <= size) + { + mpn_mul (u0p, u1p, size, qp, qsize); + mpn_mul (v0p, v1p, size, qp, qsize); + } + else + { + mpn_mul (u0p, qp, qsize, u1p, size); + mpn_mul (v0p, qp, qsize, v1p, size); + } + + /* qsize + size = usize + 1 */ + ASSERT (u0p[usize] == 0); + ASSERT (v0p[usize] == 0); + + ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u0p, usize)); + ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v0p, usize)); + } + + r[0].rsize = r0size; +} + +/* Called after HGCD_SWAP4_RIGHT, to adjust the size field. Large + numbers in row 0 don't count, and are overwritten. */ +static void +hgcd_normalize (struct hgcd *hgcd) +{ + mp_size_t size = hgcd->size; + + /* v3 should always be the largest element */ + while (size > 0 && hgcd->row[3].uvp[1][size - 1] == 0) + { + size--; + /* Row 0 is about to be overwritten. We must zero out unused limbs */ + hgcd->row[0].uvp[0][size] = 0; + hgcd->row[0].uvp[1][size] = 0; + + ASSERT (hgcd->row[1].uvp[0][size] == 0); + ASSERT (hgcd->row[1].uvp[1][size] == 0); + ASSERT (hgcd->row[2].uvp[0][size] == 0); + ASSERT (hgcd->row[2].uvp[1][size] == 0); + ASSERT (hgcd->row[3].uvp[0][size] == 0); + } + + hgcd->size = size; +} + +int +mpn_hgcd2_lehmer_step (struct hgcd2 *hgcd, + mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize, + struct qstack *quotients) +{ + mp_limb_t ah; + mp_limb_t al; + mp_limb_t bh; + mp_limb_t bl; + + ASSERT (asize >= bsize); + ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); + + if (bsize < 2) + return 0; + +#if 0 && WANT_TRACE + trace ("lehmer_step:\n" + " a = %Nd\n" + " b = %Nd\n", + ap, asize, bp, bsize); +#endif +#if WANT_TRACE + trace ("lehmer_step: asize = %d, bsize = %d\n", asize, bsize); +#endif + + /* The case asize == 2 is needed to take care of values that are + between one and two *full* limbs in size. 
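Feeding hgcd2 only the two top limbs works because of Lehmer's classical observation: the first quotients depend only on the high parts of the operands. A quick empirical illustration with 64-bit numbers truncated to their high 32 bits (the constants are arbitrary):

    #include <stdint.h>
    #include <stdio.h>

    int main (void)
    {
      uint64_t a = 0xfedcba9876543210ull, b = 0x0fedcba987654321ull;
      uint64_t ah = a >> 32, bh = b >> 32;   /* "top two limbs" of each */
      int agreed = 0;

      while (b > 0 && bh > 0)
        {
          uint64_t q = a / b, qh = ah / bh;
          uint64_t t;

          if (q != qh)
            break;                    /* the truncation finally matters */
          agreed++;
          t = a - q * b;    a = b;    b = t;
          t = ah - qh * bh; ah = bh;  bh = t;
        }
      printf ("%d leading quotients agree\n", agreed);
      return 0;
    }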
*/ + if (asize == 2 || (ap[asize-1] & GMP_NUMB_HIGHBIT)) + { + if (bsize < asize) + return 0; + + al = ap[asize - 2]; + ah = ap[asize - 1]; + + ASSERT (asize == bsize); + bl = bp[asize - 2]; + bh = bp[asize - 1]; + } + else + { + unsigned shift; + if (bsize + 1 < asize) + return 0; + + /* We want two *full* limbs */ + ASSERT (asize > 2); + + count_leading_zeros (shift, ap[asize-1]); +#if 0 && WANT_TRACE + trace ("shift = %d\n", shift); +#endif + if (bsize == asize) + bh = MPN_EXTRACT_LIMB (shift, bp[asize - 1], bp[asize - 2]); + else + { + ASSERT (asize == bsize + 1); + bh = bp[asize - 2] >> (GMP_LIMB_BITS - shift); + } + + bl = MPN_EXTRACT_LIMB (shift, bp[asize - 2], bp[asize - 3]); + + al = MPN_EXTRACT_LIMB (shift, ap[asize - 2], ap[asize - 3]); + ah = MPN_EXTRACT_LIMB (shift, ap[asize - 1], ap[asize - 2]); + } + +#if WANT_TRACE + trace ("lehmer_step: ah = %lx, al = %lx, bh = %lx, bl = %lx\n", + (unsigned long) ah, (unsigned long) al, + (unsigned long) bh, (unsigned long) bl); +#endif + return mpn_hgcd2 (hgcd, ah, al, bh, bl, quotients); +} + +/* Called when r2 has been computed, and it is too small. Top element + on the stack is r0/r1. One backup step is needed. */ +static int +hgcd_small_1 (struct hgcd *hgcd, mp_size_t M, + struct qstack *quotients) +{ + mp_srcptr qp; + mp_size_t qsize; + + if (hgcd_start_row_p (hgcd->row, hgcd->size)) + { + qstack_drop (quotients); + return 0; + } + + HGCD_SWAP4_RIGHT (hgcd->row); + hgcd_normalize (hgcd); + + qsize = qstack_get_1 (quotients, &qp); + + hgcd_backup (hgcd->row, hgcd->size, qp, qsize); + hgcd->sign = ~hgcd->sign; + +#if WANT_ASSERT + qstack_rotate (quotients, 0); +#endif + + return hgcd_jebelean (hgcd, M); +} + +/* Called when r3 has been computed, and is small enough. Two backup + steps are needed. */ +static int +hgcd_small_2 (struct hgcd *hgcd, mp_size_t M, + const struct qstack *quotients) +{ + mp_srcptr qp; + mp_size_t qsize; + + if (hgcd_start_row_p (hgcd->row + 2, hgcd->size)) + return 0; + + qsize = qstack_get_0 (quotients, &qp); + hgcd_backup (hgcd->row+1, hgcd->size, qp, qsize); + + if (hgcd_start_row_p (hgcd->row + 1, hgcd->size)) + return 0; + + qsize = qstack_get_1 (quotients, &qp); + hgcd_backup (hgcd->row, hgcd->size, qp, qsize); + + return hgcd_jebelean (hgcd, M); +} + +static void +hgcd_start (struct hgcd *hgcd, + mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize) +{ + MPN_COPY (hgcd->row[0].rp, ap, asize); + hgcd->row[0].rsize = asize; + + MPN_COPY (hgcd->row[1].rp, bp, bsize); + hgcd->row[1].rsize = bsize; + + hgcd->sign = 0; + if (hgcd->size != 0) + { + /* We must zero out the uv array */ + unsigned i; + unsigned j; + + for (i = 0; i < 4; i++) + for (j = 0; j < 2; j++) + MPN_ZERO (hgcd->row[i].uvp[j], hgcd->size); + } +#if WANT_ASSERT + { + unsigned i; + unsigned j; + mp_size_t k; + + for (i = 0; i < 4; i++) + for (j = 0; j < 2; j++) + for (k = hgcd->size; k < hgcd->alloc; k++) + ASSERT (hgcd->row[i].uvp[j][k] == 0); + } +#endif + + hgcd->size = 1; + hgcd->row[0].uvp[0][0] = 1; + hgcd->row[1].uvp[1][0] = 1; +} + +/* Performs one euclid step on r0, r1. 
Returns >= 0 if hgcd should be + terminated, -1 if we should go on */ +static int +euclid_step (struct hgcd *hgcd, mp_size_t M, + struct qstack *quotients) +{ + mp_size_t asize; + + mp_size_t qsize; + mp_size_t rsize; + mp_ptr qp; + mp_ptr rp; + + asize = hgcd->row[0].rsize; + rsize = hgcd->row[1].rsize; + qsize = asize - rsize + 1; + + /* Make sure we have space on stack */ + ASSERT_QSTACK (quotients); + + if (qsize > quotients->limb_alloc - quotients->limb_next) + { + qstack_rotate (quotients, + qsize - (quotients->limb_alloc - quotients->limb_next)); + ASSERT (quotients->size_next < QSTACK_MAX_QUOTIENTS); + } + else if (quotients->size_next >= QSTACK_MAX_QUOTIENTS) + { + qstack_rotate (quotients, 0); + } + + ASSERT (qsize <= quotients->limb_alloc - quotients->limb_next); + + qp = quotients->limb + quotients->limb_next; + + rp = hgcd->row[2].rp; + mpn_tdiv_qr (qp, rp, 0, hgcd->row[0].rp, asize, hgcd->row[1].rp, rsize); + MPN_NORMALIZE (rp, rsize); + hgcd->row[2].rsize = rsize; + + if (qp[qsize - 1] == 0) + qsize--; + + if (qsize == 1 && qp[0] == 1) + qsize = 0; + + quotients->size[quotients->size_next++] = qsize; + quotients->limb_next += qsize; + + ASSERT_QSTACK (quotients); + + /* Update u and v */ + ASSERT (hgcd->size + qsize <= hgcd->alloc); + hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, qp, qsize); + ASSERT (hgcd->size < hgcd->alloc); + + if (hgcd->row[2].rsize <= M) + return hgcd_small_1 (hgcd, M, quotients); + else + { + /* Keep this remainder */ + hgcd->sign = ~hgcd->sign; + + HGCD_SWAP4_LEFT (hgcd->row); + return -1; + } +} + +/* Called when values have been computed in r[0] and r[1], and the + latter value is too large, and we know that it's not much too + large. Returns the updated size for the uv matrix. */ +static mp_size_t +hgcd_adjust (struct hgcd_row *r, mp_size_t size, + struct qstack *quotients) +{ + mp_limb_t c0; + mp_limb_t c1; + + /* Compute the correct r3, we have r3' = r3 - r2. */ + + ASSERT_NOCARRY (mpn_sub (r[1].rp, r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)); + + MPN_NORMALIZE (r[1].rp, r[1].rsize); + + ASSERT (MPN_LESS_P (r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)); + + c0 = mpn_add_n (r[1].uvp[0], r[1].uvp[0], r[0].uvp[0], size); + c1 = mpn_add_n (r[1].uvp[1], r[1].uvp[1], r[0].uvp[1], size); + + /* FIXME: Can avoid branches */ + if (c1 != 0) + { + r[1].uvp[0][size] = c0; + r[1].uvp[1][size] = c1; + size++; + } + else + { + ASSERT (c0 == 0); + } + + /* Remains to adjust the quotient on stack */ + qstack_adjust (quotients); + + return size; +} + +/* Reduce using Lehmer steps. Called by mpn_hgcd when r1 has been + reduced to approximately the right size. Also used by + mpn_hgcd_lehmer. */ +static int +hgcd_final (struct hgcd *hgcd, mp_size_t M, + struct qstack *quotients) +{ + ASSERT (hgcd->row[0].rsize > M); + ASSERT (hgcd->row[1].rsize > M); + + /* Can be equal when called by hgcd_lehmer. */ + ASSERT (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize, + hgcd->row[0].rp, hgcd->row[0].rsize)); + + for (;;) + { + mp_size_t L = hgcd->row[0].rsize; + + struct hgcd2 R; + int res; + + if (L <= M + 2 + && (L < M + 2 || (hgcd->row[0].rp[M+1] & GMP_NUMB_HIGHBIT) == 0)) + break; + + res = mpn_hgcd2_lehmer_step (&R, + hgcd->row[0].rp, hgcd->row[0].rsize, + hgcd->row[1].rp, hgcd->row[1].rsize, + quotients); + + if (res == 0) + { + /* We must divide to make progress */ + res = euclid_step (hgcd, M, quotients); + + if (res >= 0) + return res; + } + else if (res == 1) + { + mp_size_t qsize; + + /* The quotient that has been computed for r2 is at most 2 + off. 
So adjust that, and avoid a full division. */ + qstack_drop (quotients); + + /* Top two rows of R must be the identity matrix, followed + by a row (1, q). */ + ASSERT (R.row[0].u == 1 && R.row[0].v == 0); + ASSERT (R.row[1].u == 0 && R.row[1].v == 1); + ASSERT (R.row[2].u == 1); + + qsize = (R.row[2].v != 0); + + hgcd_update_r (hgcd->row, &R.row[2].v, qsize); + hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, + &R.row[2].v, qsize); + ASSERT (hgcd->size < hgcd->alloc); + + if (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize, + hgcd->row[2].rp, hgcd->row[2].rsize)) + hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients); + + ASSERT (hgcd->size < hgcd->alloc); + + hgcd->sign = ~hgcd->sign; + HGCD_SWAP4_LEFT (hgcd->row); + } + else + { + const struct hgcd2_row *s = R.row + (res - 2); + int sign = R.sign; + /* Max size after reduction, plus one */ + mp_size_t ralloc = hgcd->row[1].rsize + 1; + + if (res == 2) + { + qstack_drop (quotients); + qstack_drop (quotients); + } + else if (res == 3) + { + sign = ~sign; + qstack_drop (quotients); + } + + /* s[0] and s[1] correct. */ + hgcd->row[2].rsize + = mpn_hgcd2_fix (hgcd->row[2].rp, ralloc, + sign, + s[0].u, hgcd->row[0].rp, hgcd->row[0].rsize, + s[0].v, hgcd->row[1].rp, hgcd->row[1].rsize); + + hgcd->row[3].rsize + = mpn_hgcd2_fix (hgcd->row[3].rp, ralloc, + ~sign, + s[1].u, hgcd->row[0].rp, hgcd->row[0].rsize, + s[1].v, hgcd->row[1].rp, hgcd->row[1].rsize); + + hgcd->size = hgcd2_mul (hgcd->row + 2, hgcd->alloc, + s, hgcd->row, hgcd->size); + hgcd->sign ^= sign; + + ASSERT (hgcd->row[2].rsize > M); + +#if WANT_ASSERT + switch (res) + { + default: + ASSERT_ALWAYS (0 == "Unexpected value of res"); + break; + case 2: + ASSERT (hgcd->row[2].rsize >= L - 1); + ASSERT (hgcd->row[3].rsize >= L - 2); + ASSERT (hgcd->row[2].rsize > M + 1); + ASSERT (hgcd->row[3].rsize > M); + break; + case 3: + ASSERT (hgcd->row[2].rsize >= L - 2); + ASSERT (hgcd->row[3].rsize >= L - 2); + ASSERT (hgcd->row[3].rsize > M); + break; + case 4: + ASSERT (hgcd->row[2].rsize >= L - 2); + ASSERT (hgcd->row[3].rsize < L || hgcd->row[3].rp[L-1] == 1); + break; + } +#endif + if (hgcd->row[3].rsize <= M) + { + /* Can happen only in the res == 4 case */ + ASSERT (res == 4); + + /* Backup two steps */ + ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); + + return hgcd_small_2 (hgcd, M, quotients); + } + + HGCD_SWAP4_2 (hgcd->row); + } + } + + ASSERT (hgcd->row[1].rsize > M); + + for (;;) + { + mp_size_t L = hgcd->row[0].rsize; + mp_size_t ralloc; + + mp_size_t qsize; + mp_srcptr qp; + + struct hgcd2 R; + int res; + + /* We don't want hgcd2 to pickup any bits below r0p[M-1], so + don't tell mpn_hgcd2_lehmer_step about them. */ + res = mpn_hgcd2_lehmer_step (&R, + hgcd->row[0].rp+M-1, hgcd->row[0].rsize-M+1, + hgcd->row[1].rp+M-1, hgcd->row[1].rsize-M+1, + quotients); + if (res == 0) + { + /* We must divide to make progress */ + res = euclid_step (hgcd, M, quotients); + + if (res >= 0) + return res; + + continue; + } + + if (res == 1) + { + mp_size_t qsize; + + /* The quotient that has been computed for r2 is at most 2 + off. So adjust that, and avoid a full division. */ + qstack_drop (quotients); + + /* Top two rows of R must be the identity matrix, followed + by a row (1, q). 
*/ + ASSERT (R.row[0].u == 1 && R.row[0].v == 0); + ASSERT (R.row[1].u == 0 && R.row[1].v == 1); + ASSERT (R.row[2].u == 1); + + qsize = (R.row[2].v != 0); + + hgcd_update_r (hgcd->row, &R.row[2].v, qsize); + hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, + &R.row[2].v, qsize); + ASSERT (hgcd->size < hgcd->alloc); + + if (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize, + hgcd->row[2].rp, hgcd->row[2].rsize)) + hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients); + + ASSERT (hgcd->size < hgcd->alloc); + + hgcd->sign = ~hgcd->sign; + HGCD_SWAP4_LEFT (hgcd->row); + + continue; + } + + /* Now r0 and r1 are always correct. */ + /* Store new values in rows 2 and 3, to avoid overlap */ + + /* Max size after reduction, plus one */ + ralloc = hgcd->row[1].rsize + 1; + + hgcd->row[2].rsize + = mpn_hgcd2_fix (hgcd->row[2].rp, ralloc, + R.sign, + R.row[0].u, hgcd->row[0].rp, hgcd->row[0].rsize, + R.row[0].v, hgcd->row[1].rp, hgcd->row[1].rsize); + + hgcd->row[3].rsize + = mpn_hgcd2_fix (hgcd->row[3].rp, ralloc, + ~R.sign, + R.row[1].u, hgcd->row[0].rp, hgcd->row[0].rsize, + R.row[1].v, hgcd->row[1].rp, hgcd->row[1].rsize); + + ASSERT (hgcd->row[2].rsize >= L - 1); + ASSERT (hgcd->row[3].rsize >= L - 2); + + ASSERT (hgcd->row[2].rsize > M); + ASSERT (hgcd->row[3].rsize > M-1); + + hgcd->size = hgcd2_mul (hgcd->row + 2, hgcd->alloc, + R.row, hgcd->row, hgcd->size); + hgcd->sign ^= R.sign; + + if (hgcd->row[3].rsize <= M) + { + /* Backup two steps */ + + /* We don't use R.row[2] and R.row[3], so drop the + corresponding quotients. */ + qstack_drop (quotients); + qstack_drop (quotients); + + return hgcd_small_2 (hgcd, M, quotients); + } + + HGCD_SWAP4_2 (hgcd->row); + + if (res == 2) + { + qstack_drop (quotients); + qstack_drop (quotients); + + continue; + } + + /* We already know the correct q for computing r2 */ + + qsize = qstack_get_1 (quotients, &qp); + ASSERT (qsize < 2); + + ASSERT (qsize + hgcd->size <= hgcd->alloc); + hgcd_update_r (hgcd->row, qp, qsize); + hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, + qp, qsize); + ASSERT (hgcd->size < hgcd->alloc); + + ASSERT (hgcd->row[2].rsize >= M - 2); + + if (hgcd->row[2].rsize <= M) + { + /* Discard r3 */ + qstack_drop (quotients); + return hgcd_small_1 (hgcd, M, quotients); + } + if (res == 3) + { + /* Drop quotient for r3 */ + qstack_drop (quotients); + + hgcd->sign = ~hgcd->sign; + HGCD_SWAP4_LEFT (hgcd->row); + + continue; + } + + ASSERT (res == 4); + ASSERT (hgcd->row[2].rsize > M); + + /* We already know the correct q for computing r3 */ + qsize = qstack_get_0 (quotients, &qp); + ASSERT (qsize < 2); + + ASSERT (qsize + hgcd->size <= hgcd->alloc); + hgcd_update_r (hgcd->row + 1, qp, qsize); + hgcd->size = hgcd_update_uv (hgcd->row + 1, hgcd->size, + qp, qsize); + ASSERT (hgcd->size < hgcd->alloc); + + ASSERT (hgcd->row[3].rsize <= M + 1); + /* Appearantly not true. Probably because we have leading zeros + when we call hgcd2. */ + /* ASSERT (hgcd->row[3].rsize <= M || hgcd->row[3].rp[M] == 1); */ + + if (hgcd->row[3].rsize <= M) + return hgcd_jebelean (hgcd, M); + + HGCD_SWAP4_2 (hgcd->row); + } +} + +mp_size_t +mpn_hgcd_itch (mp_size_t asize) +{ + /* Scratch space is needed for calling hgcd. We need space for the + results of all recursive calls. In addition, we need space for + calling hgcd_fix and hgcd_mul, for which N = asize limbs should + be enough. 
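The recursion-depth bound used for the scratch estimate is simply how many times the size can be halved before it drops below the threshold, i.e. roughly the bit length of asize divided by the threshold. A rough model (the threshold value is invented for the example):

    #include <assert.h>

    /* Bit length of x (the role count_leading_zeros plays above). */
    static unsigned
    bit_length (unsigned long x)
    {
      unsigned k = 0;
      while (x > 0)
        {
          x >>= 1;
          k++;
        }
      return k;
    }

    int main (void)
    {
      unsigned long threshold = 150;  /* stand-in for HGCD_SCHOENHAGE_THRESHOLD */
      unsigned long n0 = 100000;
      unsigned long n = n0;
      unsigned depth = 0;

      while (n >= threshold)
        {
          n = (n + 1) / 2;            /* each level recurses on about half */
          depth++;
        }

      /* The depth is about the bit length of n0 / threshold. */
      assert (depth <= bit_length (1 + n0 / threshold) + 1);
      return 0;
    }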
*/ + + /* Limit on the recursion depth */ + unsigned k = mpn_hgcd_max_recursion (asize); + + return asize + mpn_hgcd_init_itch (asize + 6 * k) + 12 * k; +} + +/* Repeatedly divides A by B, until the remainder fits in M = + ceil(asize / 2) limbs. Stores cofactors in HGCD, and pushes the + quotients on STACK. On success, HGCD->row[0, 1, 2] correspond to + remainders that are larger than M limbs, while HGCD->row[3] + correspond to a remainder that fit in M limbs. + + Return 0 on failure (if B or A mod B fits in M limbs), otherwise + return one of 1 - 4 as specified for hgcd_jebelean. */ +int +mpn_hgcd (struct hgcd *hgcd, + mp_srcptr ap, mp_size_t asize, + mp_srcptr bp, mp_size_t bsize, + struct qstack *quotients, + mp_ptr tp, mp_size_t talloc) +{ + mp_size_t N = asize; + mp_size_t M = (N + 1)/2; + mp_size_t n; + mp_size_t m; + + struct hgcd R; + mp_size_t itch; + + ASSERT (M); +#if WANT_TRACE + trace ("hgcd: asize = %d, bsize = %d, HGCD_SCHOENHAGE_THRESHOLD = %d\n", + asize, bsize, HGCD_SCHOENHAGE_THRESHOLD); + if (asize < 100) + trace (" a = %Nd\n" + " b = %Nd\n", ap, asize, bp, bsize); +#endif + + if (bsize <= M) + return 0; + + ASSERT (asize >= 2); + + /* Initialize, we keep r0 and r1 as the reduced numbers (so far). */ + hgcd_start (hgcd, ap, asize, bp, bsize); + + if (BELOW_THRESHOLD (N, HGCD_SCHOENHAGE_THRESHOLD)) + return hgcd_final (hgcd, M, quotients); + + /* Reduce the size to M + m + 1. Usually, only one hgcd call is + needed, but we may need multiple calls. When finished, the values + are stored in r0 (potentially large) and r1 (smaller size) */ + + n = N - M; + m = (n + 1)/2; + + /* The second recursive call can use numbers of size up to n+3 */ + itch = mpn_hgcd_init_itch (n+3); + + ASSERT (itch <= talloc); + mpn_hgcd_init (&R, n+3, tp); + tp += itch; talloc -= itch; + + while (hgcd->row[1].rsize > M + m + 1) + { + /* Max size after reduction, plus one */ + mp_size_t ralloc = hgcd->row[1].rsize + 1; + + int res = mpn_hgcd (&R, + hgcd->row[0].rp + M, hgcd->row[0].rsize - M, + hgcd->row[1].rp + M, hgcd->row[1].rsize - M, + quotients, tp, talloc); + + if (res == 0) + { + /* We must divide to make progress */ + res = euclid_step (hgcd, M, quotients); + + if (res > 0) + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); + if (res >= 0) + return res; + + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); + } + else if (res <= 2) + { + /* The reason we use hgcd_adjust also when res == 2 is that + either r2 is correct, and we get it for free. + + Or r2 is too large. Then can correct it by a few bignum + subtractions, and we are *guaranteed* that the result is + small enough that we don't need another run through this + loop. */ + + /* FIXME: For res == 1, the newly computed row[2] will be + the same as the old row[1], so we do some unnecessary + computations. */ + + qstack_drop (quotients); + + /* Store new values in rows 2 and 3, to avoid overlap */ + hgcd->row[2].rsize + = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc, + ~R.sign, R.size, &R.row[1], + hgcd->row[0].rp, hgcd->row[1].rp, + tp, talloc); + + hgcd->row[3].rsize + = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc, + R.sign, R.size, &R.row[2], + hgcd->row[0].rp, hgcd->row[1].rp, + tp, talloc); + + ASSERT (hgcd->row[2].rsize > M); + ASSERT (hgcd->row[3].rsize > M); + + /* Computes the uv matrix for the (possibly incorrect) + values r1, r2. The elements must be smaller than the + correct ones, since they correspond to a too small q. 
*/ + + hgcd->size = hgcd_mul (hgcd->row + 2, hgcd->alloc, + R.row + 1, R.size, + hgcd->row, hgcd->size, + tp, talloc); + hgcd->sign ^= ~R.sign; + + if (MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize, + hgcd->row[2].rp, hgcd->row[2].rsize)) + { + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); + + HGCD_SWAP4_2 (hgcd->row); + } + else + { + /* r2 was too large, i.e. q0 too small. In this case we + must have r2 % r1 <= r2 - r1 smaller than M + m + 1. */ + + hgcd->size = hgcd_adjust (hgcd->row + 2, hgcd->size, quotients); + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); + + ASSERT (hgcd->row[3].rsize <= M + m + 1); + + if (hgcd->row[3].rsize <= M) + { + /* Backup two steps */ + ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); + + return hgcd_small_2 (hgcd, M, quotients); + } + + HGCD_SWAP4_2 (hgcd->row); + + /* Loop always terminates here. */ + break; + } + } + else if (res == 3) + { + qstack_drop(quotients); + + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); + + /* Store new values in rows 2 and 3, to avoid overlap */ + hgcd->row[2].rsize + = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc, + ~R.sign, R.size, &R.row[1], + hgcd->row[0].rp, hgcd->row[1].rp, + tp, talloc); + + hgcd->row[3].rsize + = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc, + R.sign, R.size, &R.row[2], + hgcd->row[0].rp, hgcd->row[1].rp, + tp, talloc); + + ASSERT (hgcd->row[2].rsize > M); + ASSERT (hgcd->row[3].rsize > M); + + hgcd->size = hgcd_mul (hgcd->row + 2, hgcd->alloc, + R.row + 1, R.size, + hgcd->row, hgcd->size, + tp, talloc); + hgcd->sign ^= ~R.sign; + + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); + + HGCD_SWAP4_2 (hgcd->row); + } + else + { + ASSERT (res == 4); + + /* All of r0, r1, r3 and r3 are correct. + Compute r2 and r3 */ + + ASSERT_HGCD (&R, + hgcd->row[0].rp + M, hgcd->row[0].rsize - M, + hgcd->row[1].rp + M, hgcd->row[1].rsize - M, + 0, 4); + + /* Store new values in rows 2 and 3, to avoid overlap */ + hgcd->row[2].rsize + = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc, + R.sign, R.size, &R.row[2], + hgcd->row[0].rp, hgcd->row[1].rp, + tp, talloc); + + hgcd->row[3].rsize + = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc, + ~R.sign, R.size, &R.row[3], + hgcd->row[0].rp, hgcd->row[1].rp, + tp, talloc); + + ASSERT (hgcd->row[2].rsize > M); + ASSERT (hgcd->row[3].rsize <= M + m + 1); + + hgcd->size = hgcd_mul (hgcd->row+2, hgcd->alloc, + R.row+2, R.size, + hgcd->row, hgcd->size, + tp, talloc); + hgcd->sign ^= R.sign; + + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); + + if (hgcd->row[3].rsize <= M) + { + /* Backup two steps */ + /* Both steps must always be possible, but it's not + trivial to ASSERT that here. */ + ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); + + return hgcd_small_2 (hgcd, M, quotients); + } + HGCD_SWAP4_2 (hgcd->row); + + /* Always exit the loop. */ + break; + } + } + + ASSERT (hgcd->row[0].rsize >= hgcd->row[1].rsize); + ASSERT (hgcd->row[1].rsize > M); + ASSERT (hgcd->row[1].rsize <= M + m + 1); + + if (hgcd->row[0].rsize > M + m + 1) + { + /* One euclid step to reduce size. 
*/ + int res = euclid_step (hgcd, M, quotients); + + if (res > 0) + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); + if (res >= 0) + return res; + + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); + } + + ASSERT (hgcd->row[0].rsize >= hgcd->row[1].rsize); + ASSERT (hgcd->row[0].rsize <= M + m + 1); + ASSERT (hgcd->row[1].rsize > M); + + /* Second phase, reduce size until we have one number of size > M + and one of size <= M+1 */ + while (hgcd->row[1].rsize > M + 1) + { + mp_size_t k = 2*M - hgcd->row[0].rsize; + mp_size_t n1 = hgcd->row[0].rsize - k; + mp_size_t qsize; + mp_srcptr qp; + int res; + + ASSERT (k + (n1 + 1)/2 == M); + ASSERT (n1 >= 2); + + ASSERT (n1 <= 2*(m + 1)); + ASSERT (n1 <= n + 3); + + res = mpn_hgcd (&R, + hgcd->row[0].rp + k, hgcd->row[0].rsize - k, + hgcd->row[1].rp + k, hgcd->row[1].rsize - k, + quotients, tp, talloc); + + if (res == 0) + { + /* The first remainder was small. Then there's a good chance + that the remainder A % B is also small. */ + res = euclid_step (hgcd, M, quotients); + + if (res > 0) + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); + if (res >= 0) + return res; + + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); + continue; + } + + if (res == 1) + { + mp_srcptr qp; + mp_size_t qsize; + + qstack_drop (quotients); + + /* Compute possibly incorrect r2 and corresponding u2, v2. + Incorrect matrix elements must be smaller than the + correct ones, since they correspond to a too small q. */ + qsize = qstack_get_0 (quotients, &qp); + + ASSERT (qsize + hgcd->size <= hgcd->alloc); + hgcd_update_r (hgcd->row, qp, qsize); + hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, + qp, qsize); + ASSERT (hgcd->size < hgcd->alloc); + + if (!MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize, + hgcd->row[2].rp, hgcd->row[2].rsize)) + hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients); + + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 3); + + if (hgcd->row[2].rsize <= M) + { + /* Backup one steps */ + ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); + + return hgcd_small_1 (hgcd, M, quotients); + } + + HGCD_SWAP4_LEFT (hgcd->row); + hgcd->sign = ~hgcd->sign; + continue; + } + + /* Now r0 and r1 are always correct. */ + + /* It's possible that first two "new" r:s are the same as the + old ones. In that case skip recomputing them. */ + + if (!hgcd_start_row_p (&R.row[0], R.size)) + { + /* Store new values in rows 2 and 3, to avoid overlap */ + hgcd->row[2].rsize + = mpn_hgcd_fix (k, hgcd->row[2].rp, hgcd->row[0].rsize + 1, + R.sign, R.size, &R.row[0], + hgcd->row[0].rp, hgcd->row[1].rp, + tp, talloc); + + hgcd->row[3].rsize + = mpn_hgcd_fix (k, hgcd->row[3].rp, hgcd->row[1].rsize + 1, + ~R.sign, R.size, &R.row[1], + hgcd->row[0].rp, hgcd->row[1].rp, + tp, talloc); + + ASSERT (hgcd->row[2].rsize > M); + ASSERT (hgcd->row[3].rsize > k); + + hgcd->size = hgcd_mul (hgcd->row+2, hgcd->alloc, + R.row, R.size, hgcd->row, hgcd->size, + tp, talloc); + hgcd->sign ^= R.sign; + + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); + + if (hgcd->row[3].rsize <= M) + { + /* Backup two steps */ + + /* We don't use R.row[2] and R.row[3], so drop the + corresponding quotients. 
*/ + qstack_drop (quotients); + qstack_drop (quotients); + + return hgcd_small_2 (hgcd, M, quotients); + } + + HGCD_SWAP4_2 (hgcd->row); + + if (res == 2) + { + qstack_drop (quotients); + qstack_drop (quotients); + + continue; + } + } + + ASSERT (res >= 3); + + /* We already know the correct q */ + qsize = qstack_get_1 (quotients, &qp); + + ASSERT (qsize + hgcd->size <= hgcd->alloc); + hgcd_update_r (hgcd->row, qp, qsize); + hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, + qp, qsize); + ASSERT (hgcd->size < hgcd->alloc); + + ASSERT (hgcd->row[2].rsize > k); + if (hgcd->row[2].rsize <= M) + { + /* Discard r3 */ + qstack_drop (quotients); + return hgcd_small_1 (hgcd, M, quotients); + } + if (res == 3) + { + /* Drop quotient for r3 */ + qstack_drop (quotients); + hgcd->sign = ~hgcd->sign; + HGCD_SWAP4_LEFT (hgcd->row); + + continue; + } + + ASSERT (hgcd->row[2].rsize > M); + ASSERT (res == 4); + + /* We already know the correct q */ + qsize = qstack_get_0 (quotients, &qp); + + ASSERT (qsize + hgcd->size <= hgcd->alloc); + hgcd_update_r (hgcd->row + 1, qp, qsize); + hgcd->size = hgcd_update_uv (hgcd->row + 1, hgcd->size, + qp, qsize); + ASSERT (hgcd->size < hgcd->alloc); + ASSERT (hgcd->row[3].rsize <= M + 1); + + if (hgcd->row[3].rsize <= M) + { +#if WANT_ASSERT + qstack_rotate (quotients, 0); +#endif + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); + return hgcd_jebelean (hgcd, M); + } + + HGCD_SWAP4_2 (hgcd->row); + } + + ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); + + return hgcd_final (hgcd, M, quotients); +} diff -x *~ -N -u -r gmp-4.1.3/mpn/generic/hgcd2.c gmp-4.1.3-gcdext/mpn/generic/hgcd2.c --- gmp-4.1.3/mpn/generic/hgcd2.c Thu Jan 1 01:00:00 1970 +++ gmp-4.1.3-gcdext/mpn/generic/hgcd2.c Sun Jan 25 22:11:55 2004 @@ -0,0 +1,596 @@ +/* hgcd2.c + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#if GMP_NAIL_BITS == 0 + +/* Copied from mpn/generic/gcdext.c, and modified slightly to return + the remainder. */ +/* Two-limb division optimized for small quotients. 
*/ +static inline mp_limb_t +div2 (mp_ptr rp, + mp_limb_t nh, mp_limb_t nl, + mp_limb_t dh, mp_limb_t dl) +{ + mp_limb_t q = 0; + + if ((mp_limb_signed_t) nh < 0) + { + int cnt; + for (cnt = 1; (mp_limb_signed_t) dh >= 0; cnt++) + { + dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1)); + dl = dl << 1; + } + + while (cnt) + { + q <<= 1; + if (nh > dh || (nh == dh && nl >= dl)) + { + sub_ddmmss (nh, nl, nh, nl, dh, dl); + q |= 1; + } + dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1); + dh = dh >> 1; + cnt--; + } + } + else + { + int cnt; + for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++) + { + dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1)); + dl = dl << 1; + } + + while (cnt) + { + dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1); + dh = dh >> 1; + q <<= 1; + if (nh > dh || (nh == dh && nl >= dl)) + { + sub_ddmmss (nh, nl, nh, nl, dh, dl); + q |= 1; + } + cnt--; + } + } + + rp[0] = nl; + rp[1] = nh; + + return q; +} +#else /* GMP_NAIL_BITS != 0 */ +/* Two-limb division optimized for small quotients. Input words + include nails, which must be zero. */ +static inline mp_limb_t +div2 (mp_ptr rp, + mp_limb_t nh, mp_limb_t nl, + mp_limb_t dh, mp_limb_t dl) +{ + mp_limb_t q = 0; + int cnt; + + ASSERT_LIMB(nh); + ASSERT_LIMB(nl); + ASSERT_LIMB(dh); + ASSERT_LIMB(dl); + + /* FIXME: Always called with nh > 0 and dh >0. Then it should be + enough to look at the high limbs to select cnt. */ + for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++) + { + dh = (dh << 1) | (dl >> (GMP_NUMB_BITS - 1)); + dl = (dl << 1) & GMP_NUMB_MASK; + } + + while (cnt) + { + dl = (dh << (GMP_NUMB_BITS - 1)) | (dl >> 1); + dh = dh >> 1; + dl &= GMP_NUMB_MASK; + + q <<= 1; + if (nh > dh || (nh == dh && nl >= dl)) + { + /* FIXME: We could perhaps optimize this by unrolling the + loop 2^GMP_NUMB_BITS - 1 times? */ + nl -= dl; + nh -= dh; + nh -= (nl >> (GMP_LIMB_BITS - 1)); + nl &= GMP_NUMB_MASK; + + q |= 1; + } + cnt--; + } + ASSERT (nh < dh || (nh == dh && nl < dl)); + rp[0] = nl; + rp[1] = nh; + + return q; +} +#endif /* GMP_NAIL_BITS != 0 */ + +#define SUB_2(w1,w0, x1,x0, y1,y0) \ + do { \ + ASSERT_LIMB (x1); \ + ASSERT_LIMB (x0); \ + ASSERT_LIMB (y1); \ + ASSERT_LIMB (y0); \ + \ + if (GMP_NAIL_BITS == 0) \ + sub_ddmmss (w1,w0, x1,x0, y1,y0); \ + else \ + { \ + mp_limb_t __w0, __c; \ + SUBC_LIMB (__c, __w0, x0, y0); \ + (w1) = ((x1) - (y1) - __c) & GMP_NUMB_MASK; \ + (w0) = __w0; \ + } \ + } while (0) + +static inline void +qstack_push_0 (struct qstack *stack) +{ + ASSERT_QSTACK (stack); + + if (stack->size_next >= QSTACK_MAX_QUOTIENTS) + qstack_rotate (stack, 0); + + stack->size[stack->size_next++] = 0; +} + +static inline void +qstack_push_1 (struct qstack *stack, mp_limb_t q) +{ + ASSERT (q >= 2); + + ASSERT_QSTACK (stack); + + if (stack->limb_next >= stack->limb_alloc) + qstack_rotate (stack, 1); + + else if (stack->size_next >= QSTACK_MAX_QUOTIENTS) + qstack_rotate (stack, 0); + + stack->size[stack->size_next++] = 1; + stack->limb[stack->limb_next++] = q; + + ASSERT_QSTACK (stack); +} + +/* Produce r_k from r_i and r_j, and push the corresponding + quotient. 
*/ +#if __GMP_HAVE_TOKEN_PASTE +#define HGCD2_STEP(i, j, k) do { \ + SUB_2 (rh ## k, rl ## k, \ + rh ## i, rl ## i, \ + rh ## j, rl ## j); \ + \ + /* Could check here for the special case rh3 == 0, \ + but it's covered by the below condition as well */ \ + if ( rh ## k < rh ## j \ + || ( rh ## k == rh ## j \ + && rl ## k < rl ## j)) \ + { \ + /* Unit quotient */ \ + u ## k = u ## i + u ## j; \ + v ## k = v ## i + v ## j; \ + \ + if (quotients) \ + qstack_push_0 (quotients); \ + } \ + else \ + { \ + mp_limb_t r[2]; \ + mp_limb_t q = 1 + div2 (r, rh ## k, rl ## k, \ + rh ## j, rl ## j); \ + rl ## k = r[0]; rh ## k = r[1]; \ + u ## k = u ## i + q * u ## j; \ + v ## k = v ## i + q * v ## j; \ + \ + if (quotients) \ + qstack_push_1 (quotients, q); \ + } \ +} while (0) +#else /* ! __GMP_HAVE_TOKEN_PASTE */ +#define HGCD2_STEP(i, j, k) do { \ + SUB_2 (rh/**/k, rl/**/k, \ + rh/**/i, rl/**/i, \ + rh/**/j, rl/**/j); \ + \ + /* Could check here for the special case rh3 == 0, \ + but it's covered by the below condition as well */ \ + if ( rh/**/k < rh/**/j \ + || ( rh/**/k == rh/**/j \ + && rl/**/k < rl/**/j)) \ + { \ + /* Unit quotient */ \ + u/**/k = u/**/i + u/**/j; \ + v/**/k = v/**/i + v/**/j; \ + \ + if (quotients) \ + qstack_push_0 (quotients); \ + } \ + else \ + { \ + mp_limb_t r[2]; \ + mp_limb_t q = 1 + div2 (r, rh/**/k, rl/**/k, \ + rh/**/j, rl/**/j); \ + rl/**/k = r[0]; rh/**/k = r[1]; \ + u/**/k = u/**/i + q * u/**/j; \ + v/**/k = v/**/i + q * v/**/j; \ + \ + if (quotients) \ + qstack_push_1 (quotients, q); \ + } \ +} while (0) +#endif /* ! __GMP_HAVE_TOKEN_PASTE */ + +/* Repeatedly divides A by B, until the remainder is a single limb. + Stores cofactors in HGCD, and pushes the quotients on STACK (unless + STACK is NULL). On success, HGCD->row[0, 1, 2] correspond to + remainders that are larger than one limb, while HGCD->row[3] + correspond to a remainder that fit in a single limb. + + Return 0 on failure (if B or A mod B fits in a single limb). Return + 1 if r0 and r1 are correct, but we still make no progress because + r0 = A, r1 = B. + + Otherwise return 2, 3 or 4 depending on how many of the r:s that + satisfy Jebelean's criterion. */ +/* FIXME: There are two more micro optimizations that could be done to + this code: + + The div2 function starts with checking the most significant bit of + the numerator. When we call div2, that bit is know in advance for + all but the one or two first calls, so we could split div2 in two + functions, and call the right one. + + We could also have two versions of this code, with and without the + quotient argument, to avoid checking if it's NULL in the middle of + the loop. 
*/ + +int +mpn_hgcd2 (struct hgcd2 *hgcd, + mp_limb_t ah, mp_limb_t al, + mp_limb_t bh, mp_limb_t bl, + struct qstack *quotients) +{ + /* For all divisions, we special case q = 1, which accounts for + approximately 41% of the quotients for random numbers (Knuth, + TAOCP 4.5.3) */ + + /* Use scalar variables */ + mp_limb_t rh1, rl1, u1, v1; + mp_limb_t rh2, rl2, u2, v2; + mp_limb_t rh3, rl3, u3, v3; + + ASSERT_LIMB(ah); + ASSERT_LIMB(al); + ASSERT_LIMB(bh); + ASSERT_LIMB(bl); + ASSERT (ah > bh || (ah == bh && al >= bl)); + + if (bh == 0) + return 0; + + { + mp_limb_t rh0, rl0, u0, v0; + + /* Initialize first two rows */ + rh0 = ah; rl0 = al; u0 = 1; v0 = 0; + rh1 = bh; rl1 = bl; u1 = 0; v1 = 1; + + SUB_2 (rh2, rl2, rh0, rl0, rh1, rl1); + + if (rh2 == 0) + return 0; + + if (rh2 < rh1 || (rh2 == rh1 && rl2 < rl1)) + { + /* Unit quotient */ + v2 = 1; + + if (quotients) + qstack_push_0 (quotients); + } + else + { + mp_limb_t r[2]; + mp_limb_t q = 1 + div2 (r, rh2, rl2, rh1, rl1); + + rl2 = r[0]; rh2 = r[1]; + + if (rh2 == 0) + return 0; + + v2 = q; + + if (quotients) + qstack_push_1 (quotients, q); + } + + u2 = 1; + + /* The simple version of the loop is as follows: + | + | hgcd->sign = 0; + | for (;;) + | { + | (q, rh3, rl3]) = divmod (r1, r2); + | u[3] = u1 + q * u2; + | v[3] = v1 + q * v2; + | qstack_push_1 (quotients, q); + | + | if (rh3 == 0) + | break; + | + | HGCD2_SHIFT4_LEFT (hgcd->row); + | hgcd->sign = ~hgcd->sign; + | } + | + | But then we special case for q = 1, and unroll the loop four times + | to avoid data movement. */ + + for (;;) + { + HGCD2_STEP (1, 2, 3); + if (rh3 == 0) + { + hgcd->row[0].u = u0; hgcd->row[0].v = v0; + + hgcd->sign = 0; + + break; + } + HGCD2_STEP (2, 3, 0); + if (rh0 == 0) + { + hgcd->row[0].u = u1; hgcd->row[0].v = v1; + + rh1 = rh2; rl1 = rl2; u1 = u2; v1 = v2; + rh2 = rh3; rl2 = rl3; u2 = u3; v2 = v3; + rh3 = rh0; rl3 = rl0; u3 = u0; v3 = v0; + + hgcd->sign = -1; + break; + } + + HGCD2_STEP (3, 0, 1); + if (rh1 == 0) + { + hgcd->row[0].u = u2; hgcd->row[0].v = v2; + rh2 = rh0; rl2 = rl0; u2 = u0; v2 = v0; + + MP_LIMB_T_SWAP (rh1, rh3); MP_LIMB_T_SWAP (rl1, rl3); + MP_LIMB_T_SWAP ( u1, u3); MP_LIMB_T_SWAP ( v1, v3); + + hgcd->sign = 0; + break; + } + + HGCD2_STEP (0, 1, 2); + if (rh2 == 0) + { + hgcd->row[0].u = u3; hgcd->row[0].v = v3; + + rh3 = rh2; rl3 = rl2; u3 = u2; v3 = v2; + rh2 = rh1; rl2 = rl1; u2 = u1; v2 = v1; + rh1 = rh0; rl1 = rl0; u1 = u0; v1 = v0; + + hgcd->sign = -1; + break; + } + } + } + + ASSERT (rh1 != 0); + ASSERT (rh2 != 0); + ASSERT (rh3 == 0); + ASSERT (rh1 > rh2 || (rh1 == rh2 && rl1 > rl2)); + ASSERT (rh2 > rh3 || (rh2 == rh3 && rl2 > rl3)); + + /* Coefficients to be returned */ + hgcd->row[1].u = u1; hgcd->row[1].v = v1; + hgcd->row[2].u = u2; hgcd->row[2].v = v2; + hgcd->row[3].u = u3; hgcd->row[3].v = v3; + + /* Rows 1, 2 and 3 are used below, rh0, rl0, u0 and v0 are not. 
*/ +#if GMP_NAIL_BITS == 0 + { + mp_limb_t sh; + mp_limb_t sl; + mp_limb_t th; + mp_limb_t tl; + + /* Check r2 */ + /* We always have r2 > u2, v2 */ + + if (hgcd->sign >= 0) + { + /* Check if r1 - r2 >= u2 - u1 = |u2| + |u1| */ + sl = u2 + u1; + sh = (sl < u1); + } + else + { + /* Check if r1 - r2 >= v2 - v1 = |v2| + |v1| */ + sl = v2 + v1; + sh = (sl < v1); + } + + sub_ddmmss (th, tl, rh1, rl1, rh2, rl2); + + if (th < sh || (th == sh && tl < sl)) + return 2 - (hgcd->row[0].v == 0); + + /* Check r3 */ + + if (hgcd->sign >= 0) + { + /* Check r3 >= max (-u3, -v3) = |u3| */ + if (rl3 < u3) + return 3; + + /* Check r3 - r2 >= v3 - v2 = |v2| + |v1|*/ + sl = v3 + v2; + sh = (sl < v2); + } + else + { + /* Check r3 >= max (-u3, -v3) = |v3| */ + if (rl3 < v3) + return 3; + + /* Check r3 - r2 >= u3 - u2 = |u2| + |u1| */ + sl = u3 + u2; + sh = (sl < u2); + } + + sub_ddmmss (th, tl, rh2, rl2, 0, rl3); + + if (th < sh || (th == sh && tl < sl)) + return 3; + + return 4; + } +#else /* GMP_NAIL_BITS > 0 */ + { + mp_limb_t sl; + mp_limb_t th; + mp_limb_t tl; + + /* Check r2 */ + /* We always have r2 > u2, v2 */ + + if (hgcd->sign >= 0) + { + /* Check if r1 - r2 >= u2 - u1 = |u2| + |u1| */ + sl = u2 + u1; + } + else + { + /* Check if r1 - r2 >= v2 - v1 = |v2| + |v1| */ + sl = v2 + v1; + } + + tl = rl1 - rl2; + th = rh1 - rh2 - (tl >> (GMP_LIMB_BITS - 1)); + ASSERT_LIMB(th); + + if (th < (CNST_LIMB(1) << GMP_NAIL_BITS) + && ((th << GMP_NUMB_BITS) | (tl & GMP_NUMB_MASK)) < sl) + return 2 - (hgcd->row[0].v == 0); + + /* Check r3 */ + + if (hgcd->sign >= 0) + { + /* Check r3 >= max (-u3, -v3) = |u3| */ + if (rl3 < u3) + return 3; + + /* Check r3 - r2 >= v3 - v2 = |v2| + |v1|*/ + sl = v3 + v2; + } + else + { + /* Check r3 >= max (-u3, -v3) = |v3| */ + if (rl3 < v3) + return 3; + + /* Check r3 - r2 >= u3 - u2 = |u2| + |u1| */ + sl = u3 + u2; + } + + tl = rl2 - rl3; + th = rh2 - (tl >> (GMP_LIMB_BITS - 1)); + ASSERT_LIMB(th); + + if (th < (CNST_LIMB(1) << GMP_NAIL_BITS) + && ((th << GMP_NUMB_BITS) | (tl & GMP_NUMB_MASK)) < sl) + return 3; + + return 4; + } +#endif /* GMP_NAIL_BITS > 0 */ +} + +mp_size_t +mpn_hgcd2_fix (mp_ptr rp, mp_size_t ralloc, + int sign, + mp_limb_t u, mp_srcptr ap, mp_size_t asize, + mp_limb_t v, mp_srcptr bp, mp_size_t bsize) +{ + mp_size_t rsize; + mp_limb_t cy; + + ASSERT_LIMB(u); + ASSERT_LIMB(v); + + if (sign < 0) + { + MP_LIMB_T_SWAP (u,v); + MPN_SRCPTR_SWAP (ap, asize, bp, bsize); + } + + ASSERT (u > 0); + + ASSERT (asize <= ralloc); + rsize = asize; + cy = mpn_mul_1 (rp, ap, asize, u); + if (cy) + { + ASSERT (rsize < ralloc); + rp[rsize++] = cy; + } + + if (v > 0) + { + ASSERT (bsize <= rsize); + cy = mpn_submul_1 (rp, bp, bsize, v); + if (cy) + { + ASSERT (bsize < rsize); + ASSERT_NOCARRY (mpn_sub_1 (rp + bsize, + rp + bsize, rsize - bsize, cy)); + } + + MPN_NORMALIZE (rp, rsize); + } + return rsize; +} + +#undef HGCD2_STEP diff -x *~ -N -u -r gmp-4.1.3/mpn/generic/qstack.c gmp-4.1.3-gcdext/mpn/generic/qstack.c --- gmp-4.1.3/mpn/generic/qstack.c Thu Jan 1 01:00:00 1970 +++ gmp-4.1.3-gcdext/mpn/generic/qstack.c Thu Dec 18 20:29:09 2003 @@ -0,0 +1,142 @@ +/* qstack.c + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_size_t +qstack_itch (mp_size_t size) +{ + /* Limit on the recursion depth */ + + unsigned k = mpn_hgcd_max_recursion (size); + + ASSERT (2 * k < QSTACK_MAX_QUOTIENTS); + + return 2 * size + 12 * (k+1); +} + +void +qstack_reset (struct qstack *stack, + mp_size_t asize) +{ + /* Limit on the recursion depth */ + unsigned k = mpn_hgcd_max_recursion (asize); + + stack->size_next = 0; + stack->limb_next= 0; + stack->nkeep = 2 * (k + 1); + ASSERT (stack->nkeep < QSTACK_MAX_QUOTIENTS); + + ASSERT_QSTACK (stack); +} + +void +qstack_init (struct qstack *stack, + mp_size_t asize, + mp_limb_t *limbs, mp_size_t alloc) +{ + stack->limb = limbs; + stack->limb_alloc = alloc; + /* Should depend on input size, we need 2 * recursion depth */ + stack->nkeep = QSTACK_MAX_QUOTIENTS - 1; + + qstack_reset (stack, asize); +} + +/* Drop all but the nkeep latest quotients. Drop additional quotients + if needed to reclain at least SIZE limbs of storage. */ +void +qstack_rotate (struct qstack *stack, + mp_size_t size) +{ + unsigned dropped_sizes; + unsigned kept; + unsigned i; + + mp_size_t dropped_limbs; + + ASSERT_QSTACK (stack); + + if (stack->size_next > stack->nkeep) + dropped_sizes = stack->size_next - stack->nkeep; + else + dropped_sizes = 0; + + for (i = 0, dropped_limbs = 0; i < dropped_sizes; i++) + dropped_limbs += stack->size[i]; + + for (; dropped_limbs < size; dropped_sizes++) + { + ASSERT (dropped_sizes < stack->size_next); + dropped_limbs += stack->size[dropped_sizes]; + } + + ASSERT (dropped_limbs <= stack->limb_next); + + kept = stack->size_next - dropped_sizes; + + if (dropped_sizes) + /* memmove isn't portable */ + for (i = 0; i < kept; i++) + stack->size[i] = stack->size[i + dropped_sizes]; + + stack->size_next = kept; + + if (dropped_limbs) + { + if (dropped_limbs < stack->limb_next) + { + MPN_COPY_INCR (stack->limb, stack->limb + dropped_limbs, + stack->limb_next - dropped_limbs); + ASSERT (dropped_limbs <= stack->limb_next); + stack->limb_next -= dropped_limbs; + } + else + stack->limb_next = 0; + } + ASSERT_QSTACK (stack); +} + +#if WANT_ASSERT +void +__gmpn_qstack_sanity (struct qstack *stack) +{ + mp_size_t next; + unsigned i; + + for (i = 0, next = 0; i < stack->size_next; i++) + { + mp_size_t qsize = stack->size[i]; + ASSERT (next <= stack->limb_alloc); + + ASSERT (qsize == 0 || stack->limb[next+qsize - 1] != 0); + next += qsize; + } + + ASSERT (next == stack->limb_next); +} +#endif diff -x *~ -N -u -r gmp-4.1.3/tests/mpz/t-gcd.c gmp-4.1.3-gcdext/tests/mpz/t-gcd.c --- gmp-4.1.3/tests/mpz/t-gcd.c Thu Apr 22 19:03:04 2004 +++ gmp-4.1.3-gcdext/tests/mpz/t-gcd.c Tue Sep 14 19:39:21 2004 @@ -1,7 +1,6 @@ -/* Test mpz_gcd, mpz_gcdext, mpz_mul, mpz_tdiv_r, mpz_add, mpz_cmp, - 
mpz_cmp_ui, mpz_init_set, mpz_set, mpz_clear. +/* Test mpz_gcd, mpz_gcdext, and mpz_gcd_ui. -Copyright 1991, 1993, 1994, 1996, 1997, 2000, 2001, 2002, 2004 Free Software +Copyright 1991, 1993, 1994, 1996, 1997, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -28,8 +27,10 @@ #include "gmp-impl.h" #include "tests.h" -void dump_abort _PROTO ((int, mpz_t, mpz_t)); -void debug_mp _PROTO ((mpz_t, int)); +void one_test __GMP_PROTO ((mpz_t, mpz_t, mpz_t, int)); +void debug_mp __GMP_PROTO ((mpz_t, int)); + +static int gcdext_valid_p __GMP_PROTO ((const mpz_t a, const mpz_t b, const mpz_t g, const mpz_t s)); void check_data (void) @@ -79,15 +80,27 @@ mpz_clear (want); } +/* Keep one_test's variables global, so that we don't need + to reinitialize them for each test. */ +mpz_t gcd1, gcd2, s, t, temp1, temp2; + +/* Define this to make all operands be large enough for Schoenhage gcd + to be used. */ +#ifndef WHACK_SCHOENHAGE +#define WHACK_SCHOENHAGE 0 +#endif + +#if WHACK_SCHOENHAGE +#define MIN_OPERAND_BITSIZE (GCD_SCHOENHAGE_THRESHOLD * GMP_NUMB_BITS) +#else +#define MIN_OPERAND_BITSIZE 1 +#endif + int main (int argc, char **argv) { - mpz_t op1, op2, x; - mpz_t gcd, gcd2, s, t; - mpz_t temp1, temp2; - mp_size_t op1_size, op2_size, x_size; - int i; - int reps = 2000; + mpz_t op1, op2, ref; + int i, j, chain_len; gmp_randstate_ptr rands; mpz_t bs; unsigned long bsi, size_range; @@ -98,36 +111,31 @@ check_data (); mpz_init (bs); - - if (argc == 2) - reps = atoi (argv[1]); - mpz_init (op1); mpz_init (op2); - mpz_init (x); - mpz_init (gcd); + mpz_init (ref); + mpz_init (gcd1); mpz_init (gcd2); mpz_init (temp1); mpz_init (temp2); mpz_init (s); mpz_init (t); - for (i = 0; i < reps; i++) + for (i = 0; i < 50; i++) { - mpz_urandomb (bs, rands, 32); - size_range = mpz_get_ui (bs) % 12 + 2; /* 0..8191 bit operands */ + /* Generate plain operands with unknown gcd. These types of operands + have proven to trigger certain bugs in development versions of the + gcd code. The "hgcd->row[3].rsize > M" ASSERT is not triggered by + the division chain code below, but that is most likely just a result + of that other ASSERTs are triggered before it. */ - mpz_urandomb (bs, rands, size_range); - op1_size = mpz_get_ui (bs); - mpz_rrandomb (op1, rands, op1_size); + mpz_urandomb (bs, rands, 32); + size_range = mpz_get_ui (bs) % 13 + 2; mpz_urandomb (bs, rands, size_range); - op2_size = mpz_get_ui (bs); - mpz_rrandomb (op2, rands, op2_size); - + mpz_urandomb (op1, rands, mpz_get_ui (bs) + MIN_OPERAND_BITSIZE); mpz_urandomb (bs, rands, size_range); - x_size = mpz_get_ui (bs); - mpz_rrandomb (x, rands, x_size); + mpz_urandomb (op2, rands, mpz_get_ui (bs) + MIN_OPERAND_BITSIZE); mpz_urandomb (bs, rands, 2); bsi = mpz_get_ui (bs); @@ -136,44 +144,57 @@ if ((bsi & 2) != 0) mpz_neg (op2, op2); - /* printf ("%ld %ld\n", SIZ (op1), SIZ (op2)); */ + one_test (op1, op2, NULL, i); - mpz_mul (op1, op1, x); - mpz_mul (op2, op2, x); + /* Generate a division chain backwards, allowing otherwise unlikely huge + quotients. */ - mpz_gcd (gcd, op1, op2); - /* We know GCD will be at least X, since we multiplied both operands - with it. 
*/ - if (mpz_cmp (gcd, x) < 0 && mpz_sgn (op1) != 0 && mpz_sgn (op2) != 0) - dump_abort (i, op1, op2); + mpz_set_ui (op1, 0); + mpz_urandomb (bs, rands, 32); + mpz_urandomb (bs, rands, mpz_get_ui (bs) % 16 + 1); + mpz_rrandomb (op2, rands, mpz_get_ui (bs)); + mpz_add_ui (op2, op2, 1); + mpz_set (ref, op2); + +#if WHACK_SCHOENHAGE + chain_len = 1000000; +#else + mpz_urandomb (bs, rands, 32); + chain_len = mpz_get_ui (bs) % (GMP_NUMB_BITS * GCD_SCHOENHAGE_THRESHOLD / 256); +#endif - if (mpz_fits_ulong_p (op2)) + for (j = 0; j < chain_len; j++) { - mpz_gcd_ui (gcd2, op1, mpz_get_ui (op2)); - if (mpz_cmp (gcd, gcd2)) - dump_abort (i, op1, op2); + mpz_urandomb (bs, rands, 32); + mpz_urandomb (bs, rands, mpz_get_ui (bs) % 12 + 1); + mpz_rrandomb (temp2, rands, mpz_get_ui (bs) + 1); + mpz_add_ui (temp2, temp2, 1); + mpz_mul (temp1, op2, temp2); + mpz_add (op1, op1, temp1); + + /* Don't generate overly huge operands. */ + if (SIZ (op1) > 3 * GCD_SCHOENHAGE_THRESHOLD) + break; + + mpz_urandomb (bs, rands, 32); + mpz_urandomb (bs, rands, mpz_get_ui (bs) % 12 + 1); + mpz_rrandomb (temp2, rands, mpz_get_ui (bs) + 1); + mpz_add_ui (temp2, temp2, 1); + mpz_mul (temp1, op1, temp2); + mpz_add (op2, op2, temp1); + + /* Don't generate overly huge operands. */ + if (SIZ (op2) > 3 * GCD_SCHOENHAGE_THRESHOLD) + break; } - - mpz_gcdext (gcd2, s, t, op1, op2); - if (mpz_cmp (gcd, gcd2)) - dump_abort (i, op1, op2); - - mpz_gcdext (gcd2, s, NULL, op1, op2); - if (mpz_cmp (gcd, gcd2)) - dump_abort (i, op1, op2); - - mpz_mul (temp1, s, op1); - mpz_mul (temp2, t, op2); - mpz_add (gcd2, temp1, temp2); - if (mpz_cmp (gcd, gcd2)) - dump_abort (i, op1, op2); + one_test (op1, op2, ref, i); } mpz_clear (bs); mpz_clear (op1); mpz_clear (op2); - mpz_clear (x); - mpz_clear (gcd); + mpz_clear (ref); + mpz_clear (gcd1); mpz_clear (gcd2); mpz_clear (temp1); mpz_clear (temp2); @@ -185,16 +206,129 @@ } void -dump_abort (int testnr, mpz_t op1, mpz_t op2) +debug_mp (mpz_t x, int base) { - fprintf (stderr, "ERROR in test %d\n", testnr); - fprintf (stderr, "op1 = "); debug_mp (op1, -16); - fprintf (stderr, "op2 = "); debug_mp (op2, -16); - abort(); + mpz_out_str (stderr, base, x); fputc ('\n', stderr); } void -debug_mp (mpz_t x, int base) +one_test (mpz_t op1, mpz_t op2, mpz_t ref, int i) { - mpz_out_str (stderr, base, x); fputc ('\n', stderr); + /* + printf ("%ld %ld %ld\n", SIZ (op1), SIZ (op2), SIZ (ref)); + fflush (stdout); + */ + + /* + fprintf (stderr, "op1="); debug_mp (op1, -16); + fprintf (stderr, "op2="); debug_mp (op2, -16); + */ + + mpz_gcdext (gcd1, s, NULL, op1, op2); + + if (ref && mpz_cmp (ref, gcd1) != 0) + { + fprintf (stderr, "ERROR in test %d\n", i); + fprintf (stderr, "mpz_gcdext returned incorrect result\n"); + fprintf (stderr, "op1="); debug_mp (op1, -16); + fprintf (stderr, "op2="); debug_mp (op2, -16); + fprintf (stderr, "expected result:\n"); debug_mp (ref, -16); + fprintf (stderr, "mpz_gcdext returns:\n");debug_mp (gcd1, -16); + abort (); + } + + if (!gcdext_valid_p(op1, op2, gcd1, s)) + { + fprintf (stderr, "ERROR in test %d\n", i); + fprintf (stderr, "mpz_gcdext returned invalid result\n"); + fprintf (stderr, "op1="); debug_mp (op1, -16); + fprintf (stderr, "op2="); debug_mp (op2, -16); + fprintf (stderr, "mpz_gcdext returns:\n");debug_mp (gcd1, -16); + abort (); + } + + mpz_gcd (gcd2, op1, op2); + if (mpz_cmp (gcd2, gcd1) != 0) + { + fprintf (stderr, "ERROR in test %d\n", i); + fprintf (stderr, "mpz_gcd returned incorrect result\n"); + fprintf (stderr, "op1="); debug_mp (op1, -16); + fprintf (stderr, 
"op2="); debug_mp (op2, -16); + fprintf (stderr, "expected result:\n"); debug_mp (gcd1, -16); + fprintf (stderr, "mpz_gcd returns:\n"); debug_mp (gcd2, -16); + abort (); + } + + /* This should probably move to t-gcd_ui.c */ + if (mpz_fits_ulong_p (op1) || mpz_fits_ulong_p (op2)) + { + if (mpz_fits_ulong_p (op1)) + mpz_gcd_ui (gcd2, op2, mpz_get_ui (op1)); + else + mpz_gcd_ui (gcd2, op1, mpz_get_ui (op2)); + if (mpz_cmp (gcd2, gcd1)) + { + fprintf (stderr, "ERROR in test %d\n", i); + fprintf (stderr, "mpz_gcd_ui returned incorrect result\n"); + fprintf (stderr, "op1="); debug_mp (op1, -16); + fprintf (stderr, "op2="); debug_mp (op2, -16); + fprintf (stderr, "expected result:\n"); debug_mp (gcd1, -16); + fprintf (stderr, "mpz_gcd_ui returns:\n"); debug_mp (gcd2, -16); + abort (); + } + } + + mpz_gcdext (gcd2, temp1, temp2, op1, op2); + mpz_mul (temp1, temp1, op1); + mpz_mul (temp2, temp2, op2); + mpz_add (temp1, temp1, temp2); + + if (mpz_cmp (gcd1, gcd2) != 0 + || mpz_cmp (gcd2, temp1) != 0) + { + fprintf (stderr, "ERROR in test %d\n", i); + fprintf (stderr, "mpz_gcdext returned incorrect result\n"); + fprintf (stderr, "op1="); debug_mp (op1, -16); + fprintf (stderr, "op2="); debug_mp (op2, -16); + fprintf (stderr, "expected result:\n"); debug_mp (gcd1, -16); + fprintf (stderr, "mpz_gcdext returns:\n");debug_mp (gcd2, -16); + abort (); + } +} + +/* Called when g is supposed to be gcd(a,b), and g = s a + t b, for some t. + Uses temp1 and temp2 */ +static int +gcdext_valid_p (const mpz_t a, const mpz_t b, const mpz_t g, const mpz_t s) +{ + /* It's not clear that gcd(0,0) is well defined, but we allow it and require that + allow gcd(0,0) = 0. */ + if (mpz_sgn (g) < 0) + return 0; + + if (mpz_sgn (a) == 0) + { + /* Must have g == abs (b). Any value for s is in some sense "correct", + but it makes sense to require that s == 0. */ + return mpz_cmpabs (g, b) == 0 && mpz_sgn (s) == 0; + } + else if (mpz_sgn (b) == 0) + { + /* Must have g == abs (a), s == sign (a) */ + return mpz_cmpabs (g, a) == 0 && mpz_cmp_si (s, mpz_sgn (a)) == 0; + } + + if (mpz_sgn (g) <= 0) + return 0; + + if (! (mpz_divisible_p (a, g) + && mpz_divisible_p (b, g) + && mpz_cmpabs (s, b) <= 0)) + return 0; + + mpz_mul(temp1, s, a); + mpz_sub(temp1, g, temp1); + mpz_tdiv_qr(temp1, temp2, temp1, b); + + return mpz_sgn (temp2) == 0 && mpz_cmpabs (temp1, a) <= 0; }