diff -r 73a7d81c4142 ChangeLog --- a/ChangeLog Thu Oct 29 14:29:40 2009 +0100 +++ b/ChangeLog Thu Oct 29 14:41:29 2009 +0100 @@ -9,6 +9,14 @@ 2009-10-29 Torbjorn Granlund + + * mpn/generic/toom_eval_pm2.c: New file. + * mpn/generic/toom_eval_pm1.c: New file. + + * mpn/generic/toom52_mul.c (mpn_toom52_mul): Use mpn_toom_eval_pm1 + and mpn_toom_eval_pm2. + * mpn/generic/toom42_mul.c (mpn_toom42_mul): Use + mpn_toom_eval_dgr3_pm1. * tune/speed.c: Added support for mpn_toom4_sqr, @@ -51,6 +59,13 @@ 2009-10-28 Niels Möller + + * mpn/generic/toom43_mul.c (mpn_toom43_mul): Use + mpn_toom_eval_dgr3_pm1. + * mpn/generic/toom53_mul.c: Use new evaluation points. + * mpn/generic/toom62_mul.c: Likewise. 2009-10-26 Torbjorn Granlund @@ -121,6 +136,14 @@ 2009-10-22 Torbjorn Granlund + * mpn/generic/toom_eval_dgr3_pm2.c (mpn_toom_eval_dgr3_pm2): + Untested fixes for cases of native mpn_addlsh_n and + mpn_add_n_sub_n. + + * mpn/generic/toom43_mul.c (mpn_toom43_mul): Check sign from mpn_toom_eval_dgr3_pm2. + +2009-10-22 Niels Möller + * tests/mpn/t-toom44.c: New test program. * tests/mpn/t-toom33.c: New test program. @@ -167,6 +190,27 @@ 2009-10-20 Niels Möller + + * mpn/generic/toom32_mul.c: New toom32 function (for now, + mpn_toom32_xmul). + + * mpn/generic/toom_interpolate_7pts.c: Changed interpolation + points, to use 2, -2, 1/2 rather than 1/2, -1/2 and 2. + + * mpn/generic/toom43_mul.c: Use toom_eval_dgr3_pm2, and changed + evaluation points. + * mpn/generic/toom4_sqr.c: Likewise. + + * mpn/generic/toom43_mul.c (mpn_toom43_mul): Use + mpn_toom_eval_dgr3_pm2. + + * gmp-impl.h: Added prototype for mpn_toom_eval_dgr3_pm2. + + * mpn/generic/toom_eval_dgr3_pm2.c: New file. + + * configure.in (gmp_mpn_functions): Added toom_eval_dgr3_pm2. 2009-10-14 Niels Möller diff -r 73a7d81c4142 configure.in --- a/configure.in Thu Oct 29 14:29:40 2009 +0100 +++ b/configure.in Thu Oct 29 14:41:30 2009 +0100 @@ -2448,7 +2448,7 @@ gmp_mpn_functions="$extra_functions toom33_mul toom43_mul toom53_mul \ toom44_mul \ toom2_sqr toom3_sqr toom4_sqr \ - toom_eval_dgr3_pm1 \ + toom_eval_dgr3_pm1 toom_eval_dgr3_pm2 toom_eval_pm1 toom_eval_pm2 \ toom_interpolate_5pts toom_interpolate_6pts toom_interpolate_7pts \ invert binvert \ sbpi1_div_q sbpi1_div_qr sbpi1_divappr_q \ diff -r 73a7d81c4142 gmp-impl.h --- a/gmp-impl.h Thu Oct 29 14:29:40 2009 +0100 +++ b/gmp-impl.h Thu Oct 29 14:41:30 2009 +0100 @@ -1054,6 +1054,15 @@ __GMP_DECLSPEC void mpn_toom_interp #define mpn_toom_eval_dgr3_pm1 __MPN(toom_eval_dgr3_pm1) __GMP_DECLSPEC int mpn_toom_eval_dgr3_pm1 __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_size_t, mp_ptr)); +#define mpn_toom_eval_dgr3_pm2 __MPN(toom_eval_dgr3_pm2) +__GMP_DECLSPEC int mpn_toom_eval_dgr3_pm2 __GMP_PROTO ((mp_ptr xp1, mp_ptr xm1, mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp)); + +#define mpn_toom_eval_pm1 __MPN(toom_eval_pm1) +__GMP_DECLSPEC int mpn_toom_eval_pm1 __GMP_PROTO ((mp_ptr, mp_ptr, unsigned, mp_srcptr, mp_size_t, mp_size_t, mp_ptr)); + +#define mpn_toom_eval_pm2 __MPN(toom_eval_pm2) +__GMP_DECLSPEC int mpn_toom_eval_pm2 __GMP_PROTO ((mp_ptr xp1, mp_ptr xm1, unsigned, mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp)); + #define mpn_toom22_mul __MPN(toom22_mul) __GMP_DECLSPEC void mpn_toom22_mul __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr)); diff -r 73a7d81c4142 mpn/generic/toom42_mul.c --- a/mpn/generic/toom42_mul.c Thu Oct 29 14:29:40 2009 +0100 +++ b/mpn/generic/toom42_mul.c Thu Oct 29 14:41:30 2009 +0100 @@ -95,32 +95,7 @@ mpn_toom42_mul (mp_ptr pp, 
a1_a3 = pp + n + 1; /* Compute as1 and asm1. */ - a0_a2[n] = mpn_add_n (a0_a2, a0, a2, n); - a1_a3[n] = mpn_add (a1_a3, a1, n, a3, s); -#if HAVE_NATIVE_mpn_add_n_sub_n - if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0) - { - mpn_add_n_sub_n (as1, asm1, a1_a3, a0_a2, n + 1); - vm1_neg = 1; - } - else - { - mpn_add_n_sub_n (as1, asm1, a0_a2, a1_a3, n + 1); - vm1_neg = 0; - } -#else - mpn_add_n (as1, a0_a2, a1_a3, n + 1); - if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0) - { - mpn_sub_n (asm1, a1_a3, a0_a2, n + 1); - vm1_neg = 1; - } - else - { - mpn_sub_n (asm1, a0_a2, a1_a3, n + 1); - vm1_neg = 0; - } -#endif + vm1_neg = mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0_a2); /* Compute as2. */ #if HAVE_NATIVE_mpn_addlsh1_n diff -r 73a7d81c4142 mpn/generic/toom43_mul.c --- a/mpn/generic/toom43_mul.c Thu Oct 29 14:29:40 2009 +0100 +++ b/mpn/generic/toom43_mul.c Thu Oct 29 14:41:30 2009 +0100 @@ -101,35 +101,8 @@ mpn_toom43_mul (mp_ptr pp, #define b1d bsm1 /* Compute as2 and asm2. */ - cy = mpn_lshift (asm2, a3, s, 3); /* 8a3 */ - a1a3[n] = mpn_lshift (a1a3, a1, n, 1); /* 2a1 */ - cy += mpn_add_n (a1a3, a1a3, asm2, s); /* 8a3 +2a1 */ - MPN_INCR_U(a1a3 + s, n - s + 1, cy); - cy = mpn_lshift (asm2, a2, n, 2); /* 4a2 */ - a0a2[n] = cy + mpn_add_n (a0a2, a0, asm2, n); /* 4a2 + a0 */ - -#if HAVE_NATIVE_mpn_add_n_sub_n - if (mpn_cmp (a0a2, a1a3, n+1) < 0) - { - mpn_add_n_sub_n (as2, asm2, a1a3, a0a2, n+1); - flags ^= toom6_vm2_neg; - } - else - { - mpn_add_n_sub_n (as2, asm2, a0a2, a1a3, n+1); - } -#else - mpn_add_n (as2, a0a2, a1a3, n+1); - if (mpn_cmp (a0a2, a1a3, n+1) < 0) - { - mpn_sub_n (asm2, a1a3, a0a2, n+1); - flags ^= toom6_vm2_neg; - } - else - { - mpn_sub_n (asm2, a0a2, a1a3, n+1); - } -#endif + if (mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3)) + flags ^= toom6_vm2_neg; /* Compute bs2 and bsm2. */ b1d[n] = mpn_lshift (b1d, b1, n, 1); /* 2b1 */ @@ -163,30 +136,8 @@ mpn_toom43_mul (mp_ptr pp, #endif /* Compute as1 and asm1. */ - a0a2[n] = mpn_add_n (a0a2, a0, a2, n); - asm1[n] = mpn_add (asm1, a1, n, a3, s); -#if HAVE_NATIVE_mpn_add_n_sub_n - if (mpn_cmp (a0a2, asm1, n+1) < 0) - { - cy = mpn_add_n_sub_n (as1, asm1, asm1, a0a2, n+1); - flags ^= toom6_vm1_neg; - } - else - { - cy = mpn_add_n_sub_n (as1, asm1, a0a2, asm1, n+1); - } -#else - mpn_add_n (as1, a0a2, asm1, n+1); - if (mpn_cmp (a0a2, asm1, n+1) < 0) - { - mpn_sub_n (asm1, asm1, a0a2, n+1); - flags ^= toom6_vm1_neg; - } - else - { - mpn_sub_n (asm1, a0a2, asm1, n+1); - } -#endif + if (mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2)) + flags ^= toom6_vm1_neg; /* Compute bs1 and bsm1. */ bsm1[n] = mpn_add (bsm1, b0, n, b2, t); diff -r 73a7d81c4142 mpn/generic/toom44_mul.c --- a/mpn/generic/toom44_mul.c Thu Oct 29 14:29:40 2009 +0100 +++ b/mpn/generic/toom44_mul.c Thu Oct 29 14:41:30 2009 +0100 @@ -28,7 +28,7 @@ along with the GNU MP Library. If not, #include "gmp.h" #include "gmp-impl.h" -/* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf +/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf <-s--><--n--><--n--><--n--> ____ ______ ______ ______ @@ -40,8 +40,8 @@ along with the GNU MP Library. 
If not,
   v1   = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) # A(1)*B(1)      ah  <= 3   bh  <= 3
   vm1  = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) # A(-1)*B(-1)   |ah| <= 1  |bh| <= 1
   v2   = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) # A(2)*B(2)      ah  <= 14  bh  <= 14
+  vm2  = ( a0-2a1+4a2-8a3)*( b0-2b1+4b2-8b3) # A(-2)*B(-2)   |ah| <= 9  |bh| <= 9
   vh   = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) # A(1/2)*B(1/2)  ah  <= 14  bh  <= 14
-  vmh  = (8a0-4a1+2a2- a3)*(8b0-4b1+2b2- b3) # A(-1/2)*B(-1/2) -4<=ah<=9 -4<=bh<=9
   vinf=  a3 *  b3  # A(inf)*B(inf)
*/
@@ -76,14 +76,14 @@ along with the GNU MP Library.  If not,
 /* Use of scratch space. In the product area, we store
 
       ___________________
-     |vinf|____|_vh_|_v0_|
+     |vinf|____|_v1_|_v0_|
       s+t  2n-1 2n+1  2n
 
-   The other recursive products, v1, vm1, v2, vmh are stored in the
+   The other recursive products, vm1, v2, vm2, vh are stored in the
    scratch area. When computing them, we use the product area for
    intermediate values.
 
-   Next, we compute vh. We can store the intermediate factors at v0
+   Next, we compute v1. We can store the intermediate factors at v0
    and at vh + 2n + 2.
 
    Finally, for v0 and vinf, factors are parts of the input operands,
@@ -126,19 +126,19 @@ mpn_toom44_mul (mp_ptr pp,
   ASSERT (0 < t && t <= n);
   ASSERT (s >= t);
 
-  /* NOTE: The multiplications to v1, vm1, v2 and vmh overwrites the
+  /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
    * following limb, so these must be computed in order, and we need a
    * one limb gap to tp. */
 #define v0    pp				/* 2n */
-#define v1    scratch				/* 2n+1 */
-#define vm1   (scratch + 2 * n + 1)		/* 2n+1 */
-#define v2    (scratch + 4 * n + 2)		/* 2n+1 */
-#define vh    (pp + 2 * n)			/* 2n+1 */
-#define vmh   (scratch + 6 * n + 3)		/* 2n+1 */
+#define v1    (pp + 2 * n)			/* 2n+1 */
 #define vinf  (pp + 6 * n)			/* s+t */
+#define v2    scratch				/* 2n+1 */
+#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
+#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
+#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
 #define tp (scratch + 8*n + 5)
 
-  /* apx and bpx must not overlap with vh */
+  /* apx and bpx must not overlap with v1 */
 #define apx   pp				/* n+1 */
 #define amx   (pp + n + 1)			/* n+1 */
 #define bmx   (pp + 2*n + 2)			/* n+1 */
@@ -147,137 +147,77 @@ mpn_toom44_mul (mp_ptr pp,
   /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
      gives roughly 32 n/3 + log term. */
 
+  /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.  */
+  if (mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp))
+    flags = toom7_w1_neg;
+  else
+    flags = 0;
+
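mpn_toom_eval_dgr3_pm2 returns nonzero when the evaluation at -2 comes out negative, and vm2 = A(-2)*B(-2) must be negated during interpolation exactly when the two factors have opposite signs. That is why the call for A assigns toom7_w1_neg and the call for B (just below) xors it in. A toy model of this sign bookkeeping, in plain C ints rather than GMP code:

#include <assert.h>

/* Sign-combination model: flags records whether vm2 = A(-2)*B(-2)
   needs negating.  The assign-then-xor pattern leaves the flag bit
   set exactly when one of the two factors is negative. */
int
main (void)
{
  const int toom7_w1_neg = 1;	/* stand-in for the enum flag bit */
  int a_neg, b_neg;

  for (a_neg = 0; a_neg < 2; a_neg++)
    for (b_neg = 0; b_neg < 2; b_neg++)
      {
	int flags = a_neg ? toom7_w1_neg : 0;	/* from evaluating A at -2 */
	if (b_neg)
	  flags ^= toom7_w1_neg;		/* from evaluating B at -2 */
	assert ((flags & toom7_w1_neg) == (a_neg ^ b_neg));
      }
  return 0;
}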
+  /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3.  */
+  if (mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp))
+    flags ^= toom7_w1_neg;
+
+  TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp);	/* v2,  2n+1 limbs */
+  TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp);	/* vm2, 2n+1 limbs */
+
+  /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = ((2*a0 + a1) * 2 + a2) * 2 + a3 */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy = mpn_addlsh1_n (apx, a1, a0, n);
+  cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
+  if (s < n)
+    {
+      mp_limb_t cy2 = mpn_addlsh1_n (apx, a3, apx, s);
+      apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
+      MPN_INCR_U (apx + s, n+1-s, cy2);
+    }
+  else
+    apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
+#else
+  cy = mpn_lshift (apx, a0, n, 1);
+  cy += mpn_add_n (apx, apx, a1, n);
+  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
+  cy += mpn_add_n (apx, apx, a2, n);
+  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
+  apx[n] = cy + mpn_add (apx, apx, n, a3, s);
+#endif
+
+  /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = ((2*b0 + b1) * 2 + b2) * 2 + b3 */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy = mpn_addlsh1_n (bpx, b1, b0, n);
+  cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n);
+  if (t < n)
+    {
+      mp_limb_t cy2 = mpn_addlsh1_n (bpx, b3, bpx, t);
+      bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1);
+      MPN_INCR_U (bpx + t, n+1-t, cy2);
+    }
+  else
+    bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n);
+#else
+  cy = mpn_lshift (bpx, b0, n, 1);
+  cy += mpn_add_n (bpx, bpx, b1, n);
+  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
+  cy += mpn_add_n (bpx, bpx, b2, n);
+  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
+  bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t);
+#endif
+
+  ASSERT (apx[n] < 15);
+  ASSERT (bpx[n] < 15);
+
+  TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp);	/* vh, 2n+1 limbs */
+
   /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
   if (mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp))
-    flags = toom7_w3_neg;
-  else
-    flags = 0;
+    flags |= toom7_w3_neg;
 
   /* Compute bpx = b0 + b1 + b2 + b3 and bmx = b0 - b1 + b2 - b3.  */
   if (mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp))
     flags ^= toom7_w3_neg;
 
+  TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp);	/* vm1, 2n+1 limbs */
+  /* Clobbers amx, bmx.
*/ TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp); /* v1, 2n+1 limbs */ - TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp); /* vm1, 2n+1 limbs */ - - /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 = a0 + 2 (a1 + 2 (a2 + 2 a3)) */ -#if HAVE_NATIVE_mpn_addlsh1_n - cy = mpn_addlsh1_n (apx, a2, a3, s); - if (s != n) - cy = mpn_add_1 (apx + s, a2 + s, n - s, cy); - cy = 2 * cy + mpn_addlsh1_n (apx, a1, apx, n); - cy = 2 * cy + mpn_addlsh1_n (apx, a0, apx, n); -#else - cy = mpn_lshift (apx, a3, s, 1); - cy += mpn_add_n (apx, a2, apx, s); - if (s != n) - cy = mpn_add_1 (apx + s, a2 + s, n - s, cy); - cy = 2 * cy + mpn_lshift (apx, apx, n, 1); - cy += mpn_add_n (apx, a1, apx, n); - cy = 2 * cy + mpn_lshift (apx, apx, n, 1); - cy += mpn_add_n (apx, a0, apx, n); -#endif - apx[n] = cy; - - ASSERT (apx[n] <= 14); - - /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 = b0 + 2 (b1 + 2 (b2 + 2 b3)) */ -#if HAVE_NATIVE_mpn_addlsh1_n - cy = mpn_addlsh1_n (bpx, b2, b3, t); - if (t != n) - cy = mpn_add_1 (bpx + t, b2 + t, n - t, cy); - cy = 2 * cy + mpn_addlsh1_n (bpx, b1, bpx, n); - cy = 2 * cy + mpn_addlsh1_n (bpx, b0, bpx, n); -#else - cy = mpn_lshift (bpx, b3, t, 1); - cy += mpn_add_n (bpx, b2, bpx, t); - if (t != n) - cy = mpn_add_1 (bpx + t, b2 + t, n - t, cy); - cy = 2 * cy + mpn_lshift (bpx, bpx, n, 1); - cy += mpn_add_n (bpx, b1, bpx, n); - cy = 2 * cy + mpn_lshift (bpx, bpx, n, 1); - cy += mpn_add_n (bpx, b0, bpx, n); -#endif - bpx[n] = cy; - - ASSERT (bpx[n] <= 14); - - TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp); /* v2, 2n+1 limbs */ - - /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 and amx = 8 a0 - 4 a1 + 2 a2 - a3 */ - cy = mpn_lshift (apx, a0, n, 3); /* 8a0 */ -#if HAVE_NATIVE_mpn_addlsh1_n - apx[n] = cy + mpn_addlsh1_n (apx, apx, a2, n); /* 8a0 + 2a2 */ -#else - cy += mpn_lshift (tp, a2, n, 1); /* 2a2 */ - apx[n] = cy + mpn_add_n (apx, apx, tp, n); /* 8a0 + 2a2 */ -#endif - cy = mpn_lshift (tp, a1, n, 2); /* 4a1 */ - tp[n] = cy + mpn_add (tp, tp, n, a3, s); /* 4a1 + a3 */ -#if HAVE_NATIVE_mpn_add_n_sub_n - if (mpn_cmp (apx, tp, n + 1) < 0) - { - mpn_add_n_sub_n (apx, amx, tp, apx, n + 1); - flags |= toom7_w1_neg; - } - else - { - mpn_add_n_sub_n (apx, amx, apx, tp, n + 1); - } -#else - if (mpn_cmp (apx, tp, n + 1) < 0) - { - mpn_sub_n (amx, tp, apx, n + 1); - flags |= toom7_w1_neg; - } - else - { - mpn_sub_n (amx, apx, tp, n + 1); - } - mpn_add_n (apx, apx, tp, n + 1); -#endif - - ASSERT (apx[n] <= 14); - ASSERT (amx[n] <= 9); - - /* Compute apx = 8 b0 + 4 b1 + 2 b2 + b3 and bmx = 8 b0 - 4 b1 + 2 b2 - b3 */ - cy = mpn_lshift (bpx, b0, n, 3); /* 8b0 */ -#if HAVE_NATIVE_mpn_addlsh1_n - bpx[n] = cy + mpn_addlsh1_n (bpx, bpx, b2, n); /* 8b0 + 2b2 */ -#else - cy += mpn_lshift (tp, b2, n, 1); /* 2b2 */ - bpx[n] = cy + mpn_add_n (bpx, bpx, tp, n); /* 8b0 + 2b2 */ -#endif - cy = mpn_lshift (tp, b1, n, 2); /* 4a1 */ - tp[n] = cy + mpn_add (tp, tp, n, b3, t); /* 4a1 + b3 */ -#if HAVE_NATIVE_mpn_add_n_sub_n - if (mpn_cmp (bpx, tp, n + 1) < 0) - { - mpn_add_n_sub_n (bpx, bmx, tp, bpx, n + 1); - flags ^= toom7_w1_neg; - } - else - { - mpn_add_n_sub_n (bpx, bmx, bpx, tp, n + 1); - } -#else - if (mpn_cmp (bpx, tp, n + 1) < 0) - { - mpn_sub_n (bmx, tp, bpx, n + 1); - flags ^= toom7_w1_neg; - } - else - { - mpn_sub_n (bmx, bpx, tp, n + 1); - } - mpn_add_n (bpx, bpx, tp, n + 1); -#endif - - ASSERT (bpx[n] <= 14); - ASSERT (bmx[n] <= 9); - - TOOM44_MUL_N_REC (vmh, amx, bmx, n + 1, tp); /* vmh, 2n+1 limbs */ - TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp); /* vh, 2n+1 limbs */ TOOM44_MUL_N_REC (v0, a0, b0, n, tp); if (s > t) @@ -285,5 +225,5 @@ 
mpn_toom44_mul (mp_ptr pp,
   else
     TOOM44_MUL_N_REC (vinf, a3, b3, s, tp);	/* vinf, s+t limbs */
 
-  mpn_toom_interpolate_7pts (pp, n, flags, vmh, vm1, v1, v2, s + t, tp);
+  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp);
 }
diff -r 73a7d81c4142 mpn/generic/toom4_sqr.c
--- a/mpn/generic/toom4_sqr.c	Thu Oct 29 14:29:40 2009 +0100
+++ b/mpn/generic/toom4_sqr.c	Thu Oct 29 14:41:30 2009 +0100
@@ -89,24 +89,55 @@ mpn_toom4_sqr (mp_ptr pp,
 
   ASSERT (0 < s && s <= n);
 
-  /* NOTE: The multiplications to v1, vm1, v2 and vmh overwrites the
+  /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
    * following limb, so these must be computed in order, and we need a
    * one limb gap to tp. */
 #define v0    pp				/* 2n */
-#define v1    scratch				/* 2n+1 */
-#define vm1   (scratch + 2 * n + 1)		/* 2n+1 */
-#define v2    (scratch + 4 * n + 2)		/* 2n+1 */
-#define vh    (pp + 2 * n)			/* 2n+1 */
-#define vmh   (scratch + 6 * n + 3)		/* 2n+1 */
+#define v1    (pp + 2 * n)			/* 2n+1 */
 #define vinf  (pp + 6 * n)			/* s+t */
+#define v2    scratch				/* 2n+1 */
+#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
+#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
+#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
 #define tp (scratch + 8*n + 5)
 
-  /* apx must not overlap with vh */
+  /* No overlap with v1 */
 #define apx   pp				/* n+1 */
-#define amx   (pp + n + 1)			/* n+1 */
+#define amx   (pp + 4*n + 2)			/* n+1 */
 
   /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
      gives roughly 32 n/3 + log term. */
+
+  /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.  */
+  mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);
+
+  TOOM4_SQR_N_REC (v2, apx, n + 1, tp);	/* v2,  2n+1 limbs */
+  TOOM4_SQR_N_REC (vm2, amx, n + 1, tp);	/* vm2, 2n+1 limbs */
+
+  /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = ((2*a0 + a1) * 2 + a2) * 2 + a3 */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy = mpn_addlsh1_n (apx, a1, a0, n);
+  cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
+  if (s < n)
+    {
+      mp_limb_t cy2 = mpn_addlsh1_n (apx, a3, apx, s);
+      apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
+      MPN_INCR_U (apx + s, n+1-s, cy2);
+    }
+  else
+    apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
+#else
+  cy = mpn_lshift (apx, a0, n, 1);
+  cy += mpn_add_n (apx, apx, a1, n);
+  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
+  cy += mpn_add_n (apx, apx, a2, n);
+  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
+  apx[n] = cy + mpn_add (apx, apx, n, a3, s);
+#endif
+
+  ASSERT (apx[n] < 15);
+
+  TOOM4_SQR_N_REC (vh, apx, n + 1, tp);	/* vh, 2n+1 limbs */
 
   /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.
*/ mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp); @@ -114,61 +145,8 @@ mpn_toom4_sqr (mp_ptr pp, TOOM4_SQR_N_REC (v1, apx, n + 1, tp); /* v1, 2n+1 limbs */ TOOM4_SQR_N_REC (vm1, amx, n + 1, tp); /* vm1, 2n+1 limbs */ - /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 = a0 + 2 (a1 + 2 (a2 + 2 a3)) */ -#if HAVE_NATIVE_mpn_addlsh1_n - cy = mpn_addlsh1_n (apx, a2, a3, s); - if (s != n) - cy = mpn_add_1 (apx + s, a2 + s, n - s, cy); - cy = 2 * cy + mpn_addlsh1_n (apx, a1, apx, n); - cy = 2 * cy + mpn_addlsh1_n (apx, a0, apx, n); -#else - cy = mpn_lshift (apx, a3, s, 1); - cy += mpn_add_n (apx, a2, apx, s); - if (s != n) - cy = mpn_add_1 (apx + s, a2 + s, n - s, cy); - cy = 2 * cy + mpn_lshift (apx, apx, n, 1); - cy += mpn_add_n (apx, a1, apx, n); - cy = 2 * cy + mpn_lshift (apx, apx, n, 1); - cy += mpn_add_n (apx, a0, apx, n); -#endif - apx[n] = cy; - - ASSERT (apx[n] <= 14); - - TOOM4_SQR_N_REC (v2, apx, n + 1, tp); /* v2, 2n+1 limbs */ - - /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 and amx = 8 a0 - 4 a1 + 2 a2 - a3 */ - cy = mpn_lshift (apx, a0, n, 3); /* 8a0 */ -#if HAVE_NATIVE_mpn_addlsh1_n - apx[n] = cy + mpn_addlsh1_n (apx, apx, a2, n); /* 8a0 + 2a2 */ -#else - cy += mpn_lshift (tp, a2, n, 1); /* 2a2 */ - apx[n] = cy + mpn_add_n (apx, apx, tp, n); /* 8a0 + 2a2 */ -#endif - cy = mpn_lshift (tp, a1, n, 2); /* 4a1 */ - tp[n] = cy + mpn_add (tp, tp, n, a3, s); /* 4a1 + a3 */ -#if HAVE_NATIVE_mpn_add_n_sub_n - if (mpn_cmp (apx, tp, n + 1) < 0) - mpn_add_n_sub_n (apx, amx, tp, apx, n + 1); - else - mpn_add_n_sub_n (apx, amx, apx, tp, n + 1); -#else - if (mpn_cmp (apx, tp, n + 1) < 0) - mpn_sub_n (amx, tp, apx, n + 1); - else - mpn_sub_n (amx, apx, tp, n + 1); - - mpn_add_n (apx, apx, tp, n + 1); -#endif - - ASSERT (apx[n] <= 14); - ASSERT (amx[n] <= 9); - - TOOM4_SQR_N_REC (vmh, amx, n + 1, tp); /* vmh, 2n+1 limbs */ - TOOM4_SQR_N_REC (vh, apx, n + 1, tp); /* vh, 2n+1 limbs */ - TOOM4_SQR_N_REC (v0, a0, n, tp); TOOM4_SQR_N_REC (vinf, a3, s, tp); /* vinf, 2s limbs */ - mpn_toom_interpolate_7pts (pp, n, 0, vmh, vm1, v1, v2, 2*s, tp); + mpn_toom_interpolate_7pts (pp, n, 0, vm2, vm1, v2, vh, 2*s, tp); } diff -r 73a7d81c4142 mpn/generic/toom52_mul.c --- a/mpn/generic/toom52_mul.c Thu Oct 29 14:29:40 2009 +0100 +++ b/mpn/generic/toom52_mul.c Thu Oct 29 14:41:30 2009 +0100 @@ -182,66 +182,12 @@ mpn_toom52_mul (mp_ptr pp, } /* Compute as2 and asm2. */ - cy = mpn_lshift (asm2, a3, n, 3); /* 8a3 */ - cy += mpn_lshift (a1a3, a1, n, 1); /* 2a1 */ - cy += mpn_add_n (a1a3, a1a3, asm2, n); /* 8a3 +2a1 */ - a1a3[n] = cy; - - cy = mpn_lshift (a0a2, a2, n, 2); /* 4a2 */ - a0a2[n] = cy + mpn_add_n (a0a2, a0, a0a2, n); /* 4a2 + a0 */ - cy = mpn_lshift (asm2, a4, s, 4); /* 16a4 */ - cy += mpn_add_n (a0a2, a0a2, asm2, s); /* 16a4 +4a2 + a0 */ - MPN_INCR_U(a0a2 + s, n - s + 1, cy); - -#if HAVE_NATIVE_mpn_add_n_sub_n - if (mpn_cmp (a0a2, a1a3, n+1) < 0) - { - mpn_add_n_sub_n (as2, asm2, a1a3, a0a2, n+1); - flags ^= toom6_vm1_neg; - } - else - { - mpn_add_n_sub_n (as2, asm2, a0a2, a1a3, n+1); - } -#else - mpn_add_n (as2, a0a2, a1a3, n+1); - if (mpn_cmp (a0a2, a1a3, n+1) < 0) - { - mpn_sub_n (asm2, a1a3, a0a2, n+1); - flags ^= toom6_vm2_neg; - } - else - { - mpn_sub_n (asm2, a0a2, a1a3, n+1); - } -#endif - + if (mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, a0a2)) + flags ^= toom6_vm2_neg; + /* Compute as1 and asm1. 
*/
-  cy = mpn_add (a0a2, a2, n, a4, s);
-  a0a2[n] = cy + mpn_add_n (a0a2, a0, a0a2, n);
-  asm1[n] = mpn_add_n (asm1, a1, a3, n);
-#if HAVE_NATIVE_mpn_add_n_sub_n
-  if (mpn_cmp (a0a2, asm1, n+1) < 0)
-    {
-      cy = mpn_add_n_sub_n (as1, asm1, asm1, a0a2, n+1);
-      flags ^= toom6_vm1_neg;
-    }
-  else
-    {
-      cy = mpn_add_n_sub_n (as1, asm1, a0a2, asm1, n+1);
-    }
-#else
-  mpn_add_n (as1, a0a2, asm1, n+1);
-  if (mpn_cmp (a0a2, asm1, n+1) < 0)
-    {
-      mpn_sub_n (asm1, asm1, a0a2, n+1);
-      flags ^= toom6_vm1_neg;
-    }
-  else
-    {
-      mpn_sub_n (asm1, a0a2, asm1, n+1);
-    }
-#endif
+  if (mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, a0a2))
+    flags ^= toom6_vm1_neg;
 
   ASSERT (as1[n] <= 4);
   ASSERT (bs1[n] <= 1);
diff -r 73a7d81c4142 mpn/generic/toom53_mul.c
--- a/mpn/generic/toom53_mul.c	Thu Oct 29 14:29:40 2009 +0100
+++ b/mpn/generic/toom53_mul.c	Thu Oct 29 14:41:30 2009 +0100
@@ -31,7 +31,7 @@ along with the GNU MP Library.  If not,
 #include "gmp.h"
 #include "gmp-impl.h"
 
-/* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf
+/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf
 
   <-s-><--n--><--n--><--n--><--n-->
    ___ ______ ______ ______ ______
@@ -43,8 +43,8 @@ along with the GNU MP Library.  If not,
   v1  = (  a0+ a1+ a2+ a3+  a4)*( b0+ b1+ b2) #    A(1)*B(1)      ah  <= 4   bh <= 2
   vm1 = (  a0- a1+ a2- a3+  a4)*( b0- b1+ b2) #   A(-1)*B(-1)    |ah| <= 2   bh <= 1
   v2  = (  a0+2a1+4a2+8a3+16a4)*( b0+2b1+4b2) #    A(2)*B(2)      ah  <= 30  bh <= 6
+  vm2 = (  a0-2a1+4a2-8a3+16a4)*( b0-2b1+4b2) #   A(-2)*B(-2)  -9<=ah<=20  -1<=bh<=4
   vh  = (16a0+8a1+4a2+2a3+  a4)*(4b0+2b1+ b2) # A(1/2)*B(1/2)     ah  <= 30  bh <= 6
-  vmh = (16a0-8a1+4a2-2a3+  a4)*(4b0-2b1+ b2) # A(-1/2)*B(-1/2) -9<=ah<=20 -1<=bh<=4
   vinf=                    a4 *          b2  #  A(inf)*B(inf)
 */
@@ -55,11 +55,10 @@ mpn_toom53_mul (mp_ptr pp,
 	       mp_ptr scratch)
 {
   mp_size_t n, s, t;
-  int vm1_neg, vmh_neg;
   mp_limb_t cy;
-  mp_ptr gp, hp;
-  mp_ptr as1, asm1, as2, ash, asmh;
-  mp_ptr bs1, bsm1, bs2, bsh, bsmh;
+  mp_ptr gp;
+  mp_ptr as1, asm1, as2, asm2, ash;
+  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
   enum toom7_flags flags;
   TMP_DECL;
@@ -85,112 +84,52 @@ mpn_toom53_mul (mp_ptr pp,
   as1 = TMP_SALLOC_LIMBS (n + 1);
   asm1 = TMP_SALLOC_LIMBS (n + 1);
   as2 = TMP_SALLOC_LIMBS (n + 1);
+  asm2 = TMP_SALLOC_LIMBS (n + 1);
   ash = TMP_SALLOC_LIMBS (n + 1);
-  asmh = TMP_SALLOC_LIMBS (n + 1);
 
   bs1 = TMP_SALLOC_LIMBS (n + 1);
   bsm1 = TMP_SALLOC_LIMBS (n + 1);
   bs2 = TMP_SALLOC_LIMBS (n + 1);
+  bsm2 = TMP_SALLOC_LIMBS (n + 1);
   bsh = TMP_SALLOC_LIMBS (n + 1);
-  bsmh = TMP_SALLOC_LIMBS (n + 1);
 
   gp = pp;
-  hp = pp + n + 1;
 
   /* Compute as1 and asm1. */
-  gp[n]  = mpn_add_n (gp, a0, a2, n);
-  gp[n] += mpn_add (gp, gp, n, a4, s);
-  hp[n]  = mpn_add_n (hp, a1, a3, n);
-#if HAVE_NATIVE_mpn_add_n_sub_n
-  if (mpn_cmp (gp, hp, n + 1) < 0)
-    {
-      mpn_add_n_sub_n (as1, asm1, hp, gp, n + 1);
-      vm1_neg = 1;
+  if (mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp))
+    flags = toom7_w3_neg;
+  else
+    flags = 0;
+
+  /* Compute as2 and asm2.
*/ + if (mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp)) + flags |= toom7_w1_neg; + + /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4 + = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4 */ +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (ash, a1, a0, n); + cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n); + cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n); + if (s < n) + { + mp_limb_t cy2 = mpn_addlsh1_n (ash, a4, ash, s); + ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1); + MPN_INCR_U (ash + s, n+1-s, cy2); } else - { - mpn_add_n_sub_n (as1, asm1, gp, hp, n + 1); - vm1_neg = 0; - } + ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n); #else - mpn_add_n (as1, gp, hp, n + 1); - if (mpn_cmp (gp, hp, n + 1) < 0) - { - mpn_sub_n (asm1, hp, gp, n + 1); - vm1_neg = 1; - } - else - { - mpn_sub_n (asm1, gp, hp, n + 1); - vm1_neg = 0; - } + cy = mpn_lshift (ash, a0, n, 1); + cy += mpn_add_n (ash, ash, a1, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a2, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a3, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + ash[n] = cy + mpn_add (ash, ash, n, a4, s); #endif - - /* Compute as2. */ -#if !HAVE_NATIVE_mpn_addlsh_n - ash[n] = mpn_lshift (ash, a2, n, 2); /* 4a2 */ -#endif -#if HAVE_NATIVE_mpn_addlsh1_n - cy = mpn_addlsh1_n (as2, a3, a4, s); - if (s != n) - cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy); - cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n); - cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n); - as2[n] = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); -#else - cy = mpn_lshift (as2, a4, s, 1); - cy += mpn_add_n (as2, a3, as2, s); - if (s != n) - cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy); - cy = 4 * cy + mpn_lshift (as2, as2, n, 2); - cy += mpn_add_n (as2, a1, as2, n); - cy = 2 * cy + mpn_lshift (as2, as2, n, 1); - as2[n] = cy + mpn_add_n (as2, a0, as2, n); - mpn_add_n (as2, ash, as2, n + 1); -#endif - - /* Compute ash and asmh. */ -#if HAVE_NATIVE_mpn_addlsh_n - cy = mpn_addlsh_n (gp, a2, a0, n, 2); /* 4a0 + a2 */ - cy = 4 * cy + mpn_addlsh_n (gp, a4, gp, n, 2); /* 16a0 + 4a2 + a4 */ /* FIXME s */ - gp[n] = cy; - cy = mpn_addlsh_n (hp, a3, a1, n, 2); /* 4a1 + a3 */ - cy = 2 * cy + mpn_lshift (hp, hp, n, 1); /* 8a1 + 2a3 */ - hp[n] = cy; -#else - gp[n] = mpn_lshift (gp, a0, n, 4); /* 16a0 */ - mpn_add (gp, gp, n + 1, a4, s); /* 16a0 + a4 */ - mpn_add_n (gp, ash, gp, n+1); /* 16a0 + 4a2 + a4 */ - cy = mpn_lshift (hp, a1, n, 3); /* 8a1 */ - cy += mpn_lshift (ash, a3, n, 1); /* 2a3 */ - cy += mpn_add_n (hp, ash, hp, n); /* 8a1 + 2a3 */ - hp[n] = cy; -#endif -#if HAVE_NATIVE_mpn_add_n_sub_n - if (mpn_cmp (gp, hp, n + 1) < 0) - { - mpn_add_n_sub_n (ash, asmh, hp, gp, n + 1); - vmh_neg = 1; - } - else - { - mpn_add_n_sub_n (ash, asmh, gp, hp, n + 1); - vmh_neg = 0; - } -#else - mpn_add_n (ash, gp, hp, n + 1); - if (mpn_cmp (gp, hp, n + 1) < 0) - { - mpn_sub_n (asmh, hp, gp, n + 1); - vmh_neg = 1; - } - else - { - mpn_sub_n (asmh, gp, hp, n + 1); - vmh_neg = 0; - } -#endif - + /* Compute bs1 and bsm1. 
*/
   bs1[n] = mpn_add (bs1, b0, n, b2, t);	/* b0 + b2 */
 #if HAVE_NATIVE_mpn_add_n_sub_n
@@ -198,7 +137,7 @@ mpn_toom53_mul (mp_ptr pp,
     {
       bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1;
       bsm1[n] = 0;
-      vm1_neg ^= 1;
+      flags ^= toom7_w3_neg;
     }
   else
     {
@@ -211,7 +150,7 @@ mpn_toom53_mul (mp_ptr pp,
     {
       mpn_sub_n (bsm1, b1, bs1, n);
       bsm1[n] = 0;
-      vm1_neg ^= 1;
+      flags ^= toom7_w3_neg;
     }
   else
     {
@@ -220,46 +159,60 @@ mpn_toom53_mul (mp_ptr pp,
   bs1[n] += mpn_add_n (bs1, bs1, b1, n);	/* b0+b1+b2 */
 #endif
 
-  /* Compute bs2 */
-  hp[n] = mpn_lshift (hp, b1, n, 1);			/* 2b1 */
-
-#ifdef HAVE_NATIVE_mpn_addlsh1_n
-  cy = mpn_addlsh1_n (bs2, b1, b2, t);
-  if (t != n)
-    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
-  bs2[n] = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
+  /* Compute bs2 and bsm2. */
+#if HAVE_NATIVE_mpn_addlsh_n
+  cy = mpn_addlsh_n (bs2, b0, b2, t, 2);	/* b0 + 4*b2 */
+  if (t < n)
+    bs2[n] = mpn_add_1 (bs2 + t, b0 + t, n - t, cy);
+  else
+    bs2[n] = cy;
 #else
-  bs2[t] = mpn_lshift (bs2, b2, t, 2);
-  mpn_add (bs2, hp, n + 1, bs2, t + 1);
-  bs2[n] += mpn_add_n (bs2, bs2, b0, n);
+  cy = mpn_lshift (gp, b2, t, 2);
+  bs2[n] = mpn_add (bs2, b0, n, gp, t);
+  MPN_INCR_U (bs2 + t, n+1-t, cy);
 #endif
 
-  /* Compute bsh and bsmh. */
-#if HAVE_NATIVE_mpn_addlsh_n
-  gp[n] = mpn_addlsh_n (gp, b2, b0, n, 2);	/* 4a0 + a2 */
-#else
-  cy = mpn_lshift (gp, b0, n, 2);		/* 4b0 */
-  gp[n] = cy + mpn_add (gp, gp, n, b2, t);	/* 4b0 + b2 */
-#endif
+  gp[n] = mpn_lshift (gp, b1, n, 1);
+
 #if HAVE_NATIVE_mpn_add_n_sub_n
-  if (mpn_cmp (gp, hp, n + 1) < 0)
+  if (mpn_cmp (bs2, gp, n+1) < 0)
     {
-      mpn_add_n_sub_n (bsh, bsmh, hp, gp, n + 1);
-      vmh_neg^= 1;
-    }
-  else
-    mpn_add_n_sub_n (bsh, bsmh, gp, hp, n + 1);
-#else
-  mpn_add_n (bsh, gp, hp, n + 1);		/* 4b0 + 2b1 + b2 */
-  if (mpn_cmp (gp, hp, n + 1) < 0)
-    {
-      mpn_sub_n (bsmh, hp, gp, n + 1);
-      vmh_neg ^= 1;
+      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, gp, bs2, n+1));
+      flags ^= toom7_w1_neg;
     }
   else
     {
-      mpn_sub_n (bsmh, gp, hp, n + 1);
+      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, bs2, gp, n+1));
     }
+#else
+  if (mpn_cmp (bs2, gp, n+1) < 0)
+    {
+      ASSERT_NOCARRY (mpn_sub_n (bsm2, gp, bs2, n+1));
+      flags ^= toom7_w1_neg;
+    }
+  else
+    {
+      ASSERT_NOCARRY (mpn_sub_n (bsm2, bs2, gp, n+1));
+    }
+  mpn_add_n (bs2, bs2, gp, n+1);
+#endif
+
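The fallback branch above first forms b0 + 4*b2 with the shifted-out bits of 4*b2 propagated past limb t, and only then folds in 2*b1 and the sign split. The same carry-fixup shape can be reproduced with nothing but the public mpn interface; a minimal sketch (hypothetical helper name, caller supplies a t-limb tmp area; illustration, not the gmp-impl.h code path):

#include <assert.h>
#include <gmp.h>

/* Sketch: {bs2, n+1} = {b0, n} + 4*{b2, t}, with 0 < t <= n.
   mpn_add_1 at offset t plays the role of MPN_INCR_U above. */
static void
add_4b2_sketch (mp_ptr bs2, mp_srcptr b0, mp_size_t n,
		mp_srcptr b2, mp_size_t t, mp_ptr tmp)
{
  mp_limb_t cy;

  cy = mpn_lshift (tmp, b2, t, 2);	/* tmp = 4*b2, cy = bits shifted out */
  bs2[n] = mpn_add (bs2, b0, n, tmp, t);	/* bs2 = b0 + tmp */
  mpn_add_1 (bs2 + t, bs2 + t, n + 1 - t, cy);	/* propagate 4*b2's carry */
}

int
main (void)
{
  mp_limb_t b0[2] = {5, 6}, b2[1] = {7}, bs2[3], tmp[1];
  add_4b2_sketch (bs2, b0, 2, b2, 1, tmp);
  assert (bs2[0] == 33 && bs2[1] == 6 && bs2[2] == 0);	/* 5 + 4*7, 6, no carry */
  return 0;
}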
+  /* Compute bsh = 4 b0 + 2 b1 + b2 = 2*(2*b0 + b1) + b2.  */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy = mpn_addlsh1_n (bsh, b1, b0, n);
+  if (t < n)
+    {
+      mp_limb_t cy2 = mpn_addlsh1_n (bsh, b2, bsh, t);
+      bsh[n] = 2*cy + mpn_lshift (bsh + t, bsh + t, n - t, 1);
+      MPN_INCR_U (bsh + t, n+1-t, cy2);
+    }
+  else
+    bsh[n] = 2*cy + mpn_addlsh1_n (bsh, b2, bsh, n);
+#else
+  cy = mpn_lshift (bsh, b0, n, 1);
+  cy += mpn_add_n (bsh, bsh, b1, n);
+  cy = 2*cy + mpn_lshift (bsh, bsh, n, 1);
+  bsh[n] = cy + mpn_add (bsh, bsh, n, b2, t);
 #endif
 
   ASSERT (as1[n] <= 4);
@@ -268,18 +221,26 @@ mpn_toom53_mul (mp_ptr pp,
   ASSERT (bsm1[n] <= 1);
   ASSERT (as2[n] <= 30);
   ASSERT (bs2[n] <= 6);
+  ASSERT (asm2[n] <= 20);
+  ASSERT (bsm2[n] <= 4);
   ASSERT (ash[n] <= 30);
   ASSERT (bsh[n] <= 6);
-  ASSERT (asmh[n] <= 20);
-  ASSERT (bsmh[n] <= 4);
 
 #define v0    pp				/* 2n */
-#define v1    (scratch + 6 * n + 6)		/* 2n+1 */
-#define vm1   scratch				/* 2n+1 */
-#define v2    (scratch + 2 * n + 2)		/* 2n+1 */
+#define v1    (pp + 2 * n)			/* 2n+1 */
 #define vinf  (pp + 6 * n)			/* s+t */
-#define vh    (pp + 2 * n)			/* 2n+1 */
-#define vmh   (scratch + 4 * n + 4)
+#define v2    scratch				/* 2n+1 */
+#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
+#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
+#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
+#define scratch_out (scratch + 8 * n + 4)	/* 2n+1 */
+  /* Total scratch need: 10*n+5 */
+
+  /* Must be in allocation order, as they overwrite one limb beyond
+   * 2n+1. */
+  mpn_mul_n (v2, as2, bs2, n + 1);		/* v2,  2n+1 limbs */
+  mpn_mul_n (vm2, asm2, bsm2, n + 1);		/* vm2, 2n+1 limbs */
+  mpn_mul_n (vh, ash, bsh, n + 1);		/* vh,  2n+1 limbs */
 
   /* vm1, 2n+1 limbs */
 #ifdef SMALLER_RECURSION
@@ -305,8 +266,6 @@ mpn_toom53_mul (mp_ptr pp,
   vm1[2 * n] = 0;
   mpn_mul_n (vm1, asm1, bsm1, n + ((asm1[n] | bsm1[n]) != 0));
 #endif /* SMALLER_RECURSION */
-
-  mpn_mul_n (v2, as2, bs2, n + 1);		/* v2, 2n+1 limbs */
 
   /* vinf, s+t limbs */
   if (s > t)  mpn_mul (vinf, a4, s, b2, t);
@@ -351,16 +310,14 @@ mpn_toom53_mul (mp_ptr pp,
   mpn_mul_n (v1, as1, bs1, n + ((as1[n] | bs1[n]) != 0));
 #endif /* SMALLER_RECURSION */
 
-  mpn_mul_n (vh, ash, bsh, n + 1);
+  mpn_mul_n (v0, a0, b0, n);			/* v0, 2n limbs */
 
-  mpn_mul_n (vmh, asmh, bsmh, n + 1);
+  /* vinf, s+t limbs */
+  if (s > t)  mpn_mul (vinf, a4, s, b2, t);
+  else        mpn_mul (vinf, b2, t, a4, s);
 
-  mpn_mul_n (v0, ap, bp, n);			/* v0, 2n limbs */
-
-  flags = vm1_neg ? toom7_w3_neg : 0;
-  flags |= vmh_neg ? toom7_w1_neg : 0;
-
-  mpn_toom_interpolate_7pts (pp, n, flags, vmh, vm1, v1, v2, s + t, scratch + 8 * n + 8);
+  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t,
+			     scratch_out);
 
   TMP_FREE;
 }
diff -r 73a7d81c4142 mpn/generic/toom62_mul.c
--- a/mpn/generic/toom62_mul.c	Thu Oct 29 14:29:40 2009 +0100
+++ b/mpn/generic/toom62_mul.c	Thu Oct 29 14:41:30 2009 +0100
@@ -31,7 +31,7 @@ along with the GNU MP Library.  If not,
 #include "gmp.h"
 #include "gmp-impl.h"
 
-/* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf
+/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf
 
  <-s-><--n--><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______ ______
@@ -43,8 +43,8 @@ along with the GNU MP Library.
If not, v1 = ( a0+ a1+ a2+ a3+ a4+ a5)*( b0+ b1) # A(1)*B(1) ah <= 5 bh <= 1 vm1 = ( a0- a1+ a2- a3+ a4- a5)*( b0- b1) # A(-1)*B(-1) |ah| <= 2 bh = 0 v2 = ( a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) # A(2)*B(2) ah <= 62 bh <= 2 + vm2 = ( a0- 2a1+4a2-8a3+16a4-32a5)*( b0-2b1) # A(-2)*B(-2) -41<=ah<=20 -1<=bh<=0 vh = (32a0+16a1+8a2+4a3+ 2a4+ a5)*(2b0+ b1) # A(1/2)*B(1/2) ah <= 62 bh <= 2 - vmh = (32a0-16a1+8a2-4a3+ 2a4- a5)*(2b0- b1) # A(-1/2)*B(-1/2) -20<=ah<=41 0<=bh<=1 vinf= a5 * b1 # A(inf)*B(inf) */ @@ -55,12 +55,11 @@ mpn_toom62_mul (mp_ptr pp, mp_ptr scratch) { mp_size_t n, s, t; - int vm1_neg, vmh_neg, bsm_neg; mp_limb_t cy; - mp_ptr a0_a2, a1_a3; - mp_ptr as1, asm1, as2, ash, asmh; - mp_ptr bs1, bsm1, bs2, bsh, bsmh; - enum toom7_flags flags; + mp_ptr as1, asm1, as2, asm2, ash; + mp_ptr bs1, bsm1, bs2, bsm2, bsh; + mp_ptr gp; + enum toom7_flags aflags, bflags; TMP_DECL; #define a0 ap @@ -85,118 +84,54 @@ mpn_toom62_mul (mp_ptr pp, as1 = TMP_SALLOC_LIMBS (n + 1); asm1 = TMP_SALLOC_LIMBS (n + 1); as2 = TMP_SALLOC_LIMBS (n + 1); + asm2 = TMP_SALLOC_LIMBS (n + 1); ash = TMP_SALLOC_LIMBS (n + 1); - asmh = TMP_SALLOC_LIMBS (n + 1); bs1 = TMP_SALLOC_LIMBS (n + 1); bsm1 = TMP_SALLOC_LIMBS (n); bs2 = TMP_SALLOC_LIMBS (n + 1); + bsm2 = TMP_SALLOC_LIMBS (n + 1); bsh = TMP_SALLOC_LIMBS (n + 1); - bsmh = TMP_SALLOC_LIMBS (n + 1); - a0_a2 = pp; - a1_a3 = pp + n + 1; + gp = pp; /* Compute as1 and asm1. */ - a0_a2[n] = mpn_add_n (a0_a2, a0, a2, n); - a0_a2[n] += mpn_add_n (a0_a2, a0_a2, a4, n); - a1_a3[n] = mpn_add_n (a1_a3, a1, a3, n); - a1_a3[n] += mpn_add (a1_a3, a1_a3, n, a5, s); -#if HAVE_NATIVE_mpn_add_n_sub_n - if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0) - { - mpn_add_n_sub_n (as1, asm1, a1_a3, a0_a2, n + 1); - vm1_neg = 1; + if (mpn_toom_eval_pm1 (as1, asm1, 5, ap, n, s, gp)) + aflags = toom7_w3_neg; + else + aflags = 0; + + /* Compute as2 and asm2. */ + if (mpn_toom_eval_pm2 (as2, asm2, 5, ap, n, s, gp)) + aflags |= toom7_w1_neg; + + /* Compute ash = 32 a0 + 16 a1 + 8 a2 + 4 a3 + 2 a4 + a5 + = 2*(2*(2*(2*(2*a0 + a1) + a2) + a3) + a4) + a5 */ + +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (ash, a1, a0, n); + cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n); + cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n); + cy = 2*cy + mpn_addlsh1_n (ash, a4, ash, n); + if (s < n) + { + mp_limb_t cy2 = mpn_addlsh1_n (ash, a5, ash, s); + ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1); + MPN_INCR_U (ash + s, n+1-s, cy2); } else - { - mpn_add_n_sub_n (as1, asm1, a0_a2, a1_a3, n + 1); - vm1_neg = 0; - } + ash[n] = 2*cy + mpn_addlsh1_n (ash, a5, ash, n); #else - mpn_add_n (as1, a0_a2, a1_a3, n + 1); - if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0) - { - mpn_sub_n (asm1, a1_a3, a0_a2, n + 1); - vm1_neg = 1; - } - else - { - mpn_sub_n (asm1, a0_a2, a1_a3, n + 1); - vm1_neg = 0; - } -#endif - - /* Compute as2. 
*/ -#if HAVE_NATIVE_mpn_addlsh1_n - cy = mpn_addlsh1_n (as2, a4, a5, s); - if (s != n) - cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy); - cy = 2 * cy + mpn_addlsh1_n (as2, a3, as2, n); - cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n); - cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n); - cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); -#else - cy = mpn_lshift (as2, a5, s, 1); - cy += mpn_add_n (as2, a4, as2, s); - if (s != n) - cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy); - cy = 2 * cy + mpn_lshift (as2, as2, n, 1); - cy += mpn_add_n (as2, a3, as2, n); - cy = 2 * cy + mpn_lshift (as2, as2, n, 1); - cy += mpn_add_n (as2, a2, as2, n); - cy = 2 * cy + mpn_lshift (as2, as2, n, 1); - cy += mpn_add_n (as2, a1, as2, n); - cy = 2 * cy + mpn_lshift (as2, as2, n, 1); - cy += mpn_add_n (as2, a0, as2, n); -#endif - as2[n] = cy; - - /* Compute ash and asmh. */ -#if HAVE_NATIVE_mpn_addlsh_n - cy = mpn_addlsh_n (a0_a2, a2, a0, n, 2); /* 4a0 + a2 */ - cy = 4 * cy + mpn_addlsh_n (a0_a2, a4, a0_a2, n, 2); /* 16a0 + 4a2 + a4 */ - cy = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1); /* 32a0 + 8a2 + 2a4 */ - a0_a2[n] = cy; - cy = mpn_addlsh_n (a1_a3, a3, a1, n, 2); /* 4a1 */ - cy = 4 * cy + mpn_addlsh_n (a1_a3, a5, a1_a3, n, 2); /* 16a1 + 4a3 */ - a1_a3[n] = cy; -#else - cy = mpn_lshift (a0_a2, a0, n, 2); /* 4a0 */ - cy += mpn_add_n (a0_a2, a2, a0_a2, n); /* 4a0 + a2 */ - cy = 4 * cy + mpn_lshift (a0_a2, a0_a2, n, 2); /* 16a0 + 4a2 */ - cy += mpn_add_n (a0_a2, a4, a0_a2, n); /* 16a0 + 4a2 + a4 */ - cy = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1); /* 32a0 + 8a2 + 2a4 */ - a0_a2[n] = cy; - cy = mpn_lshift (a1_a3, a1, n, 2); /* 4a1 */ - cy += mpn_add_n (a1_a3, a3, a1_a3, n); /* 4a1 + a3 */ - cy = 4 * cy + mpn_lshift (a1_a3, a1_a3, n, 2); /* 16a1 + 4a3 */ - cy += mpn_add (a1_a3, a1_a3, n, a5, s); /* 16a1 + 4a3 + a5 */ - a1_a3[n] = cy; -#endif -#if HAVE_NATIVE_mpn_add_n_sub_n - if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0) - { - mpn_add_n_sub_n (ash, asmh, a1_a3, a0_a2, n + 1); - vmh_neg = 1; - } - else - { - mpn_add_n_sub_n (ash, asmh, a0_a2, a1_a3, n + 1); - vmh_neg = 0; - } -#else - mpn_add_n (ash, a0_a2, a1_a3, n + 1); - if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0) - { - mpn_sub_n (asmh, a1_a3, a0_a2, n + 1); - vmh_neg = 1; - } - else - { - mpn_sub_n (asmh, a0_a2, a1_a3, n + 1); - vmh_neg = 0; - } + cy = mpn_lshift (ash, a0, n, 1); + cy += mpn_add_n (ash, ash, a1, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a2, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a3, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a4, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + ash[n] = cy + mpn_add (ash, ash, n, a5, s); #endif /* Compute bs1 and bsm1. */ @@ -206,12 +141,12 @@ mpn_toom62_mul (mp_ptr pp, if (mpn_cmp (b0, b1, n) < 0) { cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n); - bsm_neg = 1; + bflags = toom7_w3_neg; } else { cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n); - bsm_neg = 0; + bflags = 0; } bs1[n] = cy >> 1; #else @@ -219,12 +154,12 @@ mpn_toom62_mul (mp_ptr pp, if (mpn_cmp (b0, b1, n) < 0) { mpn_sub_n (bsm1, b1, b0, n); - bsm_neg = 1; + bflags = toom7_w3_neg; } else { mpn_sub_n (bsm1, b0, b1, n); - bsm_neg = 0; + bflags = 0; } #endif } @@ -235,57 +170,84 @@ mpn_toom62_mul (mp_ptr pp, { mpn_sub_n (bsm1, b1, b0, t); MPN_ZERO (bsm1 + t, n - t); - bsm_neg = 1; + bflags = toom7_w3_neg; + } + else + { + mpn_sub (bsm1, b0, n, b1, t); + bflags = 0; + } + } + + /* Compute bs2 and bsm2. 
Recycling bs1 and bsm1; bs2=bs1+b1, bsm2 = + bsm1 - b1 */ + mpn_add (bs2, bs1, n + 1, b1, t); + if (bflags & toom7_w3_neg) + { + bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t); + bflags |= toom7_w1_neg; + } + else + { + /* FIXME: Simplify this logic? */ + if (t < n) + { + if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, t)); + MPN_ZERO (bsm2 + t, n + 1 - t); + bflags |= toom7_w1_neg; + } + else + { + ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, t)); + bsm2[n] = 0; + } } else { - mpn_sub (bsm1, b0, n, b1, t); - bsm_neg = 0; + if (mpn_cmp (bsm1, b1, n) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, n)); + bflags |= toom7_w1_neg; + } + else + { + ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, n)); + } + bsm2[n] = 0; } } - vm1_neg ^= bsm_neg; - - /* Compute bs2, recycling bs1. bs2=bs1+b1 */ - mpn_add (bs2, bs1, n + 1, b1, t); - - /* Compute bsh and bsmh, recycling bs1 and bsm1. bsh=bs1+b0; bsmh=bsmh+b0 */ - if (bsm_neg == 1) - { - bsmh[n] = 0; - if (mpn_cmp (bsm1, b0, n) < 0) - { - bsm_neg = 0; - mpn_sub_n (bsmh, b0, bsm1, n); - } - else - mpn_sub_n (bsmh, bsm1, b0, n); - } - else - bsmh[n] = mpn_add_n (bsmh, bsm1, b0, n); + /* Compute bsh, recycling bs1 and bsm1. bsh=bs1+b0; */ mpn_add (bsh, bs1, n + 1, b0, n); - vmh_neg ^= bsm_neg; - ASSERT (as1[n] <= 5); ASSERT (bs1[n] <= 1); ASSERT (asm1[n] <= 2); -/*ASSERT (bsm1[n] == 0);*/ ASSERT (as2[n] <= 62); ASSERT (bs2[n] <= 2); + ASSERT (asm2[n] <= 41); + ASSERT (bsm2[n] <= 1); ASSERT (ash[n] <= 62); ASSERT (bsh[n] <= 2); - ASSERT (asmh[n] <= 41); - ASSERT (bsmh[n] <= 1); #define v0 pp /* 2n */ -#define v1 (scratch + 6 * n + 6) /* 2n+1 */ +#define v1 (pp + 2 * n) /* 2n+1 */ #define vinf (pp + 6 * n) /* s+t */ -#define vm1 scratch /* 2n+1 */ -#define v2 (scratch + 2 * n + 2) /* 2n+1 */ -#define vh (pp + 2 * n) /* 2n+1 */ -#define vmh (scratch + 4 * n + 4) +#define v2 scratch /* 2n+1 */ +#define vm2 (scratch + 2 * n + 1) /* 2n+1 */ +#define vh (scratch + 4 * n + 2) /* 2n+1 */ +#define vm1 (scratch + 6 * n + 3) /* 2n+1 */ +#define scratch_out (scratch + 8 * n + 4) /* 2n+1 */ + /* Total scratch need: 10*n+5 */ + /* Must be in allocation order, as they overwrite one limb beyond + * 2n+1. */ + mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */ + mpn_mul_n (vm2, asm2, bsm2, n + 1); /* vm2, 2n+1 limbs */ + mpn_mul_n (vh, ash, bsh, n + 1); /* vh, 2n+1 limbs */ + /* vm1, 2n+1 limbs */ mpn_mul_n (vm1, asm1, bsm1, n); cy = 0; @@ -302,12 +264,6 @@ mpn_toom62_mul (mp_ptr pp, #endif } vm1[2 * n] = cy; - - mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */ - - /* vinf, s+t limbs */ - if (s > t) mpn_mul (vinf, a5, s, b1, t); - else mpn_mul (vinf, b1, t, a5, s); /* v1, 2n+1 limbs */ mpn_mul_n (v1, as1, bs1, n); @@ -333,16 +289,14 @@ mpn_toom62_mul (mp_ptr pp, cy += mpn_add_n (v1 + n, v1 + n, as1, n); v1[2 * n] = cy; - mpn_mul_n (vh, ash, bsh, n + 1); + mpn_mul_n (v0, a0, b0, n); /* v0, 2n limbs */ - mpn_mul_n (vmh, asmh, bsmh, n + 1); + /* vinf, s+t limbs */ + if (s > t) mpn_mul (vinf, a5, s, b1, t); + else mpn_mul (vinf, b1, t, a5, s); - mpn_mul_n (v0, ap, bp, n); /* v0, 2n limbs */ - - flags = vm1_neg ? toom7_w3_neg : 0; - flags |= vmh_neg ? 
toom7_w1_neg : 0;
-
-  mpn_toom_interpolate_7pts (pp, n, flags, vmh, vm1, v1, v2, s + t, scratch + 8 * n + 8);
+  mpn_toom_interpolate_7pts (pp, n, aflags ^ bflags,
+			     vm2, vm1, v2, vh, s + t, scratch_out);
 
   TMP_FREE;
 }
diff -r 73a7d81c4142 mpn/generic/toom_eval_dgr3_pm2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/generic/toom_eval_dgr3_pm2.c	Thu Oct 29 14:41:30 2009 +0100
@@ -0,0 +1,80 @@
+/* mpn_toom_eval_dgr3_pm2 -- Evaluate a degree 3 polynomial in +2 and -2
+
+   Contributed to the GNU project by Niels Möller
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Needs n+1 limbs of temporary storage. */
+int
+mpn_toom_eval_dgr3_pm2 (mp_ptr xp2, mp_ptr xm2,
+			mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp)
+{
+  mp_limb_t cy;
+  int neg;
+
+  ASSERT (x3n > 0);
+  ASSERT (x3n <= n);
+
+  /* (x0 + 4 * x2) +/- (2 x1 + 8 x3) */
+#if HAVE_NATIVE_mpn_addlsh_n
+  xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2);
+
+  cy = mpn_addlsh_n (tp, xp + n, xp + 3*n, x3n, 2);
+  if (x3n < n)
+    cy = mpn_add_1 (tp + x3n, xp + n + x3n, n - x3n, cy);
+  tp[n] = cy;
+#else
+  cy = mpn_lshift (tp, xp + 2*n, n, 2);
+  xp2[n] = cy + mpn_add_n (xp2, tp, xp, n);
+
+  tp[x3n] = mpn_lshift (tp, xp + 3*n, x3n, 2);
+  if (x3n < n)
+    tp[n] = mpn_add (tp, xp + n, n, tp, x3n + 1);
+  else
+    tp[n] += mpn_add_n (tp, xp + n, tp, n);
+#endif
+  mpn_lshift (tp, tp, n+1, 1);
+
+  neg = (mpn_cmp (xp2, tp, n + 1) < 0);
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (neg)
+    mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1);
+  else
+    mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1);
+#else
+  if (neg)
+    mpn_sub_n (xm2, tp, xp2, n + 1);
+  else
+    mpn_sub_n (xm2, xp2, tp, n + 1);
+
+  mpn_add_n (xp2, xp2, tp, n + 1);
+#endif
+
+  ASSERT (xp2[n] < 15);
+  ASSERT (xm2[n] < 10);
+
+  return neg;
+}
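The arithmetic in this new helper splits the degree-3 polynomial into even and odd parts: g = x0 + 4 x2 and h = 2 x1 + 8 x3, so that X(2) = g + h and X(-2) = g - h, with the comparison deciding the sign. A single-word model of exactly that computation (illustration only; the mpn code does the same on limb vectors with carries):

#include <assert.h>

static int
eval_dgr3_pm2_toy (long *xp2, long *xm2, const long x[4])
{
  long g = x[0] + 4*x[2];	/* even part evaluated at 2 */
  long h = 2*x[1] + 8*x[3];	/* odd part evaluated at 2 */

  *xp2 = g + h;			/* X(2) */
  *xm2 = g >= h ? g - h : h - g;	/* |X(-2)| */
  return g < h;			/* nonzero iff X(-2) < 0 */
}

int
main (void)
{
  long x[4] = {3, 1, 4, 1};
  long p, m;
  int neg = eval_dgr3_pm2_toy (&p, &m, x);
  assert (p == 29 && m == 9 && neg == 0);	/* X(2) = 29, X(-2) = 9 */
  return 0;
}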
diff -r 73a7d81c4142 mpn/generic/toom_eval_pm1.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/generic/toom_eval_pm1.c	Thu Oct 29 14:41:30 2009 +0100
@@ -0,0 +1,79 @@
+/* mpn_toom_eval_pm1 -- Evaluate a polynomial in +1 and -1
+
+   Contributed to the GNU project by Niels Möller
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Evaluates a polynomial of degree k > 3, in the points +1 and -1. */
+int
+mpn_toom_eval_pm1 (mp_ptr xp1, mp_ptr xm1, unsigned k,
+		   mp_srcptr xp, mp_size_t n, mp_size_t hn, mp_ptr tp)
+{
+  unsigned i;
+  int neg;
+
+  ASSERT (k >= 4);
+
+  ASSERT (hn > 0);
+  ASSERT (hn <= n);
+
+  /* The degree k is also the number of full-size coefficients, so
+   * that last coefficient, of size hn, starts at xp + k*n. */
+
+  xp1[n] = mpn_add_n (xp1, xp, xp + 2*n, n);
+  for (i = 4; i < k; i += 2)
+    ASSERT_NOCARRY (mpn_add (xp1, xp1, n+1, xp+i*n, n));
+
+  tp[n] = mpn_add_n (tp, xp + n, xp + 3*n, n);
+  for (i = 5; i < k; i += 2)
+    ASSERT_NOCARRY (mpn_add (tp, tp, n+1, xp+i*n, n));
+
+  if (k & 1)
+    ASSERT_NOCARRY (mpn_add (tp, tp, n+1, xp+k*n, hn));
+  else
+    ASSERT_NOCARRY (mpn_add (xp1, xp1, n+1, xp+k*n, hn));
+
+  neg = (mpn_cmp (xp1, tp, n + 1) < 0);
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (neg)
+    mpn_add_n_sub_n (xp1, xm1, tp, xp1, n + 1);
+  else
+    mpn_add_n_sub_n (xp1, xm1, xp1, tp, n + 1);
+#else
+  if (neg)
+    mpn_sub_n (xm1, tp, xp1, n + 1);
+  else
+    mpn_sub_n (xm1, xp1, tp, n + 1);
+
+  mpn_add_n (xp1, xp1, tp, n + 1);
+#endif
+
+  ASSERT (xp1[n] <= k);
+  ASSERT (xm1[n] <= k/2 + 1);
+
+  return neg;
+}
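mpn_toom_eval_pm1 reduces the two evaluations to one even-index sum e and one odd-index sum o, since X(1) = e + o and X(-1) = e - o; the comparison picks the subtraction order and the return value carries the sign. A single-word model of the same computation (small-coefficient toy, not the limb-vector code):

#include <assert.h>

static int
eval_pm1_toy (long *xp1, long *xm1, const long *c, unsigned k)
{
  long e = 0, o = 0;
  unsigned i;

  for (i = 0; i <= k; i += 2)  e += c[i];	/* even-indexed coefficients */
  for (i = 1; i <= k; i += 2)  o += c[i];	/* odd-indexed coefficients */

  *xp1 = e + o;				/* X(1) */
  *xm1 = e >= o ? e - o : o - e;	/* |X(-1)| */
  return e < o;				/* sign of X(-1) */
}

int
main (void)
{
  long c[5] = {7, 2, 5, 9, 1};		/* degree k = 4 */
  long p, m;
  int neg = eval_pm1_toy (&p, &m, c, 4);
  assert (p == 24 && m == 2 && neg == 0);	/* e = 13, o = 11 */
  return 0;
}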
diff -r 73a7d81c4142 mpn/generic/toom_eval_pm2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/generic/toom_eval_pm2.c	Thu Oct 29 14:41:30 2009 +0100
@@ -0,0 +1,109 @@
+/* mpn_toom_eval_pm2 -- Evaluate a polynomial in +2 and -2
+
+   Contributed to the GNU project by Niels Möller
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Evaluates a polynomial of degree k > 3, in the points +2 and -2. */
+int
+mpn_toom_eval_pm2 (mp_ptr xp2, mp_ptr xm2, unsigned k,
+		   mp_srcptr xp, mp_size_t n, mp_size_t hn, mp_ptr tp)
+{
+  unsigned i;
+  mp_limb_t cy;
+  int neg;
+
+  ASSERT (k >= 4);
+
+  ASSERT (hn > 0);
+  ASSERT (hn <= n);
+
+  /* The degree k is also the number of full-size coefficients, so
+   * that last coefficient, of size hn, starts at xp + k*n. */
+
+#if HAVE_NATIVE_mpn_addlsh_n
+  xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2);
+  for (i = 4; i < k; i += 2)
+    xp2[n] += mpn_addlsh_n (xp2, xp2, xp + i*n, n, i);
+
+  tp[n] = mpn_lshift (tp, xp+n, n, 1);
+  for (i = 3; i < k; i+= 2)
+    tp[n] += mpn_addlsh_n (tp, tp, xp+i*n, n, i);
+
+  if (k & 1)
+    {
+      cy = mpn_addlsh_n (tp, tp, xp+k*n, hn, k);
+      MPN_INCR_U (tp + hn, n+1 - hn, cy);
+    }
+  else
+    {
+      cy = mpn_addlsh_n (xp2, xp2, xp+k*n, hn, k);
+      MPN_INCR_U (xp2 + hn, n+1 - hn, cy);
+    }
+#else /* !HAVE_NATIVE_mpn_addlsh_n */
+  xp2[n] = mpn_lshift (tp, xp+2*n, n, 2);
+  xp2[n] += mpn_add_n (xp2, xp, tp, n);
+  for (i = 4; i < k; i += 2)
+    {
+      xp2[n] += mpn_lshift (tp, xp + i*n, n, i);
+      xp2[n] += mpn_add_n (xp2, xp2, tp, n);
+    }
+
+  tp[n] = mpn_lshift (tp, xp+n, n, 1);
+  for (i = 3; i < k; i+= 2)
+    {
+      tp[n] += mpn_lshift (xm2, xp + i*n, n, i);
+      tp[n] += mpn_add_n (tp, tp, xm2, n);
+    }
+
+  xm2[hn] = mpn_lshift (xm2, xp + k*n, hn, k);
+  if (k & 1)
+    mpn_add (tp, tp, n+1, xm2, hn+1);
+  else
+    mpn_add (xp2, xp2, n+1, xm2, hn+1);
+#endif /* !HAVE_NATIVE_mpn_addlsh_n */
+
+  neg = (mpn_cmp (xp2, tp, n + 1) < 0);
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (neg)
+    mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1);
+  else
+    mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1);
+#else /* !HAVE_NATIVE_mpn_add_n_sub_n */
+  if (neg)
+    mpn_sub_n (xm2, tp, xp2, n + 1);
+  else
+    mpn_sub_n (xm2, xp2, tp, n + 1);
+
+  mpn_add_n (xp2, xp2, tp, n + 1);
+#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */
+
+  ASSERT (xp2[n] < (1<<(k+1))-1);
+  ASSERT (xm2[n] < ((1<<(k+2))-1 - (k & 1))/3);
+
+  return neg;
+}
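The general-k version works the same way as the degree-3 helper, but with weights 2^i: e collects the even-indexed terms and o the odd-indexed ones, so X(2) = e + o and X(-2) = e - o. A small-integer model (assumes coefficients small enough that the shifted sums fit in a long; illustration only):

#include <assert.h>

static int
eval_pm2_toy (long *xp2, long *xm2, const long *x, unsigned k)
{
  long e = 0, o = 0;
  unsigned i;

  for (i = 0; i <= k; i += 2)  e += x[i] << i;	/* even-indexed, weight 2^i */
  for (i = 1; i <= k; i += 2)  o += x[i] << i;	/* odd-indexed, weight 2^i */

  *xp2 = e + o;				/* X(2) */
  *xm2 = e >= o ? e - o : o - e;	/* |X(-2)| */
  return e < o;				/* sign of X(-2) */
}

int
main (void)
{
  long x[5] = {3, 1, 4, 1, 5};		/* degree k = 4 */
  long p, m;
  int neg = eval_pm2_toy (&p, &m, x, 4);
  assert (p == 109 && m == 89 && neg == 0);	/* e = 99, o = 10 */
  return 0;
}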
diff -r 73a7d81c4142 mpn/generic/toom_interpolate_7pts.c
--- a/mpn/generic/toom_interpolate_7pts.c	Thu Oct 29 14:29:40 2009 +0100
+++ b/mpn/generic/toom_interpolate_7pts.c	Thu Oct 29 14:41:29 2009 +0100
@@ -66,17 +66,17 @@ along with the GNU MP Library.  If not,
 #endif
 #endif
 
-/* Interpolation for toom4, using the evaluation points infinity, 2,
-   1, -1, 1/2, -1/2. More precisely, we want to compute
+/* Interpolation for toom4, using the evaluation points 0, infinity,
+   1, -1, 2, -2, 1/2. More precisely, we want to compute
    f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the
    seven values
 
    w0 = f(0),
-   w1 = 64 f(-1/2),
-   w2 = 64 f(1/2),
+   w1 = f(-2),
+   w2 = f(1),
    w3 = f(-1),
-   w4 = f(1)
-   w5 = f(2)
+   w4 = f(2)
+   w5 = 64 * f(1/2)
    w6 = limit at infinity of f(x) / x^6,
 
   The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n },
@@ -94,36 +94,38 @@ mpn_toom_interpolate_7pts (mp_ptr rp, mp
 			   mp_size_t w6n, mp_ptr tp)
 {
   mp_size_t m = 2*n + 1;
-  mp_ptr w2 = rp + 2*n;
-  mp_ptr w6 = rp + 6*n;
   mp_limb_t cy;
 
+#define w0 rp
+#define w2 (rp + 2*n)
+#define w6 (rp + 6*n)
+
   ASSERT (w6n > 0);
   ASSERT (w6n <= 2*n);
 
-  /* Using Marco Bodrato's formulas
+  /* Using formulas similar to Marco Bodrato's
 
-     W5 = W5 + W2
-     W1 =(W1 - W2)/2
-     W2 = W2 - W6
-     W2 =(W2 - W1)/4 - W0*16
-     W3 =(W4 - W3)/2
-     W4 = W4 - W3
+     W5 = W5 + W4
+     W1 =(W4 - W1)/2
+     W4 = W4 - W0
+     W4 =(W4 - W1)/4 - W6*16
+     W3 =(W2 - W3)/2
+     W2 = W2 - W3
 
-     W5 = W5 - W4*65      May be negative.
-     W4 = W4 - W6 - W0
-     W5 =(W5 + W4*45)/2   Now >= 0 again.
-     W2 =(W2 - W4)/3
-     W4 = W4 - W2
+     W5 = W5 - W2*65      May be negative.
+     W2 = W2 - W6 - W0
+     W5 =(W5 + W2*45)/2   Now >= 0 again.
+     W4 =(W4 - W2)/3
+     W2 = W2 - W4
 
-     W1 = W1 - W5         May be negative.
-     W5 =(W5 - W3*8)/9
-     W3 = W3 - W5
-     W1 =(W1/15 + W5)/2   Now >= 0 again.
-     W5 = W5 - W1
+     W1 = W5 - W1         May be negative.
+     W5 =(W5 - W3*8)/9
+     W3 = W3 - W5
+     W1 =(W1/15 + W5)/2   Now >= 0 again.
+     W5 = W5 - W1
 
-     where W0 = f(0), W1 = 64 f(-1/2), W2 = 64 f(1/2), W3 = f(-1),
-     W4 = f(1), W5 = f(2), W6 = f(oo),
+     where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),
+     W4 = f(2), W5 = f(1/2), W6 = f(oo),
 
      Note that most intermediate results are positive; the ones that
     may be negative are represented in two's complement. We must
@@ -132,74 +134,77 @@ mpn_toom_interpolate_7pts (mp_ptr rp, mp
      numbers work fine with two's complement.
  */
 
-  mpn_add_n (w5, w5, w2, m);
+  mpn_add_n (w5, w5, w4, m);
   if (flags & toom7_w1_neg)
    {
 #ifdef HAVE_NATIVE_mpn_rsh1add_n
-      mpn_rsh1add_n (w1, w1, w2, m);
+      mpn_rsh1add_n (w1, w1, w4, m);
 #else
-      mpn_add_n (w1, w1, w2, m);
+      mpn_add_n (w1, w1, w4, m);  ASSERT (!(w1[0] & 1));
      mpn_rshift (w1, w1, m, 1);
 #endif
    }
  else
    {
 #ifdef HAVE_NATIVE_mpn_rsh1sub_n
-      mpn_rsh1sub_n (w1, w2, w1, m);
+      mpn_rsh1sub_n (w1, w4, w1, m);
 #else
-      mpn_sub_n (w1, w2, w1, m);
+      mpn_sub_n (w1, w4, w1, m);  ASSERT (!(w1[0] & 1));
      mpn_rshift (w1, w1, m, 1);
 #endif
    }
-  mpn_sub (w2, w2, m, w6, w6n);
-  mpn_sub_n (w2, w2, w1, m);
-  mpn_rshift (w2, w2, m, 2);		/* w2>=0 */
-  tp[2*n] = mpn_lshift (tp, rp, 2*n, 4);
-  mpn_sub_n (w2, w2, tp, m);
+  mpn_sub (w4, w4, m, w0, 2*n);
+  mpn_sub_n (w4, w4, w1, m);  ASSERT (!(w4[0] & 3));
+  mpn_rshift (w4, w4, m, 2);		/* w4>=0 */
+
+  tp[w6n] = mpn_lshift (tp, w6, w6n, 4);
+  mpn_sub (w4, w4, m, tp, w6n+1);
 
   if (flags & toom7_w3_neg)
    {
 #ifdef HAVE_NATIVE_mpn_rsh1add_n
-      mpn_rsh1add_n (w3, w3, w4, m);
+      mpn_rsh1add_n (w3, w3, w2, m);
 #else
-      mpn_add_n (w3, w3, w4, m);
+      mpn_add_n (w3, w3, w2, m);  ASSERT (!(w3[0] & 1));
      mpn_rshift (w3, w3, m, 1);
 #endif
    }
  else
    {
 #ifdef HAVE_NATIVE_mpn_rsh1sub_n
-      mpn_rsh1sub_n (w3, w4, w3, m);
+      mpn_rsh1sub_n (w3, w2, w3, m);
 #else
-      mpn_sub_n (w3, w4, w3, m);
+      mpn_sub_n (w3, w2, w3, m);  ASSERT (!(w3[0] & 1));
      mpn_rshift (w3, w3, m, 1);
 #endif
    }
-  mpn_sub_n (w4, w4, w3, m);
+
+  mpn_sub_n (w2, w2, w3, m);
 
-  mpn_submul_1 (w5, w4, m, 65);
-  mpn_sub (w4, w4, m, w6, w6n);
-  mpn_sub (w4, w4, m, rp, 2*n);
-  mpn_addmul_1 (w5, w4, m, 45);
+  mpn_submul_1 (w5, w2, m, 65);
+  mpn_sub (w2, w2, m, w6, w6n);
+  mpn_sub (w2, w2, m, w0, 2*n);
+
+  mpn_addmul_1 (w5, w2, m, 45);  ASSERT (!(w5[0] & 1));
+  mpn_rshift (w5, w5, m, 1);
+  mpn_sub_n (w4, w4, w2, m);
+
+  mpn_divexact_by3 (w4, w4, m);
   mpn_sub_n (w2, w2, w4, m);
-  mpn_divexact_by3 (w2, w2, m);
-  mpn_sub_n (w4, w4, w2, m);
-
-  mpn_rshift (w5, w5, m, 1);
-  mpn_sub_n (w1, w1, w5, m);
+  mpn_sub_n (w1, w5, w1, m);
 
   mpn_lshift (tp, w3, m, 3);
   mpn_sub_n (w5, w5, tp, m);
   mpn_divexact_by9 (w5, w5, m);
   mpn_sub_n (w3, w3, w5, m);
 
   mpn_divexact_by15 (w1, w1, m);
-  mpn_add_n (w1, w1, w5, m);
+  mpn_add_n (w1, w1, w5, m);  ASSERT (!(w1[0] & 1));
   mpn_rshift (w1, w1, m, 1);		/* w1>=0 now */
   mpn_sub_n (w5, w5, w1, m);
 
   /* These bounds are valid for the 4x4 polynomial product of toom44,
-   * and they are conservative for toom53 and toom42. */
+   * and they are conservative for toom53 and toom62. */
   ASSERT (w1[2*n] < 2);
   ASSERT (w2[2*n] < 3);
   ASSERT (w3[2*n] < 4);
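The W-sequence in the new comment is exact integer arithmetic: when the wi really are the stated values of a degree-6 polynomial, every division (by 2, 4, 3, 9, 15) comes out without remainder and the sequence ends with wi holding coefficient ci. A toy single-word model that runs the same steps and checks the recovery; plain long arithmetic, so it ignores the carry handling and two's-complement spans that the mpn code must manage:

#include <assert.h>
#include <stdio.h>

int
main (void)
{
  long c[7] = {3, 1, 4, 1, 5, 9, 2};	/* arbitrary small coefficients */
  long w0, w1, w2, w3, w4, w5, w6;
  int i;

  /* The seven values, in the roles used above: w0 = f(0), w1 = f(-2),
     w2 = f(1), w3 = f(-1), w4 = f(2), w5 = 64*f(1/2), w6 = f(oo). */
  w0 = c[0];
  w6 = c[6];
  for (w2 = 0, i = 6; i >= 0; i--)  w2 = w2 + c[i];
  for (w3 = 0, i = 6; i >= 0; i--)  w3 = c[i] - w3;
  for (w4 = 0, i = 6; i >= 0; i--)  w4 = 2*w4 + c[i];
  for (w1 = 0, i = 6; i >= 0; i--)  w1 = c[i] - 2*w1;
  for (w5 = 0, i = 0; i <= 6; i++)  w5 = 2*w5 + c[i];

  /* The W-sequence from the comment; every division is exact. */
  w5 += w4;               w1 = (w4 - w1) / 2;
  w4 -= w0;               w4 = (w4 - w1) / 4 - 16*w6;
  w3 = (w2 - w3) / 2;     w2 -= w3;

  w5 -= 65*w2;            w2 -= w6 + w0;
  w5 = (w5 + 45*w2) / 2;  w4 = (w4 - w2) / 3;
  w2 -= w4;

  w1 = w5 - w1;           w5 = (w5 - 8*w3) / 9;
  w3 -= w5;               w1 = (w1 / 15 + w5) / 2;
  w5 -= w1;

  assert (w1 == c[1] && w2 == c[2] && w3 == c[3]
	  && w4 == c[4] && w5 == c[5]);
  printf ("interpolation sequence recovers all coefficients\n");
  return 0;
}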