X-Git-Url: https://oss.titaniummirror.com/gitweb?a=blobdiff_plain;f=gmp%2Ftune%2Ftuneup.c;fp=gmp%2Ftune%2Ftuneup.c;h=06cb03c583c5eb33dacab96015a44e26aa7da6c4;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git diff --git a/gmp/tune/tuneup.c b/gmp/tune/tuneup.c new file mode 100644 index 00000000..06cb03c5 --- /dev/null +++ b/gmp/tune/tuneup.c @@ -0,0 +1,1900 @@ +/* Create tuned thresholds for various algorithms. + +Copyright 1999, 2000, 2001, 2002, 2003, 2005 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + + +/* Usage: tuneup [-t] [-t] [-p precision] + + -t turns on some diagnostic traces, a second -t turns on more traces. + + Notes: + + The code here isn't a vision of loveliness, mainly because it's subject + to ongoing changes according to new things wanting to be tuned, and + practical requirements of systems tested. + + Sometimes running the program twice produces slightly different results. + This is probably because there's so little separating algorithms near + their crossover, and on that basis it should make little or no difference + to the final speed of the relevant routines, but nothing has been done to + check that carefully. + + Algorithm: + + The thresholds are determined as follows. A crossover may not be a + single size but rather a range where it oscillates between method A or + method B faster. If the threshold is set making B used where A is faster + (or vice versa) that's bad. Badness is the percentage time lost and + total badness is the sum of this over all sizes measured. The threshold + is set to minimize total badness. + + Suppose, as sizes increase, method B becomes faster than method A. The + effect of the rule is that, as you look at increasing sizes, isolated + points where B is faster are ignored, but when it's consistently faster, + or faster on balance, then the threshold is set there. The same result + is obtained thinking in the other direction of A becoming faster at + smaller sizes. + + In practice the thresholds tend to be chosen to bring on the next + algorithm fairly quickly. + + This rule is attractive because it's got a basis in reason and is fairly + easy to implement, but no work has been done to actually compare it in + absolute terms to other possibilities. + + Implementation: + + In a normal library build the thresholds are constants. To tune them + selected objects are recompiled with the thresholds as global variables + instead. #define TUNE_PROGRAM_BUILD does this, with help from code at + the end of gmp-impl.h, and rules in tune/Makefile.am. + + MUL_KARATSUBA_THRESHOLD for example uses a recompiled mpn_mul_n. The + threshold is set to "size+1" to avoid karatsuba, or to "size" to use one + level, but recurse into the basecase. + + MUL_TOOM3_THRESHOLD makes use of the tuned MUL_KARATSUBA_THRESHOLD value. + Other routines in turn will make use of both of those. Naturally the + dependants must be tuned first. + + In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling, + just a threshold based on comparing two routines (mpn_divrem_1 and + mpn_divexact_1), and no further use of the value determined. + + Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being + just comparisons between certain routines on representative data. + + Shortcuts are applied when native (assembler) versions of routines exist. + For instance a native mpn_sqr_basecase is assumed to be always faster + than mpn_mul_basecase, with no measuring. + + No attempt is made to tune within assembler routines, for instance + DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be + written and tuned all by hand. Assembler routines that might have hard + limits are recompiled though, to make them accept a bigger range of sizes + than normal, eg. mpn_sqr_basecase to compare against mpn_kara_sqr_n. + + Limitations: + + The FFTs aren't subject to the same badness rule as the other thresholds, + so each k is probably being brought on a touch early. This isn't likely + to make a difference, and the simpler probing means fewer tests. + +*/ + +#define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */ + +#include "config.h" + +#include +#include +#include +#include +#if HAVE_UNISTD_H +#include +#endif + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#include "tests.h" +#include "speed.h" + +#if !HAVE_DECL_OPTARG +extern char *optarg; +extern int optind, opterr; +#endif + + +#define DEFAULT_MAX_SIZE 1000 /* limbs */ + +#if WANT_FFT +mp_size_t option_fft_max_size = 50000; /* limbs */ +#else +mp_size_t option_fft_max_size = 0; +#endif +int option_trace = 0; +int option_fft_trace = 0; +struct speed_params s; + +struct dat_t { + mp_size_t size; + double d; +} *dat = NULL; +int ndat = 0; +int allocdat = 0; + +/* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that + case use zero here, which for params.max_size means no limit. */ +#ifndef TUNE_SQR_KARATSUBA_MAX +#define TUNE_SQR_KARATSUBA_MAX 0 +#endif + +mp_size_t mul_karatsuba_threshold = MP_SIZE_T_MAX; +mp_size_t mul_toom3_threshold = MUL_TOOM3_THRESHOLD_LIMIT; +mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT; +mp_size_t mul_fft_threshold = MP_SIZE_T_MAX; +mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX; +mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX; +mp_size_t sqr_karatsuba_threshold + = (TUNE_SQR_KARATSUBA_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_KARATSUBA_MAX); +mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; +mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; +mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX; +mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX; +mp_size_t mullow_basecase_threshold = MP_SIZE_T_MAX; +mp_size_t mullow_dc_threshold = MP_SIZE_T_MAX; +mp_size_t mullow_mul_n_threshold = MP_SIZE_T_MAX; +mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX; +mp_size_t div_dc_threshold = MP_SIZE_T_MAX; +mp_size_t powm_threshold = MP_SIZE_T_MAX; +mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX; +mp_size_t hgcd_threshold = MP_SIZE_T_MAX; +mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX; +mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX; +mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX; +mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX; +mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX; +mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX; +mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX; +mp_size_t mod_1_1_threshold = MP_SIZE_T_MAX; +mp_size_t mod_1_2_threshold = MP_SIZE_T_MAX; +mp_size_t mod_1_3_threshold = MP_SIZE_T_MAX; +mp_size_t mod_1_4_threshold = MP_SIZE_T_MAX; +mp_size_t divrem_2_threshold = MP_SIZE_T_MAX; +mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX; +mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX; +mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX; +mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX; + +mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX; +mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX; + +struct param_t { + const char *name; + speed_function_t function; + speed_function_t function2; + double step_factor; /* how much to step sizes (rounded down) */ + double function_fudge; /* multiplier for "function" speeds */ + int stop_since_change; + double stop_factor; + mp_size_t min_size; + int min_is_always; + mp_size_t max_size; + mp_size_t check_size; + mp_size_t size_extra; + +#define DATA_HIGH_LT_R 1 +#define DATA_HIGH_GE_R 2 + int data_high; + + int noprint; +}; + + +/* These are normally undefined when false, which suits "#if" fine. + But give them zero values so they can be used in plain C "if"s. */ +#ifndef UDIV_PREINV_ALWAYS +#define UDIV_PREINV_ALWAYS 0 +#endif +#ifndef HAVE_NATIVE_mpn_divexact_1 +#define HAVE_NATIVE_mpn_divexact_1 0 +#endif +#ifndef HAVE_NATIVE_mpn_divrem_1 +#define HAVE_NATIVE_mpn_divrem_1 0 +#endif +#ifndef HAVE_NATIVE_mpn_divrem_2 +#define HAVE_NATIVE_mpn_divrem_2 0 +#endif +#ifndef HAVE_NATIVE_mpn_mod_1 +#define HAVE_NATIVE_mpn_mod_1 0 +#endif +#ifndef HAVE_NATIVE_mpn_modexact_1_odd +#define HAVE_NATIVE_mpn_modexact_1_odd 0 +#endif +#ifndef HAVE_NATIVE_mpn_preinv_divrem_1 +#define HAVE_NATIVE_mpn_preinv_divrem_1 0 +#endif +#ifndef HAVE_NATIVE_mpn_preinv_mod_1 +#define HAVE_NATIVE_mpn_preinv_mod_1 0 +#endif +#ifndef HAVE_NATIVE_mpn_sqr_basecase +#define HAVE_NATIVE_mpn_sqr_basecase 0 +#endif + + +#define MAX3(a,b,c) MAX (MAX (a, b), c) + +mp_limb_t +randlimb_norm (void) +{ + mp_limb_t n; + mpn_random (&n, 1); + n |= GMP_NUMB_HIGHBIT; + return n; +} + +#define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1) + +mp_limb_t +randlimb_half (void) +{ + mp_limb_t n; + mpn_random (&n, 1); + n &= GMP_NUMB_HALFMASK; + n += (n==0); + return n; +} + + +/* Add an entry to the end of the dat[] array, reallocing to make it bigger + if necessary. */ +void +add_dat (mp_size_t size, double d) +{ +#define ALLOCDAT_STEP 500 + + ASSERT_ALWAYS (ndat <= allocdat); + + if (ndat == allocdat) + { + dat = (struct dat_t *) __gmp_allocate_or_reallocate + (dat, allocdat * sizeof(dat[0]), + (allocdat+ALLOCDAT_STEP) * sizeof(dat[0])); + allocdat += ALLOCDAT_STEP; + } + + dat[ndat].size = size; + dat[ndat].d = d; + ndat++; +} + + +/* Return the threshold size based on the data accumulated. */ +mp_size_t +analyze_dat (int final) +{ + double x, min_x; + int j, min_j; + + /* If the threshold is set at dat[0].size, any positive values are bad. */ + x = 0.0; + for (j = 0; j < ndat; j++) + if (dat[j].d > 0.0) + x += dat[j].d; + + if (option_trace >= 2 && final) + { + printf ("\n"); + printf ("x is the sum of the badness from setting thresh at given size\n"); + printf (" (minimum x is sought)\n"); + printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x); + } + + min_x = x; + min_j = 0; + + + /* When stepping to the next dat[j].size, positive values are no longer + bad (so subtracted), negative values become bad (so add the absolute + value, meaning subtract). */ + for (j = 0; j < ndat; x -= dat[j].d, j++) + { + if (option_trace >= 2 && final) + printf ("size=%ld x=%.4f\n", (long) dat[j].size, x); + + if (x < min_x) + { + min_x = x; + min_j = j; + } + } + + return min_j; +} + + +/* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */ + +mp_limb_t mpn_divrem_1_tune + __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t)); +mp_limb_t mpn_mod_1_tune + __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t)); + +double +speed_mpn_mod_1_tune (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune); +} +double +speed_mpn_divrem_1_tune (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune); +} + + +double +tuneup_measure (speed_function_t fun, + const struct param_t *param, + struct speed_params *s) +{ + static struct param_t dummy; + double t; + TMP_DECL; + + if (! param) + param = &dummy; + + s->size += param->size_extra; + + TMP_MARK; + SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0); + SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0); + + mpn_random (s->xp, s->size); + mpn_random (s->yp, s->size); + + switch (param->data_high) { + case DATA_HIGH_LT_R: + s->xp[s->size-1] %= s->r; + s->yp[s->size-1] %= s->r; + break; + case DATA_HIGH_GE_R: + s->xp[s->size-1] |= s->r; + s->yp[s->size-1] |= s->r; + break; + } + + t = speed_measure (fun, s); + + s->size -= param->size_extra; + + TMP_FREE; + return t; +} + + +#define PRINT_WIDTH 28 + +void +print_define_start (const char *name) +{ + printf ("#define %-*s ", PRINT_WIDTH, name); + if (option_trace) + printf ("...\n"); +} + +void +print_define_end_remark (const char *name, mp_size_t value, const char *remark) +{ + if (option_trace) + printf ("#define %-*s ", PRINT_WIDTH, name); + + if (value == MP_SIZE_T_MAX) + printf ("MP_SIZE_T_MAX"); + else + printf ("%5ld", (long) value); + + if (remark != NULL) + printf (" /* %s */", remark); + printf ("\n"); +} + +void +print_define_end (const char *name, mp_size_t value) +{ + const char *remark; + if (value == MP_SIZE_T_MAX) + remark = "never"; + else if (value == 0) + remark = "always"; + else + remark = NULL; + print_define_end_remark (name, value, remark); +} + +void +print_define (const char *name, mp_size_t value) +{ + print_define_start (name); + print_define_end (name, value); +} + +void +print_define_remark (const char *name, mp_size_t value, const char *remark) +{ + print_define_start (name); + print_define_end_remark (name, value, remark); +} + + +void +one (mp_size_t *threshold, struct param_t *param) +{ + int since_positive, since_thresh_change; + int thresh_idx, new_thresh_idx; + +#define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0) + + DEFAULT (param->function_fudge, 1.0); + DEFAULT (param->function2, param->function); + DEFAULT (param->step_factor, 0.01); /* small steps by default */ + DEFAULT (param->stop_since_change, 80); + DEFAULT (param->stop_factor, 1.2); + DEFAULT (param->min_size, 10); + DEFAULT (param->max_size, DEFAULT_MAX_SIZE); + + if (param->check_size != 0) + { + double t1, t2; + s.size = param->check_size; + + *threshold = s.size+1; + t1 = tuneup_measure (param->function, param, &s); + + *threshold = s.size; + t2 = tuneup_measure (param->function2, param, &s); + if (t1 == -1.0 || t2 == -1.0) + { + printf ("Oops, can't run both functions at size %ld\n", + (long) s.size); + abort (); + } + t1 *= param->function_fudge; + + /* ask that t2 is at least 4% below t1 */ + if (t1 < t2*1.04) + { + if (option_trace) + printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2); + *threshold = MP_SIZE_T_MAX; + if (! param->noprint) + print_define (param->name, *threshold); + return; + } + + if (option_trace >= 2) + printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n", + (long) s.size, t1, t2); + } + + if (! param->noprint || option_trace) + print_define_start (param->name); + + ndat = 0; + since_positive = 0; + since_thresh_change = 0; + thresh_idx = 0; + + if (option_trace >= 2) + { + printf (" algorithm-A algorithm-B ratio possible\n"); + printf (" (seconds) (seconds) diff thresh\n"); + } + + for (s.size = param->min_size; + s.size < param->max_size; + s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), 1)) + { + double ti, tiplus1, d; + + /* If there's a size limit and it's reached then it should still + be sensible to analyze the data since we want the threshold put + either at or near the limit. */ + if (s.size >= param->max_size) + { + if (option_trace) + printf ("Reached maximum size (%ld) without otherwise stopping\n", + (long) param->max_size); + break; + } + + /* + FIXME: check minimum size requirements are met, possibly by just + checking for the -1 returns from the speed functions. + */ + + /* using method A at this size */ + *threshold = s.size+1; + ti = tuneup_measure (param->function, param, &s); + if (ti == -1.0) + abort (); + ti *= param->function_fudge; + + /* using method B at this size */ + *threshold = s.size; + tiplus1 = tuneup_measure (param->function2, param, &s); + if (tiplus1 == -1.0) + abort (); + + /* Calculate the fraction by which the one or the other routine is + slower. */ + if (tiplus1 >= ti) + d = (tiplus1 - ti) / tiplus1; /* negative */ + else + d = (tiplus1 - ti) / ti; /* positive */ + + add_dat (s.size, d); + + new_thresh_idx = analyze_dat (0); + + if (option_trace >= 2) + printf ("size=%ld %.9f %.9f % .4f %c %ld\n", + (long) s.size, ti, tiplus1, d, + ti > tiplus1 ? '#' : ' ', + (long) dat[new_thresh_idx].size); + + /* Stop if the last time method i was faster was more than a + certain number of measurements ago. */ +#define STOP_SINCE_POSITIVE 200 + if (d >= 0) + since_positive = 0; + else + if (++since_positive > STOP_SINCE_POSITIVE) + { + if (option_trace >= 1) + printf ("stopped due to since_positive (%d)\n", + STOP_SINCE_POSITIVE); + break; + } + + /* Stop if method A has become slower by a certain factor. */ + if (ti >= tiplus1 * param->stop_factor) + { + if (option_trace >= 1) + printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n", + param->stop_factor); + break; + } + + /* Stop if the threshold implied hasn't changed in a certain + number of measurements. (It's this condition that ususally + stops the loop.) */ + if (thresh_idx != new_thresh_idx) + since_thresh_change = 0, thresh_idx = new_thresh_idx; + else + if (++since_thresh_change > param->stop_since_change) + { + if (option_trace >= 1) + printf ("stopped due to since_thresh_change (%d)\n", + param->stop_since_change); + break; + } + + /* Stop if the threshold implied is more than a certain number of + measurements ago. */ +#define STOP_SINCE_AFTER 500 + if (ndat - thresh_idx > STOP_SINCE_AFTER) + { + if (option_trace >= 1) + printf ("stopped due to ndat - thresh_idx > amount (%d)\n", + STOP_SINCE_AFTER); + break; + } + + /* Stop when the size limit is reached before the end of the + crossover, but only show this as an error for >= the default max + size. FIXME: Maybe should make it a param choice whether this is + an error. */ + if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE) + { + fprintf (stderr, "%s\n", param->name); + fprintf (stderr, "sizes %ld to %ld total %d measurements\n", + (long) dat[0].size, (long) dat[ndat-1].size, ndat); + fprintf (stderr, " max size reached before end of crossover\n"); + break; + } + } + + if (option_trace >= 1) + printf ("sizes %ld to %ld total %d measurements\n", + (long) dat[0].size, (long) dat[ndat-1].size, ndat); + + *threshold = dat[analyze_dat (1)].size; + + if (param->min_is_always) + { + if (*threshold == param->min_size) + *threshold = 0; + } + + if (! param->noprint || option_trace) + print_define_end (param->name, *threshold); +} + + +/* Special probing for the fft thresholds. The size restrictions on the + FFTs mean the graph of time vs size has a step effect. See this for + example using + + ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9 + gnuplot foo.gnuplot + + The current approach is to compare routines at the midpoint of relevant + steps. Arguably a more sophisticated system of threshold data is wanted + if this step effect remains. */ + +struct fft_param_t { + const char *table_name; + const char *threshold_name; + const char *modf_threshold_name; + mp_size_t *p_threshold; + mp_size_t *p_modf_threshold; + mp_size_t first_size; + mp_size_t max_size; + speed_function_t function; + speed_function_t mul_function; + mp_size_t sqr; +}; + + +/* mpn_mul_fft requires pl a multiple of 2^k limbs, but with + N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple + of 2^(k-1) bits. */ + +mp_size_t +fft_step_size (int k) +{ + mp_size_t step; + + step = MAX ((mp_size_t) 1 << (k-1), BITS_PER_MP_LIMB) / BITS_PER_MP_LIMB; + step *= (mp_size_t) 1 << k; + + if (step <= 0) + { + printf ("Can't handle k=%d\n", k); + abort (); + } + + return step; +} + +mp_size_t +fft_next_size (mp_size_t pl, int k) +{ + mp_size_t m = fft_step_size (k); + +/* printf ("[k=%d %ld] %ld ->", k, m, pl); */ + + if (pl == 0 || (pl & (m-1)) != 0) + pl = (pl | (m-1)) + 1; + +/* printf (" %ld\n", pl); */ + return pl; +} + +void +fft (struct fft_param_t *p) +{ + mp_size_t size; + int i, k; + + for (i = 0; i < numberof (mpn_fft_table[p->sqr]); i++) + mpn_fft_table[p->sqr][i] = MP_SIZE_T_MAX; + + *p->p_threshold = MP_SIZE_T_MAX; + *p->p_modf_threshold = MP_SIZE_T_MAX; + + option_trace = MAX (option_trace, option_fft_trace); + + printf ("#define %s {", p->table_name); + if (option_trace >= 2) + printf ("\n"); + + k = FFT_FIRST_K; + size = p->first_size; + for (;;) + { + double tk, tk1; + + size = fft_next_size (size+1, k+1); + + if (size >= p->max_size) + break; + if (k >= FFT_FIRST_K + numberof (mpn_fft_table[p->sqr])) + break; + + /* compare k to k+1 in the middle of the current k+1 step */ + s.size = size + fft_step_size (k+1) / 2; + s.r = k; + tk = tuneup_measure (p->function, NULL, &s); + if (tk == -1.0) + abort (); + + s.r = k+1; + tk1 = tuneup_measure (p->function, NULL, &s); + if (tk1 == -1.0) + abort (); + + if (option_trace >= 2) + printf ("at %ld size=%ld k=%d %.9f k=%d %.9f\n", + (long) size, (long) s.size, k, tk, k+1, tk1); + + /* declare the k+1 threshold as soon as it's faster at its midpoint */ + if (tk1 < tk) + { + mpn_fft_table[p->sqr][k-FFT_FIRST_K] = s.size; + printf (" %ld,", (long) s.size); + if (option_trace >= 2) printf ("\n"); + k++; + } + } + + mpn_fft_table[p->sqr][k-FFT_FIRST_K] = 0; + printf (" 0 }\n"); + + + size = p->first_size; + + /* Declare an FFT faster than a plain toom4 etc multiplication found as + soon as one faster measurement obtained. A multiplication in the + middle of the FFT step is tested. */ + for (;;) + { + int modf = (*p->p_modf_threshold == MP_SIZE_T_MAX); + double tk, tm; + + /* k=7 should be the first FFT which can beat toom4 on a full + multiply, so jump to that threshold and save some probing after the + modf threshold is found. */ + if (!modf && size < mpn_fft_table[p->sqr][2]) + { + size = mpn_fft_table[p->sqr][2]; + if (option_trace >= 2) + printf ("jump to size=%ld\n", (long) size); + } + + size = fft_next_size (size+1, mpn_fft_best_k (size, p->sqr)); + k = mpn_fft_best_k (size, p->sqr); + + if (size >= p->max_size) + break; + + s.size = size + fft_step_size (k) / 2; + s.r = k; + tk = tuneup_measure (p->function, NULL, &s); + if (tk == -1.0) + abort (); + + if (!modf) s.size /= 2; + tm = tuneup_measure (p->mul_function, NULL, &s); + if (tm == -1.0) + abort (); + + if (option_trace >= 2) + printf ("at %ld size=%ld k=%d %.9f size=%ld %s mul %.9f\n", + (long) size, + (long) size + fft_step_size (k) / 2, k, tk, + (long) s.size, modf ? "modf" : "full", tm); + + if (tk < tm) + { + if (modf) + { + *p->p_modf_threshold = s.size; + print_define (p->modf_threshold_name, *p->p_modf_threshold); + } + else + { + *p->p_threshold = s.size; + print_define (p->threshold_name, *p->p_threshold); + break; + } + } + } + +} + + + +/* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2, + giving wrong results. */ +void +tune_mul (void) +{ + static struct param_t param; + + param.function = speed_mpn_mul_n; + + param.name = "MUL_KARATSUBA_THRESHOLD"; + param.min_size = MAX (4, MPN_KARA_MUL_N_MINSIZE); + param.max_size = MUL_KARATSUBA_THRESHOLD_LIMIT-1; + one (&mul_karatsuba_threshold, ¶m); + + param.name = "MUL_TOOM3_THRESHOLD"; + param.min_size = MAX (mul_karatsuba_threshold, MPN_TOOM3_MUL_N_MINSIZE); + param.max_size = MUL_TOOM3_THRESHOLD_LIMIT-1; + one (&mul_toom3_threshold, ¶m); + + param.name = "MUL_TOOM44_THRESHOLD"; + param.min_size = MAX (mul_toom3_threshold, MPN_TOOM44_MUL_N_MINSIZE); + param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1; + one (&mul_toom44_threshold, ¶m); + + /* disabled until tuned */ + MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; +} + + +/* This was written by the tuneup challenged tege. Kevin, please delete + this comment when you've reviewed/rewritten this. :-) */ +void +tune_mullow (void) +{ + static struct param_t param; + + param.function = speed_mpn_mullow_n; + + param.name = "MULLOW_BASECASE_THRESHOLD"; + param.min_size = 3; + param.min_is_always = 1; + param.max_size = MULLOW_BASECASE_THRESHOLD_LIMIT-1; + one (&mullow_basecase_threshold, ¶m); + + param.min_is_always = 0; /* ??? */ + + param.name = "MULLOW_DC_THRESHOLD"; + param.min_size = mul_karatsuba_threshold; + param.max_size = 1000; + one (&mullow_dc_threshold, ¶m); + + param.name = "MULLOW_MUL_N_THRESHOLD"; + param.min_size = mullow_dc_threshold; + param.max_size = 2000; + one (&mullow_mul_n_threshold, ¶m); +} + + +/* Start the basecase from 3, since 1 is a special case, and if mul_basecase + is faster only at size==2 then we don't want to bother with extra code + just for that. Start karatsuba from 4 same as MUL above. */ + +void +tune_sqr (void) +{ + /* disabled until tuned */ + SQR_FFT_THRESHOLD = MP_SIZE_T_MAX; + + if (HAVE_NATIVE_mpn_sqr_basecase) + { + print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)"); + sqr_basecase_threshold = 0; + } + else + { + static struct param_t param; + param.name = "SQR_BASECASE_THRESHOLD"; + param.function = speed_mpn_sqr_n; + param.min_size = 3; + param.min_is_always = 1; + param.max_size = TUNE_SQR_KARATSUBA_MAX; + param.noprint = 1; + one (&sqr_basecase_threshold, ¶m); + } + + { + static struct param_t param; + param.name = "SQR_KARATSUBA_THRESHOLD"; + param.function = speed_mpn_sqr_n; + param.min_size = MAX (4, MPN_KARA_SQR_N_MINSIZE); + param.max_size = TUNE_SQR_KARATSUBA_MAX; + param.noprint = 1; + one (&sqr_karatsuba_threshold, ¶m); + + if (! HAVE_NATIVE_mpn_sqr_basecase + && sqr_karatsuba_threshold < sqr_basecase_threshold) + { + /* Karatsuba becomes faster than mul_basecase before + sqr_basecase does. Arrange for the expression + "BELOW_THRESHOLD (un, SQR_KARATSUBA_THRESHOLD))" which + selects mpn_sqr_basecase in mpn_sqr_n to be false, by setting + SQR_KARATSUBA_THRESHOLD to zero, making + SQR_BASECASE_THRESHOLD the karatsuba threshold. */ + + sqr_basecase_threshold = SQR_KARATSUBA_THRESHOLD; + SQR_KARATSUBA_THRESHOLD = 0; + + print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold, + "karatsuba"); + print_define_remark ("SQR_KARATSUBA_THRESHOLD",SQR_KARATSUBA_THRESHOLD, + "never sqr_basecase"); + } + else + { + if (! HAVE_NATIVE_mpn_sqr_basecase) + print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold); + print_define ("SQR_KARATSUBA_THRESHOLD", SQR_KARATSUBA_THRESHOLD); + } + } + + { + static struct param_t param; + mp_size_t toom3_start = MAX (sqr_karatsuba_threshold, sqr_basecase_threshold); + + param.function = speed_mpn_sqr_n; + + param.name = "SQR_TOOM3_THRESHOLD"; + param.min_size = MAX (toom3_start, MPN_TOOM3_SQR_N_MINSIZE); + param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1; + one (&sqr_toom3_threshold, ¶m); + + param.name = "SQR_TOOM4_THRESHOLD"; + param.min_size = MAX (sqr_toom3_threshold, MPN_TOOM4_SQR_N_MINSIZE); + param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1; + one (&sqr_toom4_threshold, ¶m); + } +} + + +void +tune_sb_preinv (void) +{ + static struct param_t param; + + if (GMP_NAIL_BITS != 0) + { + DIV_SB_PREINV_THRESHOLD = MP_SIZE_T_MAX; + print_define_remark ("DIV_SB_PREINV_THRESHOLD", MP_SIZE_T_MAX, + "no preinv with nails"); + return; + } + + if (UDIV_PREINV_ALWAYS) + { + print_define_remark ("DIV_SB_PREINV_THRESHOLD", 0L, "preinv always"); + return; + } + + param.check_size = 256; + param.min_size = 3; + param.min_is_always = 1; + param.size_extra = 3; + param.stop_factor = 2.0; + param.name = "DIV_SB_PREINV_THRESHOLD"; + param.function = speed_mpn_sb_divrem_m3; + one (&div_sb_preinv_threshold, ¶m); +} + + +void +tune_dc (void) +{ + static struct param_t param; + param.name = "DIV_DC_THRESHOLD"; + param.function = speed_mpn_dc_tdiv_qr; + param.step_factor = 0.02; + one (&div_dc_threshold, ¶m); +} + + +/* This is an indirect determination, based on a comparison between redc and + mpz_mod. A fudge factor of 1.04 is applied to redc, to represent + additional overheads it gets in mpz_powm. + + stop_factor is 1.1 to hopefully help cray vector systems, where otherwise + currently it hits the 1000 limb limit with only a factor of about 1.18 + (threshold should be around 650). */ + +void +tune_powm (void) +{ + static struct param_t param; + param.name = "POWM_THRESHOLD"; + param.function = speed_mpn_redc_1; + param.function2 = speed_mpz_mod; + param.step_factor = 0.03; + param.stop_factor = 1.1; + param.function_fudge = 1.04; + one (&powm_threshold, ¶m); +} + + +void +tune_matrix22_mul (void) +{ + static struct param_t param; + param.name = "MATRIX22_STRASSEN_THRESHOLD"; + param.function = speed_mpn_matrix22_mul; + param.min_size = 2; + one (&matrix22_strassen_threshold, ¶m); +} + +void +tune_hgcd (void) +{ + static struct param_t param; + param.name = "HGCD_THRESHOLD"; + param.function = speed_mpn_hgcd; + /* We seem to get strange results for small sizes */ + param.min_size = 30; + one (&hgcd_threshold, ¶m); +} + +#if 0 +void +tune_gcd_accel (void) +{ + static struct param_t param; + param.name = "GCD_ACCEL_THRESHOLD"; + param.function = speed_mpn_gcd; + param.min_size = 1; + one (&gcd_accel_threshold, ¶m); +} +#endif +void +tune_gcd_dc (void) +{ + static struct param_t param; + param.name = "GCD_DC_THRESHOLD"; + param.function = speed_mpn_gcd; + param.min_size = hgcd_threshold; + param.max_size = 3000; + param.step_factor = 0.02; + one (&gcd_dc_threshold, ¶m); +} + +void +tune_gcdext_dc (void) +{ + static struct param_t param; + param.name = "GCDEXT_DC_THRESHOLD"; + param.function = speed_mpn_gcdext; + param.min_size = hgcd_threshold; + param.max_size = 3000; + param.step_factor = 0.02; + one (&gcdext_dc_threshold, ¶m); +} + + +/* size_extra==1 reflects the fact that with high> 4); + if (s.r == 0) s.r = 123; + + t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s); + t2 = tuneup_measure (divrem_1, ¶m, &s); + if (t1 == -1.0 || t2 == -1.0) + { + printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n", + divrem_1_name, (long) s.size); + abort (); + } + if (option_trace >= 1) + printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n", + (long) s.size, t1, divrem_1_name, t2); + + print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL); +} + + +/* A non-zero MOD_1_UNNORM_THRESHOLD (or MOD_1_NORM_THRESHOLD) would imply + that udiv_qrnnd_preinv is worth using, but it seems most straightforward + to compare mpn_preinv_mod_1 and mpn_mod_1_div directly. */ + +void +tune_preinv_mod_1 (void) +{ + static struct param_t param; + speed_function_t mod_1; + const char *mod_1_name; + double t1, t2; + + /* Any native version of mpn_preinv_mod_1 is assumed to exist because it's + faster than mpn_mod_1. */ + if (HAVE_NATIVE_mpn_preinv_mod_1) + { + print_define_remark ("USE_PREINV_MOD_1", 1, "native"); + return; + } + + if (GMP_NAIL_BITS != 0) + { + print_define_remark ("USE_PREINV_MOD_1", 0, "no preinv with nails"); + return; + } + + /* If udiv_qrnnd_preinv is the only division method then of course + mpn_preinv_mod_1 should be used. */ + if (UDIV_PREINV_ALWAYS) + { + print_define_remark ("USE_PREINV_MOD_1", 1, "preinv always"); + return; + } + + /* If we've got an assembler version of mpn_mod_1, then compare against + that, not the mpn_mod_1_div generic C. */ + if (HAVE_NATIVE_mpn_mod_1) + { + mod_1 = speed_mpn_mod_1; + mod_1_name = "mpn_mod_1"; + } + else + { + mod_1 = speed_mpn_mod_1_div; + mod_1_name = "mpn_mod_1_div"; + } + + param.data_high = DATA_HIGH_LT_R; /* let mpn_mod_1 skip one division */ + s.size = 200; /* generous but not too big */ + s.r = randlimb_norm(); /* divisor */ + + t1 = tuneup_measure (speed_mpn_preinv_mod_1, ¶m, &s); + t2 = tuneup_measure (mod_1, ¶m, &s); + if (t1 == -1.0 || t2 == -1.0) + { + printf ("Oops, can't measure mpn_preinv_mod_1 and %s at %ld\n", + mod_1_name, (long) s.size); + abort (); + } + if (option_trace >= 1) + printf ("size=%ld, mpn_preinv_mod_1 %.9f, %s %.9f\n", + (long) s.size, t1, mod_1_name, t2); + + print_define_remark ("USE_PREINV_MOD_1", (mp_size_t) (t1 < t2), NULL); +} + + +void +tune_divrem_2 (void) +{ + static struct param_t param; + + /* No support for tuning native assembler code, do that by hand and put + the results in the .asm file, and there's no need for such thresholds + to appear in gmp-mparam.h. */ + if (HAVE_NATIVE_mpn_divrem_2) + return; + + if (GMP_NAIL_BITS != 0) + { + print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX, + "no preinv with nails"); + return; + } + + if (UDIV_PREINV_ALWAYS) + { + print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always"); + return; + } + + /* Tune for the integer part of mpn_divrem_2. This will very possibly be + a bit out for the fractional part, but that's too bad, the integer part + is more important. + + min_size must be >=2 since nsize>=2 is required, but is set to 4 to save + code space if plain division is better only at size==2 or size==3. */ + param.name = "DIVREM_2_THRESHOLD"; + param.check_size = 256; + param.min_size = 4; + param.min_is_always = 1; + param.size_extra = 2; /* does qsize==nsize-2 divisions */ + param.stop_factor = 2.0; + + s.r = randlimb_norm (); + param.function = speed_mpn_divrem_2; + one (&divrem_2_threshold, ¶m); +} + + +/* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so + tune for that. Its speed can differ on odd or even divisor, so take an + average threshold for the two. + + mpn_divrem_1 can vary with high= 1) + printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1); + + t2 = tuneup_measure (speed_mpn_jacobi_base_2, ¶m, &s); + if (option_trace >= 1) + printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2); + + t3 = tuneup_measure (speed_mpn_jacobi_base_3, ¶m, &s); + if (option_trace >= 1) + printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3); + + if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0) + { + printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n", + (long) s.size); + abort (); + } + + if (t1 < t2 && t1 < t3) + method = 1; + else if (t2 < t3) + method = 2; + else + method = 3; + + print_define ("JACOBI_BASE_METHOD", method); +} + + +void +tune_get_str (void) +{ + /* Tune for decimal, it being most common. Some rough testing suggests + other bases are different, but not by very much. */ + s.r = 10; + { + static struct param_t param; + GET_STR_PRECOMPUTE_THRESHOLD = 0; + param.name = "GET_STR_DC_THRESHOLD"; + param.function = speed_mpn_get_str; + param.min_size = 4; + param.max_size = GET_STR_THRESHOLD_LIMIT; + one (&get_str_dc_threshold, ¶m); + } + { + static struct param_t param; + param.name = "GET_STR_PRECOMPUTE_THRESHOLD"; + param.function = speed_mpn_get_str; + param.min_size = GET_STR_DC_THRESHOLD; + param.max_size = GET_STR_THRESHOLD_LIMIT; + one (&get_str_precompute_threshold, ¶m); + } +} + + +double +speed_mpn_pre_set_str (struct speed_params *s) +{ + unsigned char *str; + mp_ptr wp; + mp_size_t wn; + unsigned i; + int base; + double t; + mp_ptr powtab_mem, tp; + powers_t powtab[GMP_LIMB_BITS]; + mp_size_t un; + int chars_per_limb; + TMP_DECL; + + SPEED_RESTRICT_COND (s->size >= 1); + + base = s->r == 0 ? 10 : s->r; + SPEED_RESTRICT_COND (base >= 2 && base <= 256); + + TMP_MARK; + + str = TMP_ALLOC (s->size); + for (i = 0; i < s->size; i++) + str[i] = s->xp[i] % base; + + wn = ((mp_size_t) (s->size / __mp_bases[base].chars_per_bit_exactly)) + / BITS_PER_MP_LIMB + 2; + SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); + + /* use this during development to check wn is big enough */ + /* + ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn); + */ + + speed_operand_src (s, (mp_ptr) str, s->size/BYTES_PER_MP_LIMB); + speed_operand_dst (s, wp, wn); + speed_cache_fill (s); + + chars_per_limb = __mp_bases[base].chars_per_limb; + un = s->size / chars_per_limb + 1; + powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un)); + mpn_set_str_compute_powtab (powtab, powtab_mem, un, base); + tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un)); + + speed_starttime (); + i = s->reps; + do + { + mpn_pre_set_str (wp, str, s->size, powtab, tp); + } + while (--i != 0); + t = speed_endtime (); + + TMP_FREE; + return t; +} + +void +tune_set_str (void) +{ + static struct param_t param; + + s.r = 10; /* decimal */ + { + static struct param_t param; + SET_STR_PRECOMPUTE_THRESHOLD = 0; + param.step_factor = 0.01; + param.name = "SET_STR_DC_THRESHOLD"; + param.function = speed_mpn_pre_set_str; + param.min_size = 100; + param.max_size = 50000; + one (&set_str_dc_threshold, ¶m); + } + { + static struct param_t param; + param.step_factor = 0.02; + param.name = "SET_STR_PRECOMPUTE_THRESHOLD"; + param.function = speed_mpn_set_str; + param.min_size = SET_STR_DC_THRESHOLD; + param.max_size = 100000; + one (&set_str_precompute_threshold, ¶m); + } +} + + +void +tune_fft_mul (void) +{ + static struct fft_param_t param; + + if (option_fft_max_size == 0) + return; + + param.table_name = "MUL_FFT_TABLE"; + param.threshold_name = "MUL_FFT_THRESHOLD"; + param.p_threshold = &mul_fft_threshold; + param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD"; + param.p_modf_threshold = &mul_fft_modf_threshold; + param.first_size = MUL_TOOM3_THRESHOLD / 2; + param.max_size = option_fft_max_size; + param.function = speed_mpn_mul_fft; + param.mul_function = speed_mpn_mul_n; + param.sqr = 0; + fft (¶m); +} + + +void +tune_fft_sqr (void) +{ + static struct fft_param_t param; + + if (option_fft_max_size == 0) + return; + + param.table_name = "SQR_FFT_TABLE"; + param.threshold_name = "SQR_FFT_THRESHOLD"; + param.p_threshold = &sqr_fft_threshold; + param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD"; + param.p_modf_threshold = &sqr_fft_modf_threshold; + param.first_size = SQR_TOOM3_THRESHOLD / 2; + param.max_size = option_fft_max_size; + param.function = speed_mpn_mul_fft_sqr; + param.mul_function = speed_mpn_sqr_n; + param.sqr = 0; + fft (¶m); +} + + +void +all (void) +{ + time_t start_time, end_time; + TMP_DECL; + + TMP_MARK; + SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0); + SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0); + + mpn_random (s.xp_block, SPEED_BLOCK_SIZE); + mpn_random (s.yp_block, SPEED_BLOCK_SIZE); + + fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST); + + speed_time_init (); + fprintf (stderr, "Using: %s\n", speed_time_string); + + fprintf (stderr, "speed_precision %d", speed_precision); + if (speed_unittime == 1.0) + fprintf (stderr, ", speed_unittime 1 cycle"); + else + fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime); + if (speed_cycletime == 1.0 || speed_cycletime == 0.0) + fprintf (stderr, ", CPU freq unknown\n"); + else + fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime); + + fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n", + DEFAULT_MAX_SIZE, (long) option_fft_max_size); + fprintf (stderr, "\n"); + + time (&start_time); + { + struct tm *tp; + tp = localtime (&start_time); + printf ("/* Generated by tuneup.c, %d-%02d-%02d, ", + tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday); + +#ifdef __GNUC__ + /* gcc sub-minor version doesn't seem to come through as a define */ + printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__); +#define PRINTED_COMPILER +#endif +#if defined (__SUNPRO_C) + printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100); +#define PRINTED_COMPILER +#endif +#if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION) + /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */ + printf ("MIPSpro C %d.%d.%d */\n", + _COMPILER_VERSION / 100, + _COMPILER_VERSION / 10 % 10, + _COMPILER_VERSION % 10); +#define PRINTED_COMPILER +#endif +#if defined (__DECC) && defined (__DECC_VER) + printf ("DEC C %d */\n", __DECC_VER); +#define PRINTED_COMPILER +#endif +#if ! defined (PRINTED_COMPILER) + printf ("system compiler */\n"); +#endif + } + printf ("\n"); + + tune_mul (); + printf("\n"); + + tune_sqr (); + printf("\n"); + + tune_mullow (); + printf("\n"); + + tune_sb_preinv (); + tune_dc (); + tune_powm (); + printf("\n"); + + tune_matrix22_mul (); + tune_hgcd (); + tune_gcd_dc (); + tune_gcdext_dc (); +#if 0 + tune_gcd_accel (); +#endif + tune_jacobi_base (); + printf("\n"); + + tune_divrem_1 (); + tune_mod_1 (); + tune_preinv_divrem_1 (); + tune_preinv_mod_1 (); + tune_divrem_2 (); + tune_divexact_1 (); + tune_modexact_1_odd (); + printf("\n"); + + tune_get_str (); + tune_set_str (); + printf("\n"); + + tune_fft_mul (); + printf("\n"); + + tune_fft_sqr (); + printf ("\n"); + + time (&end_time); + printf ("/* Tuneup completed successfully, took %ld seconds */\n", + end_time - start_time); + + TMP_FREE; +} + + +int +main (int argc, char *argv[]) +{ + int opt; + + /* Unbuffered so if output is redirected to a file it isn't lost if the + program is killed part way through. */ + setbuf (stdout, NULL); + setbuf (stderr, NULL); + + while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF) + { + switch (opt) { + case 'f': + if (optarg[0] == 't') + option_fft_trace = 2; + else + option_fft_max_size = atol (optarg); + break; + case 'o': + speed_option_set (optarg); + break; + case 'p': + speed_precision = atoi (optarg); + break; + case 't': + option_trace++; + break; + case '?': + exit(1); + } + } + + all (); + exit (0); +}