Index: modules/math/numtheory.dtx ================================================================== --- modules/math/numtheory.dtx +++ modules/math/numtheory.dtx @@ -1,16 +1,16 @@ % % \iffalse % %<*pkg> -%% Copyright (c) 2010 by Lars Hellstrom. All rights reserved. +%% Copyright (c) 2010, 2015 by Lars Hellstrom. All rights reserved. %% See the file "license.terms" for information on usage and redistribution %% of this file, and for a DISCLAIMER OF ALL WARRANTIES. % %<*driver> \documentclass{tclldoc} -\usepackage{amsmath,amsfonts} +\usepackage{amsmath,amsfonts,amssymb} \usepackage{url} \newcommand{\Tcl}{\Tcllogo} \begin{document} \DocInput{numtheory.dtx} \end{document} @@ -21,13 +21,13 @@ % \author{Lars Hellstr\"om} % \date{30 May 2010} % \maketitle % % \begin{abstract} -% This package provides a command to test whether an integer is a -% prime, but may in time come to house also other number-theoretic -% operations. +% This package provides operations related to the number theory +% branch of mathematics, notably a command to test whether an integer +% is a prime. % \end{abstract} % % \tableofcontents % % \section*{Preliminaries} @@ -35,14 +35,13 @@ % \begin{tcl} %<*pkg> package require Tcl 8.5 % \end{tcl} % \Tcl~8.4 is seriously broken with respect to arithmetic overflow, -% so we require 8.5. There are (as yet) no explicit 8.5-isms in the -% code, however. +% so we require 8.5. % \begin{tcl} -package provide math::numtheory 1.0 +package provide math::numtheory 1.1 namespace eval ::math::numtheory { namespace export isprime } % % \end{tcl} @@ -61,18 +60,21 @@ % \end{tcl} % % And the same is true for the manpage. % \begin{tcl} %<*man> -[manpage_begin math::numtheory n 1.0] +[manpage_begin math::numtheory n 1.1] +[keywords {modular exponentiation}] +[keywords {number theory}] +[keywords prime] [copyright "2010 Lars Hellstr\u00F6m\ "] [moddesc {Tcl Math Library}] [titledesc {Number Theory}] [category Mathematics] [require Tcl [opt 8.5]] -[require math::numtheory [opt 1.0]] +[require math::numtheory [opt 1.1]] [description] [para] This package is for collecting various number-theoretic operations, though at the moment it only provides that of testing whether an @@ -79,10 +81,1380 @@ integer is a prime. [list_begin definitions] % % \end{tcl} +% +% +% \section{Modular arithmetic} +% +% Many number-theoretic algorithms make use of arithmetic modulo some +% (possibly quite large) integer $N$, so there is a point in having +% efficient implementations of this available. \Tcl\ actually comes +% with C implementations of much of the following, as part of the +% tommath library it uses for integer arithmetic, but unfortunately +% those are not (currently) exposed on the \Tcl\ level. +% +% +% \subsection{Barrett reduction} +% +% The basic operation for modular arithmetic is computing the +% remainder, i.e., |expr|'s |%| operation. Unfortunately, that is +% quite often not as fast as one would like it to be. +% +% The main catch is that while libtommath implements Karabatsura +% multiplication of integers to achieve an asymptotic complexity of +% $O(n^{1.585})$ (where $n$ is the bit-size of the inputs to the +% multiplication operation), it (currently) only implements a +% traditional $O(n^2)$ algorithm for integer division, so |/| and |%| +% (at the C level, there is a single function which calculates both) +% can be orders of magnitude slower than |*|. There is however also +% more generally the catch that even though division asymptotically +% has the same theoretical complexity as multiplication, the constant +% factor associated with division is several times larger than that +% of multiplication. +% +% Barrett reduction is an algorithm for computing the remainder which +% avoids the division (instead doing just two multiplications), but +% which requires certain auxiliary constants $M$ and $s$ in addition +% to the actual denominator $N$. Computing these is feasible if the +% same denominator $N$ is going to be used in several remainder +% computations, and as it turns out, that is frequently the case. +% +% \begin{proc}{Barrett_mod} +% The basic idea for Barrett reduction is one that may be familiar +% from non-integer arithmetic, namely to replace a division $a/N$ +% by a multiplication $a \cdot \frac{1}{N}$; by precomputing the +% inverse $\frac{1}{N}$, the division can be avoided (or at least +% only carried out once). There is of course the slight complication +% that \Tcl\ cannot naively represent non-integers such as $1/N$ +% with the necessary precision, so we'll express that as a +% fixed-point number instead. This leads to the formula +% \begin{equation} +% a - \left\lfloor a \cdot \frac{M}{2^s} \right\rfloor \cdot N +% = +% \texttt{\$a - ((\$a * \$M) >> \$s) * \$N} +% \quad +% \text{where \(M 2^{-s} \approx 1/N\)} +% \end{equation} +% for computing the remainder $a \bmod N$. That $M 2^{-s}$ is not +% exactly $N^{-1}$ does however leave some room for rounding errors +% that would cause the computed quotient $a/N$ to be slightly off. +% This is countered in a correction step after the main calculation, +% and by having $N^{-1}$ rounded upwards it can be ensured that +% incorrect remainders always end up negative, which is easy to +% test for. Hence we arrive at the following procedure. +% \begin{tcl} +%<*pkg> +proc ::math::numtheory::Barrett_mod {N invN shift a} { + set r [expr {$a - (($a * $invN) >> $shift) * $N}] + if {$r<0} then {incr r $N} + return $r +} +% \end{tcl} +% The reason for picking this sequence of arguments is that it is +% natural to package Barrett reduction modulo $N$ as a command +% prefix that takes the $a$ operand as its extra argument, so that +% it could be called like +% \begin{quote} +% |{*}$modcmd $a| +% \end{quote} +% \end{proc} +% +% \begin{proc}{Barrett_parameters} +% For computing the extra parameters, there is a procedure with +% call syntax +% \begin{quote} +% |Barrett_parameters| \word{$N$} +% \end{quote} +% where $N$ is a positive integer. The return value is a list +% \begin{quote} +% \word{$N$} \word{$M$} \word{$s$} +% \end{quote} +% where the elements satisfy \(s \geqslant 2 \log_2 N\) and +% \(MN \geqslant 2^s > (M -\nobreak 1) N\), i.e., $M$ is +% $\lceil 2^s/N \rceil$ as needed for |Barrett_mod|. +% +% In \Tcl, the most convenient way of computing an approximate +% $2$-logarithm (i.e., bitsize) of a large integer is to |format| +% said integer in hexadecimal: take the string length of the number +% times the number of bits per digit. This won't be a tight bound, +% but the overhead is not in the leading term of the complexity. +% To do an integer division rounded upwards, one can swap the sign +% of the numerator, divide, and then swap the sign again. +% \begin{tcl} +proc ::math::numtheory::Barrett_parameters {N} { + set s [expr {8*[string length [format %llx $N]]}] + set M [expr {-( (-1 << $s) / $N)}] + return [list $N $M $s] +} +% +% \end{tcl} +% An alternative approach here, if one \emph{really} wants to avoid +% big integer division, would be to use a Newton--Raphson iteration +% for computing $M$, but that should (if implemented) really be +% exposed publicly as a utility operation in its own right. +% \end{proc} +% +% The main limitation of Barrett reduction is that the finite +% precision of $M 2^{-s}$ as an approximation of $N^{-1}$ means $a M +% 2^{-s}$ slowly but surely deviates from $a/N$ as \(a \to \infty\). +% The correction step of adding $N$ to the remainder $r$ is +% equivalent to adjusting the quotient to counter this, but it cannot +% adjust the quotient by more than $1$. However, Barrett reduction is +% provably correct for all $a$ up to \(2^s \geqslant N^2\), which is the +% range of arguments that one encounters in implementing multiplication +% modulo $N$. To see this, let \(0 \leqslant a < 2^s\) be arbitrary. Let +% \(\delta = M2^{-s} - 1/N\) and \(\varepsilon = aM/2^s - \bigl\lfloor +% aM/2^s \bigr\rfloor\). Then the pre-adjustment computed remainder is +% \begin{multline*} +% r +% = +% a - \left\lfloor \frac{a M}{2^s} \right\rfloor N +% = +% a - \left( \frac{a M}{2^s} - \varepsilon \right) N +% = \\ = +% a - a \frac{M}{2^s} N + \varepsilon N +% = +% a - a \left( \frac{1}{N} + \delta \right) N + \varepsilon N +% = +% (-a \delta + \varepsilon) N +% \text{.} +% \end{multline*} +% By definition \(0 \leqslant \varepsilon < 1\). For $\delta$, one +% finds +% \begin{equation*} +% \delta +% = +% \frac{M}{2^s} - \frac{1}{N} +% = +% \frac{MN - 2^s}{2^s N} +% < +% \frac{MN - (M-1)N}{2^s N} +% = +% \frac{1}{2^s} +% \end{equation*} +% and hence \(1 > a\delta \geqslant 0\). This implies \(-1 < -a\delta +% + \varepsilon < 1\) and thus \(-N < r < N\), ensuring that the +% result of |Barrett_mod| is nonnegative and less than $N$ for \(a < +% N^2\). +% +% \begin{proc}{make_modulo} +% Packaging it all up, the primary user interface is an ensemble +% |make_modulo| that can be asked to do the preparation work for +% us. +% \begin{tcl} +%<*man> +[call [cmd math::numtheory::make_modulo] [arg subcmd] [arg N]] + The [cmd make_modulo] command precomputes some data that are useful + for quickly computing remainders modulo [arg N], which must be a + positive integer. The format of the return value depends on the + [arg subcmd]. This kind of preparation is primarily of interest if + one needs to compute many remainders modulo the same [arg N], but + that turns out to be quite common. + + The [arg subcmd]s are: + [list_begin commands] + [cmd_def prefix] + Returns a [emph {command prefix}] that expects one extra argument + [arg x]. + [example_begin]{*}[arg modprefix] [arg x][example_end] + This argument [arg x] is an integer which should be nonnegative + and less than [arg N]**2. The result of that call will be the + remainder of [arg x] divided by [arg N]. + [cmd_def mulprefix] + Returns a command prefix that expects two extra arguments + [arg x] and [arg y]. + [example_begin]{*}[arg mulmodprefix] [arg x] [arg y][example_end] + These arguments are integers whose product should be nonnegative + and less than [arg N]**2 (typically, [arg x] and [arg y] are both + less than [arg N]), and the result of that call will be the + remainder of [arg x]*[arg y] divided by [arg N]. + [cmd_def Barrett-parameters] + Returns a list of three integers [arg N], [arg M], and [arg s] + that are what one needs in order to perform Barrett reduction + ([uri {http://en.wikipedia.org/wiki/Barrett_reduction}]). [arg N] + is the argument. The shift amount [arg s] satisfies 2**([arg s]/2) + > [arg N]. The scaled reciprocal [arg M] satisfies + [arg M]*[arg N] >= 2**[arg s] > ([arg M]-1)*[arg N]. + [list_end] + +% +% \end{tcl} +% Creating the ensemble is pretty straightforward. An explicit +% |-map| is used to adjust the command names. +% \begin{tcl} +%<*pkg> +namespace eval ::math::numtheory { + namespace ensemble create -command make_modulo -map { + Barrett-parameters Barrett_parameters + prefix {Make_modulo_prefix 0} + mulprefix {Make_modulo_prefix 1} + } +} +% \end{tcl} +% \end{proc} +% +% \begin{proc}{Barrett_mulmod} +% Obviously a command prefix that multiplies two arguments ends up +% being implemented using a different command prefix than that +% which does not, so |Barrett_mod| gets an obvious sibling. +% \begin{tcl} +proc ::math::numtheory::Barrett_mulmod {N invN shift x y} { + set a [expr {$x*$y}] + set r [expr {$a - (($a * $invN) >> $shift) * $N}] + if {$r<0} then {incr r $N} + return $r +} +% \end{tcl} +% The reason for not using one command and having it check its +% number of arguments is purely for speed: this is something that +% is done in various inner loops, so we really don't want it to +% deal with variable syntax. +% \end{proc} +% +% \begin{proc}{Plain_mod} +% \begin{proc}{Plain_mulmod} +% The reason that there is an option to get multiplication integrated +% with the modulo operation is also speed, but under a slightly +% different line of reasoning. Barrett reduction is useful for large +% $N$ where |expr|'s built-in |%| grows comparatively slow, but for +% smaller $N$ (which would still seem very large to a human having to +% calculate with it) the overhead of evaluating more \Tcl\ commands +% is the dominant factor. Hence a quite competitive alternative would +% be to just do the obvious |expr| call, and in that case it is nice +% to do two operations in one |expr|. +% \begin{tcl} +proc ::math::numtheory::Plain_mod {N a} {expr {$a % $N}} +proc ::math::numtheory::Plain_mulmod {N x y} {expr {$x*$y % $N}} +% +% \end{tcl} +% \end{proc}\end{proc} +% +% So which implementation should one choose for which $N$? Only +% |time| can tell, so we'll have to do a bit of experimentation. +% \begin{tcl} +%<*experiment> +% \end{tcl} +% First, we need a selection of integers of varying sizes. Easy +% enough to build as strings, but let's do some bitlogic afterwards +% to reduce the risk of easy proportions (and incidentally ensure +% that they have a numeric internal representation). Adding the +% |llength| is to ensure that the numbers do not all have the same +% parity. +% \begin{tcl} +set numL {} +set digits 1 +set last 12345 +while {[llength $numL] < 1000} { + set new 1$digits + lappend numL [expr {$new ^ $last + [llength $numL]}] + set last $new + set d [expr {[llength $numL] % 10}] + set digits $d$digits$d +} +% \end{tcl} +% +% Then some kind of test procedure. This takes $a$ and $N$ as +% arguments, computes $a \bmod N$ using both approaches, and returns +% the list of the two |time| messages. +% \begin{tcl} +proc time_both_modulo {a N} { + set P [math::numtheory::Barrett_parameters $N] + list [ + time {math::numtheory::Plain_mod $N $a} 50 + ] [ + time {math::numtheory::Barrett_mod {*}$P $a} 20 + ] +} +% \end{tcl} +% Finally run that on a suitable selection of the |numL| list. +% \begin{tcl} +set resL {} +while {[llength $resL] < 500} { + lappend resL [ + time_both_modulo\ + [lindex $numL [expr {round(1.93*[llength $resL]+1)}]]\ + [lindex $numL [llength $resL]] + ] +} +% \end{tcl} +% The results are \dots\ a bit weird. But to make it easier to see +% the big picture, here's a routine that reduces the comparison +% between two timing values to a single digit: |Plain_mod| is faster +% for 0--4, |Barrett_mod| is faster for 5--9. The $1.2$ means each +% increment on this scale corresponds to another $20$\% difference in +% speed. +% \begin{tcl} +set over "" +foreach pair $resL { + scan [lindex $pair 0] %f plain + scan [lindex $pair 1] %f b + set l [expr {round(log($plain/$b)/log(1.2)+4.5)}] + set m [expr {max(min($l,9),0)}] + append over $m +} +% +% \end{tcl} +% The good news is that from position $90$ and up, |Barrett_mod| is +% consistently faster. |[lindex $numL 90]| has $602$ bits, but let's +% pick $625$ as a nice round switchover point. +% +% The somewhat surprising result, from looking at |[join $resL \n]|, +% is that whereas the timing results for |Barrett_mod| are well +% concentrated along a nice curve, the timing results for |Plain_mod| +% are not: they have seemingly random jumps up and down, which do not +% seem to be due to noise in the |time|ing as such. Presumably they +% are rather due to the actual numbers used---dependent upon the +% extent to which the code follows one branch or another in the main +% loop of \textbf{mp\_div}---but this would require a more extensive +% experiment (testing several numerators and denominators of each +% size) to examine. +% +% The bad news can be had by changing the constant |1.93| above to +% for example |1.5| or |1.1|: in those cases |Barrett_mod| would not +% emerge as superior. The reason for this can be discovered through a +% more thorough complexity analysis of libtommath's division operation. +% While it is true that said implementation's time complexity is +% asymptotically quadratic, it would be more accurate to describe it +% as being $O(DQ)$, where \(D = \lceil \log_2 N \rceil$ is the size of +% the denominator and $Q$ is the size of the quotient. For numerator +% and denominator of roughly equal size, the size $Q$ of the quotient +% becomes effectively bounded (i.e., $O(1)$), and the overall |%| +% operation effectively linear(!) in the input size. As long as \(Q = +% o(D^{-1+\log_2 3}) = o(D^{0.585})\), the multiplications in +% |Barrett_mod| are asymptotically \emph{slower} than the naive +% remainder operation in |Plain_mod|! Asymptotically that would only +% allow the numerator's size to grow as $D + O(D^{0.585})$, which is +% a lot less than the $2D$ required for arbitrary \(a < N^2\), and +% it only differs in lower order terms from the size $D$ of $N$ +% itself, but $O(D^{0.585})$ doesn't grow \emph{that} much slower +% than $D$, so for practical input sizes a numerator $a$ of the same +% order as, say, $N^{1.5}$ might be just as efficiently reduced by +% |Plain_mod| as it might by |Barrett_mod|. +% +% It's probably most fair to include a remark hinting at this in the +% manpage, where we're still in the scope of the +% |math::numtheory::make_modulo| item. +% +% \begin{tcl} +%<*man> + Note that the [arg modprefix] computed is tuned for applications + where the number to be reduced modulo [arg N] is uniformly + distributed in the interval from 0 (inclusive) to [arg N]**2. In + applications where the number comes from a much smaller interval, + the choice of algorithm for the [arg modprefix] may be suboptimal, + but many number-theoretical and cryptographic algorithms end up + exercising the full range. + +% +% \end{tcl} +% +% \begin{proc}{Make_modulo_prefix} +% To finish the implementation, we need only the command that +% actually constructs the prefixes. This has the call syntax +% \begin{quote} +% |Make_modulo_prefix| \word{mul-prefix?} \word{$N$} +% \end{quote} +% where \word{mul-prefix} is |1| if a mulmod prefix is sought, and +% |0| if a mod prefix is sought. +% +% The implementation is to first call |Barrett_parameters|, since +% these includes the bitsize of $N$ (or rather: twice that bitsize) +% we need to choose between plain and Barrett reduction. That means +% we'll sometimes perform an unnecessary division, but only in a +% context where we expect to perform several more anyway. +% \begin{tcl} +%<*pkg> +proc ::math::numtheory::Make_modulo_prefix {mulQ N} { + set P [Barrett_parameters $N] +% \end{tcl} +% The switchover at $625$ bits becomes a bound of $1250$ for the +% $s$ parameter. +% \begin{tcl} + if {[lindex $P 2] >= 1250} then { + return [linsert $P 0 [ + namespace which [lindex {Barrett_mod Barrett_mulmod} $mulQ] + ]] + } else { + return [list [ + namespace which [lindex {Plain_mod Plain_mulmod} $mulQ] + ] $N] + } +} +% +% \end{tcl} +% \end{proc} +% +% In the following test, the fun thing about $10^{643}$ is that it is +% only slightly smaller than \(2^{2136} \approx 1.00016 \cdot 10^{643}\). +% Picking a power of $10$ as the modulus makes it trivial to predict +% what the remainder should be for a number constructed as a sequence of +% digits. In the first case, the quotient estimate is $1$ (correct) +% and the remainder is correct on the first try, but in the second +% case the quotient estimate is $11$ (correct integer quotient is +% $10$), so the first try remainder is $-1$. +% \begin{tcl} +%<*test> +test Barrett-1.1 "1e643" -body { + set mod [math::numtheory::make_modulo prefix 1[string repeat 0 643]] + list [ + {*}$mod 1[string repeat 0 642]9 + ] [ + {*}$mod 10[string repeat 9 643] + ] +} -result [list 9 [string repeat 9 643]] +% +% \end{tcl} +% +% +% +% \subsection{Power modulo} +% +% The power-modulo, or \texttt{powm}, operation is that defined by +% \begin{equation*} +% \mathrm{powm}(a,b,N) = a^b \bmod N +% \text{.} +% \end{equation*} +% The RSA public key cryptosystem established it as a workhorse in +% modern cryptography, but it is also a useful tool in several +% number-theoretical algorithms. Common to both domains is that all +% three arguments may be several thousand bits large, so reducing the +% work required to compute it is well worth the effort. +% +% First, it should perhaps be remarked that computing +% $\mathrm{powm}(a,b,N)$ does not require computing the integer $a^b$ +% (which is a good thing, since it for quite moderate $a$ and $b$ could +% easily overflow available memory). By doing all multiplications +% modulo $N$, the space complexity of the calculation can remain +% linear. +% +% Second, it would be unfeasible to compute power modulo using the +% simple recursion +% \begin{equation*} +% \mathrm{powm}(a,b,N) = \begin{cases} +% a \bmod N & \text{if \(b = 1\),}\\ +% \bigl( \mathrm{powm}(a,b-1,N) \cdot a \bigr) \bmod N +% & \text{if \(b > 1\)} +% \end{cases} +% \end{equation*} +% since we usually couldn't afford to have a loop iterated $b$ times +% (it makes the time complexity exponential) no matter how small the +% loop body was. Instead one preferably uses some variation on the +% \emph{exponentiation by squaring} algorithm, a recursion +% corresponding to which is +% \begin{equation*} +% \mathrm{powm}(a,b,N) = \begin{cases} +% a \bmod N & \text{if \(b = 1\),}\\ +% \bigl( \mathrm{powm}(a,b-1,N) \cdot a \bigr) \bmod N +% & \text{if \(b > 1\) is odd,} \\ +% \mathrm{powm}(a,b/2,N)^2 \bmod N +% & \text{if \(b > 1\) is even.} +% \end{cases} +% \end{equation*} +% For $b$ having bit-size $n$, in the sense that \(2^n \leqslant b < +% 2^{n+1}\), that recursion will take the third branch $n$ times and +% the second branch at most $n$ times, so at most $2n$ mulmod +% operations suffice for computing $\mathrm{powm}(a,b,N)$. This +% places $\mathrm{powm}(a,b,N)$ solidly in the realm of things that +% can be computed in polynomial time. Conversely, it is not too hard +% to see that if $\mathrm{powm}$ is implemented using mulmod as +% underlying operation then it will require at least $n$ +% multiplications to get up to the $b$th power of $a$, so +% exponentiation by squaring is within a factor $2$ of the optimal +% algorithm. But with some planning it is in fact possible to get +% said factor quite close to $1$, so that extra preparation is well +% worth it. +% +% An improvement upon the basic exponentiation by squaring algorithm +% is what might be called the \emph{fixed window} algorithm; in the +% particular case that the window width is $3$~bits, this algorithm +% can be straightforwardly understood as a trick based on writing the +% exponent in octal (base~$8$). The idea is that instead of for each +% bit in the exponent doing one modular squaring possibly followed +% by one modular multiplication, one does +% \emph{three} modular squarings in sequence (equivalent to raising +% to the eight power) possibly followed by one modular +% multiplication; instead of possibly doing the general modular +% multiplication for every bit of the exponent, one only does it for +% every third bit, thereby reducing the expected number of times one +% needs to do it. For example, if \(b = 125 = 1 \cdot 8^2 + 7 \cdot 8^1 +% + 5 \cdot 8^0 = 2^6 + 2^5 + 2^4 + 2^3 + 2^2 + 2^0\), then instead of +% computing +% \[ +% a^b \equiv +% (((((a^2 \cdot a)^2 \cdot a)^2 \cdot a)^2 \cdot a)^2)^2 \cdot a +% \pmod{N} +% \] +% one computes +% \[ +% a^b \equiv (a^8 \cdot a^7)^8 \cdot a^5 \pmod{N} +% \] +% with only $2$ general multiplications rather than $5$ general +% multiplications. The catch is of course that whereas $a$ as factor +% was simply given, the factors $a^7$ and $a^5$ above are not, so +% they would need to be computed as well, at a cost of some +% additional multiplications. But in an exponent large enough that +% each octal digit tends to occur several times, that additional cost +% is lower than what is gained by doing fewer multiplications in the +% main calculation of $a^b \bmod N$. Since determining the point of +% break even for the $3$ bit fixed window algorithm is instructive, +% we might as well take the time to do so. +% +% It is reasonable to assume that the binary digits of $b$, after the +% initial $1$ of place value $2^n$, are random, independent, and +% uniformly distributed. This means the probability is $\frac{1}{2}$ +% that basic exponentiation by squaring will need to do a modular +% multiplication at any given bit, so the expected number of modular +% multiplications for an $n$-bit exponent is $\frac{1}{2}n$. With +% fixed windows $3$ bits wide, the probability is $\frac{1}{8}$ that +% all bits are $0$ so that we can skip the multiplication, therefore +% the probability is $\frac{7}{8}$ that it is needed, and we expect +% $\frac{7}{8}$ multiplications per $3$ bits, or $\frac{7}{24}$ +% multiplications per bit. The expected number of modular +% multiplications for an $n$-bit exponent is thus $\frac{7}{24}n$ +% (close to half of \(\frac{1}{2}n = \frac{12}{24}n\)!), but it +% requires an extra $6$ multiplications to +% precompute the powers $a^2$ through $a^7$. Thus the point of break +% even is where \(\frac{1}{2}n = \frac{7}{24}n + 6\), i.e., \(n = +% \frac{24}{5} \cdot 6 = 28.8\); in average, a mere ten octal digits +% suffice. But it is possible to do even better. For a general +% $k$-bit window, one expects to need $(1 -\nobreak 2^{-k}) \big/ k$ +% multiplications per bit and needs to precompute $2^k - 2$ powers of +% $a$, so the break even for the $k+1$ bit window over the $k$ bit +% window happens where +% \begin{multline*} +% (1 - 2^{-k}) \frac{n}{k} + 2^k - 2 = +% (1 - 2^{-k-1}) \frac{n}{k+1} + 2^{k+1} - 2 +% \quad\Longleftrightarrow \\ +% \left( \frac{1 - 2^{-k}}{k} - \frac{1 - 2^{-k-1}}{k+1} \right) n +% = 2^k +% \quad\Longleftrightarrow\\ +% \frac{ k+1 - (k+1)2^{-k} - k + k2^{-k-1} }{ k(k+1) } n = 2^k +% \quad\Longleftrightarrow\\ +% n = +% \frac{ k(k+1) 2^k}{ 1 - (k+2)2^{-k-1} } +% \approx k(k+1) 2^k +% \text{,} +% \end{multline*} +% whence $4$-bit windows start to dominate over $3$-bit when \(n \approx +% 140\), and $5$-bit windows start to dominate over $4$-bit when +% \(n \approx 394\). None the less it is possible to do even better by +% using \emph{sliding} windows. +% +% Sliding windows, like fixed windows, use a single modular +% multiplication to deal with some group of $k$ bits in the exponent, +% but it is not determined beforehand which bit positions will form a +% group. Instead, a group always begins with a bit that is $1$, +% because if the next bit to process is $0$ then you can always +% square the partial result and move on to the next bit. Knowing that +% you're at a $1$ bit gives you a better chance of covering +% additional $1$ bits in the same modular multiplication than you +% would have if you started at an arbitrary bit. This also means that +% you asymptotically only need to precompute half as many powers of +% $a$ as you would with fixed windows, since for sliding windows you +% only multiply by $a^i$ for \(2^{k-1} \leqslant i < 2^k\) because it +% is given that the leading bit in any group is a~$1$. If done +% without preprocessing, sliding window exponentiation by squaring +% proceeds as follows: +% \begin{itemize} +% \item +% If the current bit in the exponent is a~$0$, then square the +% partial result and move one bit to the right. +% \item +% If the current bit in the exponent is a~$1$, then square the +% partial result $k$ times. Move $k$ bits to the right, and +% collect the bits you pass to form the number $i$ (which +% satisfies \(2^{k-1} \leqslant i < 2^k\)). Multiply the partial +% result by $a^i$, which you get from a precomputed table. +% \end{itemize} +% In the end, it may happen that a $k$-bit window would extend past +% the end of the exponent, so at that point it may become necessary +% to fall back to basic exponentiation by squaring, but only for a +% sequence of less than $k$ bits. +% +% \begin{proc}{Sliding_window_powm} +% The actual bit operations needed to observe the exponent $b$ +% through a $k$-bit sliding window are kind of awkward in \Tcl, but +% on the other hand many applications will use the same exponent +% $b$ for several different bases $a$. Hence it makes sense to have +% one procedure that performs a sliding window decomposition of an +% exponent $b$, and another procedure that applies said +% decomposition to compute $\mathrm{powm}(a,b,N)$ for some +% arbitrary~$a$. This procedure is the latter; it is not expected +% that users will manually construct calls to it. +% +% The call syntax is +% \begin{quote} +% |Sliding_window_powm| \word{$k$} \word{$i_0$} \word{$i_m$-list} +% \word{tail bit list} \word{mulmod-prefix} \word{$a$} +% \end{quote} +% where $k$ is the window size (a positive integer), the three +% arguments \word{$i_0$}, \word{$i_m$-list}, and \word{tail bit +% list} encode the exponent $b$, \word{mulmod-prefix} is a command +% prefix that performs multiplication modulo $N$, and finally $a$ +% is the base of the exponentiation. It is presumed that $a$ is +% appropriate for the \word{mulmod-prefix} (e.g.~that \(0 \leqslant +% a < N\), if the \word{mulmod-prefix} is |Barrett_mulmod| and you +% absolutely require the result to be reduced modulo $N$). +% +% The various $i_m$ numbers are integers in the range from $-1$ to +% $2^{k-1}-1$, inclusive. The value $-1$ signals a step where the +% current bit in the exponent was $0$. A nonnegative value signals +% a step where it is $1$, and $i_m$ is then the numerical value of +% the $k-1$ bits following that $1$ bit. $i_0$ is the $i_m$ number +% for the window beginning at the most significant bit of the +% exponent; unlike the others, it cannot be $-1$. The \word{tail +% bit list} is the list of the final up to $k-1$ bits of the +% exponent, that would not fit in an additional $k$-bit window; its +% elements are either $0$ or $1$. Both the \word{$i_m$-list} and +% the \word{tail bit list} are in order of falling bit significance. +% +% \begin{tcl} +%<*pkg> +proc ::math::numtheory::Sliding_window_powm {k i0 iL tL mulmod a} { +% \end{tcl} +% The first order of business is to precompute a table of the +% numbers \(a^{i+2^{k-1}} \bmod N\) for \(0 \leqslant i < +% 2^{k-1}\). This first involves computing $a^{2^r}$ for \(1 +% \leqslant r \leqslant k-1\). +% \begin{tcl} + set apow $a + for {set r 1} {$r < $k} {incr r} { + set apow [{*}$mulmod $apow $apow] + } + set aL [list $apow] + set r [expr {1 << ($k-1)}] + while {[llength $aL] < $r} { + set apow [{*}$mulmod $apow $a] + lappend aL $apow + } +% \end{tcl} +% That taken care of, we can move on to the main sliding window +% calculations. +% \begin{tcl} + set res [lindex $aL $i0] + foreach i $iL { + if {$i < 0} then { + set res [{*}$mulmod $res $res] + } else { + for {set r 0} {$r < $k} {incr r} { + set res [{*}$mulmod $res $res] + } + set res [{*}$mulmod $res [lindex $aL $i]] + } + } +% \end{tcl} +% And finally finish off with the tail bits. +% \begin{tcl} + foreach i $tL { + set res [{*}$mulmod $res $res] + if {$i} then { + set res [{*}$mulmod $res $a] + } + } + return $res +} +% \end{tcl} +% +% This procedure has certain restrictions, insomuch as that the +% smallest exponent $b$ that can used with window size $k$ is +% $2^{k-1}$; it corresponds to having \(i_0 = 0\), the $i_m$-list +% empty, and the tail bit list also empty. However, the bitsizes at +% which a window width $k$ becomes optimal grows much faster +% than~$k$, so that is a minor concern. Also, by taking \(k=1\), +% \(i_0 = 0\), and the $i_m$-list to be empty, it is possible to +% emulate any basic exponentiation by squaring without wasting +% modular multiplications. +% \end{proc} +% +% \begin{proc}{Sliding_window_cover} +% The companion operation of computing an \word{$i_m$-list} and +% \word{tail bit list} for a particular exponent is provided by the +% call +% \begin{quote} +% |Sliding_window_cover| \word{$k$} \word{bitlist} +% \end{quote} +% where $k$ is the window size and \word{bitlist} is the list of +% bits in the exponent, ordered from most to least significant; a +% command for constructing that list for an exponent $b$ would be +%\begin{verbatim} +% split [string trimleft [ +% string map {0 000 1 001 2 010 3 011 4 100 5 101 6 110 7 111}\ +% [format %llo $b] +% ] 0] "" +%\end{verbatim} +% The return value from |Sliding_window_cover| is a list with the +% structure +% \begin{quote} +% \word{cost} |{| \word{$i_0$} \meta{$i_m$-list} |}| +% \word{tail bit list} +% \end{quote} +% where the \word{cost} is the number of modular multiplications +% (not including the $n$ squarings, but including the calculations +% for precomputing low powers of the base) that these data would +% have |Sliding_window_powm| do. Note that $i_0$ is in this result +% not separated from the list of $i_m$. +% +% The main implementation is like a little automaton reading the +% \word{bitlist} element by element, and intermittently generating +% output as it goes. The main internal control state is the +% |collect| variable, which says how many bits of the current +% window that still remain to be processed; when this is $0$, the +% next bit may start a new window. After seeing the last bit of a +% window, the corresponding $i_m$ value gets appended to |imL|, and +% the |i| variable is where that $i_m$ value gets constructed (as a +% number). The bits of the current window are also being collected +% in the |tail| variable; if the \word{bitlist} ends before the +% window does then the |tail| will be exactly the needed \word{tail +% bit list}. (It should be observed that |tail| includes the first +% bit of a window, whereas |i| does not.) +% \begin{tcl} +proc ::math::numtheory::Sliding_window_cover {k bitlist} { + if {$k>=2} then { + set km1 [expr {$k-1}] + set cost [expr {(1 << $km1) - 2}] +% \end{tcl} +% That initial value for |cost| is essentially the cost for the +% part of |Sliding_window_powm| that comes before the |foreach| +% over the \word{$i_m$-list}, but the fine details need explaining. +% First, the squarings in the initial |for| loop are not included, +% because each of these replace one of the $n$ basic squarings of +% the result |res|. Second, the |aL| list has $2^{k-1}$ elements, +% but the first of these is not computed in the |while| loop, which +% accounts for $-1$ above. The other $-1$ is because the cost +% estimates below treat $i_0$ the same as the other $i_m$ numbers, +% whereas in |Sliding_window_powm| there is no multiplication for +% $i_0$, so a total adjustment of $-2$ is called for. +% \begin{tcl} + set imL {} + set collect 0 + foreach bit $bitlist { + if {$collect>0} then { + set i [expr {($i<<1) | $bit}] + if {[incr collect -1]} then { + lappend tail $bit + } else { + lappend imL $i + incr cost + } + } elseif {$bit} then { + set tail [list $bit] + set i 0 + set collect $km1 + } else { + lappend imL -1 + } + } + } else { +% \end{tcl} +% It turns out the above will not work right for \(k=1\), since +% in that case |collect| is always zero. So to be on the safe side, +% just throw everything in the |tail| in that case. +% \begin{tcl} + set imL [list 0] + set tail [lrange $bitlist 1 end] + set cost 0 + set collect 1 + } + if {$collect} then { + foreach bit $tail {incr cost $bit} + } else { + set tail {} + } + return [list $cost $imL $tail] +} +% \end{tcl} +% Deliberately left out from this procedure is the matter of how to +% choose the window width $k$. +% \end{proc} +% +% Deriving the optimal window width for a given bitsize $n$ is not as +% straightforward for sliding windows as it is for fixed windows, +% because in this case the sequence of steps taken through the +% bitstream of the exponent is itself the result of a stochastic +% process. In particular, it is by no means given that the $j$th bit +% (counting the most significant as the $0$th) will be the first bit +% in any sliding window; chances are rather that it will fall +% somewhere in the interior of a window, and thus have no effect on +% the sequence of steps that are taken. This complicates the +% definition (and consequently the derivation) of expected costs +% quite a bit. +% +% One way to quantify things is to let $p_n$ be the probability that +% bit $n$ is the stream will be inspected as possibly the first one +% in a window, and let $C_n$ (the cost) be the expected number of +% general modular multiplications performed up to that point, i.e., +% the expected number of length $k$ steps taken to get there. Since +% the most significant bit by definition is a~$1$, the process always +% starts with a step of length $k$, and thus \(p_1 = p_2 = \dotsb = +% p_{k-1} = 0\) but \(p_k = 1\). After that \(p_{k+1} = \frac{1}{2}\), +% \(p_{k+2} = \frac{1}{4}\), and so on to \(p_{2k-1} = 2^{-(k-1)}\), +% but \(p_{2k} = \frac{1}{2} + 2^{-k}\) since one gets there either +% by seeing bit $k$ be $1$ or bits $k$ through $2k-1$ all be $0$. The +% general recursion is that +% \begin{equation} \label{Eq:RecursionProbability} +% p_n = \tfrac{1}{2} p_{n-1} + \tfrac{1}{2} p_{n-k} +% \qquad\text{for \(n > k\)} +% \end{equation} +% because one can arrive at bit $n$ either by having seen $0$ at bit +% $n-1$ or by having seen $1$ at bit $n-k$, and in either of those +% positions the probability is $\tfrac{1}{2}$ of seeing the bit value +% that leads directly to bit $n$. The two events are furthermore +% disjoint (while it is possible to first visit bit $n-k$ and later +% bit $n-1$, this can only happen if one saw $0$ at bit $n-k$), so it +% is correct to simply add the probabilities. The corresponding +% recursion for the cost is +% \begin{equation} \label{Eq:RecursionCost} +% C_n = \frac{ +% \tfrac{1}{2}p_{n-1}C_{n-1} + \tfrac{1}{2}p_{n-k}(C_{n-k}+1) +% }{p_n} +% \end{equation} +% since $C_n$ is a conditioned expectation value; this recursion is +% effectively a weighted average of the expected costs at the +% previous step, with a $+1$ in one branch for the extra cost +% incurred there. +% +% Beginning with the probabilities, it may be observed that +% \eqref{Eq:RecursionProbability} is a linear recursion with constant +% coefficients, so finding a closed form for it is actually feasible. +% Putting the recursion on vector form, one gets +% \begin{equation} +% \begin{pmatrix} +% p_n \\ p_{n-1} \\ p_{n-2} \\ \vdots \\ p_{n-k+1} +% \end{pmatrix} = +% \begin{pmatrix} +% \tfrac{1}{2} & 0 & \dots & 0 & \tfrac{1}{2} \\ +% 1 & 0 & \dots & 0 & 0 \\ +% 0 & 1 & \dots & 0 & 0 \\ +% \vdots & \vdots & \ddots & \vdots & \vdots \\ +% 0 & 0 & \dots & 1 & 0 +% \end{pmatrix} +% \begin{pmatrix} +% p_{n-1} \\ p_{n-2} \\ \vdots \\ p_{n-k+1} \\ p_{n-k} +% \end{pmatrix} +% \text{.} +% \end{equation} +% Letting $T$ be that $k \times k$ matrix, one would thus have \(p_n +% = \mathbf{e}_1^\top T^{n-k} \mathbf{e}_1\), where $\mathbf{e}_1$ is +% the first vector in the standard basis of $\mathbb{R}^k$. While +% this technically is a closed form formula for $p_n$, its usefulness +% lies more in being convenient to analyse than in being a formula +% used to directly compute $p_n$ (even though exponentiation by +% squaring is a valid algorithm also for matrix multiplication, and +% thus could be utilised to compute any particular $p_n$ in polynomial +% time). In particular, high powers of a matrix are dominated by the +% largest eigenvalue of that matrix. For $T$, the characteristic +% polynomial \(\det (\lambda I -\nobreak T) = \lambda^k - +% \tfrac{1}{2} \lambda^{k-1} - \tfrac{1}{2}\), which has the obvious +% zero \(\lambda = 1\); indeed, it is immediate that \(T \mathbf{1} = +% \mathbf{1}\) where \(\mathbf{1} = \sum_{j=1}^k \mathbf{e}_j\) is +% the all-ones vector, and thus \(\lambda = 1\) is an obvious +% eigenvalue (of multiplicity $1$ with eigenvector $\mathbf{1}$) of $T$. +% There are furthermore no eigenvalues (solutions $\lambda$ to +% \(2\lambda^k = \lambda^{k-1} + 1\)) with \(\lvert\lambda\rvert > 1\) +% since that would lead to the contradiction \(2\lvert\lambda\rvert^k = +% \lvert 2\lambda^k\rvert = \lvert \lambda^{k-1} +\nobreak 1 \rvert +% \leqslant \lvert\lambda^{k-1}\rvert + \lvert 1\rvert = +% \lvert\lambda\rvert^{k-1} + 1 < \lvert\lambda\rvert^k + 1^k < +% 2 \lvert\lambda\rvert^k\), and the only solution if +% \(\lvert\lambda\rvert = 1\) is to make \(\lambda^k = \lambda^{k-1} +% = 1\), i.e., \(\lambda = 1\). This means \(\lambda = 1\) is the +% unique largest eigenvalue of $T$, having eigenvector $\mathbf{1}$, +% and thus \(T^n \mathbf{e}_1 \rightarrow p \mathbf{1}\) as +% \(n \rightarrow \infty\), for some constant $p$; asymptotically, +% \(p_n \rightarrow p\), meaning all positions are essentially of +% equal probability for sufficiently large $n$. +% +% Knowing that, it is actually not necessary to determine the +% limiting probability $p$, since constant probabilities cancel in +% \eqref{Eq:RecursionCost}: +% \begin{equation} \label{Eq2:RecursionCost} +% C_n \approx \frac{ +% \tfrac{1}{2}pC_{n-1} + \tfrac{1}{2}p(C_{n-k}+1) +% }{p} = +% \tfrac{1}{2}C_{n-1} + \tfrac{1}{2}C_{n-k} + \tfrac{1}{2} +% \text{.} +% \end{equation} +% On the other hand, determining $p$ is rather easy. The transpose +% $T^\top$ has the same eigenvalues as $T$, but in general different +% eigenvectors, and in particular \(\mathbf{u} = \mathbf{1} + +% \mathbf{e}_1\) is an eigenvector since \(T^\top \mathbf{u} = +% \mathbf{u}\), or putting it differently \(\mathbf{u}^\top T = +% \mathbf{u}^\top\). Hence +% \begin{multline*} +% 2 = +% \mathbf{u}^\top \mathbf{e}_1 = +% \mathbf{u}^\top T \mathbf{e}_1 = +% \mathbf{u}^\top T^{n-k} \mathbf{e}_1 = +% \mathbf{u}^\top \sum_{r=1}^k p_{n-(r-1)} \mathbf{e}_r \rightarrow +% \mathbf{u}^\top p \mathbf{1} = +% (k+1) p +% \end{multline*} +% and thus \(p = 2\big/ (k +\nobreak 1)\). This should really not +% come as a surprise, as it is easily calculated that the expected +% length of a step is \(\frac{1}{2} + \frac{1}{2}k = (k +\nobreak 1) +% / 2\) bits. +% +% The analysis would however not be complete without also considering +% how long it takes for the probability $p_n$ to exhibit this +% limiting behaviour, since $k$ is in practice going to grow with +% $n$, thus lengthening the natural scale of the probability +% variations; without knowing the speeds at which things grow, it is +% impossible to tell whether the limiting behaviour is at all +% relevant! The bad news are that the bitsize required for variation +% decay grows considerably faster than the basic window size $k$, but +% the good news is that the bitsize for break even grows much faster +% still, so the limiting behaviour should dominate what happens at +% break even. +% +% In more detail, the characteristic polynomial \(\lambda^k - +% \tfrac{1}{2}\lambda^{k-1} - \tfrac{1}{2}\) has no factor common +% with its derivative \(k\lambda^{k-1} - \tfrac{k-1}{2}\lambda^{k-2} +% = (k\lambda -\nobreak \tfrac{k-1}{2}) \lambda^{k-2}\), hence all +% its zeroes have multiplicity $1$, and thus the matrix $T$ is +% diagonalisable. It follows that \(p_n = \sum_{j=1}^k \alpha_j +% \lambda_j^n\) where \(\lambda_1,\dotsc,\lambda_k \in \mathbb{C}\) +% are the eigenvalues of $T$ and \(\alpha_1,\dotsc,\alpha_k \in +% \mathbb{C}\) are some constants. Ordering the eigenvalues so that +% \(\lvert\lambda_1\rvert \geqslant \lvert\lambda_2\rvert \geqslant +% \dotsb \geqslant \lvert\lambda_k\rvert\), we know from the above +% that \(\lambda_1=1\) and \(\alpha_1 = p = 2/(k +\nobreak 1)\), so +% the rate of convergence \(p_n \to p\) depends essentially on the +% second largest eigenvalue $\lambda_2$. (Since the complex +% eigenvalues all occur in conjugate pairs, it is actually the case +% that \(\lvert\lambda_2\rvert = \lvert\lambda_3\rvert\), but we may +% without loss of generality let $\lambda_2$ be that eigenvalue with +% positive imaginary part. This turns out to uniquely identify +% $\lambda_2$.) It is immediate from the triangle inequality that +% \(\lvert p_n -\nobreak p \rvert = \mathrm{O}\bigl( +% \lvert\lambda_2\rvert^n \bigr)\) as \(n \to \infty\), and +% straightforward to show that it is also $\Omega\bigl( +% \lvert\lambda_2\rvert^n \bigr)$, so essentially the lower end of +% the range of $n$ values for which \(\lvert p_n -\nobreak p \rvert < +% \varepsilon\) grows as $\ln\varepsilon \big/ +% \ln \lvert\lambda_2\rvert$; in particular the exact choice of +% tolerance \(\varepsilon > 0\) is of minor importance, since it only +% contributes a constant factor to the asymptotics of the +% convergence. What matters most is how close $\lvert\lambda_2\rvert$ +% gets to $1$. +% +% Through the ansatz \(\lambda = r (\cos \theta +\nobreak +% i\sin\theta)\) in the characteristic equation \((2\lambda -\nobreak +% 1) \lambda^{k-1} = 1\) one may derive \(\lvert 2r\cos\theta +% -\nobreak 1 +\nobreak 2ir\sin\theta \rvert r^{k-1} = 1\), or +% equivalently +% \begin{equation} \label{Eq:EigenvalueCurve} +% 1 = +% r^{k-1} \sqrt{ (2r\cos\theta - 1)^2 + 4r^2 \sin^2 \theta } = +% r^{k-1} \sqrt{ 4r^2 + 1 - 4 r \cos\theta } +% \text{.} +% \end{equation} +% The monotonicity of both $r^{k-1}$ and $4r^2 + 1 - 4r\cos\theta$ +% for \(r \geqslant \max\{\tfrac{1}{2}\cos\theta,0\}\) show that this +% equation has a unique solution $r$ for every $\theta$, meaning $r$ +% can be viewed as a function of $\theta$, and thus all eigenvalues +% lie on a simple polar curve in the complex plane. As +% \(k\to\infty\) that curve approaches the unit circle, with the +% greatest distance (at \(\theta=\pi\)) shrinking roughly as +% $\mathrm{O}(k^{-1})$ due to the factor $r^{k-1}$, but of greater +% importance for the asymptotic behaviour of $\lvert\lambda_2\rvert$ +% is actually the change in +% argument $\theta$, as the curve for every $k$ touches the unit +% circle at \(\theta = 0\). Considering instead the argument side of +% the characteristic equation, one arrives at +% \[ +% \arg (2\lambda - 1) + (k-1)\arg\lambda \equiv 0 \pmod{2\pi} +% \text{,} +% \] +% and since $\lambda_2$ has the smallest positive argument, it +% follows that \(\arg (2\lambda_2 -\nobreak 1) + (k -\nobreak 1) +% \arg\lambda_2 = 2\pi\). The first term of that equation is awkward +% to express exactly, but easy enough to put bounds on: there is a +% triangle in the complex plane with vertices at $0$, $2\lambda_2$, +% and $2\lambda_2-1$. Since \(\lvert 2\lambda_2 -\nobreak 1 \rvert = +% \lvert\lambda_2\rvert^{-(k-1)} > 1\), it follows that the side from +% $0$ to $2\lambda_2-1$ is longer than the side from $2\lambda_2-1$ +% to $2\lambda_2$, and consequently the angle at $2\lambda_2$ (being +% $\arg\lambda_2$ by parallel lines) is larger than the angle at $0$ +% (which is $\arg(2\lambda_2 -\nobreak 1) - \arg\lambda_2$). Hence +% \(2\arg\lambda_2 > \arg(2\lambda_2 -\nobreak 1) > \arg\lambda_2\) +% and \((k +\nobreak 1)\arg\lambda_2 > 2\pi > k \arg\lambda_2\). It +% follows that \(\arg\lambda_2 = \Theta(k^{-1})\) as \(k \to +% \infty\). Hence \(1 - \cos\arg\lambda_2 = \Theta(k^{-2})\), and +% writing \(1-x = \lvert\lambda_2\rvert\), \eqref{Eq:EigenvalueCurve} +% becomes +% \begin{align*} +% 1 ={}& +% (1 - x)^{k-1} +% \sqrt{ 4(1-x)^2 + 1 - 4(1-x)\bigl(1 - \Theta(k^{-2}) \bigr) } +% = \\ ={}& +% (1 - x)^{k-1} +% \sqrt{ 1 - 4x + 4x^2 + \Theta(k^{-2}) } +% = \\ ={}& +% (1 - x)^{k-1} +% (1-2x)\Bigl( 1 + \tfrac{1}{(1-2x)^2}\Theta(k^{-2}) \Bigr)^{1/2} +% = \\ ={}& +% \bigl( 1 - (k-1)x + o(x^2) \bigr) (1-2x) +% \bigl( 1 + \Theta(k^{-2}) \bigr) +% = +% 1 - (k+1)x + \Theta(k^{-2}) +% \text{,} +% \end{align*} +% whence \(x = \Theta(k^{-3})\), implying \(\ln \lvert\lambda_2\rvert +% = \Theta(k^{-3})\), and thus the $n$ required to +% make \(\lvert p_n -\nobreak p \rvert < \varepsilon\) grows as +% $\Theta(k^3)$. This is however much smaller than the point of break +% even, which rather grows like $\Theta(k^2 2^k)$, so equal bit +% probabilities is a valid approximation for estimating that. +% +% The cost recursion~\eqref{Eq2:RecursionCost} is not homogeneous, +% but can be made such through the ansatz \(C_n = \beta n + D_n\), +% where \(D_n = \tfrac{1}{2} D_{n-1} + \tfrac{1}{2} D_{n-k}\); it +% then follows that \(\beta n = \tfrac{1}{2}\beta (n -\nobreak 1) + +% \tfrac{1}{2}\beta (n -\nobreak k) + \tfrac{1}{2}\) for all $n$, or +% from the constant terms alone that \(0 = -\tfrac{1}{2}\beta +% -\tfrac{1}{2}k\beta + \tfrac{1}{2}\), making \(\beta = 1 \big/ (k +% +\nobreak 1)\). Moreover, since the recursion for $D_n$ is the same +% as that for $p_n$, it follows from the above analysis of the +% eigenstructure of the transfer matrix $T$ that the sequence +% $\{D_n\}_{n=0}^\infty$ is bounded, meaning that asymptotically the +% marginal cost for a width $k$ sliding window is $\beta$ modular +% multiplications per bit. The cost estimate (including startup +% precomputations) for bitsize $n$ with sliding window size $k$ is +% thus +% \begin{equation} +% \frac{n}{k+1} + 2^{k-1} - 1 +% \text{,} +% \end{equation} +% which compared to the fixed window algorithm has the marginal cost +% of a window one size larger, and a startup cost almost that +% for a window one size smaller! Window size $k$ gains an advantage +% to window size $k-1$ where +% \[ +% \frac{n}{k+1} + 2^{k-1} - 1 = +% \frac{n}{k} + 2^{k-2} - 1 +% \quad\Longleftrightarrow\quad +% 2^{k-2} = \frac{n}{k(k+1)} +% \text{,} +% \] +% making \(n = \Theta(k^2 2^k) > \Theta(k^3)\), as claimed. +% +% \begin{proc}{make_powm} +% This command constructs a command prefix with the syntax +% \begin{quote} +% \meta{prefix} \word{a} +% \end{quote} +% which computes $\mathrm{powm}(a,b,N)$. The call syntax for +% |make_powm| itself is +% \begin{quote} +% |make_powm| \word{exponent} \word{mod-style} \word{mod-data} +% \end{quote} +% where \word{exponent} is the exponent $b$. The \word{mod-style} +% is either |mulprefix| or |modulo|. In the |modulo| case, the +% \word{mod-data} is the modulo number $N$. In the |mulprefix| +% case, it is a command prefix computing multiplication of two +% factors (as one might construct using |make_modulo mulprefix|). +% In the |mulprefix| case, it is technically not required that the +% multiplication is modulo $N$, or even that the base $a$ is a +% number; as long as the given prefix performs some kind of +% associative multiplication, the returned prefix raises to the +% power $b$ in the corresponding sense. +% +% \begin{tcl} +proc ::math::numtheory::make_powm {b style data} { + if {$b < 1} then { + return -code error "Exponent must be positive" + } + switch $style { + modulo { + set mulmod [make_modulo mulprefix $data] + } + mulprefix { + set mulmod $data + } + default { + return -code error\ + {Modulo style must be "modulo" or "mulprefix"} + } + } +% \end{tcl} +% The first order of business is to compute the bitlist of the +% exponent. This also gives the bitsize $n$, as the list length +% minus one. +% \begin{tcl} + set bitlist [split [string trimleft [ + string map {0 000 1 001 2 010 3 011 4 100 5 101 6 110 7 111}\ + [format %llo $b] + ] 0] ""] +% \end{tcl} +% The next step is to compute the appropriate window width $k$. By +% the above that should be the largest integer solution to +% \(k (k +\nobreak 1) 2^{k-2} \leqslant n\), and that happens to +% be \(k=1\) for \(n < 6\). Hence special-casing that might be +% appropriate for such ``small'' exponents. +% \begin{tcl} + if {[llength $bitlist] < 7} then { + return [list [namespace which Sliding_window_powm]\ + 1 0 {} [lrange $bitlist 1 end] $mulmod] + } +% \end{tcl} +% Above that, one may observe that the left hand side of the +% inequality is increasing in $k$, so one might alternatively +% choose to solve \(f(x) = \ln n\) for +% \[ +% f(x) = +% \ln x + \ln (x +\nobreak 1) + \ln 2^{x-2} = +% \ln x + \ln (x+1) + (x-2)\ln 2 +% \text{.} +% \] +% An exact solution here is hardly called for, since only the +% integer part of $x$ is relevant. Since in addition $f$ is nearly +% linear, a solution that seems good enough is to start with \(x_0 = +% 2 + \ln n / \ln 2\) as initial guess and then perform one +% Newton--Raphson step to adjust it. +% \begin{tcl} + set x [expr {log([llength $bitlist]-1)/log(2) + 2}] + set x [expr {$x - + (log($x) + log($x+1)) / (1/$x + 1/($x+1) + log(2)) + }] +% \end{tcl} +% where the numerator \(f(x) - \ln n\) could be simplified since +% the remaining terms are equal for the chosen initial guess $x_0$. +% This results in the following guesses: +% \begin{center} +% \begin{tabular}{ r@{--}r c} +% \multicolumn{2}{c}{$n$ range} & $\lfloor x_1 \rfloor$ \\ +% 11&40 & 2 \\ +% 41&124 & 3 \\ +% 125&353 & 4 \\ +% 354&945 & 5 +% \end{tabular} +% \end{center} +% +% In practice, what determines the optimal window width for a +% particular exponent may however be how well the windows manage to +% cover the bitlist rather than the expected cost per bit. +% Therefore this procedure computes coverings for two adjacent +% window sizes and chooses that which has the lowest cost. This is +% not guaranteed to be optimal---\(k=1\) is always optimal for an +% exponent that is a power of $2$---but it should at least reduce +% unfortunate resonances between bitpattern and window width. +% \begin{tcl} + set k [expr {int($x)}] + set down [Sliding_window_cover $k $bitlist] + incr k + set up [Sliding_window_cover $k $bitlist] + if {[lindex $down 0] < [lindex $up 0]} then { + incr k -1 + } else { + set down $up + } + return [list [namespace which Sliding_window_powm]\ + $k [lindex $down 1 0] [lrange [lindex $down 1] 1 end]\ + [lindex $down 2] $mulmod] +} +% +% \end{tcl} +% \end{proc} +% +% \begin{proc}{powm} +% In practice, many users may prefer to use a command that ``just +% computes the \texttt{powm},'' without caring about whether some +% arguments are mostly constant. +% \begin{tcl} +%<*man> +[call [cmd math::numtheory::powm] [arg a] [arg b] [arg N]] + The [cmd powm] command performs a modular exponentiation, i.e., it + modulo [arg N] raises [arg a] to the power [arg b]. The arguments + must all be integers and satisfy [arg N],[arg b]>=1. It is + furthermore preferred that 0<=[arg a]<[arg N]; if it is not, then + the result may sometimes fail to be so bounded, even though it will + still be congruent to [arg a]**[arg b] modulo [arg N]. + [para] + + This command is optimised for the case of large [arg b] and + [arg N]; appropriate algorithms will be chosen also for small + argument values, but the overhead for [emph {making the choice}] + could then be nonnegligible. If the same values for [arg b] and + [arg N] are to be used several times, then the overhead can be + reduced considerably by instead using the [cmd make_powm] command + to construct a command prefix which has those values hardwired. + ([cmd powm] is in fact a convenience wrapper around + [cmd make_powm].) + +% +% \end{tcl} +% `Convenience wrapper' it is: first compute the command prefix, +% then apply it to the base $a$. +% \begin{tcl} +%<*pkg> +proc ::math::numtheory::powm {a b N} {{*}[make_powm $b modulo $N] $a} +% +% \end{tcl} +% \end{proc} +% +% The description of |make_powm| should come next. It is probably +% best to make the |modulo| and |mulprefix| cases two separate +% prototype |call|s. +% \begin{tcl} +%<*man> +[call [cmd math::numtheory::make_powm] [arg exponent] [method modulo] [arg modulus]] + This command returns a command prefix, which expects an integer + base [arg a] as its only additional argument. That command prefix + computes [arg a] raised to the power [arg exponent] modulo the + [arg modulus], using a "sliding window" variant of the + exponentiation by squaring algorithm. The [arg exponent] and + [arg modulus] must both be positive, and the implementation is + designed to be performant when these numbers are very large. The + returned command prefix has several large pieces of data embedded + into it, that [cmd make_powm] derives from the [arg exponent] and + [arg modulus], but no related data are cached outside of the command + prefix itself. + [para] + + For example, the definition of [cmd powm] as a convenience call to + [cmd make_powm] is +[example { + proc ::math::numtheory::powm {a b N} { + {*}[make_powm $b modulo $N] $a + } +}] +[call [cmd math::numtheory::make_powm] [arg exponent] [method mulprefix] [arg prefix]] + This command returns a command prefix, which implements raising its + argument to the power [arg exponent], through multiplying various + smaller powers of that argument with each other. The [arg prefix] + argument is the command prefix called upon to actually perform + those multiplications; it should have the call syntax +[example_begin] + {*}[arg prefix] [arg x] [arg y] +[example_end] + and return the product of [arg x] and [arg y]. If [arg powprefix] + similarly is the return value of [cmd make_powm], then it has the + call syntax +[example_begin] + {*}[arg powprefix] [arg a] +[example_end] + and returns [arg a] raised to the power [arg exponent]. + [para] + + The [method modulo] form of [cmd make_powm] is a shorthand that + calls [cmd make_modulo] to compute the multiplication prefix. The + longer [method mulprefix] form is useful in two cases. First, if + the caller anyway has that multiplication prefix at hand, it would + be unnecessary for [cmd make_powm] to compute it again. Second, the + reduction of exponentiation to a select sequence of multiplications + is not something that only makes sense in modular arithmetic; as + long as the [arg prefix] implements some associative binary + operation (e.g. matrix multiplication), [cmd make_powm] will + construct the corresponding repetition of that operation, implemented + in a fairly efficient manner. + +% +% \end{tcl} +% +% Testing someting like |make_powm|, designed to be used for very +% large numbers as it is, can be something of a challenge. One +% approach can be to pick two distinct primes $p$ and $q$, and then +% compute $a^{\mathrm{lcm}(p-1,q-1)+1} \bmod pq$ for a number of +% distinct bases $a$, since the result should again be $a$ by (Euler's +% generalisation of) Fermat's little theorem. +% +% Two nice primes are \(p = 10^9+7\) and \(q = 10^9+9\), for which +% the product \(pq = 10^{18} + 16 \cdot 10^9 + 63 = +% 1 \mkern1mu 000 \mkern1mu 000 \mkern1mu 016 \mkern1mu 000 \mkern1mu +% 000 \mkern1mu 063\); this is of the same order of magnitude as $2^{60}$. +% It is immediate that \(\gcd( p -\nobreak 1, q -\nobreak 1) = 2\) and +% hence \(\mathrm{lcm}( p -\nobreak 1, q -\nobreak 1) = +% (p -\nobreak 1)(q -\nobreak 1) \big/ 2 = +% 5 \cdot 10^{17} + 7 \cdot 10^9 + 24\). +% \begin{tcl} +%<*test> +test powm-1.1 "powm(a,lcm(p-1,q-1)+1,pq)=a" -body { + set powm [ + math::numtheory::make_powm 500000007000000025 modulo 1000000016000000063 + ] + list [{*}$powm 2] [{*}$powm 3] [{*}$powm 5] [{*}$powm 7]\ + [{*}$powm 11] [{*}$powm 13] [{*}$powm 100] [{*}$powm 98754] +} -result {2 3 5 7 11 13 100 98754} +% \end{tcl} +% Of course, it would not be entirely impossible for a completely +% messed up implementation to accidentally give back exactly what +% is fed in, so we might as well test that it also gives back $1$ for +% an exponent one less. It is somewhat harder for a complete +% garbage implementation to get both of these right. +% \begin{tcl} +test powm-1.2 "powm(a,lcm(p-1,q-1),pq)=1" -body { + set powm [ + math::numtheory::make_powm 500000007000000024 modulo 1000000016000000063 + ] + list [{*}$powm 2] [{*}$powm 3] [{*}$powm 5] [{*}$powm 7]\ + [{*}$powm 11] [{*}$powm 13] [{*}$powm 100] [{*}$powm 98754] +} -result {1 1 1 1 1 1 1 1} +% \end{tcl} +% And way harder still for it to also get the +% $(p -\nobreak 1)(q -\nobreak 1) + 2$ case right. +% \begin{tcl} +test powm-1.3 "powm(a,lcm(p-1,q-1)+2,pq)=a**2" -body { + set powm [ + math::numtheory::make_powm 500000007000000026 modulo 1000000016000000063 + ] + list [{*}$powm 2] [{*}$powm 3] [{*}$powm 5] [{*}$powm 7]\ + [{*}$powm 11] [{*}$powm 13] [{*}$powm 100] [{*}$powm 98754] +} -result {4 9 25 49 121 169 10000 9752352516} +% \end{tcl} +% +% A different test direction would be to check that all code paths in +% |make_powm| and subsidiaries do things the right number of times. +% Here, it can be convenient to instead of modular multiplication use +% \((x,y) \mapsto x + 3 + y\) as a binary operation. When that is +% applied to $7$, the result will be \(7+3 = 10\) times the +% ``exponent'', minus $3$. +% \begin{tcl} +test powm-2.1 "Code paths" -body { + set nearadd {::apply {{x y} {expr {$x+3+$y}}}} + list [ + {*}[math::numtheory::make_powm 31 mulprefix $nearadd] 7 +% \end{tcl} +% \(31 = 2^5-1\) has a bitsize \(n=4\), and will thus take the +% \(k=1\) shortcut in |make_powm|. +% \begin{tcl} + ] [ + {*}[math::numtheory::make_powm 512 mulprefix $nearadd] 7 +% \end{tcl} +% \(512 = 2^9\) has a bitsize \(n=8\), so |make_powm| computes \(x_1 +% \approx 1.79\) and computes covers for the window sizes \(k=1\) and +% \(k=2\). It picks the former. +% \begin{tcl} + ] [ + {*}[math::numtheory::make_powm 1023 mulprefix $nearadd] 7 +% \end{tcl} +% \(1023 = 2^{10}-1\) also has bitsize \(n=8\), so |make_powm| again +% finds \(x_1 \approx 1.79\) and computes covers for the window sizes +% \(k=1\) and \(k=2\), but here it picks the latter. +% +% The final group of exponents are in octal |470701|, |470703|, and +% |470707|, respectively. The initial |47070| pattern produces equal +% costs for window widths $2$ and $3$, so the final digit decides +% which is used: with |1| there is a tie (and one tail bit), with |3| +% window width $2$ wins (it gets no tail, whereas $3$ would), and +% with |7| window width $3$ wins (it gets no tail, whereas $2$ +% would). +% \begin{tcl} + ] [ + {*}[math::numtheory::make_powm 160193 mulprefix $nearadd] 7 + ] [ + {*}[math::numtheory::make_powm 160195 mulprefix $nearadd] 7 + ] [ + {*}[math::numtheory::make_powm 160199 mulprefix $nearadd] 7 + ] +} -result {307 5117 10227 1601927 1601947 1601987} +% +% \end{tcl} +% +% +% % % % \section{Primes} % % The first (and so far only) operation provided is |isprime|, which @@ -114,13 +1486,14 @@ The only option currently defined is: [list_begin options] [opt_def -randommr [arg repetitions]] which controls how many times the Miller-Rabin test should be - repeated with randomly chosen bases. Each repetition reduces the - probability of a false positive by a factor at least 4. The - default for [arg repetitions] is 4. + repeated with randomly chosen bases. The risk that one iteration + of the test (with a random basis) fails to detect compositeness + is at most 25%, with two iterations it is at most 6.25%, and so + on. The default for [arg repetitions] is 4. [list_end] Unknown options are silently ignored. % % \end{tcl} @@ -831,11 +2204,17 @@ [::math::numtheory::isprime $n -randommr 2] } -result {on on on 0} % \end{tcl} % RFC~2409~\cite{RFC2409} lists a number of primes (and primitive % generators of their respective multiplicative groups). The -% smallest of these is defined as \(p = 2^{768} - 2^{704} - 1 + +% smallest of these\footnote{ +% The authors of the Logjam attack calculate that it would be +% feasible for an academic team to precompute tables that would +% allow computing the discrete logarithm modulo this prime (or +% any other given prime of similar size), so it shouldn't be used +% in cryptographic applications anymore. +% } is defined as \(p = 2^{768} - 2^{704} - 1 + % 2^{64} \cdot \left( [2^{638} \pi] + 149686 \right)\) (where the % brackets probably denote rounding to the nearest integer), but % since high precision (roughly $200$ decimal digits would be % required) values of \(\pi = 3.14159\dots\) are a bit awkward to % get hold of, we might as well use the stated hexadecimal digit @@ -853,10 +2232,12 @@ list\ [::math::numtheory::isprime 0x$digits]\ [::math::numtheory::isprime 0x[string reverse $digits]] } -result {on 0} % \end{tcl} +% I.e., the second (least significant digit first) interpretation +% is clearly not a prime. % % A quite different thing to test is that the tweaked PRNG really % produces only \(a \equiv 1,5 \pmod{6}\). % \begin{tcl} test isprime-2.0 "PRNG tweak" -setup { @@ -883,11 +2264,12 @@ % % \begin{tcl} %<*man> [list_end] -[keywords {number theory} prime] +[vset CATEGORY {math :: numtheory}] +[include ../doctools2base/include/feedback.inc] [manpage_end] % % \end{tcl} % % \begin{tcl} Index: modules/math/numtheory.man ================================================================== --- modules/math/numtheory.man +++ modules/math/numtheory.man @@ -1,23 +1,136 @@ -[manpage_begin math::numtheory n 1.0] +[manpage_begin math::numtheory n 1.1] +[keywords {modular exponentiation}] [keywords {number theory}] [keywords prime] [copyright "2010 Lars Hellstr\u00F6m\ "] [moddesc {Tcl Math Library}] [titledesc {Number Theory}] [category Mathematics] [require Tcl [opt 8.5]] -[require math::numtheory [opt 1.0]] +[require math::numtheory [opt 1.1]] [description] [para] This package is for collecting various number-theoretic operations, though at the moment it only provides that of testing whether an integer is a prime. [list_begin definitions] +[call [cmd math::numtheory::make_modulo] [arg subcmd] [arg N]] + The [cmd make_modulo] command precomputes some data that are useful + for quickly computing remainders modulo [arg N], which must be a + positive integer. The format of the return value depends on the + [arg subcmd]. This kind of preparation is primarily of interest if + one needs to compute many remainders modulo the same [arg N], but + that turns out to be quite common. + + The [arg subcmd]s are: + [list_begin commands] + [cmd_def prefix] + Returns a [emph {command prefix}] that expects one extra argument + [arg x]. + [example_begin]{*}[arg modprefix] [arg x][example_end] + This argument [arg x] is an integer which should be nonnegative + and less than [arg N]**2. The result of that call will be the + remainder of [arg x] divided by [arg N]. + [cmd_def mulprefix] + Returns a command prefix that expects two extra arguments + [arg x] and [arg y]. + [example_begin]{*}[arg mulmodprefix] [arg x] [arg y][example_end] + These arguments are integers whose product should be nonnegative + and less than [arg N]**2 (typically, [arg x] and [arg y] are both + less than [arg N]), and the result of that call will be the + remainder of [arg x]*[arg y] divided by [arg N]. + [cmd_def Barrett-parameters] + Returns a list of three integers [arg N], [arg M], and [arg s] + that are what one needs in order to perform Barrett reduction + ([uri {http://en.wikipedia.org/wiki/Barrett_reduction}]). [arg N] + is the argument. The shift amount [arg s] satisfies 2**([arg s]/2) + > [arg N]. The scaled reciprocal [arg M] satisfies + [arg M]*[arg N] >= 2**[arg s] > ([arg M]-1)*[arg N]. + [list_end] + + Note that the [arg modprefix] computed is tuned for applications + where the number to be reduced modulo [arg N] is uniformly + distributed in the interval from 0 (inclusive) to [arg N]**2. In + applications where the number comes from a much smaller interval, + the choice of algorithm for the [arg modprefix] may be suboptimal, + but many number-theoretical and cryptographic algorithms end up + exercising the full range. + +[call [cmd math::numtheory::powm] [arg a] [arg b] [arg N]] + The [cmd powm] command performs a modular exponentiation, i.e., it + modulo [arg N] raises [arg a] to the power [arg b]. The arguments + must all be integers and satisfy [arg N],[arg b]>=1. It is + furthermore preferred that 0<=[arg a]<[arg N]; if it is not, then + the result may sometimes fail to be so bounded, even though it will + still be congruent to [arg a]**[arg b] modulo [arg N]. + [para] + + This command is optimised for the case of large [arg b] and + [arg N]; appropriate algorithms will be chosen also for small + argument values, but the overhead for [emph {making the choice}] + could then be nonnegligible. If the same values for [arg b] and + [arg N] are to be used several times, then the overhead can be + reduced considerably by instead using the [cmd make_powm] command + to construct a command prefix which has those values hardwired. + ([cmd powm] is in fact a convenience wrapper around + [cmd make_powm].) + +[call [cmd math::numtheory::make_powm] [arg exponent] [method modulo] [arg modulus]] + This command returns a command prefix, which expects an integer + base [arg a] as its only additional argument. That command prefix + computes [arg a] raised to the power [arg exponent] modulo the + [arg modulus], using a "sliding window" variant of the + exponentiation by squaring algorithm. The [arg exponent] and + [arg modulus] must both be positive, and the implementation is + designed to be performant when these numbers are very large. The + returned command prefix has several large pieces of data embedded + into it, that [cmd make_powm] derives from the [arg exponent] and + [arg modulus], but no related data are cached outside of the command + prefix itself. + [para] + + For example, the definition of [cmd powm] as a convenience call to + [cmd make_powm] is +[example { + proc ::math::numtheory::powm {a b N} { + {*}[make_powm $b modulo $N] $a + } +}] +[call [cmd math::numtheory::make_powm] [arg exponent] [method mulprefix] [arg prefix]] + This command returns a command prefix, which implements raising its + argument to the power [arg exponent], through multiplying various + smaller powers of that argument with each other. The [arg prefix] + argument is the command prefix called upon to actually perform + those multiplications; it should have the call syntax +[example_begin] + {*}[arg prefix] [arg x] [arg y] +[example_end] + and return the product of [arg x] and [arg y]. If [arg powprefix] + similarly is the return value of [cmd make_powm], then it has the + call syntax +[example_begin] + {*}[arg powprefix] [arg a] +[example_end] + and returns [arg a] raised to the power [arg exponent]. + [para] + + The [method modulo] form of [cmd make_powm] is a shorthand that + calls [cmd make_modulo] to compute the multiplication prefix. The + longer [method mulprefix] form is useful in two cases. First, if + the caller anyway has that multiplication prefix at hand, it would + be unnecessary for [cmd make_powm] to compute it again. Second, the + reduction of exponentiation to a select sequence of multiplications + is not something that only makes sense in modular arithmetic; as + long as the [arg prefix] implements some associative binary + operation (e.g. matrix multiplication), [cmd make_powm] will + construct the corresponding repetition of that operation, implemented + in a fairly efficient manner. + [call [cmd math::numtheory::isprime] [arg N] [ opt "[arg option] [arg value] ..." ]] The [cmd isprime] command tests whether the integer [arg N] is a prime, returning a boolean true value for prime [arg N] and a @@ -41,16 +154,17 @@ The only option currently defined is: [list_begin options] [opt_def -randommr [arg repetitions]] which controls how many times the Miller-Rabin test should be - repeated with randomly chosen bases. Each repetition reduces the - probability of a false positive by a factor at least 4. The - default for [arg repetitions] is 4. + repeated with randomly chosen bases. The risk that one iteration + of the test (with a random basis) fails to detect compositeness + is at most 25%, with two iterations it is at most 6.25%, and so + on. The default for [arg repetitions] is 4. [list_end] Unknown options are silently ignored. [list_end] [vset CATEGORY {math :: numtheory}] [include ../doctools2base/include/feedback.inc] [manpage_end] Index: modules/math/numtheory.tcl ================================================================== --- modules/math/numtheory.tcl +++ modules/math/numtheory.tcl @@ -11,18 +11,163 @@ ## ************************************** ## * This Source is not the True Source * ## ************************************** ## the true source is the file from which this one was generated. ## -# Copyright (c) 2010 by Lars Hellstrom. All rights reserved. +# Copyright (c) 2010, 2015 by Lars Hellstrom. All rights reserved. # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. package require Tcl 8.5 -package provide math::numtheory 1.0 +package provide math::numtheory 1.1 namespace eval ::math::numtheory { namespace export isprime } +proc ::math::numtheory::Barrett_mod {N invN shift a} { + set r [expr {$a - (($a * $invN) >> $shift) * $N}] + if {$r<0} then {incr r $N} + return $r +} +proc ::math::numtheory::Barrett_parameters {N} { + set s [expr {8*[string length [format %llx $N]]}] + set M [expr {-( (-1 << $s) / $N)}] + return [list $N $M $s] +} +namespace eval ::math::numtheory { + namespace ensemble create -command make_modulo -map { + Barrett-parameters Barrett_parameters + prefix {Make_modulo_prefix 0} + mulprefix {Make_modulo_prefix 1} + } +} +proc ::math::numtheory::Barrett_mulmod {N invN shift x y} { + set a [expr {$x*$y}] + set r [expr {$a - (($a * $invN) >> $shift) * $N}] + if {$r<0} then {incr r $N} + return $r +} +proc ::math::numtheory::Plain_mod {N a} {expr {$a % $N}} +proc ::math::numtheory::Plain_mulmod {N x y} {expr {$x*$y % $N}} +proc ::math::numtheory::Make_modulo_prefix {mulQ N} { + set P [Barrett_parameters $N] + if {[lindex $P 2] >= 1250} then { + return [linsert $P 0 [ + namespace which [lindex {Barrett_mod Barrett_mulmod} $mulQ] + ]] + } else { + return [list [ + namespace which [lindex {Plain_mod Plain_mulmod} $mulQ] + ] $N] + } +} +proc ::math::numtheory::Sliding_window_powm {k i0 iL tL mulmod a} { + set apow $a + for {set r 1} {$r < $k} {incr r} { + set apow [{*}$mulmod $apow $apow] + } + set aL [list $apow] + set r [expr {1 << ($k-1)}] + while {[llength $aL] < $r} { + set apow [{*}$mulmod $apow $a] + lappend aL $apow + } + set res [lindex $aL $i0] + foreach i $iL { + if {$i < 0} then { + set res [{*}$mulmod $res $res] + } else { + for {set r 0} {$r < $k} {incr r} { + set res [{*}$mulmod $res $res] + } + set res [{*}$mulmod $res [lindex $aL $i]] + } + } + foreach i $tL { + set res [{*}$mulmod $res $res] + if {$i} then { + set res [{*}$mulmod $res $a] + } + } + return $res +} +proc ::math::numtheory::Sliding_window_cover {k bitlist} { + if {$k>=2} then { + set km1 [expr {$k-1}] + set cost [expr {(1 << $km1) - 2}] + set imL {} + set collect 0 + foreach bit $bitlist { + if {$collect>0} then { + set i [expr {($i<<1) | $bit}] + if {[incr collect -1]} then { + lappend tail $bit + } else { + lappend imL $i + incr cost + } + } elseif {$bit} then { + set tail [list $bit] + set i 0 + set collect $km1 + } else { + lappend imL -1 + } + } + } else { + set imL [list 0] + set tail [lrange $bitlist 1 end] + set cost 0 + set collect 1 + } + if {$collect} then { + foreach bit $tail {incr cost $bit} + } else { + set tail {} + } + return [list $cost $imL $tail] +} +proc ::math::numtheory::make_powm {b style data} { + if {$b < 1} then { + return -code error "Exponent must be positive" + } + switch $style { + modulo { + set mulmod [make_modulo mulprefix $data] + } + mulprefix { + set mulmod $data + } + default { + return -code error\ + {Modulo style must be "modulo" or "mulprefix"} + } + } + set bitlist [split [string trimleft [ + string map {0 000 1 001 2 010 3 011 4 100 5 101 6 110 7 111}\ + [format %llo $b] + ] 0] ""] + if {[llength $bitlist] < 7} then { + return [list [namespace which Sliding_window_powm]\ + 1 0 {} [lrange $bitlist 1 end] $mulmod] + } + set x [expr {log([llength $bitlist]-1)/log(2) + 2}] + set x [expr {$x - + (log($x) + log($x+1)) / (1/$x + 1/($x+1) + log(2)) + }] + set k [expr {int($x)}] + set down [Sliding_window_cover $k $bitlist] + incr k + set up [Sliding_window_cover $k $bitlist] + if {[lindex $down 0] < [lindex $up 0]} then { + incr k -1 + } else { + set down $up + } + return [list [namespace which Sliding_window_powm]\ + $k [lindex $down 1 0] [lrange [lindex $down 1] 1 end]\ + [lindex $down 2] $mulmod] +} +proc ::math::numtheory::powm {a b N} {{*}[make_powm $b modulo $N] $a} proc ::math::numtheory::prime_trialdivision {n} { if {$n<2} then {return -code return 0} if {$n<4} then {return -code return 1} if {$n%2 == 0} then {return -code return 0} if {$n<9} then {return -code return 1} Index: modules/math/numtheory.test ================================================================== --- modules/math/numtheory.test +++ modules/math/numtheory.test @@ -17,10 +17,55 @@ [file dirname [file dirname [file join [pwd] [info script]]]]\ devtools testutilities.tcl] testsNeedTcl 8.5 testsNeedTcltest 2 testing {useLocal numtheory.tcl math::numtheory} +test Barrett-1.1 "1e643" -body { + set mod [math::numtheory::make_modulo prefix 1[string repeat 0 643]] + list [ + {*}$mod 1[string repeat 0 642]9 + ] [ + {*}$mod 10[string repeat 9 643] + ] +} -result [list 9 [string repeat 9 643]] +test powm-1.1 "powm(a,lcm(p-1,q-1)+1,pq)=a" -body { + set powm [ + math::numtheory::make_powm 500000007000000025 modulo 1000000016000000063 + ] + list [{*}$powm 2] [{*}$powm 3] [{*}$powm 5] [{*}$powm 7]\ + [{*}$powm 11] [{*}$powm 13] [{*}$powm 100] [{*}$powm 98754] +} -result {2 3 5 7 11 13 100 98754} +test powm-1.2 "powm(a,lcm(p-1,q-1),pq)=1" -body { + set powm [ + math::numtheory::make_powm 500000007000000024 modulo 1000000016000000063 + ] + list [{*}$powm 2] [{*}$powm 3] [{*}$powm 5] [{*}$powm 7]\ + [{*}$powm 11] [{*}$powm 13] [{*}$powm 100] [{*}$powm 98754] +} -result {1 1 1 1 1 1 1 1} +test powm-1.3 "powm(a,lcm(p-1,q-1)+2,pq)=a**2" -body { + set powm [ + math::numtheory::make_powm 500000007000000026 modulo 1000000016000000063 + ] + list [{*}$powm 2] [{*}$powm 3] [{*}$powm 5] [{*}$powm 7]\ + [{*}$powm 11] [{*}$powm 13] [{*}$powm 100] [{*}$powm 98754] +} -result {4 9 25 49 121 169 10000 9752352516} +test powm-2.1 "Code paths" -body { + set nearadd {::apply {{x y} {expr {$x+3+$y}}}} + list [ + {*}[math::numtheory::make_powm 31 mulprefix $nearadd] 7 + ] [ + {*}[math::numtheory::make_powm 512 mulprefix $nearadd] 7 + ] [ + {*}[math::numtheory::make_powm 1023 mulprefix $nearadd] 7 + ] [ + {*}[math::numtheory::make_powm 160193 mulprefix $nearadd] 7 + ] [ + {*}[math::numtheory::make_powm 160195 mulprefix $nearadd] 7 + ] [ + {*}[math::numtheory::make_powm 160199 mulprefix $nearadd] 7 + ] +} -result {307 5117 10227 1601927 1601947 1601987} test prime_trialdivision-1 "Trial division of 1" -body { ::math::numtheory::prime_trialdivision 1 } -returnCodes 2 -result 0 test prime_trialdivision-2 "Trial division of 2" -body { ::math::numtheory::prime_trialdivision 2 Index: modules/math/pkgIndex.tcl ================================================================== --- modules/math/pkgIndex.tcl +++ modules/math/pkgIndex.tcl @@ -24,10 +24,10 @@ package ifneeded math::machineparameters 0.1 [list source [file join $dir machineparameters.tcl]] if {![package vsatisfies [package provide Tcl] 8.5]} {return} package ifneeded math::calculus::symdiff 1.0 [list source [file join $dir symdiff.tcl]] package ifneeded math::bigfloat 2.0.2 [list source [file join $dir bigfloat2.tcl]] -package ifneeded math::numtheory 1.0 [list source [file join $dir numtheory.tcl]] +package ifneeded math::numtheory 1.1 [list source [file join $dir numtheory.tcl]] package ifneeded math::decimal 1.0.3 [list source [file join $dir decimal.tcl]] if {![package vsatisfies [package require Tcl] 8.6]} {return} package ifneeded math::exact 1.0 [list source [file join $dir exact.tcl]]