Merge branch 'master' into optional-semicolons

author: gingerBill <bill@gingerbill.org> 2021-09-06 16:46:57 +0100
committer: gingerBill <bill@gingerbill.org> 2021-09-06 16:46:57 +0100
commit: 2800d4b8d0698d813584989b74cefed8c7e37a40 (patch)
tree: 23b554635130305a05aa5d2a787a1964fdf66709 /core/math/big/private.odin
parent: 720884e0f1d6f15c248f8fbe7b86aa146cedac72 (diff)
parent: bc15ce302c0e095fe8db245194e59adc0533eebe (diff)
1 files changed, 1032 insertions, 30 deletions
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index 241cd7441..b3d4d80e5 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -17,6 +15,7 @@ package math_big
 
 	These aren't exported for the same reasons.
 */
+package math_big
 
 import "core:intrinsics"
 import "core:mem"
@@ -1072,11 +1071,11 @@ _private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^In
 
 	internal_shl_digit(y, n - t) or_return
 
-	c := internal_cmp(x, y)
-	for c != -1 {
+	gte := internal_gte(x, y)
+	for gte {
 		q.digit[n - t] += 1
 		internal_sub(x, x, y) or_return
-		c = internal_cmp(x, y)
+		gte = internal_gte(x, y)
 	}
 
 	/*
@@ -1135,7 +1134,7 @@ _private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^In
 			t2.digit[2] = x.digit[i]
 			t2.used = 3
 
-			if t1_t2 := internal_cmp_mag(t1, t2); t1_t2 != 1 {
+			if internal_lte(t1, t2) {
 				break
 			}
 			iter += 1; if iter > 100 {
@@ -1228,15 +1227,13 @@ _private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := con
 	/*
 		While A1 < 0 do Q1 = Q1 - 1, A1 = A1 + (beta^k * B)
 	*/
-	if internal_cmp(A1, 0) == -1 {
+	if internal_lt(A1, 0) {
 		internal_shl(t, b, k * _DIGIT_BITS) or_return
 
 		for {
 			internal_decr(Q1) or_return
 			internal_add(A1, A1, t) or_return
-			if internal_cmp(A1, 0) != -1 {
-				break
-			}
+			if internal_gte(A1, 0) { break }
 		}
 	}
 
@@ -1257,7 +1254,7 @@ _private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := con
 	/*
 		While A2 < 0 do Q0 = Q0 - 1, A2 = A2 + B.
 	*/
-	for internal_cmp(A2, 0) == -1 {
+	for internal_is_negative(A2) { // internal_lt(A2, 0) {
 		internal_decr(Q0) or_return
 		internal_add(A2, A2, b) or_return
 	}
@@ -1376,7 +1373,7 @@ _private_int_div_recursive :: proc(quotient, remainder, a, b: ^Int, allocator :=
 _private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int) -> (err: Error) {
 
 	ta, tb, tq, q := &Int{}, &Int{}, &Int{}, &Int{}
-	c: int
+
 	defer internal_destroy(ta, tb, tq, q)
 
 	for {
@@ -1392,7 +1389,7 @@ _private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int
 		shl(tq, tq, n)       or_return
 
 		for n >= 0 {
-			if c, _ = cmp_mag(ta, tb); c == 0 || c == 1 {
+			if internal_gte(ta, tb) {
 				// ta -= tb
 				sub(ta, ta, tb) or_return
 				//  q += tq
@@ -1438,7 +1435,7 @@ _private_int_factorial_binary_split :: proc(res: ^Int, n: int, allocator := cont
 	internal_one(inner, false)                                       or_return
 	internal_one(outer, false)                                       or_return
 
-	bits_used := int(_DIGIT_TYPE_BITS - intrinsics.count_leading_zeros(n))
+	bits_used := ilog2(n)
 
 	for i := bits_used; i >= 0; i -= 1 {
 		start := (n >> (uint(i) + 1)) + 1 | 1
@@ -1597,7 +1594,7 @@ _private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.
 		/*
 			Make sure `v` is the largest.
 		*/
-		if internal_cmp_mag(u, v) == 1 {
+		if internal_gt(u, v) {
 			/*
 				Swap `u` and `v` to make sure `v` is >= `u`.
 			*/
@@ -1635,7 +1632,7 @@ _private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.
 		Computes least common multiple as `|a*b|/gcd(a,b)`
 		Divide the smallest by the GCD.
 	*/
-	if internal_cmp_mag(a, b) == -1 {
+	if internal_lt_abs(a, b) {
 		/*
 			Store quotient in `t2` such that `t2 * b` is the LCM.
 		*/
@@ -1695,9 +1692,7 @@ _private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -
 		/*
 			Iterate until `a` is bracketed between low + high.
 		*/
-		if #force_inline internal_cmp(bracket_high, a) != -1 {
-			break
-		}
+		if #force_inline internal_gte(bracket_high, a) { break }
 
 		low = high
 		#force_inline internal_copy(bracket_low, bracket_high) or_return
@@ -1731,9 +1726,6 @@ _private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -
 	return
 }
 
-
-
-
 /*
 	Computes xR**-1 == x (mod N) via Montgomery Reduction.
 	This is an optimized implementation of `internal_montgomery_reduce`
@@ -1754,7 +1746,7 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 	/*
 		Grow `x` as required.
 	*/
-	internal_grow(x, n.used + 1) or_return
+	internal_grow(x, n.used + 1)                                     or_return
 
 	/*
 		First we have to get the digits of the input into an array of double precision words W[...]
@@ -1848,9 +1840,1018 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 	/*
 		if A >= m then A = A - m
 	*/
-	if internal_cmp_mag(x, n) != -1 {
+	if internal_gte_abs(x, n) {
+		return internal_sub(x, x, n)
+	}
+	return nil
+}
+
+/*
+	Computes xR**-1 == x (mod N) via Montgomery Reduction.
+	Assumes `x` and `n` not to be nil.
+*/
+_private_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+	/*
+		Can the fast reduction [comba] method be used?
+		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],
+		since carries are fixed up in the inner loop.
+	*/
+	internal_clear_if_uninitialized(x, n) or_return
+
+	digs := (n.used * 2) + 1
+	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
+		return _private_montgomery_reduce_comba(x, n, rho)
+	}
+
+	/*
+		Grow the input as required
+	*/
+	internal_grow(x, digs)                                           or_return
+	x.used = digs
+
+	for ix := 0; ix < n.used; ix += 1 {
+		/*
+			`mu = ai * rho mod b`
+			The value of rho must be precalculated via `int_montgomery_setup()`,
+			such that it equals -1/n0 mod b this allows the following inner loop
+			to reduce the input one digit at a time.
+		*/
+
+		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK))
+
+		/*
+			a = a + mu * m * b**i
+			Multiply and add in place.
+		*/
+		u  := DIGIT(0)
+		iy := int(0)
+		for ; iy < n.used; iy += 1 {
+			/*
+				Compute product and sum.
+			*/
+			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]))
+
+			/*
+				Get carry.
+			*/
+			u = DIGIT(r >> _DIGIT_BITS)
+
+			/*
+				Fix digit.
+			*/
+			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
+		}
+
+		/*
+			At this point the ix'th digit of x should be zero.
+			Propagate carries upwards as required.
+		*/
+		for u != 0 {
+			x.digit[ix + iy] += u
+			u = x.digit[ix + iy] >> _DIGIT_BITS
+			x.digit[ix + iy] &= _MASK
+			iy += 1
+		}
+	}
+
+	/*
+		At this point the n.used'th least significant digits of x are all zero,
+		which means we can shift x to the right by n.used digits and the
+		residue is unchanged.
+
+		x = x/b**n.used.
+	*/
+	internal_clamp(x)
+	internal_shr_digit(x, n.used)
+
+	/*
+		if x >= n then x = x - n
+	*/
+	if internal_gte_abs(x, n) {
 		return internal_sub(x, x, n)
 	}
+
+	return nil
+}
+
+/*
+	Shifts with subtractions when the result is greater than b.
+
+	The method is slightly modified to shift B unconditionally upto just under
+	the leading bit of b.  This saves alot of multiple precision shifting.
+
+	Assumes `a` and `b` not to be `nil`.
+*/
+_private_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+	/*
+		How many bits of last digit does b use.
+	*/
+	internal_clear_if_uninitialized(a, b) or_return
+
+	bits := internal_count_bits(b) % _DIGIT_BITS
+
+	if b.used > 1 {
+		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1
+		internal_int_power_of_two(a, power)                          or_return
+	} else {
+		internal_one(a)                                              or_return
+		bits = 1
+	}
+
+	/*
+		Now compute C = A * B mod b.
+	*/
+	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
+		internal_int_shl1(a, a)                                      or_return
+		if internal_gte_abs(a, b) {
+			internal_sub(a, a, b)                                    or_return
+		}
+	}
+	return nil
+}
+
+/*
+	Sets up the Montgomery reduction stuff.
+*/
+_private_int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
+	/*
+		Fast inversion mod 2**k
+		Based on the fact that:
+
+		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
+		                  =>  2*X*A - X*X*A*A = 1
+		                  =>  2*(1) - (1)     = 1
+	*/
+	internal_clear_if_uninitialized(n, allocator) or_return
+
+	b := n.digit[0]
+	if b & 1 == 0 { return 0, .Invalid_Argument }
+
+	x := (((b + 2) & 4) << 1) + b /* here x*a==1 mod 2**4 */
+	x *= 2 - (b * x)              /* here x*a==1 mod 2**8 */
+	x *= 2 - (b * x)              /* here x*a==1 mod 2**16 */
+
+	when _DIGIT_TYPE_BITS == 64 {
+		x *= 2 - (b * x)              /* here x*a==1 mod 2**32 */
+		x *= 2 - (b * x)              /* here x*a==1 mod 2**64 */
+	}
+
+	/*
+		rho = -1/m mod b
+	*/
+	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK))
+	return rho, nil
+}
+
+/*
+	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
+	From HAC pp.604 Algorithm 14.42
+
+	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
+*/
+_private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	q := &Int{}
+	defer internal_destroy(q)
+	um := m.used
+
+	/*
+		q = x
+	*/
+	internal_copy(q, x)                                              or_return
+
+	/*
+		q1 = x / b**(k-1)
+	*/
+	internal_shr_digit(q, um - 1)
+
+	/*
+		According to HAC this optimization is ok.
+	*/
+	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
+		internal_mul(q, q, mu)                                       or_return
+	} else {
+		_private_int_mul_high(q, q, mu, um)                          or_return
+	}
+
+	/*
+		q3 = q2 / b**(k+1)
+	*/
+	internal_shr_digit(q, um + 1)
+
+	/*
+		x = x mod b**(k+1), quick (no division)
+	*/
+	internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1))              or_return
+
+	/*
+		q = q * m mod b**(k+1), quick (no division)
+	*/
+	_private_int_mul(q, q, m, um + 1)                                or_return
+
+	/*
+		x = x - q
+	*/
+	internal_sub(x, x, q)                                            or_return
+
+	/*
+		If x < 0, add b**(k+1) to it.
+	*/
+	if internal_is_negative(x) {
+		internal_set(q, 1)                                           or_return
+		internal_shl_digit(q, um + 1)                                or_return
+		internal_add(x, x, q)                                        or_return
+	}
+
+	/*
+		Back off if it's too big.
+	*/
+	for internal_gte(x, m) {
+		internal_sub(x, x, m)                                        or_return
+	}
+
+	return nil
+}
+
+/*
+	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
+*/
+_private_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	q := &Int{}
+	defer internal_destroy(q)
+
+	internal_zero(q)                                                 or_return
+
+	p := internal_count_bits(n)
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return
+
+		if d != 1 {
+			/*
+				q = q * d
+			*/
+			internal_mul(q, q, d)                                    or_return
+		}
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return
+		if internal_lt_abs(a, n)                                     { break }
+		internal_sub(a, a, n)                                        or_return
+	}
+
+	return nil
+}
+
+/*
+	Reduces `a` modulo `n` where `n` is of the form 2**p - d
+	This differs from reduce_2k since "d" can be larger than a single digit.
+*/
+_private_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	q := &Int{}
+	defer internal_destroy(q)
+
+	internal_zero(q)                                                 or_return
+
+	p := internal_count_bits(n)
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return
+
+		/*
+			q = q * d
+		*/
+		internal_mul(q, q, d)                                        or_return
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return
+		if internal_lt_abs(a, n)                                     { break }
+		internal_sub(a, a, n)                                        or_return
+	}
+
+	return nil
+}
+
+/*
+	Determines if `internal_int_reduce_2k` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+_private_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a)
+
+	if internal_is_zero(a) {
+		return false, nil
+	} else if a.used == 1 {
+		return true, nil
+	} else if a.used  > 1 {
+		iy := internal_count_bits(a)
+		iw := 1
+		iz := DIGIT(1)
+
+		/*
+			Test every bit from the second digit up, must be 1.
+		*/
+		for ix := _DIGIT_BITS; ix < iy; ix += 1 {
+			if a.digit[iw] & iz == 0 {
+				return false, nil
+			}
+
+			iz <<= 1
+			if iz > _DIGIT_MAX {
+				iw += 1
+				iz  = 1
+			}
+		}
+		return true, nil
+	} else {
+		return true, nil
+	}
+}
+
+/*
+	Determines if `internal_int_reduce_2k_l` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+_private_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a)
+
+	if internal_int_is_zero(a) {
+		return false, nil
+	} else if a.used == 1 {
+		return true, nil
+	} else if a.used  > 1 {
+		/*
+			If more than half of the digits are -1 we're sold.
+		*/
+		ix := 0
+		iy := 0
+
+		for ; ix < a.used; ix += 1 {
+			if a.digit[ix] == _DIGIT_MAX {
+				iy += 1
+			}
+		}
+		return iy >= (a.used / 2), nil
+	} else {
+		return false, nil
+	}
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` is not `nil`.
+*/
+_private_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
+	context.allocator = allocator
+
+	tmp := &Int{}
+	defer internal_destroy(tmp)
+	internal_zero(tmp)                                               or_return
+
+	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return
+	internal_sub(tmp, tmp, a)                                        or_return
+
+	return tmp.digit[0], nil
+}
+
+/*
+	Determines the setup value.
+	Assumes `mu` and `P` are not `nil`.
+
+	d := (1 << a.bits) - a;
+*/
+_private_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	tmp := &Int{}
+	defer internal_destroy(tmp)
+	internal_zero(tmp)                                               or_return
+
+	internal_int_power_of_two(tmp, internal_count_bits(P))           or_return
+	internal_sub(mu, tmp, P)                                         or_return
+
+	return nil
+}
+
+/*
+	Pre-calculate the value required for Barrett reduction.
+	For a given modulus "P" it calulates the value required in "mu"
+	Assumes `mu` and `P` are not `nil`.
+*/
+_private_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	internal_int_power_of_two(mu, P.used * 2 * _DIGIT_BITS)           or_return
+	return internal_int_div(mu, mu, P)
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+_private_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
+	/*
+		The casts are required if _DIGIT_BITS is one less than
+		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
+	*/
+	return DIGIT((1 << _DIGIT_BITS) - a.digit[0])
+}
+
+/*
+	Determines if a number is a valid DR modulus.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+_private_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
+	/*
+		Must be at least two digits.
+	*/
+	if a.used < 2 { return false }
+
+	/*
+		Must be of the form b**k - a [a <= b] so all but the first digit must be equal to -1 (mod b).
+	*/
+	for ix := 1; ix < a.used; ix += 1 {
+		if a.digit[ix] != _MASK {
+			return false
+		}
+	}
+	return true
+}
+
+/*
+	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+	Based on algorithm from the paper
+
+		"Generating Efficient Primes for Discrete Log Cryptosystems"
+					Chae Hoon Lim, Pil Joong Lee,
+			POSTECH Information Research Laboratories
+
+	The modulus must be of a special format [see manual].
+	Has been modified to use algorithm 7.10 from the LTM book instead
+
+	Input x must be in the range 0 <= x <= (n-1)**2
+	Assumes `x` and `n` to not be `nil` and to have been initialized.
+*/
+_private_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
+	/*
+		m = digits in modulus.
+	*/
+	m := n.used
+
+	/*
+		Ensure that "x" has at least 2m digits.
+	*/
+	internal_grow(x, m + m)                                          or_return
+
+	/*
+		Top of loop, this is where the code resumes if another reduction pass is required.
+	*/
+	for {
+		i: int
+		mu := DIGIT(0)
+
+		/*
+			Compute (x mod B**m) + k * [x/B**m] inline and inplace.
+		*/
+		for i = 0; i < m; i += 1 {
+			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu)
+			x.digit[i] = DIGIT(r & _WORD(_MASK))
+			mu         = DIGIT(r >> _WORD(_DIGIT_BITS))
+		}
+
+		/*
+			Set final carry.
+		*/
+		x.digit[i] = mu
+
+		/*
+			Zero words above m.
+		*/
+		mem.zero_slice(x.digit[m + 1:][:x.used - m])
+
+		/*
+			Clamp, sub and return.
+		*/
+		internal_clamp(x)                                            or_return
+
+		/*
+			If x >= n then subtract and reduce again.
+			Each successive "recursion" makes the input smaller and smaller.
+		*/
+		if internal_lt_abs(x, n) { break }
+
+		internal_sub(x, x, n)                                        or_return
+	}
+	return nil
+}
+
+/*
+	Computes res == G**X mod P.
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+_private_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	M := [_TAB_SIZE]Int{}
+	winsize: uint
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error)
+
+	defer {
+		internal_destroy(&M[1])
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x])
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X)
+	switch {
+	case x <= 7:
+		winsize = 2
+	case x <= 36:
+		winsize = 3
+	case x <= 140:
+		winsize = 4
+	case x <= 450:
+		winsize = 5
+	case x <= 1303:
+		winsize = 6
+	case x <= 3529:
+		winsize = 7
+	case:
+		winsize = 8
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize
+
+	/*
+		Init M array.
+		Init first cell.
+	*/
+	internal_zero(&M[1])                                             or_return
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_zero(&M[x])                                         or_return
+	}
+
+	/*
+		Create `mu`, used for Barrett reduction.
+	*/
+	mu := &Int{}
+	defer internal_destroy(mu)
+	internal_zero(mu)                                                or_return
+
+	if redmode == 0 {
+		_private_int_reduce_setup(mu, P)                             or_return
+		redux = _private_int_reduce
+	} else {
+		_private_int_reduce_2k_setup_l(mu, P)                        or_return
+		redux = _private_int_reduce_2k_l
+	}
+
+	/*
+		Create M table.
+
+		The M table contains powers of the base, e.g. M[x] = G**x mod P.
+		The first half of the table is not computed, though, except for M[0] and M[1].
+	*/
+	internal_int_mod(&M[1], G, P)                                    or_return
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+
+		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
+		instead of repeated squaring.
+	*/
+	slot := 1 << (winsize - 1)
+	internal_copy(&M[slot], &M[1])                                   or_return
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		/*
+			Square it.
+		*/
+		internal_sqr(&M[slot], &M[slot])                             or_return
+
+		/*
+			Reduce modulo P
+		*/
+		redux(&M[slot], P, mu)                                       or_return
+	}
+
+	/*
+		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+	*/
+	for x = slot + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return
+		redux(&M[x], P, mu)                                          or_return
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_one(res)                                                or_return
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0
+	bitcnt := 1
+	buf    := DIGIT(0)
+	digidx := X.used - 1
+	bitcpy := uint(0)
+	bitbuf := DIGIT(0)
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits.
+			*/
+			if digidx == -1 { break }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx]
+			digidx -= 1
+			bitcnt = _DIGIT_BITS
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := buf >> (_DIGIT_BITS - 1) & 1
+		buf <<= 1
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit
+			in the exponent.  Technically this opt is not required but it
+			does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 {
+			continue
+		}
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return
+			redux(res, P, mu)                                        or_return
+			continue
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1
+		bitbuf |= (y << (winsize - bitcpy))
+		mode    = 2
+
+		if (bitcpy == winsize) {
+			/*
+				Window is filled so square as required and multiply.
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return
+				redux(res, P, mu)                                    or_return
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return
+			redux(res, P, mu)                                        or_return
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0
+			bitbuf = 0
+			mode   = 1
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < int(bitcpy); x += 1 {
+			internal_sqr(res, res)                                   or_return
+			redux(res, P, mu)                                        or_return
+
+			bitbuf <<= 1
+			if ((bitbuf & (1 << winsize)) != 0) {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return
+				redux(res, P, mu)                                    or_return
+			}
+		}
+	}
+	return err
+}
+
+/*
+	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+
+	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
+	The value of `k` changes based on the size of the exponent.
+
+	Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+_private_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	M := [_TAB_SIZE]Int{}
+	winsize: uint
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error)
+
+	defer {
+		internal_destroy(&M[1])
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x])
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X)
+	switch {
+	case x <= 7:
+		winsize = 2
+	case x <= 36:
+		winsize = 3
+	case x <= 140:
+		winsize = 4
+	case x <= 450:
+		winsize = 5
+	case x <= 1303:
+		winsize = 6
+	case x <= 3529:
+		winsize = 7
+	case:
+		winsize = 8
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize
+
+	/*
+		Init M array
+		Init first cell.
+	*/
+	cap := internal_int_allocated_cap(P)
+	internal_grow(&M[1], cap)                                        or_return
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_grow(&M[x], cap)                                    or_return
+	}
+
+	/*
+		Determine and setup reduction code.
+	*/
+	rho: DIGIT
+
+	if redmode == 0 {
+		/*
+			Now setup Montgomery.
+		*/
+		rho = _private_int_montgomery_setup(P)                       or_return
+
+		/*
+			Automatically pick the comba one if available (saves quite a few calls/ifs).
+		*/
+		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
+			redux = _private_montgomery_reduce_comba
+		} else {
+			/*
+				Use slower baseline Montgomery method.
+			*/
+			redux = _private_int_montgomery_reduce
+		}
+	} else if redmode == 1 {
+		/*
+			Setup DR reduction for moduli of the form B**k - b.
+		*/
+		rho = _private_int_dr_setup(P)
+		redux = _private_int_dr_reduce
+	} else {
+		/*
+			Setup DR reduction for moduli of the form 2**k - b.
+		*/
+		rho = _private_int_reduce_2k_setup(P)                        or_return
+		redux = _private_int_reduce_2k
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_grow(res, cap)                                          or_return
+
+	/*
+		Create M table
+		The first half of the table is not computed, though, except for M[0] and M[1]
+	*/
+
+	if redmode == 0 {
+		/*
+			Now we need R mod m.
+		*/
+		_private_int_montgomery_calc_normalization(res, P)           or_return
+
+		/*
+			Now set M[1] to G * R mod m.
+		*/
+		internal_mulmod(&M[1], G, res, P)                            or_return
+	} else {
+		internal_one(res)                                            or_return
+		internal_mod(&M[1], G, P)                                    or_return
+	}
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+	*/
+	slot := 1 << (winsize - 1)
+	internal_copy(&M[slot], &M[1])                                   or_return
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		internal_sqr(&M[slot], &M[slot])                             or_return
+		redux(&M[slot], P, rho)                                      or_return
+	}
+
+	/*
+		Create upper table.
+	*/
+	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return
+		redux(&M[x], P, rho)                                         or_return
+	}
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0
+	bitcnt := 1
+	buf    := DIGIT(0)
+	digidx := X.used - 1
+	bitcpy := 0
+	bitbuf := DIGIT(0)
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits so break.
+			*/
+			if digidx == -1 { break }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx]
+			digidx -= 1
+			bitcnt = _DIGIT_BITS
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := (buf >> (_DIGIT_BITS - 1)) & 1
+		buf <<= 1
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit in the exponent.
+			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 { continue }
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return
+			redux(res, P, rho)                                       or_return
+			continue
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1
+		bitbuf |= (y << (winsize - uint(bitcpy)))
+		mode    = 2
+
+		if bitcpy == int(winsize) {
+			/*
+				Window is filled so square as required and multiply
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return
+				redux(res, P, rho)                                   or_return
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return
+			redux(res, P, rho)                                       or_return
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0
+			bitbuf = 0
+			mode   = 1
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < bitcpy; x += 1 {
+			internal_sqr(res, res)                                   or_return
+			redux(res, P, rho)                                       or_return
+
+			/*
+				Get next bit of the window.
+			*/
+			bitbuf <<= 1
+			if bitbuf & (1 << winsize) != 0 {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return
+				redux(res, P, rho)                                   or_return
+			}
+		}
+	}
+
+	if redmode == 0 {
+		/*
+			Fixup result if Montgomery reduction is used.
+			Recall that any value in a Montgomery system is actually multiplied by R mod n.
+			So we have to reduce one more time to cancel out the factor of R.
+		*/
+		redux(res, P, rho)                                           or_return
+	}
+
 	return nil
 }
 
@@ -1980,21 +2981,21 @@ _private_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.allocator
 	/*
 		If `v` != `1` then there is no inverse.
 	*/
-	if internal_cmp(v, 1) != 0 {
+	if !internal_eq(v, 1) {
 		return .Invalid_Argument
 	}
 
 	/*
 		If its too low.
 	*/
-	if internal_cmp(C, 0) == -1 {
+	if internal_is_negative(C) {
 		internal_add(C, C, b) or_return
 	}
 
 	/*
 		Too big.
 	*/
-	if internal_cmp(C, 0) != -1 {
+	if internal_gte(C, 0) {
 		internal_sub(C, C, b) or_return
 	}
 
@@ -2147,7 +3148,7 @@ _private_inverse_modulo_odd :: proc(dest, a, b: ^Int, allocator := context.alloc
 	/*
 		Too big.
 	*/
-	for internal_cmp_mag(D, b) != -1 {
+	for internal_gte_abs(D, b) {
 		internal_sub(D, D, b) or_return
 	}
 
@@ -2222,7 +3223,8 @@ _private_int_rem_105 := [?]DIGIT{
 }
 #assert(105 * size_of(DIGIT) == size_of(_private_int_rem_105))
 
-_private_prime_table := [?]DIGIT{
+_PRIME_TAB_SIZE :: 256
+_private_prime_table := [_PRIME_TAB_SIZE]DIGIT{
 	0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
 	0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
 	0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
@@ -2259,7 +3261,7 @@ _private_prime_table := [?]DIGIT{
 	0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
 	0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653,
 }
-#assert(256 * size_of(DIGIT) == size_of(_private_prime_table))
+#assert(_PRIME_TAB_SIZE * size_of(DIGIT) == size_of(_private_prime_table))
 
 when MATH_BIG_FORCE_64_BIT || (!MATH_BIG_FORCE_32_BIT && size_of(rawptr) == 8) {
 	_factorial_table := [35]_WORD{
author	gingerBill <bill@gingerbill.org>	2021-09-06 16:46:57 +0100
committer	gingerBill <bill@gingerbill.org>	2021-09-06 16:46:57 +0100
commit	2800d4b8d0698d813584989b74cefed8c7e37a40 (patch)
tree	23b554635130305a05aa5d2a787a1964fdf66709 /core/math/big/private.odin
parent	720884e0f1d6f15c248f8fbe7b86aa146cedac72 (diff)
parent	bc15ce302c0e095fe8db245194e59adc0533eebe (diff)