Merge pull request #1113 from Kelimion/bigint

big: Add `expt_mod`, new comparison helpers, etc.
author: Jeroen van Rijn <Kelimion@users.noreply.github.com> 2021-09-01 19:25:52 +0200
committer: GitHub <noreply@github.com> 2021-09-01 19:25:52 +0200
commit: e2f035d6eea1a6b6624213cf653fc31bd64f0b9e (patch)
tree: 9c077cfbf196d7e4d58c25d2bb9724c5d81e3707 /core/math
parent: b2cf0755f2aa899a5a87bb92e87d0c46d01f593b (diff)
parent: ae354731edd1d408f1852c4fb19af2c840cbed46 (diff)
14 files changed, 1701 insertions, 337 deletions
diff --git a/core/math/big/api.odin b/core/math/big/api.odin
index 1f2eab8d7..e2761b425 100644
--- a/core/math/big/api.odin
+++ b/core/math/big/api.odin
@@ -1,14 +1,15 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
-	Made available under Odin's BSD-2 license.
+	Made available under Odin's BSD-3 license.
 
 	An arbitrary precision mathematics implementation in Odin.
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 
 	This file collects public proc maps and their aliases.
+*/
+package math_big
+/*
 
 	=== === === === === === === === === === === === === === === === === === === === === === === ===
 	                                    Basic arithmetic.
diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index 31480bcc8..4a6aeeb3e 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -2,6 +2,7 @@
 :odin run . -vet
 
 set TEST_ARGS=-fast-tests
+:set TEST_ARGS=
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
diff --git a/core/math/big/common.odin b/core/math/big/common.odin
index ce1f7d77f..4171d25f3 100644
--- a/core/math/big/common.odin
+++ b/core/math/big/common.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -8,6 +6,7 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
 
 import "core:intrinsics"
 
@@ -57,10 +56,10 @@ when #config(MATH_BIG_EXE, true) {
 	debugged where necessary.
 */
 
-_DEFAULT_MUL_KARATSUBA_CUTOFF :: #config(MUL_KARATSUBA_CUTOFF,  80);
-_DEFAULT_SQR_KARATSUBA_CUTOFF :: #config(SQR_KARATSUBA_CUTOFF, 120);
-_DEFAULT_MUL_TOOM_CUTOFF      :: #config(MUL_TOOM_CUTOFF,      350);
-_DEFAULT_SQR_TOOM_CUTOFF      :: #config(SQR_TOOM_CUTOFF,      400);
+_DEFAULT_MUL_KARATSUBA_CUTOFF :: #config(MATH_BIG_MUL_KARATSUBA_CUTOFF,  80);
+_DEFAULT_SQR_KARATSUBA_CUTOFF :: #config(MATH_BIG_SQR_KARATSUBA_CUTOFF, 120);
+_DEFAULT_MUL_TOOM_CUTOFF      :: #config(MATH_BIG_MUL_TOOM_CUTOFF,      350);
+_DEFAULT_SQR_TOOM_CUTOFF      :: #config(MATH_BIG_SQR_TOOM_CUTOFF,      400);
 
 
 MAX_ITERATIONS_ROOT_N := 500;
@@ -85,15 +84,22 @@ FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS := 100;
 
 	2) Optimizations thanks to precomputed masks wouldn't work.
 */
-MATH_BIG_FORCE_64_BIT :: #config(MATH_BIG_FORCE_64_BIT, false);
-MATH_BIG_FORCE_32_BIT :: #config(MATH_BIG_FORCE_32_BIT, false);
+MATH_BIG_FORCE_64_BIT   :: #config(MATH_BIG_FORCE_64_BIT, false);
+MATH_BIG_FORCE_32_BIT   :: #config(MATH_BIG_FORCE_32_BIT, false);
 when (MATH_BIG_FORCE_32_BIT && MATH_BIG_FORCE_64_BIT) { #panic("Cannot force 32-bit and 64-bit big backend simultaneously."); };
 
-_LOW_MEMORY           :: #config(BIGINT_SMALL_MEMORY, false);
+/*
+	Trade a smaller memory footprint for more processing overhead?
+*/
+_LOW_MEMORY             :: #config(MATH_BIG_SMALL_MEMORY, false);
 when _LOW_MEMORY {
-	_DEFAULT_DIGIT_COUNT :: 8;
+	_DEFAULT_DIGIT_COUNT ::   8;
+	_TAB_SIZE            ::  32;
+	_MAX_WIN_SIZE        ::   5;
 } else {
-	_DEFAULT_DIGIT_COUNT :: 32;
+	_DEFAULT_DIGIT_COUNT ::  32;
+	_TAB_SIZE            :: 256;
+	_MAX_WIN_SIZE        ::   0;
 }
 
 /*
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 4542e9e15..4da2ebbe9 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -9,6 +7,8 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
+
 
 import "core:fmt"
 import "core:mem"
@@ -18,11 +18,14 @@ print_configation :: proc() {
 `
 Configuration:
 	_DIGIT_BITS                           %v
+	_SMALL_MEMORY                         %v
 	_MIN_DIGIT_COUNT                      %v
 	_MAX_DIGIT_COUNT                      %v
 	_DEFAULT_DIGIT_COUNT                  %v
 	_MAX_COMBA                            %v
 	_WARRAY                               %v
+	_TAB_SIZE                             %v
+	_MAX_WIN_SIZE                         %v
 Runtime tunable:
 	MUL_KARATSUBA_CUTOFF                  %v
 	SQR_KARATSUBA_CUTOFF                  %v
@@ -34,11 +37,14 @@ Runtime tunable:
 	FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS %v
 
 `, _DIGIT_BITS,
+_LOW_MEMORY,
 _MIN_DIGIT_COUNT,
 _MAX_DIGIT_COUNT,
 _DEFAULT_DIGIT_COUNT,
 _MAX_COMBA,
 _WARRAY,
+_TAB_SIZE,
+_MAX_WIN_SIZE,
 MUL_KARATSUBA_CUTOFF,
 SQR_KARATSUBA_CUTOFF,
 MUL_TOOM_CUTOFF,
@@ -73,138 +79,11 @@ print :: proc(name: string, a: ^Int, base := i8(10), print_name := true, newline
 	}
 }
 
-int_to_byte :: proc(v: ^Int) {
-	err: Error;
-	size: int;
-	print("v: ", v);
-	fmt.println();
-
-	t := &Int{};
-	defer destroy(t);
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b1 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_big(v, b1);
-	int_from_bytes_big(t, b1);
-	fmt.printf("big: %v | err: %v\n", b1, err);
-
-	int_from_bytes_big(t, b1);
-	if internal_cmp_mag(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b2 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_big_python(v, b2);
-	fmt.printf("big python: %v | err: %v\n", b2, err);
-
-	if err == nil {
-		int_from_bytes_big_python(t, b2);
-		if internal_cmp_mag(t, v) != 0 {
-			print("\tError parsing t: ", t);
-		}
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b3 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_big(v, b3, true);
-	fmt.printf("big signed: %v | err: %v\n", b3, err);
-
-	int_from_bytes_big(t, b3, true);
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b4 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_big_python(v, b4, true);
-	fmt.printf("big signed python: %v | err: %v\n", b4, err);
-
-	int_from_bytes_big_python(t, b4, true);
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-}
-
-int_to_byte_little :: proc(v: ^Int) {
-	err: Error;
-	size: int;
-	print("v: ", v);
-	fmt.println();
-
-	t := &Int{};
-	defer destroy(t);
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b1 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_little(v, b1);
-	fmt.printf("little: %v | err: %v\n", b1, err);
-
-	int_from_bytes_little(t, b1);
-	if internal_cmp_mag(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b2 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_little_python(v, b2);
-	fmt.printf("little python: %v | err: %v\n", b2, err);
-
-	if err == nil {
-		int_from_bytes_little_python(t, b2);
-		if internal_cmp_mag(t, v) != 0 {
-			print("\tError parsing t: ", t);
-		}
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b3 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_little(v, b3, true);
-	fmt.printf("little signed: %v | err: %v\n", b3, err);
-
-	int_from_bytes_little(t, b3, true);
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b4 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_little_python(v, b4, true);
-	fmt.printf("little signed python: %v | err: %v\n", b4, err);
-
-	int_from_bytes_little_python(t, b4, true);
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-}
+// printf :: fmt.printf;
 
 demo :: proc() {
-	a, b, c, d, e, f := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
-	defer destroy(a, b, c, d, e, f);
+	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
+	defer destroy(a, b, c, d, e, f, res);
 }
 
 main :: proc() {
diff --git a/core/math/big/helpers.odin b/core/math/big/helpers.odin
index 8ce1b2811..ff654172c 100644
--- a/core/math/big/helpers.odin
+++ b/core/math/big/helpers.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -8,6 +6,7 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
 
 import "core:intrinsics"
 import rnd "core:math/rand"
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 9422067ae..72ff1fe76 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -31,6 +29,7 @@ package math_big
 
 	TODO: Handle +/- Infinity and NaN.
 */
+package math_big
 
 import "core:mem"
 import "core:intrinsics"
@@ -137,7 +136,7 @@ internal_int_add_signed :: proc(dest, a, b: ^Int, allocator := context.allocator
 		Subtract the one with the greater magnitude from the other.
 		The result gets the sign of the one with the greater magnitude.
 	*/
-	if #force_inline internal_cmp_mag(a, b) == -1 {
+	if #force_inline internal_lt_abs(a, b) {
 		x, y = y, x;
 	}
 
@@ -359,7 +358,7 @@ internal_int_sub_signed :: proc(dest, number, decrease: ^Int, allocator := conte
 		Subtract a positive from a positive, OR negative from a negative.
 		First, take the difference between their magnitudes, then...
 	*/
-	if #force_inline internal_cmp_mag(number, decrease) == -1 {
+	if #force_inline internal_lt_abs(number, decrease) {
 		/*
 			The second has a larger magnitude.
 			The result has the *opposite* sign from the first number.
@@ -719,7 +718,7 @@ internal_int_divmod :: proc(quotient, remainder, numerator, denominator: ^Int, a
 	/*
 		If numerator < denominator then quotient = 0, remainder = numerator.
 	*/
-	if #force_inline internal_cmp_mag(numerator, denominator) == -1 {
+	if #force_inline internal_lt_abs(numerator, denominator) {
 		if remainder != nil {
 			internal_copy(remainder, numerator) or_return;
 		}
@@ -732,7 +731,6 @@ internal_int_divmod :: proc(quotient, remainder, numerator, denominator: ^Int, a
 	if (denominator.used > 2 * MUL_KARATSUBA_CUTOFF) && (denominator.used <= (numerator.used / 3) * 2) {
 		assert(denominator.used >= 160 && numerator.used >= 240, "MUL_KARATSUBA_CUTOFF global not properly set.");
 		err = _private_int_div_recursive(quotient, remainder, numerator, denominator);
-		// err = #force_inline _private_int_div_school(quotient, remainder, numerator, denominator);
 	} else {
 		when true {
 			err = #force_inline _private_int_div_school(quotient, remainder, numerator, denominator);
@@ -993,12 +991,20 @@ internal_int_mod_bits :: proc(remainder, numerator: ^Int, bits: int, allocator :
 */
 
 /*
+	This procedure returns the allocated capacity of an Int.
+	Assumes `a` not to be `nil`.
+*/
+internal_int_allocated_cap :: #force_inline proc(a: ^Int) -> (cap: int) {
+	raw := transmute(mem.Raw_Dynamic_Array)a.digit;
+	return raw.cap;
+}
+
+/*
 	This procedure will return `true` if the `Int` is initialized, `false` if not.
 	Assumes `a` not to be `nil`.
 */
 internal_int_is_initialized :: #force_inline proc(a: ^Int) -> (initialized: bool) {
-	raw := transmute(mem.Raw_Dynamic_Array)a.digit;
-	return raw.cap >= _MIN_DIGIT_COUNT;
+	return internal_int_allocated_cap(a) >= _MIN_DIGIT_COUNT;
 }
 internal_is_initialized :: proc { internal_int_is_initialized, };
 
@@ -1091,6 +1097,7 @@ internal_is_power_of_two :: proc { internal_int_is_power_of_two, };
 	Expects `a` and `b` both to be valid `Int`s, i.e. initialized and not `nil`.
 */
 internal_int_compare :: #force_inline proc(a, b: ^Int) -> (comparison: int) {
+	assert_if_nil(a, b);
 	a_is_negative := #force_inline internal_is_negative(a);
 
 	/*
@@ -1114,6 +1121,7 @@ internal_cmp :: internal_compare;
 	Expects: `a` and `b` both to be valid `Int`s, i.e. initialized and not `nil`.
 */
 internal_int_compare_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (comparison: int) {
+	assert_if_nil(a);
 	a_is_negative := #force_inline internal_is_negative(a);
 
 	switch {
@@ -1145,6 +1153,7 @@ internal_cmp_digit :: internal_compare_digit;
 	Compare the magnitude of two `Int`s, unsigned.
 */
 internal_int_compare_magnitude :: #force_inline proc(a, b: ^Int) -> (comparison: int) {
+	assert_if_nil(a, b);
 	/*
 		Compare based on used digits.
 	*/
@@ -1172,6 +1181,177 @@ internal_int_compare_magnitude :: #force_inline proc(a, b: ^Int) -> (comparison:
 internal_compare_magnitude :: proc { internal_int_compare_magnitude, };
 internal_cmp_mag :: internal_compare_magnitude;
 
+
+/*
+	bool := a < b
+*/
+internal_int_less_than :: #force_inline proc(a, b: ^Int) -> (less_than: bool) {
+	return internal_cmp(a, b) == -1;
+}
+
+/*
+	bool := a < b
+*/
+internal_int_less_than_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (less_than: bool) {
+	return internal_cmp_digit(a, b) == -1;
+}
+
+/*
+	bool := |a| < |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_less_than_abs :: #force_inline proc(a, b: ^Int) -> (less_than: bool) {
+	return internal_cmp_mag(a, b) == -1;
+}
+
+internal_less_than :: proc {
+	internal_int_less_than,
+	internal_int_less_than_digit,
+};
+internal_lt :: internal_less_than;
+
+internal_less_than_abs :: proc {
+	internal_int_less_than_abs,
+};
+internal_lt_abs :: internal_less_than_abs;
+
+
+/*
+	bool := a <= b
+*/
+internal_int_less_than_or_equal :: #force_inline proc(a, b: ^Int) -> (less_than_or_equal: bool) {
+	return internal_cmp(a, b) <= 0;
+}
+
+/*
+	bool := a <= b
+*/
+internal_int_less_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (less_than_or_equal: bool) {
+	return internal_cmp_digit(a, b) <= 0;
+}
+
+/*
+	bool := |a| <= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_less_than_or_equal_abs :: #force_inline proc(a, b: ^Int) -> (less_than_or_equal: bool) {
+	return internal_cmp_mag(a, b) <= 0;
+}
+
+internal_less_than_or_equal :: proc {
+	internal_int_less_than_or_equal,
+	internal_int_less_than_or_equal_digit,
+};
+internal_lte :: internal_less_than_or_equal;
+
+internal_less_than_or_equal_abs :: proc {
+	internal_int_less_than_or_equal_abs,
+};
+internal_lte_abs :: internal_less_than_or_equal_abs;
+
+
+/*
+	bool := a == b
+*/
+internal_int_equals :: #force_inline proc(a, b: ^Int) -> (equals: bool) {
+	return internal_cmp(a, b) == 0;
+}
+
+/*
+	bool := a == b
+*/
+internal_int_equals_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (equals: bool) {
+	return internal_cmp_digit(a, b) == 0;
+}
+
+/*
+	bool := |a| == |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_equals_abs :: #force_inline proc(a, b: ^Int) -> (equals: bool) {
+	return internal_cmp_mag(a, b) == 0;
+}
+
+internal_equals :: proc {
+	internal_int_equals,
+	internal_int_equals_digit,
+};
+internal_eq :: internal_equals;
+
+internal_equals_abs :: proc {
+	internal_int_equals_abs,
+};
+internal_eq_abs :: internal_equals_abs;
+
+
+/*
+	bool := a >= b
+*/
+internal_int_greater_than_or_equal :: #force_inline proc(a, b: ^Int) -> (greater_than_or_equal: bool) {
+	return internal_cmp(a, b) >= 0;
+}
+
+/*
+	bool := a >= b
+*/
+internal_int_greater_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (greater_than_or_equal: bool) {
+	return internal_cmp_digit(a, b) >= 0;
+}
+
+/*
+	bool := |a| >= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_greater_than_or_equal_abs :: #force_inline proc(a, b: ^Int) -> (greater_than_or_equal: bool) {
+	return internal_cmp_mag(a, b) >= 0;
+}
+
+internal_greater_than_or_equal :: proc {
+	internal_int_greater_than_or_equal,
+	internal_int_greater_than_or_equal_digit,
+};
+internal_gte :: internal_greater_than_or_equal;
+
+internal_greater_than_or_equal_abs :: proc {
+	internal_int_greater_than_or_equal_abs,
+};
+internal_gte_abs :: internal_greater_than_or_equal_abs;
+
+
+/*
+	bool := a > b
+*/
+internal_int_greater_than :: #force_inline proc(a, b: ^Int) -> (greater_than: bool) {
+	return internal_cmp(a, b) == 1;
+}
+
+/*
+	bool := a > b
+*/
+internal_int_greater_than_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (greater_than: bool) {
+	return internal_cmp_digit(a, b) == 1;
+}
+
+/*
+	bool := |a| > |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_greater_than_abs :: #force_inline proc(a, b: ^Int) -> (greater_than: bool) {
+	return internal_cmp_mag(a, b) == 1;
+}
+
+internal_greater_than :: proc {
+	internal_int_greater_than,
+	internal_int_greater_than_digit,
+};
+internal_gt :: internal_greater_than;
+
+internal_greater_than_abs :: proc {
+	internal_int_greater_than_abs,
+};
+internal_gt_abs :: internal_greater_than_abs;
+
+
 /*
 	Check if remainders are possible squares - fast exclude non-squares.
 
@@ -1229,7 +1409,7 @@ internal_int_is_square :: proc(a: ^Int, allocator := context.allocator) -> (squa
 	sqrt(t, a) or_return;
 	sqr(t, t)  or_return;
 
-	square = internal_cmp_mag(t, a) == 0;
+	square = internal_eq_abs(t, a);
 
 	return;
 }
@@ -1461,7 +1641,7 @@ internal_int_sqrt :: proc(dest, src: ^Int, allocator := context.allocator) -> (e
 		internal_add(t2, t1, x)  or_return;
 		internal_shr(y, t2, 1)   or_return;
 
-		if c := internal_cmp(y, x); c == 0 || c == 1 {
+		if internal_gte(y, x) {
 			internal_swap(dest, x);
 			return nil;
 		}
@@ -1576,8 +1756,8 @@ internal_int_root_n :: proc(dest, src: ^Int, n: int, allocator := context.alloca
 			 Number of rounds is at most log_2(root). If it is more it
 			 got stuck, so break out of the loop and do the rest manually.
 		*/
-		if ilog2 -= 1;    ilog2 == 0 { break; }
-		if internal_cmp(t1, t2) == 0 { break; }
+		if ilog2 -= 1; ilog2 == 0 { break; }
+		if internal_eq(t1, t2)    { break; }
 
 		iterations += 1;
 		if iterations == MAX_ITERATIONS_ROOT_N {
@@ -1615,7 +1795,7 @@ internal_int_root_n :: proc(dest, src: ^Int, n: int, allocator := context.alloca
 	for {
 		internal_pow(t2, t1, n) or_return;
 	
-		if internal_cmp(t2, a) != 1 { break; }
+		if internal_lt(t2, a) { break; }
 		
 		internal_sub(t1, t1, DIGIT(1)) or_return;
 
@@ -1651,8 +1831,7 @@ internal_int_destroy :: proc(integers: ..^Int) {
 	integers := integers;
 
 	for a in &integers {
-		raw := transmute(mem.Raw_Dynamic_Array)a.digit;
-		if raw.cap > 0 {
+		if internal_int_allocated_cap(a) > 0 {
 			mem.zero_slice(a.digit[:]);
 			free(&a.digit[0]);
 		}
@@ -1821,12 +2000,12 @@ internal_int_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.alloc
 	/*
 		For all n in N and n > 0, n = 0 mod 1.
 	*/
-	if internal_is_positive(a) && internal_cmp(b, 1) == 0 { return internal_zero(dest);	}
+	if internal_is_positive(a) && internal_eq(b, 1) { return internal_zero(dest);	}
 
 	/*
 		`b` cannot be negative and has to be > 1
 	*/
-	if internal_is_negative(b) && internal_cmp(b, 1) != 1 { return .Invalid_Argument; }
+	if internal_is_negative(b) || internal_gt(b, 1) { return .Invalid_Argument; }
 
 	/*
 		If the modulus is odd we can use a faster routine instead.
@@ -1914,23 +2093,23 @@ internal_int_shrink :: proc(a: ^Int) -> (err: Error) {
 internal_shrink :: proc { internal_int_shrink, };
 
 internal_int_grow :: proc(a: ^Int, digits: int, allow_shrink := false, allocator := context.allocator) -> (err: Error) {
-	raw := transmute(mem.Raw_Dynamic_Array)a.digit;
-
 	/*
 		We need at least _MIN_DIGIT_COUNT or a.used digits, whichever is bigger.
 		The caller is asking for `digits`. Let's be accomodating.
 	*/
+	cap := internal_int_allocated_cap(a);
+
 	needed := max(_MIN_DIGIT_COUNT, a.used, digits);
 	if !allow_shrink {
-		needed = max(needed, raw.cap);
+		needed = max(needed, cap);
 	}
 
 	/*
 		If not yet iniialized, initialize the `digit` backing with the allocator we were passed.
 	*/
-	if raw.cap == 0 {
+	if cap == 0 {
 		a.digit = make([dynamic]DIGIT, needed, allocator);
-	} else if raw.cap != needed {
+	} else if cap != needed {
 		/*
 			`[dynamic]DIGIT` already knows what allocator was used for it, so resize will do the right thing.
 		*/
diff --git a/core/math/big/logical.odin b/core/math/big/logical.odin
index 64f3b0898..1e7f8e1b1 100644
--- a/core/math/big/logical.odin
+++ b/core/math/big/logical.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -10,6 +8,7 @@ package math_big
 
 	This file contains logical operations like `and`, `or` and `xor`.
 */
+package math_big
 
 /*
 	The `and`, `or` and `xor` binops differ in two lines only.
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index f35e02807..d6626ffbf 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -10,12 +8,13 @@ package math_big
 
 	This file contains prime finding operations.
 */
+package math_big
 
 /*
 	Determines if an Integer is divisible by one of the _PRIME_TABLE primes.
 	Returns true if it is, false if not. 
 */
-int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator) -> (res: bool, err: Error) {
+internal_int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator) -> (res: bool, err: Error) {
 	assert_if_nil(a);
 	context.allocator = allocator;
 
@@ -34,177 +33,175 @@ int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator) -> (res:
 }
 
 /*
-	Computes xR**-1 == x (mod N) via Montgomery Reduction.
+	This is a shell function that calls either the normal or Montgomery exptmod functions.
+	Originally the call to the Montgomery code was embedded in the normal function but that
+	wasted alot of stack space for nothing (since 99% of the time the Montgomery code would be called).
+
+	Computes res == G**X mod P.
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
 */
-internal_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
+internal_int_exponent_mod :: proc(res, G, X, P: ^Int, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator;
+
+	dr: int;
+
 	/*
-		Can the fast reduction [comba] method be used?
-		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],
-		since carries are fixed up in the inner loop.
+		Modulus P must be positive.
 	*/
-	digs := (n.used * 2) + 1;
-	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
-		return _private_montgomery_reduce_comba(x, n, rho);
-	}
+	if internal_is_negative(P) { return .Invalid_Argument; }
 
 	/*
-		Grow the input as required
+		If exponent X is negative we have to recurse.
 	*/
-	internal_grow(x, digs)                                           or_return;
-	x.used = digs;
+	if internal_is_negative(X) {
+		tmpG, tmpX := &Int{}, &Int{};
+		defer internal_destroy(tmpG, tmpX);
+
+		internal_init_multi(tmpG, tmpX) or_return;
 
-	for ix := 0; ix < n.used; ix += 1 {
 		/*
-			`mu = ai * rho mod b`
-			The value of rho must be precalculated via `int_montgomery_setup()`,
-			such that it equals -1/n0 mod b this allows the following inner loop
-			to reduce the input one digit at a time.
+			First compute 1/G mod P.
 		*/
-
-		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK));
+		internal_invmod(tmpG, G, P) or_return;
 
 		/*
-			a = a + mu * m * b**i
-			Multiply and add in place.
+			now get |X|.
 		*/
-		u  := DIGIT(0);
-		iy := int(0);
-		for ; iy < n.used; iy += 1 {
-			/*
-				Compute product and sum.
-			*/
-			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]));
-
-			/*
-				Get carry.
-			*/
-			u = DIGIT(r >> _DIGIT_BITS);
-
-			/*
-				Fix digit.
-			*/
-			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK));
-		}
+		internal_abs(tmpX, X) or_return;
 
 		/*
-			At this point the ix'th digit of x should be zero.
-			Propagate carries upwards as required.
+			And now compute (1/G)**|X| instead of G**X [X < 0].
 		*/
-		for u != 0 {
-			x.digit[ix + iy] += u;
-			u = x.digit[ix + iy] >> _DIGIT_BITS;
-			x.digit[ix + iy] &= _MASK;
-			iy += 1;
-		}
+		return internal_int_exponent_mod(res, tmpG, tmpX, P);
 	}
 
 	/*
-		At this point the n.used'th least significant digits of x are all zero,
-		which means we can shift x to the right by n.used digits and the
-		residue is unchanged.
+		Modified diminished radix reduction.
+	*/
+	can_reduce_2k_l := _private_int_reduce_is_2k_l(P) or_return;
+	if can_reduce_2k_l {
+		return _private_int_exponent_mod(res, G, X, P, 1);
+	}
 
-		x = x/b**n.used.
+	/*
+		Is it a DR modulus? default to no.
 	*/
-	internal_clamp(x);
-	internal_shr_digit(x, n.used);
+	dr = 1 if _private_dr_is_modulus(P) else 0;
 
 	/*
-		if x >= n then x = x - n
+		If not, is it a unrestricted DR modulus?
 	*/
-	if internal_cmp_mag(x, n) != -1 {
-		return internal_sub(x, x, n);
+	if dr == 0 {
+		reduce_is_2k := _private_int_reduce_is_2k(P) or_return;
+		dr = 2 if reduce_is_2k else 0;
 	}
 
-	return nil;
-}
-
-int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
-	assert_if_nil(x, n);
-	context.allocator = allocator;
-
-	internal_clear_if_uninitialized(x, n) or_return;
+	/*
+		If the modulus is odd or dr != 0 use the montgomery method.
+	*/
+	if internal_int_is_odd(P) || dr != 0 {
+		return _private_int_exponent_mod(res, G, X, P, dr);
+	}
 
-	return #force_inline internal_int_montgomery_reduce(x, n, rho);
+	/*
+		Otherwise use the generic Barrett reduction technique.
+	*/
+	return _private_int_exponent_mod(res, G, X, P, 0);
 }
 
 /*
-	Shifts with subtractions when the result is greater than b.
+	Kronecker symbol (a|p)
+	Straightforward implementation of algorithm 1.4.10 in
+	Henri Cohen: "A Course in Computational Algebraic Number Theory"
+
+	@book{cohen2013course,
+		title={A course in computational algebraic number theory},
+		author={Cohen, Henri},
+		volume={138},
+		year={2013},
+		publisher={Springer Science \& Business Media}
+	}
 
-	The method is slightly modified to shift B unconditionally upto just under
-	the leading bit of b.  This saves alot of multiple precision shifting.
+	Assumes `a` and `p` to not be `nil` and to have been initialized.
 */
-internal_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+internal_int_kronecker :: proc(a, p: ^Int, allocator := context.allocator) -> (kronecker: int, err: Error) {
 	context.allocator = allocator;
-	/*
-		How many bits of last digit does b use.
-	*/
-	bits := internal_count_bits(b) % _DIGIT_BITS;
-
-	if b.used > 1 {
-		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1;
-		internal_int_power_of_two(a, power)                          or_return;
-	} else {
-		internal_one(a);
-		bits = 1;
-	}
 
-	/*
-		Now compute C = A * B mod b.
-	*/
-	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
-		internal_int_shl1(a, a)                                      or_return;
-		if internal_cmp_mag(a, b) != -1 {
-			internal_sub(a, a, b)                                    or_return;
+	a1, p1, r := &Int{}, &Int{}, &Int{};
+	defer internal_destroy(a1, p1, r);
+
+	table := []int{0, 1, 0, -1, 0, -1, 0, 1};
+
+	if internal_int_is_zero(p) {
+		if a.used == 1 && a.digit[0] == 1 {
+			return 1, nil;
+		} else {
+			return 0, nil;
 		}
 	}
-	return nil;
-}
 
-int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	assert_if_nil(a, b);
-	context.allocator = allocator;
+	if internal_is_even(a) && internal_is_even(p) {
+		return 0, nil;
+	}
 
-	internal_clear_if_uninitialized(a, b) or_return;
+	internal_copy(a1, a) or_return;
+	internal_copy(p1, p) or_return;
 
-	return #force_inline internal_int_montgomery_calc_normalization(a, b);
-}
+	v := internal_count_lsb(p1) or_return;
+	internal_shr(p1, p1, v) or_return;
 
-/*
-	Sets up the Montgomery reduction stuff.
-*/
-internal_int_montgomery_setup :: proc(n: ^Int) -> (rho: DIGIT, err: Error) {
-	/*
-		Fast inversion mod 2**k
-		Based on the fact that:
+	k := 1 if v & 1 == 0 else table[a.digit[0] & 7];
 
-		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
-		                  =>  2*X*A - X*X*A*A = 1
-		                  =>  2*(1) - (1)     = 1
-	*/
-	b := n.digit[0];
-	if b & 1 == 0 { return 0, .Invalid_Argument; }
-
-	x := (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
-	x *= 2 - (b * x);              /* here x*a==1 mod 2**8 */
-	x *= 2 - (b * x);              /* here x*a==1 mod 2**16 */
-	when _WORD_TYPE_BITS == 64 {
-		x *= 2 - (b * x);              /* here x*a==1 mod 2**32 */
-		x *= 2 - (b * x);              /* here x*a==1 mod 2**64 */
+	if internal_is_negative(p1) {
+		p1.sign = .Zero_or_Positive;
+		if internal_is_negative(a1) {
+			k = -k;
+		}
 	}
 
-	/*
-		rho = -1/m mod b
-	*/
-	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK));
-	return rho, nil;
-}
+	internal_zero(r) or_return;
 
-int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
-	assert_if_nil(n);
-	internal_clear_if_uninitialized(n, allocator) or_return;
+	for {
+		if internal_is_zero(a1) {
+			if internal_eq(p1, 1) {
+				return k, nil;
+			} else {
+				return 0, nil;
+			}
+		}
+
+		v = internal_count_lsb(a1) or_return;
+		internal_shr(a1, a1, v) or_return;
+
+		if v & 1 == 1 {
+			k = k * table[p1.digit[0] & 7];
+		}
 
-	return #force_inline internal_int_montgomery_setup(n);
+		if internal_is_negative(a1) {
+			/*
+				Compute k = (-1)^((a1)*(p1-1)/4) * k.
+				a1.digit[0] + 1 cannot overflow because the MSB
+				of the DIGIT type is not set by definition.
+			 */
+			if a1.digit[0] + 1 & p1.digit[0] & 2 != 0 {
+				k = -k;
+			}
+		} else {
+			/*
+				Compute k = (-1)^((a1-1)*(p1-1)/4) * k.
+			*/
+			if a1.digit[0] & p1.digit[0] & 2 != 0 {
+				k = -k;
+			}
+		}
+
+		internal_copy(r, a1) or_return;
+		r.sign = .Zero_or_Positive;
+
+		internal_mod(a1, p1, r) or_return;
+		internal_copy(p1, r)    or_return;
+	}
+	return;
 }
 
 /*
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index d71946ce2..fc2fe69e8 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -17,6 +15,7 @@ package math_big
 
 	These aren't exported for the same reasons.
 */
+package math_big
 
 import "core:intrinsics"
 import "core:mem"
@@ -1072,11 +1071,11 @@ _private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^In
 
 	internal_shl_digit(y, n - t) or_return;
 
-	c := internal_cmp(x, y);
-	for c != -1 {
+	gte := internal_gte(x, y);
+	for gte {
 		q.digit[n - t] += 1;
 		internal_sub(x, x, y) or_return;
-		c = internal_cmp(x, y);
+		gte = internal_gte(x, y);
 	}
 
 	/*
@@ -1135,7 +1134,7 @@ _private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^In
 			t2.digit[2] = x.digit[i];
 			t2.used = 3;
 
-			if t1_t2 := internal_cmp_mag(t1, t2); t1_t2 != 1 {
+			if internal_lte(t1, t2) {
 				break;
 			}
 			iter += 1; if iter > 100 {
@@ -1228,15 +1227,13 @@ _private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := con
 	/*
 		While A1 < 0 do Q1 = Q1 - 1, A1 = A1 + (beta^k * B)
 	*/
-	if internal_cmp(A1, 0) == -1 {
+	if internal_lt(A1, 0) {
 		internal_shl(t, b, k * _DIGIT_BITS) or_return;
 
 		for {
 			internal_decr(Q1) or_return;
 			internal_add(A1, A1, t) or_return;
-			if internal_cmp(A1, 0) != -1 {
-				break;
-			}
+			if internal_gte(A1, 0) { break; }
 		}
 	}
 
@@ -1257,7 +1254,7 @@ _private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := con
 	/*
 		While A2 < 0 do Q0 = Q0 - 1, A2 = A2 + B.
 	*/
-	for internal_cmp(A2, 0) == -1 {
+	for internal_is_negative(A2) { // internal_lt(A2, 0) {
 		internal_decr(Q0) or_return;
 		internal_add(A2, A2, b) or_return;
 	}
@@ -1392,7 +1389,7 @@ _private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int
 		shl(tq, tq, n)       or_return;
 
 		for n >= 0 {
-			if c, _ = cmp_mag(ta, tb); c == 0 || c == 1 {
+			if internal_gte(ta, tb) {
 				// ta -= tb
 				sub(ta, ta, tb) or_return;
 				//  q += tq
@@ -1438,7 +1435,7 @@ _private_int_factorial_binary_split :: proc(res: ^Int, n: int, allocator := cont
 	internal_one(inner, false)                                       or_return;
 	internal_one(outer, false)                                       or_return;
 
-	bits_used := int(_DIGIT_TYPE_BITS - intrinsics.count_leading_zeros(n));
+	bits_used := ilog2(n);
 
 	for i := bits_used; i >= 0; i -= 1 {
 		start := (n >> (uint(i) + 1)) + 1 | 1;
@@ -1597,7 +1594,7 @@ _private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.
 		/*
 			Make sure `v` is the largest.
 		*/
-		if internal_cmp_mag(u, v) == 1 {
+		if internal_gt(u, v) {
 			/*
 				Swap `u` and `v` to make sure `v` is >= `u`.
 			*/
@@ -1635,7 +1632,7 @@ _private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.
 		Computes least common multiple as `|a*b|/gcd(a,b)`
 		Divide the smallest by the GCD.
 	*/
-	if internal_cmp_mag(a, b) == -1 {
+	if internal_lt_abs(a, b) {
 		/*
 			Store quotient in `t2` such that `t2 * b` is the LCM.
 		*/
@@ -1695,9 +1692,7 @@ _private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -
 		/*
 			Iterate until `a` is bracketed between low + high.
 		*/
-		if #force_inline internal_cmp(bracket_high, a) != -1 {
-			break;
-		}
+		if #force_inline internal_gte(bracket_high, a) { break; }
 
 		low = high;
 		#force_inline internal_copy(bracket_low, bracket_high) or_return;
@@ -1731,9 +1726,6 @@ _private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -
 	return;
 }
 
-
-
-
 /*
 	Computes xR**-1 == x (mod N) via Montgomery Reduction.
 	This is an optimized implementation of `internal_montgomery_reduce`
@@ -1754,7 +1746,7 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 	/*
 		Grow `x` as required.
 	*/
-	internal_grow(x, n.used + 1) or_return;
+	internal_grow(x, n.used + 1)                                     or_return;
 
 	/*
 		First we have to get the digits of the input into an array of double precision words W[...]
@@ -1848,9 +1840,1018 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 	/*
 		if A >= m then A = A - m
 	*/
-	if internal_cmp_mag(x, n) != -1 {
+	if internal_gte_abs(x, n) {
+		return internal_sub(x, x, n);
+	}
+	return nil;
+}
+
+/*
+	Computes xR**-1 == x (mod N) via Montgomery Reduction.
+	Assumes `x` and `n` not to be nil.
+*/
+_private_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+	/*
+		Can the fast reduction [comba] method be used?
+		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],
+		since carries are fixed up in the inner loop.
+	*/
+	internal_clear_if_uninitialized(x, n) or_return;
+
+	digs := (n.used * 2) + 1;
+	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
+		return _private_montgomery_reduce_comba(x, n, rho);
+	}
+
+	/*
+		Grow the input as required
+	*/
+	internal_grow(x, digs)                                           or_return;
+	x.used = digs;
+
+	for ix := 0; ix < n.used; ix += 1 {
+		/*
+			`mu = ai * rho mod b`
+			The value of rho must be precalculated via `int_montgomery_setup()`,
+			such that it equals -1/n0 mod b this allows the following inner loop
+			to reduce the input one digit at a time.
+		*/
+
+		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK));
+
+		/*
+			a = a + mu * m * b**i
+			Multiply and add in place.
+		*/
+		u  := DIGIT(0);
+		iy := int(0);
+		for ; iy < n.used; iy += 1 {
+			/*
+				Compute product and sum.
+			*/
+			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]));
+
+			/*
+				Get carry.
+			*/
+			u = DIGIT(r >> _DIGIT_BITS);
+
+			/*
+				Fix digit.
+			*/
+			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK));
+		}
+
+		/*
+			At this point the ix'th digit of x should be zero.
+			Propagate carries upwards as required.
+		*/
+		for u != 0 {
+			x.digit[ix + iy] += u;
+			u = x.digit[ix + iy] >> _DIGIT_BITS;
+			x.digit[ix + iy] &= _MASK;
+			iy += 1;
+		}
+	}
+
+	/*
+		At this point the n.used'th least significant digits of x are all zero,
+		which means we can shift x to the right by n.used digits and the
+		residue is unchanged.
+
+		x = x/b**n.used.
+	*/
+	internal_clamp(x);
+	internal_shr_digit(x, n.used);
+
+	/*
+		if x >= n then x = x - n
+	*/
+	if internal_gte_abs(x, n) {
 		return internal_sub(x, x, n);
 	}
+
+	return nil;
+}
+
+/*
+	Shifts with subtractions when the result is greater than b.
+
+	The method is slightly modified to shift B unconditionally upto just under
+	the leading bit of b.  This saves alot of multiple precision shifting.
+
+	Assumes `a` and `b` not to be `nil`.
+*/
+_private_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+	/*
+		How many bits of last digit does b use.
+	*/
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	bits := internal_count_bits(b) % _DIGIT_BITS;
+
+	if b.used > 1 {
+		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1;
+		internal_int_power_of_two(a, power)                          or_return;
+	} else {
+		internal_one(a)                                              or_return;
+		bits = 1;
+	}
+
+	/*
+		Now compute C = A * B mod b.
+	*/
+	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
+		internal_int_shl1(a, a)                                      or_return;
+		if internal_gte_abs(a, b) {
+			internal_sub(a, a, b)                                    or_return;
+		}
+	}
+	return nil;
+}
+
+/*
+	Sets up the Montgomery reduction stuff.
+*/
+_private_int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
+	/*
+		Fast inversion mod 2**k
+		Based on the fact that:
+
+		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
+		                  =>  2*X*A - X*X*A*A = 1
+		                  =>  2*(1) - (1)     = 1
+	*/
+	internal_clear_if_uninitialized(n, allocator) or_return;
+
+	b := n.digit[0];
+	if b & 1 == 0 { return 0, .Invalid_Argument; }
+
+	x := (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
+	x *= 2 - (b * x);              /* here x*a==1 mod 2**8 */
+	x *= 2 - (b * x);              /* here x*a==1 mod 2**16 */
+
+	when _DIGIT_TYPE_BITS == 64 {
+		x *= 2 - (b * x);              /* here x*a==1 mod 2**32 */
+		x *= 2 - (b * x);              /* here x*a==1 mod 2**64 */
+	}
+
+	/*
+		rho = -1/m mod b
+	*/
+	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK));
+	return rho, nil;
+}
+
+/*
+	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
+	From HAC pp.604 Algorithm 14.42
+
+	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
+*/
+_private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+	um := m.used;
+
+	/*
+		q = x
+	*/
+	internal_copy(q, x)                                              or_return;
+
+	/*
+		q1 = x / b**(k-1)
+	*/
+	internal_shr_digit(q, um - 1);
+
+	/*
+		According to HAC this optimization is ok.
+	*/
+	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
+		internal_mul(q, q, mu)                                       or_return;
+	} else {
+		_private_int_mul_high(q, q, mu, um)                          or_return;
+	}
+
+	/*
+		q3 = q2 / b**(k+1)
+	*/
+	internal_shr_digit(q, um + 1);
+
+	/*
+		x = x mod b**(k+1), quick (no division)
+	*/
+	internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1))              or_return;
+
+	/*
+		q = q * m mod b**(k+1), quick (no division)
+	*/
+	_private_int_mul(q, q, m, um + 1)                                or_return;
+
+	/*
+		x = x - q
+	*/
+	internal_sub(x, x, q)                                            or_return;
+
+	/*
+		If x < 0, add b**(k+1) to it.
+	*/
+	if internal_is_negative(x) {
+		internal_set(q, 1)                                           or_return;
+		internal_shl_digit(q, um + 1)                                or_return;
+		internal_add(x, x, q)                                        or_return;
+	}
+
+	/*
+		Back off if it's too big.
+	*/
+	for internal_gte(x, m) {
+		internal_sub(x, x, m)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
+*/
+_private_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+
+	internal_zero(q)                                                 or_return;
+
+	p := internal_count_bits(n);
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return;
+
+		if d != 1 {
+			/*
+				q = q * d
+			*/
+			internal_mul(q, q, d)                                    or_return;
+		}
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return;
+		if internal_lt_abs(a, n)                                     { break; }
+		internal_sub(a, a, n)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Reduces `a` modulo `n` where `n` is of the form 2**p - d
+	This differs from reduce_2k since "d" can be larger than a single digit.
+*/
+_private_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+
+	internal_zero(q)                                                 or_return;
+
+	p := internal_count_bits(n);
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return;
+
+		/*
+			q = q * d
+		*/
+		internal_mul(q, q, d)                                        or_return;
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return;
+		if internal_lt_abs(a, n)                                     { break; }
+		internal_sub(a, a, n)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Determines if `internal_int_reduce_2k` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+_private_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a);
+
+	if internal_is_zero(a) {
+		return false, nil;
+	} else if a.used == 1 {
+		return true, nil;
+	} else if a.used  > 1 {
+		iy := internal_count_bits(a);
+		iw := 1;
+		iz := DIGIT(1);
+
+		/*
+			Test every bit from the second digit up, must be 1.
+		*/
+		for ix := _DIGIT_BITS; ix < iy; ix += 1 {
+			if a.digit[iw] & iz == 0 {
+				return false, nil;
+			}
+
+			iz <<= 1;
+			if iz > _DIGIT_MAX {
+				iw += 1;
+				iz  = 1;
+			}
+		}
+		return true, nil;
+	} else {
+		return true, nil;
+	}
+}
+
+/*
+	Determines if `internal_int_reduce_2k_l` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+_private_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a);
+
+	if internal_int_is_zero(a) {
+		return false, nil;
+	} else if a.used == 1 {
+		return true, nil;
+	} else if a.used  > 1 {
+		/*
+			If more than half of the digits are -1 we're sold.
+		*/
+		ix := 0;
+		iy := 0;
+
+		for ; ix < a.used; ix += 1 {
+			if a.digit[ix] == _DIGIT_MAX {
+				iy += 1;
+			}
+		}
+		return iy >= (a.used / 2), nil;
+	} else {
+		return false, nil;
+	}
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` is not `nil`.
+*/
+_private_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
+	context.allocator = allocator;
+
+	tmp := &Int{};
+	defer internal_destroy(tmp);
+	internal_zero(tmp)                                               or_return;
+
+	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return;
+	internal_sub(tmp, tmp, a)                                        or_return;
+
+	return tmp.digit[0], nil;
+}
+
+/*
+	Determines the setup value.
+	Assumes `mu` and `P` are not `nil`.
+
+	d := (1 << a.bits) - a;
+*/
+_private_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	tmp := &Int{};
+	defer internal_destroy(tmp);
+	internal_zero(tmp)                                               or_return;
+
+	internal_int_power_of_two(tmp, internal_count_bits(P))           or_return;
+	internal_sub(mu, tmp, P)                                         or_return;
+
+	return nil;
+}
+
+/*
+	Pre-calculate the value required for Barrett reduction.
+	For a given modulus "P" it calulates the value required in "mu"
+	Assumes `mu` and `P` are not `nil`.
+*/
+_private_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	internal_int_power_of_two(mu, P.used * 2 * _DIGIT_BITS)           or_return;
+	return internal_int_div(mu, mu, P);
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+_private_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
+	/*
+		The casts are required if _DIGIT_BITS is one less than
+		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
+	*/
+	return DIGIT((1 << _DIGIT_BITS) - a.digit[0]);
+}
+
+/*
+	Determines if a number is a valid DR modulus.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+_private_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
+	/*
+		Must be at least two digits.
+	*/
+	if a.used < 2 { return false; }
+
+	/*
+		Must be of the form b**k - a [a <= b] so all but the first digit must be equal to -1 (mod b).
+	*/
+	for ix := 1; ix < a.used; ix += 1 {
+		if a.digit[ix] != _MASK {
+			return false;
+		}
+	}
+	return true;
+}
+
+/*
+	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+	Based on algorithm from the paper
+
+		"Generating Efficient Primes for Discrete Log Cryptosystems"
+					Chae Hoon Lim, Pil Joong Lee,
+			POSTECH Information Research Laboratories
+
+	The modulus must be of a special format [see manual].
+	Has been modified to use algorithm 7.10 from the LTM book instead
+
+	Input x must be in the range 0 <= x <= (n-1)**2
+	Assumes `x` and `n` to not be `nil` and to have been initialized.
+*/
+_private_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
+	/*
+		m = digits in modulus.
+	*/
+	m := n.used;
+
+	/*
+		Ensure that "x" has at least 2m digits.
+	*/
+	internal_grow(x, m + m)                                          or_return;
+
+	/*
+		Top of loop, this is where the code resumes if another reduction pass is required.
+	*/
+	for {
+		i: int;
+		mu := DIGIT(0);
+
+		/*
+			Compute (x mod B**m) + k * [x/B**m] inline and inplace.
+		*/
+		for i = 0; i < m; i += 1 {
+			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu);
+			x.digit[i] = DIGIT(r & _WORD(_MASK));
+			mu         = DIGIT(r >> _WORD(_DIGIT_BITS));
+		}
+
+		/*
+			Set final carry.
+		*/
+		x.digit[i] = mu;
+
+		/*
+			Zero words above m.
+		*/
+		mem.zero_slice(x.digit[m + 1:][:x.used - m]);
+
+		/*
+			Clamp, sub and return.
+		*/
+		internal_clamp(x)                                            or_return;
+
+		/*
+			If x >= n then subtract and reduce again.
+			Each successive "recursion" makes the input smaller and smaller.
+		*/
+		if internal_lt_abs(x, n) { break; }
+
+		internal_sub(x, x, n)                                        or_return;
+	}
+	return nil;
+}
+
+/*
+	Computes res == G**X mod P.
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+_private_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	M := [_TAB_SIZE]Int{};
+	winsize: uint;
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error);
+
+	defer {
+		internal_destroy(&M[1]);
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x]);
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X);
+	switch {
+	case x <= 7:
+		winsize = 2;
+	case x <= 36:
+		winsize = 3;
+	case x <= 140:
+		winsize = 4;
+	case x <= 450:
+		winsize = 5;
+	case x <= 1303:
+		winsize = 6;
+	case x <= 3529:
+		winsize = 7;
+	case:
+		winsize = 8;
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
+
+	/*
+		Init M array.
+		Init first cell.
+	*/
+	internal_zero(&M[1])                                             or_return;
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_zero(&M[x])                                         or_return;
+	}
+
+	/*
+		Create `mu`, used for Barrett reduction.
+	*/
+	mu := &Int{};
+	defer internal_destroy(mu);
+	internal_zero(mu)                                                or_return;
+
+	if redmode == 0 {
+		_private_int_reduce_setup(mu, P)                             or_return;
+		redux = _private_int_reduce;
+	} else {
+		_private_int_reduce_2k_setup_l(mu, P)                        or_return;
+		redux = _private_int_reduce_2k_l;
+	}
+
+	/*
+		Create M table.
+
+		The M table contains powers of the base, e.g. M[x] = G**x mod P.
+		The first half of the table is not computed, though, except for M[0] and M[1].
+	*/
+	internal_int_mod(&M[1], G, P)                                    or_return;
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+
+		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
+		instead of repeated squaring.
+	*/
+	slot := 1 << (winsize - 1);
+	internal_copy(&M[slot], &M[1])                                   or_return;
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		/*
+			Square it.
+		*/
+		internal_sqr(&M[slot], &M[slot])                             or_return;
+
+		/*
+			Reduce modulo P
+		*/
+		redux(&M[slot], P, mu)                                       or_return;
+	}
+
+	/*
+		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+	*/
+	for x = slot + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
+		redux(&M[x], P, mu)                                          or_return;
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_one(res)                                                or_return;
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0;
+	bitcnt := 1;
+	buf    := DIGIT(0);
+	digidx := X.used - 1;
+	bitcpy := uint(0);
+	bitbuf := DIGIT(0);
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1;
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits.
+			*/
+			if digidx == -1 { break; }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx];
+			digidx -= 1;
+			bitcnt = _DIGIT_BITS;
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := buf >> (_DIGIT_BITS - 1) & 1;
+		buf <<= 1;
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit
+			in the exponent.  Technically this opt is not required but it
+			does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 {
+			continue;
+		}
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, mu)                                        or_return;
+			continue;
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1;
+		bitbuf |= (y << (winsize - bitcpy));
+		mode    = 2;
+
+		if (bitcpy == winsize) {
+			/*
+				Window is filled so square as required and multiply.
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return;
+				redux(res, P, mu)                                    or_return;
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return;
+			redux(res, P, mu)                                        or_return;
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0;
+			bitbuf = 0;
+			mode   = 1;
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < int(bitcpy); x += 1 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, mu)                                        or_return;
+
+			bitbuf <<= 1;
+			if ((bitbuf & (1 << winsize)) != 0) {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return;
+				redux(res, P, mu)                                    or_return;
+			}
+		}
+	}
+	return err;
+}
+
+/*
+	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+
+	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
+	The value of `k` changes based on the size of the exponent.
+
+	Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+_private_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	M := [_TAB_SIZE]Int{};
+	winsize: uint;
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error);
+
+	defer {
+		internal_destroy(&M[1]);
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x]);
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X);
+	switch {
+	case x <= 7:
+		winsize = 2;
+	case x <= 36:
+		winsize = 3;
+	case x <= 140:
+		winsize = 4;
+	case x <= 450:
+		winsize = 5;
+	case x <= 1303:
+		winsize = 6;
+	case x <= 3529:
+		winsize = 7;
+	case:
+		winsize = 8;
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
+
+	/*
+		Init M array
+		Init first cell.
+	*/
+	cap := internal_int_allocated_cap(P);
+	internal_grow(&M[1], cap)                                        or_return;
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_grow(&M[x], cap)                                    or_return;
+	}
+
+	/*
+		Determine and setup reduction code.
+	*/
+	rho: DIGIT;
+
+	if redmode == 0 {
+		/*
+			Now setup Montgomery.
+		*/
+		rho = _private_int_montgomery_setup(P)                       or_return;
+
+		/*
+			Automatically pick the comba one if available (saves quite a few calls/ifs).
+		*/
+		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
+			redux = _private_montgomery_reduce_comba;
+		} else {
+			/*
+				Use slower baseline Montgomery method.
+			*/
+			redux = _private_int_montgomery_reduce;
+		}
+	} else if redmode == 1 {
+		/*
+			Setup DR reduction for moduli of the form B**k - b.
+		*/
+		rho = _private_int_dr_setup(P);
+		redux = _private_int_dr_reduce;
+	} else {
+		/*
+			Setup DR reduction for moduli of the form 2**k - b.
+		*/
+		rho = _private_int_reduce_2k_setup(P)                        or_return;
+		redux = _private_int_reduce_2k;
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_grow(res, cap)                                          or_return;
+
+	/*
+		Create M table
+		The first half of the table is not computed, though, except for M[0] and M[1]
+	*/
+
+	if redmode == 0 {
+		/*
+			Now we need R mod m.
+		*/
+		_private_int_montgomery_calc_normalization(res, P)           or_return;
+
+		/*
+			Now set M[1] to G * R mod m.
+		*/
+		internal_mulmod(&M[1], G, res, P)                            or_return;
+	} else {
+		internal_one(res)                                            or_return;
+		internal_mod(&M[1], G, P)                                    or_return;
+	}
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+	*/
+	slot := 1 << (winsize - 1);
+	internal_copy(&M[slot], &M[1])                                   or_return;
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		internal_sqr(&M[slot], &M[slot])                             or_return;
+		redux(&M[slot], P, rho)                                      or_return;
+	}
+
+	/*
+		Create upper table.
+	*/
+	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
+		redux(&M[x], P, rho)                                         or_return;
+	}
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0;
+	bitcnt := 1;
+	buf    := DIGIT(0);
+	digidx := X.used - 1;
+	bitcpy := 0;
+	bitbuf := DIGIT(0);
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1;
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits so break.
+			*/
+			if digidx == -1 { break; }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx];
+			digidx -= 1;
+			bitcnt = _DIGIT_BITS;
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := (buf >> (_DIGIT_BITS - 1)) & 1;
+		buf <<= 1;
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit in the exponent.
+			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 { continue; }
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, rho)                                       or_return;
+			continue;
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1;
+		bitbuf |= (y << (winsize - uint(bitcpy)));
+		mode    = 2;
+
+		if bitcpy == int(winsize) {
+			/*
+				Window is filled so square as required and multiply
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return;
+				redux(res, P, rho)                                   or_return;
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return;
+			redux(res, P, rho)                                       or_return;
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0;
+			bitbuf = 0;
+			mode   = 1;
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < bitcpy; x += 1 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, rho)                                       or_return;
+
+			/*
+				Get next bit of the window.
+			*/
+			bitbuf <<= 1;
+			if bitbuf & (1 << winsize) != 0 {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return;
+				redux(res, P, rho)                                   or_return;
+			}
+		}
+	}
+
+	if redmode == 0 {
+		/*
+			Fixup result if Montgomery reduction is used.
+			Recall that any value in a Montgomery system is actually multiplied by R mod n.
+			So we have to reduce one more time to cancel out the factor of R.
+		*/
+		redux(res, P, rho)                                           or_return;
+	}
+
 	return nil;
 }
 
@@ -1980,21 +2981,21 @@ _private_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.allocator
 	/*
 		If `v` != `1` then there is no inverse.
 	*/
-	if internal_cmp(v, 1) != 0 {
+	if !internal_eq(v, 1) {
 		return .Invalid_Argument;
 	}
 
 	/*
 		If its too low.
 	*/
-	if internal_cmp(C, 0) == -1 {
+	if internal_is_negative(C) {
 		internal_add(C, C, b) or_return;
 	}
 
 	/*
 		Too big.
 	*/
-	if internal_cmp(C, 0) != -1 {
+	if internal_gte(C, 0) {
 		internal_sub(C, C, b) or_return;
 	}
 
@@ -2147,7 +3148,7 @@ _private_inverse_modulo_odd :: proc(dest, a, b: ^Int, allocator := context.alloc
 	/*
 		Too big.
 	*/
-	for internal_cmp_mag(D, b) != -1 {
+	for internal_gte_abs(D, b) {
 		internal_sub(D, D, b) or_return;
 	}
 
diff --git a/core/math/big/public.odin b/core/math/big/public.odin
index 542725289..e8ff1e28b 100644
--- a/core/math/big/public.odin
+++ b/core/math/big/public.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -10,6 +8,9 @@ package math_big
 
 	This file contains basic arithmetic operations like `add`, `sub`, `mul`, `div`, ...
 */
+package math_big
+
+import "core:intrinsics"
 
 /*
 	===========================
@@ -384,6 +385,10 @@ digit_log :: proc(a: DIGIT, base: DIGIT) -> (log: int, err: Error) {
 }
 log :: proc { int_log, digit_log, };
 
+ilog2 :: proc(value: $T) -> (log2: T) {
+	return (size_of(T) * 8) - intrinsics.count_leading_zeros(value);
+}
+
 /*
 	Calculate `dest = base^power` using a square-multiply algorithm.
 */
@@ -556,6 +561,298 @@ int_compare_magnitude :: proc(a, b: ^Int, allocator := context.allocator) -> (re
 
 	return #force_inline internal_cmp_mag(a, b), nil;
 }
+int_cmp_mag :: int_compare_magnitude;
+
+
+/*
+	bool := a < b
+*/
+int_less_than :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c == -1, err;
+}
+
+/*
+	bool := a < b
+*/
+int_less_than_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (less_than: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c == -1, err;
+}
+
+/*
+	bool := |a| < |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_less_than_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c == -1, err;
+}
+
+less_than :: proc {
+	int_less_than,
+	int_less_than_digit,
+};
+lt :: less_than;
+
+less_than_abs :: proc {
+	int_less_than_abs,
+};
+lt_abs :: less_than_abs;
+
+
+/*
+	bool := a <= b
+*/
+int_less_than_or_equal :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c <= 0, err;
+}
+
+/*
+	bool := a <= b
+*/
+int_less_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (less_than_or_equal: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c <= 0, err;
+}
+
+/*
+	bool := |a| <= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_less_than_or_equal_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c <= 0, err;
+}
+
+less_than_or_equal :: proc {
+	int_less_than_or_equal,
+	int_less_than_or_equal_digit,
+};
+lteq :: less_than_or_equal;
+
+less_than_or_equal_abs :: proc {
+	int_less_than_or_equal_abs,
+};
+lteq_abs :: less_than_or_equal_abs;
+
+
+/*
+	bool := a == b
+*/
+int_equals :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (equals: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c == 0, err;
+}
+
+/*
+	bool := a == b
+*/
+int_equals_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (equals: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c == 0, err;
+}
+
+/*
+	bool := |a| == |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_equals_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (equals: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c == 0, err;
+}
+
+equals :: proc {
+	int_equals,
+	int_equals_digit,
+};
+eq :: equals;
+
+equals_abs :: proc {
+	int_equals_abs,
+};
+eq_abs :: equals_abs;
+
+
+/*
+	bool := a >= b
+*/
+int_greater_than_or_equal :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c >= 0, err;
+}
+
+/*
+	bool := a >= b
+*/
+int_greater_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (greater_than_or_equal: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c >= 0, err;
+}
+
+/*
+	bool := |a| >= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_greater_than_or_equal_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c >= 0, err;
+}
+
+greater_than_or_equal :: proc {
+	int_greater_than_or_equal,
+	int_greater_than_or_equal_digit,
+};
+gteq :: greater_than_or_equal;
+
+greater_than_or_equal_abs :: proc {
+	int_greater_than_or_equal_abs,
+};
+gteq_abs :: greater_than_or_equal_abs;
+
+
+/*
+	bool := a > b
+*/
+int_greater_than :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c > 0, err;
+}
+
+/*
+	bool := a > b
+*/
+int_greater_than_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (greater_than: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c > 0, err;
+}
+
+/*
+	bool := |a| > |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_greater_than_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c > 0, err;
+}
+
+greater_than :: proc {
+	int_greater_than,
+	int_greater_than_digit,
+};
+gt :: greater_than;
+
+greater_than_abs :: proc {
+	int_greater_than_abs,
+};
+gt_abs :: greater_than_abs;
+
 
 /*
 	Check if remainders are possible squares - fast exclude non-squares.
diff --git a/core/math/big/radix.odin b/core/math/big/radix.odin
index acf0bacbd..8a7040158 100644
--- a/core/math/big/radix.odin
+++ b/core/math/big/radix.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -14,6 +12,7 @@ package math_big
 		- Use Barrett reduction for non-powers-of-two.
 		- Also look at extracting and splatting several digits at once.
 */
+package math_big
 
 import "core:intrinsics"
 import "core:mem"
diff --git a/core/math/big/test.odin b/core/math/big/test.odin
index ea3c6be49..8d60fc5ee 100644
--- a/core/math/big/test.odin
+++ b/core/math/big/test.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -11,6 +9,7 @@ package math_big
 
 	This file exports procedures for use with the test.py test suite.
 */
+package math_big
 
 /*
 	TODO: Write tests for `internal_*` and test reusing parameters with the public implementations.
diff --git a/core/math/big/test.py b/core/math/big/test.py
index df59fa1c8..4d314401f 100644
--- a/core/math/big/test.py
+++ b/core/math/big/test.py
@@ -1,3 +1,12 @@
+#
+#	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+#	Made available under Odin's BSD-3 license.
+#
+#	A BigInt implementation in Odin.
+#	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
+#	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
+#
+
 from ctypes import *
 from random import *
 import math
@@ -404,7 +413,6 @@ def test_shr_signed(a = 0, bits = 0, expected_error = Error.Okay):
 	return test("test_shr_signed", res, [a, bits], expected_error, expected_result)
 
 def test_factorial(number = 0, expected_error = Error.Okay):
-	print("Factorial:", number)
 	args  = [number]
 	try:
 		res = int_factorial(*args)
diff --git a/core/math/big/tune.odin b/core/math/big/tune.odin
index 700a5e74a..3381065bb 100644
--- a/core/math/big/tune.odin
+++ b/core/math/big/tune.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -9,6 +7,7 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
 
 import "core:fmt"
 import "core:time"
author	Jeroen van Rijn <Kelimion@users.noreply.github.com>	2021-09-01 19:25:52 +0200
committer	GitHub <noreply@github.com>	2021-09-01 19:25:52 +0200
commit	e2f035d6eea1a6b6624213cf653fc31bd64f0b9e (patch)
tree	9c077cfbf196d7e4d58c25d2bb9724c5d81e3707 /core/math
parent	b2cf0755f2aa899a5a87bb92e87d0c46d01f593b (diff)
parent	ae354731edd1d408f1852c4fb19af2c840cbed46 (diff)