19 files changed, 3372 insertions, 393 deletions
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 3d58a8fd8..9ddb8ee83 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -28,6 +28,7 @@ jobs:
           cp LLVM-C.dll dist
           cp -r shared dist
           cp -r core dist
+          cp -r vendor dist
           cp -r bin dist
           cp -r examples dist
       - name: Upload artifact
@@ -51,6 +52,7 @@ jobs:
           cp odin dist
           cp -r shared dist
           cp -r core dist
+          cp -r vendor dist
           cp -r examples dist
       - name: Upload artifact
         uses: actions/upload-artifact@v1
@@ -77,6 +79,7 @@ jobs:
           cp odin dist
           cp -r shared dist
           cp -r core dist
+          cp -r vendor dist
           cp -r examples dist
       - name: Upload artifact
         uses: actions/upload-artifact@v1
diff --git a/core/intrinsics/intrinsics.odin b/core/intrinsics/intrinsics.odin
index 008e2ee5a..51fda2458 100644
--- a/core/intrinsics/intrinsics.odin
+++ b/core/intrinsics/intrinsics.odin
@@ -2,6 +2,9 @@
 //+ignore
 package intrinsics
 
+// Package-Related
+is_package_imported :: proc(package_name: string) -> bool ---
+
 // Types
 simd_vector :: proc($N: int, $T: typeid) -> type/#simd[N]T
 soa_struct :: proc($N: int, $T: typeid) -> type/#soa[N]T
@@ -16,7 +19,7 @@ trap       :: proc() -> ! ---
 
 // Instructions
 
-alloca             :: proc(size, align: int) -> ^u8 ---
+alloca             :: proc(size, align: int) -> [^]u8 ---
 cpu_relax          :: proc() ---
 read_cycle_counter :: proc() -> i64 ---
 
@@ -46,6 +49,9 @@ fixed_point_div_sat :: proc(lhs, rhs: $T, #const scale: uint) -> T where type_is
 // Compiler Hints
 expect :: proc(val, expected_val: T) -> T ---
 
+// Linux and Darwin Only
+syscall :: proc(id: uintptr, args: ..uintptr) -> uintptr ---
+
 
 // Atomics
 atomic_fence        :: proc() ---
diff --git a/core/math/big/api.odin b/core/math/big/api.odin
index 1c7833730..8c9d2e799 100644
--- a/core/math/big/api.odin
+++ b/core/math/big/api.odin
@@ -1,14 +1,15 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
-	Made available under Odin's BSD-2 license.
+	Made available under Odin's BSD-3 license.
 
 	An arbitrary precision mathematics implementation in Odin.
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 
 	This file collects public proc maps and their aliases.
+*/
+package math_big
+/*
 
 	=== === === === === === === === === === === === === === === === === === === === === === === ===
 	                                    Basic arithmetic.
diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index 31480bcc8..7ca32641b 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -1,9 +1,11 @@
 @echo off
-:odin run . -vet
+odin run . -vet
+: -define:MATH_BIG_USE_FROBENIUS_TEST=true
 
 set TEST_ARGS=-fast-tests
+:set TEST_ARGS=
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS%
 \ No newline at end of file
diff --git a/core/math/big/common.odin b/core/math/big/common.odin
index c43346f5a..3fb7601c0 100644
--- a/core/math/big/common.odin
+++ b/core/math/big/common.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -8,6 +6,7 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
 
 import "core:intrinsics"
 
@@ -57,10 +56,10 @@ when #config(MATH_BIG_EXE, true) {
 	debugged where necessary.
 */
 
-_DEFAULT_MUL_KARATSUBA_CUTOFF :: #config(MUL_KARATSUBA_CUTOFF,  80)
-_DEFAULT_SQR_KARATSUBA_CUTOFF :: #config(SQR_KARATSUBA_CUTOFF, 120)
-_DEFAULT_MUL_TOOM_CUTOFF      :: #config(MUL_TOOM_CUTOFF,      350)
-_DEFAULT_SQR_TOOM_CUTOFF      :: #config(SQR_TOOM_CUTOFF,      400)
+_DEFAULT_MUL_KARATSUBA_CUTOFF :: #config(MATH_BIG_MUL_KARATSUBA_CUTOFF,  80)
+_DEFAULT_SQR_KARATSUBA_CUTOFF :: #config(MATH_BIG_SQR_KARATSUBA_CUTOFF, 120)
+_DEFAULT_MUL_TOOM_CUTOFF      :: #config(MATH_BIG_MUL_TOOM_CUTOFF,      350)
+_DEFAULT_SQR_TOOM_CUTOFF      :: #config(MATH_BIG_SQR_TOOM_CUTOFF,      400)
 
 
 MAX_ITERATIONS_ROOT_N := 500
@@ -76,6 +75,29 @@ FACTORIAL_MAX_N       := 1_000_000
 FACTORIAL_BINARY_SPLIT_CUTOFF         := 6100
 FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS := 100
 
+/*
+	`internal_int_is_prime` switchables.
+
+	Use Frobenius-Underwood for primality testing, or use Lucas-Selfridge (default).
+*/
+MATH_BIG_USE_LUCAS_SELFRIDGE_TEST :: #config(MATH_BIG_USE_LUCAS_SELFRIDGE_TEST, false)
+MATH_BIG_USE_FROBENIUS_TEST       :: !MATH_BIG_USE_LUCAS_SELFRIDGE_TEST
+
+/*
+	Runtime tunable to use Miller-Rabin primality testing only and skip the above.
+*/
+USE_MILLER_RABIN_ONLY       := false
+
+/*
+	How many times we'll call `internal_int_random` during random prime generation before we bail out.
+	Set to 0 or less to try indefinitely.	
+*/
+MAX_ITERATIONS_RANDOM_PRIME  := 1_000_000
+
+/*
+	How many iterations we used for the last random prime.
+*/
+@thread_local RANDOM_PRIME_ITERATIONS_USED: int
 
 /*
 	We don't allow these to be switched at runtime for two reasons:
@@ -85,15 +107,22 @@ FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS := 100
 
 	2) Optimizations thanks to precomputed masks wouldn't work.
 */
-MATH_BIG_FORCE_64_BIT :: #config(MATH_BIG_FORCE_64_BIT, false)
-MATH_BIG_FORCE_32_BIT :: #config(MATH_BIG_FORCE_32_BIT, false)
+MATH_BIG_FORCE_64_BIT   :: #config(MATH_BIG_FORCE_64_BIT, false)
+MATH_BIG_FORCE_32_BIT   :: #config(MATH_BIG_FORCE_32_BIT, false)
 when (MATH_BIG_FORCE_32_BIT && MATH_BIG_FORCE_64_BIT) { #panic("Cannot force 32-bit and 64-bit big backend simultaneously.") }
 
-_LOW_MEMORY           :: #config(BIGINT_SMALL_MEMORY, false)
+/*
+	Trade a smaller memory footprint for more processing overhead?
+*/
+_LOW_MEMORY             :: #config(MATH_BIG_SMALL_MEMORY, false)
 when _LOW_MEMORY {
-	_DEFAULT_DIGIT_COUNT :: 8
+	_DEFAULT_DIGIT_COUNT ::   8
+	_TAB_SIZE            ::  32
+	_MAX_WIN_SIZE        ::   5
 } else {
-	_DEFAULT_DIGIT_COUNT :: 32
+	_DEFAULT_DIGIT_COUNT ::  32
+	_TAB_SIZE            :: 256
+	_MAX_WIN_SIZE        ::   0
 }
 
 /*
@@ -137,6 +166,10 @@ Error :: enum int {
 	Division_by_Zero        = 8,
 	Math_Domain_Error       = 9,
 
+	Cannot_Open_File        = 50,
+	Cannot_Read_File        = 51,
+	Cannot_Write_File       = 52,
+
 	Unimplemented           = 127,
 }
 
@@ -153,13 +186,17 @@ Error_String :: #partial [Error]string{
 	.Division_by_Zero        = "Division by zero",
 	.Math_Domain_Error       = "Math domain error",
 
+	.Cannot_Open_File        = "Cannot_Open_File",
+	.Cannot_Read_File        = "Cannot_Read_File",
+	.Cannot_Write_File       = "Cannot_Write_File",
+
 	.Unimplemented           = "Unimplemented",
 }
 
 Primality_Flag :: enum u8 {
-	Blum_Blum_Shub = 0,	/* BBS style prime */
-	Safe           = 1,	/* Safe prime (p-1)/2 == prime */
-	Second_MSB_On  = 3,  /* force 2nd MSB to 1 */
+ 	Blum_Blum_Shub = 0,  // Make prime congruent to 3 mod 4
+	Safe           = 1,  // Make sure (p-1)/2 is prime as well (implies .Blum_Blum_Shub)
+	Second_MSB_On  = 3,  // Make the 2nd highest bit one
 }
 Primality_Flags :: bit_set[Primality_Flag; u8]
 
@@ -198,7 +235,8 @@ when MATH_BIG_FORCE_64_BIT || (!MATH_BIG_FORCE_32_BIT && size_of(rawptr) == 8) {
 _DIGIT_TYPE_BITS :: 8 * size_of(DIGIT)
 _WORD_TYPE_BITS  :: 8 * size_of(_WORD)
 
-_DIGIT_BITS      :: _DIGIT_TYPE_BITS - 4
+_DIGIT_NAILS     :: 4
+_DIGIT_BITS      :: _DIGIT_TYPE_BITS - _DIGIT_NAILS
 _WORD_BITS       :: 2 * _DIGIT_BITS
 
 _MASK            :: (DIGIT(1) << DIGIT(_DIGIT_BITS)) - DIGIT(1)
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index d764ad940..695f6a72f 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -9,6 +7,8 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
+
 
 import "core:fmt"
 import "core:mem"
@@ -18,11 +18,15 @@ print_configation :: proc() {
 `
 Configuration:
 	_DIGIT_BITS                           %v
+	_SMALL_MEMORY                         %v
 	_MIN_DIGIT_COUNT                      %v
 	_MAX_DIGIT_COUNT                      %v
 	_DEFAULT_DIGIT_COUNT                  %v
 	_MAX_COMBA                            %v
 	_WARRAY                               %v
+	_TAB_SIZE                             %v
+	_MAX_WIN_SIZE                         %v
+	MATH_BIG_USE_LUCAS_SELFRIDGE_TEST     %v
 Runtime tunable:
 	MUL_KARATSUBA_CUTOFF                  %v
 	SQR_KARATSUBA_CUTOFF                  %v
@@ -32,13 +36,20 @@ Runtime tunable:
 	FACTORIAL_MAX_N                       %v
 	FACTORIAL_BINARY_SPLIT_CUTOFF         %v
 	FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS %v
+	USE_MILLER_RABIN_ONLY                 %v
+	MAX_ITERATIONS_RANDOM_PRIME           %v
 
 `, _DIGIT_BITS,
+_LOW_MEMORY,
 _MIN_DIGIT_COUNT,
 _MAX_DIGIT_COUNT,
 _DEFAULT_DIGIT_COUNT,
 _MAX_COMBA,
 _WARRAY,
+_TAB_SIZE,
+_MAX_WIN_SIZE,
+MATH_BIG_USE_LUCAS_SELFRIDGE_TEST,
+
 MUL_KARATSUBA_CUTOFF,
 SQR_KARATSUBA_CUTOFF,
 MUL_TOOM_CUTOFF,
@@ -47,6 +58,8 @@ MAX_ITERATIONS_ROOT_N,
 FACTORIAL_MAX_N,
 FACTORIAL_BINARY_SPLIT_CUTOFF,
 FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS,
+USE_MILLER_RABIN_ONLY,
+MAX_ITERATIONS_RANDOM_PRIME,
 )
 
 }
@@ -73,138 +86,45 @@ print :: proc(name: string, a: ^Int, base := i8(10), print_name := true, newline
 	}
 }
 
-int_to_byte :: proc(v: ^Int) {
-	err: Error
-	size: int
-	print("v: ", v)
-	fmt.println()
-
-	t := &Int{}
-	defer destroy(t)
+// printf :: fmt.printf;
 
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err)
-		return
-	}
-	b1 := make([]u8, size, context.temp_allocator)
-	err = int_to_bytes_big(v, b1)
-	int_from_bytes_big(t, b1)
-	fmt.printf("big: %v | err: %v\n", b1, err)
-
-	int_from_bytes_big(t, b1)
-	if internal_cmp_mag(t, v) != 0 {
-		print("\tError parsing t: ", t)
-	}
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err)
-		return
-	}
-	b2 := make([]u8, size, context.temp_allocator)
-	err = int_to_bytes_big_python(v, b2)
-	fmt.printf("big python: %v | err: %v\n", b2, err)
-
-	if err == nil {
-		int_from_bytes_big_python(t, b2)
-		if internal_cmp_mag(t, v) != 0 {
-			print("\tError parsing t: ", t)
-		}
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err)
-		return
-	}
-	b3 := make([]u8, size, context.temp_allocator)
-	err = int_to_bytes_big(v, b3, true)
-	fmt.printf("big signed: %v | err: %v\n", b3, err)
-
-	int_from_bytes_big(t, b3, true)
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t)
-	}
+demo :: proc() {
+	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer destroy(a, b, c, d, e, f, res)
 
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err)
-		return
-	}
-	b4 := make([]u8, size, context.temp_allocator)
-	err = int_to_bytes_big_python(v, b4, true)
-	fmt.printf("big signed python: %v | err: %v\n", b4, err)
+	bits   := 111
+	trials := -1
 
-	int_from_bytes_big_python(t, b4, true)
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t)
-	}
-}
+	flags := Primality_Flags{}
+	fmt.printf("Trying to generate a %v bit prime using %v Miller-Rabin trials and options %v.\n", bits, trials, flags)
 
-int_to_byte_little :: proc(v: ^Int) {
 	err: Error
-	size: int
-	print("v: ", v)
-	fmt.println()
-
-	t := &Int{}
-	defer destroy(t)
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err)
-		return
+	{
+		SCOPED_TIMING(.random_prime)
+		err = internal_random_prime(a, bits, trials, flags)
 	}
-	b1 := make([]u8, size, context.temp_allocator)
-	err = int_to_bytes_little(v, b1)
-	fmt.printf("little: %v | err: %v\n", b1, err)
 
-	int_from_bytes_little(t, b1)
-	if internal_cmp_mag(t, v) != 0 {
-		print("\tError parsing t: ", t)
-	}
+	print("a(10): ", a, 10, true, true, true)
+	fmt.printf("err: %v\n", err)
+	fmt.printf("RANDOM_PRIME_ITERATIONS_USED: %v\n", RANDOM_PRIME_ITERATIONS_USED)
 
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err)
-		return
-	}
-	b2 := make([]u8, size, context.temp_allocator)
-	err = int_to_bytes_little_python(v, b2)
-	fmt.printf("little python: %v | err: %v\n", b2, err)
-
-	if err == nil {
-		int_from_bytes_little_python(t, b2)
-		if internal_cmp_mag(t, v) != 0 {
-			print("\tError parsing t: ", t)
-		}
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err)
-		return
-	}
-	b3 := make([]u8, size, context.temp_allocator)
-	err = int_to_bytes_little(v, b3, true)
-	fmt.printf("little signed: %v | err: %v\n", b3, err)
+	nails := 0
 
-	int_from_bytes_little(t, b3, true)
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t)
-	}
+	count := internal_int_pack_count(a, u8, nails)
+	buf := make([]u8, count)
+	defer delete(buf)
 
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err)
-		return
-	}
-	b4 := make([]u8, size, context.temp_allocator)
-	err = int_to_bytes_little_python(v, b4, true)
-	fmt.printf("little signed python: %v | err: %v\n", b4, err)
+	written: int
+	order := Order.LSB_First
 
-	int_from_bytes_little_python(t, b4, true)
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t)
-	}
-}
+	fmt.printf("\na.digit: %v\n", a.digit[:a.used])
+	written, err = internal_int_pack(a, buf, nails, order)
+	fmt.printf("\nPacked into buf: %v | err: %v | written: %v\n", buf, err, written)
 
-demo :: proc() {
-	a, b, c, d, e, f := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
-	defer destroy(a, b, c, d, e, f)
+	err = internal_int_unpack(b, buf, nails, order)
+	print("\nUnpacked into b: ", b)
+	fmt.printf("err: %v\n", err)
+	fmt.printf("b.digit: %v\n", b.digit[:b.used])
 }
 
 main :: proc() {
diff --git a/core/math/big/helpers.odin b/core/math/big/helpers.odin
index 45c6771da..ac45c241d 100644
--- a/core/math/big/helpers.odin
+++ b/core/math/big/helpers.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -8,6 +6,7 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
 
 import "core:intrinsics"
 import rnd "core:math/rand"
@@ -363,15 +362,15 @@ int_random_digit :: proc(r: ^rnd.Rand = nil) -> (res: DIGIT) {
 	return 0 // We shouldn't get here.
 }
 
-int_rand :: proc(dest: ^Int, bits: int, r: ^rnd.Rand = nil, allocator := context.allocator) -> (err: Error) {
+int_random :: proc(dest: ^Int, bits: int, r: ^rnd.Rand = nil, allocator := context.allocator) -> (err: Error) {
 	/*
 		Check that `a` is usable.
 	*/
 	assert_if_nil(dest)
-	return #force_inline internal_int_rand(dest, bits, r, allocator)
+	return #force_inline internal_int_random(dest, bits, r, allocator)
 
 }
-rand :: proc { int_rand, }
+random :: proc { int_random, }
 
 /*
 	Internal helpers.
@@ -496,7 +495,7 @@ int_to_bytes_little :: proc(a: ^Int, buf: []u8, signed := false, allocator := co
 	if signed {
 		buf[l - 1] = 1 if a.sign == .Negative else 0
 	}
-	for offset := 0; offset < size_in_bits; offset += 8 {
+	#no_bounds_check for offset := 0; offset < size_in_bits; offset += 8 {
 		bits, _ := internal_int_bitfield_extract(a, offset, 8)
 		buf[i] = u8(bits & 255); i += 1
 	}
@@ -520,7 +519,7 @@ int_to_bytes_big :: proc(a: ^Int, buf: []u8, signed := false, allocator := conte
 	if signed {
 		buf[0] = 1 if a.sign == .Negative else 0
 	}
-	for offset := 0; offset < size_in_bits; offset += 8 {
+	#no_bounds_check for offset := 0; offset < size_in_bits; offset += 8 {
 		bits, _ := internal_int_bitfield_extract(a, offset, 8)
 		buf[i] = u8(bits & 255); i -= 1
 	}
@@ -547,7 +546,7 @@ int_to_bytes_little_python :: proc(a: ^Int, buf: []u8, signed := false, allocato
 
 		size_in_bits := internal_count_bits(t)
 		i := 0
-		for offset := 0; offset < size_in_bits; offset += 8 {
+		#no_bounds_check for offset := 0; offset < size_in_bits; offset += 8 {
 			bits, _ := internal_int_bitfield_extract(t, offset, 8)
 			buf[i] = 255 - u8(bits & 255); i += 1
 		}
@@ -555,7 +554,7 @@ int_to_bytes_little_python :: proc(a: ^Int, buf: []u8, signed := false, allocato
 	} else {
 		size_in_bits := internal_count_bits(a)
 		i := 0
-		for offset := 0; offset < size_in_bits; offset += 8 {
+		#no_bounds_check for offset := 0; offset < size_in_bits; offset += 8 {
 			bits, _ := internal_int_bitfield_extract(a, offset, 8)
 			buf[i] = u8(bits & 255); i += 1
 		}
@@ -584,7 +583,7 @@ int_to_bytes_big_python :: proc(a: ^Int, buf: []u8, signed := false, allocator :
 
 	size_in_bits := internal_count_bits(t)
 	i := l - 1
-	for offset := 0; offset < size_in_bits; offset += 8 {
+	#no_bounds_check for offset := 0; offset < size_in_bits; offset += 8 {
 		bits, _ := internal_int_bitfield_extract(t, offset, 8)
 		buf[i] = 255 - u8(bits & 255); i -= 1
 	}
@@ -621,7 +620,7 @@ int_from_bytes_big :: proc(a: ^Int, buf: []u8, signed := false, allocator := con
 		buf = buf[1:]
 	}
 
-	for v in buf {
+	#no_bounds_check for v in buf {
 		internal_shl(a, a, 8) or_return
 		a.digit[0] |= DIGIT(v)
 	}
@@ -658,7 +657,7 @@ int_from_bytes_big_python :: proc(a: ^Int, buf: []u8, signed := false, allocator
 		buf = buf[1:]
 	}
 
-	for v in buf {
+	#no_bounds_check for v in buf {
 		internal_shl(a, a, 8) or_return
 		if signed && sign == .Negative {
 			a.digit[0] |= DIGIT(255 - v)	
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 63c702f5e..97630a340 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -31,6 +29,7 @@ package math_big
 
 	TODO: Handle +/- Infinity and NaN.
 */
+package math_big
 
 import "core:mem"
 import "core:intrinsics"
@@ -137,7 +136,7 @@ internal_int_add_signed :: proc(dest, a, b: ^Int, allocator := context.allocator
 		Subtract the one with the greater magnitude from the other.
 		The result gets the sign of the one with the greater magnitude.
 	*/
-	if #force_inline internal_cmp_mag(a, b) == -1 {
+	if #force_inline internal_lt_abs(a, b) {
 		x, y = y, x
 	}
 
@@ -359,7 +358,7 @@ internal_int_sub_signed :: proc(dest, number, decrease: ^Int, allocator := conte
 		Subtract a positive from a positive, OR negative from a negative.
 		First, take the difference between their magnitudes, then...
 	*/
-	if #force_inline internal_cmp_mag(number, decrease) == -1 {
+	if #force_inline internal_lt_abs(number, decrease) {
 		/*
 			The second has a larger magnitude.
 			The result has the *opposite* sign from the first number.
@@ -546,6 +545,25 @@ internal_int_shl1 :: proc(dest, src: ^Int, allocator := context.allocator) -> (e
 }
 
 /*
+	Multiply bigint `a` with int `d` and put the result in `dest`.
+ 	Like `internal_int_mul_digit` but with an integer as the small input.
+*/
+internal_int_mul_integer :: proc(dest, a: ^Int, b: $T, allocator := context.allocator) -> (err: Error)
+where intrinsics.type_is_integer(T) && T != DIGIT {
+	context.allocator = allocator
+
+	t := &Int{}
+	defer internal_destroy(t)
+
+	/*
+		DIGIT might be smaller than a long, which excludes the use of `internal_int_mul_digit` here.
+	*/
+	internal_set(t, b) or_return
+	internal_mul(dest, a, t) or_return
+	return
+}
+
+/*
 	Multiply by a DIGIT.
 */
 internal_int_mul_digit :: proc(dest, src: ^Int, multiplier: DIGIT, allocator := context.allocator) -> (err: Error) {
@@ -698,7 +716,7 @@ internal_int_mul :: proc(dest, src, multiplier: ^Int, allocator := context.alloc
 	return err
 }
 
-internal_mul :: proc { internal_int_mul, internal_int_mul_digit, }
+internal_mul :: proc { internal_int_mul, internal_int_mul_digit, internal_int_mul_integer }
 
 internal_sqr :: proc (dest, src: ^Int, allocator := context.allocator) -> (res: Error) {
 	/*
@@ -719,7 +737,7 @@ internal_int_divmod :: proc(quotient, remainder, numerator, denominator: ^Int, a
 	/*
 		If numerator < denominator then quotient = 0, remainder = numerator.
 	*/
-	if #force_inline internal_cmp_mag(numerator, denominator) == -1 {
+	if #force_inline internal_lt_abs(numerator, denominator) {
 		if remainder != nil {
 			internal_copy(remainder, numerator) or_return
 		}
@@ -732,7 +750,6 @@ internal_int_divmod :: proc(quotient, remainder, numerator, denominator: ^Int, a
 	if (denominator.used > 2 * MUL_KARATSUBA_CUTOFF) && (denominator.used <= (numerator.used / 3) * 2) {
 		assert(denominator.used >= 160 && numerator.used >= 240, "MUL_KARATSUBA_CUTOFF global not properly set.")
 		err = _private_int_div_recursive(quotient, remainder, numerator, denominator)
-		// err = #force_inline _private_int_div_school(quotient, remainder, numerator, denominator);
 	} else {
 		when true {
 			err = #force_inline _private_int_div_school(quotient, remainder, numerator, denominator)
@@ -856,14 +873,14 @@ internal_int_mod :: proc(remainder, numerator, denominator: ^Int, allocator := c
 
 	if remainder.used == 0 || denominator.sign == remainder.sign { return nil }
 
-	return #force_inline internal_add(remainder, remainder, numerator, allocator)
+	return #force_inline internal_add(remainder, remainder, denominator, allocator)
 }
 
 internal_int_mod_digit :: proc(numerator: ^Int, denominator: DIGIT, allocator := context.allocator) -> (remainder: DIGIT, err: Error) {
 	return internal_int_divmod_digit(nil, numerator, denominator, allocator)
 }
 
-internal_mod :: proc{ internal_int_mod, internal_int_mod_digit}
+internal_mod :: proc{ internal_int_mod, internal_int_mod_digit, }
 
 /*
 	remainder = (number + addend) % modulus.
@@ -942,6 +959,14 @@ internal_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.
 	return #force_inline _private_int_gcd_lcm(res_gcd, res_lcm, a, b, allocator)
 }
 
+internal_int_gcd :: proc(res_gcd, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	return #force_inline _private_int_gcd_lcm(res_gcd, nil, a, b, allocator)
+}
+
+internal_int_lcm :: proc(res_lcm, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	return #force_inline _private_int_gcd_lcm(nil, res_lcm, a, b, allocator)
+}
+
 /*
 	remainder = numerator % (1 << bits)
 
@@ -993,12 +1018,20 @@ internal_int_mod_bits :: proc(remainder, numerator: ^Int, bits: int, allocator :
 */
 
 /*
+	This procedure returns the allocated capacity of an Int.
+	Assumes `a` not to be `nil`.
+*/
+internal_int_allocated_cap :: #force_inline proc(a: ^Int) -> (cap: int) {
+	raw := transmute(mem.Raw_Dynamic_Array)a.digit
+	return raw.cap
+}
+
+/*
 	This procedure will return `true` if the `Int` is initialized, `false` if not.
 	Assumes `a` not to be `nil`.
 */
 internal_int_is_initialized :: #force_inline proc(a: ^Int) -> (initialized: bool) {
-	raw := transmute(mem.Raw_Dynamic_Array)a.digit
-	return raw.cap >= _MIN_DIGIT_COUNT
+	return internal_int_allocated_cap(a) >= _MIN_DIGIT_COUNT
 }
 internal_is_initialized :: proc { internal_int_is_initialized, }
 
@@ -1091,6 +1124,7 @@ internal_is_power_of_two :: proc { internal_int_is_power_of_two, }
 	Expects `a` and `b` both to be valid `Int`s, i.e. initialized and not `nil`.
 */
 internal_int_compare :: #force_inline proc(a, b: ^Int) -> (comparison: int) {
+	assert_if_nil(a, b)
 	a_is_negative := #force_inline internal_is_negative(a)
 
 	/*
@@ -1114,6 +1148,7 @@ internal_cmp :: internal_compare
 	Expects: `a` and `b` both to be valid `Int`s, i.e. initialized and not `nil`.
 */
 internal_int_compare_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (comparison: int) {
+	assert_if_nil(a)
 	a_is_negative := #force_inline internal_is_negative(a)
 
 	switch {
@@ -1145,6 +1180,7 @@ internal_cmp_digit :: internal_compare_digit
 	Compare the magnitude of two `Int`s, unsigned.
 */
 internal_int_compare_magnitude :: #force_inline proc(a, b: ^Int) -> (comparison: int) {
+	assert_if_nil(a, b)
 	/*
 		Compare based on used digits.
 	*/
@@ -1172,6 +1208,177 @@ internal_int_compare_magnitude :: #force_inline proc(a, b: ^Int) -> (comparison:
 internal_compare_magnitude :: proc { internal_int_compare_magnitude, }
 internal_cmp_mag :: internal_compare_magnitude
 
+
+/*
+	bool := a < b
+*/
+internal_int_less_than :: #force_inline proc(a, b: ^Int) -> (less_than: bool) {
+	return internal_cmp(a, b) == -1
+}
+
+/*
+	bool := a < b
+*/
+internal_int_less_than_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (less_than: bool) {
+	return internal_cmp_digit(a, b) == -1
+}
+
+/*
+	bool := |a| < |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_less_than_abs :: #force_inline proc(a, b: ^Int) -> (less_than: bool) {
+	return internal_cmp_mag(a, b) == -1
+}
+
+internal_less_than :: proc {
+	internal_int_less_than,
+	internal_int_less_than_digit,
+}
+internal_lt :: internal_less_than
+
+internal_less_than_abs :: proc {
+	internal_int_less_than_abs,
+}
+internal_lt_abs :: internal_less_than_abs
+
+
+/*
+	bool := a <= b
+*/
+internal_int_less_than_or_equal :: #force_inline proc(a, b: ^Int) -> (less_than_or_equal: bool) {
+	return internal_cmp(a, b) <= 0
+}
+
+/*
+	bool := a <= b
+*/
+internal_int_less_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (less_than_or_equal: bool) {
+	return internal_cmp_digit(a, b) <= 0
+}
+
+/*
+	bool := |a| <= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_less_than_or_equal_abs :: #force_inline proc(a, b: ^Int) -> (less_than_or_equal: bool) {
+	return internal_cmp_mag(a, b) <= 0
+}
+
+internal_less_than_or_equal :: proc {
+	internal_int_less_than_or_equal,
+	internal_int_less_than_or_equal_digit,
+}
+internal_lte :: internal_less_than_or_equal
+
+internal_less_than_or_equal_abs :: proc {
+	internal_int_less_than_or_equal_abs,
+}
+internal_lte_abs :: internal_less_than_or_equal_abs
+
+
+/*
+	bool := a == b
+*/
+internal_int_equals :: #force_inline proc(a, b: ^Int) -> (equals: bool) {
+	return internal_cmp(a, b) == 0
+}
+
+/*
+	bool := a == b
+*/
+internal_int_equals_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (equals: bool) {
+	return internal_cmp_digit(a, b) == 0
+}
+
+/*
+	bool := |a| == |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_equals_abs :: #force_inline proc(a, b: ^Int) -> (equals: bool) {
+	return internal_cmp_mag(a, b) == 0
+}
+
+internal_equals :: proc {
+	internal_int_equals,
+	internal_int_equals_digit,
+}
+internal_eq :: internal_equals
+
+internal_equals_abs :: proc {
+	internal_int_equals_abs,
+}
+internal_eq_abs :: internal_equals_abs
+
+
+/*
+	bool := a >= b
+*/
+internal_int_greater_than_or_equal :: #force_inline proc(a, b: ^Int) -> (greater_than_or_equal: bool) {
+	return internal_cmp(a, b) >= 0
+}
+
+/*
+	bool := a >= b
+*/
+internal_int_greater_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (greater_than_or_equal: bool) {
+	return internal_cmp_digit(a, b) >= 0
+}
+
+/*
+	bool := |a| >= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_greater_than_or_equal_abs :: #force_inline proc(a, b: ^Int) -> (greater_than_or_equal: bool) {
+	return internal_cmp_mag(a, b) >= 0
+}
+
+internal_greater_than_or_equal :: proc {
+	internal_int_greater_than_or_equal,
+	internal_int_greater_than_or_equal_digit,
+}
+internal_gte :: internal_greater_than_or_equal
+
+internal_greater_than_or_equal_abs :: proc {
+	internal_int_greater_than_or_equal_abs,
+}
+internal_gte_abs :: internal_greater_than_or_equal_abs
+
+
+/*
+	bool := a > b
+*/
+internal_int_greater_than :: #force_inline proc(a, b: ^Int) -> (greater_than: bool) {
+	return internal_cmp(a, b) == 1
+}
+
+/*
+	bool := a > b
+*/
+internal_int_greater_than_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (greater_than: bool) {
+	return internal_cmp_digit(a, b) == 1
+}
+
+/*
+	bool := |a| > |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_greater_than_abs :: #force_inline proc(a, b: ^Int) -> (greater_than: bool) {
+	return internal_cmp_mag(a, b) == 1
+}
+
+internal_greater_than :: proc {
+	internal_int_greater_than,
+	internal_int_greater_than_digit,
+}
+internal_gt :: internal_greater_than
+
+internal_greater_than_abs :: proc {
+	internal_int_greater_than_abs,
+}
+internal_gt_abs :: internal_greater_than_abs
+
+
 /*
 	Check if remainders are possible squares - fast exclude non-squares.
 
@@ -1229,7 +1436,7 @@ internal_int_is_square :: proc(a: ^Int, allocator := context.allocator) -> (squa
 	sqrt(t, a) or_return
 	sqr(t, t)  or_return
 
-	square = internal_cmp_mag(t, a) == 0
+	square = internal_eq_abs(t, a)
 
 	return
 }
@@ -1461,7 +1668,7 @@ internal_int_sqrt :: proc(dest, src: ^Int, allocator := context.allocator) -> (e
 		internal_add(t2, t1, x)  or_return
 		internal_shr(y, t2, 1)   or_return
 
-		if c := internal_cmp(y, x); c == 0 || c == 1 {
+		if internal_gte(y, x) {
 			internal_swap(dest, x)
 			return nil
 		}
@@ -1576,8 +1783,8 @@ internal_int_root_n :: proc(dest, src: ^Int, n: int, allocator := context.alloca
 			 Number of rounds is at most log_2(root). If it is more it
 			 got stuck, so break out of the loop and do the rest manually.
 		*/
-		if ilog2 -= 1;    ilog2 == 0 { break }
-		if internal_cmp(t1, t2) == 0 { break }
+		if ilog2 -= 1; ilog2 == 0 { break }
+		if internal_eq(t1, t2)    { break }
 
 		iterations += 1
 		if iterations == MAX_ITERATIONS_ROOT_N {
@@ -1615,7 +1822,7 @@ internal_int_root_n :: proc(dest, src: ^Int, n: int, allocator := context.alloca
 	for {
 		internal_pow(t2, t1, n) or_return
 	
-		if internal_cmp(t2, a) != 1 { break }
+		if internal_lt(t2, a) { break }
 		
 		internal_sub(t1, t1, DIGIT(1)) or_return
 
@@ -1651,8 +1858,7 @@ internal_int_destroy :: proc(integers: ..^Int) {
 	integers := integers
 
 	for a in &integers {
-		raw := transmute(mem.Raw_Dynamic_Array)a.digit
-		if raw.cap > 0 {
+		if internal_int_allocated_cap(a) > 0 {
 			mem.zero_slice(a.digit[:])
 			free(&a.digit[0])
 		}
@@ -1692,7 +1898,7 @@ internal_int_set_from_integer :: proc(dest: ^Int, src: $T, minimize := false, al
 	return nil
 }
 
-internal_set :: proc { internal_int_set_from_integer, internal_int_copy }
+internal_set :: proc { internal_int_set_from_integer, internal_int_copy, int_atoi }
 
 internal_copy_digits :: #force_inline proc(dest, src: ^Int, digits: int, offset := int(0)) -> (err: Error) {
 	#force_inline internal_error_if_immutable(dest) or_return
@@ -1821,12 +2027,12 @@ internal_int_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.alloc
 	/*
 		For all n in N and n > 0, n = 0 mod 1.
 	*/
-	if internal_is_positive(a) && internal_cmp(b, 1) == 0 { return internal_zero(dest)	}
+	if internal_is_positive(a) && internal_eq(b, 1) { return internal_zero(dest)	}
 
 	/*
 		`b` cannot be negative and has to be > 1
 	*/
-	if internal_is_negative(b) && internal_cmp(b, 1) != 1 { return .Invalid_Argument }
+	if internal_is_negative(b) || internal_gt(b, 1) { return .Invalid_Argument }
 
 	/*
 		If the modulus is odd we can use a faster routine instead.
@@ -1839,9 +2045,20 @@ internal_invmod :: proc{ internal_int_inverse_modulo, }
 
 /*
 	Helpers to extract values from the `Int`.
+	Offset is zero indexed.
 */
+internal_int_bitfield_extract_bool :: proc(a: ^Int, offset: int) -> (val: bool, err: Error) {
+	limb := offset / _DIGIT_BITS
+	if limb < 0 || limb >= a.used  { return false, .Invalid_Argument }
+	i := _WORD(1 << _WORD((offset % _DIGIT_BITS)))
+	return bool(_WORD(a.digit[limb]) & i), nil
+}
+
 internal_int_bitfield_extract_single :: proc(a: ^Int, offset: int) -> (bit: _WORD, err: Error) {
-	return #force_inline int_bitfield_extract(a, offset, 1)
+	limb := offset / _DIGIT_BITS
+	if limb < 0 || limb >= a.used  { return 0, .Invalid_Argument }
+	i := _WORD(1 << _WORD((offset % _DIGIT_BITS)))
+	return 1 if ((_WORD(a.digit[limb]) & i) != 0) else 0, nil
 }
 
 internal_int_bitfield_extract :: proc(a: ^Int, offset, count: int) -> (res: _WORD, err: Error) #no_bounds_check {
@@ -1900,6 +2117,34 @@ internal_int_bitfield_extract :: proc(a: ^Int, offset, count: int) -> (res: _WOR
 }
 
 /*
+	Helpers to (un)set a bit in an Int.
+	Offset is zero indexed.
+*/
+internal_int_bitfield_set_single :: proc(a: ^Int, offset: int) -> (err: Error) {
+	limb := offset / _DIGIT_BITS
+	if limb < 0 || limb >= a.used  { return .Invalid_Argument }
+	i := DIGIT(1 << uint((offset % _DIGIT_BITS)))
+	a.digit[limb] |= i
+	return
+}
+
+internal_int_bitfield_unset_single :: proc(a: ^Int, offset: int) -> (err: Error) {
+	limb := offset / _DIGIT_BITS
+	if limb < 0 || limb >= a.used  { return .Invalid_Argument }
+	i := DIGIT(1 << uint((offset % _DIGIT_BITS)))
+	a.digit[limb] &= _MASK - i
+	return
+}
+
+internal_int_bitfield_toggle_single :: proc(a: ^Int, offset: int) -> (err: Error) {
+	limb := offset / _DIGIT_BITS
+	if limb < 0 || limb >= a.used  { return .Invalid_Argument }
+	i := DIGIT(1 << uint((offset % _DIGIT_BITS)))
+	a.digit[limb] ~= i
+	return
+}
+
+/*
 	Resize backing store.
 	We don't need to pass the allocator, because the storage itself stores it.
 
@@ -1914,23 +2159,23 @@ internal_int_shrink :: proc(a: ^Int) -> (err: Error) {
 internal_shrink :: proc { internal_int_shrink, }
 
 internal_int_grow :: proc(a: ^Int, digits: int, allow_shrink := false, allocator := context.allocator) -> (err: Error) {
-	raw := transmute(mem.Raw_Dynamic_Array)a.digit
-
 	/*
 		We need at least _MIN_DIGIT_COUNT or a.used digits, whichever is bigger.
 		The caller is asking for `digits`. Let's be accomodating.
 	*/
+	cap := internal_int_allocated_cap(a)
+
 	needed := max(_MIN_DIGIT_COUNT, a.used, digits)
 	if !allow_shrink {
-		needed = max(needed, raw.cap)
+		needed = max(needed, cap)
 	}
 
 	/*
 		If not yet iniialized, initialize the `digit` backing with the allocator we were passed.
 	*/
-	if raw.cap == 0 {
+	if cap == 0 {
 		a.digit = make([dynamic]DIGIT, needed, allocator)
-	} else if raw.cap != needed {
+	} else if cap != needed {
 		/*
 			`[dynamic]DIGIT` already knows what allocator was used for it, so resize will do the right thing.
 		*/
@@ -2558,9 +2803,27 @@ internal_int_count_lsb :: proc(a: ^Int) -> (count: int, err: Error) {
 	x: int
 	#no_bounds_check for x = 0; x < a.used && a.digit[x] == 0; x += 1 {}
 
-	q := a.digit[x]
-	x *= _DIGIT_BITS
-	x += internal_count_lsb(q)
+	when true {
+		q := a.digit[x]
+		x *= _DIGIT_BITS
+		x += internal_count_lsb(q)
+	} else {
+		lnz := []int{
+   			4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+		}
+
+		q := a.digit[x]
+		x *= _DIGIT_BITS
+		if q & 1 == 0 {
+			p: DIGIT
+			for {
+				p = q & 15
+				x += lnz[p]
+				q >>= 4
+				if p != 0 { break }
+			}
+		}		
+	}
 	return x, nil
 }
 
@@ -2583,7 +2846,7 @@ internal_int_random_digit :: proc(r: ^rnd.Rand = nil) -> (res: DIGIT) {
 	return 0 // We shouldn't get here.
 }
 
-internal_int_rand :: proc(dest: ^Int, bits: int, r: ^rnd.Rand = nil, allocator := context.allocator) -> (err: Error) {
+internal_int_random :: proc(dest: ^Int, bits: int, r: ^rnd.Rand = nil, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator
 
 	bits := bits
@@ -2608,7 +2871,7 @@ internal_int_rand :: proc(dest: ^Int, bits: int, r: ^rnd.Rand = nil, allocator :
 	dest.used = digits
 	return nil
 }
-internal_rand :: proc { internal_int_rand, }
+internal_random :: proc { internal_int_random, }
 
 /*
 	Internal helpers.
diff --git a/core/math/big/logical.odin b/core/math/big/logical.odin
index 008450319..dbcf566c8 100644
--- a/core/math/big/logical.odin
+++ b/core/math/big/logical.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -10,6 +8,7 @@ package math_big
 
 	This file contains logical operations like `and`, `or` and `xor`.
 */
+package math_big
 
 /*
 	The `and`, `or` and `xor` binops differ in two lines only.
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index f78c1894c..eb0cd644c 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -10,12 +8,15 @@ package math_big
 
 	This file contains prime finding operations.
 */
+package math_big
+
+import rnd "core:math/rand"
 
 /*
 	Determines if an Integer is divisible by one of the _PRIME_TABLE primes.
 	Returns true if it is, false if not. 
 */
-int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator) -> (res: bool, err: Error) {
+internal_int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator) -> (res: bool, err: Error) {
 	assert_if_nil(a)
 	context.allocator = allocator
 
@@ -34,186 +35,1341 @@ int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator) -> (res:
 }
 
 /*
-	Computes xR**-1 == x (mod N) via Montgomery Reduction.
+	This is a shell function that calls either the normal or Montgomery exptmod functions.
+	Originally the call to the Montgomery code was embedded in the normal function but that
+	wasted alot of stack space for nothing (since 99% of the time the Montgomery code would be called).
+
+	Computes res == G**X mod P.
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
 */
-internal_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
+internal_int_exponent_mod :: proc(res, G, X, P: ^Int, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator
+
+	dr: int
+
 	/*
-		Can the fast reduction [comba] method be used?
-		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],
-		since carries are fixed up in the inner loop.
+		Modulus P must be positive.
 	*/
-	digs := (n.used * 2) + 1
-	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
-		return _private_montgomery_reduce_comba(x, n, rho)
-	}
+	if internal_is_negative(P) { return .Invalid_Argument }
 
 	/*
-		Grow the input as required
+		If exponent X is negative we have to recurse.
 	*/
-	internal_grow(x, digs)                                           or_return
-	x.used = digs
+	if internal_is_negative(X) {
+		tmpG, tmpX := &Int{}, &Int{}
+		defer internal_destroy(tmpG, tmpX)
+
+		internal_init_multi(tmpG, tmpX) or_return
 
-	for ix := 0; ix < n.used; ix += 1 {
 		/*
-			`mu = ai * rho mod b`
-			The value of rho must be precalculated via `int_montgomery_setup()`,
-			such that it equals -1/n0 mod b this allows the following inner loop
-			to reduce the input one digit at a time.
+			First compute 1/G mod P.
 		*/
+		internal_invmod(tmpG, G, P) or_return
 
-		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK))
+		/*
+			now get |X|.
+		*/
+		internal_abs(tmpX, X) or_return
 
 		/*
-			a = a + mu * m * b**i
-			Multiply and add in place.
+			And now compute (1/G)**|X| instead of G**X [X < 0].
 		*/
-		u  := DIGIT(0)
-		iy := int(0)
-		for ; iy < n.used; iy += 1 {
-			/*
-				Compute product and sum.
-			*/
-			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]))
+		return internal_int_exponent_mod(res, tmpG, tmpX, P)
+	}
 
+	/*
+		Modified diminished radix reduction.
+	*/
+	can_reduce_2k_l := _private_int_reduce_is_2k_l(P) or_return
+	if can_reduce_2k_l {
+		return _private_int_exponent_mod(res, G, X, P, 1)
+	}
+
+	/*
+		Is it a DR modulus? default to no.
+	*/
+	dr = 1 if _private_dr_is_modulus(P) else 0
+
+	/*
+		If not, is it a unrestricted DR modulus?
+	*/
+	if dr == 0 {
+		reduce_is_2k := _private_int_reduce_is_2k(P) or_return
+		dr = 2 if reduce_is_2k else 0
+	}
+
+	/*
+		If the modulus is odd or dr != 0 use the montgomery method.
+	*/
+	if internal_int_is_odd(P) || dr != 0 {
+		return _private_int_exponent_mod(res, G, X, P, dr)
+	}
+
+	/*
+		Otherwise use the generic Barrett reduction technique.
+	*/
+	return _private_int_exponent_mod(res, G, X, P, 0)
+}
+
+/*
+	Kronecker/Legendre symbol (a|p)
+	Straightforward implementation of algorithm 1.4.10 in
+	Henri Cohen: "A Course in Computational Algebraic Number Theory"
+
+	@book{cohen2013course,
+		title={A course in computational algebraic number theory},
+		author={Cohen, Henri},
+		volume={138},
+		year={2013},
+		publisher={Springer Science \& Business Media}
+	}
+
+	Assumes `a` and `p` to not be `nil` and to have been initialized.
+*/
+internal_int_kronecker :: proc(a, p: ^Int, allocator := context.allocator) -> (kronecker: int, err: Error) {
+	context.allocator = allocator
+
+	a1, p1, r := &Int{}, &Int{}, &Int{}
+	defer internal_destroy(a1, p1, r)
+
+	table := []int{0, 1, 0, -1, 0, -1, 0, 1}
+
+	if internal_int_is_zero(p) {
+		if a.used == 1 && a.digit[0] == 1 {
+			return 1, nil
+		} else {
+			return 0, nil
+		}
+	}
+
+	if internal_is_even(a) && internal_is_even(p) {
+		return 0, nil
+	}
+
+	internal_copy(a1, a) or_return
+	internal_copy(p1, p) or_return
+
+	v := internal_count_lsb(p1) or_return
+	internal_shr(p1, p1, v) or_return
+
+	k := 1 if v & 1 == 0 else table[a.digit[0] & 7]
+
+	if internal_is_negative(p1) {
+		p1.sign = .Zero_or_Positive
+		if internal_is_negative(a1) {
+			k = -k
+		}
+	}
+
+	internal_zero(r) or_return
+
+	for {
+		if internal_is_zero(a1) {
+			if internal_eq(p1, 1) {
+				return k, nil
+			} else {
+				return 0, nil
+			}
+		}
+
+		v = internal_count_lsb(a1) or_return
+		internal_shr(a1, a1, v) or_return
+
+		if v & 1 == 1 {
+			k = k * table[p1.digit[0] & 7]
+		}
+
+		if internal_is_negative(a1) {
+			/*
+				Compute k = (-1)^((a1)*(p1-1)/4) * k.
+				a1.digit[0] + 1 cannot overflow because the MSB
+				of the DIGIT type is not set by definition.
+			 */
+			if ((a1.digit[0] + 1) & p1.digit[0] & 2) != 0 {
+				k = -k
+			}
+		} else {
 			/*
-				Get carry.
+				Compute k = (-1)^((a1-1)*(p1-1)/4) * k.
 			*/
-			u = DIGIT(r >> _DIGIT_BITS)
+			if (a1.digit[0] & p1.digit[0] & 2) != 0 {
+				k = -k
+			}
+		}
+
+		internal_copy(r, a1) or_return
+		r.sign = .Zero_or_Positive
+
+		internal_mod(a1, p1, r) or_return
+		internal_copy(p1, r)    or_return
+	}
+	return
+}
+internal_int_legendre :: internal_int_kronecker
+
+/*
+	Miller-Rabin test of "a" to the base of "b" as described in HAC pp. 139 Algorithm 4.24.
+
+	Sets result to `false` if definitely composite or `true` if probably prime.
+	Randomly the chance of error is no more than 1/4 and often very much lower.
+
+	Assumes `a` and `b` not to be `nil` and to have been initialized.
+*/
+internal_int_prime_miller_rabin :: proc(a, b: ^Int, allocator := context.allocator) -> (probably_prime: bool, err: Error) {
+	context.allocator = allocator
+
+	n1, y, r := &Int{}, &Int{}, &Int{}
+	defer internal_destroy(n1, y, r)
+
+	/*
+		Ensure `b` > 1.
+	*/
+	if internal_lte(b, 1) { return false, nil }
+
+	/*
+		Get `n1` = `a` - 1.
+	*/
+	internal_copy(n1, a) or_return
+	internal_sub(n1, n1, 1) or_return
+
+	/*
+		Set `2`**`s` * `r` = `n1`
+	*/
+	internal_copy(r, n1) or_return
+
+	/*
+		Count the number of least significant bits which are zero.
+	*/
+	s := internal_count_lsb(r) or_return
+
+	/*
+		Now divide `n` - 1 by `2`**`s`.
+	*/
+	internal_shr(r, r, s) or_return
+
+	/*
+		Compute `y` = `b`**`r` mod `a`.
+	*/
+	internal_int_exponent_mod(y, b, r, a) or_return
+
+	/*
+		If `y` != 1 and `y` != `n1` do.
+	*/
+	if !internal_eq(y, 1) && !internal_eq(y, n1) {
+		j := 1
+
+		/*
+			While `j` <= `s` - 1 and `y` != `n1`.
+		*/
+		for j <= (s - 1) && !internal_eq(y, n1) {
+			internal_sqrmod(y, y, a) or_return
 
 			/*
-				Fix digit.
+				If `y` == 1 then composite.
 			*/
-			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
+			if internal_eq(y, 1) {
+				return false, nil
+			}
+
+			j += 1
+		}
+
+		/*
+			If `y` != `n1` then composite.
+		*/
+		if !internal_eq(y, n1) {
+			return false, nil
+		}
+	}
+
+	/*
+		Probably prime now.
+	*/
+	return true, nil
+}
+
+/*
+	`a` is the big Int to test for primality.
+
+	`miller_rabin_trials` can be one of the following:
+		`< 0`:	For `a` up to 3_317_044_064_679_887_385_961_981, set `miller_rabin_trials` to negative to run a predetermined
+				number of trials for a deterministic answer.
+		`= 0`:	Run Miller-Rabin with bases 2, 3 and one random base < `a`. Non-deterministic.
+		`> 0`:	Run Miller-Rabin with bases 2, 3 and `miller_rabin_trials` number of random bases. Non-deterministic.
+
+	`miller_rabin_only`:
+		`false`	Also use either Frobenius-Underwood or Lucas-Selfridge, depending on the compile-time `MATH_BIG_USE_FROBENIUS_TEST` choice.
+		`true`	Run Rabin-Miller trials but skip Frobenius-Underwood / Lucas-Selfridge.
+
+	`r` takes a pointer to an instance of `core:math/rand`'s `Rand` and may be `nil` to use the global one.
+
+	Returns `is_prime` (bool), where:
+		`false`	Definitively composite.
+		`true`	Probably prime if `miller_rabin_trials` >= 0, with increasing certainty with more trials.
+				Deterministically prime if `miller_rabin_trials` = 0 for `a` up to 3_317_044_064_679_887_385_961_981.
+
+	Assumes `a` not to be `nil` and to have been initialized.
+*/
+internal_int_is_prime :: proc(a: ^Int, miller_rabin_trials := int(-1), miller_rabin_only := USE_MILLER_RABIN_ONLY, r: ^rnd.Rand = nil, allocator := context.allocator) -> (is_prime: bool, err: Error) {
+	context.allocator = allocator
+	miller_rabin_trials := miller_rabin_trials
+
+	// Default to `no`.
+	is_prime = false
+
+	b, res := &Int{}, &Int{}
+	defer internal_destroy(b, res)
+
+	// Some shortcuts
+	// `N` > 3
+	if a.used == 1 {
+		if a.digit[0] == 0 || a.digit[0] == 1 {
+			return
+		}
+		if a.digit[0] == 2 {
+			return true, nil
 		}
+	}
+
+	// `N` must be odd.
+	if internal_is_even(a) {
+		return
+	}
+
+	// `N` is not a perfect square: floor(sqrt(`N`))^2 != `N` 
+	if internal_int_is_square(a) or_return { return }
+
+	// Is the input equal to one of the primes in the table?
+	for p in _private_prime_table {
+		if internal_eq(a, p) {
+			return true, nil
+		}
+	}
+
+	// First perform trial division
+	if internal_int_prime_is_divisible(a) or_return { return }
+
+	// Run the Miller-Rabin test with base 2 for the BPSW test.
+	internal_set(b, 2) or_return
+	if !internal_int_prime_miller_rabin(a, b) or_return { return }
+
+	// Rumours have it that Mathematica does a second M-R test with base 3.
+	// Other rumours have it that their strong L-S test is slightly different.
+	// It does not hurt, though, beside a bit of extra runtime.
+
+	b.digit[0] += 1
+	if !internal_int_prime_miller_rabin(a, b) or_return { return }
+
+	// Both, the Frobenius-Underwood test and the the Lucas-Selfridge test are quite
+	// slow so if speed is an issue, set `USE_MILLER_RABIN_ONLY` to use M-R tests with
+	// bases 2, 3 and t random bases.
+
+	if !miller_rabin_only {
+		if miller_rabin_trials >= 0 {
+			when MATH_BIG_USE_FROBENIUS_TEST {
+				if !internal_int_prime_frobenius_underwood(a) or_return { return }
+			} else {
+				if !internal_int_prime_strong_lucas_selfridge(a) or_return { return }
+			}
+		}
+	}
+
+	// Run at least one Miller-Rabin test with a random base.
+	// Don't replace this with `min`, because we try known deterministic bases
+	//     for certain sized inputs when `miller_rabin_trials` is negative.
+	if miller_rabin_trials == 0 {
+		miller_rabin_trials = 1
+	}
+
+	// Only recommended if the input range is known to be < 3_317_044_064_679_887_385_961_981
+	// It uses the bases necessary for a deterministic M-R test if the input is	smaller than 3_317_044_064_679_887_385_961_981
+	// The caller has to check the size.
+	// TODO: can be made a bit finer grained but comparing is not free.
+
+	if miller_rabin_trials < 0 {
+		p_max := 0
+
+		// Sorenson, Jonathan; Webster, Jonathan (2015), "Strong Pseudoprimes to Twelve Prime Bases".
+
+		// 0x437ae92817f9fc85b7e5 = 318_665_857_834_031_151_167_461
+		atoi(b, "437ae92817f9fc85b7e5", 16) or_return
+		if internal_lt(a, b) {
+			p_max = 12
+		} else {
+			/* 0x2be6951adc5b22410a5fd = 3_317_044_064_679_887_385_961_981 */
+			atoi(b, "2be6951adc5b22410a5fd", 16) or_return
+			if internal_lt(a, b) {
+				p_max = 13
+			} else {
+				return false, .Invalid_Argument
+			}
+		}
+
+		// We did bases 2 and 3  already, skip them
+		for ix := 2; ix < p_max; ix += 1 {
+			internal_set(b, _private_prime_table[ix])
+			if !internal_int_prime_miller_rabin(a, b) or_return { return }
+		}
+	} else if miller_rabin_trials > 0 {
+		// Perform `miller_rabin_trials` M-R tests with random bases between 3 and "a".
+		// See Fips 186.4 p. 126ff
+
+		// The DIGITs have a defined bit-size but the size of a.digit is a simple 'int',
+		// the size of which can depend on the platform.
+		size_a := internal_count_bits(a)
+		mask   := (1 << uint(ilog2(size_a))) - 1
 
 		/*
-			At this point the ix'th digit of x should be zero.
-			Propagate carries upwards as required.
+			Assuming the General Rieman hypothesis (never thought to write that in a
+			comment) the upper bound can be lowered to  2*(log a)^2.
+			E. Bach, "Explicit bounds for primality testing and related problems,"
+			Math. Comp. 55 (1990), 355-380.
+
+				size_a = (size_a/10) * 7;
+				len = 2 * (size_a * size_a);
+
+			E.g.: a number of size 2^2048 would be reduced to the upper limit
+
+				floor(2048/10)*7 = 1428
+				2 * 1428^2       = 4078368
+
+			(would have been ~4030331.9962 with floats and natural log instead)
+			That number is smaller than 2^28, the default bit-size of DIGIT on 32-bit platforms.
 		*/
-		for u != 0 {
-			x.digit[ix + iy] += u
-			u = x.digit[ix + iy] >> _DIGIT_BITS
-			x.digit[ix + iy] &= _MASK
-			iy += 1
+
+		/*
+			How many tests, you might ask? Dana Jacobsen of Math::Prime::Util fame
+			does exactly 1. In words: one. Look at the end of _GMP_is_prime() in
+			Math-Prime-Util-GMP-0.50/primality.c if you do not believe it.
+
+			The function rand() goes to some length to use a cryptographically
+			good PRNG. That also means that the chance to always get the same base
+			in the loop is non-zero, although very low.
+			-- NOTE(Jeroen): This is not yet true in Odin, but I have some ideas.
+
+			If the BPSW test and/or the addtional Frobenious test have been
+			performed instead of just the Miller-Rabin test with the bases 2 and 3,
+			a single extra test should suffice, so such a very unlikely event will not do much harm.
+
+			To preemptivly answer the dangling question: no, a witness does not	need to be prime.
+		*/
+		for ix := 0; ix < miller_rabin_trials; ix += 1 {
+
+			// rand() guarantees the first digit to be non-zero
+			internal_random(b, _DIGIT_TYPE_BITS, r) or_return
+
+			// Reduce digit before casting because DIGIT might be bigger than
+			// an unsigned int and "mask" on the other side is most probably not.
+			l: int
+
+			fips_rand := (uint)(b.digit[0] & DIGIT(mask))
+			if fips_rand > (uint)(max(int) - _DIGIT_BITS) {
+				l = max(int) / _DIGIT_BITS
+			} else {
+				l = (int(fips_rand) + _DIGIT_BITS) / _DIGIT_BITS
+			}
+
+			// Unlikely.
+			if (l < 0) {
+				ix -= 1
+				continue
+			}
+			internal_random(b, l) or_return
+
+			// That number might got too big and the witness has to be smaller than "a"
+			l = internal_count_bits(b)
+			if l >= size_a {
+				l = (l - size_a) + 1
+				internal_shr(b, b, l) or_return
+			}
+
+			// Although the chance for b <= 3 is miniscule, try again.
+			if internal_lte(b, 3) {
+				ix -= 1
+				continue
+			}
+			if !internal_int_prime_miller_rabin(a, b) or_return { return }
+		}
+	}
+
+	// Passed the test.
+	return true, nil
+}
+
+/*
+ * floor of positive solution of (2^16) - 1 = (a + 4) * (2 * a + 5)
+ * TODO: Both values are smaller than N^(1/4), would have to use a bigint
+ *       for `a` instead, but any `a` bigger than about 120 are already so rare that
+ *       it is possible to ignore them and still get enough pseudoprimes.
+ *       But it is still a restriction of the set of available pseudoprimes
+ *       which makes this implementation less secure if used stand-alone.
+ */
+_FROBENIUS_UNDERWOOD_A :: 32764
+
+internal_int_prime_frobenius_underwood :: proc(N: ^Int, allocator := context.allocator) -> (result: bool, err: Error) {
+	context.allocator = allocator
+
+	T1z, T2z, Np1z, sz, tz := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(T1z, T2z, Np1z, sz, tz)
+
+	internal_init_multi(T1z, T2z, Np1z, sz, tz) or_return
+
+	a, ap2: int
+
+	frob: for a = 0; a < _FROBENIUS_UNDERWOOD_A; a += 1 {
+		switch a {
+		case 2, 4, 7, 8, 10, 14, 18, 23, 26, 28:
+			continue frob
+		}
+
+		internal_set(T1z, i32((a * a) - 4))
+		j := internal_int_kronecker(T1z, N) or_return
+
+		switch j {
+		case -1: break frob
+		case  0: return false, nil
+		}
+	}
+
+	// Tell it a composite and set return value accordingly.
+	if a >= _FROBENIUS_UNDERWOOD_A { return false, .Max_Iterations_Reached }
+
+	// Composite if N and (a+4)*(2*a+5) are not coprime.
+	internal_set(T1z, u32((a + 4) * ((2 * a) + 5)))
+	internal_int_gcd(T1z, T1z, N) or_return
+
+	if !(T1z.used == 1 && T1z.digit[0] == 1) {
+		// Composite.
+		return false, nil
+	}
+
+	ap2 = a + 2
+	internal_add(Np1z, N, 1) or_return
+
+	internal_set(sz, 1) or_return
+	internal_set(tz, 2) or_return
+
+	for i := internal_count_bits(Np1z) - 2; i >= 0; i -= 1 {
+		// temp = (sz * (a * sz + 2 * tz)) % N;
+		// tz   = ((tz - sz) * (tz + sz)) % N;
+		// sz   = temp;
+
+		internal_int_shl1(T2z, tz) or_return
+
+		// a = 0 at about 50% of the cases (non-square and odd input)
+		if a != 0 {
+			internal_mul(T1z, sz, DIGIT(a)) or_return
+			internal_add(T2z, T2z, T1z) or_return
+		}
+
+		internal_mul(T1z, T2z, sz) or_return
+		internal_sub(T2z, tz, sz) or_return
+		internal_add(sz, sz, tz) or_return
+		internal_mul(tz, sz, T2z) or_return
+		internal_mod(tz, tz, N) or_return
+		internal_mod(sz, T1z, N) or_return
+
+		if bit, _ := internal_int_bitfield_extract_bool(Np1z, i); bit {
+			// temp = (a+2) * sz + tz
+			// tz   = 2 * tz - sz
+			// sz   = temp
+			if a == 0 {
+				internal_int_shl1(T1z, sz) or_return
+			} else {
+				internal_mul(T1z, sz, DIGIT(ap2)) or_return
+			}
+			internal_add(T1z, T1z, tz) or_return
+			internal_int_shl1(T2z, tz) or_return
+			internal_sub(tz, T2z, sz)
+			internal_swap(sz, T1z)
+		}
+	}
+
+	internal_set(T1z, u32((2 * a) + 5)) or_return
+	internal_mod(T1z, T1z, N) or_return
+
+	result = internal_is_zero(sz) && internal_eq(tz, T1z)
+
+	return
+}
+
+
+/*
+	Strong Lucas-Selfridge test.
+	returns true if it is a strong L-S prime, false if it is composite
+
+	Code ported from Thomas Ray Nicely's implementation of the BPSW test at http://www.trnicely.net/misc/bpsw.html
+
+	Freeware copyright (C) 2016 Thomas R. Nicely <http://www.trnicely.net>.
+	Released into the public domain by the author, who disclaims any legal liability arising from its use.
+
+	The multi-line comments are made by Thomas R. Nicely and are copied verbatim.
+	(If that name sounds familiar, he is the guy who found the fdiv bug in the Pentium CPU.)
+*/
+internal_int_prime_strong_lucas_selfridge :: proc(a: ^Int, allocator := context.allocator) -> (lucas_selfridge: bool, err: Error) {
+	// TODO: choose better variable names!
+
+	Dz, gcd, Np1, Uz, Vz, U2mz, V2mz, Qmz, Q2mz, Qkdz, T1z, T2z, T3z, T4z, Q2kdz := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(Dz, gcd, Np1, Uz, Vz, U2mz, V2mz, Qmz, Q2mz, Qkdz, T1z, T2z, T3z, T4z, Q2kdz)
+
+	/*
+		Find the first element D in the sequence {5, -7, 9, -11, 13, ...}
+		such that Jacobi(D,N) = -1 (Selfridge's algorithm). Theory
+		indicates that, if N is not a perfect square, D will "nearly
+		always" be "small." Just in case, an overflow trap for D is	included.
+	*/
+	internal_init_multi(Dz, gcd, Np1, Uz, Vz, U2mz, V2mz, Qmz, Q2mz, Qkdz, T1z, T2z, T3z, T4z, Q2kdz) or_return
+
+	D    := 5
+	sign := 1
+	Ds   : int
+
+	for {
+		Ds   = sign * D
+		sign = -sign
+
+		internal_set(Dz, D) or_return
+		internal_int_gcd(gcd, a, Dz) or_return
+
+		/*
+			If 1 < GCD < `N` then `N` is composite with factor "D", and
+			Jacobi(D, N) is technically undefined (but often returned as zero).
+		*/
+		if internal_gt(gcd, 1) && internal_lt(gcd, a)    { return }
+		if Ds < 0 { Dz.sign = .Negative }
+
+		j := internal_int_kronecker(Dz, a) or_return
+		if j == -1 { break }
+
+		D += 2
+		if D > max(int) - 2                              { return false, .Invalid_Argument }
+	}
+
+	Q := (1 - Ds) / 4   /* Required so D = P*P - 4*Q */
+
+	/*
+		NOTE: The conditions (a) N does not divide Q, and
+		(b) D is square-free or not a perfect square, are included by
+		some authors; e.g., "Prime numbers and computer methods for
+		factorization," Hans Riesel (2nd ed., 1994, Birkhauser, Boston),
+		p. 130. For this particular application of Lucas sequences,
+		these conditions were found to be immaterial.
+	*/
+
+	/*
+		Now calculate N - Jacobi(D,N) = N + 1 (even), and calculate the
+		odd positive integer d and positive integer s for which
+		N + 1 = 2^s*d (similar to the step for N - 1 in Miller's test).
+		The strong Lucas-Selfridge test then returns N as a strong
+		Lucas probable prime (slprp) if any of the following
+		conditions is met: U_d=0, V_d=0, V_2d=0, V_4d=0, V_8d=0,
+		V_16d=0, ..., etc., ending with V_{2^(s-1)*d}=V_{(N+1)/2}=0
+		(all equalities mod N). Thus d is the highest index of U that
+		must be computed (since V_2m is independent of U), compared
+		to U_{N+1} for the standard Lucas-Selfridge test; and no
+		index of V beyond (N+1)/2 is required, just as in the
+		standard Lucas-Selfridge test. However, the quantity Q^d must
+		be computed for use (if necessary) in the latter stages of
+		the test. The result is that the strong Lucas-Selfridge test
+		has a running time only slightly greater (order of 10 %) than
+		that of the standard Lucas-Selfridge test, while producing
+		only (roughly) 30 % as many pseudoprimes (and every strong
+		Lucas pseudoprime is also a standard Lucas pseudoprime). Thus
+		the evidence indicates that the strong Lucas-Selfridge test is
+		more effective than the standard Lucas-Selfridge test, and a
+		Baillie-PSW test based on the strong Lucas-Selfridge test
+		should be more reliable.
+	*/
+	internal_add(Np1, a, 1) or_return
+	s := internal_count_lsb(Np1) or_return
+
+	/*
+		This should round towards zero because Thomas R. Nicely used GMP's mpz_tdiv_q_2exp()
+		and mp_div_2d() is equivalent. Additionally: dividing an even number by two does not produce
+		any leftovers.
+	*/
+	internal_int_shr(Dz, Np1, s) or_return
+
+	/*
+		We must now compute U_d and V_d. Since d is odd, the accumulated
+		values U and V are initialized to U_1 and V_1 (if the target
+		index were even, U and V would be initialized instead to U_0=0
+		and V_0=2). The values of U_2m and V_2m are also initialized to
+		U_1 and V_1; the FOR loop calculates in succession U_2 and V_2,
+		U_4 and V_4, U_8 and V_8, etc. If the corresponding bits
+		(1, 2, 3, ...) of t are on (the zero bit having been accounted
+		for in the initialization of U and V), these values are then
+		combined with the previous totals for U and V, using the
+		composition formulas for addition of indices.
+	*/
+	internal_set(Uz,   1) or_return
+	internal_set(Vz,   1) or_return //	P := 1; /* Selfridge's choice */
+	internal_set(U2mz, 1) or_return
+	internal_set(V2mz, 1) or_return //	P := 1; /* Selfridge's choice */
+	internal_set(Qmz,  Q) or_return
+
+	internal_int_shl1(Q2mz, Qmz) or_return
+
+	/*
+		Initializes calculation of Q^d.
+	*/
+	internal_set(Qkdz, Q) or_return
+	Nbits := internal_count_bits(Dz)
+
+	for u := 1; u < Nbits; u += 1 { /* zero bit off, already accounted for */
+		/*
+			Formulas for doubling of indices (carried out mod N). Note that
+			the indices denoted as "2m" are actually powers of 2, specifically
+			2^(ul-1) beginning each loop and 2^ul ending each loop.
+			U_2m = U_m*V_m
+			V_2m = V_m*V_m - 2*Q^m
+		*/
+		internal_mul(U2mz, U2mz, V2mz) or_return
+		internal_mod(U2mz, U2mz, a) or_return
+		internal_sqr(V2mz, V2mz) or_return
+		internal_sub(V2mz, V2mz, Q2mz) or_return
+		internal_mod(V2mz, V2mz, a) or_return
+
+		/*
+			Must calculate powers of Q for use in V_2m, also for Q^d later.
+		*/
+		internal_sqr(Qmz, Qmz) or_return
+
+		/* Prevents overflow. Still necessary without a fixed prealloc'd mem.? */
+		internal_mod(Qmz, Qmz, a) or_return
+		internal_int_shl1(Q2mz, Qmz) or_return
+
+		if internal_int_bitfield_extract_bool(Dz, u) or_return {
+			/*
+				Formulas for addition of indices (carried out mod N);
+				U_(m+n) = (U_m*V_n + U_n*V_m)/2
+				V_(m+n) = (V_m*V_n + D*U_m*U_n)/2
+				Be careful with division by 2 (mod N)!
+			*/
+			internal_mul(T1z, U2mz, Vz) or_return
+			internal_mul(T2z, Uz, V2mz) or_return
+			internal_mul(T3z, V2mz, Vz) or_return
+			internal_mul(T4z, U2mz, Uz) or_return
+			internal_mul(T4z, T4z,  Ds) or_return
+
+			internal_add(Uz,  T1z, T2z) or_return
+
+			if internal_is_odd(Uz) {
+				internal_add(Uz, Uz, a) or_return
+			}
+
+			/*
+				This should round towards negative infinity because Thomas R. Nicely used GMP's mpz_fdiv_q_2exp().
+				But `internal_shr1` does not do so, it is truncating instead.
+			*/
+			oddness := internal_is_odd(Uz)
+			internal_int_shr1(Uz, Uz) or_return
+			if internal_is_negative(Uz) && oddness {
+				internal_sub(Uz, Uz, 1) or_return
+			}
+			internal_add(Vz, T3z, T4z) or_return
+			if internal_is_odd(Vz) {
+				internal_add(Vz, Vz, a) or_return
+			}
+
+			oddness  = internal_is_odd(Vz)
+			internal_int_shr1(Vz, Vz) or_return
+			if internal_is_negative(Vz) && oddness {
+				internal_sub(Vz, Vz, 1) or_return
+			}
+			internal_mod(Uz, Uz, a) or_return
+			internal_mod(Vz, Vz, a) or_return
+
+			/* Calculating Q^d for later use */
+			internal_mul(Qkdz, Qkdz, Qmz) or_return
+			internal_mod(Qkdz, Qkdz, a) or_return
 		}
 	}
 
 	/*
-		At this point the n.used'th least significant digits of x are all zero,
-		which means we can shift x to the right by n.used digits and the
-		residue is unchanged.
+		If U_d or V_d is congruent to 0 mod N, then N is a prime or a strong Lucas pseudoprime. */
+	if internal_is_zero(Uz) || internal_is_zero(Vz) {
+		return true, nil
+	}
 
-		x = x/b**n.used.
+	/*
+		NOTE: Ribenboim ("The new book of prime number records," 3rd ed.,
+		1995/6) omits the condition V0 on p.142, but includes it on
+		p. 130. The condition is NECESSARY; otherwise the test will
+		return false negatives---e.g., the primes 29 and 2000029 will be
+		returned as composite.
 	*/
-	internal_clamp(x)
-	internal_shr_digit(x, n.used)
 
 	/*
-		if x >= n then x = x - n
+		Otherwise, we must compute V_2d, V_4d, V_8d, ..., V_{2^(s-1)*d}
+		by repeated use of the formula V_2m = V_m*V_m - 2*Q^m. If any of
+		these are congruent to 0 mod N, then N is a prime or a strong
+		Lucas pseudoprime.
 	*/
-	if internal_cmp_mag(x, n) != -1 {
-		return internal_sub(x, x, n)
+
+	/* Initialize 2*Q^(d*2^r) for V_2m */
+	internal_int_shr1(Q2kdz, Qkdz) or_return
+
+	for r := 1; r < s; r += 1 {
+		internal_sqr(Vz, Vz) or_return
+		internal_sub(Vz, Vz, Q2kdz) or_return
+		internal_mod(Vz, Vz, a) or_return
+		if internal_is_zero(Vz) {
+			return true, nil
+		}
+		/* Calculate Q^{d*2^r} for next r (final iteration irrelevant). */
+		if r < (s - 1) {
+			internal_sqr(Qkdz, Qkdz) or_return
+			internal_mod(Qkdz, Qkdz, a) or_return
+			internal_int_shl1(Q2kdz, Qkdz) or_return
+		}
 	}
+	return false, nil
+}
+
+/*
+	Performs one Fermat test.
+
+	If "a" were prime then b**a == b (mod a) since the order of
+	the multiplicative sub-group would be phi(a) = a-1.  That means
+	it would be the same as b**(a mod (a-1)) == b**1 == b (mod a).
+
+	Returns `true` if the congruence holds, or `false` otherwise.
+
+	Assumes `a` and `b` not to be `nil` and to have been initialized.
+*/
+internal_prime_fermat :: proc(a, b: ^Int, allocator := context.allocator) -> (fermat: bool, err: Error) {
+	t := &Int{}
+	defer internal_destroy(t)
+
+	/*
+		Ensure `b` > 1.
+	*/
+	if !internal_gt(b, 1) { return false, .Invalid_Argument }
+
+	/*
+		Compute `t` = `b`**`a` mod `a`
+	*/
+	internal_int_exponent_mod(t, b, a, a) or_return
 
-	return nil
+	/*
+		Is it equal to b?
+	*/
+	fermat = internal_eq(t, b)
+	return
 }
 
-int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
-	assert_if_nil(x, n)
+/*
+	Tonelli-Shanks algorithm
+	https://en.wikipedia.org/wiki/Tonelli%E2%80%93Shanks_algorithm
+	https://gmplib.org/list-archives/gmp-discuss/2013-April/005300.html
+*/
+internal_int_sqrtmod_prime :: proc(res, n, prime: ^Int, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator
 
-	internal_clear_if_uninitialized(x, n) or_return
+	/*
+		The type is "int" because of the types in the mp_int struct.
+		Don't forget to change them here when you change them there!
+	*/
+	S, M, i: int
+
+	t1, C, Q, Z, T, R, two := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(t1, C, Q, Z, T, R, two)
+
+	/*
+		First handle the simple cases.
+	*/
+	if internal_is_zero(n)                                           { return internal_zero(res)	}
+
+	/*
+		"prime" must be odd and > 2
+	*/
+	if internal_is_even(prime) || internal_lt(prime, 3)              { return .Invalid_Argument }
+	legendre := internal_int_kronecker(n, prime)                     or_return
+
+	/*
+		n \not\cong 0 (mod p) and n \cong r^2 (mod p) for some r \in N^+
+	*/
+	if legendre != 1                                                 { return .Invalid_Argument }
+
+	internal_init_multi(t1, C, Q, Z, T, R, two)                      or_return
+
+	/*
+		SPECIAL CASE: if prime mod 4 == 3
+		compute directly: err = n^(prime+1)/4 mod prime
+		Handbook of Applied Cryptography algorithm 3.36
+
+		x%4 == x&3 for x in N and x>0
+	*/
+	if prime.digit[0] & 3 == 3 {
+		internal_add(t1, prime, 1)                                   or_return
+		internal_shr(t1, t1, 2)                                      or_return
+		internal_int_exponent_mod(res, n, t1, prime)                 or_return
+		return
+	}
 
-	return #force_inline internal_int_montgomery_reduce(x, n, rho)
+	/*
+		NOW: Tonelli-Shanks algorithm
+		Factor out powers of 2 from prime-1, defining Q and S as: prime-1 = Q*2^S
+
+		Q = prime - 1
+	*/
+	internal_copy(Q, prime)                                          or_return
+	internal_sub(Q, Q, 1)                                            or_return
+
+	/*
+		S = 0
+	*/
+	S = 0
+	for internal_is_even(Q) {
+		/*
+			Q = Q / 2
+		*/
+		internal_int_shr1(Q, Q)                                      or_return
+		/*
+			S = S + 1
+		*/
+		S += 1
+	}
+
+	/*
+		Find a `Z` such that the Legendre symbol (Z|prime) == -1.
+		Z = 2.
+	*/
+	internal_set(Z, 2)                                               or_return
+
+	for {
+		legendre = internal_int_kronecker(Z, prime)                  or_return
+
+		/*
+			If "prime" (p) is an odd prime Jacobi(k|p) = 0 for k \cong 0 (mod p)
+			but there is at least one non-quadratic residue before k>=p if p is an odd prime.
+		*/
+		if legendre == 0                                             { return .Invalid_Argument }
+		if legendre == -1                                            { break }
+
+		/*
+			Z = Z + 1
+		*/
+		internal_add(Z, Z, 1)                                        or_return
+	}
+
+	/*
+		C = Z ^ Q mod prime
+	*/
+	internal_int_exponent_mod(C, Z, Q, prime)                        or_return
+
+	/*
+		t1 = (Q + 1) / 2
+	*/
+	internal_add(t1, Q, 1)                                           or_return
+	internal_int_shr1(t1, t1)                                        or_return
+
+	/*
+		R = n ^ ((Q + 1) / 2) mod prime
+	*/
+	internal_int_exponent_mod(R, n, t1, prime)                       or_return
+
+	/*
+		T = n ^ Q mod prime
+	*/
+	internal_int_exponent_mod(T, n, Q, prime)                        or_return
+
+	/*
+		M = S
+	*/
+	M = S
+	internal_set(two, 2)
+
+	for {
+		internal_copy(t1, T)                                         or_return
+
+		i = 0
+		for {
+			if internal_eq(T, 1)                                     { break }
+
+			/*
+				No exponent in the range 0 < i < M found.
+				(M is at least 1 in the first round because "prime" > 2)
+			*/
+			if M == i                                                { return .Invalid_Argument }
+			internal_int_exponent_mod(t1, t1, two, prime)            or_return
+
+			i += 1
+		}
+
+		if i == 0 {
+			internal_copy(res, R)                                    or_return
+		}
+
+		/*
+			t1 = 2 ^ (M - i - 1)
+		*/
+		internal_set(t1, M - i - 1)                                  or_return
+		internal_int_exponent_mod(t1, two, t1, prime)                or_return
+
+		/*
+			t1 = C ^ (2 ^ (M - i - 1)) mod prime
+		*/
+		internal_int_exponent_mod(t1, C, t1, prime)                  or_return
+
+		/*
+			C = (t1 * t1) mod prime
+		*/
+		internal_sqrmod(C, t1, prime)                                or_return
+
+		/*
+			R = (R * t1) mod prime
+		*/
+		internal_mulmod(R, R, t1, prime)                             or_return
+
+		/*
+			T = (T * C) mod prime
+		*/
+		mulmod(T, T, C, prime)                                       or_return
+
+		/*
+			M = i
+		*/
+		M = i
+	}
+
+	return
 }
 
 /*
-	Shifts with subtractions when the result is greater than b.
-
-	The method is slightly modified to shift B unconditionally upto just under
-	the leading bit of b.  This saves alot of multiple precision shifting.
+	Finds the next prime after the number `a` using `t` trials of Miller-Rabin,
+	in place: It sets `a` to the prime found.
+	`bbs_style` = true means the prime must be congruent to 3 mod 4
 */
-internal_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+internal_int_prime_next_prime :: proc(a: ^Int, trials: int, bbs_style: bool, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator
+
+	res_tab := [_PRIME_TAB_SIZE]DIGIT{}
+
+	/*
+		Force positive.
+	*/
+	a.sign = .Zero_or_Positive
+
+	/*
+		Simple algo if `a` is less than the largest prime in the table.
+	*/
+	if internal_lt(a, _private_prime_table[_PRIME_TAB_SIZE - 1]) {
+		/*
+			Find which prime it is bigger than `a`
+		*/
+		for p in _private_prime_table {
+			cmp := internal_cmp(a, p)
+
+			if cmp == 0 { continue }
+			if cmp != 1 {
+				if bbs_style && (p & 3 != 3) {
+					/*
+						Try again until we get a prime congruent to 3 mod 4.
+					*/
+					continue
+				} else {
+					return internal_set(a, p)
+				}
+			}
+		}
+		/*
+			Fall through to the sieve.
+		*/
+	}
+
 	/*
-		How many bits of last digit does b use.
+		Generate a prime congruent to 3 mod 4 or 1/3 mod 4?
 	*/
-	bits := internal_count_bits(b) % _DIGIT_BITS
+	kstep: DIGIT = 4 if bbs_style else 2
 
-	if b.used > 1 {
-		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1
-		internal_int_power_of_two(a, power)                          or_return
+	/*
+		At this point we will use a combination of a sieve and Miller-Rabin.
+	*/
+	if bbs_style {
+		/*
+			If `a` mod 4 != 3 subtract the correct value to make it so.
+		*/
+		if a.digit[0] & 3 != 3 {
+			internal_sub(a, a, (a.digit[0] & 3) + 1) or_return
+		}
 	} else {
-		internal_one(a)
-		bits = 1
+		if internal_is_even(a) {
+			/*
+				Force odd.
+			*/
+			internal_sub(a, a, 1) or_return
+		}
 	}
 
 	/*
-		Now compute C = A * B mod b.
+		Generate the restable.
 	*/
-	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
-		internal_int_shl1(a, a)                                      or_return
-		if internal_cmp_mag(a, b) != -1 {
-			internal_sub(a, a, b)                                    or_return
+	for x := 1; x < _PRIME_TAB_SIZE; x += 1 {
+		res_tab = internal_mod(a, _private_prime_table[x]) or_return
+	}
+
+	for {
+		step := DIGIT(0)
+		y: bool
+
+		/*
+			Skip to the next non-trivially divisible candidate.
+		*/
+		for {
+			/*
+				y == true if any residue was zero [e.g. cannot be prime]
+			*/
+			y = false
+
+			/*
+				Increase step to next candidate.
+			*/
+			step += kstep
+
+			/*
+				Compute the new residue without using division.
+			*/
+			for x := 1; x < _PRIME_TAB_SIZE; x += 1 {
+				/*
+					Add the step to each residue.
+				*/
+				res_tab[x] += kstep
+
+				/*
+					Subtract the modulus [instead of using division].
+				*/
+				if res_tab[x] >= _private_prime_table[x] {
+					res_tab[x] -= _private_prime_table[x]
+				}
+
+				/*
+					Set flag if zero.
+				*/
+				if res_tab[x] == 0 {
+					y = true
+				}
+			}
+			if !(y && (step < (((1 << _DIGIT_BITS) - kstep)))) { break }
 		}
+
+		/*
+			Add the step.
+		*/
+		internal_add(a, a, step) or_return
+
+		/*
+			If we didn't pass the sieve and step == MP_MAX then skip test */
+		if (y && (step >= ((1 << _DIGIT_BITS) - kstep))) { continue }
+
+		if internal_int_is_prime(a, trials) or_return { break }
 	}
-	return nil
+	return
 }
 
-int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	assert_if_nil(a, b)
+/*
+	Makes a truly random prime of a given size (bits),
+
+	Flags are as follows:
+	 	Blum_Blum_Shub    - Make prime congruent to 3 mod 4
+		Safe              - Make sure (p-1)/2 is prime as well (implies .Blum_Blum_Shub)
+		Second_MSB_On     - Make the 2nd highest bit one
+
+	This is possibly the mother of all prime generation functions, muahahahahaha!
+*/
+internal_random_prime :: proc(a: ^Int, size_in_bits: int, trials: int, flags := Primality_Flags{}, r: ^rnd.Rand = nil, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator
+	flags  := flags
+	trials := trials
 
-	internal_clear_if_uninitialized(a, b) or_return
+	t := &Int{}
+	defer internal_destroy(t)
+
+	/*
+		Sanity check the input.
+	*/
+	if size_in_bits <= 1 || trials < -1                              { return .Invalid_Argument }
 
-	return #force_inline internal_int_montgomery_calc_normalization(a, b)
+	/*
+		`.Safe` implies `.Blum_Blum_Shub`.
+	*/
+	if .Safe in flags {
+		if size_in_bits < 3 {
+			/*
+				The smallest safe prime is 5, which takes 3 bits.
+				We early out now, else we'd be locked in an infinite loop trying to generate a 2-bit Safe Prime.
+			*/
+			return .Invalid_Argument
+		}
+		flags += { .Blum_Blum_Shub, }
+	}
+
+	/*
+		Automatically choose the number of Rabin-Miller trials?
+	*/
+	if trials == -1 {
+		trials = number_of_rabin_miller_trials(size_in_bits)
+	}
+
+	res: bool
+	RANDOM_PRIME_ITERATIONS_USED = 0
+
+	for {
+		if MAX_ITERATIONS_RANDOM_PRIME > 0 {
+			RANDOM_PRIME_ITERATIONS_USED += 1
+			if RANDOM_PRIME_ITERATIONS_USED > MAX_ITERATIONS_RANDOM_PRIME {
+				return .Max_Iterations_Reached
+			}
+		}
+
+		internal_int_random(a, size_in_bits)                         or_return
+
+		/*
+			Make sure it's odd.
+		*/
+		if size_in_bits > 2 {
+			a.digit[0] |= 1
+		} else {
+			/*
+				A 2-bit prime can be either 2 (0b10) or 3 (0b11).
+				So, let's force the top bit to 1 and return early.
+			*/
+			a.digit[0] |= 2
+			return nil
+		}
+
+		if .Blum_Blum_Shub in flags {
+			a.digit[0] |= 3
+		}
+		if .Second_MSB_On in flags {
+			internal_int_bitfield_set_single(a, size_in_bits - 2) or_return
+		}
+
+		/*
+			Is it prime?
+		*/
+		res = internal_int_is_prime(a, trials)                       or_return
+
+		if (!res) {
+			continue
+		}
+
+		if .Safe in flags {
+			/*
+				See if (a-1)/2 is prime.
+			*/
+			internal_sub(a, a, 1)                                    or_return
+			internal_int_shr1(a, a)                                  or_return
+
+			/*
+				Is it prime?
+			*/
+			res = internal_int_is_prime(a, trials)                   or_return
+		}
+		if res { break }
+	}
+
+	if .Safe in flags {
+		/*
+			Restore a to the original value.
+		*/
+		internal_int_shl1(a, a)                                      or_return
+		internal_add(a, a, 1)                                        or_return
+	}
+	return
 }
 
 /*
-	Sets up the Montgomery reduction stuff.
+	Extended Euclidean algorithm of (a, b) produces `a * u1` + `b * u2` = `u3`.
 */
-internal_int_montgomery_setup :: proc(n: ^Int) -> (rho: DIGIT, err: Error) {
+internal_int_extended_euclidean :: proc(a, b, U1, U2, U3: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	u1, u2, u3, v1, v2, v3, t1, t2, t3, q, tmp := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(u1, u2, u3, v1, v2, v3, t1, t2, t3, q, tmp)
+	internal_init_multi(u1, u2, u3, v1, v2, v3, t1, t2, t3, q, tmp)  or_return
+
 	/*
-		Fast inversion mod 2**k
-		Based on the fact that:
+		Initialize, (u1, u2, u3) = (1, 0, a).
+	*/
+	internal_set(u1, 1)                                              or_return
+	internal_set(u3, a)                                              or_return
 
-		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
-		                  =>  2*X*A - X*X*A*A = 1
-		                  =>  2*(1) - (1)     = 1
+	/*
+		Initialize, (v1, v2, v3) = (0, 1, b).
 	*/
-	b := n.digit[0]
-	if b & 1 == 0 { return 0, .Invalid_Argument }
+	internal_set(v2, 1)                                              or_return
+	internal_set(v3, b)                                              or_return
+
+	/*
+		Loop while v3 != 0
+	*/
+	for !internal_is_zero(v3) {
+		/*
+			q = u3 / v3
+		*/
+		internal_div(q, u3, v3)                                      or_return
+
+		/*
+			(t1, t2, t3) = (u1, u2, u3) - (v1, v2, v3)q
+		*/
+		internal_mul(tmp, v1, q)                                     or_return
+		internal_sub( t1, u1, tmp)                                   or_return
+
+		internal_mul(tmp, v2, q)                                     or_return
+		internal_sub( t2, u2, tmp)                                   or_return
 
-	x := (((b + 2) & 4) << 1) + b /* here x*a==1 mod 2**4 */
-	x *= 2 - (b * x)              /* here x*a==1 mod 2**8 */
-	x *= 2 - (b * x)              /* here x*a==1 mod 2**16 */
-	when _WORD_TYPE_BITS == 64 {
-		x *= 2 - (b * x)              /* here x*a==1 mod 2**32 */
-		x *= 2 - (b * x)              /* here x*a==1 mod 2**64 */
+		internal_mul(tmp, v3, q)                                     or_return
+		internal_sub( t3, u3, tmp)                                   or_return
+
+		/*
+			(u1, u2, u3) = (v1, v2, v3)
+		*/
+		internal_set(u1, v1)                                         or_return
+		internal_set(u2, v2)                                         or_return
+		internal_set(u3, v3)                                         or_return
+
+		/*
+			(v1, v2, v3) = (t1, t2, t3)
+		*/
+		internal_set(v1, t1)                                         or_return
+		internal_set(v2, t2)                                         or_return
+		internal_set(v3, t3)                                         or_return
 	}
 
 	/*
-		rho = -1/m mod b
+		Make sure U3 >= 0.
 	*/
-	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK))
-	return rho, nil
-}
-
-int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
-	assert_if_nil(n)
-	internal_clear_if_uninitialized(n, allocator) or_return
+	if internal_is_negative(u3) {
+		internal_neg(u1, u1)                                         or_return
+		internal_neg(u2, u2)                                         or_return
+		internal_neg(u3, u3)                                         or_return
+	}
 
-	return #force_inline internal_int_montgomery_setup(n)
+	/*
+		Copy result out.
+	*/
+	if U1 != nil {
+		internal_swap(u1, U1)
+	}
+	if U2 != nil {
+		internal_swap(u2, U2)
+	}
+	if U3 != nil {
+		internal_swap(u3, U3)
+	}
+	return
 }
 
+
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
 */
 number_of_rabin_miller_trials :: proc(bit_size: int) -> (number_of_trials: int) {
 	switch {
 	case bit_size <=    80:
-		return - 1		/* Use deterministic algorithm for size <= 80 bits */
+		return -1		/* Use deterministic algorithm for size <= 80 bits */
 	case bit_size >=    81 && bit_size <     96:
 		return 37		/* max. error = 2^(-96)  */
 	case bit_size >=    96 && bit_size <    128:
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index 241cd7441..b3d4d80e5 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -17,6 +15,7 @@ package math_big
 
 	These aren't exported for the same reasons.
 */
+package math_big
 
 import "core:intrinsics"
 import "core:mem"
@@ -1072,11 +1071,11 @@ _private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^In
 
 	internal_shl_digit(y, n - t) or_return
 
-	c := internal_cmp(x, y)
-	for c != -1 {
+	gte := internal_gte(x, y)
+	for gte {
 		q.digit[n - t] += 1
 		internal_sub(x, x, y) or_return
-		c = internal_cmp(x, y)
+		gte = internal_gte(x, y)
 	}
 
 	/*
@@ -1135,7 +1134,7 @@ _private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^In
 			t2.digit[2] = x.digit[i]
 			t2.used = 3
 
-			if t1_t2 := internal_cmp_mag(t1, t2); t1_t2 != 1 {
+			if internal_lte(t1, t2) {
 				break
 			}
 			iter += 1; if iter > 100 {
@@ -1228,15 +1227,13 @@ _private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := con
 	/*
 		While A1 < 0 do Q1 = Q1 - 1, A1 = A1 + (beta^k * B)
 	*/
-	if internal_cmp(A1, 0) == -1 {
+	if internal_lt(A1, 0) {
 		internal_shl(t, b, k * _DIGIT_BITS) or_return
 
 		for {
 			internal_decr(Q1) or_return
 			internal_add(A1, A1, t) or_return
-			if internal_cmp(A1, 0) != -1 {
-				break
-			}
+			if internal_gte(A1, 0) { break }
 		}
 	}
 
@@ -1257,7 +1254,7 @@ _private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := con
 	/*
 		While A2 < 0 do Q0 = Q0 - 1, A2 = A2 + B.
 	*/
-	for internal_cmp(A2, 0) == -1 {
+	for internal_is_negative(A2) { // internal_lt(A2, 0) {
 		internal_decr(Q0) or_return
 		internal_add(A2, A2, b) or_return
 	}
@@ -1376,7 +1373,7 @@ _private_int_div_recursive :: proc(quotient, remainder, a, b: ^Int, allocator :=
 _private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int) -> (err: Error) {
 
 	ta, tb, tq, q := &Int{}, &Int{}, &Int{}, &Int{}
-	c: int
+
 	defer internal_destroy(ta, tb, tq, q)
 
 	for {
@@ -1392,7 +1389,7 @@ _private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int
 		shl(tq, tq, n)       or_return
 
 		for n >= 0 {
-			if c, _ = cmp_mag(ta, tb); c == 0 || c == 1 {
+			if internal_gte(ta, tb) {
 				// ta -= tb
 				sub(ta, ta, tb) or_return
 				//  q += tq
@@ -1438,7 +1435,7 @@ _private_int_factorial_binary_split :: proc(res: ^Int, n: int, allocator := cont
 	internal_one(inner, false)                                       or_return
 	internal_one(outer, false)                                       or_return
 
-	bits_used := int(_DIGIT_TYPE_BITS - intrinsics.count_leading_zeros(n))
+	bits_used := ilog2(n)
 
 	for i := bits_used; i >= 0; i -= 1 {
 		start := (n >> (uint(i) + 1)) + 1 | 1
@@ -1597,7 +1594,7 @@ _private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.
 		/*
 			Make sure `v` is the largest.
 		*/
-		if internal_cmp_mag(u, v) == 1 {
+		if internal_gt(u, v) {
 			/*
 				Swap `u` and `v` to make sure `v` is >= `u`.
 			*/
@@ -1635,7 +1632,7 @@ _private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.
 		Computes least common multiple as `|a*b|/gcd(a,b)`
 		Divide the smallest by the GCD.
 	*/
-	if internal_cmp_mag(a, b) == -1 {
+	if internal_lt_abs(a, b) {
 		/*
 			Store quotient in `t2` such that `t2 * b` is the LCM.
 		*/
@@ -1695,9 +1692,7 @@ _private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -
 		/*
 			Iterate until `a` is bracketed between low + high.
 		*/
-		if #force_inline internal_cmp(bracket_high, a) != -1 {
-			break
-		}
+		if #force_inline internal_gte(bracket_high, a) { break }
 
 		low = high
 		#force_inline internal_copy(bracket_low, bracket_high) or_return
@@ -1731,9 +1726,6 @@ _private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -
 	return
 }
 
-
-
-
 /*
 	Computes xR**-1 == x (mod N) via Montgomery Reduction.
 	This is an optimized implementation of `internal_montgomery_reduce`
@@ -1754,7 +1746,7 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 	/*
 		Grow `x` as required.
 	*/
-	internal_grow(x, n.used + 1) or_return
+	internal_grow(x, n.used + 1)                                     or_return
 
 	/*
 		First we have to get the digits of the input into an array of double precision words W[...]
@@ -1848,9 +1840,1018 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 	/*
 		if A >= m then A = A - m
 	*/
-	if internal_cmp_mag(x, n) != -1 {
+	if internal_gte_abs(x, n) {
+		return internal_sub(x, x, n)
+	}
+	return nil
+}
+
+/*
+	Computes xR**-1 == x (mod N) via Montgomery Reduction.
+	Assumes `x` and `n` not to be nil.
+*/
+_private_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+	/*
+		Can the fast reduction [comba] method be used?
+		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],
+		since carries are fixed up in the inner loop.
+	*/
+	internal_clear_if_uninitialized(x, n) or_return
+
+	digs := (n.used * 2) + 1
+	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
+		return _private_montgomery_reduce_comba(x, n, rho)
+	}
+
+	/*
+		Grow the input as required
+	*/
+	internal_grow(x, digs)                                           or_return
+	x.used = digs
+
+	for ix := 0; ix < n.used; ix += 1 {
+		/*
+			`mu = ai * rho mod b`
+			The value of rho must be precalculated via `int_montgomery_setup()`,
+			such that it equals -1/n0 mod b this allows the following inner loop
+			to reduce the input one digit at a time.
+		*/
+
+		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK))
+
+		/*
+			a = a + mu * m * b**i
+			Multiply and add in place.
+		*/
+		u  := DIGIT(0)
+		iy := int(0)
+		for ; iy < n.used; iy += 1 {
+			/*
+				Compute product and sum.
+			*/
+			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]))
+
+			/*
+				Get carry.
+			*/
+			u = DIGIT(r >> _DIGIT_BITS)
+
+			/*
+				Fix digit.
+			*/
+			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
+		}
+
+		/*
+			At this point the ix'th digit of x should be zero.
+			Propagate carries upwards as required.
+		*/
+		for u != 0 {
+			x.digit[ix + iy] += u
+			u = x.digit[ix + iy] >> _DIGIT_BITS
+			x.digit[ix + iy] &= _MASK
+			iy += 1
+		}
+	}
+
+	/*
+		At this point the n.used'th least significant digits of x are all zero,
+		which means we can shift x to the right by n.used digits and the
+		residue is unchanged.
+
+		x = x/b**n.used.
+	*/
+	internal_clamp(x)
+	internal_shr_digit(x, n.used)
+
+	/*
+		if x >= n then x = x - n
+	*/
+	if internal_gte_abs(x, n) {
 		return internal_sub(x, x, n)
 	}
+
+	return nil
+}
+
+/*
+	Shifts with subtractions when the result is greater than b.
+
+	The method is slightly modified to shift B unconditionally upto just under
+	the leading bit of b.  This saves alot of multiple precision shifting.
+
+	Assumes `a` and `b` not to be `nil`.
+*/
+_private_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+	/*
+		How many bits of last digit does b use.
+	*/
+	internal_clear_if_uninitialized(a, b) or_return
+
+	bits := internal_count_bits(b) % _DIGIT_BITS
+
+	if b.used > 1 {
+		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1
+		internal_int_power_of_two(a, power)                          or_return
+	} else {
+		internal_one(a)                                              or_return
+		bits = 1
+	}
+
+	/*
+		Now compute C = A * B mod b.
+	*/
+	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
+		internal_int_shl1(a, a)                                      or_return
+		if internal_gte_abs(a, b) {
+			internal_sub(a, a, b)                                    or_return
+		}
+	}
+	return nil
+}
+
+/*
+	Sets up the Montgomery reduction stuff.
+*/
+_private_int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
+	/*
+		Fast inversion mod 2**k
+		Based on the fact that:
+
+		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
+		                  =>  2*X*A - X*X*A*A = 1
+		                  =>  2*(1) - (1)     = 1
+	*/
+	internal_clear_if_uninitialized(n, allocator) or_return
+
+	b := n.digit[0]
+	if b & 1 == 0 { return 0, .Invalid_Argument }
+
+	x := (((b + 2) & 4) << 1) + b /* here x*a==1 mod 2**4 */
+	x *= 2 - (b * x)              /* here x*a==1 mod 2**8 */
+	x *= 2 - (b * x)              /* here x*a==1 mod 2**16 */
+
+	when _DIGIT_TYPE_BITS == 64 {
+		x *= 2 - (b * x)              /* here x*a==1 mod 2**32 */
+		x *= 2 - (b * x)              /* here x*a==1 mod 2**64 */
+	}
+
+	/*
+		rho = -1/m mod b
+	*/
+	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK))
+	return rho, nil
+}
+
+/*
+	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
+	From HAC pp.604 Algorithm 14.42
+
+	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
+*/
+_private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	q := &Int{}
+	defer internal_destroy(q)
+	um := m.used
+
+	/*
+		q = x
+	*/
+	internal_copy(q, x)                                              or_return
+
+	/*
+		q1 = x / b**(k-1)
+	*/
+	internal_shr_digit(q, um - 1)
+
+	/*
+		According to HAC this optimization is ok.
+	*/
+	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
+		internal_mul(q, q, mu)                                       or_return
+	} else {
+		_private_int_mul_high(q, q, mu, um)                          or_return
+	}
+
+	/*
+		q3 = q2 / b**(k+1)
+	*/
+	internal_shr_digit(q, um + 1)
+
+	/*
+		x = x mod b**(k+1), quick (no division)
+	*/
+	internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1))              or_return
+
+	/*
+		q = q * m mod b**(k+1), quick (no division)
+	*/
+	_private_int_mul(q, q, m, um + 1)                                or_return
+
+	/*
+		x = x - q
+	*/
+	internal_sub(x, x, q)                                            or_return
+
+	/*
+		If x < 0, add b**(k+1) to it.
+	*/
+	if internal_is_negative(x) {
+		internal_set(q, 1)                                           or_return
+		internal_shl_digit(q, um + 1)                                or_return
+		internal_add(x, x, q)                                        or_return
+	}
+
+	/*
+		Back off if it's too big.
+	*/
+	for internal_gte(x, m) {
+		internal_sub(x, x, m)                                        or_return
+	}
+
+	return nil
+}
+
+/*
+	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
+*/
+_private_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	q := &Int{}
+	defer internal_destroy(q)
+
+	internal_zero(q)                                                 or_return
+
+	p := internal_count_bits(n)
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return
+
+		if d != 1 {
+			/*
+				q = q * d
+			*/
+			internal_mul(q, q, d)                                    or_return
+		}
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return
+		if internal_lt_abs(a, n)                                     { break }
+		internal_sub(a, a, n)                                        or_return
+	}
+
+	return nil
+}
+
+/*
+	Reduces `a` modulo `n` where `n` is of the form 2**p - d
+	This differs from reduce_2k since "d" can be larger than a single digit.
+*/
+_private_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	q := &Int{}
+	defer internal_destroy(q)
+
+	internal_zero(q)                                                 or_return
+
+	p := internal_count_bits(n)
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return
+
+		/*
+			q = q * d
+		*/
+		internal_mul(q, q, d)                                        or_return
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return
+		if internal_lt_abs(a, n)                                     { break }
+		internal_sub(a, a, n)                                        or_return
+	}
+
+	return nil
+}
+
+/*
+	Determines if `internal_int_reduce_2k` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+_private_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a)
+
+	if internal_is_zero(a) {
+		return false, nil
+	} else if a.used == 1 {
+		return true, nil
+	} else if a.used  > 1 {
+		iy := internal_count_bits(a)
+		iw := 1
+		iz := DIGIT(1)
+
+		/*
+			Test every bit from the second digit up, must be 1.
+		*/
+		for ix := _DIGIT_BITS; ix < iy; ix += 1 {
+			if a.digit[iw] & iz == 0 {
+				return false, nil
+			}
+
+			iz <<= 1
+			if iz > _DIGIT_MAX {
+				iw += 1
+				iz  = 1
+			}
+		}
+		return true, nil
+	} else {
+		return true, nil
+	}
+}
+
+/*
+	Determines if `internal_int_reduce_2k_l` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+_private_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a)
+
+	if internal_int_is_zero(a) {
+		return false, nil
+	} else if a.used == 1 {
+		return true, nil
+	} else if a.used  > 1 {
+		/*
+			If more than half of the digits are -1 we're sold.
+		*/
+		ix := 0
+		iy := 0
+
+		for ; ix < a.used; ix += 1 {
+			if a.digit[ix] == _DIGIT_MAX {
+				iy += 1
+			}
+		}
+		return iy >= (a.used / 2), nil
+	} else {
+		return false, nil
+	}
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` is not `nil`.
+*/
+_private_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
+	context.allocator = allocator
+
+	tmp := &Int{}
+	defer internal_destroy(tmp)
+	internal_zero(tmp)                                               or_return
+
+	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return
+	internal_sub(tmp, tmp, a)                                        or_return
+
+	return tmp.digit[0], nil
+}
+
+/*
+	Determines the setup value.
+	Assumes `mu` and `P` are not `nil`.
+
+	d := (1 << a.bits) - a;
+*/
+_private_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	tmp := &Int{}
+	defer internal_destroy(tmp)
+	internal_zero(tmp)                                               or_return
+
+	internal_int_power_of_two(tmp, internal_count_bits(P))           or_return
+	internal_sub(mu, tmp, P)                                         or_return
+
+	return nil
+}
+
+/*
+	Pre-calculate the value required for Barrett reduction.
+	For a given modulus "P" it calulates the value required in "mu"
+	Assumes `mu` and `P` are not `nil`.
+*/
+_private_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	internal_int_power_of_two(mu, P.used * 2 * _DIGIT_BITS)           or_return
+	return internal_int_div(mu, mu, P)
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+_private_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
+	/*
+		The casts are required if _DIGIT_BITS is one less than
+		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
+	*/
+	return DIGIT((1 << _DIGIT_BITS) - a.digit[0])
+}
+
+/*
+	Determines if a number is a valid DR modulus.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+_private_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
+	/*
+		Must be at least two digits.
+	*/
+	if a.used < 2 { return false }
+
+	/*
+		Must be of the form b**k - a [a <= b] so all but the first digit must be equal to -1 (mod b).
+	*/
+	for ix := 1; ix < a.used; ix += 1 {
+		if a.digit[ix] != _MASK {
+			return false
+		}
+	}
+	return true
+}
+
+/*
+	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+	Based on algorithm from the paper
+
+		"Generating Efficient Primes for Discrete Log Cryptosystems"
+					Chae Hoon Lim, Pil Joong Lee,
+			POSTECH Information Research Laboratories
+
+	The modulus must be of a special format [see manual].
+	Has been modified to use algorithm 7.10 from the LTM book instead
+
+	Input x must be in the range 0 <= x <= (n-1)**2
+	Assumes `x` and `n` to not be `nil` and to have been initialized.
+*/
+_private_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
+	/*
+		m = digits in modulus.
+	*/
+	m := n.used
+
+	/*
+		Ensure that "x" has at least 2m digits.
+	*/
+	internal_grow(x, m + m)                                          or_return
+
+	/*
+		Top of loop, this is where the code resumes if another reduction pass is required.
+	*/
+	for {
+		i: int
+		mu := DIGIT(0)
+
+		/*
+			Compute (x mod B**m) + k * [x/B**m] inline and inplace.
+		*/
+		for i = 0; i < m; i += 1 {
+			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu)
+			x.digit[i] = DIGIT(r & _WORD(_MASK))
+			mu         = DIGIT(r >> _WORD(_DIGIT_BITS))
+		}
+
+		/*
+			Set final carry.
+		*/
+		x.digit[i] = mu
+
+		/*
+			Zero words above m.
+		*/
+		mem.zero_slice(x.digit[m + 1:][:x.used - m])
+
+		/*
+			Clamp, sub and return.
+		*/
+		internal_clamp(x)                                            or_return
+
+		/*
+			If x >= n then subtract and reduce again.
+			Each successive "recursion" makes the input smaller and smaller.
+		*/
+		if internal_lt_abs(x, n) { break }
+
+		internal_sub(x, x, n)                                        or_return
+	}
+	return nil
+}
+
+/*
+	Computes res == G**X mod P.
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+_private_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	M := [_TAB_SIZE]Int{}
+	winsize: uint
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error)
+
+	defer {
+		internal_destroy(&M[1])
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x])
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X)
+	switch {
+	case x <= 7:
+		winsize = 2
+	case x <= 36:
+		winsize = 3
+	case x <= 140:
+		winsize = 4
+	case x <= 450:
+		winsize = 5
+	case x <= 1303:
+		winsize = 6
+	case x <= 3529:
+		winsize = 7
+	case:
+		winsize = 8
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize
+
+	/*
+		Init M array.
+		Init first cell.
+	*/
+	internal_zero(&M[1])                                             or_return
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_zero(&M[x])                                         or_return
+	}
+
+	/*
+		Create `mu`, used for Barrett reduction.
+	*/
+	mu := &Int{}
+	defer internal_destroy(mu)
+	internal_zero(mu)                                                or_return
+
+	if redmode == 0 {
+		_private_int_reduce_setup(mu, P)                             or_return
+		redux = _private_int_reduce
+	} else {
+		_private_int_reduce_2k_setup_l(mu, P)                        or_return
+		redux = _private_int_reduce_2k_l
+	}
+
+	/*
+		Create M table.
+
+		The M table contains powers of the base, e.g. M[x] = G**x mod P.
+		The first half of the table is not computed, though, except for M[0] and M[1].
+	*/
+	internal_int_mod(&M[1], G, P)                                    or_return
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+
+		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
+		instead of repeated squaring.
+	*/
+	slot := 1 << (winsize - 1)
+	internal_copy(&M[slot], &M[1])                                   or_return
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		/*
+			Square it.
+		*/
+		internal_sqr(&M[slot], &M[slot])                             or_return
+
+		/*
+			Reduce modulo P
+		*/
+		redux(&M[slot], P, mu)                                       or_return
+	}
+
+	/*
+		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+	*/
+	for x = slot + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return
+		redux(&M[x], P, mu)                                          or_return
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_one(res)                                                or_return
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0
+	bitcnt := 1
+	buf    := DIGIT(0)
+	digidx := X.used - 1
+	bitcpy := uint(0)
+	bitbuf := DIGIT(0)
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits.
+			*/
+			if digidx == -1 { break }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx]
+			digidx -= 1
+			bitcnt = _DIGIT_BITS
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := buf >> (_DIGIT_BITS - 1) & 1
+		buf <<= 1
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit
+			in the exponent.  Technically this opt is not required but it
+			does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 {
+			continue
+		}
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return
+			redux(res, P, mu)                                        or_return
+			continue
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1
+		bitbuf |= (y << (winsize - bitcpy))
+		mode    = 2
+
+		if (bitcpy == winsize) {
+			/*
+				Window is filled so square as required and multiply.
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return
+				redux(res, P, mu)                                    or_return
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return
+			redux(res, P, mu)                                        or_return
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0
+			bitbuf = 0
+			mode   = 1
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < int(bitcpy); x += 1 {
+			internal_sqr(res, res)                                   or_return
+			redux(res, P, mu)                                        or_return
+
+			bitbuf <<= 1
+			if ((bitbuf & (1 << winsize)) != 0) {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return
+				redux(res, P, mu)                                    or_return
+			}
+		}
+	}
+	return err
+}
+
+/*
+	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+
+	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
+	The value of `k` changes based on the size of the exponent.
+
+	Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+_private_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	M := [_TAB_SIZE]Int{}
+	winsize: uint
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error)
+
+	defer {
+		internal_destroy(&M[1])
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x])
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X)
+	switch {
+	case x <= 7:
+		winsize = 2
+	case x <= 36:
+		winsize = 3
+	case x <= 140:
+		winsize = 4
+	case x <= 450:
+		winsize = 5
+	case x <= 1303:
+		winsize = 6
+	case x <= 3529:
+		winsize = 7
+	case:
+		winsize = 8
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize
+
+	/*
+		Init M array
+		Init first cell.
+	*/
+	cap := internal_int_allocated_cap(P)
+	internal_grow(&M[1], cap)                                        or_return
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_grow(&M[x], cap)                                    or_return
+	}
+
+	/*
+		Determine and setup reduction code.
+	*/
+	rho: DIGIT
+
+	if redmode == 0 {
+		/*
+			Now setup Montgomery.
+		*/
+		rho = _private_int_montgomery_setup(P)                       or_return
+
+		/*
+			Automatically pick the comba one if available (saves quite a few calls/ifs).
+		*/
+		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
+			redux = _private_montgomery_reduce_comba
+		} else {
+			/*
+				Use slower baseline Montgomery method.
+			*/
+			redux = _private_int_montgomery_reduce
+		}
+	} else if redmode == 1 {
+		/*
+			Setup DR reduction for moduli of the form B**k - b.
+		*/
+		rho = _private_int_dr_setup(P)
+		redux = _private_int_dr_reduce
+	} else {
+		/*
+			Setup DR reduction for moduli of the form 2**k - b.
+		*/
+		rho = _private_int_reduce_2k_setup(P)                        or_return
+		redux = _private_int_reduce_2k
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_grow(res, cap)                                          or_return
+
+	/*
+		Create M table
+		The first half of the table is not computed, though, except for M[0] and M[1]
+	*/
+
+	if redmode == 0 {
+		/*
+			Now we need R mod m.
+		*/
+		_private_int_montgomery_calc_normalization(res, P)           or_return
+
+		/*
+			Now set M[1] to G * R mod m.
+		*/
+		internal_mulmod(&M[1], G, res, P)                            or_return
+	} else {
+		internal_one(res)                                            or_return
+		internal_mod(&M[1], G, P)                                    or_return
+	}
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+	*/
+	slot := 1 << (winsize - 1)
+	internal_copy(&M[slot], &M[1])                                   or_return
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		internal_sqr(&M[slot], &M[slot])                             or_return
+		redux(&M[slot], P, rho)                                      or_return
+	}
+
+	/*
+		Create upper table.
+	*/
+	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return
+		redux(&M[x], P, rho)                                         or_return
+	}
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0
+	bitcnt := 1
+	buf    := DIGIT(0)
+	digidx := X.used - 1
+	bitcpy := 0
+	bitbuf := DIGIT(0)
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits so break.
+			*/
+			if digidx == -1 { break }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx]
+			digidx -= 1
+			bitcnt = _DIGIT_BITS
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := (buf >> (_DIGIT_BITS - 1)) & 1
+		buf <<= 1
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit in the exponent.
+			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 { continue }
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return
+			redux(res, P, rho)                                       or_return
+			continue
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1
+		bitbuf |= (y << (winsize - uint(bitcpy)))
+		mode    = 2
+
+		if bitcpy == int(winsize) {
+			/*
+				Window is filled so square as required and multiply
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return
+				redux(res, P, rho)                                   or_return
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return
+			redux(res, P, rho)                                       or_return
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0
+			bitbuf = 0
+			mode   = 1
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < bitcpy; x += 1 {
+			internal_sqr(res, res)                                   or_return
+			redux(res, P, rho)                                       or_return
+
+			/*
+				Get next bit of the window.
+			*/
+			bitbuf <<= 1
+			if bitbuf & (1 << winsize) != 0 {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return
+				redux(res, P, rho)                                   or_return
+			}
+		}
+	}
+
+	if redmode == 0 {
+		/*
+			Fixup result if Montgomery reduction is used.
+			Recall that any value in a Montgomery system is actually multiplied by R mod n.
+			So we have to reduce one more time to cancel out the factor of R.
+		*/
+		redux(res, P, rho)                                           or_return
+	}
+
 	return nil
 }
 
@@ -1980,21 +2981,21 @@ _private_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.allocator
 	/*
 		If `v` != `1` then there is no inverse.
 	*/
-	if internal_cmp(v, 1) != 0 {
+	if !internal_eq(v, 1) {
 		return .Invalid_Argument
 	}
 
 	/*
 		If its too low.
 	*/
-	if internal_cmp(C, 0) == -1 {
+	if internal_is_negative(C) {
 		internal_add(C, C, b) or_return
 	}
 
 	/*
 		Too big.
 	*/
-	if internal_cmp(C, 0) != -1 {
+	if internal_gte(C, 0) {
 		internal_sub(C, C, b) or_return
 	}
 
@@ -2147,7 +3148,7 @@ _private_inverse_modulo_odd :: proc(dest, a, b: ^Int, allocator := context.alloc
 	/*
 		Too big.
 	*/
-	for internal_cmp_mag(D, b) != -1 {
+	for internal_gte_abs(D, b) {
 		internal_sub(D, D, b) or_return
 	}
 
@@ -2222,7 +3223,8 @@ _private_int_rem_105 := [?]DIGIT{
 }
 #assert(105 * size_of(DIGIT) == size_of(_private_int_rem_105))
 
-_private_prime_table := [?]DIGIT{
+_PRIME_TAB_SIZE :: 256
+_private_prime_table := [_PRIME_TAB_SIZE]DIGIT{
 	0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
 	0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
 	0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
@@ -2259,7 +3261,7 @@ _private_prime_table := [?]DIGIT{
 	0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
 	0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653,
 }
-#assert(256 * size_of(DIGIT) == size_of(_private_prime_table))
+#assert(_PRIME_TAB_SIZE * size_of(DIGIT) == size_of(_private_prime_table))
 
 when MATH_BIG_FORCE_64_BIT || (!MATH_BIG_FORCE_32_BIT && size_of(rawptr) == 8) {
 	_factorial_table := [35]_WORD{
diff --git a/core/math/big/public.odin b/core/math/big/public.odin
index d7c6ec19c..f0fc16f36 100644
--- a/core/math/big/public.odin
+++ b/core/math/big/public.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -10,6 +8,9 @@ package math_big
 
 	This file contains basic arithmetic operations like `add`, `sub`, `mul`, `div`, ...
 */
+package math_big
+
+import "core:intrinsics"
 
 /*
 	===========================
@@ -384,6 +385,10 @@ digit_log :: proc(a: DIGIT, base: DIGIT) -> (log: int, err: Error) {
 }
 log :: proc { int_log, digit_log, }
 
+ilog2 :: proc(value: $T) -> (log2: T) {
+	return (size_of(T) * 8) - intrinsics.count_leading_zeros(value)
+}
+
 /*
 	Calculate `dest = base^power` using a square-multiply algorithm.
 */
@@ -556,6 +561,298 @@ int_compare_magnitude :: proc(a, b: ^Int, allocator := context.allocator) -> (re
 
 	return #force_inline internal_cmp_mag(a, b), nil
 }
+int_cmp_mag :: int_compare_magnitude
+
+
+/*
+	bool := a < b
+*/
+int_less_than :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than: bool, err: Error) {
+	assert_if_nil(a, b)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a, b) or_return
+
+	c: int
+	c, err = cmp(a, b)
+
+	return c == -1, err
+}
+
+/*
+	bool := a < b
+*/
+int_less_than_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (less_than: bool, err: Error) {
+	assert_if_nil(a)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a) or_return
+
+	c: int
+	c, err = cmp(a, b)
+
+	return c == -1, err
+}
+
+/*
+	bool := |a| < |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_less_than_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than: bool, err: Error) {
+	assert_if_nil(a, b)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a, b) or_return
+
+	c: int
+	c, err = cmp_mag(a, b)
+
+	return c == -1, err
+}
+
+less_than :: proc {
+	int_less_than,
+	int_less_than_digit,
+}
+lt :: less_than
+
+less_than_abs :: proc {
+	int_less_than_abs,
+}
+lt_abs :: less_than_abs
+
+
+/*
+	bool := a <= b
+*/
+int_less_than_or_equal :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a, b) or_return
+
+	c: int
+	c, err = cmp(a, b)
+
+	return c <= 0, err
+}
+
+/*
+	bool := a <= b
+*/
+int_less_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (less_than_or_equal: bool, err: Error) {
+	assert_if_nil(a)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a) or_return
+
+	c: int
+	c, err = cmp(a, b)
+
+	return c <= 0, err
+}
+
+/*
+	bool := |a| <= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_less_than_or_equal_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a, b) or_return
+
+	c: int
+	c, err = cmp_mag(a, b)
+
+	return c <= 0, err
+}
+
+less_than_or_equal :: proc {
+	int_less_than_or_equal,
+	int_less_than_or_equal_digit,
+}
+lteq :: less_than_or_equal
+
+less_than_or_equal_abs :: proc {
+	int_less_than_or_equal_abs,
+}
+lteq_abs :: less_than_or_equal_abs
+
+
+/*
+	bool := a == b
+*/
+int_equals :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (equals: bool, err: Error) {
+	assert_if_nil(a, b)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a, b) or_return
+
+	c: int
+	c, err = cmp(a, b)
+
+	return c == 0, err
+}
+
+/*
+	bool := a == b
+*/
+int_equals_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (equals: bool, err: Error) {
+	assert_if_nil(a)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a) or_return
+
+	c: int
+	c, err = cmp(a, b)
+
+	return c == 0, err
+}
+
+/*
+	bool := |a| == |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_equals_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (equals: bool, err: Error) {
+	assert_if_nil(a, b)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a, b) or_return
+
+	c: int
+	c, err = cmp_mag(a, b)
+
+	return c == 0, err
+}
+
+equals :: proc {
+	int_equals,
+	int_equals_digit,
+}
+eq :: equals
+
+equals_abs :: proc {
+	int_equals_abs,
+}
+eq_abs :: equals_abs
+
+
+/*
+	bool := a >= b
+*/
+int_greater_than_or_equal :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a, b) or_return
+
+	c: int
+	c, err = cmp(a, b)
+
+	return c >= 0, err
+}
+
+/*
+	bool := a >= b
+*/
+int_greater_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (greater_than_or_equal: bool, err: Error) {
+	assert_if_nil(a)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a) or_return
+
+	c: int
+	c, err = cmp(a, b)
+
+	return c >= 0, err
+}
+
+/*
+	bool := |a| >= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_greater_than_or_equal_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a, b) or_return
+
+	c: int
+	c, err = cmp_mag(a, b)
+
+	return c >= 0, err
+}
+
+greater_than_or_equal :: proc {
+	int_greater_than_or_equal,
+	int_greater_than_or_equal_digit,
+}
+gteq :: greater_than_or_equal
+
+greater_than_or_equal_abs :: proc {
+	int_greater_than_or_equal_abs,
+}
+gteq_abs :: greater_than_or_equal_abs
+
+
+/*
+	bool := a > b
+*/
+int_greater_than :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than: bool, err: Error) {
+	assert_if_nil(a, b)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a, b) or_return
+
+	c: int
+	c, err = cmp(a, b)
+
+	return c > 0, err
+}
+
+/*
+	bool := a > b
+*/
+int_greater_than_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (greater_than: bool, err: Error) {
+	assert_if_nil(a)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a) or_return
+
+	c: int
+	c, err = cmp(a, b)
+
+	return c > 0, err
+}
+
+/*
+	bool := |a| > |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_greater_than_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than: bool, err: Error) {
+	assert_if_nil(a, b)
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a, b) or_return
+
+	c: int
+	c, err = cmp_mag(a, b)
+
+	return c > 0, err
+}
+
+greater_than :: proc {
+	int_greater_than,
+	int_greater_than_digit,
+}
+gt :: greater_than
+
+greater_than_abs :: proc {
+	int_greater_than_abs,
+}
+gt_abs :: greater_than_abs
+
 
 /*
 	Check if remainders are possible squares - fast exclude non-squares.
diff --git a/core/math/big/radix.odin b/core/math/big/radix.odin
index f0a80d7b2..760c49d77 100644
--- a/core/math/big/radix.odin
+++ b/core/math/big/radix.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -14,23 +12,22 @@ package math_big
 		- Use Barrett reduction for non-powers-of-two.
 		- Also look at extracting and splatting several digits at once.
 */
+package math_big
 
 import "core:intrinsics"
 import "core:mem"
+import "core:os"
 
 /*
-	This version of `itoa` allocates one behalf of the caller. The caller must free the string.
+	This version of `itoa` allocates on behalf of the caller. The caller must free the string.
+	The radix defaults to 10.
 */
-int_itoa_string :: proc(a: ^Int, radix := i8(-1), zero_terminate := false, allocator := context.allocator) -> (res: string, err: Error) {
+int_itoa_string :: proc(a: ^Int, radix := i8(10), zero_terminate := false, allocator := context.allocator) -> (res: string, err: Error) {
 	assert_if_nil(a)
 	context.allocator = allocator
 
 	a := a; radix := radix
 	clear_if_uninitialized(a) or_return
-	/*
-		Radix defaults to 10.
-	*/
-	radix = radix if radix > 0 else 10
 
 	/*
 		TODO: If we want to write a prefix for some of the radixes, we can oversize the buffer.
@@ -58,18 +55,15 @@ int_itoa_string :: proc(a: ^Int, radix := i8(-1), zero_terminate := false, alloc
 }
 
 /*
-	This version of `itoa` allocates one behalf of the caller. The caller must free the string.
+	This version of `itoa` allocates on behalf of the caller. The caller must free the string.
+	The radix defaults to 10.
 */
-int_itoa_cstring :: proc(a: ^Int, radix := i8(-1), allocator := context.allocator) -> (res: cstring, err: Error) {
+int_itoa_cstring :: proc(a: ^Int, radix := i8(10), allocator := context.allocator) -> (res: cstring, err: Error) {
 	assert_if_nil(a)
 	context.allocator = allocator
 
 	a := a; radix := radix
 	clear_if_uninitialized(a) or_return
-	/*
-		Radix defaults to 10.
-	*/
-	radix = radix if radix > 0 else 10
 
 	s: string
 	s, err = int_itoa_string(a, radix, true)
@@ -379,6 +373,177 @@ radix_size :: proc(a: ^Int, radix: i8, zero_terminate := false, allocator := con
 }
 
 /*
+	We might add functions to read and write byte-encoded Ints from/to files, using `int_to_bytes_*` functions.
+
+	LibTomMath allows exporting/importing to/from a file in ASCII, but it doesn't support a much more compact representation in binary, even though it has several pack functions int_to_bytes_* (which I expanded upon and wrote Python interoperable versions of as well), and (un)pack, which is GMP compatible.
+	Someone could implement their own read/write binary int procedures, of course.
+
+	Could be worthwhile to add a canonical binary file representation with an optional small header that says it's an Odin big.Int, big.Rat or Big.Float, byte count for each component that follows, flag for big/little endian and a flag that says a checksum exists at the end of the file.
+	For big.Rat and big.Float the header couldn't be optional, because we'd have no way to distinguish where the components end.
+*/
+
+/*
+	Read an Int from an ASCII file.
+*/
+internal_int_read_from_ascii_file :: proc(a: ^Int, filename: string, radix := i8(10), allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	/*
+		We can either read the entire file at once, or read a bunch at a time and keep multiplying by the radix.
+		For now, we'll read the entire file. Eventually we'll replace this with a copy that duplicates the logic
+		of `atoi` so we don't need to read the entire file.
+	*/
+
+	res, ok := os.read_entire_file(filename, allocator)
+	defer delete(res, allocator)
+
+	if !ok {
+		return .Cannot_Read_File
+	}
+
+	as := string(res)
+	return atoi(a, as, radix)
+}
+
+/*
+	Write an Int to an ASCII file.
+*/
+internal_int_write_to_ascii_file :: proc(a: ^Int, filename: string, radix := i8(10), allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	/*
+		For now we'll convert the Int using itoa and writing the result in one go.
+		If we want to preserve memory we could duplicate the itoa logic and write backwards.
+	*/
+
+	as := itoa(a, radix) or_return
+	defer delete(as)
+
+	l := len(as)
+	assert(l > 0)
+
+	data := transmute([]u8)mem.Raw_Slice{
+		data = raw_data(as),
+		len  = l,
+	}
+
+	ok := os.write_entire_file(name=filename, data=data, truncate=true)
+	return nil if ok else .Cannot_Write_File
+}
+
+/*
+	Calculate the size needed for `internal_int_pack`.
+
+	See https://gmplib.org/manual/Integer-Import-and-Export.html
+*/
+internal_int_pack_count :: proc(a: ^Int, $T: typeid, nails := 0) -> (size_needed: int) {
+	assert(nails >= 0 && nails < (size_of(T) * 8))
+
+	bits := internal_count_bits(a)
+	size := size_of(T)
+
+	size_needed  =  bits / ((size * 8) - nails)
+	size_needed += 1 if (bits % ((size * 8) - nails)) != 0 else 0
+
+	return size_needed
+}
+
+/*
+	Based on gmp's mpz_export.
+	See https://gmplib.org/manual/Integer-Import-and-Export.html
+
+	`buf` is a pre-allocated slice of type `T` "words", which must be an unsigned integer of some description.
+		Use `internal_int_pack_count(a, T, nails)` to calculate the necessary size.
+		The library internally uses `DIGIT` as the type, which is u64 or u32 depending on the platform.
+		You are of course welcome to export to []u8, []u32be, and so forth.
+		After this you can use `mem.slice_data_cast` to interpret the buffer as bytes if you so choose.
+
+	`nails` are the number of top bits the output "word" reserves.
+		To mimic the internals of this library, this would be 4.
+
+	To use the minimum amount of output bytes, set `nails` to 0 and pass a `[]u8`.
+	IMPORTANT: `pack` serializes the magnitude of an Int, that is, the output is unsigned.
+
+	Assumes `a` not to be `nil` and to have been initialized.
+*/
+internal_int_pack :: proc(a: ^Int, buf: []$T, nails := 0, order := Order.LSB_First) -> (written: int, err: Error)
+                     where intrinsics.type_is_integer(T) && intrinsics.type_is_unsigned(T) && size_of(T) <= 16 {
+
+	assert(nails >= 0 && nails < (size_of(T) * 8))
+
+	type_size  := size_of(T)
+	type_bits  := (type_size * 8) - nails
+
+	word_count := internal_int_pack_count(a, T, nails)
+	bit_count  := internal_count_bits(a)
+
+	if len(buf) < word_count {
+		return 0, .Buffer_Overflow
+	}
+
+	bit_offset  := 0
+	word_offset := 0
+
+	#no_bounds_check for i := 0; i < word_count; i += 1 {
+		bit_offset = i * type_bits
+		if order == .MSB_First {
+			word_offset = word_count - i - 1
+		} else {
+			word_offset = i
+		}
+
+		bits_to_get := min(type_bits, bit_count - bit_offset)
+		W := internal_int_bitfield_extract(a, bit_offset, bits_to_get) or_return
+		buf[word_offset] = T(W)
+	}
+
+	return word_count, nil
+}
+
+
+
+internal_int_unpack :: proc(a: ^Int, buf: []$T, nails := 0, order := Order.LSB_First, allocator := context.allocator) -> (err: Error)
+                     where intrinsics.type_is_integer(T) && intrinsics.type_is_unsigned(T) && size_of(T) <= 16 {
+	assert(nails >= 0 && nails < (size_of(T) * 8))
+	context.allocator = allocator
+
+	type_size  := size_of(T)
+	type_bits  := (type_size * 8) - nails
+	type_mask  := T(1 << uint(type_bits)) - 1
+
+	if len(buf) == 0 {
+		return .Invalid_Argument
+	}
+
+	bit_count   := type_bits * len(buf)
+	digit_count := (bit_count / _DIGIT_BITS) + min(1, bit_count % _DIGIT_BITS)
+
+	/*
+		Pre-size output Int.
+	*/
+	internal_grow(a, digit_count) or_return
+
+	t := &Int{}
+	defer internal_destroy(t)
+
+	if order == .LSB_First {
+		for W, i in buf {
+			internal_set(t, W & type_mask)                           or_return
+			internal_shl(t, t, type_bits * i)                        or_return
+			internal_add(a, a, t)                                    or_return
+		}
+	} else {
+		for W in buf {
+			internal_set(t, W & type_mask)                           or_return
+			internal_shl(a, a, type_bits)                            or_return
+			internal_add(a, a, t)                                    or_return
+		}		
+	}
+
+	return internal_clamp(a)
+}
+
+/*
 	Overestimate the size needed for the bigint to string conversion by a very small amount.
 	The error is about 10^-8; it will overestimate the result by at most 11 elements for
 	a number of the size 2^(2^31)-1 which is currently the largest possible in this library.
@@ -414,14 +579,14 @@ _log_bases :: [65]u32{
 */
 RADIX_TABLE := "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/"
 RADIX_TABLE_REVERSE := [RADIX_TABLE_REVERSE_SIZE]u8{
-   0x3e, 0xff, 0xff, 0xff, 0x3f, 0x00, 0x01, 0x02, 0x03, 0x04, /* +,-./01234 */
-   0x05, 0x06, 0x07, 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, 0xff, /* 56789:;<=> */
-   0xff, 0xff, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, /* ?@ABCDEFGH */
-   0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, /* IJKLMNOPQR */
-   0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0xff, 0xff, /* STUVWXYZ[\ */
-   0xff, 0xff, 0xff, 0xff, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, /* ]^_`abcdef */
-   0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, /* ghijklmnop */
-   0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, /* qrstuvwxyz */
+	0x3e, 0xff, 0xff, 0xff, 0x3f, 0x00, 0x01, 0x02, 0x03, 0x04, /* +,-./01234 */
+	0x05, 0x06, 0x07, 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, 0xff, /* 56789:;<=> */
+	0xff, 0xff, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, /* ?@ABCDEFGH */
+	0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, /* IJKLMNOPQR */
+	0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0xff, 0xff, /* STUVWXYZ[\ */
+	0xff, 0xff, 0xff, 0xff, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, /* ]^_`abcdef */
+	0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, /* ghijklmnop */
+	0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, /* qrstuvwxyz */
 }
 RADIX_TABLE_REVERSE_SIZE :: 80
 
diff --git a/core/math/big/test.odin b/core/math/big/test.odin
index 76cb6b7ef..8db2ab7eb 100644
--- a/core/math/big/test.odin
+++ b/core/math/big/test.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -11,6 +9,7 @@ package math_big
 
 	This file exports procedures for use with the test.py test suite.
 */
+package math_big
 
 /*
 	TODO: Write tests for `internal_*` and test reusing parameters with the public implementations.
diff --git a/core/math/big/test.py b/core/math/big/test.py
index df59fa1c8..4d314401f 100644
--- a/core/math/big/test.py
+++ b/core/math/big/test.py
@@ -1,3 +1,12 @@
+#
+#	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+#	Made available under Odin's BSD-3 license.
+#
+#	A BigInt implementation in Odin.
+#	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
+#	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
+#
+
 from ctypes import *
 from random import *
 import math
@@ -404,7 +413,6 @@ def test_shr_signed(a = 0, bits = 0, expected_error = Error.Okay):
 	return test("test_shr_signed", res, [a, bits], expected_error, expected_result)
 
 def test_factorial(number = 0, expected_error = Error.Okay):
-	print("Factorial:", number)
 	args  = [number]
 	try:
 		res = int_factorial(*args)
diff --git a/core/math/big/tune.odin b/core/math/big/tune.odin
index c34f73b37..a268f541d 100644
--- a/core/math/big/tune.odin
+++ b/core/math/big/tune.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -9,6 +7,7 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
 
 import "core:fmt"
 import "core:time"
@@ -24,6 +23,8 @@ Category :: enum {
 	sqr,
 	bitfield_extract,
 	rm_trials,
+	is_prime,
+	random_prime,
 }
 
 Event :: struct {
diff --git a/core/reflect/reflect.odin b/core/reflect/reflect.odin
index 7c5d851b4..e7cb928c3 100644
--- a/core/reflect/reflect.odin
+++ b/core/reflect/reflect.odin
@@ -1276,81 +1276,175 @@ as_raw_data :: proc(a: any) -> (value: rawptr, valid: bool) {
 	return
 }
 
-/*
-not_equal :: proc(a, b: any) -> bool {
-	return !equal(a, b);
+eq :: equal
+ne :: not_equal
+
+DEFAULT_EQUAL_MAX_RECURSION_LEVEL :: 32
+
+not_equal :: proc(a, b: any, including_indirect_array_recursion := false, recursion_level := 0) -> bool {
+	return !equal(a, b, including_indirect_array_recursion, recursion_level)
 }
-equal :: proc(a, b: any) -> bool {
+equal :: proc(a, b: any, including_indirect_array_recursion := false, recursion_level := 0) -> bool {
 	if a == nil && b == nil {
-		return true;
+		return true
 	}
 
 	if a.id != b.id {
-		return false;
+		return false
 	}
 
 	if a.data == b.data {
-		return true;
+		return true
 	}
-
-	t := type_info_of(a.id);
-	if .Comparable not_in t.flags {
-		return false;
+	
+	including_indirect_array_recursion := including_indirect_array_recursion
+	if recursion_level >= DEFAULT_EQUAL_MAX_RECURSION_LEVEL {
+		including_indirect_array_recursion = false
+	} 
+
+	t := type_info_of(a.id)
+	if .Comparable not_in t.flags && !including_indirect_array_recursion {
+		return false
 	}
 
 	if t.size == 0 {
-		return true;
+		return true
 	}
 
 	if .Simple_Compare in t.flags {
-		return mem.compare_byte_ptrs((^byte)(a.data), (^byte)(b.data), t.size) == 0;
+		return mem.compare_byte_ptrs((^byte)(a.data), (^byte)(b.data), t.size) == 0
 	}
+	
+	t = runtime.type_info_core(t)
 
-	t = runtime.type_info_core(t);
-
-	#partial switch v in t.variant {
+	switch v in t.variant {
+	case Type_Info_Named:
+		unreachable()
+	case Type_Info_Tuple:
+		unreachable()
+	case Type_Info_Any:
+		if !including_indirect_array_recursion {
+			return false
+		}
+		va := (^any)(a.data)
+		vb := (^any)(b.data)
+		return equal(va, vb, including_indirect_array_recursion, recursion_level+1) 
+	case Type_Info_Map:
+		return false
+	case Type_Info_Relative_Slice:
+		return false
+	case 
+		Type_Info_Boolean,
+		Type_Info_Integer, 
+		Type_Info_Rune,
+		Type_Info_Float,
+		Type_Info_Complex,
+		Type_Info_Quaternion,
+		Type_Info_Type_Id,
+		Type_Info_Pointer,
+		Type_Info_Multi_Pointer,
+		Type_Info_Procedure,
+		Type_Info_Bit_Set,
+		Type_Info_Enum,
+		Type_Info_Simd_Vector,
+		Type_Info_Relative_Pointer:
+		return mem.compare_byte_ptrs((^byte)(a.data), (^byte)(b.data), t.size) == 0
+		
 	case Type_Info_String:
 		if v.is_cstring {
-			x := string((^cstring)(a.data)^);
-			y := string((^cstring)(b.data)^);
-			return x == y;
+			x := string((^cstring)(a.data)^)
+			y := string((^cstring)(b.data)^)
+			return x == y
 		} else {
-			x := (^string)(a.data)^;
-			y := (^string)(b.data)^;
-			return x == y;
+			x := (^string)(a.data)^
+			y := (^string)(b.data)^
+			return x == y
 		}
-
+		return true
 	case Type_Info_Array:
 		for i in 0..<v.count {
-			x := rawptr(uintptr(a.data) + uintptr(v.elem_size*i));
-			y := rawptr(uintptr(b.data) + uintptr(v.elem_size*i));
-			if !equal(any{x, v.elem.id}, any{y, v.elem.id}) {
-				return false;
+			x := rawptr(uintptr(a.data) + uintptr(v.elem_size*i))
+			y := rawptr(uintptr(b.data) + uintptr(v.elem_size*i))
+			if !equal(any{x, v.elem.id}, any{y, v.elem.id}, including_indirect_array_recursion, recursion_level) {
+				return false
 			}
 		}
+		return true
 	case Type_Info_Enumerated_Array:
 		for i in 0..<v.count {
-			x := rawptr(uintptr(a.data) + uintptr(v.elem_size*i));
-			y := rawptr(uintptr(b.data) + uintptr(v.elem_size*i));
-			if !equal(any{x, v.elem.id}, any{y, v.elem.id}) {
-				return false;
+			x := rawptr(uintptr(a.data) + uintptr(v.elem_size*i))
+			y := rawptr(uintptr(b.data) + uintptr(v.elem_size*i))
+			if !equal(any{x, v.elem.id}, any{y, v.elem.id}, including_indirect_array_recursion, recursion_level) {
+				return false
 			}
 		}
+		return true
 	case Type_Info_Struct:
 		if v.equal != nil {
-			return v.equal(a.data, b.data);
+			return v.equal(a.data, b.data)
 		} else {
 			for offset, i in v.offsets {
-				x := rawptr(uintptr(a.data) + offset);
-				y := rawptr(uintptr(b.data) + offset);
-				id := v.types[i].id;
-				if !equal(any{x, id}, any{y, id}) {
-					return false;
+				x := rawptr(uintptr(a.data) + offset)
+				y := rawptr(uintptr(b.data) + offset)
+				id := v.types[i].id
+				if !equal(any{x, id}, any{y, id}, including_indirect_array_recursion, recursion_level) {
+					return false
 				}
 			}
+			return true
+		}
+	case Type_Info_Union:
+		if v.equal != nil {
+			return v.equal(a.data, b.data)
+		}
+		return false
+	case Type_Info_Slice:
+		if !including_indirect_array_recursion {
+			return false
+		}
+		array_a := (^mem.Raw_Slice)(a.data)
+		array_b := (^mem.Raw_Slice)(b.data)
+		if array_a.len != array_b.len {
+			return false
+		}
+		if array_a.data == array_b.data {
+			return true
+		}
+		for i in 0..<array_a.len {
+			x := rawptr(uintptr(array_a.data) + uintptr(v.elem_size*i))
+			y := rawptr(uintptr(array_b.data) + uintptr(v.elem_size*i))
+			if !equal(any{x, v.elem.id}, any{y, v.elem.id}, including_indirect_array_recursion, recursion_level+1) {
+				return false
+			}	
+		}
+		return true
+	case Type_Info_Dynamic_Array:
+		if !including_indirect_array_recursion {
+			return false
+		}
+		array_a := (^mem.Raw_Dynamic_Array)(a.data)
+		array_b := (^mem.Raw_Dynamic_Array)(b.data)
+		if array_a.len != array_b.len {
+			return false
+		}
+		if array_a.data == array_b.data {
+			return true
+		}
+		if .Simple_Compare in v.elem.flags {
+			return mem.compare_byte_ptrs((^byte)(array_a.data), (^byte)(array_b.data), array_a.len * v.elem.size) == 0
 		}
+		
+		for i in 0..<array_a.len {
+			x := rawptr(uintptr(array_a.data) + uintptr(v.elem_size*i))
+			y := rawptr(uintptr(array_b.data) + uintptr(v.elem_size*i))
+			if !equal(any{x, v.elem.id}, any{y, v.elem.id}, including_indirect_array_recursion, recursion_level+1) {
+				return false
+			}	
+		}
+		return true
 	}
-
-	return true;
+	
+	runtime.print_typeid(a.id)
+	runtime.print_string("\n")
+	return true
 }
-*/
diff --git a/src/check_builtin.cpp b/src/check_builtin.cpp
index 05a5fceda..1f3928bd8 100644
--- a/src/check_builtin.cpp
+++ b/src/check_builtin.cpp
@@ -1871,6 +1871,29 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 		operand->type = alloc_type_simd_vector(count, elem);
 		break;
 	}
+	
+	case BuiltinProc_is_package_imported: {
+		bool value = false;
+
+		if (!is_type_string(operand->type) && (operand->mode != Addressing_Constant)) {
+			error(ce->args[0], "Expected a constant string for '%.*s'", LIT(builtin_name));
+		} else if (operand->value.kind == ExactValue_String) {
+			String pkg_name = operand->value.value_string;
+			// TODO(bill): probably should have this be a `StringMap` eventually
+			for_array(i, c->info->packages.entries) {
+				AstPackage *pkg = c->info->packages.entries[i].value;
+				if (pkg->name == pkg_name) {
+					value = true;
+					break;
+				}
+			}
+		}
+		
+		operand->mode = Addressing_Constant;
+		operand->type = t_untyped_bool;
+		operand->value = exact_value_bool(value);
+		break;
+	}
 
 	case BuiltinProc_soa_struct: {
 		Operand x = {};
@@ -2032,7 +2055,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 				error(al.expr, "Alignment parameter to '%.*s' must be constant", LIT(builtin_name));
 			}
 
-			operand->type = t_u8_ptr;
+			operand->type = alloc_type_multi_pointer(t_u8);
 			operand->mode = Addressing_Value;
 			break;
 		}
diff --git a/src/checker_builtin_procs.hpp b/src/checker_builtin_procs.hpp
index dc9d139f7..466e679c3 100644
--- a/src/checker_builtin_procs.hpp
+++ b/src/checker_builtin_procs.hpp
@@ -36,6 +36,8 @@ enum BuiltinProcId {
 	BuiltinProc_DIRECTIVE, // NOTE(bill): This is used for specialized hash-prefixed procedures
 
 	// "Intrinsics"
+	BuiltinProc_is_package_imported,
+	
 	BuiltinProc_simd_vector,
 	BuiltinProc_soa_struct,
 
@@ -230,7 +232,6 @@ BuiltinProc__type_simple_boolean_end,
 
 BuiltinProc__type_end,
 
-
 	BuiltinProc_COUNT,
 };
 gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
@@ -270,6 +271,8 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 
 
 	// "Intrinsics"
+	{STR_LIT("is_package_imported"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+		
 	{STR_LIT("simd_vector"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, // Type
 	{STR_LIT("soa_struct"),  2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, // Type
 
@@ -459,6 +462,6 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 
 	{STR_LIT("type_equal_proc"),  1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("type_hasher_proc"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-
+	
 	{STR_LIT(""), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
 };