core/math/math_sincos.odin


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308

package math

import "core:math/bits"

// The original C code, the long comment, and the constants
// below were from http://netlib.sandia.gov/cephes/cmath/sin.c,
// available from http://www.netlib.org/cephes/cmath.tgz.
// The go code is a simplified version of the original C.
//
//      sin.c
//
//      Circular sine
//
// SYNOPSIS:
//
// double x, y, sin();
// y = sin( x );
//
// DESCRIPTION:
//
// Range reduction is into intervals of pi/4.  The reduction error is nearly
// eliminated by contriving an extended precision modular arithmetic.
//
// Two polynomial approximating functions are employed.
// Between 0 and pi/4 the sine is approximated by
//      x  +  x**3 P(x**2).
// Between pi/4 and pi/2 the cosine is represented as
//      1  -  x**2 Q(x**2).
//
// ACCURACY:
//
//                      Relative error:
// arithmetic   domain      # trials      peak         rms
//    DEC       0, 10       150000       3.0e-17     7.8e-18
//    IEEE -1.07e9,+1.07e9  130000       2.1e-16     5.4e-17
//
// Partial loss of accuracy begins to occur at x = 2**30 = 1.074e9.  The loss
// is not gradual, but jumps suddenly to about 1 part in 10e7.  Results may
// be meaningless for x > 2**49 = 5.6e14.
//
//      cos.c
//
//      Circular cosine
//
// SYNOPSIS:
//
// double x, y, cos();
// y = cos( x );
//
// DESCRIPTION:
//
// Range reduction is into intervals of pi/4.  The reduction error is nearly
// eliminated by contriving an extended precision modular arithmetic.
//
// Two polynomial approximating functions are employed.
// Between 0 and pi/4 the cosine is approximated by
//      1  -  x**2 Q(x**2).
// Between pi/4 and pi/2 the sine is represented as
//      x  +  x**3 P(x**2).
//
// ACCURACY:
//
//                      Relative error:
// arithmetic   domain      # trials      peak         rms
//    IEEE -1.07e9,+1.07e9  130000       2.1e-16     5.4e-17
//    DEC        0,+1.07e9   17000       3.0e-17     7.2e-18
//
// Cephes Math Library Release 2.8:  June, 2000
// Copyright 1984, 1987, 1989, 1992, 2000 by Stephen L. Moshier
//
// The readme file at http://netlib.sandia.gov/cephes/ says:
//    Some software in this archive may be from the book _Methods and
// Programs for Mathematical Functions_ (Prentice-Hall or Simon & Schuster
// International, 1989) or from the Cephes Mathematical Library, a
// commercial product. In either event, it is copyrighted by the author.
// What you see here may be used freely but it comes with no support or
// guarantee.
//
//   The two known misprints in the book are repaired here in the
// source listings for the gamma function and the incomplete beta
// integral.
//
//   Stephen L. Moshier
//   moshier@na-net.ornl.gov

sincos :: proc{
	sincos_f16, sincos_f16le, sincos_f16be,
	sincos_f32, sincos_f32le, sincos_f32be,
	sincos_f64, sincos_f64le, sincos_f64be,
}

sincos_f16 :: proc "contextless" (x: f16) -> (sin, cos: f16) #no_bounds_check {
	s, c := sincos_f64(f64(x))
	return f16(s), f16(c)
}
sincos_f16le :: proc "contextless" (x: f16le) -> (sin, cos: f16le) #no_bounds_check {
	s, c := sincos_f64(f64(x))
	return f16le(s), f16le(c)
}
sincos_f16be :: proc "contextless" (x: f16be) -> (sin, cos: f16be) #no_bounds_check {
	s, c := sincos_f64(f64(x))
	return f16be(s), f16be(c)
}

sincos_f32 :: proc "contextless" (x: f32) -> (sin, cos: f32) #no_bounds_check {
	s, c := sincos_f64(f64(x))
	return f32(s), f32(c)
}
sincos_f32le :: proc "contextless" (x: f32le) -> (sin, cos: f32le) #no_bounds_check {
	s, c := sincos_f64(f64(x))
	return f32le(s), f32le(c)
}
sincos_f32be :: proc "contextless" (x: f32be) -> (sin, cos: f32be) #no_bounds_check {
	s, c := sincos_f64(f64(x))
	return f32be(s), f32be(c)
}

sincos_f64le :: proc "contextless" (x: f64le) -> (sin, cos: f64le) #no_bounds_check {
	s, c := sincos_f64(f64(x))
	return f64le(s), f64le(c)
}
sincos_f64be :: proc "contextless" (x: f64be) -> (sin, cos: f64be) #no_bounds_check {
	s, c := sincos_f64(f64(x))
	return f64be(s), f64be(c)
}

sincos_f64 :: proc "contextless" (x: f64) -> (sin, cos: f64) #no_bounds_check {
	x := x

	PI4A :: 0h3fe921fb40000000 // 7.85398125648498535156e-1  PI/4 split into three parts
	PI4B :: 0h3e64442d00000000 // 3.77489470793079817668e-8
	PI4C :: 0h3ce8469898cc5170 // 2.69515142907905952645e-15

	// special cases
	switch {
	case x == 0:
		return x, 1 // return ±0.0, 1.0
	case is_nan(x) || is_inf(x, 0):
		return nan_f64(), nan_f64()
	}

	// make argument positive
	sin_sign, cos_sign := false, false
	if x < 0 {
		x = -x
		sin_sign = true
	}

	j: u64
	y, z: f64
	if x >= REDUCE_THRESHOLD {
		j, z = _trig_reduce_f64(x)
	} else {
		j = u64(x * (4 / PI)) // integer part of x/(PI/4), as integer for tests on the phase angle
		y = f64(j)           // integer part of x/(PI/4), as float

		if j&1 == 1 { // map zeros to origin
			j += 1
			y += 1
		}
		j &= 7                               // octant modulo TAU radians (360 degrees)
		z = ((x - y*PI4A) - y*PI4B) - y*PI4C // Extended precision modular arithmetic
	}
	if j > 3 { // reflect in x axis
		j -= 4
		sin_sign, cos_sign = !sin_sign, !cos_sign
	}
	if j > 1 {
		cos_sign = !cos_sign
	}

	zz := z * z

	cos = 1.0 - 0.5*zz + zz*zz*((((((_cos[0]*zz)+_cos[1])*zz+_cos[2])*zz+_cos[3])*zz+_cos[4])*zz+_cos[5])
	sin = z + z*zz*((((((_sin[0]*zz)+_sin[1])*zz+_sin[2])*zz+_sin[3])*zz+_sin[4])*zz+_sin[5])

	if j == 1 || j == 2 {
		sin, cos = cos, sin
	}
	if cos_sign {
		cos = -cos
	}
	if sin_sign {
		sin = -sin
	}
	return
}

// sin coefficients
@(private="file")
_sin := [?]f64{
	0h3de5d8fd1fd19ccd, //  1.58962301576546568060e-10
	0hbe5ae5e5a9291f5d, // -2.50507477628578072866e-8
	0h3ec71de3567d48a1, //  2.75573136213857245213e-6
	0hbf2a01a019bfdf03, // -1.98412698295895385996e-4
	0h3f8111111110f7d0, //  8.33333333332211858878e-3
	0hbfc5555555555548, // -1.66666666666666307295e-1
}

// cos coefficients
@(private="file")
_cos := [?]f64{
	0hbda8fa49a0861a9b, // -1.13585365213876817300e-11,
	0h3e21ee9d7b4e3f05, //  2.08757008419747316778e-9,
	0hbe927e4f7eac4bc6, // -2.75573141792967388112e-7,
	0h3efa01a019c844f5, //  2.48015872888517045348e-5,
	0hbf56c16c16c14f91, // -1.38888888888730564116e-3,
	0h3fa555555555554b, //  4.16666666666665929218e-2,
}

// REDUCE_THRESHOLD is the maximum value of x where the reduction using Pi/4
// in 3 f64 parts still gives accurate results. This threshold
// is set by y*C being representable as a f64 without error
// where y is given by y = floor(x * (4 / Pi)) and C is the leading partial
// terms of 4/Pi. Since the leading terms (PI4A and PI4B in sin.go) have 30
// and 32 trailing zero bits, y should have less than 30 significant bits.
//
//	y < 1<<30  -> floor(x*4/Pi) < 1<<30 -> x < (1<<30 - 1) * Pi/4
//
// So, conservatively we can take x < 1<<29.
// Above this threshold Payne-Hanek range reduction must be used.
@(private="file")
REDUCE_THRESHOLD :: 1 << 29

// _trig_reduce_f64 implements Payne-Hanek range reduction by Pi/4
// for x > 0. It returns the integer part mod 8 (j) and
// the fractional part (z) of x / (Pi/4).
// The implementation is based on:
// "ARGUMENT REDUCTION FOR HUGE ARGUMENTS: Good to the Last Bit"
// K. C. Ng et al, March 24, 1992
// The simulated multi-precision calculation of x*B uses 64-bit integer arithmetic.
_trig_reduce_f64 :: proc "contextless" (x: f64) -> (j: u64, z: f64) #no_bounds_check {
	// bd_pi4 is the binary digits of 4/pi as a u64 array,
	// that is, 4/pi = Sum bd_pi4[i]*2^(-64*i)
	// 19 64-bit digits and the leading one bit give 1217 bits
	// of precision to handle the largest possible f64 exponent.
	@(static, rodata) bd_pi4 := [?]u64{
		0x0000000000000001,
		0x45f306dc9c882a53,
		0xf84eafa3ea69bb81,
		0xb6c52b3278872083,
		0xfca2c757bd778ac3,
		0x6e48dc74849ba5c0,
		0x0c925dd413a32439,
		0xfc3bd63962534e7d,
		0xd1046bea5d768909,
		0xd338e04d68befc82,
		0x7323ac7306a673e9,
		0x3908bf177bf25076,
		0x3ff12fffbc0b301f,
		0xde5e2316b414da3e,
		0xda6cfd9e4f96136e,
		0x9e8c7ecd3cbfd45a,
		0xea4f758fd7cbe2f6,
		0x7a0e73ef14a525d4,
		0xd7f6bf623f1aba10,
		0xac06608df8f6d757,
	}

	PI4 :: PI / 4
	if x < PI4 {
		return 0, x
	}

	MASK  :: 0x7FF
	SHIFT :: 64 - 11 - 1
	BIAS  :: 1023

	// Extract out the integer and exponent such that,
	// x = ix * 2 ** exp.
	ix := transmute(u64)x
	exp := int(ix>>SHIFT&MASK) - BIAS - SHIFT
	ix &~= MASK << SHIFT
	ix |= 1 << SHIFT
	// Use the exponent to extract the 3 appropriate u64 digits from bd_pi4,
	// B ~ (z0, z1, z2), such that the product leading digit has the exponent -61.
	// Note, exp >= -53 since x >= PI4 and exp < 971 for maximum f64.
	digit, bitshift := uint(exp+61)/64, uint(exp+61)%64
	z0 := (bd_pi4[digit] << bitshift) | (bd_pi4[digit+1] >> (64 - bitshift))
	z1 := (bd_pi4[digit+1] << bitshift) | (bd_pi4[digit+2] >> (64 - bitshift))
	z2 := (bd_pi4[digit+2] << bitshift) | (bd_pi4[digit+3] >> (64 - bitshift))
	// Multiply mantissa by the digits and extract the upper two digits (hi, lo).
	z2hi, _ := bits.mul(z2, ix)
	z1hi, z1lo := bits.mul(z1, ix)
	z0lo := z0 * ix
	lo, c := bits.add(z1lo, z2hi, 0)
	hi, _ := bits.add(z0lo, z1hi, c)
	// The top 3 bits are j.
	j = hi >> 61
	// Extract the fraction and find its magnitude.
	hi = hi<<3 | lo>>61
	lz := uint(bits.leading_zeros(hi))
	e := u64(BIAS - (lz + 1))
	// Clear implicit mantissa bit and shift into place.
	hi = (hi << (lz + 1)) | (lo >> (64 - (lz + 1)))
	hi >>= 64 - SHIFT
	// Include the exponent and convert to a float.
	hi |= e << SHIFT
	z = transmute(f64)hi
	// Map zeros to origin.
	if j&1 == 1 {
		j += 1
		j &= 7
		z -= 1
	}
	// Multiply the fractional part by pi/4.
	return j, z * PI4
}