From ddc401e0bf4f972bc2916601797d12bb97c5f1dc Mon Sep 17 00:00:00 2001 From: pravic Date: Mon, 6 Jun 2016 23:05:39 +0300 Subject: update to 2016-06-06 --- libcore/num/dec2flt/algorithm.rs | 89 +++++++++++++++++++++++++++++++++++----- 1 file changed, 79 insertions(+), 10 deletions(-) (limited to 'libcore/num/dec2flt/algorithm.rs') diff --git a/libcore/num/dec2flt/algorithm.rs b/libcore/num/dec2flt/algorithm.rs index e33c281..c7af46a 100644 --- a/libcore/num/dec2flt/algorithm.rs +++ b/libcore/num/dec2flt/algorithm.rs @@ -32,19 +32,80 @@ fn power_of_ten(e: i16) -> Fp { Fp { f: sig, e: exp } } +// In most architectures, floating point operations have an explicit bit size, therefore the +// precision of the computation is determined on a per-operation basis. +#[cfg(any(not(target_arch="x86"), target_feature="sse2"))] +mod fpu_precision { + pub fn set_precision() { } +} + +// On x86, the x87 FPU is used for float operations if the SSE/SSE2 extensions are not available. +// The x87 FPU operates with 80 bits of precision by default, which means that operations will +// round to 80 bits causing double rounding to happen when values are eventually represented as +// 32/64 bit float values. To overcome this, the FPU control word can be set so that the +// computations are performed in the desired precision. +#[cfg(all(target_arch="x86", not(target_feature="sse2")))] +mod fpu_precision { + use mem::size_of; + use ops::Drop; + + /// A structure used to preserve the original value of the FPU control word, so that it can be + /// restored when the structure is dropped. + /// + /// The x87 FPU is a 16-bits register whose fields are as follows: + /// + /// | 12-15 | 10-11 | 8-9 | 6-7 | 5 | 4 | 3 | 2 | 1 | 0 | + /// |------:|------:|----:|----:|---:|---:|---:|---:|---:|---:| + /// | | RC | PC | | PM | UM | OM | ZM | DM | IM | + /// + /// The documentation for all of the fields is available in the IA-32 Architectures Software + /// Developer's Manual (Volume 1). + /// + /// The only field which is relevant for the following code is PC, Precision Control. This + /// field determines the precision of the operations performed by the FPU. It can be set to: + /// - 0b00, single precision i.e. 32-bits + /// - 0b10, double precision i.e. 64-bits + /// - 0b11, double extended precision i.e. 80-bits (default state) + /// The 0b01 value is reserved and should not be used. + pub struct FPUControlWord(u16); + + fn set_cw(cw: u16) { + unsafe { asm!("fldcw $0" :: "m" (cw) :: "volatile") } + } + + /// Set the precision field of the FPU to `T` and return a `FPUControlWord` + pub fn set_precision() -> FPUControlWord { + let cw = 0u16; + + // Compute the value for the Precision Control field that is appropriate for `T`. + let cw_precision = match size_of::() { + 4 => 0x0000, // 32 bits + 8 => 0x0200, // 64 bits + _ => 0x0300, // default, 80 bits + }; + + // Get the original value of the control word to restore it later, when the + // `FPUControlWord` structure is dropped + unsafe { asm!("fnstcw $0" : "=*m" (&cw) ::: "volatile") } + + // Set the control word to the desired precision. This is achieved by masking away the old + // precision (bits 8 and 9, 0x300) and replacing it with the precision flag computed above. + set_cw((cw & 0xFCFF) | cw_precision); + + FPUControlWord(cw) + } + + impl Drop for FPUControlWord { + fn drop(&mut self) { + set_cw(self.0) + } + } +} + /// The fast path of Bellerophon using machine-sized integers and floats. /// /// This is extracted into a separate function so that it can be attempted before constructing /// a bignum. -/// -/// The fast path crucially depends on arithmetic being correctly rounded, so on x86 -/// without SSE or SSE2 it will be **wrong** (as in, off by one ULP occasionally), because the x87 -/// FPU stack will round to 80 bit first before rounding to 64/32 bit. However, as such hardware -/// is extremely rare nowadays and in fact all in-tree target triples assume an SSE2-capable -/// microarchitecture, there is little incentive to deal with that. There's a test that will fail -/// when SSE or SSE2 is disabled, so people building their own non-SSE copy will get a heads up. -/// -/// FIXME: It would nevertheless be nice if we had a good way to detect and deal with x87. pub fn fast_path(integral: &[u8], fractional: &[u8], e: i64) -> Option { let num_digits = integral.len() + fractional.len(); // log_10(f64::max_sig) ~ 15.95. We compare the exact value to max_sig near the end, @@ -60,9 +121,17 @@ pub fn fast_path(integral: &[u8], fractional: &[u8], e: i64) -> Opt if f > T::max_sig() { return None; } + + // The fast path crucially depends on arithmetic being rounded to the correct number of bits + // without any intermediate rounding. On x86 (without SSE or SSE2) this requires the precision + // of the x87 FPU stack to be changed so that it directly rounds to 64/32 bit. + // The `set_precision` function takes care of setting the precision on architectures which + // require setting it by changing the global state (like the control word of the x87 FPU). + let _cw = fpu_precision::set_precision::(); + // The case e < 0 cannot be folded into the other branch. Negative powers result in // a repeating fractional part in binary, which are rounded, which causes real - // (and occasioally quite significant!) errors in the final result. + // (and occasionally quite significant!) errors in the final result. if e >= 0 { Some(T::from_int(f) * T::short_fast_pow10(e as usize)) } else { -- cgit v1.2.3