In the problem from the currently running Huawei contest (Accuracy-Preserving Summation Algorithm), part of the task is to choose between the IEEE-754 binary64, binary32 and binary16 floating-point formats to use for summing numbers.
Apparently, the people in charge of the contest don't know that both Intel and AMD have supported fp16 since 2011-2012 (AMD Bulldozer and Intel Ivy Bridge); it is supported by GCC 12+ and Clang 15+ as _Float16 and, since C++23, as std::float16_t.
So for the checker they wrote their own implementation, which differs from IEEE in two places:
the exponent range is [-16; +16] instead of [-15; +15];
during conversion from fp64, the least significant bits are simply thrown away instead of being rounded.
Considering its importance in the problem, I decided to check which range of fp64 values doesn't overflow when converted to fp16, and how much the two aforementioned differences matter. Therefore I wrote a little program which prints the ranges and decided to share the results here with anyone interested. It shows the ranges for "Huawei FP16", for IEEE, and for "Corrected Huawei" — without rounding, but with the correct exponent range.
Here are the results:
Huawei FP16
MAX
fp64: 131071.99999999999 (0x1.fffffffffffffp+16)
fp16: 131008 (0x1.ffcp+16)
OVERFLOW
fp64: 131072 (0x1p+17)
fp16: inf
RANGE
fp64: (-131072; 131072)
fp16: [-131008; 131008]
IEEE-754 FP16
MAX
fp64: 65519.999999999993 (0x1.ffdffffffffffp+15)
fp16: 65504 (0x1.ffcp+15)
OVERFLOW
fp64: 65520 (0x1.ffep+15)
fp16: inf
RANGE
fp64: (-65520; 65520)
fp16: [-65504; 65504]
Huawei FP16 with correct range
MAX
fp64: 65535.999999999993 (0x1.fffffffffffffp+15)
fp16: 65504 (0x1.ffcp+15)
OVERFLOW
fp64: 65536 (0x1p+16)
fp16: inf
RANGE
fp64: (-65536; 65536)
fp16: [-65504; 65504]
https://godbolt.org/z/3s8GoKe8j
Source:
#include <iostream> // cout
#include <cstdint> // uint32_t, uint64_t
#include <cstring> // memcpy
#include <cmath> // nextafter
using namespace std;
// Simulated fp16 ("Huawei FP16"): a double whose bit pattern is reduced on
// construction so that it only carries what this emulated half-precision
// format can hold.
//
// Conversion rules, as implemented below:
//   * mantissa: only the top 10 of the 52 binary64 mantissa bits are kept;
//     the rest are zeroed (truncation, NOT rounding);
//   * overflow: a biased binary64 exponent of 1040 or more (unbiased >= 17)
//     yields infinity with the sign of the input. NaN inputs have an all-ones
//     exponent, so they take this path and also become infinity;
//   * underflow: a biased exponent below 1008 (unbiased <= -16), which
//     includes zero and binary64 subnormals, yields +0.0 (the sign is dropped).
class Float16 {
    static constexpr std::uint32_t mantissaShift = 42; // 52 - 10 discarded mantissa bits
    static constexpr std::uint32_t expShiftMid = 56;   // lowest bit of the middle 6 exponent bits
    static constexpr std::uint32_t expShiftOut = 52;   // lowest bit of the low 4 exponent bits

    double dValue_;

public:
    // Converts `in` to the emulated fp16 value (implicit on purpose, as before).
    Float16(double in) : dValue_(in) {
        std::uint64_t utmp;
        std::memcpy(&utmp, &dValue_, sizeof utmp);

        // Zero the mantissa bits below the 10th (this is NOT rounding).
        utmp = (utmp >> mantissaShift) << mantissaShift;

        // The 11-bit binary64 exponent is viewed as 1 leading + 6 middle + 4 low bits.
        constexpr std::uint64_t maskSign = 1ULL << 63;
        constexpr std::uint64_t maskExpLead = 1ULL << 62;
        constexpr std::uint64_t maskExpMid = 63ULL << expShiftMid;
        constexpr std::uint64_t maskExpOut = 15ULL << expShiftOut;
        constexpr std::uint64_t maskExpAll = maskExpLead | maskExpMid | maskExpOut;

        if (utmp & maskExpLead) {
            // Leading bit set: unbiased exponent >= 1. Any middle bit on top of
            // that means the exponent is >= 17, which does not fit.
            if (utmp & maskExpMid) {
                // Infinity with the original sign; the mantissa is cleared so
                // the result can never be a NaN.
                utmp = (utmp & maskSign) | maskExpAll;
            }
        } else if ((utmp & maskExpMid) != maskExpMid) {
            // Leading bit clear and a middle bit missing: too small, flush to +0.0.
            utmp = 0;
        }

        std::memcpy(&dValue_, &utmp, sizeof utmp);
    }

    // Returns the stored (already reduced) value. Const-qualified so that
    // const objects and const references can be converted as well.
    explicit operator double() const { return dValue_; }
};
// Simulated fp16 with the overflow threshold lowered by one exponent step
// compared to the Float16 class in this file: values with an unbiased
// exponent of 16 already overflow here.
//
// Conversion rules, as implemented below:
//   * mantissa: only the top 10 of the 52 binary64 mantissa bits are kept;
//     the rest are zeroed (truncation, NOT rounding);
//   * overflow: a biased binary64 exponent of 1039 or more (unbiased >= 16)
//     yields infinity with the sign of the input. NaN inputs have an all-ones
//     exponent, so they take this path and also become infinity;
//   * underflow: a biased exponent below 1008 (unbiased <= -16), which
//     includes zero and binary64 subnormals, yields +0.0 (the sign is dropped).
class CorrectFloat16 {
    static constexpr std::uint32_t mantissaShift = 42; // 52 - 10 discarded mantissa bits
    static constexpr std::uint32_t expShiftMid = 56;   // lowest bit of the middle 6 exponent bits
    static constexpr std::uint32_t expShiftOut = 52;   // lowest bit of the low 4 exponent bits

    double dValue_;

public:
    // Converts `in` to the emulated fp16 value (implicit on purpose, as before).
    CorrectFloat16(double in) : dValue_(in) {
        std::uint64_t utmp;
        std::memcpy(&utmp, &dValue_, sizeof utmp);

        // Zero the mantissa bits below the 10th (truncation, not rounding).
        utmp = (utmp >> mantissaShift) << mantissaShift;

        // The 11-bit binary64 exponent is viewed as 1 leading + 6 middle + 4 low bits.
        constexpr std::uint64_t maskSign = 1ULL << 63;
        constexpr std::uint64_t maskExpLead = 1ULL << 62;
        constexpr std::uint64_t maskExpMid = 63ULL << expShiftMid;
        constexpr std::uint64_t maskExpOut = 15ULL << expShiftOut;
        constexpr std::uint64_t maskExpAll = maskExpLead | maskExpMid | maskExpOut;

        if (utmp & maskExpLead) {
            // Leading bit set: unbiased exponent >= 1. It overflows when a
            // middle bit is set (exponent >= 17) or when the low four bits are
            // all ones (exponent == 16) -- the latter test is what
            // distinguishes this class from Float16.
            const bool overflow = (utmp & maskExpMid) != 0 ||
                                  (utmp & maskExpOut) == maskExpOut;
            if (overflow) {
                // Infinity with the original sign; the mantissa is cleared so
                // the result can never be a NaN.
                utmp = (utmp & maskSign) | maskExpAll;
            }
        } else if ((utmp & maskExpMid) != maskExpMid) {
            // Leading bit clear and a middle bit missing: too small, flush to +0.0.
            utmp = 0;
        }

        std::memcpy(&dValue_, &utmp, sizeof utmp);
    }

    // Returns the stored (already reduced) value. Const-qualified so that
    // const objects and const references can be converted as well.
    explicit operator double() const { return dValue_; }
};
// Provide the name float16_t for the native half-precision type that main()
// uses for its IEEE-754 section.
// When __GNUC__ is at least 13 and __cplusplus is at least 202100L, the name
// comes from <stdfloat> (std::float16_t, reachable unqualified through the
// file-level `using namespace std;`). In every other configuration the
// compiler's _Float16 type is aliased under the same name.
#if __GNUC__ >= 13 && __cplusplus >= 202100L
#include <stdfloat> // float16_t
#else
typedef _Float16 float16_t; // GCC >= 12 || Clang >= 15
#endif
int main() {
cout.precision(17);
double SFP16_MAX = 0x1p17 - 0x1p-36; // 131071.99999999999;
double SFP16_INF = nextafter(SFP16_MAX, INFINITY);
cout << "Huawei FP16\n";
cout << " MAX\n";
cout << " fp64: " << defaultfloat << SFP16_MAX << " (" << hexfloat << SFP16_MAX << ")\n"
<< " fp16: " << defaultfloat << double(Float16(SFP16_MAX)) << " (" << hexfloat << double(Float16(SFP16_MAX)) << ")\n";
cout << " OVERFLOW\n";
cout << " fp64: " << defaultfloat << SFP16_INF << " (" << hexfloat << SFP16_INF << ")\n"
<< " fp16: " << defaultfloat << double(Float16(SFP16_INF)) << "\n";
cout << " RANGE\n";
cout << " fp64: (" << -SFP16_INF << "; " << SFP16_INF << ")\n"
<< " fp16: [" << double(Float16(-SFP16_MAX)) << "; " << double(Float16(SFP16_MAX)) << "]\n";
cout << "\n";
double FP16_MAX = 0x1p16 - 0x1p-37 - 0x1p4; // 65519.99999999999;
double FP16_INF = nextafter(FP16_MAX, INFINITY);
cout << "IEEE-754 FP16\n";
cout << " MAX\n";
cout << " fp64: " << defaultfloat << FP16_MAX << " (" << hexfloat << FP16_MAX << ")\n"
<< " fp16: " << defaultfloat << double(float16_t(FP16_MAX)) << " (" << hexfloat << double(float16_t(FP16_MAX)) << ")\n";
cout << " OVERFLOW\n";
cout << " fp64: " << defaultfloat << FP16_INF << " (" << hexfloat << FP16_INF << ")\n"
<< " fp16: " << defaultfloat << double(float16_t(FP16_INF)) << "\n";
cout << " RANGE\n";
cout << " fp64: (" << -FP16_INF << "; " << FP16_INF << ")\n"
<< " fp16: [" << double(float16_t(-FP16_MAX)) << "; " << double(float16_t(FP16_MAX)) << "]\n";
cout << "\n";
double CSFP16_MAX = 0x1p16 - 0x1p-37; // 65535.99999999999;
double CSFP16_INF = nextafter(CSFP16_MAX, INFINITY);
cout << "Huawei FP16 with correct range\n";
cout << " MAX\n";
cout << " fp64: " << defaultfloat << CSFP16_MAX << " (" << hexfloat << CSFP16_MAX << ")\n"
<< " fp16: " << defaultfloat << double(CorrectFloat16(CSFP16_MAX)) << " (" << hexfloat << double(CorrectFloat16(CSFP16_MAX)) << ")\n";
cout << " OVERFLOW\n";
cout << " fp64: " << defaultfloat << CSFP16_INF << " (" << hexfloat << CSFP16_INF << ")\n"
<< " fp16: " << defaultfloat << double(CorrectFloat16(CSFP16_INF)) << "\n";
cout << " RANGE\n";
cout << " fp64: (" << -CSFP16_INF << "; " << CSFP16_INF << ")\n"
<< " fp16: [" << double(CorrectFloat16(-CSFP16_MAX)) << "; " << double(CorrectFloat16(CSFP16_MAX)) << "]\n";
}
I think it's kinda sloppy that the problem statement is inconsistent with the checker, but at least they've shared the checker's code.
Полный текст и комментарии »