@@ -35,4 +35,126 @@ public static implicit operator uintptr_t(uint d)
3535 return new uintptr_t ( ) ;
3636 }
3737 }
38+
/// <summary>
/// 16-bit floating point value stored as a raw bit pattern: 1 sign bit (bit 15),
/// 5 biased exponent bits (bits 14..10) and 10 fraction bits (bits 9..0).
/// Conversions to and from <see cref="float"/> are explicit.
/// </summary>
public struct half
{
    // Raw 16-bit encoding of the value.
    private ushort Value;

    /// <summary>Formats the value by widening it to <see cref="float"/> first.</summary>
    public override string ToString()
    {
        return ((float)this).ToString();
    }

    /// <summary>Narrows a <see cref="float"/> to a half; see the constructor for the exact rules.</summary>
    public static explicit operator half(float d)
    {
        return new half(d);
    }

    /// <summary>
    /// Widens a half to a <see cref="float"/>.
    /// Infinities keep their sign, every NaN encoding maps to <c>float.NaN</c>,
    /// zero/subnormal encodings are scaled by 2^-24, and normal values are
    /// rebuilt bit-by-bit with the exponent re-biased for single precision.
    /// </summary>
    public unsafe static explicit operator float(half d)
    {
        bool isPos = (d.Value & Float16Params.SignMask) == 0;
        uint biasedExponent = (d.Value & Float16Params.ExpMask) >> Float16Params.ExpOffset;
        uint frac = (d.Value & Float16Params.FracMask);

        // An all-ones exponent encodes infinity (fraction == 0) or NaN (fraction != 0).
        if (biasedExponent == Float16Params.BiasedExpMax)
        {
            if (frac != 0)
            {
                return float.NaN;
            }

            return isPos ? float.PositiveInfinity : float.NegativeInfinity;
        }

        // A zero exponent encodes zero or a subnormal: value = frac * 2^-24.
        // Multiplying by -1.0f keeps the sign of a negative zero.
        if (biasedExponent == 0)
        {
            return frac * Float16Params.SmallestSubnormalAsFloat * (isPos ? 1.0f : -1.0f);
        }

        // Normal value: move the exponent from bias 15 to bias 127 and left-align
        // the 10 fraction bits inside the 23-bit single precision mantissa.
        int unbiasedExp = (int)biasedExponent - Float16Params.ExpBias;
        uint biasedF32Exponent = (uint)(unbiasedExp + Float32Params.ExpBias);

        uint bits = (isPos ? 0u : 1u << Float32Params.SignOffset)
                  | (biasedF32Exponent << Float32Params.ExpOffset)
                  | (frac << (Float32Params.ExpOffset - Float16Params.ExpOffset));

        // Reinterpret the assembled bit pattern as a float.
        return *(float*)&bits;
    }

    /// <summary>
    /// Builds a half from a <see cref="float"/>.
    /// <list type="bullet">
    /// <item>NaN becomes the encoding 0x7FFF (sign and payload are dropped).</item>
    /// <item>Infinity keeps its sign.</item>
    /// <item>Finite magnitudes above 65504 are clamped to +/-65504, the largest finite half.</item>
    /// <item>Magnitudes below 2^-14 become subnormals or zero; discarded bits are truncated.</item>
    /// <item>All other values are re-biased and truncated to 10 fraction bits (no rounding).</item>
    /// </list>
    /// </summary>
    /// <param name="d">Single precision value to convert.</param>
    public unsafe half(float d)
    {
        // Reinterpret the float as its raw 32-bit pattern.
        uint bits = *(uint*)&d;

        uint fAbsBits = bits & Float32Params.AbsValueMask;
        // Move the float sign (bit 31) down to the half sign position (bit 15).
        uint sign = (bits & Float32Params.SignBitMask) >> (Float16Params.NumFracBits + Float16Params.NumExpBits + 1);
        uint encoded;

        if (float.IsNaN(d))
        {
            encoded = (Float16Params.ExpMask | Float16Params.FracMask);
        }
        else if (float.IsInfinity(d))
        {
            encoded = sign | Float16Params.ExpMask;
        }
        else if (fAbsBits > Float16Params.MaxNormal)
        {
            // Clamp to the largest finite half. The exponent field must stay below
            // all-ones: an all-ones exponent with a non-zero fraction is a NaN.
            encoded = sign | Float16Params.MaxFiniteBits;
        }
        else if (fAbsBits < Float16Params.MinNormal)
        {
            // Too small for a normal half: restore the implicit leading 1 and shift
            // the mantissa right by the exponent shortfall to form a subnormal.
            uint fracBits = (fAbsBits & Float32Params.MantissaMask) | (1 << Float32Params.NumMantissaBits);
            int nshift = Float16Params.Emin + Float32Params.Emax - (int)(fAbsBits >> Float32Params.NumMantissaBits);
            // A shift of 24 or more leaves no bits of the 24-bit mantissa: result is zero.
            uint shiftedBits = nshift < 24 ? fracBits >> nshift : 0;
            encoded = sign | (shiftedBits >> Float16Params.FracBitsDiff);
        }
        else
        {
            // BiasDiff is the two's-complement form of (15 - 127) << 23, so this
            // addition is meant to wrap; unchecked keeps it working in checked builds.
            encoded = sign | (unchecked(fAbsBits + Float16Params.BiasDiff) >> Float16Params.FracBitsDiff);
        }

        this.Value = (ushort)encoded;
    }

    // Layout constants for the 16-bit format, plus float32 bit patterns used as thresholds.
    private static class Float16Params
    {
        public const uint BitSize = 16; // total number of bits in the representation
        public const int NumFracBits = 10; // number of fractional (mantissa) bits
        public const int NumExpBits = 5; // number of (biased) exponent bits
        public const uint SignBit = 15; // position of the sign bit
        public const uint SignMask = 1 << 15; // mask to extract sign bit
        public const uint FracMask = (1 << 10) - 1; // mask to extract the fractional (mantissa) bits
        public const uint ExpMask = ((1 << 5) - 1) << 10; // mask to extract the exponent bits
        public const uint Emax = (1 << (5 - 1)) - 1; // max value for the exponent
        public const int Emin = -((1 << (5 - 1)) - 1) + 1; // min value for the exponent
        public const uint MaxNormal = ((((1 << (5 - 1)) - 1) + 127) << 23) | 0x7FE000; // float32 bit pattern of the largest finite half (65504)
        public const uint MinNormal = ((-((1 << (5 - 1)) - 1) + 1) + 127) << 23; // float32 bit pattern of the smallest normal half (2^-14)
        public const uint BiasDiff = unchecked((uint)(((1 << (5 - 1)) - 1) - 127) << 23); // difference in bias between the float16 and float32 exponent
        public const int FracBitsDiff = 23 - 10; // difference in number of fractional bits between float16/float32
        public const uint MaxFiniteBits = (((1 << 5) - 2) << 10) | ((1 << 10) - 1); // half encoding of the largest finite magnitude (0x7BFF)

        public const int ExpBias = 15; // exponent bias of the 16-bit format
        public const int ExpOffset = 10; // bit position of the lowest exponent bit
        public const ushort BiasedExpMax = (1 << 5) - 1; // all-ones exponent field (infinity / NaN)
        public const float SmallestSubnormalAsFloat = 5.96046448e-8f; // 2^-24, the value of one subnormal step
    }

    // Layout constants for the 32-bit single precision format.
    private static class Float32Params
    {
        public const uint AbsValueMask = 0x7FFFFFFF; // ANDing with this value gives the abs value
        public const uint SignBitMask = 0x80000000; // ANDing with this value gives the sign
        public const int Emax = 127; // max value for the exponent
        public const int NumMantissaBits = 23; // 23 bit mantissa on single precision floats
        public const uint MantissaMask = 0x007FFFFF; // 23 bit mantissa on single precision floats

        public const int SignOffset = 31; // bit position of the sign bit
        public const int ExpBias = 127; // exponent bias of the 32-bit format
        public const int ExpOffset = 23; // bit position of the lowest exponent bit
    }
}
38160}
0 commit comments