@@ -60,3 +60,252 @@ const char* ffCPUQualcommCodeToName(uint32_t code)
6060 default : return NULL ;
6161 }
6262}
63+
64+ #if defined(__x86_64__ ) || defined(__i386__ )
65+
66+ #include <cpuid.h>
67+
68+ void ffCPUDetectByCpuid (FFCPUResult * cpu )
69+ {
70+ uint32_t eax = 0 , ebx = 0 , ecx = 0 , edx = 0 ;
71+ if (__get_cpuid (0x16 , & eax , & ebx , & ecx , & edx ))
72+ {
73+ // WARNING: CPUID may report frequencies of efficient cores
74+ // cpuid returns 0 MHz when hypervisor is enabled
75+ if (eax ) cpu -> frequencyBase = eax ;
76+ if (ebx ) cpu -> frequencyMax = ebx ;
77+ }
78+
79+ if (__get_cpuid (1 , & eax , & ebx , & ecx , & edx ))
80+ {
81+ // Feature tests (leaf1.ecx, leaf7.ebx)
82+ bool sse2 = (ecx & bit_SSE2 ) != 0 ;
83+ bool sse4_2 = (ecx & bit_SSE4_2 ) != 0 ;
84+ bool pclmul = (ecx & bit_PCLMUL ) != 0 ;
85+ bool popcnt = (ecx & bit_POPCNT ) != 0 ;
86+ bool fma = (ecx & bit_FMA ) != 0 ;
87+ bool osxsave = (ecx & bit_OSXSAVE ) != 0 ;
88+
89+ unsigned int eax7 = 0 , ebx7 = 0 , ecx7 = 0 , edx7 = 0 ;
90+ __get_cpuid_count (7 , 0 , & eax7 , & ebx7 , & ecx7 , & edx7 );
91+
92+ bool avx2 = (ebx7 & bit_AVX2 ) != 0 ;
93+ bool bmi2 = (ebx7 & bit_BMI2 ) != 0 ;
94+ bool avx512f = (ebx7 & bit_AVX512F ) != 0 ;
95+ bool avx512bw = (ebx7 & bit_AVX512BW ) != 0 ;
96+ bool avx512dq = (ebx7 & bit_AVX512DQ ) != 0 ;
97+
98+ // OS support for AVX/AVX512: check XGETBV (requires OSXSAVE)
99+ bool avx_os = false;
100+ bool avx512_os = false;
101+ if (osxsave )
102+ {
103+ __asm__ __volatile__(
104+ "xgetbv"
105+ : "=a" (eax ), "=d" (edx )
106+ : "c" (0 )
107+ :
108+ );
109+ uint64_t xcr0 = ((uint64_t )edx << 32 ) | eax ;
110+
111+ // AVX requires XCR0[1:2] == 11b (XMM and YMM state)
112+ avx_os = (xcr0 & 0x6ULL ) == 0x6ULL ;
113+ // AVX512 requires XCR0[7,5,6] etc. common mask 0xE6 (bits 1,2,5,6,7)
114+ avx512_os = (xcr0 & 0xE6ULL ) == 0xE6ULL ;
115+ }
116+
117+ cpu -> march = "unknown" ;
118+ if (avx512f && avx512bw && avx512dq && avx512_os ) cpu -> march = "x86_64-v4" ;
119+ else if (avx2 && fma && bmi2 && avx_os ) cpu -> march = "x86_64-v3" ;
120+ else if (sse4_2 && popcnt && pclmul ) cpu -> march = "x86_64-v2" ;
121+ else if (sse2 ) cpu -> march = "x86_64-v1" ;
122+ }
123+ }
124+
125+ #elif defined(__aarch64__ )
126+
127+ // This is not accurate because a lot of flags are optional from old versions
128+ // https://developer.arm.com/documentation/109697/2025_06/Feature-descriptions?lang=en
129+ // https://en.wikipedia.org/wiki/AArch64#ARM-A_(application_architecture)
130+ // Worth noting: Apple M1 is marked as ARMv8.5-A on Wikipedia, but it lacks BTI (mandatory in v8.5)
131+
132+ #ifdef __linux__
133+ #include "common/io/io.h"
134+ #include <elf.h>
135+ #include <asm/hwcap.h>
136+
// Fallback definitions for hardware-capability bits, used only when the
// headers included above do not already provide them. The values are the bit
// positions assigned by the Linux arm64 uapi header <asm/hwcap.h>.

// Bits tested against the AT_HWCAP word
#ifndef HWCAP_CMPBR
    #define HWCAP_CMPBR (1UL << 33)
#endif
#ifndef HWCAP_FPRCVT
    #define HWCAP_FPRCVT (1UL << 34)
#endif

// Bits tested against the AT_HWCAP2 word
#ifndef HWCAP2_SME
    #define HWCAP2_SME (1UL << 23)
#endif
#ifndef HWCAP2_CSSC
    #define HWCAP2_CSSC (1UL << 34)
#endif
#ifndef HWCAP2_SME2
    #define HWCAP2_SME2 (1UL << 37)
#endif
#ifndef HWCAP2_SME2P1
    #define HWCAP2_SME2P1 (1UL << 38)
#endif
#ifndef HWCAP2_MOPS
    #define HWCAP2_MOPS (1UL << 43)
#endif
#ifndef HWCAP2_F8E4M3
    #define HWCAP2_F8E4M3 (1UL << 55)
#endif
#ifndef HWCAP2_F8E5M2
    #define HWCAP2_F8E5M2 (1UL << 56)
#endif
164+
165+ void ffCPUDetectByCpuid (FFCPUResult * cpu )
166+ {
167+ char buf [PROC_FILE_BUFFSIZ ];
168+ ssize_t nRead = ffReadFileData ("/proc/self/auxv" , ARRAY_SIZE (buf ), buf );
169+
170+ if (nRead < (ssize_t ) sizeof (Elf64_auxv_t )) return ;
171+
172+ uint64_t hwcap = 0 , hwcap2 = 0 ;
173+
174+ for (Elf64_auxv_t * auxv = (Elf64_auxv_t * )buf ; (char * )auxv < buf + nRead ; ++ auxv )
175+ {
176+ if (auxv -> a_type == AT_HWCAP )
177+ {
178+ hwcap = auxv -> a_un .a_val ;
179+ }
180+ else if (auxv -> a_type == AT_HWCAP2 )
181+ {
182+ hwcap2 = auxv -> a_un .a_val ;
183+ }
184+ }
185+
186+ if (!hwcap ) return ;
187+
188+ cpu -> march = "unknown" ;
189+
190+ // ARMv8-A
191+ bool has_fp = (hwcap & HWCAP_FP ) != 0 ;
192+ bool has_asimd = (hwcap & HWCAP_ASIMD ) != 0 ;
193+
194+ // ARMv8.1-A
195+ bool has_atomics = (hwcap & HWCAP_ATOMICS ) != 0 ; // optional from v8.0
196+ bool has_crc32 = (hwcap & HWCAP_CRC32 ) != 0 ; // optional from v8.0
197+ bool has_asimdrdm = (hwcap & HWCAP_ASIMDRDM ) != 0 ; // optional from v8.0
198+
199+ // ARMv8.2-A
200+ bool has_fphp = (hwcap & HWCAP_FPHP ) != 0 ; // optional
201+ bool has_dcpop = (hwcap & HWCAP_DCPOP ) != 0 ; // DC CVAP, optional from v8.1
202+
203+ // ARMv8.3-A
204+ bool has_paca = (hwcap & HWCAP_PACA ) != 0 ; // optional from v8.2
205+ bool has_lrcpc = (hwcap & HWCAP_LRCPC ) != 0 ; // optional from v8.2
206+ bool has_fcma = (hwcap & HWCAP_FCMA ) != 0 ; // optional from v8.2
207+ bool has_jscvt = (hwcap & HWCAP_JSCVT ) != 0 ; // optional from v8.2
208+
209+ // ARMv8.4-A
210+ bool has_dit = (hwcap & HWCAP_DIT ) != 0 ; // optional from v8.3
211+ bool has_flagm = (hwcap & HWCAP_FLAGM ) != 0 ; // optional from v8.1
212+ bool has_ilrcpc = (hwcap & HWCAP_ILRCPC ) != 0 ; // optional from v8.2
213+
214+ // ARMv8.5-A
215+ bool has_bti = (hwcap2 & HWCAP2_BTI ) != 0 ; // optional from v8.4
216+ bool has_sb = (hwcap & HWCAP_SB ) != 0 ; // optional from v8.0
217+ bool has_dcpodp = (hwcap2 & HWCAP2_DCPODP ) != 0 ; // optional from v8.1
218+ bool has_flagm2 = (hwcap2 & HWCAP2_FLAGM2 ) != 0 ; // optional from v8.4
219+ bool has_frint = (hwcap2 & HWCAP2_FRINT ) != 0 ; // optional from v8.4
220+
221+ // ARMv9.0-A
222+ bool has_sve2 = (hwcap2 & HWCAP2_SVE2 ) != 0 ;
223+
224+ // ARMv9.1-A
225+ // ARMv8.6-A
226+ bool has_bf16 = (hwcap2 & HWCAP2_BF16 ) != 0 ; // optional from v8.2
227+ bool has_i8mm = (hwcap2 & HWCAP2_I8MM ) != 0 ; // optional from v8.1
228+
229+ // ARMv8.7-A
230+ bool has_afp = (hwcap2 & HWCAP2_AFP ) != 0 ; // optional from v8.6
231+
232+ // ARMv9.2-A
233+ bool has_sme = (hwcap2 & HWCAP2_SME ) != 0 ;
234+
235+ // ARMv9.3-A
236+ bool has_sme2 = (hwcap2 & HWCAP2_SME2 ) != 0 ; // optional from v9.2
237+
238+ // ARMv8.8-A
239+ bool has_mops = (hwcap2 & HWCAP2_MOPS ) != 0 ; // optional from v8.7
240+
241+ // ARMv8.9-A
242+ bool has_cssc = (hwcap2 & HWCAP2_CSSC ) != 0 ; // optional from v8.7
243+
244+ // ARMv9.4-A
245+ bool has_sme2p1 = (hwcap2 & HWCAP2_SME2P1 ) != 0 ; // optional from v9.2
246+
247+ // ARMv9.5-A
248+ bool has_f8e4m3 = (hwcap2 & HWCAP2_F8E4M3 ) != 0 ; // optional from v9.2
249+ bool has_f8e5m2 = (hwcap2 & HWCAP2_F8E5M2 ) != 0 ; // optional from v9.2
250+
251+ // ARMv9.6-A
252+ bool has_cmpbr = (hwcap & HWCAP_CMPBR ) != 0 ; // optional from v9.5
253+ bool has_fprcvt = (hwcap & HWCAP_FPRCVT ) != 0 ; // optional from v9.5
254+
255+ if (has_sve2 || has_sme ) {
256+ // ARMv9
257+ if (has_cmpbr && has_fprcvt ) {
258+ cpu -> march = "ARMv9.6-A" ;
259+ } else if (has_f8e5m2 && has_f8e4m3 ) {
260+ cpu -> march = "ARMv9.5-A" ;
261+ } else if (has_sme2p1 ) {
262+ cpu -> march = "ARMv9.4-A" ;
263+ } else if (has_sme2 ) {
264+ cpu -> march = "ARMv9.3-A" ;
265+ } else if (has_sme ) {
266+ cpu -> march = "ARMv9.2-A" ;
267+ } else if (has_i8mm && has_bf16 ) {
268+ cpu -> march = "ARMv9.1-A" ;
269+ } else {
270+ cpu -> march = "ARMv9.0-A" ;
271+ }
272+ } else {
273+ // ARMv8
274+ if (has_cssc ) {
275+ cpu -> march = "ARMv8.9-A" ;
276+ } else if (has_mops ) {
277+ cpu -> march = "ARMv8.8-A" ;
278+ } else if (has_afp ) {
279+ cpu -> march = "ARMv8.7-A" ;
280+ } else if (has_i8mm && has_bf16 ) {
281+ cpu -> march = "ARMv8.6-A" ;
282+ } else if (has_bti && has_sb && has_dcpodp && has_flagm2 && has_frint ) {
283+ cpu -> march = "ARMv8.5-A" ;
284+ } else if (has_dit && has_flagm && has_ilrcpc ) {
285+ cpu -> march = "ARMv8.4-A" ;
286+ } else if (has_paca && has_lrcpc && has_fcma && has_jscvt ) {
287+ cpu -> march = "ARMv8.3-A" ;
288+ } else if (has_fphp && has_dcpop ) {
289+ cpu -> march = "ARMv8.2-A" ;
290+ } else if (has_atomics && has_crc32 && has_asimdrdm ) {
291+ cpu -> march = "ARMv8.1-A" ;
292+ } else if (has_asimd && has_fp ) {
293+ cpu -> march = "ARMv8-A" ;
294+ }
295+ }
296+ }
297+ #else
298+ void ffCPUDetectByCpuid (FF_MAYBE_UNUSED FFCPUResult * cpu )
299+ {
300+ // Unsupported platform
301+ }
302+ #endif // __linux__
303+
304+ #else
305+
306+ void ffCPUDetectByCpuid (FF_MAYBE_UNUSED FFCPUResult * cpu )
307+ {
308+ // Unsupported platform
309+ }
310+
311+ #endif
0 commit comments