Do you have experience with the best approach to query each thread to find out various capabilities (SMT, big or little, and any future differences)? Is the CPUID instruction the preferred choice, or OS APIs?
The CPUID instruction provides more detail about processor features. OS APIs ( which depend on the operating system ) are recommended for controlling process or thread affinity and priorities.
Here is a piece of test code I used on Windows ( any version that supports the GetProcessAffinityMask / SetThreadAffinityMask APIs ) to control thread affinity:
...
SYSTEM_INFO si = { 0 };
::GetSystemInfo( &si );
RTBOOL bRc = RTFALSE;
RThandle hProcess = RTnull;
RThandle hThread = RTnull;
RTulong dwProcessMask = 0;
RTulong dwSystemMask = 0;
RTulong dwThreadAM = 0;
RTulong dwThreadAMPrev = 0;
RTulong dwThread1PrefferedCPU = 0;
DWORD dwErrorCode = 0;
hProcess = SysGetCurrentProcess();
hThread = SysGetCurrentThread();
bRc = ::GetProcessAffinityMask( hProcess, ( PDWORD_PTR )&dwProcessMask, ( PDWORD_PTR )&dwSystemMask );
RTint iCpuNum = ( 8 - 1 );
RTint iThreadAffinityMask = _RUN_ON_CPU_08; // Default Logical CPU 07 at the beginning of Verification
// Take into account that Logical CPUs are numbered from 0
dwThreadAMPrev = ::SetThreadAffinityMask( hThread, iThreadAffinityMask );
SysSleep( 0 );
dwErrorCode = SysGetLastError();
CrtPrintf( RTU("\t\tSwitched to Logical CPU%d - Previous Thread AM: %3d - Error Code: %3d\n"),
iCpuNum,
dwThreadAMPrev, dwErrorCode );
for( RTuint i = 0; i < ( ( RTuint )16777216 * 128 ); i += 1 )
{
volatile RTfloat fX = 32.0f;
fX = ( RTfloat )i * ( fX * 2 ) * ( fX * 4 ) * ( fX * 8 );
}
SysSleep( 5000 );
iCpuNum = 0;
for( iThreadAffinityMask = 1; iThreadAffinityMask < 256; iThreadAffinityMask *= 2 )
{
iCpuNum++;
dwThreadAMPrev = ::SetThreadAffinityMask( hThread, iThreadAffinityMask );
SysSleep( 0 );
dwErrorCode = SysGetLastError();
CrtPrintf( RTU("\t\tSwitched to Logical CPU%d - Previous Thread AM: %3d - Error Code: %3d - Thread Affinity: %3d\n"),
( iCpuNum - 1 ),
dwThreadAMPrev, dwErrorCode, iThreadAffinityMask );
for( RTuint i = 0; i < ( ( RTuint )16777216 * 128 ); i += 1 )
{
volatile RTfloat fX = 32.0f;
fX = ( RTfloat )i * ( fX * 2 ) * ( fX * 4 ) * ( fX * 8 );
}
SysSleep( 5000 );
}
...
Do you have experience with the best approach to query each thread to find out various capabilities (SMT, big or little, and any future differences)? Is the CPUID instruction the preferred choice, or OS APIs?
It is very easy to get the Time Stamp Counter value for a logical CPU using the RDTSC instruction:
...
// Test-Case 3 - Retrieving RDTSC values for Logical CPUs
// Pins the current thread to _RUN_ON_CPU_01, reads the time stamp counter with
// __rdtsc(), then pins it to _RUN_ON_CPU_02 and reads the counter again. Both
// readings, their difference and the affinity masks returned by
// SetThreadAffinityMask are printed. Process and thread priority are changed
// for the duration of the measurement and set back to the normal values after.
{
    CrtPrintf( RTU("\n\tTest-Case 3 - Retrieving RDTSC values for Logical CPUs - 1\n") );
    RTBOOL bRc = RTFALSE;
    RThandle hProcess = RTnull;
    RThandle hThread = RTnull;
    // GetProcessAffinityMask writes DWORD_PTR-sized values; using that exact
    // type avoids pointer casts and any out-of-bounds write on 64-bit builds.
    DWORD_PTR dwProcessMask = 0;
    DWORD_PTR dwSystemMask = 0;
    RTulong dwThreadAMPrev1 = 0;    // Mask in effect before switching to Logical CPU1
    RTulong dwThreadAMPrev2 = 0;    // Mask in effect before switching to Logical CPU2
    ClockV cvRdtscCPU1 = { 0 };     // RDTSC Value for Logical CPU1
    ClockV cvRdtscCPU2 = { 0 };     // RDTSC Value for Logical CPU2
    // Single-pass loop: 'break' is used as a structured early exit on errors.
    while( RTtrue )
    {
        hProcess = SysGetCurrentProcess();
        hThread = SysGetCurrentThread();
        bRc = ::GetProcessAffinityMask( hProcess, &dwProcessMask, &dwSystemMask );
        if( bRc == RTFALSE )
        {
            CrtPrintf( RTU("\t\tError: [ GetProcessAffinityMask ] failed\n") );
            break;
        }
        bRc = SysSetPriorityClass( hProcess, REALTIME_PRIORITY_CLASS );
        if( bRc == RTFALSE )
        {
            CrtPrintf( RTU("\t\tError: [ SetPriorityClass ] failed\n") );
            break;
        }
        bRc = SysSetThreadPriority( hThread, THREAD_PRIORITY_TIME_CRITICAL );
        if( bRc == RTFALSE )
        {
            CrtPrintf( RTU("\t\tError: [ SetThreadPriority ] failed\n") );
            // The priority class was already changed above; do not leave the
            // process in REALTIME_PRIORITY_CLASS when bailing out.
            SysSetPriorityClass( hProcess, NORMAL_PRIORITY_CLASS );
            break;
        }
        // First sample: pin to Logical CPU1, then read the counter.
        dwThreadAMPrev1 = ::SetThreadAffinityMask( hThread, _RUN_ON_CPU_01 );
        SysSleep( 0 );  // presumably lets the scheduler migrate the thread - TODO confirm
        cvRdtscCPU1.uiClockV = __rdtsc();
        // Second sample: pin to Logical CPU2. The result goes to dwThreadAMPrev2
        // so that the "Processing Error if 0" report below reflects this call.
        dwThreadAMPrev2 = ::SetThreadAffinityMask( hThread, _RUN_ON_CPU_02 );
        SysSleep( 0 );
        cvRdtscCPU2.uiClockV = __rdtsc();
        SysSetPriorityClass( hProcess, NORMAL_PRIORITY_CLASS );
        SysSetThreadPriority( hThread, THREAD_PRIORITY_NORMAL );
        // NOTE(review): the counter values are converted to RTfloat for printing;
        // if RTfloat is a 32-bit float the low-order digits are lost - confirm.
        CrtPrintf( RTU("\t\tRDTSC for Logical CPU1 : %.0f\n"), ( RTfloat )cvRdtscCPU1.uiClockV );
        CrtPrintf( RTU("\t\tRDTSC for Logical CPU2 : %.0f\n"), ( RTfloat )cvRdtscCPU2.uiClockV );
        CrtPrintf( RTU("\t\tRDTSC Difference: %.0f ( RDTSC2 - RDTSC1 )\n"),
            ( RTfloat )( cvRdtscCPU2.uiClockV - cvRdtscCPU1.uiClockV ) );
        CrtPrintf( RTU("\t\tdwThreadAMPrev1 : %3d ( Processing Error if 0 )\n"), dwThreadAMPrev1 );
        CrtPrintf( RTU("\t\tdwThreadAMPrev2 : %3d ( Processing Error if 0 )\n"), dwThreadAMPrev2 );
        break;
    }
}
...