// -*- Mode: c -*- // /* * A module that allows the reading of performance counters on Pentium. * * This file contains both code that uses the performance counters to * compute the number of cycles per second (to be used during ./configure) * and also code to read the performance counters from Ocaml. * * Author: George Necula (necula@cs.berkeley.edu) */ #include #include #include #if defined(__GNUC__) #define longlong long long // RDTSC puts the result in EAX and EDX. We tell gcc to use those registers // for "low" and "high" #if defined(__i386__) || defined(__x86_64__) #define GETCOUNTER(low,high) \ __asm__ volatile ("rdtsc" : "=a" (low), "=d" (high)); #else #define GETCOUNTER(low,high) \ printf ("Reading of performance counters is supported only on Intel x86\n"); \ exit(1); #endif #else // Microsoft Visual Studio #define longlong __int64 #define inline __inline #define GETCOUNTER(low,high) __asm { \ __asm rdtsc \ __asm mov low, eax \ __asm mov high, edx }; #endif /* Store here the first value read from the performance counter */ unsigned static longlong first_value; /* This is the function that actually reads the performance counter. */ inline unsigned longlong read_ppc(void) { unsigned long pclow, pchigh; unsigned longlong lowhigh; GETCOUNTER(pclow, pchigh); // printf ("Read low=0x%08lx high=0x%08lx\n", low, high); // Put the 64-bit value together lowhigh = ((unsigned longlong)pclow) | ((unsigned longlong)pchigh << 32); if(first_value == 0) { first_value = lowhigh; } return lowhigh - first_value; } /* sm: I want a version that is as fast as possible, dropping * bits that aren't very important to achieve it. * * * This version drops the low 20 bits and the high 14 bits so the * result is 30 bits (always a positive Ocaml int); this yields * megacycles, which for GHz machines will be something like * milliseconds. */ static unsigned long sample_ppc_20(void) { unsigned long pclow, pchigh; GETCOUNTER(pclow, pchigh); return ((pclow >> 20) | (pchigh << 12)) & 0x3FFFFFFF; } /* This version drops the low 10 bits, yielding something like * microseconds. */ inline static unsigned long sample_ppc_10() { unsigned long pclow, pchigh; GETCOUNTER(pclow,pchigh); return ((pclow >> 10) | (pchigh << 22)) & 0x3FFFFFFF; } #ifndef CONFIGURATION_ONLY /*** This is the OCAML stub for the read_ppc ***/ #include #include #include // At configuration time, we estimated the cycles per microsecond as // @CYCLES_PER_USEC@. But if we can get the cycle counts directly from the OS, // do that instead, since it's more reliable and it lets us move programs // to different machines without recompiling. double cycles_per_usec = @CYCLES_PER_USEC@; #if defined(__FreeBSD__) #define CAN_GET_SPEED_FROM_OS // Get the CPU speed from the sysctl machdep.tsc_freq // This only works on FreeBSD #include #include #include double getSpeedFromOS() { unsigned long tscfreq; size_t tflen = sizeof(tscfreq); if (sysctlbyname("machdep.tsc_freq", (void *) &tscfreq, &tflen, NULL, 0) < 0) { perror("sysctl failed"); return @CYCLES_PER_USEC@; } return (double)tscfreq / 1000000; // We care about cycles per microsecond } #elif defined(__CYGWIN__) || defined(__linux__) #define CAN_GET_SPEED_FROM_OS // Get the CPU speed from /proc/cpuinfo #define CPUINFO_KEY "cpu MHz" #include double getSpeedFromOS(){ char buffer[100]; FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); if (!cpuinfo) { printf("Error: could not open /proc/cpuinfo: %s", strerror(errno)); exit(1); } while(fgets(buffer, 100, cpuinfo)) { if (0 == strncasecmp(buffer, CPUINFO_KEY, sizeof(CPUINFO_KEY)-1)) { char* speed = buffer + sizeof(CPUINFO_KEY)-1; double result; while (*speed == ' ' || *speed == '\t' || *speed == ':') { speed++; } result = atof(speed); if (result != 0.0) { return result; } } } // Reading /proc/cpuinfo failed to find "cpu MHz" return @CYCLES_PER_USEC@; } #endif /* Returns false if hardware counters are not availible. */ value has_performance_counters() { // HAS_PERFCOUNT is set by the configuration code at the end of // this file, during ./configure if ((@HAS_PERFCOUNT@) && (cycles_per_usec > 0.0)) { return Val_true; } else { return Val_false; } } /* The Ocaml system can use this function to set cycles_per_usec. Returns false if hardware counters are not availible. */ value reset_performance_counters() { #ifdef CAN_GET_SPEED_FROM_OS cycles_per_usec = getSpeedFromOS(); #endif return has_performance_counters(); } value read_pentium_perfcount() { double counter = (double)read_ppc() / (1000000.0 * cycles_per_usec); return copy_double(counter); } /* sm: interface to above from Ocaml */ value sample_pentium_perfcount_20() { return Val_long(sample_ppc_20()); } value sample_pentium_perfcount_10() { return Val_long(sample_ppc_10()); } #endif /* Now we have a function that tries to compute the number of cycles per * second (to be used during ./configure) */ #ifdef CONFIGURATION_ONLY #include #include #include int main() { struct tms t; clock_t start, finish, diff; unsigned longlong start_pc, finish_pc, diff_pc; long clk_per_sec = sysconf(_SC_CLK_TCK); double cycles_per_usec; if(clk_per_sec <= 0) { printf("Cannot find clk_per_sec (got %ld)\n", clk_per_sec); exit(1); } times(&t); start = t.tms_utime; start_pc = read_ppc(); // Do something for a while { int i; double a = 5.678; for(i=0;i<10000000;i++) { a = (i & 1) ? (a * a) : (sqrt(a)); } } times(&t); finish = t.tms_utime; finish_pc = read_ppc(); diff = finish - start; diff_pc = finish_pc - start_pc; if(diff == 0) { printf("Cannot use Unix.times\n"); exit(1); } if(diff_pc == 0) { printf("Invalid result from the peformance counters\n"); exit(1); } diff_pc /= 1000000; // We care about cycles per microsecond // printf("diff = %ld, diff_pc = %ld, clk = %ld\n", // (long)diff, // (long)diff_pc, (long)clk_per_sec); cycles_per_usec = (((double)diff_pc / (double)diff) * (double)clk_per_sec); /* Whatever value we print here will be used as the CYCLES_PER_USEC * below */ printf("%.3lf\n", cycles_per_usec); exit(0); } #endif //defined CONFIGURATION_ONLY