hwcdrv.c 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454
  1. /* Copyright (C) 2021 Free Software Foundation, Inc.
  2. Contributed by Oracle.
  3. This file is part of GNU Binutils.
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 3, or (at your option)
  7. any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, 51 Franklin Street - Fifth Floor, Boston,
  15. MA 02110-1301, USA. */
  16. #include <errno.h>
  17. #include <unistd.h>
  18. #include <fcntl.h>
  19. #include <sys/mman.h>
  20. #include <sys/ioctl.h>
  21. #include <sys/syscall.h>
  22. #include <linux/perf_event.h>
  23. #include "hwcdrv.h"
  24. /*---------------------------------------------------------------------------*/
  25. /* macros */
  /* IS_GLOBAL expands to nothing; it only tags definitions that are meant
     to be visible outside this file. */
  26. #define IS_GLOBAL /* Mark global symbols */
  27. #include "cpuid.c" /* ftns for identifying a chip */
  /* Declared ahead of the table below so their addresses can be taken.
     NOTE(review): presumably initialized by the *_pcbe.c files included
     further down -- confirm. */
  28. static hdrv_pcbe_api_t hdrv_pcbe_core_api;
  29. static hdrv_pcbe_api_t hdrv_pcbe_opteron_api;
  /* NULL-terminated list of the CPU-specific PCBE back ends. */
  30. static hdrv_pcbe_api_t *hdrv_pcbe_drivers[] = {
  31. &hdrv_pcbe_core_api,
  32. &hdrv_pcbe_opteron_api,
  33. NULL
  34. };
  35. #include "opteron_pcbe.c" /* CPU-specific code */
  36. #include "core_pcbe.c" /* CPU-specific code */
  /* hwcdrv_pcl_api is only declared here; its definition is not in this
     part of the file. */
  37. extern hwcdrv_api_t hwcdrv_pcl_api;
  /* NULL-terminated list of available hwcdrv drivers. */
  38. IS_GLOBAL hwcdrv_api_t *hwcdrv_drivers[] = {
  39. &hwcdrv_pcl_api,
  40. NULL
  41. };
  42. /*---------------------------------------------------------------------------*/
  43. /* utils for drivers */
  44. IS_GLOBAL int
  45. hwcdrv_assign_all_regnos (Hwcentry* entries[], unsigned numctrs)
  46. {
  47. unsigned int pmc_assigned[MAX_PICS];
  48. unsigned idx;
  49. for (int ii = 0; ii < MAX_PICS; ii++)
  50. pmc_assigned[ii] = 0;
  51. /* assign the HWCs that we already know about */
  52. for (idx = 0; idx < numctrs; idx++)
  53. {
  54. regno_t regno = entries[idx]->reg_num;
  55. if (regno == REGNO_ANY)
  56. {
  57. /* check to see if list of possible registers only contains one entry */
  58. regno = REG_LIST_SINGLE_VALID_ENTRY (entries[idx]->reg_list);
  59. }
  60. if (regno != REGNO_ANY)
  61. {
  62. if (regno < 0 || regno >= MAX_PICS || !regno_is_valid (entries[idx], regno))
  63. {
  64. logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
  65. return HWCFUNCS_ERROR_HWCARGS;
  66. }
  67. TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): preselected: idx=%d, regno=%d\n", idx, regno);
  68. entries[idx]->reg_num = regno; /* assigning back to entries */
  69. pmc_assigned[regno] = 1;
  70. }
  71. }
  72. /* assign HWCs that are currently REGNO_ANY */
  73. for (idx = 0; idx < numctrs; idx++)
  74. {
  75. if (entries[idx]->reg_num == REGNO_ANY)
  76. {
  77. int assigned = 0;
  78. regno_t *reg_list = entries[idx]->reg_list;
  79. for (; reg_list && *reg_list != REGNO_ANY; reg_list++)
  80. {
  81. regno_t regno = *reg_list;
  82. if (regno < 0 || regno >= MAX_PICS)
  83. {
  84. logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
  85. return HWCFUNCS_ERROR_HWCARGS;
  86. }
  87. if (pmc_assigned[regno] == 0)
  88. {
  89. TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): assigned: idx=%d, regno=%d\n", idx, regno);
  90. entries[idx]->reg_num = regno; /* assigning back to entries */
  91. pmc_assigned[regno] = 1;
  92. assigned = 1;
  93. break;
  94. }
  95. }
  96. if (!assigned)
  97. {
  98. logerr (GTXT ("Counter '%s' could not be bound to a register\n"),
  99. entries[idx]->name ? entries[idx]->name : "<NULL>");
  100. return HWCFUNCS_ERROR_HWCARGS;
  101. }
  102. }
  103. }
  104. return 0;
  105. }
  106. IS_GLOBAL int
  107. hwcdrv_lookup_cpuver (const char * cpcN_cciname)
  108. {
  109. libcpc2_cpu_lookup_t *plookup;
  110. static libcpc2_cpu_lookup_t cpu_table[] = {
  111. LIBCPC2_CPU_LOOKUP_LIST
  112. };
  113. if (cpcN_cciname == NULL)
  114. return CPUVER_UNDEFINED;
  115. /* search table for name */
  116. for (plookup = cpu_table; plookup->cpc2_cciname; plookup++)
  117. {
  118. int n = strlen (plookup->cpc2_cciname);
  119. if (!strncmp (plookup->cpc2_cciname, cpcN_cciname, n))
  120. return plookup->cpc2_cpuver;
  121. }
  122. /* unknown, but does have a descriptive string */
  123. TprintfT (DBG_LT0, "hwcfuncs: CPC2: WARNING: Id of processor '%s' "
  124. "could not be determined\n",
  125. cpcN_cciname);
  126. return CPUVER_GENERIC;
  127. }
  128. /*---------------------------------------------------------------------------*/
  129. /* utils to generate x86 register definitions on Linux */
  130. /*
  131. * This code is structured as though we're going to initialize the
  132. * HWC by writing the Intel MSR register directly. That is, we
  133. * assume the lowest 16 bits of the event number will have the event
  134. * and that higher bits will set attributes.
  135. *
  136. * While SPARC is different, we can nonetheless use basically the
  137. * same "x86"-named functions:
  138. *
  139. * - The event code will still be 16 bits. It will still
  140. * be in the lowest 16 bits of the event number. Though
  141. * perf_event_code() on SPARC will expect those bits to be
  142. * shifted, hwcdrv_pcl.c can easily perform that shift.
  143. *
  144. * - On SPARC we support only two attributes, "user" and "system",
  145. * which hwcdrv_pcl.c already converts to the "exclude_user"
  146. * and "exclude_kernel" fields expected by perf_event_open().
  147. * "user" and "system" are stored in event bits 16 and 17.
  148. * For M8, a 4-bit mask of supported PICs is stored in bits [23:20].
  149. */
  /* Optional name-to-event lookup hook; starts out null.  When it is set,
     myperfctr_get_x86_eventnum() tries it before falling back to parsing
     the event name as a number. */
  150. IS_GLOBAL hwcdrv_get_eventnum_fn_t *hwcdrv_get_x86_eventnum = 0;
  /* Attribute tables.  Each row appears to be {name, is_inverted, mask,
     shift} (matching the attrname/is_inverted/mask/shift fields read by
     set_x86_attr_bits()); a row with a NULL name ends the table. */
  151. static const attr_info_t perfctr_sparc_attrs[] = {
  152. {NTXT ("user"), 0, 0x01, 16}, //usr
  153. {NTXT ("system"), 0, 0x01, 17}, //os
  154. {NULL, 0, 0x00, 0},
  155. };
  156. static const attr_info_t perfctr_x64_attrs[] = {/* ok for Core2 & later */
  157. {NTXT ("umask"), 0, 0xff, 8},
  158. {NTXT ("user"), 0, 0x01, 16}, //usr
  159. //{NTXT("nouser"), 1, 0x01, 16}, //usr (inverted)
  160. {NTXT ("system"), 0, 0x01, 17}, //os
  161. {NTXT ("edge"), 0, 0x01, 18},
  162. {NTXT ("pc"), 0, 0x01, 19},
  163. {NTXT ("inv"), 0, 0x01, 23},
  164. {NTXT ("cmask"), 0, 0xff, 24},
  165. {NULL, 0, 0x00, 0},
  166. };
  /* Table consulted by set_x86_attr_bits(); defaults to the x64 table.
     perfctr_sparc_attrs is not referenced in the visible part of the file. */
  167. const attr_info_t *perfctr_attrs_table = perfctr_x64_attrs;
  /* Starting value for the attribute bits of an event selector: only the
     usr (16), int (20) and enable (22) bits are set; the commented-out
     rows list the remaining fields for reference. */
  168. static const eventsel_t perfctr_evntsel_enable_bits = (0x01 << 16) | /* usr */
  169. // (0xff << 0) | /* event*/
  170. // (0xff << 8) | /* umask */
  171. // (0x01 << 17) | /* os */
  172. // (0x01 << 18) | /* edge */
  173. // (0x01 << 19) | /* pc */
  174. (0x01 << 20) | /* int */
  175. // (0x01 << 21) | /* reserved */
  176. (0x01 << 22) | /* enable */
  177. // (0x01 << 23) | /* inv */
  178. // (0xff << 24) | /* cmask */
  179. 0;
  180. static int
  181. myperfctr_get_x86_eventnum (const char *eventname, uint_t pmc,
  182. eventsel_t *eventsel, eventsel_t *valid_umask,
  183. uint_t *pmc_sel)
  184. {
  185. if (hwcdrv_get_x86_eventnum &&
  186. !hwcdrv_get_x86_eventnum (eventname, pmc, eventsel, valid_umask, pmc_sel))
  187. return 0;
  188. /* check for numerically-specified counters */
  189. char * endptr;
  190. uint64_t num = strtoull (eventname, &endptr, 0);
  191. if (*eventname && !*endptr)
  192. {
  193. *eventsel = EXTENDED_EVNUM_2_EVSEL (num);
  194. *valid_umask = 0xff; /* allow any umask (unused for SPARC?) */
  195. *pmc_sel = pmc;
  196. return 0;
  197. }
  198. /* name does not specify a numeric value */
  199. *eventsel = (eventsel_t) - 1;
  200. *valid_umask = 0x0;
  201. *pmc_sel = pmc;
  202. return -1;
  203. }
  204. static int
  205. mask_shift_set (eventsel_t *presult, eventsel_t invalue,
  206. eventsel_t mask, eventsel_t shift)
  207. {
  208. if (invalue & ~mask)
  209. return -1; /* invalue attempts to set bits outside of mask */
  210. *presult &= ~(mask << shift); /* clear all the mask bits */
  211. *presult |= (invalue << shift); /* set bits according to invalue */
  212. return 0;
  213. }
  214. static int
  215. set_x86_attr_bits (eventsel_t *result_mask, eventsel_t evnt_valid_umask,
  216. hwcfuncs_attr_t attrs[], int nattrs, const char*nameOnly)
  217. {
  218. eventsel_t evntsel = *result_mask;
  219. for (int ii = 0; ii < (int) nattrs; ii++)
  220. {
  221. const char *attrname = attrs[ii].ca_name;
  222. eventsel_t attrval = (eventsel_t) attrs[ii].ca_val;
  223. const char *tmpname;
  224. int attr_found = 0;
  225. for (int jj = 0; (tmpname = perfctr_attrs_table[jj].attrname); jj++)
  226. {
  227. if (strcmp (attrname, tmpname) == 0)
  228. {
  229. if (strcmp (attrname, "umask") == 0)
  230. {
  231. if (attrval & ~evnt_valid_umask)
  232. {
  233. logerr (GTXT ("for `%s', allowable umask bits are: 0x%llx\n"),
  234. nameOnly, (long long) evnt_valid_umask);
  235. return -1;
  236. }
  237. }
  238. if (mask_shift_set (&evntsel,
  239. perfctr_attrs_table[jj].is_inverted ? (attrval^1) : attrval,
  240. perfctr_attrs_table[jj].mask,
  241. perfctr_attrs_table[jj].shift))
  242. {
  243. logerr (GTXT ("`%s' attribute `%s' could not be set to 0x%llx\n"),
  244. nameOnly, attrname, (long long) attrval);
  245. return -1;
  246. }
  247. TprintfT (DBG_LT2, "hwcfuncs: Counter %s, attribute %s set to 0x%llx\n",
  248. nameOnly, attrname, (long long) attrval);
  249. attr_found = 1;
  250. break;
  251. }
  252. }
  253. if (!attr_found)
  254. {
  255. logerr (GTXT ("attribute `%s' is invalid\n"), attrname);
  256. return -1;
  257. }
  258. }
  259. *result_mask = evntsel;
  260. return 0;
  261. }
  262. IS_GLOBAL int
  263. hwcfuncs_get_x86_eventsel (unsigned int regno, const char *int_name,
  264. eventsel_t *return_event, uint_t *return_pmc_sel)
  265. {
  266. hwcfuncs_attr_t attrs[HWCFUNCS_MAX_ATTRS + 1];
  267. unsigned nattrs = 0;
  268. char *nameOnly = NULL;
  269. eventsel_t evntsel = 0; // event number
  270. eventsel_t evnt_valid_umask = 0;
  271. uint_t pmc_sel = 0;
  272. int rc = -1;
  273. *return_event = 0;
  274. *return_pmc_sel = 0;
  275. void *attr_mem = hwcfuncs_parse_attrs (int_name, attrs, HWCFUNCS_MAX_ATTRS,
  276. &nattrs, NULL);
  277. if (!attr_mem)
  278. {
  279. logerr (GTXT ("out of memory, could not parse attributes\n"));
  280. return -1;
  281. }
  282. hwcfuncs_parse_ctr (int_name, NULL, &nameOnly, NULL, NULL, NULL);
  283. if (regno == REGNO_ANY)
  284. {
  285. logerr (GTXT ("reg# could not be determined for `%s'\n"), nameOnly);
  286. goto attr_wrapup;
  287. }
  288. /* look up evntsel */
  289. if (myperfctr_get_x86_eventnum (nameOnly, regno,
  290. &evntsel, &evnt_valid_umask, &pmc_sel))
  291. {
  292. logerr (GTXT ("counter `%s' is not valid\n"), nameOnly);
  293. goto attr_wrapup;
  294. }
  295. TprintfT (DBG_LT1, "hwcfuncs: event=0x%llx pmc=0x%x '%s' nattrs = %u\n",
  296. (long long) evntsel, pmc_sel, nameOnly, nattrs);
  297. /* determine event attributes */
  298. eventsel_t evnt_attrs = perfctr_evntsel_enable_bits;
  299. if (set_x86_attr_bits (&evnt_attrs, evnt_valid_umask, attrs, nattrs, nameOnly))
  300. goto attr_wrapup;
  301. if (evntsel & evnt_attrs)
  302. TprintfT (DBG_LT0, "hwcfuncs: ERROR - evntsel & enable bits overlap: 0x%llx 0x%llx 0x%llx\n",
  303. (long long) evntsel, (long long) evnt_attrs,
  304. (long long) (evntsel & evnt_attrs));
  305. *return_event = evntsel | evnt_attrs;
  306. *return_pmc_sel = pmc_sel;
  307. rc = 0;
  308. attr_wrapup:
  309. free (attr_mem);
  310. free (nameOnly);
  311. return rc;
  312. }
  /* Instruction and clobber list for issuing a system call from inline
     assembly; used by the asm variant of hwcdrv_gettid() below.  Neither
     macro is defined when the target is not x86_64 or i386. */
  313. #ifdef __x86_64__
  314. #define syscall_instr "syscall"
  315. #define syscall_clobber "rcx", "r11", "memory"
  316. #endif
  317. #ifdef __i386__
  318. #define syscall_instr "int $0x80"
  319. #define syscall_clobber "memory"
  320. #endif
  321. static inline int
  322. perf_event_open (struct perf_event_attr *hw_event_uptr, pid_t pid,
  323. int cpu, int group_fd, unsigned long flags)
  324. {
  325. /* It seems that perf_event_open() sometimes fails spuriously,
  326. * even while an immediate retry succeeds.
  327. * So, let's try a few retries if the call fails just to be sure.
  328. */
  329. int rc;
  330. for (int retry = 0; retry < 5; retry++)
  331. {
  332. rc = syscall (__NR_perf_event_open, hw_event_uptr, pid, cpu, group_fd, flags);
  333. if (rc != -1)
  334. return rc;
  335. }
  336. return rc;
  337. }
  338. /*---------------------------------------------------------------------------*/
  339. /* macros & fwd prototypes */
  340. #define HWCDRV_API static /* Mark functions used by hwcdrv API */
  341. HWCDRV_API int hwcdrv_start (void);
  342. HWCDRV_API int hwcdrv_free_counters ();
  /* Return the kernel thread id of the calling thread (gettid system call).
     Outside LIBCOLLECTOR_SRC builds, and on non-"intel" targets, this goes
     through syscall(); in LIBCOLLECTOR_SRC builds with `intel' defined the
     system call is issued directly with inline assembly, passing
     __NR_gettid in and reading the result back through the "a" register
     constraint. */
  343. static pid_t
  344. hwcdrv_gettid (void)
  345. {
  346. #ifndef LIBCOLLECTOR_SRC
  347. return syscall (__NR_gettid);
  348. #elif defined(intel)
  349. pid_t r;
  350. __asm__ __volatile__(syscall_instr
  351. : "=a" (r) : "0" (__NR_gettid)
  352. : syscall_clobber);
  353. return r;
  354. #else
  355. return syscall (__NR_gettid); // FIXUP_XXX_SPARC_LINUX // write gettid in asm
  356. #endif
  357. }
  358. /*---------------------------------------------------------------------------*/
  359. /* types */
  /* The sample ring buffer is NPAGES_PER_BUF data pages; start_one_ctr()
     maps one extra page in front of them for the perf_event metadata. */
  360. #define NPAGES_PER_BUF 1 // number of pages to be used for perf_event samples
  361. // must be a power of 2
  362. /*---------------------------------------------------------------------------*/
  363. /* typedefs */
  364. typedef struct
  365. { // event (hwc) definition
  366. unsigned int reg_num; // PMC assignment, potentially for detecting conflicts
  367. eventsel_t eventsel; // raw event bits (Intel/AMD)
  368. uint64_t counter_preload; // number of HWC events before signal
  369. struct perf_event_attr hw; // perf_event definition
  370. hrtime_t min_time; // minimum time we're targeting between events
  371. char *name;
  372. } perf_event_def_t;
  373. typedef struct
  374. { // runtime state of perf_event buffer
  375. void *buf; // pointer to mmapped buffer
  376. size_t pagesz; // size of pages
  377. } buffer_state_t;
  /* Values from the previous sample; read_sample() subtracts them from the
     current sample to obtain per-sample deltas. */
  378. typedef struct
  379. { // runtime state of counter values
  380. uint64_t prev_ena_ts; // previous perf_event "enabled" time
  381. uint64_t prev_run_ts; // previous perf_event "running" time
  382. uint64_t prev_value; // previous HWC value
  383. } counter_value_state_t;
  384. typedef struct
  385. { // per-counter information
  386. perf_event_def_t *ev_def; // global HWC definition for one counter
  387. int fd; // perf_event fd
  388. buffer_state_t buf_state; // perf_event buffer's state
  389. counter_value_state_t value_state; // counter state
  390. int needs_restart; // workaround for dbx failure to preserve si_fd
  /* last_overflow_period is used as the sample_period when the counter is
     opened in start_one_ctr(). */
  391. uint64_t last_overflow_period;
  392. hrtime_t last_overflow_time;
  393. } counter_state_t;
  394. typedef struct
  395. { // per-thread context
  396. counter_state_t *ctr_list;
  397. int signal_fd; // fd that caused the most recent signal
  /* NOTE(review): although declared pthread_t, start_one_ctr() passes tid
     as the pid argument of perf_event_open() and as f_owner_ex.pid, so it
     presumably holds a kernel thread id -- confirm where it is assigned. */
  398. pthread_t tid; // for debugging signal delivery problems
  399. } hdrv_pcl_ctx_t;
  400. /*---------------------------------------------------------------------------*/
  401. /* static variables */
  /* File-wide driver state. */
  402. static struct
  403. {
  404. int library_ok;
  405. int internal_open_called;
  406. hwcfuncs_tsd_get_fn_t find_vpc_ctx;
  407. unsigned hwcdef_cnt; /* number of *active* hardware counters */
  408. hwcdrv_get_events_fn_t *get_events;
  409. } hdrv_pcl_state;
  410. static hwcdrv_about_t hdrv_pcl_about = {.cpcN_cpuver = CPUVER_UNDEFINED};
  /* One definition per counter; start_one_ctr() indexes it with the counter
     index and copies its `hw' member as the perf_event_open template. */
  411. static perf_event_def_t global_perf_event_def[MAX_PICS];
  /* Nonzero when at least one hardware counter is active. */
  412. #define COUNTERS_ENABLED() (hdrv_pcl_state.hwcdef_cnt)
  413. /* perf_event buffer formatting and handling */
  414. static void
  415. reset_buf (buffer_state_t *bufstate)
  416. {
  417. TprintfT (0, "hwcdrv: ERROR: perf_event reset_buf() called!\n");
  418. struct perf_event_mmap_page *metadata = bufstate->buf;
  419. if (metadata)
  420. metadata->data_tail = metadata->data_head;
  421. }
  422. static int
  423. skip_buf (buffer_state_t *bufstate, size_t sz)
  424. {
  425. TprintfT (DBG_LT1, "hwcdrv: WARNING: perf_event skip_buf called!\n");
  426. struct perf_event_mmap_page *metadata = bufstate->buf;
  427. if (metadata == NULL)
  428. return -1;
  429. size_t pgsz = bufstate->pagesz;
  430. size_t bufsz = NPAGES_PER_BUF*pgsz;
  431. uint64_t d_tail = metadata->data_tail;
  432. uint64_t d_head = metadata->data_head;
  433. // validate request size
  434. if (sz > d_head - d_tail || sz >= bufsz)
  435. {
  436. reset_buf (bufstate);
  437. return -1;
  438. }
  439. metadata->data_tail = d_tail + sz; // advance tail
  440. return 0;
  441. }
  442. static int
  443. read_buf (buffer_state_t *bufstate, void *buf, size_t sz)
  444. {
  445. struct perf_event_mmap_page *metadata = bufstate->buf;
  446. if (metadata == NULL)
  447. return -1;
  448. size_t pgsz = bufstate->pagesz;
  449. size_t bufsz = NPAGES_PER_BUF*pgsz;
  450. uint64_t d_tail = metadata->data_tail;
  451. uint64_t d_head = metadata->data_head;
  452. // validate request size
  453. if (sz > d_head - d_tail || sz >= bufsz)
  454. {
  455. reset_buf (bufstate);
  456. return -1;
  457. }
  458. char *buf_base = ((char *) metadata) + pgsz; // start of data buffer
  459. uint64_t start_pos = d_tail & (bufsz - 1); // char offset into data buffer
  460. size_t nbytes = sz;
  461. if (start_pos + sz > bufsz)
  462. {
  463. // will wrap past end of buffer
  464. nbytes = bufsz - start_pos;
  465. memcpy (buf, buf_base + start_pos, nbytes);
  466. start_pos = 0; // wrap to start
  467. buf = (void *) (((char *) buf) + nbytes);
  468. nbytes = sz - nbytes;
  469. }
  470. memcpy (buf, buf_base + start_pos, nbytes);
  471. metadata->data_tail += sz;
  472. return 0;
  473. }
  474. static int
  475. read_u64 (buffer_state_t *bufstate, uint64_t *value)
  476. {
  477. return read_buf (bufstate, value, sizeof (uint64_t));
  478. }
  479. static int
  480. read_sample (counter_state_t *ctr_state, int msgsz, uint64_t *rvalue,
  481. uint64_t *rlost)
  482. {
  483. // returns count of bytes read
  484. buffer_state_t *bufstate = &ctr_state->buf_state;
  485. counter_value_state_t *cntstate = &ctr_state->value_state;
  486. int readsz = 0;
  487. // PERF_SAMPLE_IP
  488. uint64_t ipc = 0;
  489. int rc = read_u64 (bufstate, &ipc);
  490. if (rc)
  491. return -1;
  492. readsz += sizeof (uint64_t);
  493. // PERF_SAMPLE_READ: value
  494. uint64_t value = 0;
  495. rc = read_u64 (bufstate, &value);
  496. if (rc)
  497. return -2;
  498. readsz += sizeof (uint64_t);
  499. /* Bug 20806896
  500. * Old Linux kernels (e.g. 2.6.32) on certain systems return enabled and
  501. * running times in the sample data that correspond to the metadata times
  502. * metadata->time_enabled
  503. * metadata->time_running
  504. * from the PREVIOUS (not current) sample. Probably just ignore this bug
  505. * since it's on old kernels and we only use the enabled and running times
  506. * to construct loss_estimate.
  507. */
  508. // PERF_SAMPLE_READ: PERF_FORMAT_ENABLED
  509. uint64_t enabled_time = 0;
  510. rc = read_u64 (bufstate, &enabled_time);
  511. if (rc)
  512. return -3;
  513. readsz += sizeof (uint64_t);
  514. // PERF_SAMPLE_READ: PERF_FORMAT_RUNNING
  515. uint64_t running_time = 0;
  516. rc = read_u64 (bufstate, &running_time);
  517. if (rc)
  518. return -4;
  519. readsz += sizeof (uint64_t);
  520. uint64_t value_delta = value - cntstate->prev_value;
  521. uint64_t enabled_delta = enabled_time - cntstate->prev_ena_ts;
  522. uint64_t running_delta = running_time - cntstate->prev_run_ts;
  523. cntstate->prev_value = value;
  524. cntstate->prev_ena_ts = enabled_time;
  525. cntstate->prev_run_ts = running_time;
  526. // 24830461 need workaround for Linux anomalous HWC skid overrun
  527. int set_error_flag = 0;
  528. if (value_delta > 2 * ctr_state->last_overflow_period + 2000 /* HWC_SKID_TOLERANCE */)
  529. set_error_flag = 1;
  530. uint64_t loss_estimate = 0; // estimate loss of events caused by multiplexing
  531. if (running_delta == enabled_delta)
  532. {
  533. // counter was running 100% of time, no multiplexing
  534. }
  535. else if (running_delta == 0)
  536. loss_estimate = 1; // token amount to aid in debugging perfctr oddities
  537. else if ((running_delta > enabled_delta) || (enabled_delta & 0x1000000000000000ll))
  538. {
  539. // running should be smaller than enabled, can't estimate
  540. /*
  541. * 21418391 HWC can have a negative count
  542. *
  543. * We've also seen enabled not only be smaller than running
  544. * but in fact go negative. Guard against this.
  545. */
  546. loss_estimate = 2; // token amount to aid in debugging perfctr oddities
  547. }
  548. else
  549. {
  550. // counter was running less than 100% of time
  551. // Example: ena=7772268 run=6775669 raw_value=316004 scaled_value=362483 loss_est=46479
  552. uint64_t scaled_delta = (double) value_delta * enabled_delta / running_delta;
  553. value_delta = scaled_delta;
  554. #if 0
  555. // We should perhaps warn the user that multiplexing is going on,
  556. // but hwcdrv_pcl.c doesn't know about the collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_* values.
  557. // For now we simply don't report.
  558. // Perhaps we should address the issue not here but in the caller collector_sigemt_handler(),
  559. // but at that level "lost" has a meaning that's considerably broader than just multiplexing.
  560. collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
  561. SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
  562. ctr_list[idx].last_overflow_period, new_period);
  563. #endif
  564. }
  565. TprintfT ((loss_estimate || set_error_flag) ? DBG_LT1 : DBG_LT3,
  566. "hwcdrv: '%s' ipc=0x%llx ena=%llu run=%llu "
  567. "value_delta=%lld(0x%llx) loss_est=%llu %s error_flag='%s'\n",
  568. ctr_state->ev_def->name, (long long) ipc,
  569. (long long) enabled_delta, (long long) running_delta,
  570. (long long) value_delta, (long long) value_delta,
  571. (unsigned long long) loss_estimate,
  572. loss_estimate ? ", WARNING - SCALED" : "",
  573. set_error_flag ? ", ERRORFLAG" : "");
  574. if (set_error_flag == 1)
  575. value_delta |= (1ULL << 63) /* HWCVAL_ERR_FLAG */;
  576. *rvalue = value_delta;
  577. *rlost = loss_estimate;
  578. if (readsz != msgsz)
  579. {
  580. TprintfT (0, "hwcdrv: ERROR: perf_event sample not fully parsed\n");
  581. return -5;
  582. }
  583. return 0;
  584. }
  /* Debug helper: log the contents of *AT at level DBG_LT2.  The numeric
     fields are always printed; each flag listed below is printed only
     when it is nonzero. */
  585. static void
  586. dump_perf_event_attr (struct perf_event_attr *at)
  587. {
  588. TprintfT (DBG_LT2, "dump_perf_event_attr: size=%d type=%d sample_period=%lld\n"
  589. " config=0x%llx config1=0x%llx config2=0x%llx wakeup_events=%lld __reserved_1=%lld\n",
  590. (int) at->size, (int) at->type, (unsigned long long) at->sample_period,
  591. (unsigned long long) at->config, (unsigned long long) at->config1,
  592. (unsigned long long) at->config2, (unsigned long long) at->wakeup_events,
  593. (unsigned long long) at->__reserved_1);
  /* DUMP_F expands to an unbraced `if' statement and is not #undef'd in
     this function, so it remains defined for the rest of the file. */
  594. #define DUMP_F(fld) if (at->fld) TprintfT(DBG_LT2, " %-10s : %lld\n", #fld, (long long) at->fld)
  595. DUMP_F (disabled);
  596. DUMP_F (inherit);
  597. DUMP_F (pinned);
  598. DUMP_F (exclusive);
  599. DUMP_F (exclude_user);
  600. DUMP_F (exclude_kernel);
  601. DUMP_F (exclude_hv);
  602. DUMP_F (exclude_idle);
  603. // DUMP_F(xmmap);
  604. DUMP_F (comm);
  605. DUMP_F (freq);
  606. DUMP_F (inherit_stat);
  607. DUMP_F (enable_on_exec);
  608. DUMP_F (task);
  609. DUMP_F (watermark);
  610. }
  611. static void
  612. init_perf_event (struct perf_event_attr *hw, uint64_t event, uint64_t period)
  613. {
  614. memset (hw, 0, sizeof (struct perf_event_attr));
  615. hw->size = sizeof (struct perf_event_attr); // fwd/bwd compat
  616. #if defined(__i386__) || defined(__x86_64)
  617. //note: Nehalem/Westmere OFFCORE_RESPONSE in upper 32 bits
  618. hw->config = event;
  619. hw->type = PERF_TYPE_RAW; // hw/sw/trace/raw...
  620. #elif defined(__aarch64__)
  621. hw->type = (event >> 24) & 7;
  622. hw->config = event & 0xff;
  623. #elif defined(sparc)
  624. //SPARC needs to be shifted up 16 bits
  625. hw->config = (event & 0xFFFF) << 16; // uint64_t event
  626. uint64_t regs = (event >> 20) & 0xf; // see sparc_pcbe.c
  627. hw->config |= regs << 4; // for M8, supported PICs need to be placed at bits [7:4]
  628. hw->type = PERF_TYPE_RAW; // hw/sw/trace/raw...
  629. #endif
  630. hw->sample_period = period;
  631. hw->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_READ |
  632. // PERF_SAMPLE_TID |
  633. // PERF_SAMPLE_TIME | // possibly interesting
  634. // PERF_SAMPLE_ADDR |
  635. PERF_SAMPLE_READ | // HWC value
  636. // PERF_SAMPLE_CALLCHAIN | // interesting
  637. // PERF_SAMPLE_ID |
  638. // PERF_SAMPLE_CPU | // possibly interesting
  639. // PERF_SAMPLE_PERIOD |
  640. // PERF_SAMPLE_STREAM_ID |
  641. // PERF_SAMPLE_RAW |
  642. 0;
  643. hw->read_format =
  644. PERF_FORMAT_TOTAL_TIME_ENABLED | // detect when hwc not scheduled
  645. PERF_FORMAT_TOTAL_TIME_RUNNING | // detect when hwc not scheduled
  646. // PERF_FORMAT_ID |
  647. // PERF_FORMAT_GROUP |
  648. 0;
  649. hw->disabled = 1; /* off by default */
  650. // Note: the following override config.priv bits!
  651. hw->exclude_user = (event & (1 << 16)) == 0; /* don't count user */
  652. hw->exclude_kernel = (event & (1 << 17)) == 0; /* ditto kernel */
  653. hw->exclude_hv = 1; /* ditto hypervisor */
  654. hw->wakeup_events = 1; /* wakeup every n events */
  655. dump_perf_event_attr (hw);
  656. }
  657. static int
  658. start_one_ctr (int ii, size_t pgsz, hdrv_pcl_ctx_t * pctx, char *error_string)
  659. {
  660. // pe_attr should have been initialized in hwcdrv_create_counters()
  661. struct perf_event_attr pe_attr;
  662. memcpy (&pe_attr, &global_perf_event_def[ii].hw, sizeof (pe_attr));
  663. // but we adjust the period, so make sure that pctx->ctr_list[ii].last_overflow_period has been set
  664. pe_attr.sample_period = pctx->ctr_list[ii].last_overflow_period;
  665. int hwc_fd = perf_event_open (&pe_attr, pctx->tid, -1, -1, 0);
  666. if (hwc_fd == -1)
  667. {
  668. TprintfT (DBG_LT1, "%s idx=%d perf_event_open failed, errno=%d\n",
  669. error_string, ii, errno);
  670. return 1;
  671. }
  672. size_t buffer_area_sz = (NPAGES_PER_BUF + 1) * pgsz; // add a page for metadata
  673. void * buf = mmap (NULL, buffer_area_sz, //YXXX is this a safe call?
  674. PROT_READ | PROT_WRITE, MAP_SHARED, hwc_fd, 0);
  675. if (buf == MAP_FAILED)
  676. {
  677. TprintfT (0, "sz = %ld, pgsz = %ld\n err=%s idx=%d mmap failed: %s\n",
  678. (long) buffer_area_sz, (long) pgsz, error_string, ii, strerror (errno));
  679. return 1;
  680. }
  681. pctx->ctr_list[ii].ev_def = &global_perf_event_def[ii]; // why do we set ev_def? we never seem to use it
  682. pctx->ctr_list[ii].fd = hwc_fd;
  683. pctx->ctr_list[ii].buf_state.buf = buf;
  684. pctx->ctr_list[ii].buf_state.pagesz = pgsz;
  685. pctx->ctr_list[ii].value_state.prev_ena_ts = 0;
  686. pctx->ctr_list[ii].value_state.prev_run_ts = 0;
  687. pctx->ctr_list[ii].value_state.prev_value = 0;
  688. pctx->ctr_list[ii].last_overflow_time = gethrtime ();
  689. /* set async mode */
  690. long flags = fcntl (hwc_fd, F_GETFL, 0) | O_ASYNC;
  691. int rc = fcntl (hwc_fd, F_SETFL, flags);
  692. if (rc == -1)
  693. {
  694. TprintfT (0, "%s idx=%d O_ASYNC failed\n", error_string, ii);
  695. return 1;
  696. }
  697. /*
  698. * set lwp ownership of the fd
  699. * See BUGS section of "man perf_event_open":
  700. * The F_SETOWN_EX option to fcntl(2) is needed to properly get
  701. * overflow signals in threads. This was introduced in Linux 2.6.32.
  702. * Legacy references:
  703. * see http://lkml.org/lkml/2009/8/4/128
  704. * google man fcntl F_SETOWN_EX -conflict
  705. * "From Linux 2.6.32 onward, use F_SETOWN_EX to target
  706. * SIGIO and SIGURG signals at a particular thread."
  707. * http://icl.cs.utk.edu/papi/docs/da/d2a/examples__v2_8x_2self__smpl__multi_8c.html
  708. * See 2010 CSCADS presentation by Eranian
  709. */
  710. struct f_owner_ex fowner_ex;
  711. fowner_ex.type = F_OWNER_TID;
  712. fowner_ex.pid = pctx->tid;
  713. rc = fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex);
  714. if (rc == -1)
  715. {
  716. TprintfT (0, "%s idx=%d F_SETOWN failed\n", error_string, ii);
  717. return 1;
  718. }
  719. /* Use sigio so handler can determine FD via siginfo->si_fd. */
  720. rc = fcntl (hwc_fd, F_SETSIG, SIGIO);
  721. if (rc == -1)
  722. {
  723. TprintfT (0, "%s idx=%d F_SETSIG failed\n", error_string, ii);
  724. return 1;
  725. }
  726. return 0;
  727. }
  728. static int
  729. stop_one_ctr (int ii, counter_state_t *ctr_list)
  730. {
  731. int hwc_rc = 0;
  732. if (-1 == ioctl (ctr_list[ii].fd, PERF_EVENT_IOC_DISABLE, 1))
  733. {
  734. TprintfT (0, "hwcdrv: ERROR: PERF_EVENT_IOC_DISABLE #%d failed: errno=%d\n", ii, errno);
  735. hwc_rc = HWCFUNCS_ERROR_GENERIC;
  736. }
  737. void *buf = ctr_list[ii].buf_state.buf;
  738. if (buf)
  739. {
  740. size_t bufsz = (NPAGES_PER_BUF + 1) * ctr_list[ii].buf_state.pagesz;
  741. ctr_list[ii].buf_state.buf = NULL;
  742. int tmprc = munmap (buf, bufsz);
  743. if (tmprc)
  744. {
  745. TprintfT (0, "hwcdrv: ERROR: munmap() #%d failed: errno=%d\n", ii, errno);
  746. hwc_rc = HWCFUNCS_ERROR_GENERIC;
  747. }
  748. }
  749. if (-1 == close (ctr_list[ii].fd))
  750. {
  751. TprintfT (0, "hwcdrv: ERROR: close(fd) #%d failed: errno=%d\n", ii, errno);
  752. hwc_rc = HWCFUNCS_ERROR_GENERIC;
  753. }
  754. return hwc_rc;
  755. }
  756. /* HWCDRV_API for thread-specific actions */
  757. HWCDRV_API int
  758. hwcdrv_lwp_init (void)
  759. {
  760. return hwcdrv_start ();
  761. }
  762. HWCDRV_API void
  763. hwcdrv_lwp_fini (void)
  764. {
  765. hwcdrv_free_counters (); /* also sets pctx->ctr_list=NULL; */
  766. }
  767. /* open */
  768. static int
  769. hdrv_pcl_internal_open ()
  770. {
  771. if (hdrv_pcl_state.internal_open_called)
  772. {
  773. TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open: already called\n");
  774. return HWCFUNCS_ERROR_ALREADY_CALLED;
  775. }
  776. // determine if PCL is available
  777. perf_event_def_t tmp_event_def;
  778. memset (&tmp_event_def, 0, sizeof (tmp_event_def));
  779. struct perf_event_attr *pe_attr = &tmp_event_def.hw;
  780. init_perf_event (pe_attr, 0, 0);
  781. pe_attr->type = PERF_TYPE_HARDWARE; // specify abstracted HW event
  782. pe_attr->config = PERF_COUNT_HW_INSTRUCTIONS; // specify abstracted insts
  783. int hwc_fd = perf_event_open (pe_attr,
  784. 0, // pid/tid, 0 is self
  785. -1, // cpu, -1 is per-thread mode
  786. -1, // group_fd, -1 is root
  787. 0); // flags
  788. if (hwc_fd == -1)
  789. {
  790. TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
  791. " perf_event_open() failed, errno=%d\n", errno);
  792. goto internal_open_error;
  793. }
  794. /* see if the PCL is new enough to know about F_SETOWN_EX */
  795. struct f_owner_ex fowner_ex;
  796. fowner_ex.type = F_OWNER_TID;
  797. fowner_ex.pid = hwcdrv_gettid (); // "pid=tid" is correct w/F_OWNER_TID
  798. if (fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex) == -1)
  799. {
  800. TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open: "
  801. "F_SETOWN failed, errno=%d\n", errno);
  802. close (hwc_fd);
  803. goto internal_open_error;
  804. }
  805. close (hwc_fd);
  806. hdrv_pcl_state.internal_open_called = 1;
  807. hdrv_pcl_state.library_ok = 1; // set to non-zero to show it's initted
  808. hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
  809. TprintfT (DBG_LT2, "hwcdrv: hdrv_pcl_internal_open()\n");
  810. for (int ii = 0; hdrv_pcbe_drivers[ii]; ii++)
  811. {
  812. hdrv_pcbe_api_t *ppcbe = hdrv_pcbe_drivers[ii];
  813. if (!ppcbe->hdrv_pcbe_init ())
  814. {
  815. hdrv_pcl_about.cpcN_cciname = ppcbe->hdrv_pcbe_impl_name ();
  816. hdrv_pcl_about.cpcN_cpuver = hwcdrv_lookup_cpuver (hdrv_pcl_about.cpcN_cciname);
  817. if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
  818. goto internal_open_error;
  819. hdrv_pcl_about.cpcN_npics = ppcbe->hdrv_pcbe_ncounters ();
  820. hdrv_pcl_about.cpcN_docref = ppcbe->hdrv_pcbe_cpuref ();
  821. hdrv_pcl_state.get_events = ppcbe->hdrv_pcbe_get_events;
  822. hwcdrv_get_x86_eventnum = ppcbe->hdrv_pcbe_get_eventnum;
  823. break;
  824. }
  825. }
  826. if (hdrv_pcl_about.cpcN_npics > MAX_PICS)
  827. {
  828. TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
  829. " reducing number of HWCs from %u to %u on processor '%s'\n",
  830. hdrv_pcl_about.cpcN_npics, MAX_PICS, hdrv_pcl_about.cpcN_cciname);
  831. hdrv_pcl_about.cpcN_npics = MAX_PICS;
  832. }
  833. TprintfT (DBG_LT1, "hwcdrv: hdrv_pcl_internal_open:"
  834. " perf_event cpuver=%d, name='%s'\n",
  835. hdrv_pcl_about.cpcN_cpuver, hdrv_pcl_about.cpcN_cciname);
  836. return 0;
  837. internal_open_error:
  838. hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
  839. hdrv_pcl_about.cpcN_npics = 0;
  840. hdrv_pcl_about.cpcN_docref = NULL;
  841. hdrv_pcl_about.cpcN_cciname = NULL;
  842. return HWCFUNCS_ERROR_NOT_SUPPORTED;
  843. }
  844. static void *
  845. single_thread_tsd_ftn ()
  846. {
  847. static hdrv_pcl_ctx_t tsd_context;
  848. return &tsd_context;
  849. }
  850. /* HWCDRV_API */
  851. HWCDRV_API int
  852. hwcdrv_init (hwcfuncs_abort_fn_t abort_ftn, int *tsd_sz)
  853. {
  854. hdrv_pcl_state.find_vpc_ctx = single_thread_tsd_ftn;
  855. if (tsd_sz)
  856. *tsd_sz = sizeof (hdrv_pcl_ctx_t);
  857. if (hdrv_pcl_state.internal_open_called)
  858. return HWCFUNCS_ERROR_ALREADY_CALLED;
  859. return hdrv_pcl_internal_open ();
  860. }
  861. HWCDRV_API void
  862. hwcdrv_get_info (int *cpuver, const char **cciname, uint_t *npics,
  863. const char **docref, uint64_t *support)
  864. {
  865. if (cpuver)
  866. *cpuver = hdrv_pcl_about.cpcN_cpuver;
  867. if (cciname)
  868. *cciname = hdrv_pcl_about.cpcN_cciname;
  869. if (npics)
  870. *npics = hdrv_pcl_about.cpcN_npics;
  871. if (docref)
  872. *docref = hdrv_pcl_about.cpcN_docref;
  873. if (support)
  874. *support = HWCFUNCS_SUPPORT_OVERFLOW_PROFILING | HWCFUNCS_SUPPORT_OVERFLOW_CTR_ID;
  875. }
  876. HWCDRV_API int
  877. hwcdrv_enable_mt (hwcfuncs_tsd_get_fn_t tsd_ftn)
  878. {
  879. if (tsd_ftn)
  880. hdrv_pcl_state.find_vpc_ctx = tsd_ftn;
  881. else
  882. {
  883. TprintfT (0, "hwcdrv: ERROR: enable_mt(): tsd_ftn==NULL\n");
  884. return HWCFUNCS_ERROR_UNAVAIL;
  885. }
  886. return 0;
  887. }
  888. HWCDRV_API int
  889. hwcdrv_get_descriptions (hwcf_hwc_cb_t *hwc_cb, hwcf_attr_cb_t *attr_cb)
  890. {
  891. int count = 0;
  892. if (hwc_cb && hdrv_pcl_state.get_events)
  893. count = hdrv_pcl_state.get_events (hwc_cb);
  894. if (attr_cb)
  895. for (int ii = 0; perfctr_attrs_table && perfctr_attrs_table[ii].attrname; ii++)
  896. attr_cb (perfctr_attrs_table[ii].attrname);
  897. if (!count)
  898. return -1;
  899. return 0;
  900. }
  901. HWCDRV_API int
  902. hwcdrv_assign_regnos (Hwcentry* entries[], unsigned numctrs)
  903. {
  904. return hwcdrv_assign_all_regnos (entries, numctrs);
  905. }
  906. static int
  907. internal_hwc_start (int fd)
  908. {
  909. int rc = ioctl (fd, PERF_EVENT_IOC_REFRESH, 1);
  910. if (rc == -1)
  911. {
  912. TprintfT (DBG_LT0, "hwcdrv: ERROR: internal_hwc_start:"
  913. " PERF_EVENT_IOC_REFRESH(fd=%d) failed: errno=%d\n", fd, errno);
  914. return HWCFUNCS_ERROR_UNAVAIL;
  915. }
  916. TprintfT (DBG_LT3, "hwcdrv: internal_hwc_start(fd=%d)\n", fd);
  917. return 0;
  918. }
  919. HWCDRV_API int
  920. hwcdrv_overflow (siginfo_t *si, hwc_event_t *eventp, hwc_event_t *lost_events)
  921. {
  922. /* set expired counters to overflow value and all others to 0 */
  923. /* return 0: OK, counters should be restarted */
  924. /* return non-zero: eventp not set, counters should not be restarted */
  925. /* clear return values */
  926. int ii;
  927. for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
  928. {
  929. eventp->ce_pic[ii] = 0;
  930. lost_events->ce_pic[ii] = 0;
  931. }
  932. hrtime_t sig_ts = gethrtime (); //YXXX get this from HWC event?
  933. eventp->ce_hrt = sig_ts;
  934. lost_events->ce_hrt = sig_ts;
  935. /* determine source signal */
  936. int signal_fd = -1;
  937. switch (si->si_code)
  938. {
  939. case POLL_HUP: /* expected value from pcl */
  940. /* According to Stephane Eranian:
  941. * "expect POLL_HUP instead of POLL_IN because we are
  942. * in one-shot mode (IOC_REFRESH)"
  943. */
  944. signal_fd = si->si_fd;
  945. break;
  946. case SI_TKILL: /* event forwarded by tkill */
  947. /* DBX can only forward SI_TKILL when it detects POLL_HUP
  948. * unfortunately, this means that si->si_fd has been lost...
  949. * We need to process the buffers, but we don't know the fd!
  950. */
  951. TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
  952. " SI_TKILL detected\n", sig_ts);
  953. break;
  954. default:
  955. // "sometimes we see a POLL_IN (1) with very high event rates,"
  956. // according to eranian(?)
  957. TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
  958. " unexpected si_code 0x%x\n", sig_ts, si->si_code);
  959. return HWCFUNCS_ERROR_GENERIC;
  960. }
  961. hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
  962. if (!pctx)
  963. {
  964. TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
  965. " tsd context is NULL\n", sig_ts);
  966. return HWCFUNCS_ERROR_UNEXPECTED;
  967. }
  968. counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
  969. if (!ctr_list)
  970. {
  971. TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
  972. " ctr_list is NULL\n", sig_ts);
  973. return HWCFUNCS_ERROR_UNEXPECTED;
  974. }
  975. /* clear needs_restart flag */
  976. for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
  977. ctr_list[ii].needs_restart = 0;
  978. /* attempt to identify the counter to read */
  979. int signal_idx = -1;
  980. pctx->signal_fd = signal_fd; // save the signal provided by siginfo_t
  981. if (signal_fd != -1)
  982. {
  983. for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
  984. {
  985. if (ctr_list[ii].fd == signal_fd)
  986. {
  987. signal_idx = ii;
  988. break;
  989. }
  990. }
  991. }
  992. if (signal_idx < 0)
  993. {
  994. TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
  995. " pmc not determined!\n", sig_ts);
  996. lost_events->ce_pic[0] = 1; /* record a bogus value into experiment */
  997. // note: bogus value may get overwritten in loop below
  998. }
  999. /* capture sample(s). In addition to signal_idx, check other counters. */
  1000. struct perf_event_header sheader;
  1001. int idx;
  1002. for (idx = 0; idx < hdrv_pcl_state.hwcdef_cnt; idx++)
  1003. {
  1004. int num_recs = 0;
  1005. while (1)
  1006. {
  1007. /* check for samples */
  1008. struct perf_event_mmap_page *metadata = ctr_list[idx].buf_state.buf;
  1009. if (metadata == NULL)
  1010. break; // empty
  1011. if (metadata->data_tail == metadata->data_head)
  1012. break; // empty
  1013. /* read header */
  1014. if (read_buf (&ctr_list[idx].buf_state, &sheader, sizeof (sheader)))
  1015. break;
  1016. num_recs++;
  1017. /* check for PERF_RECORD_SAMPLE */
  1018. size_t datasz = sheader.size - sizeof (struct perf_event_header);
  1019. if (sheader.type != PERF_RECORD_SAMPLE)
  1020. {
  1021. TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
  1022. " unexpected recd type=%d\n",
  1023. sig_ts, sheader.type);
  1024. if (skip_buf (&ctr_list[idx].buf_state, datasz))
  1025. {
  1026. TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
  1027. " skip recd type=%d failed\n", sig_ts, sheader.type);
  1028. lost_events->ce_pic[idx] = 4; /* record a bogus value */
  1029. break; // failed to skip buffer??
  1030. }
  1031. lost_events->ce_pic[idx] = 2; /* record a bogus value */
  1032. continue; // advance to next record
  1033. }
  1034. /* type is PERF_RECORD_SAMPLE */
  1035. uint64_t value, lostv;
  1036. if (read_sample (&ctr_list[idx], datasz, &value, &lostv))
  1037. {
  1038. TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
  1039. " read_sample() failed\n", sig_ts);
  1040. lost_events->ce_pic[idx] = 3; // record a bogus value
  1041. break; // failed to read sample data??
  1042. }
  1043. TprintfT (DBG_LT3, "hwcdrv: sig_ts=%llu: hwcdrv_overflow:"
  1044. " idx=%d value=%llu lost=%llu\n", (unsigned long long) sig_ts,
  1045. idx, (unsigned long long) value, (unsigned long long) lostv);
  1046. if (eventp->ce_pic[idx])
  1047. {
  1048. TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
  1049. " idx=%d previous sample recorded as lost_event\n", sig_ts, idx);
  1050. lost_events->ce_pic[idx] += eventp->ce_pic[idx];
  1051. }
  1052. eventp->ce_pic[idx] = value;
  1053. lost_events->ce_pic[idx] += lostv;
  1054. }
  1055. /* debug output for unexpected (but common) cases */
  1056. if (idx == signal_idx)
  1057. {
  1058. if (num_recs != 1)
  1059. TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
  1060. " %d records for signal_idx=%d\n", sig_ts, num_recs, signal_idx);
  1061. }
  1062. else if (num_recs)
  1063. TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
  1064. " %d unexpected record(s) for idx=%d (signal_idx=%d)\n",
  1065. sig_ts, num_recs, idx, signal_idx);
  1066. /* trigger counter restart whenever records were found */
  1067. if (num_recs)
  1068. {
  1069. /* check whether to adapt the overflow interval */
  1070. /* This is the Linux version.
  1071. * The Solaris version is in hwprofile.c collector_update_overflow_counters().
  1072. */
  1073. hrtime_t min_time = global_perf_event_def[idx].min_time;
  1074. if (min_time > 0 // overflow interval is adaptive
  1075. && sig_ts - ctr_list[idx].last_overflow_time < min_time) // last interval below min
  1076. {
  1077. /* pick a new overflow interval */
  1078. /* roughly doubled, but add funny numbers */
  1079. /* hopefully the result is prime or not a multiple of some # of ops/loop */
  1080. uint64_t new_period = 2 * ctr_list[idx].last_overflow_period + 37;
  1081. #if 0
  1082. // On Solaris, we report the adjustment to the log file.
  1083. // On Linux it's hard for us to do so since hwcdrv_pcl.c doesn't know about collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_HWCADJ.
  1084. // For now we simply don't report.
  1085. collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
  1086. SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
  1087. ctr_list[idx].last_overflow_period, new_period);
  1088. #endif
  1089. /* There are a variety of ways of resetting the period on Linux.
  1090. * The most elegant is
  1091. * ioctl(fd,PERF_EVENT_IOC_PERIOD,&period)
  1092. * but check the perf_event_open man page for PERF_EVENT_IOC_PERIOD:
  1093. * > Prior to Linux 2.6.36 this ioctl always failed due to a bug in the kernel.
  1094. * > Prior to Linux 3.14 (or 3.7 on ARM), the new period did not take effect
  1095. * until after the next overflow.
  1096. * So we're kind of stuck shutting the fd down and restarting it with the new period.
  1097. */
  1098. if (stop_one_ctr (idx, ctr_list))
  1099. {
  1100. // EUGENE figure out what to do on error
  1101. }
  1102. ctr_list[idx].last_overflow_period = new_period;
  1103. if (start_one_ctr (idx, ctr_list[idx].buf_state.pagesz, pctx, "hwcdrv: ERROR: hwcdrv_overflow (readjust overflow):"))
  1104. {
  1105. // EUGENE figure out what to do on error
  1106. }
  1107. }
  1108. ctr_list[idx].last_overflow_time = sig_ts;
  1109. #if 0
  1110. ctr_list[idx].needs_restart = 1;
  1111. #else // seems to be more reliable to restart here instead of hwcdrv_sighlr_restart()
  1112. internal_hwc_start (ctr_list[idx].fd);
  1113. #endif
  1114. }
  1115. }
  1116. return 0; // OK to restart counters
  1117. }
  1118. HWCDRV_API int
  1119. hwcdrv_sighlr_restart (const hwc_event_t *pp)
  1120. {
  1121. #if 0 // restarting here doesn't seem to work as well as restarting in hwcdrv_overflow()
  1122. hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
  1123. if (!pctx)
  1124. {
  1125. TprintfT (DBG_LT0, "hwcdrv: ERROR: hwcdrv_sighlr_restart: find_vpc_ctx()==NULL\n");
  1126. return -1;
  1127. }
  1128. counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
  1129. if (!ctr_list)
  1130. {
  1131. TprintfT (DBG_LT0, "hwcdrv: WARNING: hwcdrv_sighlr_restart: ctr_list is NULL\n");
  1132. return -1;
  1133. }
  1134. int errors = 0;
  1135. for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
  1136. {
  1137. if (ctr_list[ii].needs_restart)
  1138. errors |= internal_hwc_start (ctr_list[ii].fd);
  1139. ctr_list[ii].needs_restart = 0;
  1140. }
  1141. return errors;
  1142. #else
  1143. return 0;
  1144. #endif
  1145. }
  1146. /* create counters based on hwcdef[] */
  1147. HWCDRV_API int
  1148. hwcdrv_create_counters (unsigned hwcdef_cnt, Hwcentry *hwcdef)
  1149. {
  1150. if (hwcdef_cnt > hdrv_pcl_about.cpcN_npics)
  1151. {
  1152. logerr (GTXT ("More than %d counters were specified\n"), hdrv_pcl_about.cpcN_npics); /*!*/
  1153. return HWCFUNCS_ERROR_HWCARGS;
  1154. }
  1155. if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
  1156. {
  1157. logerr (GTXT ("Processor not supported\n"));
  1158. return HWCFUNCS_ERROR_HWCARGS;
  1159. }
  1160. /* add counters */
  1161. for (unsigned idx = 0; idx < hwcdef_cnt; idx++)
  1162. {
  1163. perf_event_def_t *glb_event_def = &global_perf_event_def[idx];
  1164. memset (glb_event_def, 0, sizeof (perf_event_def_t));
  1165. unsigned int pmc_sel;
  1166. eventsel_t evntsel;
  1167. if (hwcfuncs_get_x86_eventsel (hwcdef[idx].reg_num,
  1168. hwcdef[idx].int_name, &evntsel, &pmc_sel))
  1169. {
  1170. TprintfT (0, "hwcdrv: ERROR: hwcfuncs_get_x86_eventsel() failed\n");
  1171. return HWCFUNCS_ERROR_HWCARGS;
  1172. }
  1173. glb_event_def->reg_num = pmc_sel;
  1174. glb_event_def->eventsel = evntsel;
  1175. glb_event_def->counter_preload = hwcdef[idx].val;
  1176. glb_event_def->min_time = hwcdef[idx].min_time;
  1177. glb_event_def->name = strdup (hwcdef[idx].name); // memory leak??? very minor
  1178. init_perf_event (&glb_event_def->hw, glb_event_def->eventsel,
  1179. glb_event_def->counter_preload);
  1180. TprintfT (DBG_LT1, "hwcdrv: create_counters: pic=%u name='%s' interval=%lld"
  1181. "(min_time=%lld): reg_num=0x%x eventsel=0x%llx ireset=%lld usr=%lld sys=%lld\n",
  1182. idx, hwcdef[idx].int_name, (long long) glb_event_def->counter_preload,
  1183. (long long) glb_event_def->min_time, (int) glb_event_def->reg_num,
  1184. (long long) glb_event_def->eventsel,
  1185. (long long) HW_INTERVAL_PRESET (hwcdef[idx].val),
  1186. (long long) glb_event_def->hw.exclude_user,
  1187. (long long) glb_event_def->hw.exclude_kernel);
  1188. }
  1189. hdrv_pcl_state.hwcdef_cnt = hwcdef_cnt;
  1190. return 0;
  1191. }
  1192. HWCDRV_API int
  1193. hwcdrv_free_counters () // note: only performs shutdown for this thread
  1194. {
  1195. hdrv_pcl_ctx_t * pctx;
  1196. if (!COUNTERS_ENABLED ())
  1197. return 0;
  1198. pctx = hdrv_pcl_state.find_vpc_ctx ();
  1199. if (!pctx)
  1200. {
  1201. TprintfT (0, "hwcdrv: WARNING: hwcdrv_free_counters: tsd context is NULL\n");
  1202. return HWCFUNCS_ERROR_GENERIC;
  1203. }
  1204. counter_state_t *ctr_list = pctx->ctr_list;
  1205. if (!ctr_list)
  1206. {
  1207. // fork child: prolog suspends hwcs, then epilog frees them
  1208. TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_free_counters: ctr_list is already NULL\n");
  1209. return 0;
  1210. }
  1211. int hwc_rc = 0;
  1212. for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
  1213. if (stop_one_ctr (ii, ctr_list))
  1214. hwc_rc = HWCFUNCS_ERROR_GENERIC;
  1215. TprintfT (DBG_LT1, "hwcdrv: hwcdrv_free_counters(tid=0x%lx).\n", pctx->tid);
  1216. pctx->ctr_list = NULL;
  1217. return hwc_rc;
  1218. }
  1219. HWCDRV_API int
  1220. hwcdrv_start (void) /* must be called from each thread ? */
  1221. {
  1222. hdrv_pcl_ctx_t *pctx = NULL;
  1223. if (!COUNTERS_ENABLED ())
  1224. {
  1225. TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_start: no counters to start \n");
  1226. return 0;
  1227. }
  1228. if (!hdrv_pcl_state.library_ok)
  1229. {
  1230. TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: library is not open\n");
  1231. return HWCFUNCS_ERROR_NOT_SUPPORTED;
  1232. }
  1233. /*
  1234. * set up per-thread context
  1235. */
  1236. pctx = hdrv_pcl_state.find_vpc_ctx ();
  1237. if (!pctx)
  1238. {
  1239. TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: tsd context is NULL\n");
  1240. return HWCFUNCS_ERROR_UNEXPECTED;
  1241. }
  1242. pctx->tid = hwcdrv_gettid ();
  1243. TprintfT (DBG_LT1, "hwcdrv: hwcdrv_start(tid=0x%lx)\n", pctx->tid);
  1244. /*
  1245. * create per-thread counter list
  1246. */
  1247. counter_state_t *ctr_list = (counter_state_t *) calloc (hdrv_pcl_state.hwcdef_cnt,
  1248. sizeof (counter_state_t));
  1249. if (!ctr_list)
  1250. {
  1251. TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: calloc(ctr_list) failed\n");
  1252. return HWCFUNCS_ERROR_MEMORY;
  1253. }
  1254. int ii;
  1255. for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
  1256. ctr_list[ii].fd = -1; // invalidate fds in case we have to close prematurely
  1257. pctx->ctr_list = ctr_list;
  1258. /*
  1259. * bind the counters
  1260. */
  1261. size_t pgsz = sysconf (_SC_PAGESIZE);
  1262. for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
  1263. {
  1264. ctr_list[ii].last_overflow_period = global_perf_event_def[ii].hw.sample_period;
  1265. if (start_one_ctr (ii, pgsz, pctx, "hwcdrv: ERROR: hwcdrv_start:")) goto hwcdrv_start_cleanup;
  1266. }
  1267. /*
  1268. * start the counters
  1269. */
  1270. for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
  1271. {
  1272. int rc = internal_hwc_start (ctr_list[ii].fd);
  1273. if (rc < 0)
  1274. goto hwcdrv_start_cleanup;
  1275. }
  1276. return 0;
  1277. hwcdrv_start_cleanup:
  1278. hwcdrv_free_counters (); // PERF_EVENT_IOC_DISABLE and close() for all fds
  1279. return HWCFUNCS_ERROR_UNAVAIL;
  1280. }
  1281. HWCDRV_API int
  1282. hwcdrv_lwp_suspend (void) /* must be called from each thread */
  1283. {
  1284. if (!COUNTERS_ENABLED ())
  1285. {
  1286. TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_suspend: no counters\n");
  1287. return 0;
  1288. }
  1289. TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_suspend()\n");
  1290. return hwcdrv_free_counters ();
  1291. }
  1292. HWCDRV_API int
  1293. hwcdrv_lwp_resume (void) /* must be called from each thread */
  1294. {
  1295. if (!COUNTERS_ENABLED ())
  1296. {
  1297. TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_resume: no counters\n");
  1298. return 0;
  1299. }
  1300. TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_resume()\n");
  1301. return hwcdrv_start ();
  1302. }
  1303. HWCDRV_API int
  1304. hwcdrv_read_events (hwc_event_t *overflow_data, hwc_event_samples_t *sampled_data)
  1305. {
  1306. overflow_data->ce_hrt = 0;
  1307. for (int i = 0; i < MAX_PICS; i++)
  1308. {
  1309. overflow_data->ce_pic[i] = 0;
  1310. if (sampled_data)
  1311. HWCFUNCS_SAMPLE_RESET (&sampled_data->sample[i]);
  1312. }
  1313. return 0;
  1314. }
  1315. /*---------------------------------------------------------------------------*/
  1316. /* HWCDRV_API */
  /* Dispatch table that exports this driver's entry points.  The initializer
     is positional: each entry must stay in the member order of hwcdrv_api_t,
     which is declared outside this part of the file.  */
  1317. hwcdrv_api_t hwcdrv_pcl_api = {
  1318. hwcdrv_init,
  1319. hwcdrv_get_info,
  1320. hwcdrv_enable_mt,
  1321. hwcdrv_get_descriptions,
  1322. hwcdrv_assign_regnos,
  1323. hwcdrv_create_counters,
  1324. hwcdrv_start,
  1325. hwcdrv_overflow,
  1326. hwcdrv_read_events,
  1327. hwcdrv_sighlr_restart,
  1328. hwcdrv_lwp_suspend,
  1329. hwcdrv_lwp_resume,
  1330. hwcdrv_free_counters,
  1331. hwcdrv_lwp_init,
  1332. hwcdrv_lwp_fini,
  /* last slot is a status value, not a function pointer; it starts at -1 */
  1333. -1 // hwcdrv_init_status
  1334. };