diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 96969cf53baca..662fe2a49afe6 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -85,6 +85,8 @@ class DataAggregator : public DataReader {
   };
   friend raw_ostream &operator<<(raw_ostream &OS, const LBREntry &);
 
+  friend struct PerfSpeEventsTestHelper;
+
   struct PerfBranchSample {
     SmallVector<LBREntry, 32> LBR;
   };
diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h
index 4acce5a3e8320..a75b6bf720ec4 100644
--- a/bolt/include/bolt/Utils/CommandLineOpts.h
+++ b/bolt/include/bolt/Utils/CommandLineOpts.h
@@ -48,6 +48,7 @@ extern llvm::cl::OptionCategory BinaryAnalysisCategory;
 extern llvm::cl::opt<unsigned> AlignText;
 extern llvm::cl::opt<unsigned> AlignFunctions;
 extern llvm::cl::opt<bool> AggregateOnly;
+extern llvm::cl::opt<bool> ArmSPE;
 extern llvm::cl::opt<unsigned> BucketsPerLine;
 extern llvm::cl::opt<bool> CompactCodeModel;
 extern llvm::cl::opt<bool> DiffOnly;
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 178c9d3a63730..e03fa9bd53220 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -49,6 +49,9 @@ static cl::opt<bool>
                      cl::desc("aggregate basic samples (without LBR info)"),
                      cl::cat(AggregatorCategory));
 
+cl::opt<bool> ArmSPE("spe", cl::desc("Enable Arm SPE mode."),
+                     cl::cat(AggregatorCategory));
+
 static cl::opt<std::string>
     ITraceAggregation("itrace",
                       cl::desc("Generate LBR info with perf itrace argument"),
@@ -181,11 +184,21 @@ void DataAggregator::start() {
 
   findPerfExecutable();
 
+  if (opts::ArmSPE) {
+    // pid from_ip to_ip flags
+    // where flags could be:
+    // P/M: whether branch was Predicted or Mispredicted.
+    // N: optionally appears when the branch was Not-Taken (ie fall-through)
+    // 12345 0x123/0x456/PN/-/-/8/RET/-
+    opts::ITraceAggregation = "bl";
+    opts::ParseMemProfile = true;
+    opts::BasicAggregation = false;
+  }
+
   if (opts::BasicAggregation) {
-    launchPerfProcess("events without LBR",
-                      MainEventsPPI,
+    launchPerfProcess("events without LBR", MainEventsPPI,
                       "script -F pid,event,ip",
-                      /*Wait = */false);
+                      /*Wait = */ false);
   } else if (!opts::ITraceAggregation.empty()) {
     // Disable parsing memory profile from trace data, unless requested by user.
     if (!opts::ParseMemProfile.getNumOccurrences())
@@ -994,9 +1007,22 @@ ErrorOr<DataAggregator::LBREntry> DataAggregator::parseLBREntry() {
   if (std::error_code EC = MispredStrRes.getError())
     return EC;
   StringRef MispredStr = MispredStrRes.get();
-  if (MispredStr.size() != 1 ||
-      (MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-')) {
-    reportError("expected single char for mispred bit");
+  // SPE brstack mispredicted flags might be up to two characters long:
+  // 'PN' or 'MN'. Where 'N' optionally appears.
+  bool ValidStrSize = opts::ArmSPE
+                          ? MispredStr.size() >= 1 && MispredStr.size() <= 2
+                          : MispredStr.size() == 1;
+  bool SpeTakenBitErr =
+      (opts::ArmSPE && MispredStr.size() == 2 && MispredStr[1] != 'N');
+  bool PredictionBitErr =
+      !ValidStrSize ||
+      (MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-');
+  if (SpeTakenBitErr)
+    reportError("expected 'N' as SPE prediction bit for a not-taken branch");
+  if (PredictionBitErr)
+    reportError("expected 'P', 'M' or '-' char as a prediction bit");
+
+  if (SpeTakenBitErr || PredictionBitErr) {
     Diag << "Found: " << MispredStr << "\n";
     return make_error_code(llvm::errc::io_error);
   }
@@ -1497,7 +1523,9 @@ void DataAggregator::printBranchStacksDiagnostics(
 }
 
 std::error_code DataAggregator::parseBranchEvents() {
-  outs() << "PERF2BOLT: parse branch events...\n";
+  std::string BranchEventTypeStr =
+      opts::ArmSPE ? "SPE branch events in LBR-format" : "branch events";
+  outs() << "PERF2BOLT: parse " << BranchEventTypeStr << "...\n";
   NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName,
                      TimerGroupDesc, opts::TimeAggregator);
 
@@ -1525,7 +1553,8 @@ std::error_code DataAggregator::parseBranchEvents() {
     }
 
     NumEntries += Sample.LBR.size();
-    if (BAT && Sample.LBR.size() == 32 && !NeedsSkylakeFix) {
+    if (this->BC->isX86() && BAT && Sample.LBR.size() == 32 &&
+        !NeedsSkylakeFix) {
       errs() << "PERF2BOLT-WARNING: using Intel Skylake bug workaround\n";
       NeedsSkylakeFix = true;
     }
@@ -1548,10 +1577,18 @@ std::error_code DataAggregator::parseBranchEvents() {
   if (NumSamples && NumSamplesNoLBR == NumSamples) {
     // Note: we don't know if perf2bolt is being used to parse memory samples
     // at this point. In this case, it is OK to parse zero LBRs.
-    errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack "
-              "LBR. Record profile with perf record -j any or run perf2bolt "
-              "in no-LBR mode with -nl (the performance improvement in -nl "
-              "mode may be limited)\n";
+    if (!opts::ArmSPE)
+      errs()
+          << "PERF2BOLT-WARNING: all recorded samples for this binary lack "
+             "LBR. Record profile with perf record -j any or run perf2bolt "
+             "in no-LBR mode with -nl (the performance improvement in -nl "
+             "mode may be limited)\n";
+    else
+      errs()
+          << "PERF2BOLT-WARNING: All recorded samples for this binary lack "
+             "SPE brstack entries. Make sure you are running Linux perf 6.14 "
+             "or later, otherwise you get zero samples. Record the profile "
+             "with: perf record -e 'arm_spe_0/branch_filter=1/'.\n";
   } else {
     printBranchStacksDiagnostics(NumTotalSamples - NumSamples);
   }
diff --git a/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test b/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
new file mode 100644
index 0000000000000..91f5c857fbab0
--- /dev/null
+++ b/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
@@ -0,0 +1,12 @@
+## Check that Arm SPE mode is available on AArch64.
+
+REQUIRES: system-linux,perf,target=aarch64{{.*}}
+
+RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
+
+RUN: perf record -e cycles -q -o %t.perf.data -- %t.exe 2> /dev/null
+
+RUN: (perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe 2> /dev/null; exit 0) | FileCheck %s --check-prefix=CHECK-SPE-LBR
+
+CHECK-SPE-LBR: PERF2BOLT: parse SPE branch events in LBR-format
+
diff --git a/bolt/test/perf2bolt/X86/perf2bolt-spe.test b/bolt/test/perf2bolt/X86/perf2bolt-spe.test
new file mode 100644
index 0000000000000..8eed2c8595098
--- /dev/null
+++ b/bolt/test/perf2bolt/X86/perf2bolt-spe.test
@@ -0,0 +1,9 @@
+## Check that Arm SPE mode is unavailable on X86.
+
+REQUIRES: system-linux,x86_64-linux
+
+RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
+RUN: touch %t.empty.perf.data
+RUN: not perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --spe --pa %t.exe 2>&1 | FileCheck %s
+
+CHECK: perf2bolt: -spe is available only on AArch64.
diff --git a/bolt/tools/driver/llvm-bolt.cpp b/bolt/tools/driver/llvm-bolt.cpp
index b9836c2397b6b..cf1b31f8c0c66 100644
--- a/bolt/tools/driver/llvm-bolt.cpp
+++ b/bolt/tools/driver/llvm-bolt.cpp
@@ -237,6 +237,13 @@ int main(int argc, char **argv) {
       if (Error E = RIOrErr.takeError())
         report_error(opts::InputFilename, std::move(E));
       RewriteInstance &RI = *RIOrErr.get();
+
+      if (opts::AggregateOnly && !RI.getBinaryContext().isAArch64() &&
+          opts::ArmSPE) {
+        errs() << ToolName << ": -spe is available only on AArch64.\n";
+        exit(1);
+      }
+
       if (!opts::PerfData.empty()) {
         if (!opts::AggregateOnly) {
           errs() << ToolName
diff --git a/bolt/unittests/Profile/CMakeLists.txt b/bolt/unittests/Profile/CMakeLists.txt
index e0aa0926b49c0..ce01c6c4b949e 100644
--- a/bolt/unittests/Profile/CMakeLists.txt
+++ b/bolt/unittests/Profile/CMakeLists.txt
@@ -1,11 +1,25 @@
+set(LLVM_LINK_COMPONENTS
+  DebugInfoDWARF
+  Object
+  ${LLVM_TARGETS_TO_BUILD}
+  )
+
 add_bolt_unittest(ProfileTests
   DataAggregator.cpp
+  PerfSpeEvents.cpp
 
   DISABLE_LLVM_LINK_LLVM_DYLIB
   )
 
 target_link_libraries(ProfileTests
   PRIVATE
+  LLVMBOLTCore
   LLVMBOLTProfile
+  LLVMTargetParser
+  LLVMTestingSupport
   )
 
+foreach (tgt ${BOLT_TARGETS_TO_BUILD})
+  string(TOUPPER "${tgt}" upper)
+  target_compile_definitions(ProfileTests PRIVATE "${upper}_AVAILABLE")
+endforeach()
diff --git a/bolt/unittests/Profile/PerfSpeEvents.cpp b/bolt/unittests/Profile/PerfSpeEvents.cpp
new file mode 100644
index 0000000000000..3e3e05395246c
--- /dev/null
+++ b/bolt/unittests/Profile/PerfSpeEvents.cpp
@@ -0,0 +1,165 @@
+//===- bolt/unittests/Profile/PerfSpeEvents.cpp ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef AARCH64_AVAILABLE
+
+#include "bolt/Core/BinaryContext.h"
+#include "bolt/Profile/DataAggregator.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::bolt;
+using namespace llvm::object;
+using namespace llvm::ELF;
+
+namespace opts {
+extern cl::opt<std::string> ReadPerfEvents;
+extern cl::opt<bool> ArmSPE;
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+/// Perform checks on perf SPE branch events.
+struct PerfSpeEventsTestHelper : public testing::Test {
+  void SetUp() override {
+    initializeLLVM();
+    prepareElf();
+    initializeBOLT();
+  }
+
+protected:
+  using Trace = DataAggregator::Trace;
+  using TakenBranchInfo = DataAggregator::TakenBranchInfo;
+
+  void initializeLLVM() {
+    llvm::InitializeAllTargetInfos();
+    llvm::InitializeAllTargetMCs();
+    llvm::InitializeAllAsmParsers();
+    llvm::InitializeAllDisassemblers();
+    llvm::InitializeAllTargets();
+    llvm::InitializeAllAsmPrinters();
+  }
+
+  void prepareElf() {
+    memcpy(ElfBuf, "\177ELF", 4);
+    ELF64LE::Ehdr *EHdr = reinterpret_cast<typename ELF64LE::Ehdr *>(ElfBuf);
+    EHdr->e_ident[llvm::ELF::EI_CLASS] = llvm::ELF::ELFCLASS64;
+    EHdr->e_ident[llvm::ELF::EI_DATA] = llvm::ELF::ELFDATA2LSB;
+    EHdr->e_machine = llvm::ELF::EM_AARCH64;
+    MemoryBufferRef Source(StringRef(ElfBuf, sizeof(ElfBuf)), "ELF");
+    ObjFile = cantFail(ObjectFile::createObjectFile(Source));
+  }
+
+  void initializeBOLT() {
+    Relocation::Arch = ObjFile->makeTriple().getArch();
+    BC = cantFail(BinaryContext::createBinaryContext(
+        ObjFile->makeTriple(), std::make_shared<orc::SymbolStringPool>(),
+        ObjFile->getFileName(), nullptr, /*IsPIC*/ false,
+        DWARFContext::create(*ObjFile.get()), {llvm::outs(), llvm::errs()}));
+    ASSERT_FALSE(!BC);
+  }
+
+  char ElfBuf[sizeof(typename ELF64LE::Ehdr)] = {};
+  std::unique_ptr<ObjectFile> ObjFile;
+  std::unique_ptr<BinaryContext> BC;
+
+  /// Helper function to export lists to show the mismatch.
+  void reportBrStackEventMismatch(
+      const std::vector<std::pair<Trace, TakenBranchInfo>> &Traces,
+      const std::vector<std::pair<Trace, TakenBranchInfo>> &ExpectedSamples) {
+    llvm::errs() << "Traces items: \n";
+    for (const auto &[Trace, BI] : Traces)
+      llvm::errs() << "{" << Trace.Branch << ", " << Trace.From << ", "
+                   << Trace.To << ", " << BI.TakenCount << ", "
+                   << BI.MispredCount << "}" << "\n";
+
+    llvm::errs() << "Expected items: \n";
+    for (const auto &[Trace, BI] : ExpectedSamples)
+      llvm::errs() << "{" << Trace.Branch << ", " << Trace.From << ", "
+                   << Trace.To << ", " << BI.TakenCount << ", "
+                   << BI.MispredCount << "}" << "\n";
+  }
+
+  /// Parse and check SPE brstack as LBR.
+  void parseAndCheckBrstackEvents(
+      uint64_t PID,
+      const std::vector<std::pair<Trace, TakenBranchInfo>> &ExpectedSamples) {
+    DataAggregator DA("");
+    DA.ParsingBuf = opts::ReadPerfEvents;
+    DA.BC = BC.get();
+    DataAggregator::MMapInfo MMap;
+    DA.BinaryMMapInfo.insert(std::make_pair(PID, MMap));
+
+    DA.parseBranchEvents();
+
+    EXPECT_EQ(DA.Traces.size(), ExpectedSamples.size());
+    if (DA.Traces.size() != ExpectedSamples.size())
+      reportBrStackEventMismatch(DA.Traces, ExpectedSamples);
+
+    const auto TracesBegin = DA.Traces.begin();
+    const auto TracesEnd = DA.Traces.end();
+    for (const auto &BI : ExpectedSamples) {
+      auto it = find_if(TracesBegin, TracesEnd,
+                        [&BI](const auto &Tr) { return Tr.first == BI.first; });
+
+      // Fatal on a missing trace: the end iterator must not be dereferenced.
+      ASSERT_NE(it, TracesEnd);
+      EXPECT_EQ(it->second.MispredCount, BI.second.MispredCount);
+      EXPECT_EQ(it->second.TakenCount, BI.second.TakenCount);
+    }
+  }
+};
+
+} // namespace bolt
+} // namespace llvm
+
+TEST_F(PerfSpeEventsTestHelper, SpeBranchesWithBrstack) {
+  // Check perf input with SPE branch events as brstack format.
+  // Example collection command:
+  // ```
+  // perf record -e 'arm_spe_0/branch_filter=1/u' -- BINARY
+  // ```
+  // How Bolt extracts the branch events:
+  // ```
+  // perf script -F pid,brstack --itrace=bl
+  // ```
+
+  opts::ArmSPE = true;
+  opts::ReadPerfEvents = " 1234 0xa001/0xa002/PN/-/-/10/COND/-\n"
+                         " 1234 0xb001/0xb002/P/-/-/4/RET/-\n"
+                         " 1234 0xc456/0xc789/P/-/-/13/-/-\n"
+                         " 1234 0xd123/0xd456/M/-/-/7/RET/-\n"
+                         " 1234 0xe001/0xe002/P/-/-/14/RET/-\n"
+                         " 1234 0xd123/0xd456/M/-/-/7/RET/-\n"
+                         " 1234 0xf001/0xf002/MN/-/-/8/COND/-\n"
+                         " 1234 0xc456/0xc789/M/-/-/13/-/-\n";
+
+  // ExpectedSamples contains the aggregated information about
+  // a branch {{Branch From, To}, {TakenCount, MispredCount}}.
+  // Consider this example trace: {{0xd123, 0xd456, Trace::BR_ONLY},
+  // {2,2}}. This entry has a TakenCount = 2, as we have two samples for
+  // (0xd123, 0xd456) in our input. It also has MispredCount = 2,
+  // as 'M' misprediction flag appears in both cases. BR_ONLY means
+  // the trace only contains branch data.
+  std::vector<std::pair<Trace, TakenBranchInfo>> ExpectedSamples = {
+      {{0xa001, 0xa002, Trace::BR_ONLY}, {1, 0}},
+      {{0xb001, 0xb002, Trace::BR_ONLY}, {1, 0}},
+      {{0xc456, 0xc789, Trace::BR_ONLY}, {2, 1}},
+      {{0xd123, 0xd456, Trace::BR_ONLY}, {2, 2}},
+      {{0xe001, 0xe002, Trace::BR_ONLY}, {1, 0}},
+      {{0xf001, 0xf002, Trace::BR_ONLY}, {1, 1}}};
+
+  parseAndCheckBrstackEvents(1234, ExpectedSamples);
+}
+
+#endif