Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Yc test/1212 quarantine context #2455

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions source/loader/layers/sanitizer/asan/asan_ddi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices,
/// Reports whether hKernel is marked as instrumented in the ProgramInfo of
/// the program returned by GetProgram(hKernel). When the interceptor has no
/// ProgramInfo for that program, the kernel is reported as not instrumented.
bool isInstrumentedKernel(ur_kernel_handle_t hKernel) {
    const auto hOwner = GetProgram(hKernel);
    const auto Info = getAsanInterceptor()->getProgramInfo(hOwner);
    // No ProgramInfo for the owning program: there is nothing to query.
    return Info ? Info->isKernelInstrumented(hKernel) : false;
}

Expand Down Expand Up @@ -290,8 +293,9 @@ __urdlllocal ur_result_t UR_APICALL urProgramRetain(
UR_CALL(pfnRetain(hProgram));

auto ProgramInfo = getAsanInterceptor()->getProgramInfo(hProgram);
UR_ASSERT(ProgramInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE);
ProgramInfo->RefCount++;
if (ProgramInfo != nullptr) {
ProgramInfo->RefCount++;
}

return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -364,6 +368,7 @@ __urdlllocal ur_result_t UR_APICALL urProgramLink(

UR_CALL(pfnProgramLink(hContext, count, phPrograms, pOptions, phProgram));

UR_CALL(getAsanInterceptor()->insertProgram(*phProgram));
UR_CALL(getAsanInterceptor()->registerProgram(*phProgram));

return UR_RESULT_SUCCESS;
Expand Down Expand Up @@ -395,6 +400,7 @@ ur_result_t UR_APICALL urProgramLinkExp(
UR_CALL(pfnProgramLinkExp(hContext, numDevices, phDevices, count,
phPrograms, pOptions, phProgram));

UR_CALL(getAsanInterceptor()->insertProgram(*phProgram));
UR_CALL(getAsanInterceptor()->registerProgram(*phProgram));

return UR_RESULT_SUCCESS;
Expand All @@ -417,8 +423,7 @@ ur_result_t UR_APICALL urProgramRelease(
UR_CALL(pfnProgramRelease(hProgram));

auto ProgramInfo = getAsanInterceptor()->getProgramInfo(hProgram);
UR_ASSERT(ProgramInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE);
if (--ProgramInfo->RefCount == 0) {
if (ProgramInfo != nullptr && --ProgramInfo->RefCount == 0) {
UR_CALL(getAsanInterceptor()->unregisterProgram(hProgram));
UR_CALL(getAsanInterceptor()->eraseProgram(hProgram));
}
Expand Down
103 changes: 58 additions & 45 deletions source/loader/layers/sanitizer/asan/asan_interceptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,6 @@ ur_result_t AsanInterceptor::allocateMemory(ur_context_handle_t Context,

ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context,
void *Ptr) {
auto ContextInfo = getContextInfo(Context);

auto Addr = reinterpret_cast<uptr>(Ptr);
auto AllocInfoItOp = findAllocInfoByAddress(Addr);

Expand Down Expand Up @@ -193,56 +191,23 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context,
return UR_RESULT_ERROR_INVALID_ARGUMENT;
}

AllocInfo->IsReleased = true;
AllocInfo->ReleaseStack = GetCurrentBacktrace();

if (AllocInfo->Type == AllocType::HOST_USM) {
ContextInfo->insertAllocInfo(ContextInfo->DeviceList, AllocInfo);
} else {
ContextInfo->insertAllocInfo({AllocInfo->Device}, AllocInfo);
}

// If quarantine is disabled, USM is freed immediately
if (!m_Quarantine) {
getContext()->logger.debug("Free: {}", (void *)AllocInfo->AllocBegin);

ContextInfo->Stats.UpdateUSMRealFreed(AllocInfo->AllocSize,
AllocInfo->getRedzoneSize());

std::scoped_lock<ur_shared_mutex> Guard(m_AllocationMapMutex);
m_AllocationMap.erase(AllocInfoIt);

return getContext()->urDdiTable.USM.pfnFree(
Context, (void *)(AllocInfo->AllocBegin));
}

// If quarantine is enabled, cache it
auto ReleaseList = m_Quarantine->put(AllocInfo->Device, AllocInfoIt);
if (ReleaseList.size()) {
std::scoped_lock<ur_shared_mutex> Guard(m_AllocationMapMutex);
for (auto &It : ReleaseList) {
getContext()->logger.info("Quarantine Free: {}",
(void *)It->second->AllocBegin);

ContextInfo->Stats.UpdateUSMRealFreed(AllocInfo->AllocSize,
AllocInfo->getRedzoneSize());

m_AllocationMap.erase(It);
if (AllocInfo->Type == AllocType::HOST_USM) {
for (auto &Device : ContextInfo->DeviceList) {
UR_CALL(getDeviceInfo(Device)->Shadow->ReleaseShadow(
AllocInfo));
}
} else {
UR_CALL(getDeviceInfo(AllocInfo->Device)
->Shadow->ReleaseShadow(AllocInfo));
UR_CALL(releaseAllocationNoCheck(Context, AllocInfo, false));
} else {
// If quarantine is enabled, cache it
auto ReleaseList = m_Quarantine->put(AllocInfo);
if (ReleaseList.size()) {
std::scoped_lock<ur_shared_mutex> Guard(m_AllocationMapMutex);
for (auto &ToFreeAllocInfo : ReleaseList) {
m_AllocationMap.erase(ToFreeAllocInfo->AllocBegin);
UR_CALL(
releaseAllocationNoCheck(Context, ToFreeAllocInfo, true));
}

UR_CALL(getContext()->urDdiTable.USM.pfnFree(
Context, (void *)(It->second->AllocBegin)));
}
}
ContextInfo->Stats.UpdateUSMFreed(AllocInfo->AllocSize);

return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -431,6 +396,7 @@ ur_result_t AsanInterceptor::registerProgram(ur_program_handle_t Program) {

ur_result_t AsanInterceptor::unregisterProgram(ur_program_handle_t Program) {
auto ProgramInfo = getProgramInfo(Program);
assert(ProgramInfo != nullptr && "unregistered program!");

for (auto AI : ProgramInfo->AllocInfoForGlobals) {
UR_CALL(getDeviceInfo(AI->Device)->Shadow->ReleaseShadow(AI));
Expand Down Expand Up @@ -475,6 +441,7 @@ ur_result_t AsanInterceptor::registerSpirKernels(ur_program_handle_t Program) {
}

auto PI = getProgramInfo(Program);
assert(PI != nullptr && "unregistered program!");
for (const auto &SKI : SKInfo) {
if (SKI.Size == 0) {
continue;
Expand Down Expand Up @@ -511,6 +478,7 @@ AsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) {
auto Context = GetContext(Program);
auto ContextInfo = getContextInfo(Context);
auto ProgramInfo = getProgramInfo(Program);
assert(ProgramInfo != nullptr && "unregistered program!");

for (auto Device : Devices) {
ManagedQueue Queue(Context, Device);
Expand Down Expand Up @@ -582,6 +550,23 @@ ur_result_t AsanInterceptor::insertContext(ur_context_handle_t Context,
}

ur_result_t AsanInterceptor::eraseContext(ur_context_handle_t Context) {
// Remove quarantined memory when associated context is removed
// We don't use findAllocInfoByContext() here because we need to remove elements from quarantine, and it will break iterator.
if (m_Quarantine) {
auto ContextInfo = getContextInfo(Context);
auto AllocInfoIt = m_AllocationMap.begin();
while (AllocInfoIt != m_AllocationMap.end()) {
auto AI = AllocInfoIt->second;
if (AI->Context == Context) {
m_Quarantine->remove(AI);
releaseAllocationNoCheck(Context, AI, AI->IsReleased);
AllocInfoIt = m_AllocationMap.erase(AllocInfoIt);
} else {
AllocInfoIt++;
}
}
}

std::scoped_lock<ur_shared_mutex> Guard(m_ContextMapMutex);
assert(m_ContextMap.find(Context) != m_ContextMap.end());
m_ContextMap.erase(Context);
Expand Down Expand Up @@ -936,6 +921,34 @@ AsanInterceptor::findAllocInfoByContext(ur_context_handle_t Context) {
return AllocInfos;
}

/// Frees the USM allocation described by AI through the USM pfnFree entry of
/// the DDI table, performing no validation of AI or Context itself.
///
/// Before the free is issued this function:
///  - logs the address (message depends on IsFromQuarantine),
///  - updates the context's USM statistics,
///  - marks AI as released and records the current backtrace,
///  - passes AI to ContextInfo::insertAllocInfo for every device of the
///    context (host USM) or only for AI->Device (any other type).
///
/// @param Context          context handle handed to pfnFree.
/// @param AI               allocation record to release.
/// @param IsFromQuarantine only selects which debug message is logged.
/// @return the result of the USM pfnFree call.
ur_result_t
AsanInterceptor::releaseAllocationNoCheck(ur_context_handle_t Context,
                                          std::shared_ptr<AllocInfo> AI,
                                          bool IsFromQuarantine) {
    if (!IsFromQuarantine) {
        getContext()->logger.debug("Free: {}",
                                   reinterpret_cast<void *>(AI->AllocBegin));
    } else {
        getContext()->logger.debug("Quarantine Free: {}",
                                   reinterpret_cast<void *>(AI->AllocBegin));
    }

    auto CI = getContextInfo(Context);
    CI->Stats.UpdateUSMRealFreed(AI->AllocSize, AI->getRedzoneSize());
    CI->Stats.UpdateUSMFreed(AI->AllocSize);

    // Flag the record as released and remember where the release happened.
    AI->IsReleased = true;
    AI->ReleaseStack = GetCurrentBacktrace();

    if (AI->Type != AllocType::HOST_USM) {
        CI->insertAllocInfo({AI->Device}, AI);
    } else {
        // Host USM: hand the record to every device in the context.
        CI->insertAllocInfo(CI->DeviceList, AI);
    }

    return getContext()->urDdiTable.USM.pfnFree(
        Context, reinterpret_cast<void *>(AI->AllocBegin));
}

bool ProgramInfo::isKernelInstrumented(ur_kernel_handle_t Kernel) const {
const auto Name = GetKernelName(Kernel);
return InstrumentedKernels.find(Name) != InstrumentedKernels.end();
Expand Down
9 changes: 7 additions & 2 deletions source/loader/layers/sanitizer/asan/asan_interceptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,8 +334,10 @@ class AsanInterceptor {

/// Looks up the ProgramInfo stored for Program.
///
/// The lookup runs under a shared lock on m_ProgramMapMutex and never
/// modifies m_ProgramMap: find() is used instead of operator[], because
/// operator[] would insert an empty entry for an unknown key.
///
/// @param Program program handle used as the map key.
/// @return the stored ProgramInfo, or nullptr when m_ProgramMap has no entry
///         for Program. Callers must check for nullptr.
std::shared_ptr<ProgramInfo> getProgramInfo(ur_program_handle_t Program) {
    std::shared_lock<ur_shared_mutex> Guard(m_ProgramMapMutex);
    const auto It = m_ProgramMap.find(Program);
    if (It == m_ProgramMap.end()) {
        return nullptr;
    }
    return It->second;
}

std::shared_ptr<KernelInfo> getKernelInfo(ur_kernel_handle_t Kernel) {
Expand Down Expand Up @@ -376,6 +378,9 @@ class AsanInterceptor {

ur_result_t registerDeviceGlobals(ur_program_handle_t Program);
ur_result_t registerSpirKernels(ur_program_handle_t Program);
ur_result_t releaseAllocationNoCheck(ur_context_handle_t Context,
std::shared_ptr<AllocInfo> AI,
bool isFromQuarantine);

private:
// m_Options may be used in other places, place it at the top
Expand Down
23 changes: 15 additions & 8 deletions source/loader/layers/sanitizer/asan/asan_quarantine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,31 @@
namespace ur_sanitizer_layer {
namespace asan {

// Puts an allocation into the quarantine cache returned by getCache(Device).
// While the cache's current size plus the incoming AllocSize exceeds
// m_MaxQuarantineSize, entries are dequeued from the cache and collected in
// DequeueList, which is returned to the caller. The incoming allocation is
// enqueued afterwards even if the cache ran empty before enough room was made.
// NOTE(review): this span shows two signatures and paired statements
// (iterator-based and shared_ptr-based); it looks like both sides of a diff
// are present - confirm which variant is current before editing.
std::vector<AllocationIterator> Quarantine::put(ur_device_handle_t Device,
AllocationIterator &It) {
auto &AI = It->second;
std::vector<std::shared_ptr<AllocInfo>> Quarantine::put(std::shared_ptr<AllocInfo> &AI) {
auto Device = AI->Device;
auto AllocSize = AI->AllocSize;
auto &Cache = getCache(Device);

std::vector<AllocationIterator> DequeueList;
std::vector<std::shared_ptr<AllocInfo>> DequeueList;
// Held until return: every cache call below is made under Cache.Mutex.
std::scoped_lock<ur_mutex> Guard(Cache.Mutex);
while (Cache.size() + AllocSize > m_MaxQuarantineSize) {
auto ElementOp = Cache.dequeue();
if (!ElementOp) {
auto AIToFreeOp = Cache.dequeue();
if (!AIToFreeOp) {
// dequeue() returned no element: stop evicting.
break;
}
DequeueList.emplace_back(*ElementOp);
DequeueList.emplace_back(*AIToFreeOp);
}
Cache.enqueue(It);
Cache.enqueue(AI);
return DequeueList;
}

void Quarantine::remove(std::shared_ptr<AllocInfo> &AI) {
auto Device = AI->Device;
auto &Cache = getCache(Device);

std::scoped_lock<ur_mutex> Guard(Cache.Mutex);
Cache.remove(AI);
}

} // namespace asan
} // namespace ur_sanitizer_layer
44 changes: 29 additions & 15 deletions source/loader/layers/sanitizer/asan/asan_quarantine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,32 +24,46 @@ namespace asan {

class QuarantineCache {
public:
using Element = AllocationIterator;
using List = std::queue<Element>;
using ElementT = std::shared_ptr<AllocInfo>;
using QueueT = std::queue<ElementT>;

// The following methods are not thread safe, use this lock
ur_mutex Mutex;

// Total memory used, including internal accounting.
uptr size() const { return m_Size; }

void enqueue(Element &It) {
m_List.push(It);
m_Size += It->second->AllocSize;
void enqueue(ElementT &AI) {
m_Queue.push(AI);
m_Size += AI->AllocSize;
}

std::optional<Element> dequeue() {
if (m_List.empty()) {
return std::optional<Element>{};
std::optional<ElementT> dequeue() {
if (m_Queue.empty()) {
return std::optional<ElementT>{};
}
auto It = m_List.front();
m_List.pop();
m_Size -= It->second->AllocSize;
return It;
auto &AI = m_Queue.front();
m_Queue.pop();
m_Size -= AI->AllocSize;
return AI;
}

void remove(ElementT &AI) {
m_Size -= AI->AllocSize;
// remove the element from the queue
QueueT newQueue;
while (!m_Queue.empty()) {
auto currentElement = m_Queue.front();
m_Queue.pop();
if (currentElement != AI) {
newQueue.push(currentElement);
}
}
std::swap(m_Queue, newQueue);
}

private:
List m_List;
QueueT m_Queue;
std::atomic_uintptr_t m_Size = 0;
};

Expand All @@ -58,8 +72,8 @@ class Quarantine {
explicit Quarantine(size_t MaxQuarantineSize)
: m_MaxQuarantineSize(MaxQuarantineSize) {}

std::vector<AllocationIterator> put(ur_device_handle_t Device,
AllocationIterator &Ptr);
std::vector<std::shared_ptr<AllocInfo>> put(std::shared_ptr<AllocInfo> &AI);
void remove(std::shared_ptr<AllocInfo> &AI);

private:
QuarantineCache &getCache(ur_device_handle_t Device) {
Expand Down
1 change: 1 addition & 0 deletions source/loader/layers/sanitizer/asan/asan_shadow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ ur_result_t ShadowMemoryGPU::ReleaseShadow(std::shared_ptr<AllocInfo> AI) {
getContext()->logger.debug("urVirtualMemUnmap: {} ~ {}",
(void *)MappedPtr,
(void *)(MappedPtr + PageSize - 1));
VirtualMemMaps.erase(MappedPtr);
}
}

Expand Down