Refactor NVML, allow unavailable items to disappear, make thermal display color thresholds configurable at both compile time and runtime
xmrig · Oct 20, 2019 · 625ea00 · 625ea00
1 parent 69af502
commit 625ea00
Show file tree

Hide file tree

Showing 13 changed files with 198 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -68,6 +68,8 @@ Use [config.xmrig.com](https://config.xmrig.com/nvidia) to generate, edit or sha
       --cuda-bfactor=[0-12] run CryptoNight core kernel in smaller pieces
       --cuda-bsleep=N       insert a delay of N microseconds between kernel launches
       --cuda-affinity=N     affine GPU threads to a CPU
+      --temp-low=N          comma-separated list of temperatures (Celsius) below which the reading is shown in green
+      --temp-high=N         comma-separated list of temperatures (Celsius) above which the reading is shown in red
       --no-color            disable colored output
       --variant             algorithm PoW variant
       --donate-level=N      donate level, default 5% (5 minutes in 100 minutes)

diff --git a/src/common/interfaces/IConfig.h b/src/common/interfaces/IConfig.h
@@ -126,6 +126,8 @@ class IConfig
         CudaLaunchKey     = 1204,
         CudaAffinityKey   = 1205,
         CudaMaxUsageKey   = 1206,
+        NvmlTempL         = 1207,
+        NvmlTempH         = 1208,
     };
 
     virtual ~IConfig() = default;

diff --git a/src/core/Config.cpp b/src/core/Config.cpp
@@ -99,6 +99,8 @@ void xmrig::Config::getJSON(rapidjson::Document &doc) const
     doc.AddMember("cuda-bfactor",     m_cudaCLI.bfactor(), allocator);
     doc.AddMember("cuda-bsleep",      m_cudaCLI.bsleep(), allocator);
     doc.AddMember("cuda-max-threads", m_maxGpuThreads, allocator);
+    doc.AddMember("temp-low",         m_cudaCLI.temp_low(), allocator);
+    doc.AddMember("temp-high",        m_cudaCLI.temp_high(), allocator);
     doc.AddMember("donate-level",     donateLevel(), allocator);
     doc.AddMember("log-file",         logFile() ? Value(StringRef(logFile())).Move() : Value(kNullType).Move(), allocator);
     doc.AddMember("pools",            m_pools.toJSON(doc), allocator);
@@ -181,6 +183,14 @@ bool xmrig::Config::parseString(int key, const char *arg)
     case CudaMaxUsageKey:
         return parseUint64(key, strtoul(arg, nullptr, 10));
 
+    case NvmlTempL: /* --temp-low */
+        m_cudaCLI.parseTempLow(arg);
+        break;
+
+    case NvmlTempH: /* --temp-high */
+        m_cudaCLI.parseTempHigh(arg);
+        break;
+
     default:
         break;
     }

diff --git a/src/core/ConfigLoader_platform.h b/src/core/ConfigLoader_platform.h
@@ -61,6 +61,8 @@ static struct option const options[] = {
     { "cuda-max-threads",  1, nullptr, xmrig::IConfig::CudaMaxThreadsKey },
     { "max-gpu-threads",   1, nullptr, xmrig::IConfig::CudaMaxThreadsKey }, // deprecated, use --cuda-max-threads instead.
     { "max-gpu-usage",     1, nullptr, xmrig::IConfig::CudaMaxUsageKey   }, // deprecated.
+    { "temp-low",          1, nullptr, xmrig::IConfig::NvmlTempL         },
+    { "temp-high",         1, nullptr, xmrig::IConfig::NvmlTempH         },
     { "config",            1, nullptr, xmrig::IConfig::ConfigKey         },
     { "donate-level",      1, nullptr, xmrig::IConfig::DonateLevelKey    },
     { "dry-run",           0, nullptr, xmrig::IConfig::DryRunKey         },
@@ -106,6 +108,8 @@ static struct option const config_options[] = {
     { "cuda-max-threads",  1, nullptr, xmrig::IConfig::CudaMaxThreadsKey },
     { "max-gpu-threads",   1, nullptr, xmrig::IConfig::CudaMaxThreadsKey }, // deprecated, use --cuda-max-threads instead.
     { "max-gpu-usage",     1, nullptr, xmrig::IConfig::CudaMaxUsageKey   }, // deprecated.
+    { "temp-low",          1, nullptr, xmrig::IConfig::NvmlTempL         },
+    { "temp-high",         1, nullptr, xmrig::IConfig::NvmlTempH         },
     { nullptr,             0, nullptr, 0                                 }
 };
 

diff --git a/src/core/usage.h b/src/core/usage.h
@@ -63,6 +63,8 @@ Options:\n\
       --cuda-bfactor=[0-12] run CryptoNight core kernel in smaller pieces\n\
       --cuda-bsleep=N       insert a delay of N microseconds between kernel launches\n\
       --cuda-affinity=N     affine GPU threads to a CPU\n\
+      --temp-low=N          list of celsius temperature below which is green\n\
+      --temp-high=N         list of celsius temperature above which is red\n\
       --no-color            disable colored output\n\
       --variant             algorithm PoW variant\n\
       --donate-level=N      donate level, default 5%% (5 minutes in 100 minutes)\n\

diff --git a/src/defaults.h b/src/defaults.h
@@ -0,0 +1,31 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_DEFAULTS_H
+#define XMRIG_DEFAULTS_H
+
+// Default temperature display thresholds, in degrees Celsius.
+// A reading below DFL_nvmlTempL is shown green, a reading above DFL_nvmlTempH
+// is shown red, and anything in between is shown yellow.
+//
+// Each value is only defined here when the build has not already supplied it,
+// so it can be overridden from the compiler command line
+// (e.g. -DDFL_nvmlTempL=50) without editing this file.
+#ifndef DFL_nvmlTempL
+#   define DFL_nvmlTempL 45
+#endif
+
+#ifndef DFL_nvmlTempH
+#   define DFL_nvmlTempH 65
+#endif
+
+#endif /* XMRIG_DEFAULTS_H */
diff --git a/src/nvidia/CudaCLI.cpp b/src/nvidia/CudaCLI.cpp
@@ -153,6 +153,34 @@ void CudaCLI::parseLaunch(const char *arg)
 }
 
 
+/**
+ * Appends every number found in a comma-separated list to `out`.
+ *
+ * The argument is scanned in place instead of being duplicated and split with
+ * strtok(): there is no allocation that can fail, and no dependence on
+ * strtok()'s hidden global state (a failed strdup() would otherwise make
+ * strtok(nullptr, ...) resume a previous, already freed, tokenization).
+ *
+ * Results match the strtok() based splitting: empty items (leading, trailing
+ * or doubled commas) are skipped, and each remaining item is converted with
+ * strtoul() in base 10, so an item without leading digits yields 0.
+ *
+ * @param arg  NUL-terminated list such as "45,50,48"; nullptr is ignored.
+ * @param out  vector receiving the parsed values, in input order.
+ */
+static void appendTemperatureList(const char *arg, std::vector<int> &out)
+{
+    if (arg == nullptr) {
+        return;
+    }
+
+    const char *cursor = arg;
+
+    while (*cursor != '\0') {
+        // Skip separators so that empty items never produce an entry.
+        if (*cursor == ',') {
+            ++cursor;
+            continue;
+        }
+
+        // strtoul() stops at the first character that cannot be part of a
+        // number, which is at the latest the next ',' or the terminating NUL,
+        // so it never reads into the following item.
+        out.push_back(static_cast<int>(strtoul(cursor, nullptr, 10)));
+
+        const char *comma = strchr(cursor, ',');
+        if (comma == nullptr) {
+            break;
+        }
+
+        cursor = comma + 1;
+    }
+}
+
+
+/**
+ * Parses the value of --temp-low and appends the thresholds to m_temp_low.
+ *
+ * Values are appended, not replaced, so calling this more than once
+ * accumulates entries.
+ *
+ * @param arg  comma-separated list of temperatures.
+ */
+void CudaCLI::parseTempLow(const char *arg)
+{
+    appendTemperatureList(arg, m_temp_low);
+}
+
+
+/**
+ * Parses the value of --temp-high and appends the thresholds to m_temp_high.
+ *
+ * Values are appended, not replaced, so calling this more than once
+ * accumulates entries.
+ *
+ * @param arg  comma-separated list of temperatures.
+ */
+void CudaCLI::parseTempHigh(const char *arg)
+{
+    appendTemperatureList(arg, m_temp_high);
+}
+
+
 int CudaCLI::get(const std::vector<int> &vector, int index, int defaultValue) const
 {
     if (vector.empty()) {

diff --git a/src/nvidia/CudaCLI.h b/src/nvidia/CudaCLI.h
@@ -28,6 +28,7 @@
 #include <vector>
 
 
+#include "defaults.h"
 #include "common/xmrig.h"
 
 
@@ -46,6 +47,8 @@ class CudaCLI
     void autoConf(std::vector<xmrig::IThread *> &threads, xmrig::Algo algo, bool isCNv2);
     void parseDevices(const char *arg);
     void parseLaunch(const char *arg);
+    void parseTempLow(const char *arg);
+    void parseTempHigh(const char *arg);
 
     inline void addBFactor(int bfactor)        { m_bfactors.push_back(bfactor); }
     inline void addBSleep(int bsleep)          { m_bsleeps.push_back(bsleep); }
@@ -71,6 +74,9 @@ class CudaCLI
 #       endif
     }
 
+    inline int temp_low(int index = 0) const  { return get(m_temp_low, index, DFL_nvmlTempL); }
+    inline int temp_high(int index = 0) const { return get(m_temp_high, index, DFL_nvmlTempH); }
+
 private:
     inline int affinity(int index) const { return get(m_affinity, index, -1); }
     inline int blocks(int index) const   { return get(m_blocks, index, -1); }
@@ -87,6 +93,8 @@ class CudaCLI
     std::vector<int> m_bsleeps;
     std::vector<int> m_devices;
     std::vector<int> m_threads;
+    std::vector<int> m_temp_low;
+    std::vector<int> m_temp_high;
 };
 
 

diff --git a/src/nvidia/Health.h b/src/nvidia/Health.h
@@ -27,6 +27,7 @@
 
 #include <stdint.h>
 
+#define PROBED_UNSUPPORTED 0x8675309
 
 class Health
 {

diff --git a/src/nvidia/NvmlApi.cpp b/src/nvidia/NvmlApi.cpp
@@ -30,7 +30,7 @@
 
 
 static uv_lib_t nvmlLib;
-static char nvmlVerion[80] = { 0 };
+static char nvmlVersion[NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE] = { 0 };
 
 
 bool NvmlApi::m_available = false;
@@ -50,8 +50,8 @@ static nvmlReturn_t(*pNvmlDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t *
 bool NvmlApi::init()
 {
 #   ifdef _WIN32
-    char tmp[512];
-    ExpandEnvironmentStringsA("%PROGRAMFILES%\\NVIDIA Corporation\\NVSMI\\nvml.dll", tmp, sizeof(tmp));
+    char tmp[261]; //LoadLibrary calls are still "260 char" limited
+    ExpandEnvironmentStringsA(R"(%ProgramFiles%\NVIDIA Corporation\NVSMI\nvml.dll)", tmp, sizeof(tmp));
     if (uv_dlopen(tmp, &nvmlLib) == -1 && uv_dlopen("nvml.dll", &nvmlLib) == -1) {
         return false;
     }
@@ -78,7 +78,7 @@ bool NvmlApi::init()
     m_available = pNvmlInit() == NVML_SUCCESS;
 
     if (pNvmlSystemGetNVMLVersion) {
-        pNvmlSystemGetNVMLVersion(nvmlVerion, sizeof(nvmlVerion));
+        pNvmlSystemGetNVMLVersion(nvmlVersion, sizeof(nvmlVersion));
     }
 
     return m_available;
@@ -95,34 +95,52 @@ void NvmlApi::release()
 }
 
 
-bool NvmlApi::health(int id, Health &health)
+bool NvmlApi::health(int i, Health &health)
 {
-    if (id == -1 || !isAvailable()) {
+    const auto id = static_cast<unsigned int>(i);
+    nvmlDevice_t device;
+
+    if (i == -1 || !isAvailable()
+        ||
+        (pNvmlDeviceGetHandleByIndex && pNvmlDeviceGetHandleByIndex(id, &device) != NVML_SUCCESS)
+    ) {
         return false;
     }
 
-    health.reset();
+    // cache items previously pegged as unavailable via function call failure
+    // this has to happen before the reset or we don't see the previous value
+    const bool hasPowerUsage = PROBED_UNSUPPORTED != health.power;
+    const bool hasFanSpeed   = PROBED_UNSUPPORTED != health.fanSpeed;
+    const bool hasClockInfo  = PROBED_UNSUPPORTED != health.clock;
 
-    nvmlDevice_t device;
-    if (pNvmlDeviceGetHandleByIndex && pNvmlDeviceGetHandleByIndex(id, &device) != NVML_SUCCESS) {
-        return false;
-    }
+    health.reset();
 
     if (pNvmlDeviceGetTemperature) {
         pNvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &health.temperature);
     }
 
     if (pNvmlDeviceGetPowerUsage) {
-        pNvmlDeviceGetPowerUsage(device, &health.power);
+        if (!hasPowerUsage || pNvmlDeviceGetPowerUsage(device, &health.power) != NVML_SUCCESS){
+            health.power = PROBED_UNSUPPORTED;
+        }
     }
 
     if (pNvmlDeviceGetFanSpeed) {
-        pNvmlDeviceGetFanSpeed(device, &health.fanSpeed);
+        if (!hasFanSpeed || pNvmlDeviceGetFanSpeed(device, &health.fanSpeed) != NVML_SUCCESS){
+            health.fanSpeed = PROBED_UNSUPPORTED;
+        }
     }
 
     if (pNvmlDeviceGetClockInfo) {
-        pNvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &health.clock);
-        pNvmlDeviceGetClockInfo(device, NVML_CLOCK_MEM, &health.memClock);
+        if (!hasClockInfo
+            ||
+            pNvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &health.clock) != NVML_SUCCESS
+            ||
+            pNvmlDeviceGetClockInfo(device, NVML_CLOCK_MEM, &health.memClock) != NVML_SUCCESS
+        ) {
+            health.clock = PROBED_UNSUPPORTED;
+            health.memClock = PROBED_UNSUPPORTED;
+        }
     }
 
     return true;
@@ -131,7 +149,7 @@ bool NvmlApi::health(int id, Health &health)
 
 const char *NvmlApi::version()
 {
-    return nvmlVerion;
+    return nvmlVersion;
 }
 
 
@@ -158,7 +176,7 @@ void NvmlApi::bind(const std::vector<xmrig::IThread*> &threads)
         }
 
         for (xmrig::IThread *t : threads) {
-            auto thread = static_cast<CudaThread *>(t);
+            auto thread = dynamic_cast<CudaThread *>(t);
             if (thread->pciBusID() == pci.bus && thread->pciDeviceID() == pci.device && thread->pciDomainID() == pci.domain) {
                 thread->setNvmlId(i);
                 break;

diff --git a/src/workers/CudaThread.cpp b/src/workers/CudaThread.cpp
@@ -27,6 +27,7 @@
 #include <string.h>
 
 
+#include "defaults.h"
 #include "rapidjson/document.h"
 #include "workers/CudaThread.h"
 
@@ -38,6 +39,8 @@ CudaThread::CudaThread() :
     m_clockRate(0),
     m_memoryClockRate(0),
     m_nvmlId(-1),
+    m_nvmlTempL(DFL_nvmlTempL),
+    m_nvmlTempH(DFL_nvmlTempH),
     m_smx(0),
     m_threads(0),
     m_affinity(-1),
@@ -63,6 +66,8 @@ CudaThread::CudaThread(const nvid_ctx &ctx, int64_t affinity, xmrig::Algo algori
     m_clockRate(ctx.device_clockRate),
     m_memoryClockRate(ctx.device_memoryClockRate),
     m_nvmlId(-1),
+    m_nvmlTempL(DFL_nvmlTempL),
+    m_nvmlTempH(DFL_nvmlTempH),
     m_smx(ctx.device_mpcount),
     m_threads(ctx.device_threads),
     m_affinity(affinity),
@@ -88,6 +93,8 @@ CudaThread::CudaThread(const rapidjson::Value &object) :
     m_clockRate(0),
     m_memoryClockRate(0),
     m_nvmlId(-1),
+    m_nvmlTempL(DFL_nvmlTempL),
+    m_nvmlTempH(DFL_nvmlTempH),
     m_smx(0),
     m_threads(0),
     m_affinity(-1),
@@ -117,6 +124,16 @@ CudaThread::CudaThread(const rapidjson::Value &object) :
     if (affinity.IsInt()) {
         setAffinity(affinity.GetInt());
     }
+
+    const rapidjson::Value &tempL = object["temp_low"];
+    if (tempL.IsInt()) {
+        setNvmlTempL(static_cast<uint32_t>(tempL.GetInt()));
+    }
+
+    const rapidjson::Value &tempH = object["temp_high"];
+    if (tempH.IsInt()) {
+        setNvmlTempH(static_cast<uint32_t>(tempH.GetInt()));
+    }
 }
 
 

diff --git a/src/workers/CudaThread.h b/src/workers/CudaThread.h
@@ -53,6 +53,8 @@ class CudaThread : public xmrig::IThread
     inline size_t memoryTotal() const     { return m_memoryTotal; }
     inline size_t memoryFree() const      { return m_memoryFree; }
     inline int nvmlId() const             { return m_nvmlId; }
+    inline uint32_t nvmlTempL() const     { return m_nvmlTempL; }
+    inline uint32_t nvmlTempH() const     { return m_nvmlTempH; }
     inline int smx() const                { return m_smx; }
     inline int threads() const            { return m_threads; }
     inline size_t threadId() const        { return m_threadId; }
@@ -74,6 +76,8 @@ class CudaThread : public xmrig::IThread
     inline void setBSleep(int bsleep)          { m_bsleep = bsleep; }
     inline void setIndex(size_t index)         { m_index = index; }
     inline void setNvmlId(int id)              { m_nvmlId = id; }
+    inline void setNvmlTempL(uint32_t temp)    { m_nvmlTempL = temp; }
+    inline void setNvmlTempH(uint32_t temp)    { m_nvmlTempH = temp; }
     inline void setThreadId(size_t threadId)   { m_threadId = threadId; }
     inline void setThreads(int threads)        { m_threads = threads; }
     inline void setSyncMode(uint32_t syncMode) { m_syncMode = syncMode > 3 ? 3 : syncMode; }
@@ -98,6 +102,8 @@ class CudaThread : public xmrig::IThread
     int m_clockRate;
     int m_memoryClockRate;
     int m_nvmlId;
+    uint32_t m_nvmlTempL;
+    uint32_t m_nvmlTempH;
     int m_smx;
     int m_threads;
     int64_t m_affinity;