Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for device types and predictable device paths #235

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 54 additions & 25 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,13 @@ import (

// Device
type Device struct {
Name string `json:"name"`
Info_Name string `json:"info_name"`
Type string `json:"type"`
Name string
Type string
Label string
}

func (d Device) String() string {
return d.Name + ";" + d.Type + " (" + d.Label + ")"
}

// SMARTctlManagerCollector implements the Collector interface.
Expand Down Expand Up @@ -81,6 +85,7 @@ func (i *SMARTctlManagerCollector) RescanForDevices() {
time.Sleep(*smartctlRescanInterval)
level.Info(i.logger).Log("msg", "Rescanning for devices")
devices := scanDevices(i.logger)
devices = buildDevicesFromFlag(devices)
i.mutex.Lock()
i.Devices = devices
i.mutex.Unlock()
Expand All @@ -97,8 +102,9 @@ var (
smartctlRescanInterval = kingpin.Flag("smartctl.rescan",
"The interval between rescanning for new/disappeared devices. If the interval is smaller than 1s no rescanning takes place. If any devices are configured with smartctl.device also no rescanning takes place.",
).Default("10m").Duration()
smartctlScan = kingpin.Flag("smartctl.scan", "Enable scanning. This is a default if no devices are specified").Default("false").Bool()
smartctlDevices = kingpin.Flag("smartctl.device",
"The device to monitor (repeatable)",
"The device to monitor. Device type can be specified after a semicolon, eg. '/dev/bus/0;megaraid,1' (repeatable)",
).Strings()
smartctlDeviceExclude = kingpin.Flag(
"smartctl.device-exclude",
Expand All @@ -108,6 +114,10 @@ var (
"smartctl.device-include",
"Regexp of devices to exclude from automatic scanning. (mutually exclusive to device-exclude)",
).Default("").String()
smartctlScanDeviceTypes = kingpin.Flag(
"smartctl.scan-device-type",
"Device type to use during automatic scan. Special by-id value forces predictable device names. (repeatable)",
).Strings()
smartctlFakeData = kingpin.Flag("smartctl.fake-data",
"The device to monitor (repeatable)",
).Default("false").Hidden().Bool()
Expand All @@ -121,34 +131,45 @@ func scanDevices(logger log.Logger) []Device {
scanDevices := json.Get("devices").Array()
var scanDeviceResult []Device
for _, d := range scanDevices {
deviceName := extractDiskName(strings.TrimSpace(d.Get("info_name").String()))
if filter.ignored(deviceName) {
level.Info(logger).Log("msg", "Ignoring device", "name", deviceName)
deviceName := d.Get("name").String()
deviceType := d.Get("type").String()

// SATA devices are reported as SCSI during scan - fallback to auto scraping
if deviceType == "scsi" {
deviceType = "auto"
}

deviceLabel := buildDeviceLabel(deviceName, deviceType)
if filter.ignored(deviceLabel) {
level.Info(logger).Log("msg", "Ignoring device", "name", deviceLabel)
} else {
level.Info(logger).Log("msg", "Found device", "name", deviceName)
level.Info(logger).Log("msg", "Found device", "name", deviceLabel)
device := Device{
Name: d.Get("name").String(),
Info_Name: deviceName,
Type: d.Get("type").String(),
Name: deviceName,
Type: deviceType,
Label: deviceLabel,
}
scanDeviceResult = append(scanDeviceResult, device)
}
}
return scanDeviceResult
}

func filterDevices(logger log.Logger, devices []Device, filters []string) []Device {
var filtered []Device
for _, d := range devices {
for _, filter := range filters {
level.Debug(logger).Log("msg", "filterDevices", "device", d.Info_Name, "filter", filter)
if strings.Contains(d.Info_Name, filter) {
filtered = append(filtered, d)
break
}
func buildDevicesFromFlag(devices []Device) []Device {
// TODO: deduplication?
for _, device := range *smartctlDevices {
deviceName, deviceType, _ := strings.Cut(device, ";")
if deviceType == "" {
deviceType = "auto"
}

devices = append(devices, Device{
Name: deviceName,
Type: deviceType,
Label: buildDeviceLabel(deviceName, deviceType),
})
}
return filtered
return devices
}

func main() {
Expand All @@ -168,11 +189,19 @@ func main() {
level.Info(logger).Log("msg", "Build context", "build_context", version.BuildContext())

var devices []Device
devices = scanDevices(logger)
level.Info(logger).Log("msg", "Number of devices found", "count", len(devices))

if len(*smartctlDevices) == 0 {
*smartctlScan = true
}

if *smartctlScan {
devices = scanDevices(logger)
level.Info(logger).Log("msg", "Number of devices found", "count", len(devices))
}

if len(*smartctlDevices) > 0 {
level.Info(logger).Log("msg", "Devices specified", "devices", strings.Join(*smartctlDevices, ", "))
devices = filterDevices(logger, devices, *smartctlDevices)
devices = buildDevicesFromFlag(devices)
level.Info(logger).Log("msg", "Devices filtered", "count", len(devices))
}

Expand All @@ -181,7 +210,7 @@ func main() {
logger: logger,
}

if *smartctlRescanInterval >= 1*time.Second {
if *smartctlScan && *smartctlRescanInterval >= 1*time.Second {
level.Info(logger).Log("msg", "Start background scan process")
level.Info(logger).Log("msg", "Rescanning for devices every", "rescanInterval", *smartctlRescanInterval)
go collector.RescanForDevices()
Expand Down
36 changes: 23 additions & 13 deletions readjson.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,27 +64,37 @@ func readFakeSMARTctl(logger log.Logger, device Device) gjson.Result {
// Get json from smartctl and parse it
func readSMARTctl(logger log.Logger, device Device) (gjson.Result, bool) {
start := time.Now()
out, err := exec.Command(*smartctlPath, "--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device="+device.Type, device.Name).Output()
var smartctlArgs = []string{"--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device=" + device.Type, device.Name}

level.Debug(logger).Log("msg", "Calling smartctl with args", "args", strings.Join(smartctlArgs, " "))
out, err := exec.Command(*smartctlPath, smartctlArgs...).Output()
if err != nil {
level.Warn(logger).Log("msg", "S.M.A.R.T. output reading", "err", err, "device", device.Info_Name)
level.Warn(logger).Log("msg", "S.M.A.R.T. output reading", "err", err, "device", device)
}
json := parseJSON(string(out))
rcOk := resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int())
jsonOk := jsonIsOk(logger, json)
level.Debug(logger).Log("msg", "Collected S.M.A.R.T. json data", "device", device.Info_Name, "duration", time.Since(start))
level.Debug(logger).Log("msg", "Collected S.M.A.R.T. json data", "device", device, "duration", time.Since(start))
return json, rcOk && jsonOk
}

func readSMARTctlDevices(logger log.Logger) gjson.Result {
level.Debug(logger).Log("msg", "Scanning for devices")
out, err := exec.Command(*smartctlPath, "--json", "--scan").Output()
var scanArgs []string = []string{"--json", "--scan"}
for _, d := range *smartctlScanDeviceTypes {
scanArgs = append(scanArgs, "--device", d)
}
out, err := exec.Command(*smartctlPath, scanArgs...).Output()
if exiterr, ok := err.(*exec.ExitError); ok {
level.Debug(logger).Log("msg", "Exit Status", "exit_code", exiterr.ExitCode())
// The smartctl command returns 2 if devices are sleeping, ignore this error.
if exiterr.ExitCode() != 2 {
level.Warn(logger).Log("msg", "S.M.A.R.T. output reading error", "err", err)
return gjson.Result{}
}
} else if err != nil {
level.Warn(logger).Log("msg", "S.M.A.R.T. output reading error", "err", err)
return gjson.Result{}
}
return parseJSON(string(out))
}
Expand All @@ -102,7 +112,7 @@ func readData(logger log.Logger, device Device) gjson.Result {
jsonCache.Store(device, JSONCache{JSON: json, LastCollect: time.Now()})
j, found := jsonCache.Load(device)
if !found {
level.Warn(logger).Log("msg", "device not found", "device", device.Info_Name)
level.Warn(logger).Log("msg", "device not found", "device", device)
}
return j.(JSONCache).JSON
}
Expand All @@ -117,30 +127,30 @@ func resultCodeIsOk(logger log.Logger, device Device, SMARTCtlResult int64) bool
if SMARTCtlResult > 0 {
b := SMARTCtlResult
if (b & 1) != 0 {
level.Error(logger).Log("msg", "Command line did not parse", "device", device.Info_Name)
level.Error(logger).Log("msg", "Command line did not parse", "device", device)
result = false
}
if (b & (1 << 1)) != 0 {
level.Error(logger).Log("msg", "Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device.Info_Name)
level.Error(logger).Log("msg", "Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device)
result = false
}
if (b & (1 << 2)) != 0 {
level.Warn(logger).Log("msg", "Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device.Info_Name)
level.Warn(logger).Log("msg", "Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device)
}
if (b & (1 << 3)) != 0 {
level.Warn(logger).Log("msg", "SMART status check returned 'DISK FAILING'", "device", device.Info_Name)
level.Warn(logger).Log("msg", "SMART status check returned 'DISK FAILING'", "device", device)
}
if (b & (1 << 4)) != 0 {
level.Warn(logger).Log("msg", "We found prefail Attributes <= threshold", "device", device.Info_Name)
level.Warn(logger).Log("msg", "We found prefail Attributes <= threshold", "device", device)
}
if (b & (1 << 5)) != 0 {
level.Warn(logger).Log("msg", "SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device.Info_Name)
level.Warn(logger).Log("msg", "SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device)
}
if (b & (1 << 6)) != 0 {
level.Warn(logger).Log("msg", "The device error log contains records of errors", "device", device.Info_Name)
level.Warn(logger).Log("msg", "The device error log contains records of errors", "device", device)
}
if (b & (1 << 7)) != 0 {
level.Warn(logger).Log("msg", "The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device.Info_Name)
level.Warn(logger).Log("msg", "The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device)
}
}
return result
Expand Down
30 changes: 9 additions & 21 deletions smartctl.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,28 +43,16 @@ type SMARTctl struct {
device SMARTDevice
}

func extractDiskName(input string) string {
re := regexp.MustCompile(`^(?:/dev/(?P<bus_name>\S+)/(?P<bus_num>\S+)\s\[|/dev/|\[)(?:\s\[|)(?P<disk>[a-z0-9_]+)(?:\].*|)$`)
match := re.FindStringSubmatch(input)

if len(match) > 0 {
busNameIndex := re.SubexpIndex("bus_name")
busNumIndex := re.SubexpIndex("bus_num")
diskIndex := re.SubexpIndex("disk")
var name []string
if busNameIndex != -1 && match[busNameIndex] != "" {
name = append(name, match[busNameIndex])
}
if busNumIndex != -1 && match[busNumIndex] != "" {
name = append(name, match[busNumIndex])
}
if diskIndex != -1 && match[diskIndex] != "" {
name = append(name, match[diskIndex])
}
func buildDeviceLabel(inputName string, inputType string) string {
// Strip /dev prefix and replace / with _ (/dev/bus/0 becomes bus_0, /dev/disk/by-id/abcd becomes abcd)
devReg := regexp.MustCompile(`^/dev/(?:disk/by-id/|disk/by-path/|)`)
deviceName := strings.ReplaceAll(devReg.ReplaceAllString(inputName, ""), "/", "_")

return strings.Join(name, "_")
if strings.Contains(inputType, ",") {
return deviceName + "_" + strings.ReplaceAll(inputType, ",", "_")
}
return ""

return deviceName
}

// NewSMARTctl is smartctl constructor
Expand All @@ -85,7 +73,7 @@ func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metr
json: json,
logger: logger,
device: SMARTDevice{
device: extractDiskName(strings.TrimSpace(json.Get("device.info_name").String())),
device: buildDeviceLabel(json.Get("device.name").String(), json.Get("device.type").String()),
serial: strings.TrimSpace(json.Get("serial_number").String()),
family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")),
model: strings.TrimSpace(model_name),
Expand Down
44 changes: 44 additions & 0 deletions smartctl_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"testing"
)

func TestBuildDeviceLabel(t *testing.T) {
tests := []struct {
deviceName string
deviceType string
expectedLabel string
}{
{"/dev/bus/0", "megaraid,1", "bus_0_megaraid_1"},
{"/dev/sda", "auto", "sda"},
{"/dev/disk/by-id/ata-CT500MX500SSD1_ABCDEFGHIJ", "auto", "ata-CT500MX500SSD1_ABCDEFGHIJ"},
// Some cases extracted from smartctl docs. Are these the prettiest?
// Probably not. Are they unique enough. Definitely.
{"/dev/sg1", "cciss,1", "sg1_cciss_1"},
{"/dev/bsg/sssraid0", "sssraid,0,1", "bsg_sssraid0_sssraid_0_1"},
{"/dev/cciss/c0d0", "cciss,0", "cciss_c0d0_cciss_0"},
{"/dev/sdb", "aacraid,1,0,4", "sdb_aacraid_1_0_4"},
{"/dev/twl0", "3ware,1", "twl0_3ware_1"},
}

for _, test := range tests {
result := buildDeviceLabel(test.deviceName, test.deviceType)
if result != test.expectedLabel {
t.Errorf("deviceName=%v deviceType=%v expected=%v result=%v", test.deviceName, test.deviceType, test.expectedLabel, result)
}
}
}