From 95e036ef8fee35b0db12edf710177e3c83513b8e Mon Sep 17 00:00:00 2001 From: Richard Durso Date: Mon, 17 Jul 2023 16:25:32 -0400 Subject: [PATCH] journalctl is now default instead of syslog --- 36-diskstatus | 41 +++++++++++++++++++++++++++++------------ README.md | 26 +++++++++++++------------- 2 files changed, 42 insertions(+), 25 deletions(-) diff --git a/36-diskstatus b/36-diskstatus index d799fec..54c759d 100755 --- a/36-diskstatus +++ b/36-diskstatus @@ -13,7 +13,7 @@ # # AUTHOR : Richard J. DURSO # DATE : 07/17/2023 -# VERSION : 1.9.1 +# VERSION : 1.10 ############################################################################## set -eu @@ -35,6 +35,9 @@ do done #---[ Variables and Constants ]----------------------------------------------- +# Warning and critical temperatures vary by devices. Historically based on +# personal preferences with spinning rust HDDs. Acceptable temperatures for +# SATA SSDs and NVMe are higher. Adjust these as you see fit. # Max Temperature (red color) MAX_TEMP_C=45 MAX_TEMP_F=113 @@ -64,8 +67,14 @@ hddtemp_port=7634 # if you want that value converted to F. convert_c_to_f=/bin/true -# logfiles to check -logfiles=(/var/log/syslog /var/log/syslog.1) +# logfiles to check. Leave empty to use journalctl, or specify log files to check +# logfiles=(/var/log/syslog /var/log/syslog.1) +logfiles= + +# Define how many days to search for journalctl entries. As in days ago to today. +# Such as past 3 days. +journalctl_days=3 + #------------------------------------------------------------------------------ #---[ You can updates these ]-------------------------------------------------- @@ -74,12 +83,12 @@ logfiles=(/var/log/syslog /var/log/syslog.1) findthese="ata-|scsi-SATA_|nvme-" # This is used by awk to remove unwanted disk devices which matched above. -ignorethese="nvme-(eui|nvme).*" +ignorethese="nvme-(eui|nvme).*|nvme-.*_1[[:space:]]" # This is used by sed to remove text from disk device names. 
This does not alter # device selection like ones above. This just helps to make disk device names # nicer (smaller). -sed_filter="s/^scsi-SATA_//; s/^ata-//; s/Series_//; s/^nvme-//;" +sed_filter="s/^scsi-SATA_//; s/^ata-//; s/Series_//; s/^nvme-//; s/_with_Heatsink//;" #------------------------------------------------------------------------------- #---[ Do Not Update These ]----------------------------------------------------- @@ -111,6 +120,7 @@ readarray -t diskserials <<< "$(printf '%s\n' "${disks[@]}" | sed -r 's/.*(....) #------------------------------------------------------------------------------- +# Display a colorized percent indicator __colorize_percent_left() { # If a percent_left determined, format and colorize it if [[ -n ${percent_left} ]]; then @@ -126,9 +136,16 @@ __colorize_percent_left() { fi } #------------------------------------------------------------------------------- - -# get all lines with smartd entries from syslog -lines=$(tac "${logfiles[@]}" 2>/dev/null | grep -hiP 'smartd.*previous self-test') +# System with only NVMe devices, might not find any results. +set +e +if [[ -n "${logfiles}" ]]; then + # get all lines with smartd entries from syslog + lines=$(tac "${logfiles[@]}" 2>/dev/null | grep -hiP 'smartd.*previous self-test') +else + # get all lines with smartd entries from journalctl within the last ${journalctl_days} days. 
+ lines=$(journalctl -r -t smartd --since "$(date +%Y-%m-%d -d "${journalctl_days} days ago")" | grep -hiP 'smartd.*previous self-test') +fi +set -e # use nc to query temps from each hddtemp daemon instance, echo needed to get NC to return data fetch_hddtemp=$(echo -n | nc ${hddtemp_host} ${hddtemp_port} |sed 's/|//m' | sed 's/||/ \n/g') @@ -152,7 +169,7 @@ out="" for i in "${!disksalias[@]}"; do #for every /dev/sdX device name # Get smartd testing status # Determine all possible names for the disk device - readarray -t possible_names <<< "$(echo "${alldisks[@]}" | awk -v a="${disksalias[$i]}" '$0 ~ a { print $1 }')" + readarray -t possible_names <<< "$(echo "${alldisks[@]}" | awk -v a="${disksalias[$i]}" '$0 ~ a { print $1;exit }')" for name in "${possible_names[@]}";do life="" @@ -178,11 +195,11 @@ for i in "${!disksalias[@]}"; do #for every /dev/sdX device name # If no result, see if unexpected results can be found if [[ -z ${result} ]]; then - if [ "$(tac "${logfiles[@]}" 2>/dev/null | grep -m 1 -HiP "${name}.*self-test" | grep -ci "skip")" -eq 1 ]; then # Test skipped + if [ "$(echo "${lines}" | grep -m 1 -HiP "${name}.*self-test" | grep -ci "skip")" -eq 1 ]; then # Test skipped result="skipped" break fi - if [ "$(tac "${logfiles[@]}" 2>/dev/null | grep -m 1 -HiP "${name}.*self-test" | grep -ci "in progress")" -eq 1 ]; then # Test in progress + if [ "$(echo "${lines}" | grep -m 1 -HiP "${name}.*self-test" | grep -ci "in progress")" -eq 1 ]; then # Test in progress result="in progress" break fi @@ -333,4 +350,4 @@ out+="\n" printf "\ndisk status:\n" # column: -t is table format, -s is column seperator, -d is do not display header -printf '%b' "$out" | column -t -s$',' -d --table-columns DEVICE,TEMP1,STATUS,DEVICE,TEMP2,STATUS --table-right TEMP1,TEMP2 | sed -e 's/^/ /' +printf '%b' "$out" | column -t -s$',' -d --table-columns DEVICE,TMP1,STATUS,DEVICE,TMP2,STATUS --table-right TMP1,TMP2 | sed -e 's/^/ /' diff --git a/README.md b/README.md index 094d394..c2b44cf 
100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ option set to `yes` in your sshd config. The duplicate files are different versions of the same, use either one of them. E.g. `30-zpool-simple` will not print usage bars. -The script `36-diskstatus` will grep syslog for smartd entries to read last self-test result. You have to enable smartd monitoring & run regular self-tests for it to display anything. The nvme client is required to get NVMe device errors and wear leveling. +The script `36-diskstatus` will grep either *journalctl* or *syslog* for `smartd` entries to read last self-test result. You have to enable smartd monitoring & run regular self-tests for it to display anything. The nvme client is required to get NVMe device errors and wear leveling. If you use `50-fail2ban` you should comment out the `compress` option in `/etc/logrotate.d/fail2ban`, so that the logs are not compressed and can be read by grep. @@ -76,9 +76,9 @@ $ sudo hddtemp /dev/sda --- -## Disk Status has NVMe Support +## 36-diskstatus has NVMe Support -Below shows how support for NVMe devices is provided in `36-diskstatus` where it will include any device that starts with `nvme-`: +Support for NVMe devices is provided in `36-diskstatus` where it will include any device that starts with `nvme-` (in the `/dev/disk/by-id` directory): ```bash #---[ You can updates these ]-------------------------------------------------- @@ -94,33 +94,33 @@ Then any devices matching `nvme-eui` or `nvme-nvme` are removed from the list. ignorethese="ata-Samsung|nvme-(eui|nvme).*" ``` -Lastly there is a filter which attempts to pretty up the device names for display, for NVMe devices it removes the `nvme-` prefix with the intent that the device name will now start with your manufacture name. +Lastly there is a filter which attempts to pretty up the device names for display, for NVMe devices it removes the `nvme-` prefix with the intent that the device name will now start with your manufacture name. 
It will remove references to `_with_Heatsink` popular with Samsung devices. ```yaml # This is used by sed to remove text from disk device names. This does not alter # device selection like ones above. This just helps to make disk device names # nicer (smaller). -sed_filter="s/^scsi-SATA_//; s/^ata-//; s/Series_//; s/^nvme-//;" +sed_filter="s/^scsi-SATA_//; s/^ata-//; s/Series_//; s/^nvme-//; s/_with_Heatsink//;" ``` -Examples: +Example Results: * `Samsung_SSD_980_1TB_316T` is a Samsung device, model 980 with 1 TB capacity with last 4 digits of the serial number `316T`. * `HP_SSD_EX950_1TB_3005` is a HP (Hewlett Packard) model EX950 with a 1 TB capacity with last 4 digits of serial number `3005`. -* `WDS100T3XHC-00SJG0_2993` shows how horrible WD (Western Digital) names their devices. This is a WD Black model SN750 but there is no way to know that by the device name. +* `WDS100T3XHC-00SJG0_2993` shows how horrible WD (Western Digital) names their devices. This is a WD Black model SN750 but there is no way to know that by the device name. (You could use the filter above to rename if you like) -Should you have many devices and one reports `FAILED` having part of the serial number will be helpful in identifying which device has a problem. +The idea is that, should you have many devices and one reports `FAILED`, having part of the serial number will help identify which device has a problem. ### NVMe Device Temperature -The HDDTemp utility does not support NVMe devices. If `36-diskstatus` script detects a NVMe device, it will try to get the temperature from `smartctl` by looking for `Temperature:` and parsing the value such as `38 Celsius`. +If the `36-diskstatus` script detects an NVMe device, it will scrape the output of `smartctl` by looking for `Temperature:` and parsing a value such as `38 Celsius`. ```text disk status: Samsung_SSD_980_1TB_316T (nvme0n1): 38C passed [97%] ``` -The HDDTemp utility allows devices to report in Celsius or Fahrenheit. 
If the script had to pull a temperature from `smartctl` and that value was Celsius, you can convert that to Fahrenheit by setting variable `convert_c_to_f` to `/bin/true` (set to `/bin/false` for Celsius). +A Celsius temperature from `smartctl` can be converted to Fahrenheit by setting variable `convert_c_to_f` to `/bin/true` (set to `/bin/false` for Celsius). ```bash # hddtemp can already report F or C temperatures for SATA devices. If this script @@ -143,9 +143,9 @@ disk status: --- -Example showing a mix of SATA SSDs, SATA HDDs and NVMe devices. +Example showing a mix of SATA SSDs, SATA HDDs and NVMe devices -* Temperatures are set to be converted to Fahrenheit +* Temperatures are converted to Fahrenheit * Elevated temperatures are in yellow * Wear level / life expectancy are expressed as a percentage from 100% and lowers towards 0% @@ -175,7 +175,7 @@ disk status: When a SATA device test result is not found (either not supported, not performed, log purged, etc) then the `smartctl` self assessment status will be displayed which should be a simple `PASSED` or `FAILED!` value. It is still possible to show `PASSED` and have device issues. This is **not** an equivalent of `without error`. -NVMe device will use the nvme-cli to fetch error information from the device. If the last test, a result of zero is `passed` and a non-zero is `error`. An error indicates you need to investigate the device, start with something like `sudo nvme error-log -e 1 /dev/nvme0n1` to return the last error log entry. +NVMe device will use the *nvme-cli* to fetch error information from the device. If the last test, a result of zero is `passed` and a non-zero is `error`. An error indicates you need to investigate the device, start with something like `sudo nvme error-log -e 1 /dev/nvme0n1` to return the last error log entry. ```text disk status: