diff --git a/README.md b/README.md index 6e85584..ba513b1 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,8 @@ However, Matthias' current version 1.23 is not supporting my requirements. And i I added the following features to finddupe: - multiple reference directories that shall not be touched (v1.24) - unicode support (v1.25) +- alert message if order of options is wrong (v1.26) +- support for ignoring files by patterns (v1.26) It works for me, but some more testing is desirable. @@ -23,8 +25,12 @@ I've udated the project to use Visual Studio 2019. ## Usage ``` -finddupe v1.25 compiled Jun 4 2017 -Usage: finddupe [options] [-ref] [filepat]... +finddupe v1.26 compiled Oct 18 2020 +an enhanced version by thomas694 (@GH), originally by Matthias Wandel +This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you +are welcome to redistribute it under certain conditions; view GNU GPLv3 for more. + +Usage: finddupe [options] [-ign ...] [-ref ...] ... Options: -bat Create batch file with commands to do the hard linking. run batch file afterwards to do it @@ -34,8 +40,6 @@ Options: -v Verbose -sigs Show signatures calculated based on first 32k for each file -rdonly Apply to readonly files also (as opposed to skipping them) - -ref Following file pattern are files that are for reference, NOT - to be eliminated, only used to check duplicates against -z Do not skip zero length files (zero length files are ignored by default) -u Do not print a warning for files that cannot be read @@ -43,6 +47,9 @@ Options: -j Follow NTFS junctions and reparse points (off by default) -listlink hardlink list mode. Not valid with -del, -bat, -hardlink, or -rdonly, options + -ign Ignore file pattern, eg. .bak or .tmp (repeatable) + -ref Following file pattern are files that are for reference, NOT to + be eliminated, only used to check duplicates against (repeatable) filepat Pattern for files. Examples: c:\** Match everything on drive C c:\**\*.jpg Match only .jpg files on drive C diff --git a/finddupe.c b/finddupe.c index 50b9a7d..9789c0c 100644 --- a/finddupe.c +++ b/finddupe.c @@ -14,8 +14,11 @@ // Version 1.25 // Copyright (C) Jun 2017 thomas694 // added unicode support +// Version 1.26 +// Copyright (C) Oct 2020 thomas694 +// added support for ignore filename patterns // -// This program is free software: you can redistribute it and/or modify +// finddupe is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. @@ -29,7 +32,7 @@ // along with this program. If not, see . //-------------------------------------------------------------------------- -#define VERSION "1.25" +#define VERSION "1.26" #define REF_CODE @@ -43,6 +46,9 @@ #include #include +#include /* StrStrI */ +#pragma comment(lib, "shlwapi.lib") /* unresolved external symbol __imp__StrStrIW@8 */ + #include #include #include @@ -92,6 +98,7 @@ struct { int HardlinkGroups; int CantReadFiles; int ZeroLengthFiles; + int IgnoredFiles; __int64 TotalBytes; __int64 DuplicateBytes; }DupeStats; @@ -116,7 +123,11 @@ int ShowProgress = 1; // Show progressing file count... int HideCantReadMessage= 0;// Hide the can't read file error int SkipZeroLength = 1; // Ignore zero length files. int ProgressIndicatorVisible = 0; // Weither a progress indicator needs to be overwritten. -int FollowReparse = 0; // Wether to follow reparse points (like unix softlinks for NTFS) +int FollowReparse = 0; // Whether to follow reparse points (like unix softlinks for NTFS) + +TCHAR* * IgnorePatterns; // Patterns of filename to ignore (can be repeated, eg. .bak, .tmp) +int IgnorePatternsAlloc; // Number of allocated ignore patterns +int IgnorePatternsCount; // Number of specified ignore patterns int MyGlob(const TCHAR * Pattern, int FollowReparse, void (*FileFuncParm)(const TCHAR * FileName)); @@ -187,7 +198,7 @@ static int EliminateDuplicate(FileData_t ThisFile, FileData_t DupeOf) if (ThisFile.FileSize != DupeOf.FileSize) return 0; Hardlinked = 0; - if (DupeOf.NumLinks && memcmp(&ThisFile.FileIndex, &DupeOf.FileIndex, 8) == 0){ + if (DupeOf.NumLinks && memcmp(&ThisFile.FileIndex, &DupeOf.FileIndex, sizeof(DupeOf.FileIndex)) == 0){ Hardlinked = 1; goto dont_read; } @@ -368,6 +379,21 @@ static int IsNonRefPath(TCHAR * filename) } #endif +static void StoreFileData(FileData_t ThisFile) +{ + if (NumUnique >= NumAllocated) { + // Array is full. Make it bigger + NumAllocated = NumAllocated + NumAllocated / 2; + FileData = (FileData_t*)realloc(FileData, sizeof(FileData_t) * NumAllocated); + if (FileData == NULL) { + fprintf(stderr, "Malloc failure"); + exit(EXIT_FAILURE); + } + } + FileData[NumUnique] = ThisFile; + NumUnique += 1; +} + //-------------------------------------------------------------------------- // Check for duplicates. //-------------------------------------------------------------------------- @@ -375,28 +401,30 @@ static void CheckDuplicate(FileData_t ThisFile) { int Ptr; int * Link; - // Find where in the trie structure it belongs. + // Find where in the tree structure it belongs. Ptr = 0; - DupeStats.TotalFiles += 1; - DupeStats.TotalBytes += (__int64) ThisFile.FileSize; - if (NumUnique == 0) goto store_it; for(;;){ int comp; comp = memcmp(&ThisFile.Checksum, &FileData[Ptr].Checksum, sizeof(Checksum_t)); - if (comp == 0){ + if (comp == 0) { + // the same file + if (_tcscmp(ThisFile.FileName, FileData[Ptr].FileName) == 0) { + return; + } // Check for true duplicate. #ifdef REF_CODE - if (!ReferenceFiles && !HardlinkSearchMode && IsNonRefPath(ThisFile.FileName)){ + if (!ReferenceFiles && !HardlinkSearchMode && IsNonRefPath(ThisFile.FileName)) { #else - if (!ReferenceFiles && !HardlinkSearchMode){ + if (!ReferenceFiles && !HardlinkSearchMode) { #endif int r = EliminateDuplicate(ThisFile, FileData[Ptr]); - if (r){ + if (r) { if (r == 2) FileData[Ptr].NumLinks += 1; // Update link count. - // Its a duplicate for elimination. Do not store info on it. + // Its a duplicate for elimination. Do not store info on it. New: store info for correct statistic calculation + StoreFileData(ThisFile); return; } } @@ -421,19 +449,12 @@ static void CheckDuplicate(FileData_t ThisFile) } } + DupeStats.TotalFiles += 1; + DupeStats.TotalBytes += (__int64) ThisFile.FileSize; + store_it: - if (NumUnique >= NumAllocated){ - // Array is full. Make it bigger - NumAllocated = NumAllocated + NumAllocated/2; - FileData = (FileData_t*) realloc(FileData, sizeof(FileData_t) * NumAllocated); - if (FileData == NULL){ - fprintf(stderr, "Malloc failure"); - exit(EXIT_FAILURE); - } - } - FileData[NumUnique] = ThisFile; - NumUnique += 1; + StoreFileData(ThisFile); } //-------------------------------------------------------------------------- @@ -479,9 +500,14 @@ static void ProcessFile(const TCHAR * FileName) Checksum_t CheckSum; struct _stat FileStat; + for (int i = 0; i < NumUnique; i++) + { + if (_tcscmp(FileName, FileData[i].FileName) == 0) + return; + } + FileData_t ThisFile; memset(&ThisFile, 0, sizeof(ThisFile)); - { static int LastPrint, Now; Now = GetTickCount(); @@ -625,6 +651,18 @@ static void ProcessFile(const TCHAR * FileName) ThisFile.FileName = _tcsdup(FileName); // allocate the string last, so // we don't waste memory on errors. + + // skip if filename contains a ignore pattern + for (int i = 0; i < IgnorePatternsCount; i++) + { + if (StrStrI(FileName, IgnorePatterns[i])) + { + DupeStats.IgnoredFiles++; + StoreFileData(ThisFile); + return; + } + } + CheckDuplicate(ThisFile); } @@ -634,10 +672,10 @@ static void ProcessFile(const TCHAR * FileName) static void Usage (void) { _tprintf(TEXT("finddupe v%s compiled %s\n"), TEXT(VERSION), TEXT(__DATE__)); - _tprintf(TEXT("an enhanced version by thomas694 (@GH)\n")); + _tprintf(TEXT("an enhanced version by thomas694 (@GH), originally by Matthias Wandel\n")); _tprintf(TEXT("This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you\n")); - _tprintf(TEXT("are welcome to redistribute it under certain conditions; view GNU GPLv3 for more.\n")); - _tprintf(TEXT("Usage: finddupe [options] [-ref] [filepat]...\n")); + _tprintf(TEXT("are welcome to redistribute it under certain conditions; view GNU GPLv3 for more.\n\n")); + _tprintf(TEXT("Usage: finddupe [options] [-ign ...] [-ref ...] ...\n")); _tprintf(TEXT("Options:\n") TEXT(" -bat Create batch file with commands to do the hard\n") TEXT(" linking. run batch file afterwards to do it\n") @@ -647,8 +685,6 @@ static void Usage (void) TEXT(" -v Verbose\n") TEXT(" -sigs Show signatures calculated based on first 32k for each file\n") TEXT(" -rdonly Apply to readonly files also (as opposed to skipping them)\n") - TEXT(" -ref Following file pattern are files that are for reference, NOT\n") - TEXT(" to be eliminated, only used to check duplicates against\n") TEXT(" -z Do not skip zero length files (zero length files are ignored\n") TEXT(" by default)\n") TEXT(" -u Do not print a warning for files that cannot be read\n") @@ -656,6 +692,9 @@ static void Usage (void) TEXT(" -j Follow NTFS junctions and reparse points (off by default)\n") TEXT(" -listlink hardlink list mode. Not valid with -del, -bat, -hardlink,\n") TEXT(" or -rdonly, options\n") + TEXT(" -ign Ignore file pattern, eg. .bak or .tmp (repeatable)\n") + TEXT(" -ref Following file pattern are files that are for reference, NOT to\n") + TEXT(" be eliminated, only used to check duplicates against (repeatable)\n") TEXT(" filepat Pattern for files. Examples:\n") TEXT(" c:\\** Match everything on drive C\n") TEXT(" c:\\**\\*.jpg Match only .jpg files on drive C\n") @@ -675,12 +714,24 @@ int _tmain (int argc, TCHAR **argv) TCHAR * arg; TCHAR DefaultDrive; TCHAR DriveUsed = '\0'; + int indexFirstRef = 0; PrintDuplicates = 0; PrintFileSigs = 0; HardlinkSearchMode = 0; Verbose = 0; + for (argn = 1; argn < argc; argn++) { + arg = argv[argn]; + if (indexFirstRef == 0 && !_tcscmp(arg, TEXT("-ref"))) indexFirstRef = argn; + if (indexFirstRef > 0 && (!_tcscmp(arg, TEXT("-bat")) || !_tcscmp(arg, TEXT("-v")) || !_tcscmp(arg, TEXT("-sigs")) || !_tcscmp(arg, TEXT("-hardlink")) || + !_tcscmp(arg, TEXT("-del")) || !_tcscmp(arg, TEXT("-rdonly")) || !_tcscmp(arg, TEXT("-listlink")) || !_tcscmp(arg, TEXT("-z")) || + !_tcscmp(arg, TEXT("-u")) || !_tcscmp(arg, TEXT("-p")) || !_tcscmp(arg, TEXT("-j")) || !_tcscmp(arg, TEXT("-ign"))) && argn > indexFirstRef) { + fprintf(stderr, "Wrong order of options! Use -h for help\n"); + exit(EXIT_FAILURE); + } + } + for (argn=1;argn= IgnorePatternsAlloc) { + // Array is full. Make it bigger + IgnorePatternsAlloc = IgnorePatternsAlloc + 4; + IgnorePatterns = realloc(IgnorePatterns, sizeof(TCHAR*) * IgnorePatternsAlloc); + if (IgnorePatterns == NULL) { + _ftprintf(stderr, TEXT("Malloc failure")); + exit(EXIT_FAILURE); + } + }; + TCHAR* substr = _tcsdup(argv[++argn]); + IgnorePatterns[IgnorePatternsCount++] = substr; }else{ _tprintf(TEXT("Argument '%s' not understood. Use -h for help.\n"), arg); exit(-1); @@ -837,13 +901,16 @@ int _tmain (int argc, TCHAR **argv) ClearProgressInd(); _tprintf(TEXT("\n")); _tprintf(TEXT("Files: %8u kBytes in %5d files\n"), - (unsigned)(DupeStats.TotalBytes/1000), DupeStats.TotalFiles); + (unsigned)(DupeStats.TotalBytes/1024), DupeStats.TotalFiles); _tprintf(TEXT("Dupes: %8u kBytes in %5d files\n"), - (unsigned)(DupeStats.DuplicateBytes/1000), DupeStats.DuplicateFiles); + (unsigned)(DupeStats.DuplicateBytes/1024), DupeStats.DuplicateFiles); } if (DupeStats.ZeroLengthFiles){ _tprintf(TEXT(" %d files of zero length were skipped\n"), DupeStats.ZeroLengthFiles); } + if (DupeStats.IgnoredFiles) { + _tprintf(TEXT(" %d files were ignored\n"), DupeStats.IgnoredFiles); + } if (DupeStats.CantReadFiles){ _tprintf(TEXT(" %d files could not be opened\n"), DupeStats.CantReadFiles); } diff --git a/myglob.c b/myglob.c index d27031b..29b3dae 100644 --- a/myglob.c +++ b/myglob.c @@ -1,12 +1,10 @@ //-------------------------------------------------------------------------------- -// This file is part of finddupe. -// // Module to do recursive directory file matching under windows. // // Tries to do pattern matching to produce similar results as Unix, but using // the Windows _findfirst to do all the pattern matching. // -// Also hadles recursive directories - "**" path component expands into +// Also handles recursive directories - "**" path component expands into // any levels of subdirectores (ie c:\**\*.c matches ALL .c files on drive c:) // // Matthias Wandel Nov 5 2000 - March 2009 @@ -18,7 +16,9 @@ // Copyright (C) Jun 2017 thomas694 // added unicode support // -// This program is free software: you can redistribute it and/or modify +// This file is part of finddupe. +// +// finddupe is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. diff --git a/version.h b/version.h index 0184c4e..cd9202e 100644 --- a/version.h +++ b/version.h @@ -3,7 +3,7 @@ #define STRINGIZE(s) STRINGIZE2(s) #define VERSION_MAJOR 1 -#define VERSION_MINOR 25 +#define VERSION_MINOR 26 #define VERSION_REVISION 0 #define VERSION_BUILD 0 @@ -17,7 +17,7 @@ #define VER_PRODUCT_VERSION_STR VER_FILE_VERSION_STR #define VER_ORIGINAL_FILENAME_STR VER_PRODUCTNAME_STR ".exe" #define VER_INTERNAL_NAME_STR VER_ORIGINAL_FILENAME_STR -#define VER_COPYRIGHT_STR "(C) 2017 modifications by thomas694 (@GH)" +#define VER_COPYRIGHT_STR "(C) 2017, 2020 modifications by thomas694 (@GH)" #ifdef _DEBUG #define VER_VER_DEBUG VS_FF_DEBUG