/**
Command line tool that filters TSV files.

This tool filters tab-delimited files based on numeric or string comparisons
against specific fields. See the helpText string for details.

Copyright (c) 2015-2021, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_filter;

import std.algorithm : canFind, equal, findSplit, max, min;
import std.conv : to;
import std.exception : enforce;
import std.format : format;
import std.math : abs, isFinite, isInfinity, isNaN;
import std.range;
import std.regex;
import std.stdio;
import std.string : isNumeric;
import std.typecons;
import std.uni: asLowerCase, toLower, byGrapheme;

/* The program has two main parts, command line arg processing and processing the input
 * files. Much of the work is in command line arg processing. This sets up the tests run
 * against each input line. The tests are an array of delegates (closures) run against the
 * fields in the line. The tests are based on command line arguments, of which there is
 * a lengthy set, one for each test.
 */

/* D runtime option string, only declared for compilers at version 2.085 or later.
 * NOTE(review): "gcopt=cleanup:none" presumably skips the GC's cleanup pass at program
 * exit - confirm against the druntime configuration documentation.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

/** Main program. Invokes command line arg processing and tsv-filter to perform
 * the real work. Any errors are caught and reported.
 *
 * Params:
 *   cmdArgs = The command line arguments, passed to TsvFilterOptions.processArgs.
 *
 * Returns: The second element of the processArgs result when its first element is
 *   false; otherwise 1 if tsvFilterCommand throws an Exception (the message is written
 *   to stderr) and 0 if it completes.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvFilterOptions cmdopt;
    const r = cmdopt.processArgs(cmdArgs);

    /* r[0] false means processing should stop; r[1] is the exit status to use. */
    if (!r[0]) return r[1];

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    try tsvFilterCommand(cmdopt);
    catch (Exception e)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, e.msg);
        return 1;
    }
    return 0;
}

immutable helpText = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter tab-delimited files for matching lines via comparison tests against
individual fields. Use '--help-verbose' for a more detailed description.

Fields are specified using field number or field name. Field names require
that the input file has a header line. Use '--help-fields' for details.

Global options:
  --help-verbose      Print full help.
  --help-options      Print the options list by itself.
  --help-fields       Print help on specifying fields.
  --V|version         Print version information and exit.
  --H|header          Treat the first line of each file as a header.
  --or                Evaluate tests as an OR rather than an AND clause.
  --v|invert          Invert the filter, printing lines that do not match.
  --c|count           Print only a count of the matched lines.
  --d|delimiter CHR   Field delimiter. Default: TAB.
  --label STR         Rather than filter, mark each record as passing the
                         filter or not. STR is the header, ignored if there
                         is no header line.
  --label-values STR1:STR2
                      The pass/no-pass values used by '--label'. Defaults
                         to '1' and '0'.
  --line-buffered     Immediately output every matched line.

Operators:
* Test if a field is empty (no characters) or blank (empty or whitespace only).
  Syntax:  --empty|not-empty|blank|not-blank FIELD
  Example: --empty name               # True if the 'name' field is empty

* Test if a field is numeric, finite, NaN, or infinity
  Syntax:  --is-numeric|is-finite|is-nan|is-infinity FIELD
  Example: --is-numeric 5 --gt 5:100  # Ensure field 5 is numeric before --gt test.

* Compare a field to a number (integer or float)
  Syntax:  --eq|ne|lt|le|gt|ge FIELD:NUM
  Example: --lt size:1000 --gt weight:0.5  # ('size' < 1000) and ('weight' > 0.5)

* Compare a field to a string
  Syntax:  --str-eq|str-ne|istr-eq|istr-ne FIELD:STR
  Example: --str-eq color:red         # True if 'color' field is "red"

* Test if a field contains a string (substring search)
  Syntax:  --str-in-fld|str-not-in-fld|istr-in-fld|istr-not-in-fld FIELD:STR
  Example: --str-in-fld color:dark    # True if 'color' field contains "dark"

* Test if a field matches a regular expression.
  Syntax:  --regex|iregex|not-regex|not-iregex FIELD:REGEX
  Example: --regex '3:ab*c'           # True if field 3 contains "ac", "abc", "abbc", etc.

* Test a field's character or byte length
  Syntax:  --char-len-[le|lt|ge|gt|eq|ne] FIELD:NUM
           --byte-len-[le|lt|ge|gt|eq|ne] FIELD:NUM
  Example: --char-len-lt 2:10         # True if field 2 is less than 10 characters long.
           --byte-len-gt 2:10         # True if field 2 is greater than 10 bytes long.

* Field to field comparisons - Similar to field vs literal comparisons, but field vs field.
  Syntax:  --ff-eq|ff-ne|ff-lt|ff-le|ff-gt|ff-ge FIELD1:FIELD2
           --ff-str-eq|ff-str-ne|ff-istr-eq|ff-istr-ne FIELD1:FIELD2
  Example: --ff-eq 2:4                # True if fields 2 and 4 are numerically equivalent
           --ff-str-eq 2:4            # True if fields 2 and 4 are the same strings

* Field to field difference comparisons - Absolute and relative difference
  Syntax:  --ff-absdiff-le|ff-absdiff-gt FIELD1:FIELD2:NUM
           --ff-reldiff-le|ff-reldiff-gt FIELD1:FIELD2:NUM
  Example: --ff-absdiff-le 1:3:0.25   # True if abs(field 1 - field 3) <= 0.25

EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter lines of tab-delimited files via comparison tests against fields.
Multiple tests can be specified, by default they are evaluated as an AND
clause. Lines satisfying the tests are written to standard output.

Typical test syntax is '--op field:value', where 'op' is an operator,
'field' is either a field name or field number, and 'value' is the
comparison basis. For example, '--lt length:500' tests if the 'length'
field is less than 500. A more complete example:

  tsv-filter --header --gt length:50 --lt length:100 --le width:200 data.tsv

This outputs all lines from file data.tsv where the 'length' field is
greater than 50 and less than 100, and the 'width' field is less than or
equal to 200. The header line is also output.

Field numbers can also be used to identify fields, and must be used when
the input file doesn't have a header line. For example:

  tsv-filter --gt 1:50 --lt 1:100 --le 2:200 data.tsv

Field lists can be used to specify multiple fields at once. For example:

  tsv-filter --not-blank 1-10 --str-ne 1,2,5:'--' data.tsv

tests that fields 1-10 are not blank and fields 1,2,5 are not "--".

Wildcarded field names can also be used to specify multiple fields. The
following finds lines where any field name ending in '*_id' is empty:

  tsv-filter -H --or --empty '*_id'

Use '--help-fields' for details on using field names.

Tests available include:
  * Test if a field is empty (no characters) or blank (empty or whitespace only).
  * Test if a field is interpretable as a number, a finite number, NaN, or Infinity.
  * Compare a field to a number - Numeric equality and relational tests.
  * Compare a field to a string - String equality and relational tests.
  * Test if a field matches a regular expression. Case sensitive or insensitive.
  * Test if a field contains a string. Sub-string search, case sensitive or insensitive.
  * Test a field's character or byte length.
  * Field to field comparisons - Similar to the other tests, except comparing
    one field to another in the same line.

As an alternative to filtering, records can be marked to indicate if they meet
the filter criteria or not. For example, the following will add a field to each
record indicating if the 'Color' field is a primary color.

  tsv-filter -H --or --str-eq Color:Red --str-eq Color:Yellow --str-eq Color:Blue \
  --label IsPrimaryColor data.tsv

Values default to '1' and '0' and can be changed using '--label-values'. The
header name passed to '--label' is ignored if headers are not being used.

Details:
  * The run is aborted if there are not enough fields in an input line.
  * Numeric tests will fail and abort the run if a field cannot be interpreted as a
    number. This includes fields with no text. To avoid this use '--is-numeric' or
    '--is-finite' prior to the numeric test. For example, '--is-numeric 5 --gt 5:100'
    ensures field 5 is numeric before running the --gt test.
  * Regular expression syntax is defined by the D programming language. They follow
    common conventions (perl, python, etc.). Most common forms work as expected.
  * Output is buffered by default to improve performance. Use '--line-buffered' to
    have each matched line immediately written out.

Options:
EOS";

immutable helpTextOptions = q"EOS
Synopsis: tsv-filter [options] [file...]

Options:
EOS";

/* The next blocks of code define the structure of the boolean tests run against input lines.
 * This includes function and delegate (closure) signatures, creation mechanisms, option
 * handlers, etc. Command line arg processing builds the test structure.
 */

/* FieldsPredicate delegate signature - Each input line is run against a set of boolean
 * tests. Each test is a 'FieldsPredicate'. A FieldsPredicate is a delegate (closure)
 * containing all info about the test except the field values of the line being tested.
 * These delegates are created as part of command line arg processing. The wrapped data
 * includes operation, field indexes, literal values, etc. At run-time the delegate is
 * passed one argument, the split input line.
 */
alias FieldsPredicate = bool delegate(const char[][] fields);

/* FieldsPredicate function signatures - These aliases represent the different function
 * signatures used in FieldsPredicate delegates. Each alias has a corresponding 'make'
 * function. The 'make' function takes a real predicate function and closure args and
 * returns a FieldsPredicate delegate. Predicates types are:
 *
 * - FieldUnaryPredicate - Test based on a single field. (e.g. --empty 4)
 * - FieldVsNumberPredicate - Test based on a field index (used to get the field value)
 *     and a fixed numeric value. For example, field 2 less than 100 (--lt 2:100).
 * - FieldVsStringPredicate - Test based on a field and a string. (e.g. --str-eq 2:abc)
 * - FieldVsIStringPredicate - Case-insensitive test based on a field and a string.
 *     (e.g.
--istr-eq 2:abc)
 * - FieldVsRegexPredicate - Test based on a field and a regex. (e.g. --regex '2:ab*c')
 * - FieldVsFieldPredicate - Test based on two fields. (e.g. --ff-le 2:4).
 *
 * An actual FieldsPredicate takes the fields from the line and the closure args and
 * runs the test. For example, a function testing if a field is less than a specific
 * value would pull the specified field from the fields array, convert the string to
 * a number, then run the less-than test.
 */
alias FieldUnaryPredicate = bool function(const char[][] fields, size_t index);
alias FieldVsNumberPredicate = bool function(const char[][] fields, size_t index, double value);
alias FieldVsStringPredicate = bool function(const char[][] fields, size_t index, string value);
alias FieldVsIStringPredicate = bool function(const char[][] fields, size_t index, dstring value);
alias FieldVsRegexPredicate = bool function(const char[][] fields, size_t index, Regex!char value);
alias FieldVsFieldPredicate = bool function(const char[][] fields, size_t index1, size_t index2);
alias FieldFieldNumPredicate = bool function(const char[][] fields, size_t index1, size_t index2, double value);

/* Delegate factories - Each binds a predicate function pointer together with its fixed
 * arguments and returns a FieldsPredicate that only needs the split input line.
 */

/* Binds a single-field predicate to a field index. */
FieldsPredicate makeFieldUnaryDelegate(FieldUnaryPredicate fn, size_t index)
{
    return delegate bool(const char[][] lineFields) { return fn(lineFields, index); };
}

/* Binds a field-vs-number predicate to a field index and a numeric value. */
FieldsPredicate makeFieldVsNumberDelegate(FieldVsNumberPredicate fn, size_t index, double value)
{
    return delegate bool(const char[][] lineFields) { return fn(lineFields, index, value); };
}

/* Binds a field-vs-string predicate to a field index and a string value. */
FieldsPredicate makeFieldVsStringDelegate(FieldVsStringPredicate fn, size_t index, string value)
{
    return delegate bool(const char[][] lineFields) { return fn(lineFields, index, value); };
}

/* Binds a case-insensitive field-vs-string predicate to a field index and a dstring value. */
FieldsPredicate makeFieldVsIStringDelegate(FieldVsIStringPredicate fn, size_t index, dstring value)
{
    return delegate bool(const char[][] lineFields) { return fn(lineFields, index, value); };
}

/* Binds a field-vs-regex predicate to a field index and a compiled regex. */
FieldsPredicate makeFieldVsRegexDelegate(FieldVsRegexPredicate fn, size_t index, Regex!char value)
{
    return delegate bool(const char[][] lineFields) { return fn(lineFields, index, value); };
}

/* Binds a field-vs-field predicate to two field indices. */
FieldsPredicate makeFieldVsFieldDelegate(FieldVsFieldPredicate fn, size_t index1, size_t index2)
{
    return delegate bool(const char[][] lineFields) { return fn(lineFields, index1, index2); };
}

/* Binds a field-field-number predicate to two field indices and a numeric value. */
FieldsPredicate makeFieldFieldNumDelegate(FieldFieldNumPredicate fn, size_t index1, size_t index2, double value)
{
    return delegate bool(const char[][] lineFields) { return fn(lineFields, index1, index2, value); };
}

/* Predicate functions - These are the actual functions used in a FieldsPredicate. They
 * are a direct reflection of the operators available via command line args. Each matches
 * one of the FieldsPredicate function aliases defined above.
 */

/* Empty / blank tests. "Blank" means the whole field matches the regex ^\s*$. */
bool fldEmpty(const char[][] fields, size_t index)
{
    return fields[index].empty;
}

bool fldNotEmpty(const char[][] fields, size_t index)
{
    return !fields[index].empty;
}

bool fldBlank(const char[][] fields, size_t index)
{
    auto blankMatch = fields[index].matchFirst(ctRegex!`^\s*$`);
    return cast(bool) blankMatch;
}

bool fldNotBlank(const char[][] fields, size_t index)
{
    auto blankMatch = fields[index].matchFirst(ctRegex!`^\s*$`);
    return !blankMatch;
}

/* Numeric classification tests. The isNumeric check runs first, so text that is not
 * numeric yields false without attempting the conversion to double.
 */
bool fldIsNumeric(const char[][] fields, size_t index)
{
    return isNumeric(fields[index]);
}

bool fldIsFinite(const char[][] fields, size_t index)
{
    const fieldText = fields[index];
    return fieldText.isNumeric && fieldText.to!double.isFinite;
}

bool fldIsNaN(const char[][] fields, size_t index)
{
    const fieldText = fields[index];
    return fieldText.isNumeric && fieldText.to!double.isNaN;
}

bool fldIsInfinity(const char[][] fields, size_t index)
{
    const fieldText = fields[index];
    return fieldText.isNumeric && fieldText.to!double.isInfinity;
}

/* Field vs number comparisons. The field is converted with to!double; a field that
 * cannot be converted makes the conversion throw.
 */
bool numLE(const char[][] fields, size_t index, double val)
{
    immutable fieldNum = fields[index].to!double;
    return fieldNum <= val;
}

bool numLT(const char[][] fields, size_t index, double val)
{
    immutable fieldNum = fields[index].to!double;
    return fieldNum < val;
}

bool numGE(const char[][] fields, size_t index, double val)
{
    immutable fieldNum = fields[index].to!double;
    return fieldNum >= val;
}

bool numGT(const char[][] fields, size_t index, double val)
{
    immutable fieldNum = fields[index].to!double;
    return fieldNum > val;
}

bool numEQ(const char[][] fields, size_t index, double val)
{
    immutable fieldNum = fields[index].to!double;
    return fieldNum == val;
}

bool numNE(const char[][] fields, size_t index, double val)
{
    immutable fieldNum = fields[index].to!double;
    return fieldNum != val;
}

/* Field vs string comparisons, using the built-in array comparison operators. */
bool strLE(const char[][] fields, size_t index, string val)
{
    const fieldText = fields[index];
    return fieldText <= val;
}

bool strLT(const char[][] fields, size_t index, string val)
{
    const fieldText = fields[index];
    return fieldText < val;
}

bool strGE(const char[][] fields, size_t index, string val)
{
    const fieldText = fields[index];
    return fieldText >= val;
}

bool strGT(const char[][] fields, size_t index, string val)
{
    const fieldText = fields[index];
    return fieldText > val;
}

bool strEQ(const char[][] fields, size_t index, string val)
{
    const fieldText = fields[index];
    return fieldText == val;
}

bool strNE(const char[][] fields, size_t index, string val)
{
    const fieldText = fields[index];
    return fieldText != val;
}

/* Substring tests. */
bool strInFld(const char[][] fields, size_t index, string val)
{
    return canFind(fields[index], val);
}

bool strNotInFld(const char[][] fields, size_t index, string val)
{
    return !canFind(fields[index], val);
}

/* Note: For istr predicates, the command line value has been lower-cased by
 * fieldVsIStringOptionHandler. Only the field text is lower-cased here.
 */
bool istrEQ(const char[][] fields, size_t index, dstring val)
{
    auto loweredField = fields[index].asLowerCase;
    return equal(loweredField, val);
}

bool istrNE(const char[][] fields, size_t index, dstring val)
{
    auto loweredField = fields[index].asLowerCase;
    return !equal(loweredField, val);
}

bool istrInFld(const char[][] fields, size_t index, dstring val)
{
    auto loweredField = fields[index].asLowerCase;
    return canFind(loweredField, val);
}

bool istrNotInFld(const char[][] fields, size_t index, dstring val)
{
    auto loweredField = fields[index].asLowerCase;
    return !canFind(loweredField, val);
}

/* Note: Case-sensitivity is built into the regex value, so these regex predicates are
 * used for both case-sensitive and case-insensitive regex operators.
*/
bool regexMatch(const char[][] fields, size_t index, Regex!char val)
{
    auto captures = fields[index].matchFirst(val);
    return cast(bool) captures;
}

bool regexNotMatch(const char[][] fields, size_t index, Regex!char val)
{
    auto captures = fields[index].matchFirst(val);
    return !captures;
}

/* Character length tests. The length is the number of graphemes in the field
 * (byGrapheme.walkLength), compared against the supplied number.
 */
bool charLenLE(const char[][] fields, size_t index, double val)
{
    immutable graphemeCount = fields[index].byGrapheme.walkLength;
    return graphemeCount <= val;
}

bool charLenLT(const char[][] fields, size_t index, double val)
{
    immutable graphemeCount = fields[index].byGrapheme.walkLength;
    return graphemeCount < val;
}

bool charLenGE(const char[][] fields, size_t index, double val)
{
    immutable graphemeCount = fields[index].byGrapheme.walkLength;
    return graphemeCount >= val;
}

bool charLenGT(const char[][] fields, size_t index, double val)
{
    immutable graphemeCount = fields[index].byGrapheme.walkLength;
    return graphemeCount > val;
}

bool charLenEQ(const char[][] fields, size_t index, double val)
{
    immutable graphemeCount = fields[index].byGrapheme.walkLength;
    return graphemeCount == val;
}

bool charLenNE(const char[][] fields, size_t index, double val)
{
    immutable graphemeCount = fields[index].byGrapheme.walkLength;
    return graphemeCount != val;
}

/* Byte length tests. The length is the char array length of the field. */
bool byteLenLE(const char[][] fields, size_t index, double val)
{
    immutable byteCount = fields[index].length;
    return byteCount <= val;
}

bool byteLenLT(const char[][] fields, size_t index, double val)
{
    immutable byteCount = fields[index].length;
    return byteCount < val;
}

bool byteLenGE(const char[][] fields, size_t index, double val)
{
    immutable byteCount = fields[index].length;
    return byteCount >= val;
}

bool byteLenGT(const char[][] fields, size_t index, double val)
{
    immutable byteCount = fields[index].length;
    return byteCount > val;
}

bool byteLenEQ(const char[][] fields, size_t index, double val)
{
    immutable byteCount = fields[index].length;
    return byteCount == val;
}

bool byteLenNE(const char[][] fields, size_t index, double val)
{
    immutable byteCount = fields[index].length;
    return byteCount != val;
}

/* Field vs field numeric comparisons. Both fields are converted with to!double, the
 * first field before the second.
 */
bool ffLE(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return lhs <= rhs;
}

bool ffLT(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return lhs < rhs;
}

bool ffGE(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return lhs >= rhs;
}

bool ffGT(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return lhs > rhs;
}

bool ffEQ(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return lhs == rhs;
}

bool ffNE(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return lhs != rhs;
}

/* Field vs field string comparisons, case-sensitive and case-insensitive. */
bool ffStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = fields[index1];
    const rhs = fields[index2];
    return lhs == rhs;
}

bool ffStrNE(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = fields[index1];
    const rhs = fields[index2];
    return lhs != rhs;
}

bool ffIStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    auto lhsLowered = fields[index1].asLowerCase;
    auto rhsLowered = fields[index2].asLowerCase;
    return equal(lhsLowered, rhsLowered);
}

bool ffIStrNE(const char[][] fields, size_t index1, size_t index2)
{
    auto lhsLowered = fields[index1].asLowerCase;
    auto rhsLowered = fields[index2].asLowerCase;
    return !equal(lhsLowered, rhsLowered);
}

/* Absolute difference: |v1 - v2|. */
auto AbsDiff(double v1, double v2)
{
    return abs(v1 - v2);
}

/* Relative difference: |v1 - v2| divided by the smaller of |v1| and |v2|. */
auto RelDiff(double v1, double v2)
{
    immutable gap = abs(v1 - v2);
    immutable smallerMagnitude = min(abs(v1), abs(v2));
    return gap / smallerMagnitude;
}

/* Field vs field difference tests against a numeric threshold. */
bool ffAbsDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return AbsDiff(lhs, rhs) <= value;
}

bool ffAbsDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return AbsDiff(lhs, rhs) > value;
}

bool ffRelDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return RelDiff(lhs, rhs) <= value;
}

bool ffRelDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return RelDiff(lhs, rhs) > value;
}

/* Command line option handlers - There is a command line option handler for each
 * predicate type.
That is, one each for FieldUnaryPredicate, FieldVsNumberPredicate,
 * etc. Option handlers are passed the tests array, the predicate function, and the
 * command line option arguments. A FieldsPredicate delegate is created and appended to
 * the tests array. An exception is thrown if errors are detected while processing the
 * option; the error text is intended for the end user.
 *
 * All the option handlers have similar functionality, differing in option processing and
 * error message generation. fieldVsNumberOptionHandler is described as an example. It
 * handles command options such as '--le 3:1000', which tests field 3 for a value less
 * than or equal to 1000. It is passed the tests array, the 'numLE' predicate function
 * used for the test, and the string "3:1000" representing the option value. It is also
 * passed the header line from the first input file and an indication of whether header
 * processing is enabled (--H|header). parseFieldList (fieldlist module) is used to parse
 * the field-list component of the option ("3" in the example). The comparison value
 * ("1000") is converted to a double. These are wrapped in a FieldsPredicate delegate
 * which is added to the tests array. An error is signaled if the option string is invalid.
 *
 * During processing, field indexes are converted from one-based to zero-based. As an
 * optimization, the maximum field index is also tracked. This allows early termination of
 * line splitting.
 *
 * The header line from the input file is not available when std.getopt processes the
 * command line option. The processing described above must be deferred. This is done
 * using a 'CmdOptionHandler' delegate. There is a 'make' function for every command line
 * option handler that creates these. These are created during std.getopt processing.
 * They are run when the header line becomes available.
 *
 * The final setup for the '--le' (numeric less-than-or-equal) operator is as follows:
 * - Function 'handlerNumLE' (in TsvFilterOptions.processArgs) is associated with the
 *   command line option "--le <val>". When called by std.getopt it creates an option
 *   handler delegate via 'makeFieldVsNumberOptionHandler'. This is appended to an
 *   array of delegates.
 * - 'fieldVsNumberOptionHandler' is invoked via the delegate after the header line
 *   becomes available (in TsvFilterOptions.processArgs). If args are valid,
 *   'makeFieldVsNumberDelegate' is used to create a delegate invoking the 'numLE'
 *   predicate function. This delegate is added to the set of run-time tests.
 *
 * Note that in the above setup the 'numLE' predicate is specified in 'handlerNumLE'
 * and passed through all the steps. This is how the command line option gets
 * associated with the predicate function.
 */

/* CmdOptionHandler delegate signature - This is the call made to process the command
 * line option arguments after the header line has been read.
*/
alias CmdOptionHandler = void delegate(ref FieldsPredicate[] tests, ref size_t maxFieldIndex,
                                       bool hasHeader, string[] headerFields);

/* Creates a deferred handler for a single-field option (e.g. --empty). The returned
 * delegate captures the predicate, option name, and option value, and forwards to
 * fieldUnaryOptionHandler when invoked.
 */
CmdOptionHandler makeFieldUnaryOptionHandler(FieldUnaryPredicate predicateFn, string option, string optionVal)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldUnaryOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
}

/* Processes a '--<option> <field-list>' argument. One test delegate is appended to
 * 'tests' for every field produced by parseFieldList, and 'maxFieldIndex' is raised to
 * the largest zero-based field index seen. Any Exception raised while parsing has its
 * message rewritten to include the option text and expected syntax, then is rethrown.
 */
void fieldUnaryOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldUnaryPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        foreach (fieldIndex;
                 optionVal
                 .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, headerFields))
        {
            tests ~= makeFieldUnaryDelegate(fn, fieldIndex);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        e.msg = format("Invalid option: [--%s %s]. %s\n Expected: '--%s <field>' or '--%s <field-list>'.",
                       option, optionVal, e.msg, option, option);
        throw e;
    }
}

/* Creates a deferred handler for a field-vs-number option (e.g. --le 3:1000). */
CmdOptionHandler makeFieldVsNumberOptionHandler(FieldVsNumberPredicate predicateFn, string option, string optionVal)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldVsNumberOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
}

/* Processes a '--<option> <field-list>:<number>' argument. The field list is parsed
 * first; the text following it (skipping one separator character) is converted to a
 * double. One test delegate is appended per field and 'maxFieldIndex' is updated.
 * Any Exception (field-list error, missing value, failed numeric conversion) is
 * rethrown with a message describing the option and expected syntax.
 */
void fieldVsNumberOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsNumberPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    auto formatErrorMsg(string option, string optionVal, string errorMessage="")
    {
        string optionalSpace = (errorMessage.length == 0) ? "" : " ";
        return format(
            "Invalid option: [--%s %s].%s%s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a number.",
            option, optionVal, optionalSpace, errorMessage, option, option);
    }

    try
    {
        auto optionValParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        /* The range is materialized before 'consumed' is read; keep this order. */
        auto fieldIndices = optionValParse.array;
        enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list.");
        double value = optionVal[optionValParse.consumed + 1 .. $].to!double;

        foreach (fieldIndex; fieldIndices)
        {
            tests ~= makeFieldVsNumberDelegate(fn, fieldIndex, value);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        e.msg = formatErrorMsg(option, optionVal, e.msg);
        throw e;
    }
}

/* Creates a deferred handler for a field-vs-string option (e.g. --str-eq 2:abc). */
CmdOptionHandler makeFieldVsStringOptionHandler(FieldVsStringPredicate predicateFn, string option, string optionVal)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldVsStringOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
}

/* Processes a '--<option> <field-list>:<string>' argument. The text following the field
 * list (skipping one separator character) is used as the comparison string. One test
 * delegate is appended per field and 'maxFieldIndex' is updated. Exceptions are
 * rethrown with a message describing the option and expected syntax.
 */
void fieldVsStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsStringPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        auto optionValParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        /* The range is materialized before 'consumed' is read; keep this order. */
        auto fieldIndices = optionValParse.array;
        enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list.");
        string value = optionVal[optionValParse.consumed + 1 .. $].idup;

        foreach (fieldIndex; fieldIndices)
        {
            tests ~= makeFieldVsStringDelegate(fn, fieldIndex, value);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
}

/* Creates a deferred handler for a case-insensitive field-vs-string option
 * (e.g. --istr-eq 2:abc).
 */
CmdOptionHandler makeFieldVsIStringOptionHandler(FieldVsIStringPredicate predicateFn, string option, string optionVal)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldVsIStringOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
}

/* The fieldVsIStringOptionHandler lower-cases the command line argument, assuming the
 * case-insensitive comparison will be done on lower-cased values. The lower-cased
 * dstring is computed once and shared by the delegates created for every field.
 */
void fieldVsIStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsIStringPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        auto optionValParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        /* The range is materialized before 'consumed' is read; keep this order. */
        auto fieldIndices = optionValParse.array;
        enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list.");
        string value = optionVal[optionValParse.consumed + 1 .. $].idup;

        /* Loop-invariant: convert and lower-case the comparison value a single time. */
        dstring loweredValue = value.to!dstring.toLower;

        foreach (fieldIndex; fieldIndices)
        {
            tests ~= makeFieldVsIStringDelegate(fn, fieldIndex, loweredValue);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
}

/* Creates a deferred handler for a field-vs-regex option (e.g. --regex '2:ab*c').
 * 'caseSensitive' selects whether the regex is compiled with the "i" modifier.
 */
CmdOptionHandler makeFieldVsRegexOptionHandler(FieldVsRegexPredicate predicateFn, string option, string optionVal, bool caseSensitive)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldVsRegexOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal, caseSensitive);
}

/* Processes a '--<option> <field-list>:<regex>' argument. The text following the field
 * list (skipping one separator character) is compiled as a regular expression, with
 * the "i" modifier when 'caseSensitive' is false. One test delegate is appended per
 * field and 'maxFieldIndex' is updated. A RegexException is rethrown with an
 * "Invalid regular expression" message; any other Exception is rethrown with the
 * general option syntax message.
 */
void fieldVsRegexOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsRegexPredicate fn, string option, string optionVal, bool caseSensitive)
{
    import tsv_utils.common.fieldlist;

    try
    {
        auto optionValParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        /* The range is materialized before 'consumed' is read; keep this order. */
        auto fieldIndices = optionValParse.array;
        enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list.");

        immutable modifiers = caseSensitive ? "" : "i";
        Regex!char value =
            optionVal[optionValParse.consumed + 1 .. $]
            .regex(modifiers);

        foreach (fieldIndex; fieldIndices)
        {
            tests ~= makeFieldVsRegexDelegate(fn, fieldIndex, value);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (RegexException e)
    {
        e.msg = format(
            "[--%s %s]. Invalid regular expression: %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a regular expression.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a regular expression.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
}


/* Creates a deferred handler for a field-vs-field option (e.g. --ff-le 2:4). */
CmdOptionHandler makeFieldVsFieldOptionHandler(FieldVsFieldPredicate predicateFn, string option, string optionVal)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldVsFieldOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
}

/* Processes a '--<option> <field1>:<field2>' argument. Each side must resolve to
 * exactly one field and the two fields must differ. A single test delegate is appended
 * and 'maxFieldIndex' is raised to cover both fields. Exceptions are rethrown with a
 * message describing the option and expected syntax.
 */
void fieldVsFieldOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsFieldPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        auto optionValParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        /* The range is materialized before 'consumed' is read; keep this order. */
        auto fieldIndices1 = optionValParse.array;

        enforce(fieldIndices1.length != 0, "First field argument is empty.");
        enforce(fieldIndices1.length == 1, "First field argument references multiple fields.");
        enforce(optionVal.length - optionValParse.consumed > 1, "Second field argument is empty.");

        /* The second field list must consume the rest of the option value. */
        auto fieldIndices2 =
            optionVal[optionValParse.consumed + 1 .. $]
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, Yes.consumeEntireFieldListString)
            (hasHeader, headerFields)
            .array;

        enforce(fieldIndices2.length != 0, "Second field argument is empty.");
        enforce(fieldIndices2.length == 1, "Second field argument references multiple fields.");

        enforce(fieldIndices1[0] != fieldIndices2[0],
                format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));

        tests ~= makeFieldVsFieldDelegate(fn, fieldIndices1[0], fieldIndices2[0]);
        maxFieldIndex = max(maxFieldIndex, fieldIndices1[0], fieldIndices2[0]);
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field1>:<field2>' where <field1> and <field2> are individual fields.",
            option, optionVal, e.msg, option);
        throw e;
    }
}

/* Creates a deferred handler for a field-field-number option
 * (e.g. --ff-absdiff-le 1:3:0.25).
 */
CmdOptionHandler makeFieldFieldNumOptionHandler(FieldFieldNumPredicate predicateFn, string option, string optionVal)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldFieldNumOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
}

/* Processes a '--<option> <field1>:<field2>:<num>' argument. Each field component must
 * resolve to exactly one field, the two fields must differ, and the trailing text is
 * converted to a double. A single test delegate is appended and 'maxFieldIndex' is
 * raised to cover both fields. Exceptions are rethrown with a message describing the
 * option and expected syntax.
 */
void fieldFieldNumOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldFieldNumPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        auto optionValParse1 =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        /* The range is materialized before 'consumed' is read; keep this order. */
        auto fieldIndices1 = optionValParse1.array;

        enforce(fieldIndices1.length != 0, "First field argument is empty.");
        enforce(fieldIndices1.length == 1, "First field argument references multiple fields.");
        enforce(optionVal.length - optionValParse1.consumed > 1, "Second field argument is empty.");

        auto optionValSegment2 = optionVal[optionValParse1.consumed + 1 .. $];
        auto optionValParse2 =
            optionValSegment2
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto fieldIndices2 = optionValParse2.array;

        enforce(fieldIndices2.length != 0, "Second field argument is empty.");
        enforce(fieldIndices2.length == 1, "Second field argument references multiple fields.");
        enforce(optionValSegment2.length - optionValParse2.consumed > 1, "Number argument is empty.");

        size_t field1 = fieldIndices1[0];
        size_t field2 = fieldIndices2[0];
        double value = optionValSegment2[optionValParse2.consumed + 1 .. $].to!double;

        enforce(field1 != field2,
                format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));

        tests ~= makeFieldFieldNumDelegate(fn, field1, field2, value);
        maxFieldIndex = max(maxFieldIndex, field1, field2);
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field1>:<field2>:<num>' where <field1> and <field2> are individual fields.",
            option, optionVal, e.msg, option);
        throw e;
    }
}

/** Command line options - This struct holds the results of command line option processing.
 * It also has a method, processArgs, that invokes command line arg processing.
*/
struct TsvFilterOptions
{
    import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader;

    string programName;
    InputSourceRange inputSources;       /// Input files
    FieldsPredicate[] tests;             /// Test delegates built from the command line test options
    size_t maxFieldIndex = 0;            /// Derived from tests
    bool hasHeader = false;              /// --H|header
    bool invert = false;                 /// --invert
    bool disjunct = false;               /// --or
    bool countMatches = false;           /// --c|count
    char delim = '\t';                   /// --delimiter
    string label;                        /// --label
    bool labelValuesOptionUsed = false;  /// --label-values
    bool lineBuffered = false;           /// --line-buffered
    bool isLabeling = false;             /// Derived
    string trueLabel = "1";              /// Derived
    string falseLabel = "0";             /// Derived

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and the
     * tests array has been established.
     *
     * Side effects visible in this function: cmdArgs is truncated to its first element
     * once the file paths have been extracted, inputSources is opened, and errors are
     * reported on stderr rather than propagated to the caller.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : each;
        import std.array : split;
        import std.conv : to;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import tsv_utils.common.getopt_inorder;
        import tsv_utils.common.utils : throwIfWindowsNewline;

        bool helpVerbose = false;        // --help-verbose
        bool helpOptions = false;        // --help-options
        bool helpFields = false;         // --help-fields
        bool versionWanted = false;      // --V|version

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        /* Command option handlers - One handler for each option. These conform to the
         * getopt required handler signature, and separate knowledge of the specific
         * command option text from the option processing.
         *
         * Each handler only records a deferred CmdOptionHandler. The recorded handlers
         * are run later by fieldListArgProcessing, once header fields are available
         * (when a header is in use).
         */

        CmdOptionHandler[] cmdLineTestOptions;

        void handlerFldEmpty(string option, string value)    { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldEmpty,    option, value); }
        void handlerFldNotEmpty(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldNotEmpty, option, value); }
        void handlerFldBlank(string option, string value)    { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldBlank,    option, value); }
        void handlerFldNotBlank(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldNotBlank, option, value); }

        void handlerFldIsNumeric(string option, string value)  { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldIsNumeric,  option, value); }
        void handlerFldIsFinite(string option, string value)   { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldIsFinite,   option, value); }
        void handlerFldIsNaN(string option, string value)      { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldIsNaN,      option, value); }
        void handlerFldIsInfinity(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldIsInfinity, option, value); }

        void handlerNumLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numLE, option, value); }
        void handlerNumLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numLT, option, value); }
        void handlerNumGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numGE, option, value); }
        void handlerNumGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numGT, option, value); }
        void handlerNumEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numEQ, option, value); }
        void handlerNumNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numNE, option, value); }

        void handlerStrLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strLE, option, value); }
        void handlerStrLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strLT, option, value); }
        void handlerStrGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strGE, option, value); }
        void handlerStrGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strGT, option, value); }
        void handlerStrEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strEQ, option, value); }
        void handlerStrNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strNE, option, value); }

        void handlerStrInFld(string option, string value)    { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strInFld,    option, value); }
        void handlerStrNotInFld(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strNotInFld, option, value); }

        void handlerIStrEQ(string option, string value)       { cmdLineTestOptions ~= makeFieldVsIStringOptionHandler(&istrEQ,       option, value); }
        void handlerIStrNE(string option, string value)       { cmdLineTestOptions ~= makeFieldVsIStringOptionHandler(&istrNE,       option, value); }
        void handlerIStrInFld(string option, string value)    { cmdLineTestOptions ~= makeFieldVsIStringOptionHandler(&istrInFld,    option, value); }
        void handlerIStrNotInFld(string option, string value) { cmdLineTestOptions ~= makeFieldVsIStringOptionHandler(&istrNotInFld, option, value); }

        /* The last argument selects case-sensitive (true) or case-insensitive (false) matching. */
        void handlerRegexMatch(string option, string value)     { cmdLineTestOptions ~= makeFieldVsRegexOptionHandler(&regexMatch,    option, value, true); }
        void handlerRegexNotMatch(string option, string value)  { cmdLineTestOptions ~= makeFieldVsRegexOptionHandler(&regexNotMatch, option, value, true); }
        void handlerIRegexMatch(string option, string value)    { cmdLineTestOptions ~= makeFieldVsRegexOptionHandler(&regexMatch,    option, value, false); }
        void handlerIRegexNotMatch(string option, string value) { cmdLineTestOptions ~= makeFieldVsRegexOptionHandler(&regexNotMatch, option, value, false); }

        void handlerCharLenLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenLE, option, value); }
        void handlerCharLenLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenLT, option, value); }
        void handlerCharLenGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenGE, option, value); }
        void handlerCharLenGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenGT, option, value); }
        void handlerCharLenEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenEQ, option, value); }
        void handlerCharLenNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenNE, option, value); }

        void handlerByteLenLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenLE, option, value); }
        void handlerByteLenLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenLT, option, value); }
        void handlerByteLenGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenGE, option, value); }
        void handlerByteLenGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenGT, option, value); }
        void handlerByteLenEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenEQ, option, value); }
        void handlerByteLenNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenNE, option, value); }

        void handlerFFLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffLE, option, value); }
        void handlerFFLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffLT, option, value); }
        void handlerFFGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffGE, option, value); }
        void handlerFFGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffGT, option, value); }
        void handlerFFEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffEQ, option, value); }
        void handlerFFNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffNE, option, value); }

        void handlerFFStrEQ(string option, string value)  { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffStrEQ,  option, value); }
        void handlerFFStrNE(string option, string value)  { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffStrNE,  option, value); }
        void handlerFFIStrEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffIStrEQ, option, value); }
        void handlerFFIStrNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffIStrNE, option, value); }

        void handlerFFAbsDiffLE(string option, string value) { cmdLineTestOptions ~= makeFieldFieldNumOptionHandler(&ffAbsDiffLE, option, value); }
        void handlerFFAbsDiffGT(string option, string value) { cmdLineTestOptions ~= makeFieldFieldNumOptionHandler(&ffAbsDiffGT, option, value); }
        void handlerFFRelDiffLE(string option, string value) { cmdLineTestOptions ~= makeFieldFieldNumOptionHandler(&ffRelDiffLE, option, value); }
        void handlerFFRelDiffGT(string option, string value) { cmdLineTestOptions ~= makeFieldFieldNumOptionHandler(&ffRelDiffGT, option, value); }

        /* The handleLabelValuesOption is different from the other handlers in that it is
         * not generic. Instead it simply parses and validates the argument passed to the
         * --label-values option. If the option is valid, it populates the `trueLabel`
         * and `falseLabel` member variables. Otherwise an exception is thrown.
         */
        void handleLabelValuesOption(string option, string optionVal)
        {
            immutable valSplit = optionVal.findSplit(":");

            /* Valid only if a colon was found, there is no second colon, and the two
             * strings differ.
             */
            enforce(valSplit && !valSplit[2].canFind(":") && valSplit[0] != valSplit[2],
                    format("Invalid option: '--%s %s'.\n" ~
                           " Expected: '--%s STR1:STR2'. STR1 and STR2 must be different strings.\n" ~
                           " The colon (':') is required, niether string can contain a colon.",
                           option, optionVal, option));

            labelValuesOptionUsed = true;
            trueLabel = valSplit[0];
            falseLabel = valSplit[2];
        }

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose",    " Print full help.", &helpVerbose,
                "help-options",    " Print the options list by itself.", &helpOptions,
                "help-fields",     " Print help on specifying fields.", &helpFields,
                std.getopt.config.caseSensitive,
                "V|version",       " Print version information and exit.", &versionWanted,
                "H|header",        " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "or",              " Evaluate tests as an OR rather than an AND.", &disjunct,
                std.getopt.config.caseSensitive,
                "v|invert",        " Invert the filter, printing lines that do not match.", &invert,
                std.getopt.config.caseInsensitive,
                "c|count",         " Print only a count of the matched lines, excluding the header.", &countMatches,
                "d|delimiter",     "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,

                "label",
                "STR Do not filter. Instead, mark each record as passing the filter or not. STR is the header, ignored if there is no header line.",
                &label,

                "label-values",
                "STR1:STR2 The pass/no-pass values used by '--label'. Defaults to '1' and '0'.",
                &handleLabelValuesOption,

                "line-buffered",   " Immediately output every matched line.", &lineBuffered,

                "empty",           "<field-list> True if FIELD is empty.", &handlerFldEmpty,
                "not-empty",       "<field-list> True if FIELD is not empty.", &handlerFldNotEmpty,
                "blank",           "<field-list> True if FIELD is empty or all whitespace.", &handlerFldBlank,
                "not-blank",       "<field-list> True if FIELD contains a non-whitespace character.", &handlerFldNotBlank,

                "is-numeric",      "<field-list> True if FIELD is interpretable as a number.", &handlerFldIsNumeric,
                "is-finite",       "<field-list> True if FIELD is interpretable as a number and is not NaN or infinity.", &handlerFldIsFinite,
                "is-nan",          "<field-list> True if FIELD is NaN.", &handlerFldIsNaN,
                "is-infinity",     "<field-list> True if FIELD is infinity.", &handlerFldIsInfinity,

                "le",              "<field-list>:NUM FIELD <= NUM (numeric).", &handlerNumLE,
                "lt",              "<field-list>:NUM FIELD < NUM (numeric).", &handlerNumLT,
                "ge",              "<field-list>:NUM FIELD >= NUM (numeric).", &handlerNumGE,
                "gt",              "<field-list>:NUM FIELD > NUM (numeric).", &handlerNumGT,
                "eq",              "<field-list>:NUM FIELD == NUM (numeric).", &handlerNumEQ,
                "ne",              "<field-list>:NUM FIELD != NUM (numeric).", &handlerNumNE,

                "str-le",          "<field-list>:STR FIELD <= STR (string).", &handlerStrLE,
                "str-lt",          "<field-list>:STR FIELD < STR (string).", &handlerStrLT,
                "str-ge",          "<field-list>:STR FIELD >= STR (string).", &handlerStrGE,
                "str-gt",          "<field-list>:STR FIELD > STR (string).", &handlerStrGT,
                "str-eq",          "<field-list>:STR FIELD == STR (string).", &handlerStrEQ,
                "istr-eq",         "<field-list>:STR FIELD == STR (string, case-insensitive).", &handlerIStrEQ,
                "str-ne",          "<field-list>:STR FIELD != STR (string).", &handlerStrNE,
                "istr-ne",         "<field-list>:STR FIELD != STR (string, case-insensitive).", &handlerIStrNE,
                "str-in-fld",      "<field-list>:STR FIELD contains STR (substring search).", &handlerStrInFld,
                "istr-in-fld",     "<field-list>:STR FIELD contains STR (substring search, case-insensitive).", &handlerIStrInFld,
                "str-not-in-fld",  "<field-list>:STR FIELD does not contain STR (substring search).", &handlerStrNotInFld,
                "istr-not-in-fld", "<field-list>:STR FIELD does not contain STR (substring search, case-insensitive).", &handlerIStrNotInFld,

                "regex",           "<field-list>:REGEX FIELD matches regular expression.", &handlerRegexMatch,
                "iregex",          "<field-list>:REGEX FIELD matches regular expression, case-insensitive.", &handlerIRegexMatch,
                "not-regex",       "<field-list>:REGEX FIELD does not match regular expression.", &handlerRegexNotMatch,
                "not-iregex",      "<field-list>:REGEX FIELD does not match regular expression, case-insensitive.", &handlerIRegexNotMatch,

                "char-len-le",     "<field-list>:NUM character-length(FIELD) <= NUM.", &handlerCharLenLE,
                "char-len-lt",     "<field-list>:NUM character-length(FIELD) < NUM.", &handlerCharLenLT,
                "char-len-ge",     "<field-list>:NUM character-length(FIELD) >= NUM.", &handlerCharLenGE,
                "char-len-gt",     "<field-list>:NUM character-length(FIELD) > NUM.", &handlerCharLenGT,
                "char-len-eq",     "<field-list>:NUM character-length(FIELD) == NUM.", &handlerCharLenEQ,
                "char-len-ne",     "<field-list>:NUM character-length(FIELD) != NUM.", &handlerCharLenNE,

                "byte-len-le",     "<field-list>:NUM byte-length(FIELD) <= NUM.", &handlerByteLenLE,
                "byte-len-lt",     "<field-list>:NUM byte-length(FIELD) < NUM.", &handlerByteLenLT,
                "byte-len-ge",     "<field-list>:NUM byte-length(FIELD) >= NUM.", &handlerByteLenGE,
                "byte-len-gt",     "<field-list>:NUM byte-length(FIELD) > NUM.", &handlerByteLenGT,
                "byte-len-eq",     "<field-list>:NUM byte-length(FIELD) == NUM.", &handlerByteLenEQ,
                "byte-len-ne",     "<field-list>:NUM byte-length(FIELD) != NUM.", &handlerByteLenNE,

                "ff-le",           "FIELD1:FIELD2 FIELD1 <= FIELD2 (numeric).", &handlerFFLE,
                "ff-lt",           "FIELD1:FIELD2 FIELD1 < FIELD2 (numeric).", &handlerFFLT,
                "ff-ge",           "FIELD1:FIELD2 FIELD1 >= FIELD2 (numeric).", &handlerFFGE,
                "ff-gt",           "FIELD1:FIELD2 FIELD1 > FIELD2 (numeric).", &handlerFFGT,
                "ff-eq",           "FIELD1:FIELD2 FIELD1 == FIELD2 (numeric).", &handlerFFEQ,
                "ff-ne",           "FIELD1:FIELD2 FIELD1 != FIELD2 (numeric).", &handlerFFNE,
                "ff-str-eq",       "FIELD1:FIELD2 FIELD1 == FIELD2 (string).", &handlerFFStrEQ,
                "ff-istr-eq",      "FIELD1:FIELD2 FIELD1 == FIELD2 (string, case-insensitive).", &handlerFFIStrEQ,
                "ff-str-ne",       "FIELD1:FIELD2 FIELD1 != FIELD2 (string).", &handlerFFStrNE,
                "ff-istr-ne",      "FIELD1:FIELD2 FIELD1 != FIELD2 (string, case-insensitive).", &handlerFFIStrNE,

                "ff-absdiff-le",   "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) <= NUM", &handlerFFAbsDiffLE,
                "ff-absdiff-gt",   "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) > NUM", &handlerFFAbsDiffGT,
                "ff-reldiff-le",   "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) <= NUM", &handlerFFRelDiffLE,
                "ff-reldiff-gt",   "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) > NUM", &handlerFFRelDiffGT,
                );

            /* Both help texts are a bit long. In this case, for "regular" help, don't
             * print options, just the text. The text summarizes the options.
             */
            if (r.helpWanted)
            {
                stdout.write(helpText);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpOptions)
            {
                defaultGetoptPrinter(helpTextOptions, r.options);
                return tuple(false, 0);
            }
            else if (helpFields)
            {
                import tsv_utils.common.fieldlist : fieldListHelpText ;
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-filter"));
                return tuple(false, 0);
            }

            /* Input files. Remaining command line args are files. With no file
             * arguments the single path "-" is used.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            /* Validations and derivations. Currently all are related to label mode. */
            if (!label.empty || labelValuesOptionUsed)
            {
                enforce(!label.empty || !hasHeader,
                        "--label is required when using --label-values and --H|header.");

                isLabeling = true;
            }

            enforce (!isLabeling || !countMatches,
                     format("--c|count cannot be used with --label or --label-values."));

            string[] headerFields;

            /* FieldListArgProcessing encapsulates the field list processing. It is
             * called prior to reading the header line if headers are not being used,
             * and after if headers are being used.
             */
            void fieldListArgProcessing()
            {
                cmdLineTestOptions.each!(dg => dg(tests, maxFieldIndex, hasHeader, headerFields));
            }

            if (!hasHeader) fieldListArgProcessing();

            ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
            inputSources = inputSourceRange(filepaths, readHeader);

            if (hasHeader)
            {
                throwIfWindowsNewline(inputSources.front.header, inputSources.front.name, 1);
                headerFields = inputSources.front.header.split(delim).to!(string[]);
                fieldListArgProcessing();
            }
        }
        catch (Exception e)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, e.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/** Output mode used to specialize tsvFilter at compile time. */
enum FilterMode { filter, count, label };

/** Selects the tsvFilter instantiation matching the command line options:
 * count mode when countMatches is set, otherwise label mode when isLabeling is set,
 * otherwise plain filtering.
 */
void tsvFilterCommand(ref TsvFilterOptions cmdopt)
{
    if (cmdopt.countMatches) tsvFilter!(FilterMode.count)(cmdopt);
    else if (cmdopt.isLabeling) tsvFilter!(FilterMode.label)(cmdopt);
    else tsvFilter!(FilterMode.filter)(cmdopt);
}

/** tsvFilter processes the input files and runs the tests.
 *
 * Each line is split on cmdopt.delim, the first maxFieldIndex + 1 fields are handed to
 * the test delegates, and the results are combined with AND (default) or OR
 * (cmdopt.disjunct), then optionally inverted. Depending on mode, matching lines are
 * written, every line is written with a pass/no-pass label appended, or only the
 * number of matching lines is printed.
 *
 * Throws an Exception if a line has too few fields or if a test throws. In the latter
 * case the message is extended with the file name and line number.
 */
void tsvFilter(FilterMode mode)(ref TsvFilterOptions cmdopt)
{
    import std.algorithm : all, any, splitter;
    import std.format : formattedWrite;
    import std.range;
    import tsv_utils.common.utils : bufferedByLine, BufferedOutputRange, InputSourceRange,
        LineBuffered, throwIfWindowsNewline;

    static if (mode != FilterMode.count) assert(!cmdopt.countMatches);
    static if (mode != FilterMode.label) assert(!cmdopt.isLabeling);

    /* inputSources must be an InputSourceRange and include at least stdin. */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* BufferedOutputRange improves performance on narrow files with high percentages of
     * writes.
     */
    static if (mode == FilterMode.count)
    {
        immutable LineBuffered isLineBuffered = No.lineBuffered;
    }
    else
    {
        immutable LineBuffered isLineBuffered =
            cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;

        auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout, isLineBuffered);
    }

    static if (mode == FilterMode.count) size_t matchedLines = 0;

    /* First header is read during command line argument processing. Immediately
     * flush it so subsequent processes in a unix command pipeline see it early.
     * This helps provide timely error messages.
     */
    static if (mode != FilterMode.count)
    {
        if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
        {
            auto inputStream = cmdopt.inputSources.front;

            static if (mode == FilterMode.label)
            {
                bufferedOutput.appendln(inputStream.header, cmdopt.delim, cmdopt.label);
            }
            else
            {
                bufferedOutput.appendln(inputStream.header);
            }

            bufferedOutput.flush;
        }
    }

    /* Number of leading fields the tests need. Zero when there are no tests. */
    immutable size_t fieldIndexEnd = cmdopt.tests.empty ? 0 : cmdopt.maxFieldIndex + 1;

    /* Process each input file, one line at a time. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    auto lineFields = new char[][](fieldIndexEnd);

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);

        foreach (lineNum, line; inputStream.file.bufferedByLine(isLineBuffered).enumerate(fileBodyStartLine))
        {
            if (lineNum == 1) throwIfWindowsNewline(line, inputStream.name, lineNum);

            /* Copy the needed number of fields to the fields array. */
            size_t fieldIndex = 0;

            foreach (fieldValue; line.splitter(cmdopt.delim).take(fieldIndexEnd))
            {
                lineFields[fieldIndex] = fieldValue;
                fieldIndex++;
            }

            if (fieldIndex == 0 && fieldIndexEnd != 0)
            {
                assert(line.length == 0);
                /* Bug work-around. Currently empty lines are not handled properly by splitter.
                 *   Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                 *   Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                 * Work-around: Point to the line. It's an empty string.
                 */
                lineFields[fieldIndex] = line;
                fieldIndex++;
            }

            /* Both operands are size_t, so no cast is needed. */
            enforce(fieldIndex == fieldIndexEnd,
                    format("Not enough fields in line. File: %s, Line: %s",
                           inputStream.name, lineNum));

            /* Run the tests. Tests will fail (throw) if a field cannot be converted
             * to the expected type.
             */
            try
            {
                bool passed = cmdopt.disjunct ?
                    cmdopt.tests.any!(x => x(lineFields)) :
                    cmdopt.tests.all!(x => x(lineFields));
                if (cmdopt.invert) passed = !passed;

                static if (mode == FilterMode.count)
                {
                    if (passed) ++matchedLines;
                }
                else static if (mode == FilterMode.label)
                {
                    bufferedOutput.appendln(line, cmdopt.delim,
                                            passed ? cmdopt.trueLabel : cmdopt.falseLabel);
                }
                else
                {
                    if (passed) bufferedOutput.appendln(line);
                }
            }
            catch (Exception e)
            {
                /* Flush what has been buffered so far before reporting the failure. */
                static if (mode != FilterMode.count) bufferedOutput.flush;
                throw new Exception(
                    format("Could not process line or field: %s\n File: %s Line: %s%s",
                           e.msg, inputStream.name, lineNum,
                           (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : ""));
            }
        }
    }

    static if (mode == FilterMode.count) writeln(matchedLines);
}