/**
Command line tool that filters TSV files.

This tool filters tab-delimited files based on numeric or string comparisons
against specific fields. See the helpText string for details.

Copyright (c) 2015-2019, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_filter;

import std.algorithm : canFind, equal, findSplit, max, min;
import std.conv : to;
import std.format : format;
import std.math : abs, isFinite, isInfinity, isNaN;
import std.regex;
import std.stdio;
import std.string : isNumeric;
import std.typecons : tuple;
import std.uni: asLowerCase, toLower;

/* The program has two main parts, command line arg processing and processing the input
 * files. Much of the work is in command line arg processing. This sets up the tests run
 * against each input line. The tests are an array of delegates (closures) run against the
 * fields in the line. The tests are based on command line arguments, of which there is
 * a lengthy set, one for each test.
 */

/* Passes the "gcopt=cleanup:none" option to the D runtime (compilers 2.085 and later). */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

/** Main program. Invokes command line arg processing and tsv-filter to perform
 * the real work. Any errors are caught and reported.
 *
 * Returns: The exit code supplied by processArgs when it signals that execution
 *   should stop, 1 if tsvFilter throws an Exception (the message is written to
 *   stderr), and 0 otherwise.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvFilterOptions cmdopt;

    /* r[0] is the 'continue' flag, r[1] the exit code to use when not continuing. */
    const r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    try tsvFilter(cmdopt, cmdArgs[1..$]);
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        return 1;
    }
    return 0;
}

immutable helpText = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter tab-delimited files for matching lines via comparison tests against
individual fields. Use '--help-verbose' for a more detailed description.

Global options:
  --help-verbose      Print full help.
  --help-options      Print the options list by itself.
  --V|version         Print version information and exit.
  --H|header          Treat the first line of each file as a header.
  --or                Evaluate tests as an OR rather than an AND clause.
  --v|invert          Invert the filter, printing lines that do not match.
  --d|delimiter CHR   Field delimiter. Default: TAB.

Operators:
* Test if a field is empty (no characters) or blank (empty or whitespace only).
  Syntax:  --empty|not-empty|blank|not-blank FIELD
  Example: --empty 5          // True if field 5 is empty

* Test if a field is numeric, finite, NaN, or infinity
  Syntax:  --is-numeric|is-finite|is-nan|is-infinity FIELD
  Example: --is-numeric 5 --gt 5:100  // Ensure field 5 is numeric before --gt test.

* Compare a field to a number (integer or float)
  Syntax:  --eq|ne|lt|le|gt|ge FIELD:NUM
  Example: --lt 5:1000 --gt 2:0.5  // True if (field 5 < 1000) and (field 2 > 0.5)

* Compare a field to a string
  Syntax:  --str-eq|str-ne FIELD:STR
  Example: --str-eq 3:abc        // True if field 3 is "abc"

* Test if a field contains a string (substring search)
  Syntax:  --str-in-fld|str-not-in-fld|istr-in-fld|istr-not-in-fld FIELD:STR
  Example: --str-in-fld 1:hello  // True if field 1 contains "hello"

* Test if a field matches a regular expression.
  Syntax:  --regex|iregex|not-regex|not-iregex FIELD:REGEX
  Example: --regex '3:ab*c'      // True if field 3 contains "ac", "abc", "abbc", etc.

* Field to field comparisons - Similar to field vs literal comparisons, but field vs field.
  Syntax:  --ff-eq|ff-ne|ff-lt|ff-le|ff-gt|ff-ge FIELD1:FIELD2
           --ff-str-eq|ff-str-ne|ff-istr-eq|ff-istr-ne FIELD1:FIELD2
  Example: --ff-eq 2:4           // True if fields 2 and 4 are numerically equivalent
           --ff-str-eq 2:4       // True if fields 2 and 4 are the same strings

* Field to field difference comparisons - Absolute and relative difference
  Syntax:  --ff-absdiff-le|ff-absdiff-gt FIELD1:FIELD2:NUM
           --ff-reldiff-le|ff-reldiff-gt FIELD1:FIELD2:NUM
  Example: --ff-absdiff-le 1:3:0.25   // True if abs(field1 - field2) <= 0.25

EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter lines of tab-delimited files via comparison tests against fields. Multiple
tests can be specified, by default they are evaluated as AND clause. Lines
satisfying the tests are written to standard output.

Typical test syntax is '--op field:value', where 'op' is an operator, 'field' is a
1-based field index, and 'value' is the comparison basis. For example, '--lt 3:500'
tests if field 3 is less than 500. A more complete example:

    tsv-filter --header --gt 1:50 --lt 1:100 --le 2:1000 data.tsv

This outputs all lines from file data.tsv where field 1 is greater than 50 and less
than 100, and field 2 is less than or equal to 1000. The header is also output.

Tests available include:
  * Test if a field is empty (no characters) or blank (empty or whitespace only).
  * Test if a field is interpretable as a number, a finite number, NaN, or Infinity.
  * Compare a field to a number - Numeric equality and relational tests.
  * Compare a field to a string - String equality and relational tests.
  * Test if a field matches a regular expression. Case sensitive or insensitive.
  * Test if a field contains a string. Sub-string search, case sensitive or insensitive.
  * Field to field comparisons - Similar to the other tests, except comparing
    one field to another in the same line.

Details:
  * The run is aborted if there are not enough fields in an input line.
  * Numeric tests will fail and abort the run if a field cannot be interpreted as a
    number. This includes fields with no text. To avoid this use '--is-numeric' or
    '--is-finite' prior to the numeric test. For example, '--is-numeric 5 --gt 5:100'
    ensures field 5 is numeric before running the --gt test.
  * Regular expression syntax is defined by the D programming language. They follow
    common conventions (perl, python, etc.). Most common forms work as expected.

Options:
EOS";

immutable helpTextOptions = q"EOS
Synopsis: tsv-filter [options] [file...]

Options:
EOS";

/* The next blocks of code define the structure of the boolean tests run against input lines.
 * This includes function and delegate (closure) signatures, creation mechanisms, option
 * handlers, etc. Command line arg processing to build the test structure.
 */

/* FieldsPredicate delegate signature - Each input line is run against a set of boolean
 * tests. Each test is a 'FieldsPredicate'. A FieldsPredicate is a delegate (closure)
 * containing all info about the test except the field values of the line being tested.
 * These delegates are created as part of command line arg processing. The wrapped data
 * includes operation, field indexes, literal values, etc. At run-time the delegate is
 * passed one argument, the split input line.
 */
alias FieldsPredicate = bool delegate(const char[][] fields);

/* FieldsPredicate function signatures - These aliases represent the different function
 * signatures used in FieldsPredicate delegates. Each alias has a corresponding 'make'
 * function. The 'make' function takes a real predicate function and closure args and
 * returns a FieldsPredicate delegate. Predicate types are:
 *
 * - FieldUnaryPredicate - Test based on a single field. (e.g. --empty 4)
 * - FieldVsNumberPredicate - Test based on a field index (used to get the field value)
 *   and a fixed numeric value. For example, field 2 less than 100 (--lt 2:100).
 * - FieldVsStringPredicate - Test based on a field and a string. (e.g. --str-eq 2:abc)
 * - FieldVsIStringPredicate - Case-insensitive test based on a field and a string.
 *   (e.g. --istr-eq 2:abc)
 * - FieldVsRegexPredicate - Test based on a field and a regex. (e.g. --regex '2:ab*c')
 * - FieldVsFieldPredicate - Test based on two fields. (e.g. --ff-le 2:4).
 * - FieldFieldNumPredicate - Test based on two fields and a fixed numeric value.
 *   (e.g. --ff-absdiff-le 1:3:0.25)
 *
 * An actual FieldsPredicate takes the fields from the line and the closure args and
 * runs the test. For example, a function testing if a field is less than a specific
 * value would pull the specified field from the fields array, convert the string to
 * a number, then run the less-than test.
191 */ 192 alias FieldUnaryPredicate = bool function(const char[][] fields, size_t index); 193 alias FieldVsNumberPredicate = bool function(const char[][] fields, size_t index, double value); 194 alias FieldVsStringPredicate = bool function(const char[][] fields, size_t index, string value); 195 alias FieldVsIStringPredicate = bool function(const char[][] fields, size_t index, dstring value); 196 alias FieldVsRegexPredicate = bool function(const char[][] fields, size_t index, Regex!char value); 197 alias FieldVsFieldPredicate = bool function(const char[][] fields, size_t index1, size_t index2); 198 alias FieldFieldNumPredicate = bool function(const char[][] fields, size_t index1, size_t index2, double value); 199 200 FieldsPredicate makeFieldUnaryDelegate(FieldUnaryPredicate fn, size_t index) 201 { 202 return fields => fn(fields, index); 203 } 204 205 FieldsPredicate makeFieldVsNumberDelegate(FieldVsNumberPredicate fn, size_t index, double value) 206 { 207 return fields => fn(fields, index, value); 208 } 209 210 FieldsPredicate makeFieldVsStringDelegate(FieldVsStringPredicate fn, size_t index, string value) 211 { 212 return fields => fn(fields, index, value); 213 } 214 215 FieldsPredicate makeFieldVsIStringDelegate(FieldVsIStringPredicate fn, size_t index, dstring value) 216 { 217 return fields => fn(fields, index, value); 218 } 219 220 FieldsPredicate makeFieldVsRegexDelegate(FieldVsRegexPredicate fn, size_t index, Regex!char value) 221 { 222 return fields => fn(fields, index, value); 223 } 224 225 FieldsPredicate makeFieldVsFieldDelegate(FieldVsFieldPredicate fn, size_t index1, size_t index2) 226 { 227 return fields => fn(fields, index1, index2); 228 } 229 230 FieldsPredicate makeFieldFieldNumDelegate(FieldFieldNumPredicate fn, size_t index1, size_t index2, double value) 231 { 232 return fields => fn(fields, index1, index2, value); 233 } 234 235 /* Predicate functions - These are the actual functions used in a FieldsPredicate. 
They 236 * are a direct reflection of the operators available via command line args. Each matches 237 * one of the FieldsPredicate function aliases defined above. 238 */ 239 bool fldEmpty(const char[][] fields, size_t index) { return fields[index].length == 0; } 240 bool fldNotEmpty(const char[][] fields, size_t index) { return fields[index].length != 0; } 241 bool fldBlank(const char[][] fields, size_t index) { return cast(bool) fields[index].matchFirst(ctRegex!`^\s*$`); } 242 bool fldNotBlank(const char[][] fields, size_t index) { return !fields[index].matchFirst(ctRegex!`^\s*$`); } 243 244 bool fldIsNumeric(const char[][] fields, size_t index) { return fields[index].isNumeric; } 245 bool fldIsFinite(const char[][] fields, size_t index) { return fields[index].isNumeric && fields[index].to!double.isFinite; } 246 bool fldIsNaN(const char[][] fields, size_t index) { return fields[index].isNumeric && fields[index].to!double.isNaN; } 247 bool fldIsInfinity(const char[][] fields, size_t index) { return fields[index].isNumeric && fields[index].to!double.isInfinity; } 248 249 bool numLE(const char[][] fields, size_t index, double val) { return fields[index].to!double <= val; } 250 bool numLT(const char[][] fields, size_t index, double val) { return fields[index].to!double < val; } 251 bool numGE(const char[][] fields, size_t index, double val) { return fields[index].to!double >= val; } 252 bool numGT(const char[][] fields, size_t index, double val) { return fields[index].to!double > val; } 253 bool numEQ(const char[][] fields, size_t index, double val) { return fields[index].to!double == val; } 254 bool numNE(const char[][] fields, size_t index, double val) { return fields[index].to!double != val; } 255 256 bool strLE(const char[][] fields, size_t index, string val) { return fields[index] <= val; } 257 bool strLT(const char[][] fields, size_t index, string val) { return fields[index] < val; } 258 bool strGE(const char[][] fields, size_t index, string val) { return 
fields[index] >= val; } 259 bool strGT(const char[][] fields, size_t index, string val) { return fields[index] > val; } 260 bool strEQ(const char[][] fields, size_t index, string val) { return fields[index] == val; } 261 bool strNE(const char[][] fields, size_t index, string val) { return fields[index] != val; } 262 bool strInFld(const char[][] fields, size_t index, string val) { return fields[index].canFind(val); } 263 bool strNotInFld(const char[][] fields, size_t index, string val) { return !fields[index].canFind(val); } 264 265 /* Note: For istr predicates, the command line value has been lower-cased by fieldVsIStringOptionHander. 266 */ 267 bool istrEQ(const char[][] fields, size_t index, dstring val) { return fields[index].asLowerCase.equal(val); } 268 bool istrNE(const char[][] fields, size_t index, dstring val) { return !fields[index].asLowerCase.equal(val); } 269 bool istrInFld(const char[][] fields, size_t index, dstring val) { return fields[index].asLowerCase.canFind(val); } 270 bool istrNotInFld(const char[][] fields, size_t index, dstring val) { return !fields[index].asLowerCase.canFind(val); } 271 272 /* Note: Case-sensitivity is built into the regex value, so these regex predicates are 273 * used for both case-sensitive and case-insensitive regex operators. 
274 */ 275 bool regexMatch(const char[][] fields, size_t index, Regex!char val) { return cast(bool) fields[index].matchFirst(val); } 276 bool regexNotMatch(const char[][] fields, size_t index, Regex!char val) { return !fields[index].matchFirst(val); } 277 278 bool ffLE(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double <= fields[index2].to!double; } 279 bool ffLT(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double < fields[index2].to!double; } 280 bool ffGE(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double >= fields[index2].to!double; } 281 bool ffGT(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double > fields[index2].to!double; } 282 bool ffEQ(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double == fields[index2].to!double; } 283 bool ffNE(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double != fields[index2].to!double; } 284 bool ffStrEQ(const char[][] fields, size_t index1, size_t index2) { return fields[index1] == fields[index2]; } 285 bool ffStrNE(const char[][] fields, size_t index1, size_t index2) { return fields[index1] != fields[index2]; } 286 bool ffIStrEQ(const char[][] fields, size_t index1, size_t index2) 287 { 288 return equal(fields[index1].asLowerCase, fields[index2].asLowerCase); 289 } 290 bool ffIStrNE(const char[][] fields, size_t index1, size_t index2) 291 { 292 return !equal(fields[index1].asLowerCase, fields[index2].asLowerCase); 293 } 294 295 auto AbsDiff(double v1, double v2) { return (v1 - v2).abs; } 296 auto RelDiff(double v1, double v2) { return (v1 - v2).abs / min(v1.abs, v2.abs); } 297 298 bool ffAbsDiffLE(const char[][] fields, size_t index1, size_t index2, double value) 299 { 300 return AbsDiff(fields[index1].to!double, fields[index2].to!double) <= value; 301 } 302 bool ffAbsDiffGT(const char[][] fields, size_t index1, size_t 
index2, double value) 303 { 304 return AbsDiff(fields[index1].to!double, fields[index2].to!double) > value; 305 } 306 bool ffRelDiffLE(const char[][] fields, size_t index1, size_t index2, double value) 307 { 308 return RelDiff(fields[index1].to!double, fields[index2].to!double) <= value; 309 } 310 bool ffRelDiffGT(const char[][] fields, size_t index1, size_t index2, double value) 311 { 312 return RelDiff(fields[index1].to!double, fields[index2].to!double) > value; 313 } 314 315 /* Command line option handlers - There is a command line option handler for each 316 * predicate type. That is, one each for FieldUnaryPredicate, FieldVsNumberPredicate, 317 * etc. Option handlers are passed the tests array, the predicate function, and the 318 * command line option arguments. A FieldsPredicate delegate is created and appended to 319 * the tests array. An exception is thrown if errors are detected while processing the 320 * option, the error text is intended for the end user. 321 * 322 * These option handlers have similar functionality, differing in option processing and 323 * error message generation. fieldVsNumberOptionHandler is described as an example. It 324 * handles command options such as '--lt 3:1000', which tests field 3 for a values less 325 * than 1000. It is passed the tests array, the 'numLE' function to use for the test, and 326 * the string "3:1000" representing the option value. It parses the option value into 327 * field index (unsigned int) and value (double). These are wrapped in a FieldsPredicate 328 * which is added to the tests array. An error is signaled if the option string is invalid. 329 * 330 * During processing, fields indexes are converted from one-based to zero-based. As an 331 * optimization, the maximum field index is also tracked. This allows early termination of 332 * line splitting. 
333 */ 334 335 void fieldUnaryOptionHandler( 336 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldUnaryPredicate fn, string option, string optionVal) 337 { 338 size_t field; 339 try field = optionVal.to!size_t; 340 catch (Exception exc) 341 { 342 throw new Exception( 343 format("Invalid value in option: '--%s %s'. Expected: '--%s <field>' where field is a 1-upped integer.", 344 option, optionVal, option)); 345 } 346 347 if (field == 0) 348 { 349 throw new Exception( 350 format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal)); 351 } 352 353 immutable size_t zeroBasedIndex = field - 1; 354 tests ~= makeFieldUnaryDelegate(fn, zeroBasedIndex); 355 maxFieldIndex = (zeroBasedIndex > maxFieldIndex) ? zeroBasedIndex : maxFieldIndex; 356 } 357 358 void fieldVsNumberOptionHandler( 359 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsNumberPredicate fn, string option, string optionVal) 360 { 361 immutable valSplit = findSplit(optionVal, ":"); 362 if (valSplit[1].length == 0 || valSplit[2].length == 0) 363 { 364 throw new Exception( 365 format("Invalid option: '%s %s'. Expected: '%s <field>:<val>' where <field> and <val> are numbers.", 366 option, optionVal, option)); 367 } 368 size_t field; 369 double value; 370 try 371 { 372 field = valSplit[0].to!size_t; 373 value = valSplit[2].to!double; 374 } 375 catch (Exception exc) 376 { 377 throw new Exception( 378 format("Invalid numeric values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> and <val> are numbers.", 379 option, optionVal, option)); 380 } 381 382 if (field == 0) 383 { 384 throw new Exception( 385 format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal)); 386 } 387 immutable size_t zeroBasedIndex = field - 1; 388 tests ~= makeFieldVsNumberDelegate(fn, zeroBasedIndex, value); 389 maxFieldIndex = (zeroBasedIndex > maxFieldIndex) ? 
zeroBasedIndex : maxFieldIndex; 390 } 391 392 void fieldVsStringOptionHandler( 393 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsStringPredicate fn, string option, string optionVal) 394 { 395 immutable valSplit = findSplit(optionVal, ":"); 396 if (valSplit[1].length == 0 || valSplit[2].length == 0) 397 { 398 throw new Exception( 399 format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a string.", 400 option, optionVal, option)); 401 } 402 size_t field; 403 string value; 404 try 405 { 406 field = valSplit[0].to!size_t; 407 value = valSplit[2].to!string; 408 } 409 catch (Exception exc) 410 { 411 throw new Exception( 412 format("Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> a string.", 413 option, optionVal, option)); 414 } 415 416 if (field == 0) 417 { 418 throw new Exception( 419 format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal)); 420 } 421 immutable size_t zeroBasedIndex = field - 1; 422 tests ~= makeFieldVsStringDelegate(fn, zeroBasedIndex, value); 423 maxFieldIndex = (zeroBasedIndex > maxFieldIndex) ? zeroBasedIndex : maxFieldIndex; 424 } 425 426 /* The fieldVsIStringOptionHandler lower-cases the command line argument, assuming the 427 * case-insensitive comparison will be done on lower-cased values. 428 */ 429 void fieldVsIStringOptionHandler( 430 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsIStringPredicate fn, string option, string optionVal) 431 { 432 immutable valSplit = findSplit(optionVal, ":"); 433 if (valSplit[1].length == 0 || valSplit[2].length == 0) 434 { 435 throw new Exception( 436 format("Invalid option: '--%s %s'. 
Expected: '--%s <field>:<val>' where <field> is a number and <val> is a string.", 437 option, optionVal, option)); 438 } 439 size_t field; 440 string value; 441 try 442 { 443 field = valSplit[0].to!size_t; 444 value = valSplit[2].to!string; 445 } 446 catch (Exception exc) 447 { 448 throw new Exception( 449 format("Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> a string.", 450 option, optionVal, option)); 451 } 452 453 if (field == 0) 454 { 455 throw new Exception( 456 format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal)); 457 } 458 immutable size_t zeroBasedIndex = field - 1; 459 tests ~= makeFieldVsIStringDelegate(fn, zeroBasedIndex, value.to!dstring.toLower); 460 maxFieldIndex = (zeroBasedIndex > maxFieldIndex) ? zeroBasedIndex : maxFieldIndex; 461 } 462 463 void fieldVsRegexOptionHandler( 464 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsRegexPredicate fn, string option, string optionVal, 465 bool caseSensitive) 466 { 467 immutable valSplit = findSplit(optionVal, ":"); 468 if (valSplit[1].length == 0 || valSplit[2].length == 0) 469 { 470 throw new Exception( 471 format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a regular expression.", 472 option, optionVal, option)); 473 } 474 size_t field; 475 Regex!char value; 476 try 477 { 478 immutable modifiers = caseSensitive ? "" : "i"; 479 field = valSplit[0].to!size_t; 480 value = regex(valSplit[2], modifiers); 481 } 482 catch (Exception exc) 483 { 484 throw new Exception( 485 format("Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a regular expression.", 486 option, optionVal, option)); 487 } 488 489 if (field == 0) 490 { 491 throw new Exception( 492 format("Invalid option: '--%s %s'. 
Zero is not a valid field index.", option, optionVal)); 493 } 494 immutable size_t zeroBasedIndex = field - 1; 495 tests ~= makeFieldVsRegexDelegate(fn, zeroBasedIndex, value); 496 maxFieldIndex = (zeroBasedIndex > maxFieldIndex) ? zeroBasedIndex : maxFieldIndex; 497 } 498 499 void fieldVsFieldOptionHandler( 500 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsFieldPredicate fn, string option, string optionVal) 501 { 502 immutable valSplit = findSplit(optionVal, ":"); 503 if (valSplit[1].length == 0 || valSplit[2].length == 0) 504 { 505 throw new Exception( 506 format("Invalid option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.", 507 option, optionVal, option)); 508 } 509 size_t field1; 510 size_t field2; 511 try 512 { 513 field1 = valSplit[0].to!size_t; 514 field2 = valSplit[2].to!size_t; 515 } 516 catch (Exception exc) 517 { 518 throw new Exception( 519 format("Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.", 520 option, optionVal, option)); 521 } 522 523 if (field1 == 0 || field2 == 0) 524 { 525 throw new Exception( 526 format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal)); 527 } 528 529 if (field1 == field2) 530 { 531 throw new Exception( 532 format("Invalid option: '--%s %s'. 
Field1 and field2 must be different fields", option, optionVal)); 533 } 534 535 immutable size_t zeroBasedIndex1 = field1 - 1; 536 immutable size_t zeroBasedIndex2 = field2 - 1; 537 tests ~= makeFieldVsFieldDelegate(fn, zeroBasedIndex1, zeroBasedIndex2); 538 maxFieldIndex = max(maxFieldIndex, zeroBasedIndex1, zeroBasedIndex2); 539 } 540 541 542 void fieldFieldNumOptionHandler( 543 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldFieldNumPredicate fn, string option, string optionVal) 544 { 545 size_t field1; 546 size_t field2; 547 double value; 548 immutable valSplit = findSplit(optionVal, ":"); 549 auto invalidOption = (valSplit[1].length == 0 || valSplit[2].length == 0); 550 551 if (!invalidOption) 552 { 553 immutable valSplit2 = findSplit(valSplit[2], ":"); 554 invalidOption = (valSplit2[1].length == 0 || valSplit2[2].length == 0); 555 556 if (!invalidOption) 557 { 558 try 559 { 560 field1 = valSplit[0].to!size_t; 561 field2 = valSplit2[0].to!size_t; 562 value = valSplit2[2].to!double; 563 } 564 catch (Exception exc) 565 { 566 invalidOption = true; 567 } 568 } 569 } 570 571 if (invalidOption) 572 { 573 throw new Exception( 574 format("Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>:<num>' where fields are 1-upped integers.", 575 option, optionVal, option)); 576 } 577 if (field1 == 0 || field2 == 0) 578 { 579 throw new Exception( 580 format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal)); 581 } 582 if (field1 == field2) 583 { 584 throw new Exception( 585 format("Invalid option: '--%s %s'. 
Field1 and field2 must be different fields", option, optionVal)); 586 } 587 588 immutable size_t zeroBasedIndex1 = field1 - 1; 589 immutable size_t zeroBasedIndex2 = field2 - 1; 590 tests ~= makeFieldFieldNumDelegate(fn, zeroBasedIndex1, zeroBasedIndex2, value); 591 maxFieldIndex = max(maxFieldIndex, zeroBasedIndex1, zeroBasedIndex2); 592 } 593 594 /** Command line options - This struct holds the results of command line option processing. 595 * It also has a method, processArgs, that invokes command line arg processing. 596 */ 597 struct TsvFilterOptions 598 { 599 string programName; 600 FieldsPredicate[] tests; // Derived from tests 601 size_t maxFieldIndex; // Derived from tests 602 bool hasHeader = false; // --H|header 603 bool invert = false; // --invert 604 bool disjunct = false; // --or 605 char delim = '\t'; // --delimiter 606 bool helpVerbose = false; // --help-verbose 607 bool helpOptions = false; // --help-options 608 bool versionWanted = false; // --V|version 609 610 /* Returns a tuple. First value is true if command line arguments were successfully 611 * processed and execution should continue, or false if an error occurred or the user 612 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 613 * 614 * Returning true (execution continues) means args have been validated and the 615 * tests array has been established. 616 */ 617 auto processArgs (ref string[] cmdArgs) 618 { 619 import std.getopt; 620 import std.path : baseName, stripExtension; 621 import tsv_utils.common.getopt_inorder; 622 623 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 624 625 /* Command option handlers - One handler for each option. These conform to the 626 * getopt required handler signature, and separate knowledge the specific command 627 * option text from the option processing. 
628 */ 629 void handlerFldEmpty(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldEmpty, option, value); } 630 void handlerFldNotEmpty(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotEmpty, option, value); } 631 void handlerFldBlank(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldBlank, option, value); } 632 void handlerFldNotBlank(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotBlank, option, value); } 633 634 void handlerFldIsNumeric(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNumeric, option, value); } 635 void handlerFldIsFinite(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsFinite, option, value); } 636 void handlerFldIsNaN(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNaN, option, value); } 637 void handlerFldIsInfinity(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsInfinity, option, value); } 638 639 void handlerNumLE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLE, option, value); } 640 void handlerNumLT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLT, option, value); } 641 void handlerNumGE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGE, option, value); } 642 void handlerNumGT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGT, option, value); } 643 void handlerNumEQ(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numEQ, option, value); } 644 void handlerNumNE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numNE, option, value); } 645 646 void handlerStrLE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLE, option, value); } 
/* Option handlers for the string, regular expression, and field-vs-field tests.
 * Each handler receives the (option, value) pair for one command line option and
 * forwards it, together with 'tests', 'maxFieldIndex', and the address of the
 * test function for that option, to the matching *OptionHandler helper.
 */
void handlerStrLT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLT, option, value); }
void handlerStrGE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGE, option, value); }
void handlerStrGT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGT, option, value); }
void handlerStrEQ(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strEQ, option, value); }
void handlerStrNE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNE, option, value); }

void handlerStrInFld(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strInFld, option, value); }
void handlerStrNotInFld(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNotInFld, option, value); }

void handlerIStrEQ(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrEQ, option, value); }
void handlerIStrNE(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNE, option, value); }
void handlerIStrInFld(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrInFld, option, value); }
void handlerIStrNotInFld(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNotInFld, option, value); }

/* The regex handlers share two test functions. The trailing bool is 'true' for the
 * --regex/--not-regex handlers and 'false' for the --iregex/--not-iregex handlers;
 * presumably it selects case-sensitive matching - confirm against
 * fieldVsRegexOptionHandler.
 */
void handlerRegexMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexMatch, option, value, true); }
void handlerRegexNotMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexNotMatch, option, value, true); }
void handlerIRegexMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexMatch, option, value, false); }
void handlerIRegexNotMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexNotMatch, option, value, false); }

void handlerFFLE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLE, option, value); }
void handlerFFLT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLT, option, value); }
void handlerFFGE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGE, option, value); }
void handlerFFGT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGT, option, value); }
void handlerFFEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffEQ, option, value); }
void handlerFFNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffNE, option, value); }

void handlerFFStrEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrEQ, option, value); }
void handlerFFStrNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrNE, option, value); }
void handlerFFIStrEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrEQ, option, value); }
void handlerFFIStrNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrNE, option, value); }

void handlerFFAbsDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffLE, option, value); }
void handlerFFAbsDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffGT, option, value); }
void handlerFFRelDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffLE, option, value); }
void handlerFFRelDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffGT, option, value); }

try
{
    arraySep = ",";    // Use comma to separate values in command line options
686 auto r = getoptInorder( 687 cmdArgs, 688 "help-verbose", " Print full help.", &helpVerbose, 689 "help-options", " Print the options list by itself.", &helpOptions, 690 std.getopt.config.caseSensitive, 691 "V|version", " Print version information and exit.", &versionWanted, 692 "H|header", " Treat the first line of each file as a header.", &hasHeader, 693 std.getopt.config.caseInsensitive, 694 "or", " Evaluate tests as an OR rather than an AND.", &disjunct, 695 std.getopt.config.caseSensitive, 696 "v|invert", " Invert the filter, printing lines that do not match.", &invert, 697 std.getopt.config.caseInsensitive, 698 "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, 699 700 "empty", "FIELD True if field is empty.", &handlerFldEmpty, 701 "not-empty", "FIELD True if field is not empty.", &handlerFldNotEmpty, 702 "blank", "FIELD True if field is empty or all whitespace.", &handlerFldBlank, 703 "not-blank", "FIELD True if field contains a non-whitespace character.", &handlerFldNotBlank, 704 705 "is-numeric", "FIELD True if field is interpretable as a number.", &handlerFldIsNumeric, 706 "is-finite", "FIELD True if field is interpretable as a number and is not NaN or infinity.", &handlerFldIsFinite, 707 "is-nan", "FIELD True if field is NaN.", &handlerFldIsNaN, 708 "is-infinity", "FIELD True if field is infinity.", &handlerFldIsInfinity, 709 710 "le", "FIELD:NUM FIELD <= NUM (numeric).", &handlerNumLE, 711 "lt", "FIELD:NUM FIELD < NUM (numeric).", &handlerNumLT, 712 "ge", "FIELD:NUM FIELD >= NUM (numeric).", &handlerNumGE, 713 "gt", "FIELD:NUM FIELD > NUM (numeric).", &handlerNumGT, 714 "eq", "FIELD:NUM FIELD == NUM (numeric).", &handlerNumEQ, 715 "ne", "FIELD:NUM FIELD != NUM (numeric).", &handlerNumNE, 716 717 "str-le", "FIELD:STR FIELD <= STR (string).", &handlerStrLE, 718 "str-lt", "FIELD:STR FIELD < STR (string).", &handlerStrLT, 719 "str-ge", "FIELD:STR FIELD >= STR (string).", &handlerStrGE, 720 "str-gt", 
"FIELD:STR FIELD > STR (string).", &handlerStrGT, 721 "str-eq", "FIELD:STR FIELD == STR (string).", &handlerStrEQ, 722 "istr-eq", "FIELD:STR FIELD == STR (string, case-insensitive).", &handlerIStrEQ, 723 "str-ne", "FIELD:STR FIELD != STR (string).", &handlerStrNE, 724 "istr-ne", "FIELD:STR FIELD != STR (string, case-insensitive).", &handlerIStrNE, 725 "str-in-fld", "FIELD:STR FIELD contains STR (substring search).", &handlerStrInFld, 726 "istr-in-fld", "FIELD:STR FIELD contains STR (substring search, case-insensitive).", &handlerIStrInFld, 727 "str-not-in-fld", "FIELD:STR FIELD does not contain STR (substring search).", &handlerStrNotInFld, 728 "istr-not-in-fld", "FIELD:STR FIELD does not contain STR (substring search, case-insensitive).", &handlerIStrNotInFld, 729 730 "regex", "FIELD:REGEX FIELD matches regular expression.", &handlerRegexMatch, 731 "iregex", "FIELD:REGEX FIELD matches regular expression, case-insensitive.", &handlerIRegexMatch, 732 "not-regex", "FIELD:REGEX FIELD does not match regular expression.", &handlerRegexNotMatch, 733 "not-iregex", "FIELD:REGEX FIELD does not match regular expression, case-insensitive.", &handlerIRegexNotMatch, 734 735 "ff-le", "FIELD1:FIELD2 FIELD1 <= FIELD2 (numeric).", &handlerFFLE, 736 "ff-lt", "FIELD1:FIELD2 FIELD1 < FIELD2 (numeric).", &handlerFFLT, 737 "ff-ge", "FIELD1:FIELD2 FIELD1 >= FIELD2 (numeric).", &handlerFFGE, 738 "ff-gt", "FIELD1:FIELD2 FIELD1 > FIELD2 (numeric).", &handlerFFGT, 739 "ff-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (numeric).", &handlerFFEQ, 740 "ff-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (numeric).", &handlerFFNE, 741 "ff-str-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string).", &handlerFFStrEQ, 742 "ff-istr-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string, case-insensitive).", &handlerFFIStrEQ, 743 "ff-str-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string).", &handlerFFStrNE, 744 "ff-istr-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string, case-insensitive).", &handlerFFIStrNE, 745 746 "ff-absdiff-le", 
"FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) <= NUM", &handlerFFAbsDiffLE,
                "ff-absdiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) > NUM", &handlerFFAbsDiffGT,
                "ff-reldiff-le", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) <= NUM", &handlerFFRelDiffLE,
                "ff-reldiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) > NUM", &handlerFFRelDiffGT,
                );

            /* Both help texts are a bit long. In this case, for "regular" help, don't
             * print options, just the text. The text summarizes the options.
             */
            if (r.helpWanted)
            {
                stdout.write(helpText);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpOptions)
            {
                defaultGetoptPrinter(helpTextOptions, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-filter"));
                return tuple(false, 0);
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/** Reads the input files line by line, runs the tests held in cmdopt against each
 *  line, and writes the selected lines to standard output.
 *
 *  Params:
 *    cmdopt     = Parsed options. Uses tests, maxFieldIndex, delim, hasHeader,
 *                 disjunct (any test vs. all tests), and invert.
 *    inputFiles = Files to read. "-" means standard input; standard input is also
 *                 read when the array is empty.
 *
 *  Throws: Exception when a line has fewer fields than the tests reference, and
 *  when running the tests or writing a line throws. In the latter case the
 *  original message is rethrown with the file name and line number appended.
 */
void tsvFilter(in TsvFilterOptions cmdopt, in string[] inputFiles)
{
    import std.algorithm : all, any, splitter;
    import std.range;
    import tsv_utils.common.utils : BufferedOutputRange, bufferedByLine, throwIfWindowsNewlineOnUnix;

    /* Output is written through a BufferedOutputRange. To keep sparse output
     * responsive, the buffer is flushed explicitly whenever a line is written after
     * more than 'flushThreshold' input lines have been read since the last flush.
     * The counter starts above the threshold so the first selected line is flushed
     * immediately.
     */
    enum flushThreshold = 1024;
    size_t linesSinceFlush = flushThreshold + 1;

    auto output = BufferedOutputRange!(typeof(stdout))(stdout);

    /* One slot for each field index up to the highest one the tests use. The array
     * is allocated once and overwritten for every line.
     */
    auto fields = new char[][](cmdopt.maxFieldIndex + 1);
    const long lastNeededIndex = cast(long) cmdopt.maxFieldIndex;
    bool headerEmitted = false;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        const readingStdin = (filename == "-");
        const displayName = readingStdin ? "Standard Input" : filename;
        auto input = readingStdin ? stdin : File(filename);

        foreach (lineNum, line; input.bufferedByLine.enumerate(1))
        {
            if (lineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, lineNum);

                if (cmdopt.hasHeader)
                {
                    /* Header line. Written for the first file only and never tested. */
                    if (!headerEmitted)
                    {
                        output.appendln(line);
                        headerEmitted = true;
                    }
                    continue;
                }
            }

            /* Store the leading fields of the line, stopping as soon as the last
             * field needed by the tests has been stored.
             */
            int lastFilled = -1;
            foreach (fieldValue; line.splitter(cmdopt.delim))
            {
                if (lastFilled == lastNeededIndex) break;
                fields[++lastFilled] = fieldValue;
            }

            if (lastFilled == -1)
            {
                /* Bug work-around. Empty lines are not handled properly by splitter,
                 * which produces no fields for them.
                 *   Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                 *   Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                 * Work-around: Use the line itself as the first field. It's an empty string.
                 */
                assert(line.length == 0);
                fields[++lastFilled] = line;
            }

            if (lastFilled < lastNeededIndex)
            {
                throw new Exception(
                    format("Not enough fields in line. File: %s, Line: %s", displayName, lineNum));
            }

            /* Run the tests. A test throws when a field cannot be converted to the
             * type it expects; the catch block adds file and line context.
             */
            try
            {
                linesSinceFlush++;

                bool selected;
                if (cmdopt.disjunct) selected = cmdopt.tests.any!(test => test(fields));
                else selected = cmdopt.tests.all!(test => test(fields));

                /* With --invert the line is written exactly when the tests did not select it. */
                if (selected != cmdopt.invert)
                {
                    bool flushed = output.appendln(line);
                    if (!flushed && linesSinceFlush > flushThreshold)
                    {
                        output.flush();
                        flushed = true;
                    }
                    if (flushed) linesSinceFlush = 0;
                }
            }
            catch (Exception exc)
            {
                const headerHint = (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : "";
                throw new Exception(
                    format("Could not process line or field: %s\n File: %s Line: %s%s",
                           exc.msg, displayName, lineNum, headerHint));
            }
        }
    }
}