/**
Command line tool that filters TSV files.

This tool filters tab-delimited files based on numeric or string comparisons
against specific fields. See the helpText string for details.

Copyright (c) 2015-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_filter;

import std.algorithm : canFind, equal, findSplit, max, min;
import std.conv : to;
import std.format : format;
import std.math : abs, isFinite, isInfinity, isNaN;
import std.regex;
import std.stdio;
import std.string : isNumeric;
import std.typecons : tuple;
import std.uni: asLowerCase, toLower;

/* The program has two main parts, command line arg processing and processing the input
 * files. Much of the work is in command line arg processing. This sets up the tests run
 * against each input line. The tests are an array of delegates (closures) run against the
 * fields in the line. The tests are based on command line arguments, of which there is
 * a lengthy set, one for each test.
 */

/** Main program. Invokes command line arg processing and tsv-filter to perform
 * the real work. Any errors are caught and reported.
 *
 * Params:
 *   cmdArgs = The raw command line; cmdArgs[0] is the program name.
 *
 * Returns: The exit code produced by processArgs when it asks to stop, 1 if
 *   tsvFilter throws an Exception, otherwise 0.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvFilterOptions cmdopt;
    auto r = cmdopt.processArgs(cmdArgs);

    /* r[0] is false when processing should stop; r[1] is then the exit code. */
    if (!r[0]) return r[1];

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    /* processArgs receives cmdArgs by ref; everything after the program name is
     * handed to tsvFilter.
     */
    try tsvFilter(cmdopt, cmdArgs[1..$]);
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        return 1;
    }
    return 0;
}

/* Short help text. The difference-comparison example uses '--ff-absdiff-le', one of
 * the operators listed on its Syntax line, and names the fields actually passed (1:3).
 */
auto helpText = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter tab-delimited files for matching lines via comparison tests against
individual fields. Use '--help-verbose' for a more detailed description.

Global options:
  --help-verbose      Print full help.
  --help-options      Print the options list by itself.
  --V|version         Print version information and exit.
  --H|header          Treat the first line of each file as a header.
  --or                Evaluate tests as an OR rather than an AND clause.
  --v|invert          Invert the filter, printing lines that do not match.
  --d|delimiter CHR   Field delimiter. Default: TAB.

Operators:
* Test if a field is empty (no characters) or blank (empty or whitespace only).
  Syntax:  --empty|not-empty|blank|not-blank  FIELD
  Example: --empty 5          // True if field 5 is empty

* Test if a field is numeric, finite, NaN, or infinity
  Syntax:  --is-numeric|is-finite|is-nan|is-infinity FIELD
  Example: --is-numeric 5 --gt 5:100  // Ensure field 5 is numeric before --gt test.

* Compare a field to a number (integer or float)
  Syntax:  --eq|ne|lt|le|gt|ge  FIELD:NUM
  Example: --lt 5:1000 --gt 2:0.5  // True if (field 5 < 1000) and (field 2 > 0.5)

* Compare a field to a string
  Syntax:  --str-eq|str-ne  FIELD:STR
  Example: --str-eq 3:abc        // True if field 3 is "abc"

* Test if a field contains a string (substring search)
  Syntax:  --str-in-fld|str-not-in-fld|istr-in-fld|istr-not-in-fld  FIELD:STR
  Example: --str-in-fld 1:hello  // True if field 1 contains "hello"

* Test if a field matches a regular expression.
  Syntax:  --regex|iregex|not-regex|not-iregex  FIELD:REGEX
  Example: --regex '3:ab*c'      // True if field 3 contains "ac", "abc", "abbc", etc.

* Field to field comparisons - Similar to field vs literal comparisons, but field vs field.
  Syntax:  --ff-eq|ff-ne|ff-lt|ff-le|ff-gt|ff-ge  FIELD1:FIELD2
           --ff-str-eq|ff-str-ne|ff-istr-eq|ff-istr-ne  FIELD1:FIELD2
  Example: --ff-eq 2:4           // True if fields 2 and 4 are numerically equivalent
           --ff-str-eq 2:4       // True if fields 2 and 4 are the same strings

* Field to field difference comparisons - Absolute and relative difference
  Syntax:  --ff-absdiff-le|ff-absdiff-gt FIELD1:FIELD2:NUM
           --ff-reldiff-le|ff-reldiff-gt FIELD1:FIELD2:NUM
  Example: --ff-absdiff-le 1:3:0.25   // True if abs(field 1 - field 3) <= 0.25

EOS";

/* Long-form help text; it ends with an "Options:" heading. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter lines of tab-delimited files via comparison tests against fields. Multiple
tests can be specified, by default they are evaluated as AND clause. Lines
satisfying the tests are written to standard output.

Typical test syntax is '--op field:value', where 'op' is an operator, 'field' is a
1-based field index, and 'value' is the comparison basis. For example, '--lt 3:500'
tests if field 3 is less than 500. A more complete example:

  tsv-filter --header --gt 1:50 --lt 1:100 --le 2:1000 data.tsv

This outputs all lines from file data.tsv where field 1 is greater than 50 and less
than 100, and field 2 is less than or equal to 1000. The header is also output.

Tests available include:
  * Test if a field is empty (no characters) or blank (empty or whitespace only).
  * Test if a field is interpretable as a number, a finite number, NaN, or Infinity.
  * Compare a field to a number - Numeric equality and relational tests.
  * Compare a field to a string - String equality and relational tests.
  * Test if a field matches a regular expression. Case sensitive or insensitive.
  * Test if a field contains a string. Sub-string search, case sensitive or insensitive.
  * Field to field comparisons - Similar to the other tests, except comparing
    one field to another in the same line.

Details:
  * The run is aborted if there are not enough fields in an input line.
  * Numeric tests will fail and abort the run if a field cannot be interpreted as a
    number. This includes fields with no text. To avoid this use '--is-numeric' or
    '--is-finite' prior to the numeric test. For example, '--is-numeric 5 --gt 5:100'
    ensures field 5 is numeric before running the --gt test.
  * Regular expression syntax is defined by the D programming language. They follow
    common conventions (perl, python, etc.). Most common forms work as expected.

Options:
EOS";

/* Synopsis plus an "Options:" heading only. */
auto helpTextOptions = q"EOS
Synopsis: tsv-filter [options] [file...]

Options:
EOS";

/* The next blocks of code define the structure of the boolean tests run against input lines.
 * This includes function and delegate (closure) signatures, creation mechanisms, option
 * handlers, etc. Command line arg processing to build the test structure.
 */

/* FieldsPredicate delegate signature - Each input line is run against a set of boolean
 * tests. Each test is a 'FieldsPredicate'. A FieldsPredicate is a delegate (closure)
 * containing all info about the test except the field values of the line being tested.
 * These delegates are created as part of command line arg processing. The wrapped data
 * includes operation, field indexes, literal values, etc. At run-time the delegate is
 * passed one argument, the split input line.
 */
alias FieldsPredicate = bool delegate(const char[][] fields);
/* FieldsPredicate function signatures - The aliases below are the plain function
 * signatures that get wrapped into FieldsPredicate delegates. Every alias is paired
 * with a 'make' function that binds a predicate function to its closure arguments
 * and hands back a FieldsPredicate delegate. Predicate types are:
 *
 * - FieldUnaryPredicate - Test based on a single field. (e.g. --empty 4)
 * - FieldVsNumberPredicate - Test based on a field index (used to get the field value)
 *   and a fixed numeric value. For example, field 2 less than 100 (--lt 2:100).
 * - FieldVsStringPredicate - Test based on a field and a string. (e.g. --str-eq 2:abc)
 * - FieldVsIStringPredicate - Case-insensitive test based on a field and a string.
 *   (e.g. --istr-eq 2:abc)
 * - FieldVsRegexPredicate - Test based on a field and a regex. (e.g. --regex '2:ab*c')
 * - FieldVsFieldPredicate - Test based on two fields. (e.g. --ff-le 2:4).
 * - FieldFieldNumPredicate - Test based on two fields and a number.
 *   (e.g. --ff-absdiff-le 1:3:0.25)
 *
 * When invoked, a FieldsPredicate receives the fields of the line and combines them
 * with the bound closure args to run the test. A "field less than value" test, for
 * instance, picks the field out of the fields array, converts its text to a number,
 * and then applies the less-than comparison.
 */
alias FieldUnaryPredicate     = bool function(const char[][] fields, size_t index);
alias FieldVsNumberPredicate  = bool function(const char[][] fields, size_t index, double value);
alias FieldVsStringPredicate  = bool function(const char[][] fields, size_t index, string value);
alias FieldVsIStringPredicate = bool function(const char[][] fields, size_t index, dstring value);
alias FieldVsRegexPredicate   = bool function(const char[][] fields, size_t index, Regex!char value);
alias FieldVsFieldPredicate   = bool function(const char[][] fields, size_t index1, size_t index2);
alias FieldFieldNumPredicate  = bool function(const char[][] fields, size_t index1, size_t index2, double value);

/* Binds a single-field predicate to its field index. */
FieldsPredicate makeFieldUnaryDelegate(FieldUnaryPredicate fn, size_t index)
{
    return delegate bool(const char[][] fields) { return fn(fields, index); };
}

/* Binds a field-vs-number predicate to its field index and numeric value. */
FieldsPredicate makeFieldVsNumberDelegate(FieldVsNumberPredicate fn, size_t index, double value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/* Binds a field-vs-string predicate to its field index and string value. */
FieldsPredicate makeFieldVsStringDelegate(FieldVsStringPredicate fn, size_t index, string value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/* Binds a case-insensitive field-vs-string predicate to its field index and dstring value. */
FieldsPredicate makeFieldVsIStringDelegate(FieldVsIStringPredicate fn, size_t index, dstring value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/* Binds a field-vs-regex predicate to its field index and compiled regex. */
FieldsPredicate makeFieldVsRegexDelegate(FieldVsRegexPredicate fn, size_t index, Regex!char value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/* Binds a field-vs-field predicate to its two field indexes. */
FieldsPredicate makeFieldVsFieldDelegate(FieldVsFieldPredicate fn, size_t index1, size_t index2)
{
    return delegate bool(const char[][] fields) { return fn(fields, index1, index2); };
}

/* Binds a two-field-plus-number predicate to its field indexes and numeric value. */
FieldsPredicate makeFieldFieldNumDelegate(FieldFieldNumPredicate fn, size_t index1, size_t index2, double value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index1, index2, value); };
}
/* Predicate functions - The concrete functions wrapped by FieldsPredicate delegates.
 * There is one per command line operator, and each has the signature of one of the
 * FieldsPredicate function aliases.
 */

/* Empty / blank tests. "Blank" means the whole field matches `^\s*$`. */
bool fldEmpty(const char[][] fields, size_t index)
{
    return fields[index].length == 0;
}

bool fldNotEmpty(const char[][] fields, size_t index)
{
    return fields[index].length != 0;
}

bool fldBlank(const char[][] fields, size_t index)
{
    auto blankMatch = fields[index].matchFirst(ctRegex!`^\s*$`);
    return !blankMatch.empty;
}

bool fldNotBlank(const char[][] fields, size_t index)
{
    auto blankMatch = fields[index].matchFirst(ctRegex!`^\s*$`);
    return blankMatch.empty;
}

/* Numeric classification tests. The isNumeric check runs first, so the conversion
 * to double is only attempted on text that passed it.
 */
bool fldIsNumeric(const char[][] fields, size_t index)
{
    return isNumeric(fields[index]);
}

bool fldIsFinite(const char[][] fields, size_t index)
{
    const text = fields[index];
    return isNumeric(text) && isFinite(to!double(text));
}

bool fldIsNaN(const char[][] fields, size_t index)
{
    const text = fields[index];
    return isNumeric(text) && isNaN(to!double(text));
}

bool fldIsInfinity(const char[][] fields, size_t index)
{
    const text = fields[index];
    return isNumeric(text) && isInfinity(to!double(text));
}

/* Field vs number. The field text is converted with to!double on every call. */
bool numLE(const char[][] fields, size_t index, double val)
{
    immutable fieldValue = to!double(fields[index]);
    return fieldValue <= val;
}

bool numLT(const char[][] fields, size_t index, double val)
{
    immutable fieldValue = to!double(fields[index]);
    return fieldValue < val;
}

bool numGE(const char[][] fields, size_t index, double val)
{
    immutable fieldValue = to!double(fields[index]);
    return fieldValue >= val;
}

bool numGT(const char[][] fields, size_t index, double val)
{
    immutable fieldValue = to!double(fields[index]);
    return fieldValue > val;
}

bool numEQ(const char[][] fields, size_t index, double val)
{
    immutable fieldValue = to!double(fields[index]);
    return fieldValue == val;
}

bool numNE(const char[][] fields, size_t index, double val)
{
    immutable fieldValue = to!double(fields[index]);
    return fieldValue != val;
}

/* Field vs string, using the built-in array comparison operators. */
bool strLE(const char[][] fields, size_t index, string val)
{
    const text = fields[index];
    return text <= val;
}

bool strLT(const char[][] fields, size_t index, string val)
{
    const text = fields[index];
    return text < val;
}

bool strGE(const char[][] fields, size_t index, string val)
{
    const text = fields[index];
    return text >= val;
}

bool strGT(const char[][] fields, size_t index, string val)
{
    const text = fields[index];
    return text > val;
}

bool strEQ(const char[][] fields, size_t index, string val)
{
    const text = fields[index];
    return text == val;
}

bool strNE(const char[][] fields, size_t index, string val)
{
    const text = fields[index];
    return text != val;
}

/* Substring search within a field. */
bool strInFld(const char[][] fields, size_t index, string val)
{
    return canFind(fields[index], val);
}

bool strNotInFld(const char[][] fields, size_t index, string val)
{
    return !canFind(fields[index], val);
}

/* Note: For istr predicates, the command line value has been lower-cased by
 * fieldVsIStringOptionHandler, so only the field needs lower-casing here.
 */
bool istrEQ(const char[][] fields, size_t index, dstring val)
{
    return equal(fields[index].asLowerCase, val);
}

bool istrNE(const char[][] fields, size_t index, dstring val)
{
    return !equal(fields[index].asLowerCase, val);
}

bool istrInFld(const char[][] fields, size_t index, dstring val)
{
    return canFind(fields[index].asLowerCase, val);
}

bool istrNotInFld(const char[][] fields, size_t index, dstring val)
{
    return !canFind(fields[index].asLowerCase, val);
}
/* Note: The regex value itself carries the case-sensitivity setting, so the same two
 * predicates serve both the case-sensitive and the case-insensitive regex operators.
 */
bool regexMatch(const char[][] fields, size_t index, Regex!char val)
{
    auto captures = fields[index].matchFirst(val);
    return !captures.empty;
}

bool regexNotMatch(const char[][] fields, size_t index, Regex!char val)
{
    auto captures = fields[index].matchFirst(val);
    return captures.empty;
}

/* Field vs field, numeric. Both fields are converted with to!double, first index1
 * and then index2.
 */
bool ffLE(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = to!double(fields[index1]);
    immutable rhs = to!double(fields[index2]);
    return lhs <= rhs;
}

bool ffLT(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = to!double(fields[index1]);
    immutable rhs = to!double(fields[index2]);
    return lhs < rhs;
}

bool ffGE(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = to!double(fields[index1]);
    immutable rhs = to!double(fields[index2]);
    return lhs >= rhs;
}

bool ffGT(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = to!double(fields[index1]);
    immutable rhs = to!double(fields[index2]);
    return lhs > rhs;
}

bool ffEQ(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = to!double(fields[index1]);
    immutable rhs = to!double(fields[index2]);
    return lhs == rhs;
}

bool ffNE(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = to!double(fields[index1]);
    immutable rhs = to!double(fields[index2]);
    return lhs != rhs;
}

/* Field vs field, string. The istr variants lower-case both sides before comparing. */
bool ffStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = fields[index1];
    const rhs = fields[index2];
    return lhs == rhs;
}

bool ffStrNE(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = fields[index1];
    const rhs = fields[index2];
    return lhs != rhs;
}

bool ffIStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    auto lhs = fields[index1].asLowerCase;
    auto rhs = fields[index2].asLowerCase;
    return equal(lhs, rhs);
}

bool ffIStrNE(const char[][] fields, size_t index1, size_t index2)
{
    auto lhs = fields[index1].asLowerCase;
    auto rhs = fields[index2].asLowerCase;
    return !equal(lhs, rhs);
}

/* Absolute difference: |v1 - v2|. */
auto AbsDiff(double v1, double v2)
{
    return abs(v1 - v2);
}

/* Relative difference: |v1 - v2| divided by the smaller of |v1| and |v2|. */
auto RelDiff(double v1, double v2)
{
    return abs(v1 - v2) / min(abs(v1), abs(v2));
}

/* Field vs field difference tests against a fixed numeric threshold. */
bool ffAbsDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable lhs = to!double(fields[index1]);
    immutable rhs = to!double(fields[index2]);
    return AbsDiff(lhs, rhs) <= value;
}

bool ffAbsDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable lhs = to!double(fields[index1]);
    immutable rhs = to!double(fields[index2]);
    return AbsDiff(lhs, rhs) > value;
}

bool ffRelDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable lhs = to!double(fields[index1]);
    immutable rhs = to!double(fields[index2]);
    return RelDiff(lhs, rhs) <= value;
}

bool ffRelDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable lhs = to!double(fields[index1]);
    immutable rhs = to!double(fields[index2]);
    return RelDiff(lhs, rhs) > value;
}
/* Command line option handlers - There is a command line option handler for each
 * predicate type. That is, one each for FieldUnaryPredicate, FieldVsNumberPredicate,
 * etc. Option handlers are passed the tests array, the predicate function, and the
 * command line option arguments. A FieldsPredicate delegate is created and appended to
 * the tests array. An exception is thrown if errors are detected while processing the
 * option, the error text is intended for the end user.
 *
 * These option handlers have similar functionality, differing in option processing and
 * error message generation. fieldVsNumberOptionHandler is described as an example. It
 * handles command options such as '--lt 3:1000', which tests field 3 for a value less
 * than 1000. It is passed the tests array, the 'numLT' function to use for the test, and
 * the string "3:1000" representing the option value. It parses the option value into
 * field index (unsigned int) and value (double). These are wrapped in a FieldsPredicate
 * which is added to the tests array. An error is signaled if the option string is invalid.
 *
 * During processing, fields indexes are converted from one-based to zero-based. As an
 * optimization, the maximum field index is also tracked. This allows early termination of
 * line splitting.
 */

/* Handles options of the form '--op <field>'.
 *
 * Params:
 *   tests         = Test list; the new predicate delegate is appended.
 *   maxFieldIndex = Raised to the zero-based field index if that is larger.
 *   fn            = Predicate function to bind.
 *   option        = Option name, used in error messages.
 *   optionVal     = Option value text, expected to be a 1-based field number.
 *
 * Throws: Exception if optionVal is not an unsigned integer or is zero.
 */
void fieldUnaryOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldUnaryPredicate fn, string option, string optionVal)
{
    size_t field;
    try field = optionVal.to!size_t;
    catch (Exception exc)
    {
        throw new Exception(
            format("Invalid value in option: '--%s %s'. Expected: '--%s <field>' where field is a 1-upped integer.",
                   option, optionVal, option));
    }

    if (field == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }

    size_t zeroBasedIndex = field - 1;
    tests ~= makeFieldUnaryDelegate(fn, zeroBasedIndex);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex);
}

/* Handles options of the form '--op <field>:<num>'.
 *
 * Params:
 *   tests         = Test list; the new predicate delegate is appended.
 *   maxFieldIndex = Raised to the zero-based field index if that is larger.
 *   fn            = Predicate function to bind.
 *   option        = Option name, used in error messages.
 *   optionVal     = Option value text, split at the first ':'.
 *
 * Throws: Exception if there is no ':' or nothing after it, if either part fails
 *   numeric conversion, or if the field number is zero.
 */
void fieldVsNumberOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsNumberPredicate fn, string option, string optionVal)
{
    auto valSplit = findSplit(optionVal, ":");
    if (valSplit[1].length == 0 || valSplit[2].length == 0)
    {
        /* The option is echoed with its '--' prefix, as in every other handler message. */
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> and <val> are numbers.",
                   option, optionVal, option));
    }
    size_t field;
    double value;
    try
    {
        field = valSplit[0].to!size_t;
        value = valSplit[2].to!double;
    }
    catch (Exception exc)
    {
        throw new Exception(
            format("Invalid numeric values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> and <val> are numbers.",
                   option, optionVal, option));
    }

    if (field == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }
    size_t zeroBasedIndex = field - 1;
    tests ~= makeFieldVsNumberDelegate(fn, zeroBasedIndex, value);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex);
}

/* Handles options of the form '--op <field>:<str>'.
 *
 * Params:
 *   tests         = Test list; the new predicate delegate is appended.
 *   maxFieldIndex = Raised to the zero-based field index if that is larger.
 *   fn            = Predicate function to bind.
 *   option        = Option name, used in error messages.
 *   optionVal     = Option value text, split at the first ':'; the remainder
 *                   (including any further ':') is the string value.
 *
 * Throws: Exception if there is no ':' or nothing after it, if the field part is
 *   not an unsigned integer, or if the field number is zero.
 */
void fieldVsStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsStringPredicate fn, string option, string optionVal)
{
    auto valSplit = findSplit(optionVal, ":");
    if (valSplit[1].length == 0 || valSplit[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a string.",
                   option, optionVal, option));
    }
    size_t field;
    string value;
    try
    {
        field = valSplit[0].to!size_t;
        value = valSplit[2].to!string;
    }
    catch (Exception exc)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> a string.",
                   option, optionVal, option));
    }

    if (field == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }
    size_t zeroBasedIndex = field - 1;
    tests ~= makeFieldVsStringDelegate(fn, zeroBasedIndex, value);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex);
}

/* The fieldVsIStringOptionHandler lower-cases the command line argument, assuming the
 * case-insensitive comparison will be done on lower-cased values.
 *
 * Parameters and errors are the same as for fieldVsStringOptionHandler; the value is
 * converted to a lower-cased dstring before being bound.
 */
void fieldVsIStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsIStringPredicate fn, string option, string optionVal)
{
    auto valSplit = findSplit(optionVal, ":");
    if (valSplit[1].length == 0 || valSplit[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a string.",
                   option, optionVal, option));
    }
    size_t field;
    string value;
    try
    {
        field = valSplit[0].to!size_t;
        value = valSplit[2].to!string;
    }
    catch (Exception exc)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> a string.",
                   option, optionVal, option));
    }

    if (field == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }
    size_t zeroBasedIndex = field - 1;
    tests ~= makeFieldVsIStringDelegate(fn, zeroBasedIndex, value.to!dstring.toLower);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex);
}

/* Handles options of the form '--op <field>:<regex>'.
 *
 * Params:
 *   tests         = Test list; the new predicate delegate is appended.
 *   maxFieldIndex = Raised to the zero-based field index if that is larger.
 *   fn            = Predicate function to bind.
 *   option        = Option name, used in error messages.
 *   optionVal     = Option value text, split at the first ':'.
 *   caseSensitive = When false the regex is compiled with the "i" modifier.
 *
 * Throws: Exception if there is no ':' or nothing after it, if the field part is
 *   not an unsigned integer, if compiling the regex throws, or if the field
 *   number is zero.
 */
void fieldVsRegexOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsRegexPredicate fn, string option, string optionVal,
    bool caseSensitive)
{
    auto valSplit = findSplit(optionVal, ":");
    if (valSplit[1].length == 0 || valSplit[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a regular expression.",
                   option, optionVal, option));
    }
    size_t field;
    Regex!char value;
    try
    {
        auto modifiers = caseSensitive ? "" : "i";
        field = valSplit[0].to!size_t;
        value = regex(valSplit[2], modifiers);
    }
    catch (Exception exc)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a regular expression.",
                   option, optionVal, option));
    }

    if (field == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }
    size_t zeroBasedIndex = field - 1;
    tests ~= makeFieldVsRegexDelegate(fn, zeroBasedIndex, value);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex);
}

/* Handles options of the form '--op <field1>:<field2>'.
 *
 * Params:
 *   tests         = Test list; the new predicate delegate is appended.
 *   maxFieldIndex = Raised to the larger zero-based field index if that is larger.
 *   fn            = Predicate function to bind.
 *   option        = Option name, used in error messages.
 *   optionVal     = Option value text, split at the first ':'.
 *
 * Throws: Exception if there is no ':' or nothing after it, if either part is not
 *   an unsigned integer, if either field number is zero, or if both are equal.
 */
void fieldVsFieldOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsFieldPredicate fn, string option, string optionVal)
{
    auto valSplit = findSplit(optionVal, ":");
    if (valSplit[1].length == 0 || valSplit[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.",
                   option, optionVal, option));
    }
    size_t field1;
    size_t field2;
    try
    {
        field1 = valSplit[0].to!size_t;
        field2 = valSplit[2].to!size_t;
    }
    catch (Exception exc)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.",
                   option, optionVal, option));
    }

    if (field1 == 0 || field2 == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }

    if (field1 == field2)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));
    }

    size_t zeroBasedIndex1 = field1 - 1;
    size_t zeroBasedIndex2 = field2 - 1;
    tests ~= makeFieldVsFieldDelegate(fn, zeroBasedIndex1, zeroBasedIndex2);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex1, zeroBasedIndex2);
}


/* Handles options of the form '--op <field1>:<field2>:<num>'.
 *
 * Params:
 *   tests         = Test list; the new predicate delegate is appended.
 *   maxFieldIndex = Raised to the larger zero-based field index if that is larger.
 *   fn            = Predicate function to bind.
 *   option        = Option name, used in error messages.
 *   optionVal     = Option value text, split at the first two ':' characters.
 *
 * Throws: Exception if either ':' is missing or has nothing after it, if any part
 *   fails numeric conversion, if either field number is zero, or if both are equal.
 */
void fieldFieldNumOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldFieldNumPredicate fn, string option, string optionVal)
{
    size_t field1;
    size_t field2;
    double value;
    auto valSplit = findSplit(optionVal, ":");
    auto invalidOption = (valSplit[1].length == 0 || valSplit[2].length == 0);

    if (!invalidOption)
    {
        /* Second split separates <field2> from <num>. */
        auto valSplit2 = findSplit(valSplit[2], ":");
        invalidOption = (valSplit2[1].length == 0 || valSplit2[2].length == 0);

        if (!invalidOption)
        {
            try
            {
                field1 = valSplit[0].to!size_t;
                field2 = valSplit2[0].to!size_t;
                value = valSplit2[2].to!double;
            }
            catch (Exception exc)
            {
                invalidOption = true;
            }
        }
    }

    /* Syntax and conversion failures share one error message. */
    if (invalidOption)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>:<num>' where fields are 1-upped integers.",
                   option, optionVal, option));
    }
    if (field1 == 0 || field2 == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }
    if (field1 == field2)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));
    }

    size_t zeroBasedIndex1 = field1 - 1;
    size_t zeroBasedIndex2 = field2 - 1;
    tests ~= makeFieldFieldNumDelegate(fn, zeroBasedIndex1, zeroBasedIndex2, value);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex1, zeroBasedIndex2);
}

/** Command line options - This struct holds the results of command line option processing.
593 * It also has a method, processArgs, that invokes command line arg processing. 594 */ 595 struct TsvFilterOptions 596 { 597 string programName; 598 FieldsPredicate[] tests; // Derived from tests 599 size_t maxFieldIndex; // Derived from tests 600 bool hasHeader = false; // --H|header 601 bool invert = false; // --invert 602 bool disjunct = false; // --or 603 char delim = '\t'; // --delimiter 604 bool helpVerbose = false; // --help-verbose 605 bool helpOptions = false; // --help-options 606 bool versionWanted = false; // --V|version 607 608 /* Returns a tuple. First value is true if command line arguments were successfully 609 * processed and execution should continue, or false if an error occurred or the user 610 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 611 * 612 * Returning true (execution continues) means args have been validated and the 613 * tests array has been established. 614 */ 615 auto processArgs (ref string[] cmdArgs) 616 { 617 import std.getopt; 618 import std.path : baseName, stripExtension; 619 import getopt_inorder; 620 621 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 622 623 /* Command option handlers - One handler for each option. These conform to the 624 * getopt required handler signature, and separate knowledge the specific command 625 * option text from the option processing. 
626 */ 627 void handlerFldEmpty(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldEmpty, option, value); } 628 void handlerFldNotEmpty(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotEmpty, option, value); } 629 void handlerFldBlank(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldBlank, option, value); } 630 void handlerFldNotBlank(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotBlank, option, value); } 631 632 void handlerFldIsNumeric(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNumeric, option, value); } 633 void handlerFldIsFinite(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsFinite, option, value); } 634 void handlerFldIsNaN(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNaN, option, value); } 635 void handlerFldIsInfinity(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsInfinity, option, value); } 636 637 void handlerNumLE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLE, option, value); } 638 void handlerNumLT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLT, option, value); } 639 void handlerNumGE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGE, option, value); } 640 void handlerNumGT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGT, option, value); } 641 void handlerNumEQ(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numEQ, option, value); } 642 void handlerNumNE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numNE, option, value); } 643 644 void handlerStrLE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLE, option, value); } 
645 void handlerStrLT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLT, option, value); } 646 void handlerStrGE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGE, option, value); } 647 void handlerStrGT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGT, option, value); } 648 void handlerStrEQ(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strEQ, option, value); } 649 void handlerStrNE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNE, option, value); } 650 651 void handlerStrInFld(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strInFld, option, value); } 652 void handlerStrNotInFld(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNotInFld, option, value); } 653 654 void handlerIStrEQ(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrEQ, option, value); } 655 void handlerIStrNE(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNE, option, value); } 656 void handlerIStrInFld(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrInFld, option, value); } 657 void handlerIStrNotInFld(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNotInFld, option, value); } 658 659 void handlerRegexMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, ®exMatch, option, value, true); } 660 void handlerRegexNotMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, ®exNotMatch, option, value, true); } 661 void handlerIRegexMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, ®exMatch, option, value, false); } 662 void handlerIRegexNotMatch(string option, string value) { fieldVsRegexOptionHandler(tests, 
maxFieldIndex, ®exNotMatch, option, value, false); } 663 664 void handlerFFLE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLE, option, value); } 665 void handlerFFLT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLT, option, value); } 666 void handlerFFGE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGE, option, value); } 667 void handlerFFGT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGT, option, value); } 668 void handlerFFEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffEQ, option, value); } 669 void handlerFFNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffNE, option, value); } 670 671 void handlerFFStrEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrEQ, option, value); } 672 void handlerFFStrNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrNE, option, value); } 673 void handlerFFIStrEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrEQ, option, value); } 674 void handlerFFIStrNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrNE, option, value); } 675 676 void handlerFFAbsDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffLE, option, value); } 677 void handlerFFAbsDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffGT, option, value); } 678 void handlerFFRelDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffLE, option, value); } 679 void handlerFFRelDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffGT, option, value); } 680 681 try 682 { 683 arraySep = ","; // Use comma to separate values in command line options 
684 auto r = getoptInorder( 685 cmdArgs, 686 "help-verbose", " Print full help.", &helpVerbose, 687 "help-options", " Print the options list by itself.", &helpOptions, 688 std.getopt.config.caseSensitive, 689 "V|version", " Print version information and exit.", &versionWanted, 690 "H|header", " Treat the first line of each file as a header.", &hasHeader, 691 std.getopt.config.caseInsensitive, 692 "or", " Evaluate tests as an OR rather than an AND.", &disjunct, 693 std.getopt.config.caseSensitive, 694 "v|invert", " Invert the filter, printing lines that do not match.", &invert, 695 std.getopt.config.caseInsensitive, 696 "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, 697 698 "empty", "FIELD True if field is empty.", &handlerFldEmpty, 699 "not-empty", "FIELD True if field is not empty.", &handlerFldNotEmpty, 700 "blank", "FIELD True if field is empty or all whitespace.", &handlerFldBlank, 701 "not-blank", "FIELD True if field contains a non-whitespace character.", &handlerFldNotBlank, 702 703 "is-numeric", "FIELD True if field is interpretable as a number.", &handlerFldIsNumeric, 704 "is-finite", "FIELD True if field is interpretable as a number and is not NaN or infinity.", &handlerFldIsFinite, 705 "is-nan", "FIELD True if field is NaN.", &handlerFldIsNaN, 706 "is-infinity", "FIELD True if field is infinity.", &handlerFldIsInfinity, 707 708 "le", "FIELD:NUM FIELD <= NUM (numeric).", &handlerNumLE, 709 "lt", "FIELD:NUM FIELD < NUM (numeric).", &handlerNumLT, 710 "ge", "FIELD:NUM FIELD >= NUM (numeric).", &handlerNumGE, 711 "gt", "FIELD:NUM FIELD > NUM (numeric).", &handlerNumGT, 712 "eq", "FIELD:NUM FIELD == NUM (numeric).", &handlerNumEQ, 713 "ne", "FIELD:NUM FIELD != NUM (numeric).", &handlerNumNE, 714 715 "str-le", "FIELD:STR FIELD <= STR (string).", &handlerStrLE, 716 "str-lt", "FIELD:STR FIELD < STR (string).", &handlerStrLT, 717 "str-ge", "FIELD:STR FIELD >= STR (string).", &handlerStrGE, 718 "str-gt", 
"FIELD:STR FIELD > STR (string).", &handlerStrGT, 719 "str-eq", "FIELD:STR FIELD == STR (string).", &handlerStrEQ, 720 "istr-eq", "FIELD:STR FIELD == STR (string, case-insensitive).", &handlerIStrEQ, 721 "str-ne", "FIELD:STR FIELD != STR (string).", &handlerStrNE, 722 "istr-ne", "FIELD:STR FIELD != STR (string, case-insensitive).", &handlerIStrNE, 723 "str-in-fld", "FIELD:STR FIELD contains STR (substring search).", &handlerStrInFld, 724 "istr-in-fld", "FIELD:STR FIELD contains STR (substring search, case-insensitive).", &handlerIStrInFld, 725 "str-not-in-fld", "FIELD:STR FIELD does not contain STR (substring search).", &handlerStrNotInFld, 726 "istr-not-in-fld", "FIELD:STR FIELD does not contain STR (substring search, case-insensitive).", &handlerIStrNotInFld, 727 728 "regex", "FIELD:REGEX FIELD matches regular expression.", &handlerRegexMatch, 729 "iregex", "FIELD:REGEX FIELD matches regular expression, case-insensitive.", &handlerIRegexMatch, 730 "not-regex", "FIELD:REGEX FIELD does not match regular expression.", &handlerRegexNotMatch, 731 "not-iregex", "FIELD:REGEX FIELD does not match regular expression, case-insensitive.", &handlerIRegexNotMatch, 732 733 "ff-le", "FIELD1:FIELD2 FIELD1 <= FIELD2 (numeric).", &handlerFFLE, 734 "ff-lt", "FIELD1:FIELD2 FIELD1 < FIELD2 (numeric).", &handlerFFLT, 735 "ff-ge", "FIELD1:FIELD2 FIELD1 >= FIELD2 (numeric).", &handlerFFGE, 736 "ff-gt", "FIELD1:FIELD2 FIELD1 > FIELD2 (numeric).", &handlerFFGT, 737 "ff-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (numeric).", &handlerFFEQ, 738 "ff-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (numeric).", &handlerFFNE, 739 "ff-str-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string).", &handlerFFStrEQ, 740 "ff-istr-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string, case-insensitive).", &handlerFFIStrEQ, 741 "ff-str-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string).", &handlerFFStrNE, 742 "ff-istr-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string, case-insensitive).", &handlerFFIStrNE, 743 744 "ff-absdiff-le", 
"FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) <= NUM", &handlerFFAbsDiffLE, 745 "ff-absdiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) > NUM", &handlerFFAbsDiffGT, 746 "ff-reldiff-le", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) <= NUM", &handlerFFRelDiffLE, 747 "ff-reldiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) > NUM", &handlerFFRelDiffGT, 748 ); 749 750 /* Both help texts are a bit long. In this case, for "regular" help, don't 751 * print options, just the text. The text summarizes the options. 752 */ 753 if (r.helpWanted) 754 { 755 stdout.write(helpText); 756 return tuple(false, 0); 757 } 758 else if (helpVerbose) 759 { 760 defaultGetoptPrinter(helpTextVerbose, r.options); 761 return tuple(false, 0); 762 } 763 else if (helpOptions) 764 { 765 defaultGetoptPrinter(helpTextOptions, r.options); 766 return tuple(false, 0); 767 } 768 else if (versionWanted) 769 { 770 import tsvutils_version; 771 writeln(tsvutilsVersionNotice("tsv-filter")); 772 return tuple(false, 0); 773 } 774 } 775 catch (Exception exc) 776 { 777 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 778 return tuple(false, 1); 779 } 780 return tuple(true, 0); 781 } 782 } 783 784 /** tsvFilter processes the input files and runs the tests. 785 */ 786 void tsvFilter(in TsvFilterOptions cmdopt, in string[] inputFiles) 787 { 788 import std.algorithm : all, any, splitter; 789 import std.range; 790 import tsvutil : BufferedOutputRange, throwIfWindowsNewlineOnUnix; 791 792 /* BufferedOutputRange improves performance on narrow files with high percentages of 793 * writes. Want responsive output if output is rare, so ensure the first matched 794 * line is written, and that writes separated by long stretches of non-matched lines 795 * are written. 
796 */ 797 enum maxInputLinesWithoutBufferFlush = 1024; 798 size_t inputLinesWithoutBufferFlush = maxInputLinesWithoutBufferFlush + 1; 799 800 auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); 801 802 /* Process each input file, one line at a time. */ 803 auto lineFields = new char[][](cmdopt.maxFieldIndex + 1); 804 bool headerWritten = false; 805 foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"]) 806 { 807 auto inputStream = (filename == "-") ? stdin : filename.File(); 808 foreach (lineNum, line; inputStream.byLine.enumerate(1)) 809 { 810 if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum); 811 if (lineNum == 1 && cmdopt.hasHeader) 812 { 813 /* Header. Output on the first file, skip subsequent files. */ 814 if (!headerWritten) 815 { 816 bufferedOutput.appendln(line); 817 headerWritten = true; 818 } 819 } 820 else 821 { 822 /* Copy the needed number of fields to the fields array. */ 823 int fieldIndex = -1; 824 foreach (fieldValue; line.splitter(cmdopt.delim)) 825 { 826 if (fieldIndex == cast(long) cmdopt.maxFieldIndex) break; 827 fieldIndex++; 828 lineFields[fieldIndex] = fieldValue; 829 } 830 831 if (fieldIndex == -1) 832 { 833 assert(line.length == 0); 834 /* Bug work-around. Currently empty lines are not handled properly by splitter. 835 * Bug: https://issues.dlang.org/show_bug.cgi?id=15735 836 * Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030 837 * Work-around: Point to the line. It's an empty string. 838 */ 839 fieldIndex++; 840 lineFields[fieldIndex] = line; 841 } 842 843 if (fieldIndex < cast(long) cmdopt.maxFieldIndex) 844 { 845 throw new Exception( 846 format("Not enough fields in line. File: %s, Line: %s", 847 (filename == "-") ? "Standard Input" : filename, lineNum)); 848 } 849 850 /* Run the tests. Tests will fail (throw) if a field cannot be converted 851 * to the expected type. 852 */ 853 try 854 { 855 inputLinesWithoutBufferFlush++; 856 bool passed = cmdopt.disjunct ? 
857 cmdopt.tests.any!(x => x(lineFields)) : 858 cmdopt.tests.all!(x => x(lineFields)); 859 if (cmdopt.invert) passed = !passed; 860 if (passed) 861 { 862 bool wasFlushed = bufferedOutput.appendln(line); 863 if (wasFlushed) inputLinesWithoutBufferFlush = 0; 864 else if (inputLinesWithoutBufferFlush > maxInputLinesWithoutBufferFlush) 865 { 866 bufferedOutput.flush; 867 inputLinesWithoutBufferFlush = 0; 868 } 869 } 870 } 871 catch (Exception exc) 872 { 873 throw new Exception( 874 format("Could not process line or field: %s\n File: %s Line: %s%s", 875 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum, 876 (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : "")); 877 } 878 } 879 } 880 } 881 }