/**
Command line tool that filters TSV files.

This tool filters tab-delimited files based on numeric or string comparisons
against specific fields. See the helpText string for details.

Copyright (c) 2015-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_filter;

import std.algorithm : canFind, equal, findSplit, max, min;
import std.conv : to;
import std.format : format;
import std.math : abs, isFinite, isInfinity, isNaN;
import std.regex;
import std.stdio;
import std.string : isNumeric;
import std.typecons : tuple;
import std.uni: asLowerCase, toLower;

/* The program has two main parts, command line arg processing and processing the input
 * files. Much of the work is in command line arg processing. This sets up the tests run
 * against each input line. The tests are an array of delegates (closures) run against the
 * fields in the line. The tests are based on command line arguments, of which there is
 * a lengthy set, one for each test.
 */

/* Program entry point.
 *
 * Processes the command line into a TsvFilterOptions value, then runs tsvFilter over the
 * remaining arguments (cmdArgs[1..$]).
 *
 * Returns: the exit code supplied by processArgs when it signals that execution should
 * not continue; 1 if tsvFilter throws an Exception (the message is written to stderr);
 * 0 otherwise.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvFilterOptions cmdopt;

    /* r[0] is the 'continue' flag, r[1] the exit code to use when it is false. */
    auto r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];

    /* LDC profile builds only: reset the profile data once arg processing is done
     * (presumably so it is excluded from the collected profile).
     */
    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    try tsvFilter(cmdopt, cmdArgs[1..$]);
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        return 1;
    }
    return 0;
}

/* Short help text. The ff-absdiff example uses an operator that actually exists
 * (--ff-absdiff-le) and names the fields given in the example.
 */
auto helpText = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter tab-delimited files for matching lines via comparison tests against
individual fields. Use '--help-verbose' for a more detailed description.

Global options:
  --help-verbose      Print full help.
  --help-options      Print the options list by itself.
  --V|version         Print version information and exit.
  --H|header          Treat the first line of each file as a header.
  --or                Evaluate tests as an OR rather than an AND clause.
  --v|invert          Invert the filter, printing lines that do not match.
  --d|delimiter CHR   Field delimiter. Default: TAB.

Operators:
* Test if a field is empty (no characters) or blank (empty or whitespace only).
  Syntax:  --empty|not-empty|blank|not-blank FIELD
  Example: --empty 5 // True if field 5 is empty

* Test if a field is numeric, finite, NaN, or infinity
  Syntax:  --is-numeric|is-finite|is-nan|is-infinity FIELD
  Example: --is-numeric 5 --gt 5:100 // Ensure field 5 is numeric before --gt test.

* Compare a field to a number (integer or float)
  Syntax:  --eq|ne|lt|le|gt|ge FIELD:NUM
  Example: --lt 5:1000 --gt 2:0.5 // True if (field 5 < 1000) and (field 2 > 0.5)

* Compare a field to a string
  Syntax:  --str-eq|str-ne FIELD:STR
  Example: --str-eq 3:abc // True if field 3 is "abc"

* Test if a field contains a string (substring search)
  Syntax:  --str-in-fld|str-not-in-fld|istr-in-fld|istr-not-in-fld FIELD:STR
  Example: --str-in-fld 1:hello // True if field 1 contains "hello"

* Test if a field matches a regular expression.
  Syntax:  --regex|iregex|not-regex|not-iregex FIELD:REGEX
  Example: --regex '3:ab*c' // True if field 3 contains "ac", "abc", "abbc", etc.

* Field to field comparisons - Similar to field vs literal comparisons, but field vs field.
  Syntax:  --ff-eq|ff-ne|ff-lt|ff-le|ff-gt|ff-ge FIELD1:FIELD2
           --ff-str-eq|ff-str-ne|ff-istr-eq|ff-istr-ne FIELD1:FIELD2
  Example: --ff-eq 2:4 // True if fields 2 and 4 are numerically equivalent
           --ff-str-eq 2:4 // True if fields 2 and 4 are the same strings

* Field to field difference comparisons - Absolute and relative difference
  Syntax:  --ff-absdiff-le|ff-absdiff-gt FIELD1:FIELD2:NUM
           --ff-reldiff-le|ff-reldiff-gt FIELD1:FIELD2:NUM
  Example: --ff-absdiff-le 1:3:0.25 // True if abs(field 1 - field 3) <= 0.25

EOS";

/* Preamble of the full help (--help-verbose). It ends with an "Options:" heading. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter lines of tab-delimited files via comparison tests against fields. Multiple
tests can be specified, by default they are evaluated as AND clause. Lines
satisfying the tests are written to standard output.

Typical test syntax is '--op field:value', where 'op' is an operator, 'field' is a
1-based field index, and 'value' is the comparison basis. For example, '--lt 3:500'
tests if field 3 is less than 500. A more complete example:

  tsv-filter --header --gt 1:50 --lt 1:100 --le 2:1000 data.tsv

This outputs all lines from file data.tsv where field 1 is greater than 50 and less
than 100, and field 2 is less than or equal to 1000. The header is also output.

Tests available include:
  * Test if a field is empty (no characters) or blank (empty or whitespace only).
  * Test if a field is interpretable as a number, a finite number, NaN, or Infinity.
  * Compare a field to a number - Numeric equality and relational tests.
  * Compare a field to a string - String equality and relational tests.
  * Test if a field matches a regular expression. Case sensitive or insensitive.
  * Test if a field contains a string. Sub-string search, case sensitive or insensitive.
  * Field to field comparisons - Similar to the other tests, except comparing
    one field to another in the same line.

Details:
  * The run is aborted if there are not enough fields in an input line.
  * Numeric tests will fail and abort the run if a field cannot be interpreted as a
    number. This includes fields with no text. To avoid this use '--is-numeric' or
    '--is-finite' prior to the numeric test. For example, '--is-numeric 5 --gt 5:100'
    ensures field 5 is numeric before running the --gt test.
  * Regular expression syntax is defined by the D programming language. They follow
    common conventions (perl, python, etc.). Most common forms work as expected.

Options:
EOS";

/* Preamble of the options-only help (--help-options). It ends with an "Options:" heading. */
auto helpTextOptions = q"EOS
Synopsis: tsv-filter [options] [file...]

Options:
EOS";

/**
The next blocks of code define the structure of the boolean tests run against input lines.
This includes function and delegate (closure) signatures, creation mechanisms, option
handlers, etc. Command line arg processing uses these pieces to build the test structure.
*/

/* FieldsPredicate delegate signature - Each input line is run against a set of boolean
 * tests. Each test is a 'FieldsPredicate'. A FieldsPredicate is a delegate (closure)
 * containing all info about the test except the field values of the line being tested.
 * These delegates are created as part of command line arg processing. The wrapped data
 * includes operation, field indexes, literal values, etc. At run-time the delegate is
 * passed one argument, the split input line.
 */
alias FieldsPredicate = bool delegate(const char[][] fields);

/* FieldsPredicate function signatures - These aliases represent the different function
 * signatures used in FieldsPredicate delegates. Each alias has a corresponding 'make'
 * function.
The 'make' function takes a real predicate function and closure args and
 * returns a FieldsPredicate delegate. Predicate types are:
 *
 * - FieldUnaryPredicate - Test based on a single field. (e.g. --empty 4)
 * - FieldVsNumberPredicate - Test based on a field index (used to get the field value)
 *   and a fixed numeric value. For example, field 2 less than 100 (--lt 2:100).
 * - FieldVsStringPredicate - Test based on a field and a string. (e.g. --str-eq 2:abc)
 * - FieldVsIStringPredicate - Case-insensitive test based on a field and a string.
 *   (e.g. --istr-eq 2:abc)
 * - FieldVsRegexPredicate - Test based on a field and a regex. (e.g. --regex '2:ab*c')
 * - FieldVsFieldPredicate - Test based on two fields. (e.g. --ff-le 2:4).
 * - FieldFieldNumPredicate - Test based on two fields and a fixed numeric value.
 *   (e.g. --ff-absdiff-le 2:4:0.5)
 *
 * An actual FieldsPredicate takes the fields from the line and the closure args and
 * runs the test. For example, a function testing if a field is less than a specific
 * value would pull the specified field from the fields array, convert the string to
 * a number, then run the less-than test.
*/
alias FieldUnaryPredicate = bool function(const char[][] fields, size_t index);
alias FieldVsNumberPredicate = bool function(const char[][] fields, size_t index, double value);
alias FieldVsStringPredicate = bool function(const char[][] fields, size_t index, string value);
alias FieldVsIStringPredicate = bool function(const char[][] fields, size_t index, dstring value);
alias FieldVsRegexPredicate = bool function(const char[][] fields, size_t index, Regex!char value);
alias FieldVsFieldPredicate = bool function(const char[][] fields, size_t index1, size_t index2);
alias FieldFieldNumPredicate = bool function(const char[][] fields, size_t index1, size_t index2, double value);

/* The 'make' functions below each wrap a predicate function plus its fixed arguments
 * in a closure. The returned delegate takes only the split input line and forwards it,
 * together with the captured arguments, to the predicate function.
 */

/* Closure over a single-field test: captures the field index. */
FieldsPredicate makeFieldUnaryDelegate(FieldUnaryPredicate fn, size_t index)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index);
    };
}

/* Closure over a field vs number test: captures the field index and the number. */
FieldsPredicate makeFieldVsNumberDelegate(FieldVsNumberPredicate fn, size_t index, double value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index, value);
    };
}

/* Closure over a field vs string test: captures the field index and the string. */
FieldsPredicate makeFieldVsStringDelegate(FieldVsStringPredicate fn, size_t index, string value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index, value);
    };
}

/* Closure over a case-insensitive field vs string test: captures the field index and
 * the dstring value.
 */
FieldsPredicate makeFieldVsIStringDelegate(FieldVsIStringPredicate fn, size_t index, dstring value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index, value);
    };
}

/* Closure over a field vs regex test: captures the field index and the regex. */
FieldsPredicate makeFieldVsRegexDelegate(FieldVsRegexPredicate fn, size_t index, Regex!char value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index, value);
    };
}

/* Closure over a field vs field test: captures both field indexes. */
FieldsPredicate makeFieldVsFieldDelegate(FieldVsFieldPredicate fn, size_t index1, size_t index2)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index1, index2);
    };
}

/* Closure over a two-field plus number test: captures both field indexes and the number. */
FieldsPredicate makeFieldFieldNumDelegate(FieldFieldNumPredicate fn, size_t index1, size_t index2, double value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index1, index2, value);
    };
}

/* Predicate functions - These are the actual functions used in a FieldsPredicate.
They
 * mirror the operators available via command line args one-for-one. Each has the
 * signature of one of the FieldsPredicate function aliases defined above.
 */

/* Empty / blank tests. 'Blank' means the whole field matches the regex ^\s*$. */
bool fldEmpty(const char[][] fields, size_t index)
{
    return !fields[index].length;
}
bool fldNotEmpty(const char[][] fields, size_t index)
{
    return fields[index].length > 0;
}
bool fldBlank(const char[][] fields, size_t index)
{
    return !matchFirst(fields[index], ctRegex!`^\s*$`).empty;
}
bool fldNotBlank(const char[][] fields, size_t index)
{
    return matchFirst(fields[index], ctRegex!`^\s*$`).empty;
}

/* Numeric classification tests. The field is converted to double only after
 * isNumeric has accepted it, so these tests do not throw on non-numeric text.
 */
bool fldIsNumeric(const char[][] fields, size_t index)
{
    return isNumeric(fields[index]);
}
bool fldIsFinite(const char[][] fields, size_t index)
{
    if (!isNumeric(fields[index])) return false;
    return isFinite(to!double(fields[index]));
}
bool fldIsNaN(const char[][] fields, size_t index)
{
    if (!isNumeric(fields[index])) return false;
    return isNaN(to!double(fields[index]));
}
bool fldIsInfinity(const char[][] fields, size_t index)
{
    if (!isNumeric(fields[index])) return false;
    return isInfinity(to!double(fields[index]));
}

/* Field vs number tests. The field is converted with to!double, which throws if the
 * text is not convertible.
 */
bool numLE(const char[][] fields, size_t index, double val)
{
    const fieldValue = to!double(fields[index]);
    return fieldValue <= val;
}
bool numLT(const char[][] fields, size_t index, double val)
{
    const fieldValue = to!double(fields[index]);
    return fieldValue < val;
}
bool numGE(const char[][] fields, size_t index, double val)
{
    const fieldValue = to!double(fields[index]);
    return fieldValue >= val;
}
bool numGT(const char[][] fields, size_t index, double val)
{
    const fieldValue = to!double(fields[index]);
    return fieldValue > val;
}
bool numEQ(const char[][] fields, size_t index, double val)
{
    const fieldValue = to!double(fields[index]);
    return fieldValue == val;
}
bool numNE(const char[][] fields, size_t index, double val)
{
    const fieldValue = to!double(fields[index]);
    return fieldValue != val;
}

/* Field vs string tests, using the built-in array comparison operators. */
bool strLE(const char[][] fields, size_t index, string val)
{
    return val >= fields[index];
}
bool strLT(const char[][] fields, size_t index, string val)
{
    return val > fields[index];
}
bool strGE(const char[][] fields, size_t index, string val)
{
    return val <= fields[index];
}
bool strGT(const char[][] fields, size_t index, string val)
{
    return val < fields[index];
}
bool strEQ(const char[][] fields, size_t index, string val)
{
    return val == fields[index];
}
bool strNE(const char[][] fields, size_t index, string val)
{
    return !(val == fields[index]);
}

/* Substring tests. */
bool strInFld(const char[][] fields, size_t index, string val)
{
    return canFind(fields[index], val);
}
bool strNotInFld(const char[][] fields, size_t index, string val)
{
    return !canFind(fields[index], val);
}

/* Note: For istr predicates, the command line value has been lower-cased by
 * fieldVsIStringOptionHandler, so only the field needs lower-casing here.
 */
bool istrEQ(const char[][] fields, size_t index, dstring val)
{
    return equal(asLowerCase(fields[index]), val);
}
bool istrNE(const char[][] fields, size_t index, dstring val)
{
    return !equal(asLowerCase(fields[index]), val);
}
bool istrInFld(const char[][] fields, size_t index, dstring val)
{
    return canFind(asLowerCase(fields[index]), val);
}
bool istrNotInFld(const char[][] fields, size_t index, dstring val)
{
    return !canFind(asLowerCase(fields[index]), val);
}

/* Note: Case-sensitivity is built into the regex value, so these regex predicates are
 * used for both case-sensitive and case-insensitive regex operators.
*/
bool regexMatch(const char[][] fields, size_t index, Regex!char val)
{
    return !matchFirst(fields[index], val).empty;
}
bool regexNotMatch(const char[][] fields, size_t index, Regex!char val)
{
    return matchFirst(fields[index], val).empty;
}

/* Field vs field numeric tests. Both fields are converted with to!double, first
 * index1 and then index2. The conversion throws if the text is not convertible.
 */
bool ffLE(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = to!double(fields[index1]);
    const rhs = to!double(fields[index2]);
    return lhs <= rhs;
}
bool ffLT(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = to!double(fields[index1]);
    const rhs = to!double(fields[index2]);
    return lhs < rhs;
}
bool ffGE(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = to!double(fields[index1]);
    const rhs = to!double(fields[index2]);
    return lhs >= rhs;
}
bool ffGT(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = to!double(fields[index1]);
    const rhs = to!double(fields[index2]);
    return lhs > rhs;
}
bool ffEQ(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = to!double(fields[index1]);
    const rhs = to!double(fields[index2]);
    return lhs == rhs;
}
bool ffNE(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = to!double(fields[index1]);
    const rhs = to!double(fields[index2]);
    return lhs != rhs;
}

/* Field vs field string tests, case-sensitive and case-insensitive. */
bool ffStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    return fields[index2] == fields[index1];
}
bool ffStrNE(const char[][] fields, size_t index1, size_t index2)
{
    return !(fields[index2] == fields[index1]);
}
bool ffIStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    auto lowered1 = asLowerCase(fields[index1]);
    auto lowered2 = asLowerCase(fields[index2]);
    return equal(lowered1, lowered2);
}
bool ffIStrNE(const char[][] fields, size_t index1, size_t index2)
{
    auto lowered1 = asLowerCase(fields[index1]);
    auto lowered2 = asLowerCase(fields[index2]);
    return !equal(lowered1, lowered2);
}

/* Absolute difference: |v1 - v2|. */
auto AbsDiff(double v1, double v2)
{
    return abs(v1 - v2);
}

/* Relative difference: |v1 - v2| divided by the smaller of |v1| and |v2|.
 * The divisor is zero when either value is zero; no special handling is applied.
 */
auto RelDiff(double v1, double v2)
{
    const smallerMagnitude = min(abs(v1), abs(v2));
    return abs(v1 - v2) / smallerMagnitude;
}

/* Field vs field difference tests against a fixed numeric threshold. */
bool ffAbsDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    const diff = AbsDiff(to!double(fields[index1]), to!double(fields[index2]));
    return diff <= value;
}
bool ffAbsDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    const diff = AbsDiff(to!double(fields[index1]), to!double(fields[index2]));
    return diff > value;
}
bool ffRelDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    const diff = RelDiff(to!double(fields[index1]), to!double(fields[index2]));
    return diff <= value;
}
bool ffRelDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    const diff = RelDiff(to!double(fields[index1]), to!double(fields[index2]));
    return diff > value;
}

/* Command line option handlers - There is a command line option handler for each
 * predicate type. That is, one each for FieldUnaryPredicate, FieldVsNumberPredicate,
 * etc. Option handlers are passed the tests array, the predicate function, and the
 * command line option arguments. A FieldsPredicate delegate is created and appended to
 * the tests array. An exception is thrown if errors are detected while processing the
 * option; the error text is intended for the end user.
 *
 * These option handlers have similar functionality, differing in option processing and
 * error message generation. fieldVsNumberOptionHandler is described as an example. It
 * handles command options such as '--lt 3:1000', which tests field 3 for a value less
 * than 1000. It is passed the tests array, the 'numLT' function to use for the test, and
 * the string "3:1000" representing the option value. It parses the option value into
 * field index (unsigned int) and value (double). These are wrapped in a FieldsPredicate
 * which is added to the tests array. An error is signaled if the option string is invalid.
 *
 * During processing, field indexes are converted from one-based to zero-based. As an
 * optimization, the maximum field index is also tracked. This allows early termination of
 * line splitting.
*/

/* Handles options taking a single field, e.g. '--empty 4'.
 *
 * optionVal is parsed as a 1-based field index. On success a delegate binding 'fn' to
 * the zero-based index is appended to 'tests' and 'maxFieldIndex' is raised if needed.
 *
 * Throws: Exception, with end-user text, if optionVal is not an unsigned integer or is
 * zero.
 */
void fieldUnaryOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldUnaryPredicate fn, string option, string optionVal)
{
    size_t field;
    try field = optionVal.to!size_t;
    catch (Exception)
    {
        throw new Exception(
            format("Invalid value in option: '--%s %s'. Expected: '--%s <field>' where field is a 1-upped integer.",
                   option, optionVal, option));
    }

    if (field == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }

    size_t zeroBasedIndex = field - 1;
    tests ~= makeFieldUnaryDelegate(fn, zeroBasedIndex);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex);
}

/* Handles '<field>:<num>' options, e.g. '--lt 3:1000'.
 *
 * optionVal is split at the first ':' into a 1-based field index and a double. On
 * success a delegate binding 'fn' to the zero-based index and the number is appended
 * to 'tests' and 'maxFieldIndex' is raised if needed.
 *
 * Throws: Exception, with end-user text, if the ':' or the value part is missing, if
 * either part fails numeric conversion, or if the field index is zero.
 */
void fieldVsNumberOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsNumberPredicate fn, string option, string optionVal)
{
    auto valSplit = findSplit(optionVal, ":");
    if (valSplit[1].length == 0 || valSplit[2].length == 0)
    {
        /* The option name is shown with its '--' prefix, as in every other handler message. */
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> and <val> are numbers.",
                   option, optionVal, option));
    }
    size_t field;
    double value;
    try
    {
        field = valSplit[0].to!size_t;
        value = valSplit[2].to!double;
    }
    catch (Exception)
    {
        throw new Exception(
            format("Invalid numeric values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> and <val> are numbers.",
                   option, optionVal, option));
    }

    if (field == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }
    size_t zeroBasedIndex = field - 1;
    tests ~= makeFieldVsNumberDelegate(fn, zeroBasedIndex, value);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex);
}

/* Handles '<field>:<string>' options, e.g. '--str-eq 2:abc'.
 *
 * optionVal is split at the first ':' only, so the string part may itself contain ':'.
 * An empty string part is rejected.
 *
 * Throws: Exception, with end-user text, if the ':' or the string part is missing, if
 * the field part is not an unsigned integer, or if the field index is zero.
 */
void fieldVsStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsStringPredicate fn, string option, string optionVal)
{
    auto valSplit = findSplit(optionVal, ":");
    if (valSplit[1].length == 0 || valSplit[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a string.",
                   option, optionVal, option));
    }
    size_t field;
    string value;
    try
    {
        field = valSplit[0].to!size_t;
        value = valSplit[2].to!string;
    }
    catch (Exception)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> a string.",
                   option, optionVal, option));
    }

    if (field == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }
    size_t zeroBasedIndex = field - 1;
    tests ~= makeFieldVsStringDelegate(fn, zeroBasedIndex, value);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex);
}

/* The fieldVsIStringOptionHandler lower-cases the command line argument, assuming the
 * case-insensitive comparison will be done on lower-cased values. Parsing and error
 * handling otherwise match fieldVsStringOptionHandler; the value is handed to the
 * delegate as a lower-cased dstring.
 */
void fieldVsIStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsIStringPredicate fn, string option, string optionVal)
{
    auto valSplit = findSplit(optionVal, ":");
    if (valSplit[1].length == 0 || valSplit[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a string.",
                   option, optionVal, option));
    }
    size_t field;
    string value;
    try
    {
        field = valSplit[0].to!size_t;
        value = valSplit[2].to!string;
    }
    catch (Exception)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> a string.",
                   option, optionVal, option));
    }

    if (field == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }
    size_t zeroBasedIndex = field - 1;
    tests ~= makeFieldVsIStringDelegate(fn, zeroBasedIndex, value.to!dstring.toLower);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex);
}

/* Handles '<field>:<regex>' options, e.g. '--regex 2:ab*c'.
 *
 * optionVal is split at the first ':' only. The regex is compiled once, here, with the
 * "i" modifier when caseSensitive is false.
 *
 * Throws: Exception, with end-user text, if the ':' or the regex part is missing, if
 * the field part is not an unsigned integer, if regex construction throws, or if the
 * field index is zero.
 */
void fieldVsRegexOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsRegexPredicate fn, string option, string optionVal,
    bool caseSensitive)
{
    auto valSplit = findSplit(optionVal, ":");
    if (valSplit[1].length == 0 || valSplit[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a regular expression.",
                   option, optionVal, option));
    }
    size_t field;
    Regex!char value;
    try
    {
        auto modifiers = caseSensitive ? "" : "i";
        field = valSplit[0].to!size_t;
        value = regex(valSplit[2], modifiers);
    }
    catch (Exception)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a regular expression.",
                   option, optionVal, option));
    }

    if (field == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }
    size_t zeroBasedIndex = field - 1;
    tests ~= makeFieldVsRegexDelegate(fn, zeroBasedIndex, value);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex);
}

/* Handles '<field1>:<field2>' options, e.g. '--ff-le 2:4'.
 *
 * Both parts are parsed as 1-based field indexes and must differ.
 *
 * Throws: Exception, with end-user text, if the ':' or second field is missing, if
 * either part is not an unsigned integer, if either index is zero, or if both indexes
 * are the same.
 */
void fieldVsFieldOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsFieldPredicate fn, string option, string optionVal)
{
    auto valSplit = findSplit(optionVal, ":");
    if (valSplit[1].length == 0 || valSplit[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.",
                   option, optionVal, option));
    }
    size_t field1;
    size_t field2;
    try
    {
        field1 = valSplit[0].to!size_t;
        field2 = valSplit[2].to!size_t;
    }
    catch (Exception)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.",
                   option, optionVal, option));
    }

    if (field1 == 0 || field2 == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }

    if (field1 == field2)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));
    }

    size_t zeroBasedIndex1 = field1 - 1;
    size_t zeroBasedIndex2 = field2 - 1;
    tests ~= makeFieldVsFieldDelegate(fn, zeroBasedIndex1, zeroBasedIndex2);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex1, zeroBasedIndex2);
}


/* Handles '<field1>:<field2>:<num>' options, e.g. '--ff-absdiff-le 1:3:0.25'.
 *
 * optionVal is split at the first ':' and the remainder at the next ':'. All structural
 * and conversion failures are reported with the same message.
 *
 * Throws: Exception, with end-user text, if any of the three parts is missing or fails
 * conversion, if either field index is zero, or if both indexes are the same.
 */
void fieldFieldNumOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldFieldNumPredicate fn, string option, string optionVal)
{
    size_t field1;
    size_t field2;
    double value;
    auto valSplit = findSplit(optionVal, ":");
    auto invalidOption = (valSplit[1].length == 0 || valSplit[2].length == 0);

    if (!invalidOption)
    {
        auto valSplit2 = findSplit(valSplit[2], ":");
        invalidOption = (valSplit2[1].length == 0 || valSplit2[2].length == 0);

        if (!invalidOption)
        {
            try
            {
                field1 = valSplit[0].to!size_t;
                field2 = valSplit2[0].to!size_t;
                value = valSplit2[2].to!double;
            }
            catch (Exception)
            {
                invalidOption = true;
            }
        }
    }

    if (invalidOption)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>:<num>' where fields are 1-upped integers.",
                   option, optionVal, option));
    }
    if (field1 == 0 || field2 == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }
    if (field1 == field2)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));
    }

    size_t zeroBasedIndex1 = field1 - 1;
    size_t zeroBasedIndex2 = field2 - 1;
    tests ~= makeFieldFieldNumDelegate(fn, zeroBasedIndex1, zeroBasedIndex2, value);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex1, zeroBasedIndex2);
}

/* Command line options - This struct holds the results of command line option processing.
591 * It also has a method, processArgs, that invokes command line arg processing. 592 */ 593 struct TsvFilterOptions 594 { 595 string programName; 596 FieldsPredicate[] tests; // Derived from tests 597 size_t maxFieldIndex; // Derived from tests 598 bool hasHeader = false; // --H|header 599 bool invert = false; // --invert 600 bool disjunct = false; // --or 601 char delim = '\t'; // --delimiter 602 bool helpVerbose = false; // --help-verbose 603 bool helpOptions = false; // --help-options 604 bool versionWanted = false; // --V|version 605 606 /* Returns a tuple. First value is true if command line arguments were successfully 607 * processed and execution should continue, or false if an error occurred or the user 608 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 609 * 610 * Returning true (execution continues) means args have been validated and the 611 * tests array has been established. 612 */ 613 auto processArgs (ref string[] cmdArgs) 614 { 615 import std.getopt; 616 import std.path : baseName, stripExtension; 617 import getopt_inorder; 618 619 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 620 621 /* Command option handlers - One handler for each option. These conform to the 622 * getopt required handler signature, and separate knowledge the specific command 623 * option text from the option processing. 
624 */ 625 void handlerFldEmpty(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldEmpty, option, value); } 626 void handlerFldNotEmpty(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotEmpty, option, value); } 627 void handlerFldBlank(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldBlank, option, value); } 628 void handlerFldNotBlank(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotBlank, option, value); } 629 630 void handlerFldIsNumeric(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNumeric, option, value); } 631 void handlerFldIsFinite(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsFinite, option, value); } 632 void handlerFldIsNaN(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNaN, option, value); } 633 void handlerFldIsInfinity(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsInfinity, option, value); } 634 635 void handlerNumLE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLE, option, value); } 636 void handlerNumLT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLT, option, value); } 637 void handlerNumGE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGE, option, value); } 638 void handlerNumGT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGT, option, value); } 639 void handlerNumEQ(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numEQ, option, value); } 640 void handlerNumNE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numNE, option, value); } 641 642 void handlerStrLE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLE, option, value); } 
643 void handlerStrLT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLT, option, value); } 644 void handlerStrGE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGE, option, value); } 645 void handlerStrGT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGT, option, value); } 646 void handlerStrEQ(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strEQ, option, value); } 647 void handlerStrNE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNE, option, value); } 648 649 void handlerStrInFld(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strInFld, option, value); } 650 void handlerStrNotInFld(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNotInFld, option, value); } 651 652 void handlerIStrEQ(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrEQ, option, value); } 653 void handlerIStrNE(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNE, option, value); } 654 void handlerIStrInFld(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrInFld, option, value); } 655 void handlerIStrNotInFld(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNotInFld, option, value); } 656 657 void handlerRegexMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, ®exMatch, option, value, true); } 658 void handlerRegexNotMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, ®exNotMatch, option, value, true); } 659 void handlerIRegexMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, ®exMatch, option, value, false); } 660 void handlerIRegexNotMatch(string option, string value) { fieldVsRegexOptionHandler(tests, 
maxFieldIndex, ®exNotMatch, option, value, false); } 661 662 void handlerFFLE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLE, option, value); } 663 void handlerFFLT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLT, option, value); } 664 void handlerFFGE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGE, option, value); } 665 void handlerFFGT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGT, option, value); } 666 void handlerFFEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffEQ, option, value); } 667 void handlerFFNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffNE, option, value); } 668 669 void handlerFFStrEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrEQ, option, value); } 670 void handlerFFStrNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrNE, option, value); } 671 void handlerFFIStrEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrEQ, option, value); } 672 void handlerFFIStrNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrNE, option, value); } 673 674 void handlerFFAbsDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffLE, option, value); } 675 void handlerFFAbsDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffGT, option, value); } 676 void handlerFFRelDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffLE, option, value); } 677 void handlerFFRelDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffGT, option, value); } 678 679 try 680 { 681 arraySep = ","; // Use comma to separate values in command line options 
auto r = getoptInorder(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                "help-options", " Print the options list by itself.", &helpOptions,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "or", " Evaluate tests as an OR rather than an AND.", &disjunct,
                std.getopt.config.caseSensitive,
                "v|invert", " Invert the filter, printing lines that do not match.", &invert,
                std.getopt.config.caseInsensitive,
                "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,

                "empty", "FIELD True if field is empty.", &handlerFldEmpty,
                "not-empty", "FIELD True if field is not empty.", &handlerFldNotEmpty,
                "blank", "FIELD True if field is empty or all whitespace.", &handlerFldBlank,
                "not-blank", "FIELD True if field contains a non-whitespace character.", &handlerFldNotBlank,

                "is-numeric", "FIELD True if field is interpretable as a number.", &handlerFldIsNumeric,
                "is-finite", "FIELD True if field is interpretable as a number and is not NaN or infinity.", &handlerFldIsFinite,
                "is-nan", "FIELD True if field is NaN.", &handlerFldIsNaN,
                "is-infinity", "FIELD True if field is infinity.", &handlerFldIsInfinity,

                "le", "FIELD:NUM FIELD <= NUM (numeric).", &handlerNumLE,
                "lt", "FIELD:NUM FIELD < NUM (numeric).", &handlerNumLT,
                "ge", "FIELD:NUM FIELD >= NUM (numeric).", &handlerNumGE,
                "gt", "FIELD:NUM FIELD > NUM (numeric).", &handlerNumGT,
                "eq", "FIELD:NUM FIELD == NUM (numeric).", &handlerNumEQ,
                "ne", "FIELD:NUM FIELD != NUM (numeric).", &handlerNumNE,

                "str-le", "FIELD:STR FIELD <= STR (string).", &handlerStrLE,
                "str-lt", "FIELD:STR FIELD < STR (string).", &handlerStrLT,
                "str-ge", "FIELD:STR FIELD >= STR (string).", &handlerStrGE,
                "str-gt", "FIELD:STR FIELD > STR (string).", &handlerStrGT,
                "str-eq", "FIELD:STR FIELD == STR (string).", &handlerStrEQ,
                "istr-eq", "FIELD:STR FIELD == STR (string, case-insensitive).", &handlerIStrEQ,
                "str-ne", "FIELD:STR FIELD != STR (string).", &handlerStrNE,
                "istr-ne", "FIELD:STR FIELD != STR (string, case-insensitive).", &handlerIStrNE,
                "str-in-fld", "FIELD:STR FIELD contains STR (substring search).", &handlerStrInFld,
                "istr-in-fld", "FIELD:STR FIELD contains STR (substring search, case-insensitive).", &handlerIStrInFld,
                "str-not-in-fld", "FIELD:STR FIELD does not contain STR (substring search).", &handlerStrNotInFld,
                "istr-not-in-fld", "FIELD:STR FIELD does not contain STR (substring search, case-insensitive).", &handlerIStrNotInFld,

                "regex", "FIELD:REGEX FIELD matches regular expression.", &handlerRegexMatch,
                "iregex", "FIELD:REGEX FIELD matches regular expression, case-insensitive.", &handlerIRegexMatch,
                "not-regex", "FIELD:REGEX FIELD does not match regular expression.", &handlerRegexNotMatch,
                "not-iregex", "FIELD:REGEX FIELD does not match regular expression, case-insensitive.", &handlerIRegexNotMatch,

                "ff-le", "FIELD1:FIELD2 FIELD1 <= FIELD2 (numeric).", &handlerFFLE,
                "ff-lt", "FIELD1:FIELD2 FIELD1 < FIELD2 (numeric).", &handlerFFLT,
                "ff-ge", "FIELD1:FIELD2 FIELD1 >= FIELD2 (numeric).", &handlerFFGE,
                "ff-gt", "FIELD1:FIELD2 FIELD1 > FIELD2 (numeric).", &handlerFFGT,
                "ff-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (numeric).", &handlerFFEQ,
                "ff-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (numeric).", &handlerFFNE,
                "ff-str-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string).", &handlerFFStrEQ,
                "ff-istr-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string, case-insensitive).", &handlerFFIStrEQ,
                "ff-str-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string).", &handlerFFStrNE,
                "ff-istr-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string, case-insensitive).", &handlerFFIStrNE,

                "ff-absdiff-le", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) <= NUM", &handlerFFAbsDiffLE,
                "ff-absdiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) > NUM", &handlerFFAbsDiffGT,
                "ff-reldiff-le", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) <= NUM", &handlerFFRelDiffLE,
                "ff-reldiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) > NUM", &handlerFFRelDiffGT,
                );

            /* Help and version requests end argument processing with a zero exit status.
             * They are checked in priority order: regular help, verbose help, options
             * help, version. Regular help prints only the summary text, without the
             * options list, because both help texts are long and the summary already
             * covers the options.
             */
            if (r.helpWanted)
            {
                stdout.write(helpText);
                return tuple(false, 0);
            }

            if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }

            if (helpOptions)
            {
                defaultGetoptPrinter(helpTextOptions, r.options);
                return tuple(false, 0);
            }

            if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-filter"));
                return tuple(false, 0);
            }
        }
        catch (Exception exc)
        {
            /* Argument errors are reported on stderr with a non-zero exit status. */
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }

        /* Arguments were processed; the caller should go on to filter the input. */
        return tuple(true, 0);
    }
}

/** Reads the input files line by line, runs the configured tests against the fields
 * of each line, and writes the selected lines to standard output.
 *
 * Params:
 *   cmdopt     = Processed command line options. Supplies the tests, the delimiter,
 *                the highest field index needed, and the header/or/invert settings.
 *   inputFiles = Files to read. A name of "-" reads standard input, which is also
 *                read when the list is empty.
 *
 * Throws: Exception if a line has fewer fields than the tests require, or if running
 *   the tests or writing the output for a line throws.
 */
void tsvFilter(in TsvFilterOptions cmdopt, in string[] inputFiles)
{
    import std.algorithm : all, any, splitter;
    import std.range;
    import tsvutil : BufferedOutputRange, throwIfWindowsNewlineOnUnix;

    /* Output goes through a BufferedOutputRange. To keep output responsive when matches
     * are rare, a matched line forces a flush once more than 'flushThreshold' data lines
     * have been read since the last flush. The counter starts above the threshold so
     * that the first matched line is flushed.
     */
    enum flushThreshold = 1024;
    size_t linesSinceFlush = flushThreshold + 1;

    auto output = BufferedOutputRange!(typeof(stdout))(stdout);

    /* One slot per field index up to the highest one any test refers to. */
    auto fields = new char[][](cmdopt.maxFieldIndex + 1);
    bool headerDone = false;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        immutable bool isStdin = (filename == "-");
        auto input = isStdin ? stdin : filename.File();
        const sourceLabel = isStdin ? "Standard Input" : filename;

        foreach (lineNum, line; input.byLine.enumerate(1))
        {
            if (lineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, lineNum);

                if (cmdopt.hasHeader)
                {
                    /* Only the first file's header is written; later ones are dropped. */
                    if (!headerDone)
                    {
                        output.appendln(line);
                        headerDone = true;
                    }
                    continue;
                }
            }

            /* Collect as many leading fields as the tests need. */
            size_t fieldCount = 0;
            foreach (fieldValue; line.splitter(cmdopt.delim))
            {
                if (fieldCount == fields.length) break;
                fields[fieldCount] = fieldValue;
                fieldCount++;
            }

            if (fieldCount == 0)
            {
                /* Work-around for splitter yielding nothing on an empty line.
                 * Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                 * Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                 * The empty line itself stands in as the single (empty) field.
                 */
                assert(line.length == 0);
                fields[0] = line;
                fieldCount = 1;
            }

            if (fieldCount < fields.length)
            {
                throw new Exception(
                    format("Not enough fields in line. File: %s, Line: %s",
                           sourceLabel, lineNum));
            }

            /* Evaluate the tests. A test throws if a field cannot be converted to the
             * type it expects; the error is re-thrown with file and line details.
             */
            try
            {
                linesSinceFlush++;

                bool selected;
                if (cmdopt.disjunct) selected = cmdopt.tests.any!(test => test(fields));
                else selected = cmdopt.tests.all!(test => test(fields));

                /* With --invert, write the lines that did not pass. */
                if (selected != cmdopt.invert)
                {
                    if (output.appendln(line))
                    {
                        linesSinceFlush = 0;
                    }
                    else if (linesSinceFlush > flushThreshold)
                    {
                        output.flush;
                        linesSinceFlush = 0;
                    }
                }
            }
            catch (Exception exc)
            {
                throw new Exception(
                    format("Could not process line or field: %s\n File: %s Line: %s%s",
                           exc.msg, sourceLabel, lineNum,
                           (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : ""));
            }
        }
    }
}