/**
Command line tool that filters TSV files.

This tool filters tab-delimited files based on numeric or string comparisons
against specific fields. See the helpText string for details.

Copyright (c) 2015-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_filter;

import std.algorithm : canFind, equal, findSplit, max, min;
import std.conv : to;
import std.exception : enforce;
import std.format : format;
import std.math : abs, isFinite, isInfinity, isNaN;
import std.range;
import std.regex;
import std.stdio;
import std.string : isNumeric;
import std.typecons;
import std.uni: asLowerCase, toLower, byGrapheme;

/* The program has two main parts, command line arg processing and processing the input
 * files. Much of the work is in command line arg processing. This sets up the tests run
 * against each input line. The tests are an array of delegates (closures) run against the
 * fields in the line. The tests are based on command line arguments, of which there is
 * a lengthy set, one for each test.
 */

/* Default D runtime options embedded in the executable (GC option "cleanup:none").
 * Only declared for compiler front-end versions 2.085 and later.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

/** Main program. Invokes command line arg processing and tsv-filter to perform
 * the real work. Any errors are caught and reported.
 *
 * Params:
 *   cmdArgs = Command line arguments, program name first.
 *
 * Returns: The exit code supplied by processArgs when it indicates execution should
 *   not continue; 1 if tsvFilter throws an Exception (the message is written to
 *   stderr); 0 otherwise.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvFilterOptions cmdopt;

    /* r[0] is the "continue" flag, r[1] the exit code to use when not continuing. */
    const r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];

    /* NOTE(review): presumably resets LDC profile counters so that only the filtering
     * step is profiled - confirm against the ldc.profile documentation.
     */
    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    try tsvFilter(cmdopt);
    catch (Exception e)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, e.msg);
        return 1;
    }
    return 0;
}

immutable helpText = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter tab-delimited files for matching lines via comparison tests against
individual fields. Use '--help-verbose' for a more detailed description.

Fields are specified using field number or field name. Field names require
that the input file has a header line. Use '--help-fields' for details.

Global options:
  --help-verbose      Print full help.
  --help-options      Print the options list by itself.
  --help-fields       Print help on specifying fields.
  --V|version         Print version information and exit.
  --H|header          Treat the first line of each file as a header.
  --or                Evaluate tests as an OR rather than an AND clause.
  --v|invert          Invert the filter, printing lines that do not match.
  --d|delimiter CHR   Field delimiter. Default: TAB.

Operators:
* Test if a field is empty (no characters) or blank (empty or whitespace only).
  Syntax:  --empty|not-empty|blank|not-blank FIELD
  Example: --empty name               # True if the 'name' field is empty

* Test if a field is numeric, finite, NaN, or infinity
  Syntax:  --is-numeric|is-finite|is-nan|is-infinity FIELD
  Example: --is-numeric 5 --gt 5:100  # Ensure field 5 is numeric before --gt test.

* Compare a field to a number (integer or float)
  Syntax:  --eq|ne|lt|le|gt|ge FIELD:NUM
  Example: --lt size:1000 --gt weight:0.5  # ('size' < 1000) and ('weight' > 0.5)

* Compare a field to a string
  Syntax:  --str-eq|str-ne|istr-eq|istr-ne FIELD:STR
  Example: --str-eq color:red         # True if 'color' field is "red"

* Test if a field contains a string (substring search)
  Syntax:  --str-in-fld|str-not-in-fld|istr-in-fld|istr-not-in-fld FIELD:STR
  Example: --str-in-fld color:dark    # True if 'color' field contains "dark"

* Test if a field matches a regular expression.
  Syntax:  --regex|iregex|not-regex|not-iregex FIELD:REGEX
  Example: --regex '3:ab*c'     # True if field 3 contains "ac", "abc", "abbc", etc.

* Test a field's character or byte length
  Syntax:  --char-len-[le|lt|ge|gt|eq|ne] FIELD:NUM
           --byte-len-[le|lt|ge|gt|eq|ne] FIELD:NUM
  Example: --char-len-lt 2:10   # True if field 2 is less than 10 characters long.
           --byte-len-gt 2:10   # True if field 2 is greater than 10 bytes long.

* Field to field comparisons - Similar to field vs literal comparisons, but field vs field.
  Syntax:  --ff-eq|ff-ne|ff-lt|ff-le|ff-gt|ff-ge FIELD1:FIELD2
           --ff-str-eq|ff-str-ne|ff-istr-eq|ff-istr-ne FIELD1:FIELD2
  Example: --ff-eq 2:4          # True if fields 2 and 4 are numerically equivalent
           --ff-str-eq 2:4      # True if fields 2 and 4 are the same strings

* Field to field difference comparisons - Absolute and relative difference
  Syntax:  --ff-absdiff-le|ff-absdiff-gt FIELD1:FIELD2:NUM
           --ff-reldiff-le|ff-reldiff-gt FIELD1:FIELD2:NUM
  Example: --ff-absdiff-le 1:3:0.25   # True if abs(field1 - field2) <= 0.25

EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter lines of tab-delimited files via comparison tests against fields.
Multiple tests can be specified, by default they are evaluated as an AND
clause. Lines satisfying the tests are written to standard output.

Typical test syntax is '--op field:value', where 'op' is an operator,
'field' is either a field name or field number, and 'value' is the
comparison basis. For example, '--lt length:500' tests if the 'length'
field is less than 500. A more complete example:

  tsv-filter --header --gt length:50 --lt length:100 --le width:200 data.tsv

This outputs all lines from file data.tsv where the 'length' field is
greater than 50 and less than 100, and the 'width' field is less than or
equal to 200. The header line is also output.

Field numbers can also be used to identify fields, and must be used when
the input file doesn't have a header line. For example:

  tsv-filter --gt 1:50 --lt 1:100 --le 2:200 data.tsv

Field lists can be used to specify multiple fields at once. For example:

  tsv-filter --not-blank 1-10 --str-ne 1,2,5:'--' data.tsv

tests that fields 1-10 are not blank and fields 1,2,5 are not "--".

Wildcarded field names can also be used to specify multiple fields. The
following finds lines where any field name ending in '*_id' is empty:

  tsv-filter -H --or --empty '*_id'

Use '--help-fields' for details on using field names.

Tests available include:
  * Test if a field is empty (no characters) or blank (empty or whitespace only).
  * Test if a field is interpretable as a number, a finite number, NaN, or Infinity.
  * Compare a field to a number - Numeric equality and relational tests.
  * Compare a field to a string - String equality and relational tests.
  * Test if a field matches a regular expression. Case sensitive or insensitive.
  * Test if a field contains a string. Sub-string search, case sensitive or insensitive.
  * Test a field's character or byte length.
  * Field to field comparisons - Similar to the other tests, except comparing
    one field to another in the same line.

Details:
  * The run is aborted if there are not enough fields in an input line.
  * Numeric tests will fail and abort the run if a field cannot be interpreted as a
    number. This includes fields with no text. To avoid this use '--is-numeric' or
    '--is-finite' prior to the numeric test. For example, '--is-numeric 5 --gt 5:100'
    ensures field 5 is numeric before running the --gt test.
  * Regular expression syntax is defined by the D programming language. They follow
    common conventions (perl, python, etc.). Most common forms work as expected.

Options:
EOS";

immutable helpTextOptions = q"EOS
Synopsis: tsv-filter [options] [file...]

Options:
EOS";

/* The next blocks of code define the structure of the boolean tests run against input lines.
 * This includes function and delegate (closure) signatures, creation mechanisms, option
 * handlers, etc. Command line arg processing builds the test structure.
 */

/* FieldsPredicate delegate signature - Each input line is run against a set of boolean
 * tests. Each test is a 'FieldsPredicate'. A FieldsPredicate is a delegate (closure)
 * containing all info about the test except the field values of the line being tested.
 * These delegates are created as part of command line arg processing. The wrapped data
 * includes operation, field indexes, literal values, etc. At run-time the delegate is
 * passed one argument, the split input line.
 */
alias FieldsPredicate = bool delegate(const char[][] fields);

/* FieldsPredicate function signatures - These aliases represent the different function
 * signatures used in FieldsPredicate delegates. Each alias has a corresponding 'make'
 * function.
* The 'make' function takes a real predicate function and closure args and
 * returns a FieldsPredicate delegate. Predicate types are:
 *
 * - FieldUnaryPredicate - Test based on a single field. (e.g. --empty 4)
 * - FieldVsNumberPredicate - Test based on a field index (used to get the field value)
 *   and a fixed numeric value. For example, field 2 less than 100 (--lt 2:100).
 * - FieldVsStringPredicate - Test based on a field and a string. (e.g. --str-eq 2:abc)
 * - FieldVsIStringPredicate - Case-insensitive test based on a field and a string.
 *   (e.g. --istr-eq 2:abc)
 * - FieldVsRegexPredicate - Test based on a field and a regex. (e.g. --regex '2:ab*c')
 * - FieldVsFieldPredicate - Test based on two fields. (e.g. --ff-le 2:4).
 * - FieldFieldNumPredicate - Test based on two fields and a number.
 *   (e.g. --ff-absdiff-le 2:4:0.5)
 *
 * An actual FieldsPredicate takes the fields from the line and the closure args and
 * runs the test. For example, a function testing if a field is less than a specific
 * value would pull the specified field from the fields array, convert the string to
 * a number, then run the less-than test.
*/
alias FieldUnaryPredicate     = bool function(const char[][] fields, size_t index);
alias FieldVsNumberPredicate  = bool function(const char[][] fields, size_t index, double value);
alias FieldVsStringPredicate  = bool function(const char[][] fields, size_t index, string value);
alias FieldVsIStringPredicate = bool function(const char[][] fields, size_t index, dstring value);
alias FieldVsRegexPredicate   = bool function(const char[][] fields, size_t index, Regex!char value);
alias FieldVsFieldPredicate   = bool function(const char[][] fields, size_t index1, size_t index2);
alias FieldFieldNumPredicate  = bool function(const char[][] fields, size_t index1, size_t index2, double value);

/* The 'make' functions below each bind a predicate function and its fixed arguments
 * into a FieldsPredicate closure. The resulting delegate only needs the split input
 * line to run the test.
 */

/* Binds a single-field predicate to a field index. */
FieldsPredicate makeFieldUnaryDelegate(FieldUnaryPredicate fn, size_t index)
{
    return delegate bool(const char[][] fields) { return fn(fields, index); };
}

/* Binds a field-vs-number predicate to a field index and a numeric value. */
FieldsPredicate makeFieldVsNumberDelegate(FieldVsNumberPredicate fn, size_t index, double value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/* Binds a field-vs-string predicate to a field index and a string value. */
FieldsPredicate makeFieldVsStringDelegate(FieldVsStringPredicate fn, size_t index, string value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/* Binds a case-insensitive field-vs-string predicate to a field index and a dstring value. */
FieldsPredicate makeFieldVsIStringDelegate(FieldVsIStringPredicate fn, size_t index, dstring value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/* Binds a field-vs-regex predicate to a field index and a compiled regex. */
FieldsPredicate makeFieldVsRegexDelegate(FieldVsRegexPredicate fn, size_t index, Regex!char value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/* Binds a field-vs-field predicate to two field indices. */
FieldsPredicate makeFieldVsFieldDelegate(FieldVsFieldPredicate fn, size_t index1, size_t index2)
{
    return delegate bool(const char[][] fields) { return fn(fields, index1, index2); };
}

/* Binds a two-field-plus-number predicate to two field indices and a numeric value. */
FieldsPredicate makeFieldFieldNumDelegate(FieldFieldNumPredicate fn, size_t index1, size_t index2, double value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index1, index2, value); };
}

/* Predicate functions - These are the actual functions used in a FieldsPredicate.
* They are a direct reflection of the operators available via command line args. Each
 * matches one of the FieldsPredicate function aliases defined above.
 */

/* Empty / blank tests. 'Blank' means empty or whitespace only. */
bool fldEmpty(const char[][] fields, size_t index)    { return fields[index].empty; }
bool fldNotEmpty(const char[][] fields, size_t index) { return !fields[index].empty; }
bool fldBlank(const char[][] fields, size_t index)    { return !matchFirst(fields[index], ctRegex!`^\s*$`).empty; }
bool fldNotBlank(const char[][] fields, size_t index) { return matchFirst(fields[index], ctRegex!`^\s*$`).empty; }

/* Numeric classification tests. The conversion to double is only attempted once
 * isNumeric has accepted the field text.
 */
bool fldIsNumeric(const char[][] fields, size_t index)  { return isNumeric(fields[index]); }
bool fldIsFinite(const char[][] fields, size_t index)   { return isNumeric(fields[index]) && isFinite(to!double(fields[index])); }
bool fldIsNaN(const char[][] fields, size_t index)      { return isNumeric(fields[index]) && isNaN(to!double(fields[index])); }
bool fldIsInfinity(const char[][] fields, size_t index) { return isNumeric(fields[index]) && isInfinity(to!double(fields[index])); }

/* Field vs number comparisons. The field is converted with to!double, which throws
 * if the text cannot be converted.
 */
bool numLE(const char[][] fields, size_t index, double val) { return to!double(fields[index]) <= val; }
bool numLT(const char[][] fields, size_t index, double val) { return to!double(fields[index]) <  val; }
bool numGE(const char[][] fields, size_t index, double val) { return to!double(fields[index]) >= val; }
bool numGT(const char[][] fields, size_t index, double val) { return to!double(fields[index]) >  val; }
bool numEQ(const char[][] fields, size_t index, double val) { return to!double(fields[index]) == val; }
bool numNE(const char[][] fields, size_t index, double val) { return to!double(fields[index]) != val; }

/* Field vs string comparisons (case-sensitive). */
bool strLE(const char[][] fields, size_t index, string val)
{
    const(char)[] fieldText = fields[index];
    return fieldText <= val;
}
bool strLT(const char[][] fields, size_t index, string val)
{
    const(char)[] fieldText = fields[index];
    return fieldText < val;
}
bool strGE(const char[][] fields, size_t index, string val)
{
    const(char)[] fieldText = fields[index];
    return fieldText >= val;
}
bool strGT(const char[][] fields, size_t index, string val)
{
    const(char)[] fieldText = fields[index];
    return fieldText > val;
}
bool strEQ(const char[][] fields, size_t index, string val)
{
    const(char)[] fieldText = fields[index];
    return fieldText == val;
}
bool strNE(const char[][] fields, size_t index, string val)
{
    const(char)[] fieldText = fields[index];
    return fieldText != val;
}
bool strInFld(const char[][] fields, size_t index, string val)    { return canFind(fields[index], val); }
bool strNotInFld(const char[][] fields, size_t index, string val) { return !canFind(fields[index], val); }

/* Note: For istr predicates, the command line value has been lower-cased by
 * fieldVsIStringOptionHandler. Only the field text is lower-cased here.
 */
bool istrEQ(const char[][] fields, size_t index, dstring val)       { return equal(asLowerCase(fields[index]), val); }
bool istrNE(const char[][] fields, size_t index, dstring val)       { return !equal(asLowerCase(fields[index]), val); }
bool istrInFld(const char[][] fields, size_t index, dstring val)    { return canFind(asLowerCase(fields[index]), val); }
bool istrNotInFld(const char[][] fields, size_t index, dstring val) { return !canFind(asLowerCase(fields[index]), val); }

/* Note: Case-sensitivity is built into the regex value, so these regex predicates are
 * used for both case-sensitive and case-insensitive regex operators.
 */
bool regexMatch(const char[][] fields, size_t index, Regex!char val)    { return !matchFirst(fields[index], val).empty; }
bool regexNotMatch(const char[][] fields, size_t index, Regex!char val) { return matchFirst(fields[index], val).empty; }

/* Character length tests. Length is the number of graphemes in the field. */
bool charLenLE(const char[][] fields, size_t index, double val) { return walkLength(byGrapheme(fields[index])) <= val; }
bool charLenLT(const char[][] fields, size_t index, double val) { return walkLength(byGrapheme(fields[index])) <  val; }
bool charLenGE(const char[][] fields, size_t index, double val) { return walkLength(byGrapheme(fields[index])) >= val; }
bool charLenGT(const char[][] fields, size_t index, double val) { return walkLength(byGrapheme(fields[index])) >  val; }
bool charLenEQ(const char[][] fields, size_t index, double val) { return walkLength(byGrapheme(fields[index])) == val; }
bool charLenNE(const char[][] fields, size_t index, double val) { return walkLength(byGrapheme(fields[index])) != val; }

/* Byte length tests. Length is the length of the field's char array. */
bool byteLenLE(const char[][] fields, size_t index, double val)
{
    immutable size_t byteCount = fields[index].length;
    return byteCount <= val;
}
bool byteLenLT(const char[][] fields, size_t index, double val)
{
    immutable size_t byteCount = fields[index].length;
    return byteCount < val;
}
bool byteLenGE(const char[][] fields, size_t index, double val)
{
    immutable size_t byteCount = fields[index].length;
    return byteCount >= val;
}
bool byteLenGT(const char[][] fields, size_t index, double val)
{
    immutable size_t byteCount = fields[index].length;
    return byteCount > val;
}
bool byteLenEQ(const char[][] fields, size_t index, double val)
{
    immutable size_t byteCount = fields[index].length;
    return byteCount == val;
}
bool byteLenNE(const char[][] fields, size_t index, double val)
{
    immutable size_t byteCount = fields[index].length;
    return byteCount != val;
}

/* Field vs field numeric comparisons. Field 1 is converted before field 2. */
bool ffLE(const char[][] fields, size_t index1, size_t index2) { return to!double(fields[index1]) <= to!double(fields[index2]); }
bool ffLT(const char[][] fields, size_t index1, size_t index2) { return to!double(fields[index1]) <  to!double(fields[index2]); }
bool ffGE(const char[][] fields, size_t index1, size_t index2) { return to!double(fields[index1]) >= to!double(fields[index2]); }
bool ffGT(const char[][] fields, size_t index1, size_t index2) { return to!double(fields[index1]) >  to!double(fields[index2]); }
bool ffEQ(const char[][] fields, size_t index1, size_t index2) { return to!double(fields[index1]) == to!double(fields[index2]); }
bool ffNE(const char[][] fields, size_t index1, size_t index2) { return to!double(fields[index1]) != to!double(fields[index2]); }

/* Field vs field string comparisons, case-sensitive and case-insensitive. */
bool ffStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    const(char)[] first = fields[index1];
    const(char)[] second = fields[index2];
    return first == second;
}
bool ffStrNE(const char[][] fields, size_t index1, size_t index2)
{
    const(char)[] first = fields[index1];
    const(char)[] second = fields[index2];
    return first != second;
}
bool ffIStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    return fields[index1].asLowerCase.equal(fields[index2].asLowerCase);
}
bool ffIStrNE(const char[][] fields, size_t index1, size_t index2)
{
    return !fields[index1].asLowerCase.equal(fields[index2].asLowerCase);
}

/* Absolute difference: |v1 - v2|. */
auto AbsDiff(double v1, double v2) { return abs(v1 - v2); }

/* Relative difference: |v1 - v2| / min(|v1|, |v2|). */
auto RelDiff(double v1, double v2) { return abs(v1 - v2) / min(abs(v1), abs(v2)); }

/* Field vs field difference tests against a numeric threshold. */
bool ffAbsDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable first = to!double(fields[index1]);
    immutable second = to!double(fields[index2]);
    return AbsDiff(first, second) <= value;
}
bool ffAbsDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable first = to!double(fields[index1]);
    immutable second = to!double(fields[index2]);
    return AbsDiff(first, second) > value;
}
bool ffRelDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable first = to!double(fields[index1]);
    immutable second = to!double(fields[index2]);
    return RelDiff(first, second) <= value;
}
bool ffRelDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable first = to!double(fields[index1]);
    immutable second = to!double(fields[index2]);
    return RelDiff(first, second) > value;
}

/* Command line option handlers - There is a command line option handler for each
 * predicate type.
* That is, one each for FieldUnaryPredicate, FieldVsNumberPredicate,
 * etc. Option handlers are passed the tests array, the predicate function, and the
 * command line option arguments. A FieldsPredicate delegate is created and appended to
 * the tests array. An exception is thrown if errors are detected while processing the
 * option; the error text is intended for the end user.
 *
 * All the option handlers have similar functionality, differing in option processing and
 * error message generation. fieldVsNumberOptionHandler is described as an example. It
 * handles command options such as '--lt 3:1000', which tests field 3 for a value less
 * than 1000. It is passed the tests array, the 'numLT' predicate function used for the
 * test, and the string "3:1000" representing the option value. It is also passed the
 * header line from the first input file and an indication of whether header processing
 * is enabled (--H|header). parseFieldList (fieldlist module) is used to parse the
 * field-list component of the option ("3" in the example). The comparison value ("1000")
 * is converted to a double. These are wrapped in a FieldsPredicate delegate which is
 * added to the tests array. An error is signaled if the option string is invalid.
 *
 * During processing, field indexes are converted from one-based to zero-based. As an
 * optimization, the maximum field index is also tracked. This allows early termination of
 * line splitting.
 *
 * The header line from the input file is not available when std.getopt processes the
 * command line option. The processing described above must be deferred. This is done
 * using a 'CmdOptionHandler' delegate. There is a 'make' function for every command line
 * option handler that creates these. These are created during std.getopt processing.
 * They are run when the header line becomes available.
*
 * The final setup for the '--lt' (numeric less-than) operator is as follows:
 * - Function 'handlerNumLT' (in TsvFilterOptions.processArgs) is associated with the
 *   command line option "--lt <val>". When called by std.getopt it creates an option
 *   handler delegate via 'makeFieldVsNumberOptionHandler'. This is appended to an
 *   array of delegates.
 * - 'fieldVsNumberOptionHandler' is invoked via the delegate after the header line
 *   becomes available (in TsvFilterOptions.processArgs). If args are valid,
 *   'makeFieldVsNumberDelegate' is used to create a delegate invoking the 'numLT'
 *   predicate function. This delegate is added to the set of run-time tests.
 *
 * Note that in the above setup the 'numLT' predicate is specified in 'handlerNumLT'
 * and passed through all the steps. This is how the command line option gets
 * associated with the predicate function.
 */

/* CmdOptionHandler delegate signature - This is the call made to process the command
 * line option arguments after the header line has been read.
*/
alias CmdOptionHandler = void delegate(ref FieldsPredicate[] tests, ref size_t maxFieldIndex,
                                       bool hasHeader, string[] headerFields);

/* Creates the deferred handler for a unary option (e.g. '--empty <field-list>').
 * The returned delegate calls fieldUnaryOptionHandler with the captured predicate,
 * option name and option value.
 */
CmdOptionHandler makeFieldUnaryOptionHandler(FieldUnaryPredicate predicateFn, string option, string optionVal)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldUnaryOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
}

/* Processes a unary option value, which is a field-list. One test is appended to
 * 'tests' per field and 'maxFieldIndex' is raised as needed.
 *
 * Throws: Exception with a user-oriented message if the field-list is invalid.
 */
void fieldUnaryOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldUnaryPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        /* The field-list range is iterated inside the try block so that errors raised
         * while walking it get the option-specific message.
         */
        foreach (fieldIndex;
                 optionVal.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, headerFields))
        {
            tests ~= makeFieldUnaryDelegate(fn, fieldIndex);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        e.msg = format("Invalid option: [--%s %s]. %s\n Expected: '--%s <field>' or '--%s <field-list>'.",
                       option, optionVal, e.msg, option, option);
        throw e;
    }
}

/* Creates the deferred handler for a field-vs-number option (e.g. '--lt 3:1000'). */
CmdOptionHandler makeFieldVsNumberOptionHandler(FieldVsNumberPredicate predicateFn, string option, string optionVal)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldVsNumberOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
}

/* Processes a '<field-list>:<num>' option value. The field-list is parsed first; the
 * text after the separator character following it is converted to a double. One test
 * is appended per field.
 *
 * Throws: Exception with a user-oriented message if the field-list or number is invalid.
 */
void fieldVsNumberOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsNumberPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    auto formatErrorMsg(string option, string optionVal, string errorMessage="")
    {
        string optionalSpace = (errorMessage.length == 0) ? "" : " ";
        return format(
            "Invalid option: [--%s %s].%s%s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a number.",
            option, optionVal, optionalSpace, errorMessage, option, option);
    }

    try
    {
        auto optionValParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        /* Materialize the field list before reading 'consumed'. */
        auto fieldIndices = optionValParse.array;
        enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list.");

        /* Skip the single separator character that follows the field list. */
        double value = optionVal[optionValParse.consumed + 1 .. $].to!double;

        foreach (fieldIndex; fieldIndices)
        {
            tests ~= makeFieldVsNumberDelegate(fn, fieldIndex, value);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        e.msg = formatErrorMsg(option, optionVal, e.msg);
        throw e;
    }
}

/* Creates the deferred handler for a field-vs-string option (e.g. '--str-eq 2:abc'). */
CmdOptionHandler makeFieldVsStringOptionHandler(FieldVsStringPredicate predicateFn, string option, string optionVal)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldVsStringOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
}

/* Processes a '<field-list>:<str>' option value. One test is appended per field.
 *
 * Throws: Exception with a user-oriented message if the option value is invalid.
 */
void fieldVsStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsStringPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        auto optionValParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto fieldIndices = optionValParse.array;
        enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list.");
        string value = optionVal[optionValParse.consumed + 1 .. $].idup;

        foreach (fieldIndex; fieldIndices)
        {
            tests ~= makeFieldVsStringDelegate(fn, fieldIndex, value);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
}

/* Creates the deferred handler for a case-insensitive field-vs-string option
 * (e.g. '--istr-eq 2:abc').
 */
CmdOptionHandler makeFieldVsIStringOptionHandler(FieldVsIStringPredicate predicateFn, string option, string optionVal)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldVsIStringOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
}

/* The fieldVsIStringOptionHandler lower-cases the command line argument, assuming the
 * case-insensitive comparison will be done on lower-cased values.
 *
 * Throws: Exception with a user-oriented message if the option value is invalid.
 */
void fieldVsIStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsIStringPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        auto optionValParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto fieldIndices = optionValParse.array;
        enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list.");

        /* Convert and lower-case once; the same value is shared by every field's test. */
        immutable dstring loweredValue =
            optionVal[optionValParse.consumed + 1 .. $].to!dstring.toLower;

        foreach (fieldIndex; fieldIndices)
        {
            tests ~= makeFieldVsIStringDelegate(fn, fieldIndex, loweredValue);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
}

/* Creates the deferred handler for a field-vs-regex option (e.g. '--regex 2:ab*c'). */
CmdOptionHandler makeFieldVsRegexOptionHandler(FieldVsRegexPredicate predicateFn, string option, string optionVal, bool caseSensitive)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldVsRegexOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal, caseSensitive);
}

/* Processes a '<field-list>:<regex>' option value. The regex is compiled once, with
 * the "i" modifier when 'caseSensitive' is false, and shared by every field's test.
 *
 * Throws: RegexException (invalid regular expression) or Exception (other errors),
 *   both with user-oriented messages.
 */
void fieldVsRegexOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsRegexPredicate fn, string option, string optionVal, bool caseSensitive)
{
    import tsv_utils.common.fieldlist;

    try
    {
        auto optionValParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto fieldIndices = optionValParse.array;
        enforce(optionVal.length - optionValParse.consumed > 1, "No value after field list.");

        immutable modifiers = caseSensitive ? "" : "i";
        Regex!char value =
            optionVal[optionValParse.consumed + 1 .. $]
            .regex(modifiers);

        foreach (fieldIndex; fieldIndices)
        {
            tests ~= makeFieldVsRegexDelegate(fn, fieldIndex, value);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (RegexException e)
    {
        e.msg = format(
            "[--%s %s]. Invalid regular expression: %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a regular expression.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a regular expression.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
}


/* Creates the deferred handler for a field-vs-field option (e.g. '--ff-le 2:4'). */
CmdOptionHandler makeFieldVsFieldOptionHandler(FieldVsFieldPredicate predicateFn, string option, string optionVal)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldVsFieldOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
}

/* Processes a '<field1>:<field2>' option value. Each side must reference exactly one
 * field and the two fields must differ. A single test is appended.
 *
 * Throws: Exception with a user-oriented message if the option value is invalid.
 */
void fieldVsFieldOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldVsFieldPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        auto optionValParse =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto fieldIndices1 = optionValParse.array;

        enforce(fieldIndices1.length != 0, "First field argument is empty.");
        enforce(fieldIndices1.length == 1, "First field argument references multiple fields.");
        enforce(optionVal.length - optionValParse.consumed > 1, "Second field argument is empty.");

        /* The remainder after the separator must be entirely a field specification. */
        auto fieldIndices2 =
            optionVal[optionValParse.consumed + 1 .. $]
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, Yes.consumeEntireFieldListString)
            (hasHeader, headerFields)
            .array;

        enforce(fieldIndices2.length != 0, "Second field argument is empty.");
        enforce(fieldIndices2.length == 1, "Second field argument references multiple fields.");

        enforce(fieldIndices1[0] != fieldIndices2[0],
                format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));

        tests ~= makeFieldVsFieldDelegate(fn, fieldIndices1[0], fieldIndices2[0]);
        maxFieldIndex = max(maxFieldIndex, fieldIndices1[0], fieldIndices2[0]);
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field1>:<field2>' where <field1> and <field2> are individual fields.",
            option, optionVal, e.msg, option);
        throw e;
    }
}

/* Creates the deferred handler for a field-field-number option
 * (e.g. '--ff-absdiff-le 1:3:0.25').
 */
CmdOptionHandler makeFieldFieldNumOptionHandler(FieldFieldNumPredicate predicateFn, string option, string optionVal)
{
    return
        (ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields)
        => fieldFieldNumOptionHandler(tests, maxFieldIndex, hasHeader, headerFields, predicateFn, option, optionVal);
}

/* Processes a '<field1>:<field2>:<num>' option value. Each field argument must
 * reference exactly one field, the two fields must differ, and the trailing text is
 * converted to a double. A single test is appended.
 *
 * Throws: Exception with a user-oriented message if the option value is invalid.
 */
void fieldFieldNumOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, bool hasHeader, string[] headerFields,
    FieldFieldNumPredicate fn, string option, string optionVal)
{
    import tsv_utils.common.fieldlist;

    try
    {
        auto optionValParse1 =
            optionVal
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto fieldIndices1 = optionValParse1.array;

        enforce(fieldIndices1.length != 0, "First field argument is empty.");
        enforce(fieldIndices1.length == 1, "First field argument references multiple fields.");
        enforce(optionVal.length - optionValParse1.consumed > 1, "Second field argument is empty.");

        /* Everything after the first separator: "<field2>:<num>". */
        auto optionValSegment2 = optionVal[optionValParse1.consumed + 1 .. $];
        auto optionValParse2 =
            optionValSegment2
            .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
            (hasHeader, headerFields);

        auto fieldIndices2 = optionValParse2.array;

        enforce(fieldIndices2.length != 0, "Second field argument is empty.");
        enforce(fieldIndices2.length == 1, "Second field argument references multiple fields.");
        enforce(optionValSegment2.length - optionValParse2.consumed > 1, "Number argument is empty.");

        size_t field1 = fieldIndices1[0];
        size_t field2 = fieldIndices2[0];
        double value = optionValSegment2[optionValParse2.consumed + 1 .. $].to!double;

        enforce(field1 != field2,
                format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));

        tests ~= makeFieldFieldNumDelegate(fn, field1, field2, value);
        maxFieldIndex = max(maxFieldIndex, field1, field2);
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field1>:<field2>:<num>' where <field1> and <field2> are individual fields.",
            option, optionVal, e.msg, option);
        throw e;
    }
}

/** Command line options - This struct holds the results of command line option processing.
 * It also has a method, processArgs, that invokes command line arg processing.
 */
struct TsvFilterOptions
{
    import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader;

    string programName;
    InputSourceRange inputSources;   /// Input files
    FieldsPredicate[] tests;         /// Derived from tests
    size_t maxFieldIndex;            /// Derived from tests
    bool hasHeader = false;          /// --H|header
    bool invert = false;             /// --invert
    bool disjunct = false;           /// --or
    char delim = '\t';               /// --delimiter

    /* Returns a tuple.
First value is true if command line arguments were successfully 742 * processed and execution should continue, or false if an error occurred or the user 743 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 744 * 745 * Returning true (execution continues) means args have been validated and the 746 * tests array has been established. 747 */ 748 auto processArgs (ref string[] cmdArgs) 749 { 750 import std.algorithm : each; 751 import std.array : split; 752 import std.conv : to; 753 import std.getopt; 754 import std.path : baseName, stripExtension; 755 import tsv_utils.common.getopt_inorder; 756 import tsv_utils.common.utils : throwIfWindowsNewline; 757 758 bool helpVerbose = false; // --help-verbose 759 bool helpOptions = false; // --help-options 760 bool helpFields = false; // --help-fields 761 bool versionWanted = false; // --V|version 762 763 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 764 765 /* Command option handlers - One handler for each option. These conform to the 766 * getopt required handler signature, and separate knowledge the specific command 767 * option text from the option processing. 
768 */ 769 770 CmdOptionHandler[] cmdLineTestOptions; 771 772 void handlerFldEmpty(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldEmpty, option, value); } 773 void handlerFldNotEmpty(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldNotEmpty, option, value); } 774 void handlerFldBlank(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldBlank, option, value); } 775 void handlerFldNotBlank(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldNotBlank, option, value); } 776 777 void handlerFldIsNumeric(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldIsNumeric, option, value); } 778 void handlerFldIsFinite(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldIsFinite, option, value); } 779 void handlerFldIsNaN(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldIsNaN, option, value); } 780 void handlerFldIsInfinity(string option, string value) { cmdLineTestOptions ~= makeFieldUnaryOptionHandler(&fldIsInfinity, option, value); } 781 782 void handlerNumLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numLE, option, value); } 783 void handlerNumLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numLT, option, value); } 784 void handlerNumGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numGE, option, value); } 785 void handlerNumGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numGT, option, value); } 786 void handlerNumEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numEQ, option, value); } 787 void handlerNumNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&numNE, option, value); } 788 789 void 
handlerStrLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strLE, option, value); } 790 void handlerStrLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strLT, option, value); } 791 void handlerStrGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strGE, option, value); } 792 void handlerStrGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strGT, option, value); } 793 void handlerStrEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strEQ, option, value); } 794 void handlerStrNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strNE, option, value); } 795 796 void handlerStrInFld(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strInFld, option, value); } 797 void handlerStrNotInFld(string option, string value) { cmdLineTestOptions ~= makeFieldVsStringOptionHandler(&strNotInFld, option, value); } 798 799 void handlerIStrEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsIStringOptionHandler(&istrEQ, option, value); } 800 void handlerIStrNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsIStringOptionHandler(&istrNE, option, value); } 801 void handlerIStrInFld(string option, string value) { cmdLineTestOptions ~= makeFieldVsIStringOptionHandler(&istrInFld, option, value); } 802 void handlerIStrNotInFld(string option, string value) { cmdLineTestOptions ~= makeFieldVsIStringOptionHandler(&istrNotInFld, option, value); } 803 804 void handlerRegexMatch(string option, string value) { cmdLineTestOptions ~= makeFieldVsRegexOptionHandler(®exMatch, option, value, true); } 805 void handlerRegexNotMatch(string option, string value) { cmdLineTestOptions ~= makeFieldVsRegexOptionHandler(®exNotMatch, option, value, true); } 806 void handlerIRegexMatch(string option, string value) { 
cmdLineTestOptions ~= makeFieldVsRegexOptionHandler(®exMatch, option, value, false); } 807 void handlerIRegexNotMatch(string option, string value) { cmdLineTestOptions ~= makeFieldVsRegexOptionHandler(®exNotMatch, option, value, false); } 808 809 void handlerCharLenLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenLE, option, value); } 810 void handlerCharLenLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenLT, option, value); } 811 void handlerCharLenGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenGE, option, value); } 812 void handlerCharLenGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenGT, option, value); } 813 void handlerCharLenEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenEQ, option, value); } 814 void handlerCharLenNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&charLenNE, option, value); } 815 816 void handlerByteLenLE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenLE, option, value); } 817 void handlerByteLenLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenLT, option, value); } 818 void handlerByteLenGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenGE, option, value); } 819 void handlerByteLenGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenGT, option, value); } 820 void handlerByteLenEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenEQ, option, value); } 821 void handlerByteLenNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsNumberOptionHandler(&byteLenNE, option, value); } 822 823 void handlerFFLE(string option, string value) { 
cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffLE, option, value); } 824 void handlerFFLT(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffLT, option, value); } 825 void handlerFFGE(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffGE, option, value); } 826 void handlerFFGT(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffGT, option, value); } 827 void handlerFFEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffEQ, option, value); } 828 void handlerFFNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffNE, option, value); } 829 830 void handlerFFStrEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffStrEQ, option, value); } 831 void handlerFFStrNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffStrNE, option, value); } 832 void handlerFFIStrEQ(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffIStrEQ, option, value); } 833 void handlerFFIStrNE(string option, string value) { cmdLineTestOptions ~= makeFieldVsFieldOptionHandler(&ffIStrNE, option, value); } 834 835 void handlerFFAbsDiffLE(string option, string value) { cmdLineTestOptions ~= makeFieldFieldNumOptionHandler(&ffAbsDiffLE, option, value); } 836 void handlerFFAbsDiffGT(string option, string value) { cmdLineTestOptions ~= makeFieldFieldNumOptionHandler(&ffAbsDiffGT, option, value); } 837 void handlerFFRelDiffLE(string option, string value) { cmdLineTestOptions ~= makeFieldFieldNumOptionHandler(&ffRelDiffLE, option, value); } 838 void handlerFFRelDiffGT(string option, string value) { cmdLineTestOptions ~= makeFieldFieldNumOptionHandler(&ffRelDiffGT, option, value); } 839 840 try 841 { 842 arraySep = ","; // Use comma to separate values in command line options 843 auto r = getoptInorder( 844 cmdArgs, 845 
"help-verbose", " Print full help.", &helpVerbose, 846 "help-options", " Print the options list by itself.", &helpOptions, 847 "help-fields", " Print help on specifying fields.", &helpFields, 848 std.getopt.config.caseSensitive, 849 "V|version", " Print version information and exit.", &versionWanted, 850 "H|header", " Treat the first line of each file as a header.", &hasHeader, 851 std.getopt.config.caseInsensitive, 852 "or", " Evaluate tests as an OR rather than an AND.", &disjunct, 853 std.getopt.config.caseSensitive, 854 "v|invert", " Invert the filter, printing lines that do not match.", &invert, 855 std.getopt.config.caseInsensitive, 856 "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, 857 858 "empty", "<field-list> True if FIELD is empty.", &handlerFldEmpty, 859 "not-empty", "<field-list> True if FIELD is not empty.", &handlerFldNotEmpty, 860 "blank", "<field-list> True if FIELD is empty or all whitespace.", &handlerFldBlank, 861 "not-blank", "<field-list> True if FIELD contains a non-whitespace character.", &handlerFldNotBlank, 862 863 "is-numeric", "<field-list> True if FIELD is interpretable as a number.", &handlerFldIsNumeric, 864 "is-finite", "<field-list> True if FIELD is interpretable as a number and is not NaN or infinity.", &handlerFldIsFinite, 865 "is-nan", "<field-list> True if FIELD is NaN.", &handlerFldIsNaN, 866 "is-infinity", "<field-list> True if FIELD is infinity.", &handlerFldIsInfinity, 867 868 "le", "<field-list>:NUM FIELD <= NUM (numeric).", &handlerNumLE, 869 "lt", "<field-list>:NUM FIELD < NUM (numeric).", &handlerNumLT, 870 "ge", "<field-list>:NUM FIELD >= NUM (numeric).", &handlerNumGE, 871 "gt", "<field-list>:NUM FIELD > NUM (numeric).", &handlerNumGT, 872 "eq", "<field-list>:NUM FIELD == NUM (numeric).", &handlerNumEQ, 873 "ne", "<field-list>:NUM FIELD != NUM (numeric).", &handlerNumNE, 874 875 "str-le", "<field-list>:STR FIELD <= STR (string).", &handlerStrLE, 876 "str-lt", 
"<field-list>:STR FIELD < STR (string).", &handlerStrLT, 877 "str-ge", "<field-list>:STR FIELD >= STR (string).", &handlerStrGE, 878 "str-gt", "<field-list>:STR FIELD > STR (string).", &handlerStrGT, 879 "str-eq", "<field-list>:STR FIELD == STR (string).", &handlerStrEQ, 880 "istr-eq", "<field-list>:STR FIELD == STR (string, case-insensitive).", &handlerIStrEQ, 881 "str-ne", "<field-list>:STR FIELD != STR (string).", &handlerStrNE, 882 "istr-ne", "<field-list>:STR FIELD != STR (string, case-insensitive).", &handlerIStrNE, 883 "str-in-fld", "<field-list>:STR FIELD contains STR (substring search).", &handlerStrInFld, 884 "istr-in-fld", "<field-list>:STR FIELD contains STR (substring search, case-insensitive).", &handlerIStrInFld, 885 "str-not-in-fld", "<field-list>:STR FIELD does not contain STR (substring search).", &handlerStrNotInFld, 886 "istr-not-in-fld", "<field-list>:STR FIELD does not contain STR (substring search, case-insensitive).", &handlerIStrNotInFld, 887 888 "regex", "<field-list>:REGEX FIELD matches regular expression.", &handlerRegexMatch, 889 "iregex", "<field-list>:REGEX FIELD matches regular expression, case-insensitive.", &handlerIRegexMatch, 890 "not-regex", "<field-list>:REGEX FIELD does not match regular expression.", &handlerRegexNotMatch, 891 "not-iregex", "<field-list>:REGEX FIELD does not match regular expression, case-insensitive.", &handlerIRegexNotMatch, 892 893 "char-len-le", "<field-list>:NUM character-length(FIELD) <= NUM.", &handlerCharLenLE, 894 "char-len-lt", "<field-list>:NUM character-length(FIELD) < NUM.", &handlerCharLenLT, 895 "char-len-ge", "<field-list>:NUM character-length(FIELD) >= NUM.", &handlerCharLenGE, 896 "char-len-gt", "<field-list>:NUM character-length(FIELD) > NUM.", &handlerCharLenGT, 897 "char-len-eq", "<field-list>:NUM character-length(FIELD) == NUM.", &handlerCharLenEQ, 898 "char-len-ne", "<field-list>:NUM character-length(FIELD) != NUM.", &handlerCharLenNE, 899 900 "byte-len-le", "<field-list>:NUM 
byte-length(FIELD) <= NUM.", &handlerByteLenLE, 901 "byte-len-lt", "<field-list>:NUM byte-length(FIELD) < NUM.", &handlerByteLenLT, 902 "byte-len-ge", "<field-list>:NUM byte-length(FIELD) >= NUM.", &handlerByteLenGE, 903 "byte-len-gt", "<field-list>:NUM byte-length(FIELD) > NUM.", &handlerByteLenGT, 904 "byte-len-eq", "<field-list>:NUM byte-length(FIELD) == NUM.", &handlerByteLenEQ, 905 "byte-len-ne", "<field-list>:NUM byte-length(FIELD) != NUM.", &handlerByteLenNE, 906 907 "ff-le", "FIELD1:FIELD2 FIELD1 <= FIELD2 (numeric).", &handlerFFLE, 908 "ff-lt", "FIELD1:FIELD2 FIELD1 < FIELD2 (numeric).", &handlerFFLT, 909 "ff-ge", "FIELD1:FIELD2 FIELD1 >= FIELD2 (numeric).", &handlerFFGE, 910 "ff-gt", "FIELD1:FIELD2 FIELD1 > FIELD2 (numeric).", &handlerFFGT, 911 "ff-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (numeric).", &handlerFFEQ, 912 "ff-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (numeric).", &handlerFFNE, 913 "ff-str-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string).", &handlerFFStrEQ, 914 "ff-istr-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string, case-insensitive).", &handlerFFIStrEQ, 915 "ff-str-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string).", &handlerFFStrNE, 916 "ff-istr-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string, case-insensitive).", &handlerFFIStrNE, 917 918 "ff-absdiff-le", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) <= NUM", &handlerFFAbsDiffLE, 919 "ff-absdiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) > NUM", &handlerFFAbsDiffGT, 920 "ff-reldiff-le", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) <= NUM", &handlerFFRelDiffLE, 921 "ff-reldiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) > NUM", &handlerFFRelDiffGT, 922 ); 923 924 /* Both help texts are a bit long. In this case, for "regular" help, don't 925 * print options, just the text. The text summarizes the options. 
926 */ 927 if (r.helpWanted) 928 { 929 stdout.write(helpText); 930 return tuple(false, 0); 931 } 932 else if (helpVerbose) 933 { 934 defaultGetoptPrinter(helpTextVerbose, r.options); 935 return tuple(false, 0); 936 } 937 else if (helpOptions) 938 { 939 defaultGetoptPrinter(helpTextOptions, r.options); 940 return tuple(false, 0); 941 } 942 else if (helpFields) 943 { 944 import tsv_utils.common.fieldlist : fieldListHelpText ; 945 writeln(fieldListHelpText); 946 return tuple(false, 0); 947 } 948 else if (versionWanted) 949 { 950 import tsv_utils.common.tsvutils_version; 951 writeln(tsvutilsVersionNotice("tsv-filter")); 952 return tuple(false, 0); 953 } 954 955 /* Input files. Remaining command line args are files. */ 956 string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 957 cmdArgs.length = 1; 958 959 string[] headerFields; 960 961 /* FieldListArgProcessing encapsulates the field list processing. It is 962 * called prior to reading the header line if headers are not being used, 963 * and after if headers are being used. 964 */ 965 void fieldListArgProcessing() 966 { 967 cmdLineTestOptions.each!(dg => dg(tests, maxFieldIndex, hasHeader, headerFields)); 968 } 969 970 if (!hasHeader) fieldListArgProcessing(); 971 972 ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 973 inputSources = inputSourceRange(filepaths, readHeader); 974 975 if (hasHeader) 976 { 977 throwIfWindowsNewline(inputSources.front.header, inputSources.front.name, 1); 978 headerFields = inputSources.front.header.split(delim).to!(string[]); 979 fieldListArgProcessing(); 980 } 981 } 982 catch (Exception e) 983 { 984 stderr.writefln("[%s] Error processing command line arguments: %s", programName, e.msg); 985 return tuple(false, 1); 986 } 987 return tuple(true, 0); 988 } 989 } 990 991 /** tsvFilter processes the input files and runs the tests. 
*/
void tsvFilter(ref TsvFilterOptions cmdopt)
{
    /* Overview: each line of each input source is split on cmdopt.delim, the first
     * maxFieldIndex + 1 fields are collected, and the tests are run against them.
     * Tests are combined with 'any' when cmdopt.disjunct is set and 'all' otherwise;
     * cmdopt.invert negates the result. Passing lines are written to stdout.
     *
     * Throws an Exception if a line has too few fields or if a test throws.
     */
    import std.algorithm : all, any, splitter;
    import std.range;
    import tsv_utils.common.utils : BufferedOutputRange, bufferedByLine, InputSourceRange,
        throwIfWindowsNewline;

    /* inputSources must be an InputSourceRange and include at least stdin. */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* BufferedOutputRange improves performance on narrow files with high percentages of
     * writes. Want responsive output if output is rare, so ensure the first matched
     * line is written, and that writes separated by long stretches of non-matched lines
     * are written.
     */
    enum maxInputLinesWithoutBufferFlush = 1024;

    /* Starts above the threshold so the first matched line triggers a flush. */
    size_t inputLinesWithoutBufferFlush = maxInputLinesWithoutBufferFlush + 1;

    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

    /* First header is read during command line argument processing. Immediately
     * flush it so subsequent processes in a unix command pipeline see it early.
     * This helps provide timely error messages.
     */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;
        bufferedOutput.appendln(inputStream.header);
        bufferedOutput.flush;
    }

    /* Process each input file, one line at a time. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    auto lineFields = new char[][](cmdopt.maxFieldIndex + 1);

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);

        foreach (lineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine))
        {
            /* Only reached with lineNum == 1 when there is no header line; the header
             * case is checked above.
             */
            if (lineNum == 1) throwIfWindowsNewline(line, inputStream.name, lineNum);

            /* Copy the needed number of fields to the fields array. */
            int fieldIndex = -1;
            foreach (fieldValue; line.splitter(cmdopt.delim))
            {
                if (fieldIndex == cast(long) cmdopt.maxFieldIndex) break;
                fieldIndex++;
                lineFields[fieldIndex] = fieldValue;
            }

            if (fieldIndex == -1)
            {
                assert(line.length == 0);
                /* Bug work-around. Currently empty lines are not handled properly by splitter.
                 *   Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                 *   Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                 * Work-around: Point to the line. It's an empty string.
                 */
                fieldIndex++;
                lineFields[fieldIndex] = line;
            }

            enforce(fieldIndex >= cast(long) cmdopt.maxFieldIndex,
                    format("Not enough fields in line. File: %s, Line: %s",
                           inputStream.name, lineNum));

            /* Run the tests. Tests will fail (throw) if a field cannot be converted
             * to the expected type.
             */
            try
            {
                inputLinesWithoutBufferFlush++;
                bool passed = cmdopt.disjunct ?
                    cmdopt.tests.any!(x => x(lineFields)) :
                    cmdopt.tests.all!(x => x(lineFields));
                if (cmdopt.invert) passed = !passed;
                if (passed)
                {
                    const bool wasFlushed = bufferedOutput.appendln(line);
                    if (wasFlushed) inputLinesWithoutBufferFlush = 0;
                    else if (inputLinesWithoutBufferFlush > maxInputLinesWithoutBufferFlush)
                    {
                        bufferedOutput.flush;
                        inputLinesWithoutBufferFlush = 0;
                    }
                }
            }
            catch (Exception e)
            {
                throw new Exception(
                    format("Could not process line or field: %s\n File: %s Line: %s%s",
                           e.msg, inputStream.name, lineNum,
                           (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : ""));
            }
        }
    }
}